/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableExistsException;
import org.apache.hadoop.hbase.io.HFileLink;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileContext;
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.BaseMasterObserver;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.io.hfile.TestHFile;
import org.apache.hadoop.hbase.master.AssignmentManager;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionStates;
import org.apache.hadoop.hbase.master.TableLockManager;
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
import org.apache.hadoop.hbase.mob.MobFileName;
import org.apache.hadoop.hbase.mob.MobUtils;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;

import com.google.common.collect.Multimap;

/**
 * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
 */
@Category(LargeTests.class)
public class TestHBaseFsck {
  static final int POOL_SIZE = 7;

  final static Log LOG = LogFactory.getLog(TestHBaseFsck.class);
  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private final static Configuration conf = TEST_UTIL.getConfiguration();
  private final static String FAM_STR = "fam";
  private final static byte[] FAM = Bytes.toBytes(FAM_STR);
  private final static int REGION_ONLINE_TIMEOUT = 800;
  private static RegionStates regionStates;
  private static ExecutorService tableExecutorService;
  private static ScheduledThreadPoolExecutor hbfsckExecutorService;
  private static ClusterConnection connection;
  private static Admin admin;

  // Per-test instance state; reset for every test run
  private HTable tbl;
  private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
    Bytes.toBytes("B"), Bytes.toBytes("C") };
  // two rows per region.
  private final static byte[][] ROWKEYS = new byte[][] {
    Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
    Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
    TEST_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
      MasterSyncObserver.class.getName());

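    // Presumably sized for these tests: keep the general handler pool tiny while
    // giving hbase:meta plenty of priority handlers, since the hbck runs below
    // hit meta much harder than they hit user tables.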
    conf.setInt("hbase.regionserver.handler.count", 2);
    conf.setInt("hbase.regionserver.metahandler.count", 30);

    conf.setInt("hbase.htable.threads.max", POOL_SIZE);
    conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
    conf.setInt("hbase.hconnection.threads.core", POOL_SIZE);
    conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
    conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 8 * REGION_ONLINE_TIMEOUT);
    TEST_UTIL.startMiniCluster(3);

    tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
        new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));

    hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);

    AssignmentManager assignmentManager =
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
    regionStates = assignmentManager.getRegionStates();

    connection = (ClusterConnection) TEST_UTIL.getConnection();

    admin = connection.getAdmin();
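    // Disable the balancer so the hand-crafted region assignments these tests
    // create are not moved out from under them mid-test.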
    admin.setBalancerRunning(false, true);

    TEST_UTIL.waitUntilAllSystemRegionsAssigned();
  }

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    tableExecutorService.shutdown();
    hbfsckExecutorService.shutdown();
    admin.close();
    TEST_UTIL.shutdownMiniCluster();
  }

  @Before
  public void setUp() {
    EnvironmentEdgeManager.reset();
  }

  @Test (timeout=180000)
  public void testHBaseFsck() throws Exception {
    assertNoErrors(doFsck(conf, false));
    TableName table = TableName.valueOf("tableBadMetaAssign");
    HTableDescriptor desc = new HTableDescriptor(table);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, null);

    // We created 1 table, should be fine
    assertNoErrors(doFsck(conf, false));

    // Now let's mess it up and change the assignment in hbase:meta to
    // point to a different region server
    Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    Scan scan = new Scan();
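    // hbase:meta row keys have the form <table>,<start key>,<region id>, so
    // starting the scan at "<table>,," positions us at this table's first region.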
    scan.setStartRow(Bytes.toBytes(table + ",,"));
    ResultScanner scanner = meta.getScanner(scan);
    HRegionInfo hri = null;

    Result res = scanner.next();
    ServerName currServer =
      ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
          HConstants.SERVER_QUALIFIER));
    long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
        HConstants.STARTCODE_QUALIFIER));

    for (JVMClusterUtil.RegionServerThread rs :
        TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {

      ServerName sn = rs.getRegionServer().getServerName();

      // When we find a different RS, change the assignment and break
      if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
          startCode != sn.getStartcode()) {
        Put put = new Put(res.getRow());
        put.setDurability(Durability.SKIP_WAL);
        put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
          Bytes.toBytes(sn.getHostAndPort()));
        put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
          Bytes.toBytes(sn.getStartcode()));
        meta.put(put);
        hri = MetaTableAccessor.getHRegionInfo(res);
        break;
      }
    }

    // Try to fix the data
    assertErrors(doFsck(conf, true), new ERROR_CODE[]{
        ERROR_CODE.SERVER_DOES_NOT_MATCH_META});

    TEST_UTIL.getHBaseCluster().getMaster()
      .getAssignmentManager().waitForAssignment(hri);

    // Should be fixed now
    assertNoErrors(doFsck(conf, false));

    // Touch the table with a scan: this forces the client to re-resolve the
    // region's location and verifies the table is readable after the fix.
    Table t = connection.getTable(table, tableExecutorService);
    ResultScanner s = t.getScanner(new Scan());
    s.close();
    t.close();

    scanner.close();
    meta.close();
  }

  @Test(timeout=180000)
  public void testFixAssignmentsWhenMETAinTransition() throws Exception {
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
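    // Simulate hbase:meta stuck in transition: close the meta region, mark it
    // offline in the master's in-memory state, and delete its location from
    // ZooKeeper, leaving it to hbck to bring meta back.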
    admin.closeRegion(cluster.getServerHoldingMeta(), HRegionInfo.FIRST_META_REGIONINFO);
    regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
    new MetaTableLocator().deleteMetaLocation(cluster.getMaster().getZooKeeper());
    assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
    HBaseFsck hbck = doFsck(conf, true);
    assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
        ERROR_CODE.NULL_META_REGION });
    assertNoErrors(doFsck(conf, false));
  }

  /**
   * Create a new region entry in hbase:meta. Only the meta row is written;
   * nothing is created on HDFS and the region is not assigned.
   */
  private HRegionInfo createRegion(final HTableDescriptor htd,
      byte[] startKey, byte[] endKey) throws IOException {
    Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
    MetaTableAccessor.addRegionToMeta(meta, hri);
    meta.close();
    return hri;
  }

  /**
   * Debugging method to dump the contents of meta.
   */
  private void dumpMeta(TableName tableName) throws IOException {
    List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
    for (byte[] row : metaRows) {
      LOG.info(Bytes.toString(row));
    }
  }

  /**
   * This method is used to undeploy a region -- close it and attempt to
   * remove its state from the Master.
   */
  private void undeployRegion(Connection conn, ServerName sn,
      HRegionInfo hri) throws IOException, InterruptedException {
    try {
      HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) conn, sn, hri);
      if (!hri.isMetaTable()) {
        admin.offline(hri.getRegionName());
      }
    } catch (IOException ioe) {
      LOG.warn("Got exception when attempting to offline region "
          + Bytes.toString(hri.getRegionName()), ioe);
    }
  }

  /**
   * Delete a region from assignments, meta, or completely from hdfs.
   * @param unassign if true unassign region if assigned
   * @param metaRow  if true remove region's row from META
   * @param hdfs if true remove region's dir in HDFS
   */
  private void deleteRegion(Configuration conf, final HTableDescriptor htd,
      byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
      boolean hdfs) throws IOException, InterruptedException {
    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
        HRegionInfo.DEFAULT_REPLICA_ID);
  }

  /**
   * Delete a region from assignments, meta, or completely from hdfs.
   * @param unassign if true unassign region if assigned
   * @param metaRow  if true remove region's row from META
   * @param hdfs if true remove region's dir in HDFS
   * @param regionInfoOnly if true remove a region dir's .regioninfo file
   * @param replicaId replica id
   */
  private void deleteRegion(Configuration conf, final HTableDescriptor htd,
      byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
      boolean hdfs, boolean regionInfoOnly, int replicaId)
          throws IOException, InterruptedException {
    LOG.info("** Before delete:");
    dumpMeta(htd.getTableName());

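    // Walk every region location of the table and act only on the one whose
    // start/end keys and replica id match the requested region.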
    List<HRegionLocation> locations = tbl.getAllRegionLocations();
    for (HRegionLocation location : locations) {
      HRegionInfo hri = location.getRegionInfo();
      ServerName hsa = location.getServerName();
      if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
          && Bytes.compareTo(hri.getEndKey(), endKey) == 0
          && hri.getReplicaId() == replicaId) {

        LOG.info("RegionName: " + hri.getRegionNameAsString());
        byte[] deleteRow = hri.getRegionName();

        if (unassign) {
          LOG.info("Undeploying region " + hri + " from server " + hsa);
          undeployRegion(connection, hsa, hri);
        }

        if (regionInfoOnly) {
          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + " " + hsa.toString());
          Path rootDir = FSUtils.getRootDir(conf);
          FileSystem fs = rootDir.getFileSystem(conf);
          Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
              hri.getEncodedName());
          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
          fs.delete(hriPath, true);
        }

        if (hdfs) {
          LOG.info("deleting hdfs data: " + hri.toString() + " " + hsa.toString());
          Path rootDir = FSUtils.getRootDir(conf);
          FileSystem fs = rootDir.getFileSystem(conf);
          Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
              hri.getEncodedName());
          HBaseFsck.debugLsr(conf, p);
          boolean success = fs.delete(p, true);
          LOG.info("Deleted " + p + " successfully? " + success);
          HBaseFsck.debugLsr(conf, p);
        }

        if (metaRow) {
          try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
            Delete delete = new Delete(deleteRow);
            meta.delete(delete);
          }
        }
      }
      LOG.info(hri.toString() + " " + hsa.toString());
    }

    TEST_UTIL.getMetaTableRows(htd.getTableName());
    LOG.info("*** After delete:");
    dumpMeta(htd.getTableName());
  }

  /**
   * Set up a clean table before we start mucking with it.
   *
   * It will set tbl, which needs to be closed after the test.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  void setupTable(TableName tablename) throws Exception {
    setupTableWithRegionReplica(tablename, 1);
  }

  /**
   * Set up a clean table with a certain region replica count.
   *
   * It will set tbl, which needs to be closed after the test.
   *
   * @param tablename
   * @param replicaCount
   * @throws Exception
   */
  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
    HTableDescriptor desc = new HTableDescriptor(tablename);
    desc.setRegionReplication(replicaCount);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, SPLITS);

    tbl = (HTable) connection.getTable(tablename, tableExecutorService);
    List<Put> puts = new ArrayList<Put>();
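    // Load the table: with SPLITS {A, B, C} and these ROWKEYS, each of the four
    // regions ends up holding two rows.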
    for (byte[] row : ROWKEYS) {
      Put p = new Put(row);
      p.add(FAM, Bytes.toBytes("val"), row);
      puts.add(p);
    }
    tbl.put(puts);
    tbl.flushCommits();
  }

  /**
   * Set up a clean table with a mob-enabled column.
   *
   * @param tablename The name of the table to be created.
   * @throws Exception
   */
  void setupMobTable(TableName tablename) throws Exception {
    HTableDescriptor desc = new HTableDescriptor(tablename);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    hcd.setMobEnabled(true);
    hcd.setMobThreshold(0);
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, SPLITS);

    tbl = (HTable) connection.getTable(tablename, tableExecutorService);
    List<Put> puts = new ArrayList<Put>();
    for (byte[] row : ROWKEYS) {
      Put p = new Put(row);
      p.add(FAM, Bytes.toBytes("val"), row);
      puts.add(p);
    }
    tbl.put(puts);
    tbl.flushCommits();
  }

  /**
   * Counts the rows in the test table, to verify whether data was lost.
   */
  int countRows() throws IOException {
    Scan s = new Scan();
    ResultScanner rs = tbl.getScanner(s);
    int i = 0;
    while (rs.next() != null) {
      i++;
    }
    rs.close();
    return i;
  }

  /**
   * Counts the rows in the given key range, to verify whether data was lost.
   */
  int countRows(byte[] start, byte[] end) throws IOException {
    Scan s = new Scan(start, end);
    ResultScanner rs = tbl.getScanner(s);
    int i = 0;
    while (rs.next() != null) {
      i++;
    }
    rs.close();
    return i;
  }

  /**
   * Delete the table in preparation for the next test.
   *
   * @param tablename
   * @throws Exception
   */
  void cleanupTable(TableName tablename) throws Exception {
    if (tbl != null) {
      tbl.close();
      tbl = null;
    }

    connection.clearRegionCache();
    deleteTable(TEST_UTIL, tablename);
  }

  /**
   * This creates a clean table and confirms that the table is clean.
   */
  @Test (timeout=180000)
  public void testHBaseFsckClean() throws Exception {
    assertNoErrors(doFsck(conf, false));
    TableName table = TableName.valueOf("tableClean");
    try {
      HBaseFsck hbck = doFsck(conf, false);
      assertNoErrors(hbck);

      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // We created 1 table, should be fine
      hbck = doFsck(conf, false);
      assertNoErrors(hbck);
      assertEquals(0, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * Test thread pooling in the case where there are more regions than threads
   */
  @Test (timeout=180000)
  public void testHbckThreadpooling() throws Exception {
    TableName table = TableName.valueOf("tableThreadpooling");
    try {
      // Create table with 4 regions
      setupTable(table);

      // limit number of threads to 1.
      Configuration newconf = new Configuration(conf);
      newconf.setInt("hbasefsck.numthreads", 1);
      assertNoErrors(doFsck(newconf, false));

      // We should pass without triggering a RejectedExecutionException
    } finally {
      cleanupTable(table);
    }
  }

  @Test (timeout=180000)
  public void testHbckFixOrphanTable() throws Exception {
    TableName table = TableName.valueOf("tableInfo");
    FileSystem fs = null;
    Path tableinfo = null;
    try {
      setupTable(table);

      Path hbaseTableDir = FSUtils.getTableDir(
          FSUtils.getRootDir(conf), table);
      fs = hbaseTableDir.getFileSystem(conf);
      FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
      tableinfo = status.getPath();
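      // Orphan the table by moving its .tableinfo out of the table directory;
      // the finally block below moves it back.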
      fs.rename(tableinfo, new Path("/.tableinfo"));

      // hbck should report an error because .tableinfo is missing
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });

      // fix OrphanTable with default .tableinfo (htd not yet cached on master)
      hbck = doFsck(conf, true);
      assertNoErrors(hbck);
      status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
      assertNotNull(status);

      HTableDescriptor htd = admin.getTableDescriptor(table);
      htd.setValue("NOT_DEFAULT", "true");
      admin.disableTable(table);
      admin.modifyTable(table, htd);
      admin.enableTable(table);
      fs.delete(status.getPath(), true);

      // fix OrphanTable with cache
      htd = admin.getTableDescriptor(table); // warms up cached htd on master
      hbck = doFsck(conf, true);
      assertNoErrors(hbck);
      status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
      assertNotNull(status);
      htd = admin.getTableDescriptor(table);
      assertEquals("true", htd.getValue("NOT_DEFAULT"));
    } finally {
      fs.rename(new Path("/.tableinfo"), tableinfo);
      cleanupTable(table);
    }
  }

  /**
   * This test makes sure that parallel instances of hbck cannot run at the
   * same time.
   *
   * @throws Exception
   */
  @Test (timeout=180000)
  public void testParallelHbck() throws Exception {
    final ExecutorService service;
    final Future<HBaseFsck> hbck1, hbck2;

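    // hbck guards against concurrent runs with an exclusive lock file; with only
    // one attempt and a tiny wait time configured below, the losing instance is
    // expected to fail fast with a "Duplicate hbck" error.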
    class RunHbck implements Callable<HBaseFsck> {
      boolean fail = true;
      @Override
      public HBaseFsck call() {
        Configuration c = new Configuration(conf);
        c.setInt("hbase.hbck.lockfile.attempts", 1);
        // HBASE-13574 found that in HADOOP-2.6 and later, the create file would internally retry.
        // To avoid flakiness of the test, set low max wait time.
        c.setInt("hbase.hbck.lockfile.maxwaittime", 3);
        try {
          return doFsck(c, false);
        } catch (Exception e) {
          if (e.getMessage().contains("Duplicate hbck")) {
            fail = false;
          }
        }
        // If we reach here, then an exception was caught
        if (fail) fail();
        return null;
      }
    }
    service = Executors.newFixedThreadPool(2);
    hbck1 = service.submit(new RunHbck());
    hbck2 = service.submit(new RunHbck());
    service.shutdown();
    // wait up to 15 seconds for both hbck calls to finish
    service.awaitTermination(15, TimeUnit.SECONDS);
    HBaseFsck h1 = hbck1.get();
    HBaseFsck h2 = hbck2.get();
    // Make sure only one of the calls was successful
    assert(h1 == null || h2 == null);
    if (h1 != null) {
      assert(h1.getRetCode() >= 0);
    }
    if (h2 != null) {
      assert(h2.getRetCode() >= 0);
    }
  }

  /**
   * This test makes sure that, with enough retries, both parallel instances
   * of hbck complete successfully.
   *
   * @throws Exception
   */
  @Test (timeout=180000)
  public void testParallelWithRetriesHbck() throws Exception {
    final ExecutorService service;
    final Future<HBaseFsck> hbck1, hbck2;

    // With the ExponentialBackoffPolicyWithLimit (starting with 200 milliseconds sleep time, and
    // max sleep time of 6 seconds), we can retry around 15 times within 80 seconds before bailing
    // out.
    //
    // Note: the reason to use 80 seconds is that in HADOOP-2.6 and later, the create file would
    // retry up to HdfsConstants.LEASE_SOFTLIMIT_PERIOD (60 seconds).  See HBASE-13574 for more
    // details.
    final int timeoutInSeconds = 80;
    final int sleepIntervalInMilliseconds = 200;
    final int maxSleepTimeInMilliseconds = 6000;
    final int maxRetryAttempts = 15;

    class RunHbck implements Callable<HBaseFsck> {

      @Override
      public HBaseFsck call() throws Exception {
        // Increase retry attempts to make sure the non-active hbck doesn't get starved
        Configuration c = new Configuration(conf);
        c.setInt("hbase.hbck.lockfile.maxwaittime", timeoutInSeconds);
        c.setInt("hbase.hbck.lockfile.attempt.sleep.interval", sleepIntervalInMilliseconds);
        c.setInt("hbase.hbck.lockfile.attempt.maxsleeptime", maxSleepTimeInMilliseconds);
        c.setInt("hbase.hbck.lockfile.attempts", maxRetryAttempts);
        return doFsck(c, false);
      }
    }

    service = Executors.newFixedThreadPool(2);
    hbck1 = service.submit(new RunHbck());
    hbck2 = service.submit(new RunHbck());
    service.shutdown();
    // wait long enough for both hbck calls to finish
    service.awaitTermination(timeoutInSeconds * 2, TimeUnit.SECONDS);
    HBaseFsck h1 = hbck1.get();
    HBaseFsck h2 = hbck2.get();
    // Both should be successful
    assertNotNull(h1);
    assertNotNull(h2);
    assert(h1.getRetCode() >= 0);
    assert(h2.getRetCode() >= 0);
  }

  /**
   * This creates and fixes a bad table with regions that have a duplicate
   * start key.
   */
  @Test (timeout=180000)
  public void testDupeStartKey() throws Exception {
    TableName table = TableName.valueOf("tableDupeStartKey");
    try {
      setupTable(table);
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());

      // Now let's mess it up, by adding a region with a duplicate startkey
      HRegionInfo hriDupe =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("A2"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriDupe);
      ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
      TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
            ERROR_CODE.DUPE_STARTKEYS});
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.

      // fix the degenerate region.
      doFsck(conf, true);

      // check that the degenerate region is gone and no data loss
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /*
   * This creates a table with region_replica > 1 and verifies hbck runs
   * successfully.
   */
  @Test (timeout=180000)
  public void testHbckWithRegionReplica() throws Exception {
    TableName table = TableName.valueOf("testHbckWithRegionReplica");
    try {
      setupTableWithRegionReplica(table, 2);
      TEST_UTIL.getHBaseAdmin().flush(table.getName());
      assertNoErrors(doFsck(conf, false));
    } finally {
      cleanupTable(table);
    }
  }

  @Test
  public void testHbckWithFewerReplica() throws Exception {
    TableName table = TableName.valueOf("testHbckWithFewerReplica");
    try {
      setupTableWithRegionReplica(table, 2);
      TEST_UTIL.getHBaseAdmin().flush(table.getName());
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, false, false, false, 1); // unassign one replica
      // check that problem exists
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_DEPLOYED});
      // fix the problem
      hbck = doFsck(conf, true);
      // run hbck again to make sure we don't see any errors
      hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{});
    } finally {
      cleanupTable(table);
    }
  }

  @Test
  public void testHbckWithExcessReplica() throws Exception {
    TableName table = TableName.valueOf("testHbckWithExcessReplica");
    try {
      setupTableWithRegionReplica(table, 2);
      TEST_UTIL.getHBaseAdmin().flush(table.getName());
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
      // the next few lines inject a location in meta for a replica, and then
      // ask the master to assign the replica (the meta needs to be injected
      // for the master to treat the request for assignment as valid; the master
      // checks the region is valid either from its memory or meta)
      HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
      List<HRegionInfo> regions = TEST_UTIL.getHBaseAdmin().getTableRegions(table);
      byte[] startKey = Bytes.toBytes("B");
      byte[] endKey = Bytes.toBytes("C");
      byte[] metaKey = null;
      HRegionInfo newHri = null;
      for (HRegionInfo h : regions) {
        if (Bytes.compareTo(h.getStartKey(), startKey) == 0 &&
            Bytes.compareTo(h.getEndKey(), endKey) == 0 &&
            h.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
          metaKey = h.getRegionName();
          // create a hri with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
          newHri = RegionReplicaUtil.getRegionInfoForReplica(h, 2);
          break;
        }
      }
      Put put = new Put(metaKey);
      ServerName sn = TEST_UTIL.getHBaseAdmin().getClusterStatus().getServers()
          .toArray(new ServerName[0])[0];
      // add a location with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
      MetaTableAccessor.addLocation(put, sn, sn.getStartcode(), -1, 2);
      meta.put(put);
      meta.flushCommits();
      // assign the new replica
      HBaseFsckRepair.fixUnassigned((HBaseAdmin)TEST_UTIL.getHBaseAdmin(), newHri);
      HBaseFsckRepair.waitUntilAssigned((HBaseAdmin)TEST_UTIL.getHBaseAdmin(), newHri);
      // now reset the meta row to its original value
      Delete delete = new Delete(metaKey);
      delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(2));
      delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(2));
      delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getSeqNumColumn(2));
      meta.delete(delete);
      meta.flushCommits();
      meta.close();
      // check that problem exists
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_IN_META});
      // fix the problem
      hbck = doFsck(conf, true);
      // run hbck again to make sure we don't see any errors
      hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{});
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * Get region info from local cluster.
   */
  Map<ServerName, List<String>> getDeployedHRIs(final HBaseAdmin admin) throws IOException {
    ClusterStatus status = admin.getClusterStatus();
    Collection<ServerName> regionServers = status.getServers();
    Map<ServerName, List<String>> mm =
        new HashMap<ServerName, List<String>>();
    for (ServerName hsi : regionServers) {
      AdminProtos.AdminService.BlockingInterface server = ((HConnection) connection).getAdmin(hsi);

      // list all online regions from this region server
      List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
      List<String> regionNames = new ArrayList<String>();
      for (HRegionInfo hri : regions) {
        regionNames.add(hri.getRegionNameAsString());
      }
      mm.put(hsi, regionNames);
    }
    return mm;
  }

  /**
   * Returns the ServerName a given region is deployed on, or null if it is
   * not deployed.
   */
  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
      if (e.getValue().contains(hri.getRegionNameAsString())) {
        return e.getKey();
      }
    }
    return null;
  }

  /**
   * This creates and fixes a bad table with a region that is a complete
   * duplicate of an existing region (same start and end keys).
   */
  @Test (timeout=180000)
  public void testDupeRegion() throws Exception {
    TableName table = TableName.valueOf("tableDupeRegion");
    try {
      setupTable(table);
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());

      // Now let's mess it up, by adding a region with a duplicate startkey
      HRegionInfo hriDupe =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"));

      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriDupe);
      ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
      TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);

      // Yikes! The assignment manager can't tell the difference between two
      // different regions with the same start/endkeys since it doesn't
      // differentiate on ts/regionId!  We actually need to recheck
      // deployments!
      while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriDupe) == null) {
        Thread.sleep(250);
      }

      LOG.debug("Finished assignment of dupe region");

      // TODO why is dupe region different from dupe start keys?
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
            ERROR_CODE.DUPE_STARTKEYS});
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.

      // fix the degenerate region.
      doFsck(conf, true);

      // check that the degenerate region is gone and no data loss
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a region that has startkey == endkey.
   */
  @Test (timeout=180000)
  public void testDegenerateRegions() throws Exception {
    TableName table = TableName.valueOf("tableDegenerateRegions");
    try {
      setupTable(table);
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());

      // Now let's mess it up, by adding a degenerate region with startkey == endkey
      HRegionInfo hriDupe =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriDupe);
      ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
      TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION, ERROR_CODE.DUPE_STARTKEYS,
          ERROR_CODE.DUPE_STARTKEYS });
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // fix the degenerate region.
      doFsck(conf, true);

      // check that the degenerate region is gone and no data loss
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where a region is completely contained
   * by another region.
   */
  @Test (timeout=180000)
  public void testContainedRegionOverlap() throws Exception {
    TableName table = TableName.valueOf("tableContainedRegionOverlap");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap in the metadata
      HRegionInfo hriOverlap =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriOverlap);
      ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
      TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // fix the problem.
      doFsck(conf, true);

      // verify that overlaps are fixed
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with an overlap group of 3 regions.
   * HBaseFsck.maxMerge is set to 2 to trigger sidelining of an overlapped
   * region, and the meta data is messed with so that closeRegion/offlineRegion
   * throw exceptions.
   */
  @Test (timeout=180000)
  public void testSidelineOverlapRegion() throws Exception {
    TableName table = TableName.valueOf("testSidelineOverlapRegion");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap
      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
      HMaster master = cluster.getMaster();
      HRegionInfo hriOverlap1 =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("AB"));
      master.assignRegion(hriOverlap1);
      master.getAssignmentManager().waitForAssignment(hriOverlap1);
      HRegionInfo hriOverlap2 =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("AB"), Bytes.toBytes("B"));
      master.assignRegion(hriOverlap2);
      master.getAssignmentManager().waitForAssignment(hriOverlap2);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
        ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
      assertEquals(3, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // mess around the overlapped regions, to trigger NotServingRegionException
      Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
      ServerName serverName = null;
      byte[] regionName = null;
      for (HbckInfo hbi: overlapGroups.values()) {
        if ("A".equals(Bytes.toString(hbi.getStartKey()))
            && "B".equals(Bytes.toString(hbi.getEndKey()))) {
          regionName = hbi.getRegionName();

          // get an RS not serving the region to force bad assignment info into META.
          int k = cluster.getServerWith(regionName);
          for (int i = 0; i < 3; i++) {
            if (i != k) {
              HRegionServer rs = cluster.getRegionServer(i);
              serverName = rs.getServerName();
              break;
            }
          }

          HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) connection,
              cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
          admin.offline(regionName);
          break;
        }
      }

      assertNotNull(regionName);
      assertNotNull(serverName);
      try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
        Put put = new Put(regionName);
        put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
            Bytes.toBytes(serverName.getHostAndPort()));
        meta.put(put);
      }

      // fix the problem.
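      // Configure hbck by hand (rather than via HbckTestingUtil.doFsck) so that
      // maxMerge can be set below the overlap group size of 3, forcing the
      // biggest overlapped region to be sidelined instead of merged.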
      HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
      fsck.connect();
      fsck.setDisplayFullReport(); // i.e. -details
      fsck.setTimeLag(0);
      fsck.setFixAssignments(true);
      fsck.setFixMeta(true);
      fsck.setFixHdfsHoles(true);
      fsck.setFixHdfsOverlaps(true);
      fsck.setFixHdfsOrphans(true);
      fsck.setFixVersionFile(true);
      fsck.setSidelineBigOverlaps(true);
      fsck.setMaxMerge(2);
      fsck.onlineHbck();
      fsck.close();

      // verify that overlaps are fixed, and that there are fewer rows
      // since one region is sidelined.
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertTrue(ROWKEYS.length > countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where a region is completely contained
   * by another region, and there is a hole (sort of like a bad split)
   */
  @Test (timeout=180000)
  public void testOverlapAndOrphan() throws Exception {
    TableName table = TableName.valueOf("tableOverlapAndOrphan");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap in the metadata
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
          Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
      TEST_UTIL.getHBaseAdmin().enableTable(table);

      HRegionInfo hriOverlap =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriOverlap);
      ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
      TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // fix the problem.
      doFsck(conf, true);

      // verify that overlaps are fixed
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where a region overlaps two regions --
   * its start key is contained in one region and its end key is contained in
   * another.
   */
  @Test (timeout=180000)
  public void testCoveredStartKey() throws Exception {
    TableName table = TableName.valueOf("tableCoveredStartKey");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap in the metadata
      HRegionInfo hriOverlap =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriOverlap);
      ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
      TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
          ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
      assertEquals(3, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // fix the problem.
      doFsck(conf, true);

      // verify that overlaps are fixed
      HBaseFsck hbck2 = doFsck(conf, false);
      assertErrors(hbck2, new ERROR_CODE[0]);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a missing region -- hole in meta
   * and data missing in the fs.
   */
  @Test (timeout=180000)
  public void testRegionHole() throws Exception {
    TableName table = TableName.valueOf("tableRegionHole");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, true, true);
      admin.enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length - 2, countRows()); // lost a region so lost its two rows
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a missing region -- hole in meta
   * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
   */
  @Test (timeout=180000)
  public void testHDFSRegioninfoMissing() throws Exception {
    TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the meta data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
      TEST_UTIL.getHBaseAdmin().enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.ORPHAN_HDFS_REGION,
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a missing region -- hole in meta and data present but
   * .regioninfo missing (an orphan hdfs region) in the fs. Finally it checks that every row is
   * present in the correct region.
   */
  @Test(timeout = 180000)
  public void testHDFSRegioninfoMissingAndCheckRegionBoundary() throws Exception {
    TableName table = TableName.valueOf("testHDFSRegioninfoMissingAndCheckRegionBoundary");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the meta data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), true,
        true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
      admin.enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_HDFS_REGION,
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));

      // check that the data belongs to the correct region: every scan should return one row
      for (int i = 0; i < ROWKEYS.length; i++) {
        if (i != ROWKEYS.length - 1) {
          assertEquals(1, countRows(ROWKEYS[i], ROWKEYS[i + 1]));
        } else {
          assertEquals(1, countRows(ROWKEYS[i], null));
        }
      }
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a region that is missing from meta
   * and not assigned to a region server.
   */
  @Test (timeout=180000)
  public void testNotInMetaOrDeployedHole() throws Exception {
    TableName table = TableName.valueOf("tableNotInMetaOrDeployedHole");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the meta data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, true, false); // don't rm from fs
      admin.enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      assertErrors(doFsck(conf, true), new ERROR_CODE[] {
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }
1351 
1352   /**
1353    * This creates fixes a bad table with a hole in meta.
1354    */
1355   @Test (timeout=180000)
1356   public void testNotInMetaHole() throws Exception {
1357     TableName table =
1358         TableName.valueOf("tableNotInMetaHole");
1359     try {
1360       setupTable(table);
1361       assertEquals(ROWKEYS.length, countRows());
1362 
1363       // Mess it up by leaving a hole in the meta data
1364       admin.disableTable(table);
1365       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1366           Bytes.toBytes("C"), false, true, false); // don't rm from fs
1367       admin.enableTable(table);
1368 
1369       HBaseFsck hbck = doFsck(conf, false);
1370       assertErrors(hbck, new ERROR_CODE[] {
1371           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1372       // holes are separate from overlap groups
1373       assertEquals(0, hbck.getOverlapGroups(table).size());
1374 
1375       // fix hole
1376       assertErrors(doFsck(conf, true) , new ERROR_CODE[] {
1377           ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1378 
1379       // check that hole fixed
1380       assertNoErrors(doFsck(conf, false));
1381       assertEquals(ROWKEYS.length, countRows());
1382     } finally {
1383       cleanupTable(table);
1384     }
1385   }
1386 
1387   /**
1388    * This creates and fixes a bad table with a region that is in meta but has
1389    * no deployment or data in hdfs.
1390    */
1391   @Test (timeout=180000)
1392   public void testNotInHdfs() throws Exception {
1393     TableName table =
1394         TableName.valueOf("tableNotInHdfs");
1395     try {
1396       setupTable(table);
1397       assertEquals(ROWKEYS.length, countRows());
1398 
1399       // make sure data is in the region files; if it were only in the wal there would be no data loss
1400       admin.flush(table);
1401 
1402       // Mess it up by leaving a hole in the hdfs data
1403       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1404           Bytes.toBytes("C"), false, false, true); // don't rm meta
1405 
1406       HBaseFsck hbck = doFsck(conf, false);
1407       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1408       // holes are separate from overlap groups
1409       assertEquals(0, hbck.getOverlapGroups(table).size());
1410 
1411       // fix hole
1412       doFsck(conf, true);
1413 
1414       // check that hole fixed
1415       assertNoErrors(doFsck(conf, false));
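           // Two fewer rows are expected: the purged [B,C) region held two of the fixture rows
           // (assuming the standard ROWKEYS used by setupTable).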
1416       assertEquals(ROWKEYS.length - 2, countRows());
1417     } finally {
1418       cleanupTable(table);
1419     }
1420   }
1421 
1422   /**
1423    * This creates and fixes a bad table with a region that is in meta but has
1424    * no deployment or data in hdfs. The table has region_replication set to 2.
1425    */
1426   @Test (timeout=180000)
1427   public void testNotInHdfsWithReplicas() throws Exception {
1428     TableName table =
1429         TableName.valueOf("tableNotInHdfs");
1430     HBaseAdmin admin = new HBaseAdmin(conf);
1431     try {
1432       HRegionInfo[] oldHris = new HRegionInfo[2];
1433       setupTableWithRegionReplica(table, 2);
1434       assertEquals(ROWKEYS.length, countRows());
1435       NavigableMap<HRegionInfo, ServerName> map = MetaScanner.allTableRegions(TEST_UTIL.getConnection(),
1436           tbl.getName());
1437       int i = 0;
1438       // store the HRIs of the regions we will mess up
1439       for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1440         if (m.getKey().getStartKey().length > 0 &&
1441             m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1442           LOG.debug("Initially server hosting " + m.getKey() + " is " + m.getValue());
1443           oldHris[i++] = m.getKey();
1444         }
1445       }
1446       // make sure data is in the region files
1447       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1448 
1449       // Mess it up by leaving a hole in the hdfs data
1450       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1451           Bytes.toBytes("C"), false, false, true); // don't rm meta
1452 
1453       HBaseFsck hbck = doFsck(conf, false);
1454       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1455 
1456       // fix hole
1457       doFsck(conf, true);
1458 
1459       // check that hole fixed
1460       assertNoErrors(doFsck(conf, false));
1461       assertEquals(ROWKEYS.length - 2, countRows());
1462 
1463       // the following code checks whether the old primary/secondary has
1464       // been unassigned and the new primary/secondary has been assigned
1465       i = 0;
1466       HRegionInfo[] newHris = new HRegionInfo[2];
1467       // get all table's regions from meta
1468       map = MetaScanner.allTableRegions(TEST_UTIL.getConnection(), tbl.getName());
1469       // get the HRIs of the new regions (hbck created new regions for fixing the hdfs mess-up)
1470       for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1471         if (m.getKey().getStartKey().length > 0 &&
1472             m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1473           newHris[i++] = m.getKey();
1474         }
1475       }
1476       // get all the online regions in the regionservers
1477       Collection<ServerName> servers = admin.getClusterStatus().getServers();
1478       Set<HRegionInfo> onlineRegions = new HashSet<HRegionInfo>();
1479       for (ServerName s : servers) {
1480         List<HRegionInfo> list = admin.getOnlineRegions(s);
1481         onlineRegions.addAll(list);
1482       }
1483       // the new HRIs must be a subset of the online regions
1484       assertTrue(onlineRegions.containsAll(Arrays.asList(newHris)));
1485       // the old HRIs must not be part of the set (removeAll would return false if
1486       // the set didn't change)
1487       assertFalse(onlineRegions.removeAll(Arrays.asList(oldHris)));
1488     } finally {
1489       cleanupTable(table);
1490       admin.close();
1491     }
1492   }
1493 
1494 
1495   /**
1496    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1497    * remove the table.
1498    */
1499   @Test (timeout=180000)
1500   public void testNoHdfsTable() throws Exception {
1501     TableName table = TableName.valueOf("NoHdfsTable");
1502     setupTable(table);
1503     assertEquals(ROWKEYS.length, countRows());
1504 
1505     // make sure data is in the region files; if it were only in the wal there would be no data loss
1506     admin.flush(table);
1507 
1508     // Mess it up by deleting hdfs dirs
1509     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1510         Bytes.toBytes("A"), false, false, true); // don't rm meta
1511     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1512         Bytes.toBytes("B"), false, false, true); // don't rm meta
1513     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1514         Bytes.toBytes("C"), false, false, true); // don't rm meta
1515     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1516         Bytes.toBytes(""), false, false, true); // don't rm meta
1517 
1518     // also remove the table directory in hdfs
1519     deleteTableDir(table);
1520 
1521     HBaseFsck hbck = doFsck(conf, false);
1522     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1523         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1524         ERROR_CODE.NOT_IN_HDFS,});
1525     // holes are separate from overlap groups
1526     assertEquals(0, hbck.getOverlapGroups(table).size());
1527 
1528     // fix hole
1529     doFsck(conf, true); // detect dangling regions and remove those
1530 
1531     // check that hole fixed
1532     assertNoErrors(doFsck(conf, false));
1533     assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
1534   }
1535 
1536   public void deleteTableDir(TableName table) throws IOException {
1537     Path rootDir = FSUtils.getRootDir(conf);
1538     FileSystem fs = rootDir.getFileSystem(conf);
1539     Path p = FSUtils.getTableDir(rootDir, table);
1540     HBaseFsck.debugLsr(conf, p);
1541     boolean success = fs.delete(p, true);
1542     LOG.info("Deleted " + p + " successfully? " + success);
1543   }
1544 
1545   /**
1546    * When the hbase.version file is missing, hbck should detect and fix the fault.
1547    */
1548   @Test (timeout=180000)
1549   public void testNoVersionFile() throws Exception {
1550     // delete the hbase.version file
1551     Path rootDir = FSUtils.getRootDir(conf);
1552     FileSystem fs = rootDir.getFileSystem(conf);
1553     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1554     fs.delete(versionFile, true);
1555 
1556     // test
1557     HBaseFsck hbck = doFsck(conf, false);
1558     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1559     // fix hbase.version missing
1560     doFsck(conf, true);
1561 
1562     // verify the missing version file was fixed
1563     assertNoErrors(doFsck(conf, false));
1564   }
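       /*
        * A minimal sketch of recreating the version file by hand, which is roughly what the
        * repair above does internally (assumes FSUtils.setVersion with this signature):
        *
        *   FileSystem fs = rootDir.getFileSystem(conf);
        *   FSUtils.setVersion(fs, rootDir);
        */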
1565 
1566   /**
1567    * A region should not be deployed when its table is disabled; hbck detects and fixes one that is.
1568    */
1569   @Test (timeout=180000)
1570   public void testRegionShouldNotBeDeployed() throws Exception {
1571     TableName table =
1572         TableName.valueOf("tableRegionShouldNotBeDeployed");
1573     try {
1574       LOG.info("Starting testRegionShouldNotBeDeployed.");
1575       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1576       assertTrue(cluster.waitForActiveAndReadyMaster());
1577 
1578 
1579       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1580           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1581       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1582       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1583 
1584       // Write the .tableinfo
1585       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1586       fstd.createTableDescriptor(htdDisabled);
1587       List<HRegionInfo> disabledRegions =
1588           TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);
1589 
1590       // Let's just assign everything to first RS
1591       HRegionServer hrs = cluster.getRegionServer(0);
1592 
1593       // Create region files.
1594       admin.disableTable(table);
1595       admin.enableTable(table);
1596 
1597       // Disable the table and close its regions
1598       admin.disableTable(table);
1599       HRegionInfo region = disabledRegions.remove(0);
1600       byte[] regionName = region.getRegionName();
1601 
1602       // The region should not be assigned currently
1603       assertTrue(cluster.getServerWith(regionName) == -1);
1604 
1605       // Directly open a region on a region server.
1606       // If going through AM/ZK, the region won't be open.
1607       // Even it is opened, AM will close it which causes
1608       // flakiness of this test.
1609       HRegion r = HRegion.openHRegion(
1610         region, htdDisabled, hrs.getWAL(region), conf);
1611       hrs.addToOnlineRegions(r);
1612 
1613       HBaseFsck hbck = doFsck(conf, false);
1614       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1615 
1616       // fix this fault
1617       doFsck(conf, true);
1618 
1619       // check result
1620       assertNoErrors(doFsck(conf, false));
1621     } finally {
1622       admin.enableTable(table);
1623       cleanupTable(table);
1624     }
1625   }
1626 
1627   /**
1628    * This creates two tables, messes both of them up, and fixes them one by one.
1629    */
1630   @Test (timeout=180000)
1631   public void testFixByTable() throws Exception {
1632     TableName table1 =
1633         TableName.valueOf("testFixByTable1");
1634     TableName table2 =
1635         TableName.valueOf("testFixByTable2");
1636     try {
1637       setupTable(table1);
1638       // make sure data is in the region files; if it were only in the wal there would be no data loss
1639       admin.flush(table1);
1640       // Mess them up by leaving a hole in the hdfs data
1641       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1642         Bytes.toBytes("C"), false, false, true); // don't rm meta
1643 
1644       setupTable(table2);
1645       // make sure data is in the region files; if it were only in the wal there would be no data loss
1646       admin.flush(table2);
1647       // Mess them up by leaving a hole in the hdfs data
1648       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1649         Bytes.toBytes("C"), false, false, true); // don't rm meta
1650 
1651       HBaseFsck hbck = doFsck(conf, false);
1652       assertErrors(hbck, new ERROR_CODE[] {
1653         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1654 
1655       // fix hole in table 1
1656       doFsck(conf, true, table1);
1657       // check that hole in table 1 fixed
1658       assertNoErrors(doFsck(conf, false, table1));
1659       // check that hole in table 2 still there
1660       assertErrors(doFsck(conf, false, table2),
1661         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1662 
1663       // fix hole in table 2
1664       doFsck(conf, true, table2);
1665       // check that hole in both tables fixed
1666       assertNoErrors(doFsck(conf, false));
1667       assertEquals(ROWKEYS.length - 2, countRows());
1668     } finally {
1669       cleanupTable(table1);
1670       cleanupTable(table2);
1671     }
1672   }
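       /*
        * The per-table scoping exercised above is also available from the command line by
        * listing table names after the options, e.g. (a sketch; compare the argument list used
        * by doQuarantineTest below):
        *
        *   $ hbase hbck -repair testFixByTable1
        */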
1673   /**
1674    * A split parent in meta, in hdfs, and not deployed
1675    */
1676   @Test (timeout=180000)
1677   public void testLingeringSplitParent() throws Exception {
1678     TableName table =
1679         TableName.valueOf("testLingeringSplitParent");
1680     Table meta = null;
1681     try {
1682       setupTable(table);
1683       assertEquals(ROWKEYS.length, countRows());
1684 
1685       // make sure data is in the region files; if it were only in the wal there would be no data loss
1686       admin.flush(table);
1687       HRegionLocation location = tbl.getRegionLocation("B");
1688 
1689       // Delete one region from meta, but not hdfs, unassign it.
1690       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1691         Bytes.toBytes("C"), true, true, false);
1692 
1693       // Create a new meta entry to fake it as a split parent.
1694       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1695       HRegionInfo hri = location.getRegionInfo();
1696 
1697       HRegionInfo a = new HRegionInfo(tbl.getName(),
1698         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1699       HRegionInfo b = new HRegionInfo(tbl.getName(),
1700         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1701 
1702       hri.setOffline(true);
1703       hri.setSplit(true);
1704 
1705       MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
1706       meta.close();
1707       admin.flush(TableName.META_TABLE_NAME);
1708 
1709       HBaseFsck hbck = doFsck(conf, false);
1710       assertErrors(hbck, new ERROR_CODE[] {
1711         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1712 
1713       // regular repair cannot fix lingering split parent
1714       hbck = doFsck(conf, true);
1715       assertErrors(hbck, new ERROR_CODE[] {
1716         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1717       assertFalse(hbck.shouldRerun());
1718       hbck = doFsck(conf, false);
1719       assertErrors(hbck, new ERROR_CODE[] {
1720         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1721 
1722       // fix lingering split parent
1723       hbck = new HBaseFsck(conf, hbfsckExecutorService);
1724       hbck.connect();
1725       hbck.setDisplayFullReport(); // i.e. -details
1726       hbck.setTimeLag(0);
1727       hbck.setFixSplitParents(true);
1728       hbck.onlineHbck();
1729       assertTrue(hbck.shouldRerun());
1730       hbck.close();
1731 
1732       Get get = new Get(hri.getRegionName());
1733       Result result = meta.get(get);
1734       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1735         HConstants.SPLITA_QUALIFIER).isEmpty());
1736       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1737         HConstants.SPLITB_QUALIFIER).isEmpty());
1738       admin.flush(TableName.META_TABLE_NAME);
1739 
1740       // fix other issues
1741       doFsck(conf, true);
1742 
1743       // check that all are fixed
1744       assertNoErrors(doFsck(conf, false));
1745       assertEquals(ROWKEYS.length, countRows());
1746     } finally {
1747       cleanupTable(table);
1748       IOUtils.closeQuietly(meta);
1749     }
1750   }
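       /*
        * The programmatic setFixSplitParents(true) used above corresponds to hbck's
        * -fixSplitParents command-line option (flag name assumed from the setter; verify
        * against the hbck usage text for the version in use).
        */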
1751 
1752   /**
1753    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1754    * valid cases where the daughters are there.
1755    */
1756   @Test (timeout=180000)
1757   public void testValidLingeringSplitParent() throws Exception {
1758     TableName table =
1759         TableName.valueOf("testLingeringSplitParent");
1760     Table meta = null;
1761     try {
1762       setupTable(table);
1763       assertEquals(ROWKEYS.length, countRows());
1764 
1765       // make sure data is in the region files; if it were only in the wal there would be no data loss
1766       admin.flush(table);
1767       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1768 
1769       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1770       HRegionInfo hri = location.getRegionInfo();
1771 
1772       // do a regular split
1773       byte[] regionName = location.getRegionInfo().getRegionName();
1774       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1775       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1776 
1777       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1778       // for some time until children references are deleted. HBCK erroneously sees this as
1779       // overlapping regions
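           // Readability note: each boolean in this doFsck overload toggles one hbck fix option
           // (assignments, meta, hdfs holes/orphans/overlaps, version file, reference files,
           // etc.) in the order declared by HbckTestingUtil.doFsck; consult that utility for
           // the exact mapping rather than relying on position here.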
1780       HBaseFsck hbck = doFsck(
1781         conf, true, true, false, false, false, true, true, true, true, false, false, false, null);
1782       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1783 
1784       // assert that the split hbase:meta entry is still there.
1785       Get get = new Get(hri.getRegionName());
1786       Result result = meta.get(get);
1787       assertNotNull(result);
1788       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1789 
1790       assertEquals(ROWKEYS.length, countRows());
1791 
1792       // assert that we still have the split regions
1793       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); // SPLITS.length + 1 is # of regions pre-split.
1794       assertNoErrors(doFsck(conf, false));
1795     } finally {
1796       cleanupTable(table);
1797       IOUtils.closeQuietly(meta);
1798     }
1799   }
1800 
1801   /**
1802    * Split crashed after write to hbase:meta finished for the parent region, but
1803    * failed to write daughters (pre HBASE-7721 codebase)
1804    */
1805   @Test(timeout=75000)
1806   public void testSplitDaughtersNotInMeta() throws Exception {
1807     TableName table = TableName.valueOf("testSplitdaughtersNotInMeta");
1808     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1809     try {
1810       setupTable(table);
1811       assertEquals(ROWKEYS.length, countRows());
1812 
1813       // make sure data is in the region files; if it were only in the wal there would be no data loss
1814       admin.flush(table);
1815       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1816 
1817       HRegionInfo hri = location.getRegionInfo();
1818 
1819       // do a regular split
1820       byte[] regionName = location.getRegionInfo().getRegionName();
1821       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1822       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1823 
1824       PairOfSameType<HRegionInfo> daughters =
1825           MetaTableAccessor.getDaughterRegions(meta.get(new Get(regionName)));
1826 
1827       // Delete daughter regions from meta, but not hdfs, unassign it.
1828       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1829       undeployRegion(connection, hris.get(daughters.getFirst()), daughters.getFirst());
1830       undeployRegion(connection, hris.get(daughters.getSecond()), daughters.getSecond());
1831 
1832       List<Delete> deletes = new ArrayList<>();
1833       deletes.add(new Delete(daughters.getFirst().getRegionName()));
1834       deletes.add(new Delete(daughters.getSecond().getRegionName()));
1835       meta.delete(deletes);
1836 
1837       // Remove daughters from regionStates
1838       RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
1839           getAssignmentManager().getRegionStates();
1840       regionStates.deleteRegion(daughters.getFirst());
1841       regionStates.deleteRegion(daughters.getSecond());
1842 
1843       HBaseFsck hbck = doFsck(conf, false);
1844       assertErrors(hbck,
1845           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1846               ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT
1847 
1848       // now fix it. The fix should not revert the region split, but add daughters to META
1849       hbck = doFsck(
1850         conf, true, true, false, false, false, false, false, false, false, false, false, false, null);
1851       assertErrors(hbck,
1852           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1853               ERROR_CODE.HOLE_IN_REGION_CHAIN });
1854 
1855       // assert that the split hbase:meta entry is still there.
1856       Get get = new Get(hri.getRegionName());
1857       Result result = meta.get(get);
1858       assertNotNull(result);
1859       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1860 
1861       assertEquals(ROWKEYS.length, countRows());
1862 
1863       // assert that we still have the split regions
1864       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); // SPLITS.length + 1 is # of regions pre-split.
1865       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1866     } finally {
1867       meta.close();
1868       cleanupTable(table);
1869     }
1870   }
1871 
1872   /**
1873    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1874    * meta and data missing in the fs.
1875    */
1876   @Test(timeout=120000)
1877   public void testMissingFirstRegion() throws Exception {
1878     TableName table = TableName.valueOf("testMissingFirstRegion");
1879     try {
1880       setupTable(table);
1881       assertEquals(ROWKEYS.length, countRows());
1882 
1883       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1884       admin.disableTable(table);
1885       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1886           true, true);
1887       admin.enableTable(table);
1888 
1889       HBaseFsck hbck = doFsck(conf, false);
1890       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1891       // fix hole
1892       doFsck(conf, true);
1893       // check that hole fixed
1894       assertNoErrors(doFsck(conf, false));
1895     } finally {
1896       cleanupTable(table);
1897     }
1898   }
1899 
1900   /**
1901    * This creates and fixes a bad table with a region that is in meta and deployed but whose
1902    * data is missing from the fs.
1903    */
1904   @Test(timeout=120000)
1905   public void testRegionDeployedNotInHdfs() throws Exception {
1906     TableName table =
1907         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1908     try {
1909       setupTable(table);
1910       admin.flush(table);
1911 
1912       // Mess it up by deleting region dir
1913       deleteRegion(conf, tbl.getTableDescriptor(),
1914         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1915         false, true);
1916 
1917       HBaseFsck hbck = doFsck(conf, false);
1918       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1919       // fix hole
1920       doFsck(conf, true);
1921       // check that hole fixed
1922       assertNoErrors(doFsck(conf, false));
1923     } finally {
1924       cleanupTable(table);
1925     }
1926   }
1927 
1928   /**
1929    * This creates and fixes a bad table with a missing last region -- hole in meta and data missing in
1930    * the fs.
1931    */
1932   @Test(timeout=120000)
1933   public void testMissingLastRegion() throws Exception {
1934     TableName table =
1935         TableName.valueOf("testMissingLastRegion");
1936     try {
1937       setupTable(table);
1938       assertEquals(ROWKEYS.length, countRows());
1939 
1940       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1941       admin.disableTable(table);
1942       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1943           true, true);
1944       admin.enableTable(table);
1945 
1946       HBaseFsck hbck = doFsck(conf, false);
1947       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1948       // fix hole
1949       doFsck(conf, true);
1950       // check that hole fixed
1951       assertNoErrors(doFsck(conf, false));
1952     } finally {
1953       cleanupTable(table);
1954     }
1955   }
1956 
1957   /**
1958    * Test that the -noHdfsChecking option can detect and fix assignment issues.
1959    */
1960   @Test (timeout=180000)
1961   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1962     TableName table =
1963         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1964     try {
1965       setupTable(table);
1966       assertEquals(ROWKEYS.length, countRows());
1967 
1968       // Mess it up by closing a region
1969       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1970         Bytes.toBytes("B"), true, false, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
1971 
1972       // verify there are no other errors
1973       HBaseFsck hbck = doFsck(conf, false);
1974       assertErrors(hbck, new ERROR_CODE[] {
1975         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1976 
1977       // verify that noHdfsChecking reports the same errors
1978       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1979       fsck.connect();
1980       fsck.setDisplayFullReport(); // i.e. -details
1981       fsck.setTimeLag(0);
1982       fsck.setCheckHdfs(false);
1983       fsck.onlineHbck();
1984       assertErrors(fsck, new ERROR_CODE[] {
1985         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1986       fsck.close();
1987 
1988       // verify that fixAssignments works fine with noHdfsChecking
1989       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1990       fsck.connect();
1991       fsck.setDisplayFullReport(); // i.e. -details
1992       fsck.setTimeLag(0);
1993       fsck.setCheckHdfs(false);
1994       fsck.setFixAssignments(true);
1995       fsck.onlineHbck();
1996       assertTrue(fsck.shouldRerun());
1997       fsck.onlineHbck();
1998       assertNoErrors(fsck);
1999 
2000       assertEquals(ROWKEYS.length, countRows());
2001 
2002       fsck.close();
2003     } finally {
2004       cleanupTable(table);
2005     }
2006   }
2007 
2008   /**
2009    * Test that the -noHdfsChecking option can detect a region that is not in meta but is
2010    * deployed. However, it cannot fix the problem without checking hdfs, because the region
2011    * info must be read from hdfs before meta can be patched.
2012    */
2013   @Test (timeout=180000)
2014   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
2015     TableName table =
2016         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
2017     try {
2018       setupTable(table);
2019       assertEquals(ROWKEYS.length, countRows());
2020 
2021       // Mess it up by deleting a region from the metadata
2022       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
2023         Bytes.toBytes("B"), false, true, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
2024 
2025       // verify there are no other errors
2026       HBaseFsck hbck = doFsck(conf, false);
2027       assertErrors(hbck,
2028           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
2029 
2030       // verify that noHdfsChecking reports the same errors
2031       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
2032       fsck.connect();
2033       fsck.setDisplayFullReport(); // i.e. -details
2034       fsck.setTimeLag(0);
2035       fsck.setCheckHdfs(false);
2036       fsck.onlineHbck();
2037       assertErrors(fsck,
2038           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
2039       fsck.close();
2040 
2041       // verify that fixMeta doesn't work with noHdfsChecking
2042       fsck = new HBaseFsck(conf, hbfsckExecutorService);
2043       fsck.connect();
2044       fsck.setDisplayFullReport(); // i.e. -details
2045       fsck.setTimeLag(0);
2046       fsck.setCheckHdfs(false);
2047       fsck.setFixAssignments(true);
2048       fsck.setFixMeta(true);
2049       fsck.onlineHbck();
2050       assertFalse(fsck.shouldRerun());
2051       assertErrors(fsck,
2052           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
2053       fsck.close();
2054 
2055       // fix the cluster so other tests won't be impacted
2056       fsck = doFsck(conf, true);
2057       assertTrue(fsck.shouldRerun());
2058       fsck = doFsck(conf, true);
2059       assertNoErrors(fsck);
2060     } finally {
2061       cleanupTable(table);
2062     }
2063   }
2064 
2065   /**
2066    * Test that -fixHdfsHoles does not work with the -noHdfsChecking option,
2067    * and that -noHdfsChecking cannot detect an orphan hdfs region.
2068    */
2069   @Test (timeout=180000)
2070   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
2071     TableName table =
2072         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
2073     try {
2074       setupTable(table);
2075       assertEquals(ROWKEYS.length, countRows());
2076 
2077       // Mess it up by creating an overlap in the metadata
2078       admin.disableTable(table);
2079       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
2080         Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
2081       TEST_UTIL.getHBaseAdmin().enableTable(table);
2082 
2083       HRegionInfo hriOverlap =
2084           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
2085       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
2086       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
2087         .waitForAssignment(hriOverlap);
2088       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
2089       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
2090 
2091       HBaseFsck hbck = doFsck(conf, false);
2092       assertErrors(hbck, new ERROR_CODE[] {
2093         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2094         ERROR_CODE.HOLE_IN_REGION_CHAIN});
2095 
2096       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
2097       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
2098       fsck.connect();
2099       fsck.setDisplayFullReport(); // i.e. -details
2100       fsck.setTimeLag(0);
2101       fsck.setCheckHdfs(false);
2102       fsck.onlineHbck();
2103       assertErrors(fsck, new ERROR_CODE[] {
2104         ERROR_CODE.HOLE_IN_REGION_CHAIN});
2105       fsck.close();
2106 
2107       // verify that fixHdfsHoles doesn't work with noHdfsChecking
2108       fsck = new HBaseFsck(conf, hbfsckExecutorService);
2109       fsck.connect();
2110       fsck.setDisplayFullReport(); // i.e. -details
2111       fsck.setTimeLag(0);
2112       fsck.setCheckHdfs(false);
2113       fsck.setFixHdfsHoles(true);
2114       fsck.setFixHdfsOverlaps(true);
2115       fsck.setFixHdfsOrphans(true);
2116       fsck.onlineHbck();
2117       assertFalse(fsck.shouldRerun());
2118       assertErrors(fsck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN});
2119       fsck.close();
2120     } finally {
2121       if (admin.isTableDisabled(table)) {
2122         admin.enableTable(table);
2123       }
2124       cleanupTable(table);
2125     }
2126   }
2127 
2128   /**
2129    * We don't have an easy way to verify that a flush completed, so we loop until we find a
2130    * legitimate hfile and return it.
2131    * @param fs the filesystem holding the table's files
2132    * @param table the table whose family directory is scanned
2133    * @return Path of a flushed hfile.
2134    * @throws IOException if listing the family directory fails
2135    */
2136   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
2137     Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2138     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2139     Path famDir = new Path(regionDir, FAM_STR);
2140 
2141     // keep doing this until we get a legit hfile
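         // NOTE: this loop spins without sleeping and relies on the caller's @Test timeout to
         // bound it; listStatus() is re-issued on every pass, so files appear as the flush
         // completes. getFlushedMobFile below uses the same pattern.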
2142     while (true) {
2143       FileStatus[] hfFss = fs.listStatus(famDir);
2144       if (hfFss.length == 0) {
2145         continue;
2146       }
2147       for (FileStatus hfs : hfFss) {
2148         if (!hfs.isDirectory()) {
2149           return hfs.getPath();
2150         }
2151       }
2152     }
2153   }
2154 
2155   /**
2156    * Gets a flushed mob file.
2157    * @param fs The current file system.
2158    * @param table The current table name.
2159    * @return Path of a flushed mob file.
2160    * @throws IOException if listing the mob family directory fails
2161    */
2162   Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException {
2163     Path regionDir = MobUtils.getMobRegionPath(conf, table);
2164     Path famDir = new Path(regionDir, FAM_STR);
2165 
2166     // keep doing this until we get a legit hfile
2167     while (true) {
2168       FileStatus[] hfFss = fs.listStatus(famDir);
2169       if (hfFss.length == 0) {
2170         continue;
2171       }
2172       for (FileStatus hfs : hfFss) {
2173         if (!hfs.isDirectory()) {
2174           return hfs.getPath();
2175         }
2176       }
2177     }
2178   }
2179 
2180   /**
2181    * Creates a new mob file name by the old one.
2182    * @param oldFileName The old mob file name.
2183    * @return The new mob file name.
2184    */
2185   String createMobFileName(String oldFileName) {
2186     MobFileName mobFileName = MobFileName.create(oldFileName);
2187     String startKey = mobFileName.getStartKey();
2188     String date = mobFileName.getDate();
2189     return MobFileName.create(startKey, date, UUID.randomUUID().toString().replaceAll("-", ""))
2190       .getFileName();
2191   }
2192 
2193   /**
2194    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
2195    */
2196   @Test(timeout=180000)
2197   public void testQuarantineCorruptHFile() throws Exception {
2198     TableName table = TableName.valueOf(name.getMethodName());
2199     try {
2200       setupTable(table);
2201       assertEquals(ROWKEYS.length, countRows());
2202       admin.flush(table); // flush is async.
2203 
2204       FileSystem fs = FileSystem.get(conf);
2205       Path hfile = getFlushedHFile(fs, table);
2206 
2207       // Disable the table so its hfiles can be worked on safely.
2208       admin.disableTable(table);
2209 
2210       // create new corrupt file called deadbeef (valid hfile name)
2211       Path corrupt = new Path(hfile.getParent(), "deadbeef");
2212       TestHFile.truncateFile(fs, hfile, corrupt);
2213       LOG.info("Created corrupted file " + corrupt);
2214       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
2215 
2216       // we cannot enable here because enable never finishes due to the corrupt region.
2217       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
2218       assertEquals(res.getRetCode(), 0);
2219       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2220       assertEquals(hfcc.getHFilesChecked(), 5);
2221       assertEquals(hfcc.getCorrupted().size(), 1);
2222       assertEquals(hfcc.getFailures().size(), 0);
2223       assertEquals(hfcc.getQuarantined().size(), 1);
2224       assertEquals(hfcc.getMissing().size(), 0);
2225 
2226       // It's been fixed; verify that we can enable.
2227       admin.enableTable(table);
2228     } finally {
2229       cleanupTable(table);
2230     }
2231   }
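       /*
        * HbckTestingUtil.doHFileQuarantine drives the same code path as hbck's corrupt-hfile
        * handling on the command line (a sketch; the flag spelling matches the argument list
        * used by doQuarantineTest below):
        *
        *   $ hbase hbck -sidelineCorruptHFiles <table>
        */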
2232 
2233   /**
2234    * This creates a table and then corrupts a mob file.  Hbck should quarantine the file.
2235    */
2236   @Test(timeout=180000)
2237   public void testQuarantineCorruptMobFile() throws Exception {
2238     TableName table = TableName.valueOf(name.getMethodName());
2239     try {
2240       setupMobTable(table);
2241       assertEquals(ROWKEYS.length, countRows());
2242       admin.flush(table);
2243 
2244       FileSystem fs = FileSystem.get(conf);
2245       Path mobFile = getFlushedMobFile(fs, table);
2246       admin.disableTable(table);
2247       // create new corrupt mob file.
2248       String corruptMobFile = createMobFileName(mobFile.getName());
2249       Path corrupt = new Path(mobFile.getParent(), corruptMobFile);
2250       TestHFile.truncateFile(fs, mobFile, corrupt);
2251       LOG.info("Created corrupted mob file " + corrupt);
2252       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
2253       HBaseFsck.debugLsr(conf, MobUtils.getMobHome(conf));
2254 
2255       // A corrupt mob file doesn't abort the start of regions, so we can enable the table.
2256       admin.enableTable(table);
2257       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
2258       assertEquals(res.getRetCode(), 0);
2259       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2260       assertEquals(hfcc.getHFilesChecked(), 4);
2261       assertEquals(hfcc.getCorrupted().size(), 0);
2262       assertEquals(hfcc.getFailures().size(), 0);
2263       assertEquals(hfcc.getQuarantined().size(), 0);
2264       assertEquals(hfcc.getMissing().size(), 0);
2265       assertEquals(hfcc.getMobFilesChecked(), 5);
2266       assertEquals(hfcc.getCorruptedMobFiles().size(), 1);
2267       assertEquals(hfcc.getFailureMobFiles().size(), 0);
2268       assertEquals(hfcc.getQuarantinedMobFiles().size(), 1);
2269       assertEquals(hfcc.getMissedMobFiles().size(), 0);
2270       String quarantinedMobFile = hfcc.getQuarantinedMobFiles().iterator().next().getName();
2271       assertEquals(corruptMobFile, quarantinedMobFile);
2272     } finally {
2273       cleanupTable(table);
2274     }
2275   }
2276 
2277   /**
2278    * Tests that use this should have a timeout, because this method could potentially wait forever.
2279    */
2280   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
2281                                 int corrupt, int fail, int quar, int missing) throws Exception {
2282     try {
2283       setupTable(table);
2284       assertEquals(ROWKEYS.length, countRows());
2285       admin.flush(table); // flush is async.
2286 
2287       // Disable the table so its hfiles can be worked on safely.
2288       admin.disableTable(table);
2289 
2290       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
2291           table.getNameAsString()};
2292       HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
2293 
2294       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2295       assertEquals(hfcc.getHFilesChecked(), check);
2296       assertEquals(hfcc.getCorrupted().size(), corrupt);
2297       assertEquals(hfcc.getFailures().size(), fail);
2298       assertEquals(hfcc.getQuarantined().size(), quar);
2299       assertEquals(hfcc.getMissing().size(), missing);
2300 
2301       // it's been fixed; verify that we can enable
2302       admin.enableTableAsync(table);
2303       while (!admin.isTableEnabled(table)) {
2304         try {
2305           Thread.sleep(250);
2306         } catch (InterruptedException e) {
2307           e.printStackTrace();
2308           fail("Interrupted when trying to enable table " + table);
2309         }
2310       }
2311     } finally {
2312       cleanupTable(table);
2313     }
2314   }
2315 
2316   /**
2317    * This creates a table and simulates the race situation where a concurrent compaction or split
2318    * has removed an hfile after the corruption checker learned about it.
2319    */
2320   @Test(timeout=180000)
2321   public void testQuarantineMissingHFile() throws Exception {
2322     TableName table = TableName.valueOf(name.getMethodName());
2323 
2324     // inject a fault in the hfcc created.
2325     final FileSystem fs = FileSystem.get(conf);
2326     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2327       @Override
2328       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2329         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2330           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2331           @Override
2332           protected void checkHFile(Path p) throws IOException {
2333             if (attemptedFirstHFile.compareAndSet(false, true)) {
2334               assertTrue(fs.delete(p, true)); // make sure delete happened.
2335             }
2336             super.checkHFile(p);
2337           }
2338         };
2339       }
2340     };
2341     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
2342     hbck.close();
2343   }
2344 
2345   /**
2346    * This creates a table and simulates the race situation where a concurrent compaction or split
2347    * has removed a colfam dir before the corruption checker got to it.
2348    */
2349   // Disabled because fails sporadically.  Is this test right?  Timing-wise, there could be no
2350   // files in a column family on initial creation -- as suggested by Matteo.
2351   @Ignore @Test(timeout=180000)
2352   public void testQuarantineMissingFamdir() throws Exception {
2353     TableName table = TableName.valueOf(name.getMethodName());
2354     // inject a fault in the hfcc created.
2355     final FileSystem fs = FileSystem.get(conf);
2356     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2357       @Override
2358       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2359         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2360           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2361           @Override
2362           protected void checkColFamDir(Path p) throws IOException {
2363             if (attemptedFirstHFile.compareAndSet(false, true)) {
2364               assertTrue(fs.delete(p, true)); // make sure delete happened.
2365             }
2366             super.checkColFamDir(p);
2367           }
2368         };
2369       }
2370     };
2371     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2372     hbck.close();
2373   }
2374 
2375   /**
2376    * This creates a table and simulates the race situation where a concurrent compaction or split
2377    * has removed a region dir before the corruption checker got to it.
2378    */
2379   @Test(timeout=180000)
2380   public void testQuarantineMissingRegionDir() throws Exception {
2381     TableName table = TableName.valueOf(name.getMethodName());
2382     // inject a fault in the hfcc created.
2383     final FileSystem fs = FileSystem.get(conf);
2384     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2385       @Override
2386       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
2387       throws IOException {
2388         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2389           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2390           @Override
2391           protected void checkRegionDir(Path p) throws IOException {
2392             if (attemptedFirstHFile.compareAndSet(false, true)) {
2393               assertTrue(fs.delete(p, true)); // make sure delete happened.
2394             }
2395             super.checkRegionDir(p);
2396           }
2397         };
2398       }
2399     };
2400     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2401     hbck.close();
2402   }
2403 
2404   /**
2405    * Test fixing a lingering reference file.
2406    */
2407   @Test (timeout=180000)
2408   public void testLingeringReferenceFile() throws Exception {
2409     TableName table =
2410         TableName.valueOf("testLingeringReferenceFile");
2411     try {
2412       setupTable(table);
2413       assertEquals(ROWKEYS.length, countRows());
2414 
2415       // Mess it up by creating a fake reference file
2416       FileSystem fs = FileSystem.get(conf);
2417       Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2418       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2419       Path famDir = new Path(regionDir, FAM_STR);
2420       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2421       fs.create(fakeReferenceFile);
2422 
2423       HBaseFsck hbck = doFsck(conf, false);
2424       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2425       // fix reference file
2426       doFsck(conf, true);
2427       // check that reference file fixed
2428       assertNoErrors(doFsck(conf, false));
2429     } finally {
2430       cleanupTable(table);
2431     }
2432   }
2433 
2434   /**
2435    * Test fixing lingering HFileLinks.
2436    */
2437   @Test(timeout = 180000)
2438   public void testLingeringHFileLinks() throws Exception {
2439     TableName table = TableName.valueOf("testLingeringHFileLinks");
2440     try {
2441       setupTable(table);
2442 
2443       FileSystem fs = FileSystem.get(conf);
2444       Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2445       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2446       String regionName = regionDir.getName();
2447       Path famDir = new Path(regionDir, FAM_STR);
2448       String HFILE_NAME = "01234567abcd";
2449       Path hFilePath = new Path(famDir, HFILE_NAME);
2450 
2451       // creating HFile
2452       HFileContext context = new HFileContextBuilder().withIncludesTags(false).build();
2453       HFile.Writer w =
2454           HFile.getWriterFactoryNoCache(conf).withPath(fs, hFilePath).withFileContext(context)
2455               .create();
2456       w.close();
2457 
2458       HFileLink.create(conf, fs, famDir, table, regionName, HFILE_NAME);
2459 
2460       // should report no error
2461       HBaseFsck hbck = doFsck(conf, false);
2462       assertNoErrors(hbck);
2463 
2464       // Delete linked file
2465       fs.delete(hFilePath, true);
2466 
2467       // Check without fix should show the error
2468       hbck = doFsck(conf, false);
2469       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2470           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2471 
2472       // Fixing the error
2473       hbck = doFsck(conf, true);
2474       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2475           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2476 
2477       // Fix should sideline these files, thus preventing the error
2478       hbck = doFsck(conf, false);
2479       assertNoErrors(hbck);
2480     } finally {
2481       cleanupTable(table);
2482     }
2483   }
2484 
2485   @Test(timeout = 180000)
2486   public void testCorruptLinkDirectory() throws Exception {
2487     TableName table = TableName.valueOf("testLingeringHFileLinks");
2488     try {
2489       setupTable(table);
2490       FileSystem fs = FileSystem.get(conf);
2491 
2492       Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2493       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2494       Path famDir = new Path(regionDir, FAM_STR);
2495       String regionName = regionDir.getName();
2496       String HFILE_NAME = "01234567abcd";
2497       String link = HFileLink.createHFileLinkName(table, regionName, HFILE_NAME);
2498 
2499       // should report no error
2500       HBaseFsck hbck = doFsck(conf, false);
2501       assertNoErrors(hbck);
2502 
2503       // create a directory containing a file in place of the HFileLink file
2504       fs.mkdirs(new Path(famDir, link));
2505       fs.create(new Path(new Path(famDir, link), "somefile"));
2506 
2507       // Check without fix should show the error
2508       hbck = doFsck(conf, false);
2509       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2510           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2511 
2512       // Fixing the error
2513       hbck = doFsck(conf, true);
2514       assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
2515           HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_HFILELINK });
2516 
2517       // Fix should sideline these files, thus preventing the error
2518       hbck = doFsck(conf, false);
2519       assertNoErrors(hbck);
2520     } finally {
2521       cleanupTable(table);
2522     }
2523   }
2524 
2525   /**
2526    * Test a missing REGIONINFO_QUALIFIER in hbase:meta.
2527    */
2528   @Test (timeout=180000)
2529   public void testMissingRegionInfoQualifier() throws Exception {
2530     Connection connection = ConnectionFactory.createConnection(conf);
2531     TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
2532     try {
2533       setupTable(table);
2534 
2535       // Mess it up by removing the RegionInfo for one region.
2536       final List<Delete> deletes = new LinkedList<Delete>();
2537       Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
2538       MetaScanner.metaScan(connection, new MetaScanner.MetaScannerVisitor() {
2539 
2540         @Override
2541         public boolean processRow(Result rowResult) throws IOException {
2542           HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
2543           if (hri != null && !hri.getTable().isSystemTable()) {
2544             Delete delete = new Delete(rowResult.getRow());
2545             delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2546             deletes.add(delete);
2547           }
2548           return true;
2549         }
2550 
2551         @Override
2552         public void close() throws IOException {
2553         }
2554       });
2555       meta.delete(deletes);
2556 
2557       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
2558       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2559         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2560       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2561         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2562       meta.close();
2563 
2564       HBaseFsck hbck = doFsck(conf, false);
2565       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2566 
2567       // fix the empty meta cells
2568       hbck = doFsck(conf, true);
2569 
2570       // check that the empty meta cells were fixed
2571       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2572     } finally {
2573       cleanupTable(table);
2574     }
2575     connection.close();
2576   }
2577 
2578   /**
2579    * Test the pluggable error reporter. It can be plugged in
2580    * via a system property or the configuration.
2581    */
2582   @Test (timeout=180000)
2583   public void testErrorReporter() throws Exception {
2584     try {
2585       MockErrorReporter.calledCount = 0;
2586       doFsck(conf, false);
2587       assertEquals(MockErrorReporter.calledCount, 0);
2588 
2589       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2590       doFsck(conf, false);
2591       assertTrue(MockErrorReporter.calledCount > 20);
2592     } finally {
2593       conf.set("hbasefsck.errorreporter",
2594         PrintingErrorReporter.class.getName());
2595       MockErrorReporter.calledCount = 0;
2596     }
2597   }
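       /*
        * The configuration route is exercised above; the system-property route mentioned in
        * the javadoc would look roughly like this on the command line (property key assumed to
        * match the configuration key; com.example.MyReporter is a placeholder):
        *
        *   $ HBASE_OPTS="-Dhbasefsck.errorreporter=com.example.MyReporter" hbase hbck
        */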
2598 
2599   static class MockErrorReporter implements ErrorReporter {
2600     static int calledCount = 0;
2601 
2602     @Override
2603     public void clear() {
2604       calledCount++;
2605     }
2606 
2607     @Override
2608     public void report(String message) {
2609       calledCount++;
2610     }
2611 
2612     @Override
2613     public void reportError(String message) {
2614       calledCount++;
2615     }
2616 
2617     @Override
2618     public void reportError(ERROR_CODE errorCode, String message) {
2619       calledCount++;
2620     }
2621 
2622     @Override
2623     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2624       calledCount++;
2625     }
2626 
2627     @Override
2628     public void reportError(ERROR_CODE errorCode,
2629         String message, TableInfo table, HbckInfo info) {
2630       calledCount++;
2631     }
2632 
2633     @Override
2634     public void reportError(ERROR_CODE errorCode, String message,
2635         TableInfo table, HbckInfo info1, HbckInfo info2) {
2636       calledCount++;
2637     }
2638 
2639     @Override
2640     public int summarize() {
2641       return ++calledCount;
2642     }
2643 
2644     @Override
2645     public void detail(String details) {
2646       calledCount++;
2647     }
2648 
2649     @Override
2650     public ArrayList<ERROR_CODE> getErrorList() {
2651       calledCount++;
2652       return new ArrayList<ERROR_CODE>();
2653     }
2654 
2655     @Override
2656     public void progress() {
2657       calledCount++;
2658     }
2659 
2660     @Override
2661     public void print(String message) {
2662       calledCount++;
2663     }
2664 
2665     @Override
2666     public void resetErrors() {
2667       calledCount++;
2668     }
2669 
2670     @Override
2671     public boolean tableHasErrors(TableInfo table) {
2672       calledCount++;
2673       return false;
2674     }
2675   }
2676 
2677   @Test(timeout=180000)
2678   public void testCheckTableLocks() throws Exception {
2679     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2680     EnvironmentEdgeManager.injectEdge(edge);
2681     // check no errors
2682     HBaseFsck hbck = doFsck(conf, false);
2683     assertNoErrors(hbck);
2684 
2685     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2686     final TableName tableName = TableName.valueOf("foo");
2687 
2688     // obtain one lock
2689     final TableLockManager tableLockManager =
2690       TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2691     TableLock writeLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2692     writeLock.acquire();
2693     hbck = doFsck(conf, false);
2694     assertNoErrors(hbck); // should not have expired, no problems
2695 
2696     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2697         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2698 
2699     hbck = doFsck(conf, false);
2700     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2701 
2702     final CountDownLatch latch = new CountDownLatch(1);
2703     new Thread() {
2704       @Override
2705       public void run() {
2706         TableLock secondWriteLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2707         try {
2708           latch.countDown();
2709           secondWriteLock.acquire();
2710         } catch (IOException ex) {
2711           fail();
2712         } catch (IllegalStateException ex) {
2713           return; // expected, since this will be reaped under us.
2714         }
2715         fail("should not have come here");
2716       };
2717     }.start();
2718 
2719     latch.await(); // wait until thread starts
2720     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2721 
2722     hbck = doFsck(conf, false);
2723     // still one expired, one not-expired
2724     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2725 
2726     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2727         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2728 
2729     hbck = doFsck(conf, false);
2730     // both are expired
2731     assertErrors(
2732       hbck,
2733       new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK});
2734 
2735     Configuration localConf = new Configuration(conf);
2736     // reaping from ZKInterProcessWriteLock uses znode cTime,
2737     // which is not injectable through EnvironmentEdge
2738     localConf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1);
2739     Threads.sleep(10);
2740     hbck = doFsck(localConf, true); // now fix both cases
2741 
2742     hbck = doFsck(localConf, false);
2743     assertNoErrors(hbck);
2744 
2745     // ensure that locks are deleted
2746     writeLock = tableLockManager.writeLock(tableName, "should acquire without blocking");
2747     writeLock.acquire(); // this should not block.
2748     writeLock.release(); // release for clean state
2749     tableLockManager.tableDeleted(tableName);
2750   }
2751 
  /**
   * Test that hbck detects and fixes an orphaned table znode (left-over table state in ZK).
   */
  @Test
  public void testOrphanedTableZNode() throws Exception {
    TableName table = TableName.valueOf("testOrphanedZKTableEntry");

    try {
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getTableStateManager()
      .setTableState(table, ZooKeeperProtos.Table.State.ENABLING);

      try {
        setupTable(table);
        Assert.fail(
          "Create table should fail when its znode already exists in ENABLING state.");
      } catch (TableExistsException t) {
        // expected exception
      }
      // Table setup was interrupted partway through, so some cleanup may be needed.
      try {
        cleanupTable(table);
      } catch (IOException e) {
        // Since table creation failed, cleanup is expected to throw; ignore and continue.
      }

      HBaseFsck hbck = doFsck(conf, false);
      assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));

      // fix the orphaned ZK entry
      hbck = doFsck(conf, true);

      // check that the orphaned ZK table entry is gone.
      hbck = doFsck(conf, false);
      assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
      // Now create table should succeed.
      setupTable(table);
    } finally {
      // We reach here whether the table was created successfully or setup failed in some
      // unknown state, so cleanup may either succeed or fail.
      try {
        cleanupTable(table);
      } catch (IOException e) {
        // Cleanup throws if table creation failed partway; ignore this exception.
      }
    }
  }

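  /**
   * Tests that hbck detects an offline hbase:meta region and that fix mode brings it back.
   */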
  @Test (timeout=180000)
  public void testMetaOffline() throws Exception {
    // check no errors
    HBaseFsck hbck = doFsck(conf, false);
    assertNoErrors(hbck);
    deleteMetaRegion(conf, true, false, false);
    hbck = doFsck(conf, false);
    // ERROR_CODE.UNKNOWN shows up because hbck reports a generic message describing the
    // hbase:meta inconsistency and whether it will be fixed or not.
    assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
    hbck = doFsck(conf, true);
    assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
    hbck = doFsck(conf, false);
    assertNoErrors(hbck);
  }

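  /**
   * Damages the hbase:meta region in a configurable way.
   *
   * @param unassign whether to undeploy the meta region from its region server
   * @param hdfs whether to delete the meta region's entire directory in HDFS
   * @param regionInfoOnly whether to delete only the .regioninfo file in HDFS
   */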
  private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
      boolean regionInfoOnly) throws IOException, InterruptedException {
    HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
        .getRegionLocation(HConstants.EMPTY_START_ROW);
    ServerName hsa = metaLocation.getServerName();
    HRegionInfo hri = metaLocation.getRegionInfo();
    if (unassign) {
      LOG.info("Undeploying meta region " + hri + " from server " + hsa);
      try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
        undeployRegion(unmanagedConnection, hsa, hri);
      }
    }

    if (regionInfoOnly) {
      LOG.info("Deleting HDFS .regioninfo data: " + hri + " on " + hsa);
      Path rootDir = FSUtils.getRootDir(conf);
      FileSystem fs = rootDir.getFileSystem(conf);
      Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
          hri.getEncodedName());
      Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
      fs.delete(hriPath, true);
    }

    if (hdfs) {
      LOG.info("Deleting HDFS data: " + hri + " on " + hsa);
      Path rootDir = FSUtils.getRootDir(conf);
      FileSystem fs = rootDir.getFileSystem(conf);
      Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
          hri.getEncodedName());
      HBaseFsck.debugLsr(conf, p);
      boolean success = fs.delete(p, true);
      LOG.info("Deleted " + p + " successfully? " + success);
      HBaseFsck.debugLsr(conf, p);
    }
  }

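  /**
   * Tests that hbck reports NOT_IN_HDFS for a region whose HDFS data is gone and that
   * fix mode repairs the resulting hole.
   */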
  @Test (timeout=180000)
  public void testTableWithNoRegions() throws Exception {
    // We might end up with a table whose regions have no data in HDFS;
    // see also testNoHdfsTable()
    TableName table =
        TableName.valueOf(name.getMethodName());
    try {
      // create table with one region
      HTableDescriptor desc = new HTableDescriptor(table);
      HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
      desc.addFamily(hcd); // If a table has no CFs it doesn't get checked
      createTable(TEST_UTIL, desc, null);
      tbl = (HTable) connection.getTable(table, tableExecutorService);

      // Mess it up by deleting the region's HDFS data; it stays assigned and in hbase:meta
      deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
          HConstants.EMPTY_END_ROW, false, false, true);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });

      doFsck(conf, true);

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
    } finally {
      cleanupTable(table);
    }
  }

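  /**
   * Tests that a table remains consistent (no hbck errors, no lost rows) after two of
   * its regions are merged while the CatalogJanitor is disabled.
   */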
  @Test (timeout=180000)
  public void testHbckAfterRegionMerge() throws Exception {
    TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
    try {
      // disable CatalogJanitor
      TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // flush so the data is persisted in the regions; data only in the WAL would not
      // be exercised by the merge
      admin.flush(table);
      HRegionInfo region1 = tbl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
      HRegionInfo region2 = tbl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();

      int regionCountBeforeMerge = tbl.getRegionLocations().size();

      assertNotEquals(region1, region2);

      // do a region merge
      admin.mergeRegions(region1.getEncodedNameAsBytes(),
          region2.getEncodedNameAsBytes(), false);

      // wait until the regions are merged
      long timeout = System.currentTimeMillis() + 30 * 1000;
      while (true) {
        if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
          break;
        } else if (System.currentTimeMillis() > timeout) {
          fail("Timed out waiting for regions " + region1.getEncodedName()
              + " and " + region2.getEncodedName() + " to be merged");
        }
        Thread.sleep(10);
      }

      assertEquals(ROWKEYS.length, countRows());

      HBaseFsck hbck = doFsck(conf, false);
      assertNoErrors(hbck); // no errors

    } finally {
      TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
      cleanupTable(table);
    }
  }

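  /**
   * Sanity check that checkRegionBoundaries() runs against valid table directory paths.
   */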
  @Test (timeout = 180000)
  public void testRegionBoundariesCheck() throws Exception {
    HBaseFsck hbck = doFsck(conf, false);
    assertNoErrors(hbck); // no errors
    try {
      hbck.checkRegionBoundaries();
    } catch (IllegalArgumentException e) {
      if (e.getMessage().endsWith("not a valid DFS filename.")) {
        fail("Table directory path is not valid: " + e.getMessage());
      }
    }
  }

  @org.junit.Rule
  public TestName name = new TestName();

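  /**
   * Tests the default of shouldIgnorePreCheckPermission: true for read-only runs,
   * false for fix runs unless explicitly overridden.
   */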
  @Test (timeout=180000)
  public void testReadOnlyProperty() throws Exception {
    // a read-only run skips the permission pre-check
    HBaseFsck hbck = doFsck(conf, false);
    Assert.assertEquals("shouldIgnorePreCheckPermission", true,
      hbck.shouldIgnorePreCheckPermission());

    // a fix run performs the permission pre-check
    hbck = doFsck(conf, true);
    Assert.assertEquals("shouldIgnorePreCheckPermission", false,
      hbck.shouldIgnorePreCheckPermission());

    // unless told explicitly to skip it
    hbck = doFsck(conf, true);
    hbck.setIgnorePreCheckPermission(true);
    Assert.assertEquals("shouldIgnorePreCheckPermission", true,
      hbck.shouldIgnorePreCheckPermission());
  }

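  /**
   * Simulates a split that fails before the point of no return, leaving daughter regions
   * in HDFS that are not referenced in hbase:meta, and verifies that hbck cleans them up.
   */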
  @Test (timeout=180000)
  public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
    TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    try {
      HTableDescriptor desc = new HTableDescriptor(table);
      desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
      createTable(TEST_UTIL, desc, null);
      tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
      for (int i = 0; i < 5; i++) {
        Put p1 = new Put(Bytes.toBytes("r" + i));
        p1.add(Bytes.toBytes("f"), Bytes.toBytes("q1"), Bytes.toBytes("v"));
        tbl.put(p1);
      }
      admin.flush(desc.getTableName());
      List<HRegion> regions = cluster.getRegions(desc.getTableName());
      int serverWith = cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
      HRegionServer regionServer = cluster.getRegionServer(serverWith);
      // start a split but stop before the point of no return, leaving daughter regions
      // on disk that are not referenced in hbase:meta
      SplitTransactionImpl st = new SplitTransactionImpl(regions.get(0), Bytes.toBytes("r3"));
      st.prepare();
      st.stepsBeforePONR(regionServer, regionServer, false);
      AssignmentManager am = cluster.getMaster().getAssignmentManager();
      Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
      for (RegionState state : regionsInTransition.values()) {
        am.regionOffline(state.getRegion());
      }
      ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
      Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
      regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
      am.assign(regionsMap);
      am.waitForAssignment(regions.get(0).getRegionInfo());
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      assertErrors(
        doFsck(
          conf, false, true, false, false, false, false, false, false, false, false, false, false, null),
        new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED });

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(5, countRows());
    } finally {
      if (tbl != null) {
        tbl.close();
        tbl = null;
      }
      cleanupTable(table);
    }
  }

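  /**
   * Master coprocessor that lets tests wait until table create/delete operations have
   * actually completed on the master, since the corresponding Admin calls are asynchronous.
   */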
  public static class MasterSyncObserver extends BaseMasterObserver {
    volatile CountDownLatch tableCreationLatch = null;
    volatile CountDownLatch tableDeletionLatch = null;

    @Override
    public void postCreateTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
      HTableDescriptor desc, HRegionInfo[] regions) throws IOException {
      // the AccessController tests sometimes call postCreateTableHandler() directly,
      // without a latch having been set up
      if (tableCreationLatch != null) {
        tableCreationLatch.countDown();
      }
    }

    @Override
    public void postDeleteTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
                                       TableName tableName)
    throws IOException {
      // the AccessController tests sometimes call postDeleteTableHandler() directly,
      // without a latch having been set up
      if (tableDeletionLatch != null) {
        tableDeletionLatch.countDown();
      }
    }
  }

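  /**
   * Creates a table and blocks until the master has observed the creation and all of the
   * table's regions have been assigned.
   */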
  public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
    byte [][] splitKeys) throws Exception {
    // NOTE: We need a latch because admin is asynchronous,
    // so the postOp coprocessor method may be called after the admin operation has returned.
    MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
      .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
    observer.tableCreationLatch = new CountDownLatch(1);
    if (splitKeys != null) {
      admin.createTable(htd, splitKeys);
    } else {
      admin.createTable(htd);
    }
    observer.tableCreationLatch.await();
    observer.tableCreationLatch = null;
    testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
  }

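  /**
   * Disables (if not already disabled) and deletes a table, blocking until the master
   * has observed the deletion.
   */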
  public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
    throws Exception {
    // NOTE: We need a latch because admin is asynchronous,
    // so the postOp coprocessor method may be called after the admin operation has returned.
    MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
      .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
    observer.tableDeletionLatch = new CountDownLatch(1);
    try {
      admin.disableTable(tableName);
    } catch (Exception e) {
      LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
    }
    admin.deleteTable(tableName);
    observer.tableDeletionLatch.await();
    observer.tableDeletionLatch = null;
  }
}