
/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.assertNoErrors;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.doFsck;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableExistsException;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.RegionLocations;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.MetaScanner;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.RowMutations;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.BaseMasterObserver;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.io.hfile.TestHFile;
import org.apache.hadoop.hbase.master.AssignmentManager;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionStates;
import org.apache.hadoop.hbase.master.TableLockManager;
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
import org.apache.hadoop.hbase.util.HBaseFsck.PrintingErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;

import com.google.common.collect.Multimap;

/**
 * This tests HBaseFsck's ability to detect reasons for inconsistent tables.
 */
@Category(LargeTests.class)
public class TestHBaseFsck {
  static final int POOL_SIZE = 7;
  private static final Log LOG = LogFactory.getLog(TestHBaseFsck.class);
  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  private final static Configuration conf = TEST_UTIL.getConfiguration();
  private final static String FAM_STR = "fam";
  private final static byte[] FAM = Bytes.toBytes(FAM_STR);
  private final static int REGION_ONLINE_TIMEOUT = 800;
  private static RegionStates regionStates;
  private static ExecutorService tableExecutorService;
  private static ScheduledThreadPoolExecutor hbfsckExecutorService;
  private static ClusterConnection connection;
  private static Admin admin;

  // per-test instance state, reset for every test run
  private HTable tbl;
  private final static byte[][] SPLITS = new byte[][] { Bytes.toBytes("A"),
    Bytes.toBytes("B"), Bytes.toBytes("C") };
  // one row per region.
  private final static byte[][] ROWKEYS = new byte[][] {
    Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
    Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };

  @BeforeClass
  public static void setUpBeforeClass() throws Exception {
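    // Register the test's MasterSyncObserver so the createTable/deleteTable
    // helpers used below can synchronize with master DDL operations.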
    TEST_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
      MasterSyncObserver.class.getName());

    conf.setInt("hbase.regionserver.handler.count", 2);
    conf.setInt("hbase.regionserver.metahandler.count", 30);

    conf.setInt("hbase.htable.threads.max", POOL_SIZE);
    conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
    conf.setInt("hbase.hconnection.threads.core", POOL_SIZE);
    conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
    conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 8 * REGION_ONLINE_TIMEOUT);
    TEST_UTIL.startMiniCluster(3);

    tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
        new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));

    hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);

    AssignmentManager assignmentManager =
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
    regionStates = assignmentManager.getRegionStates();

    connection = (ClusterConnection) TEST_UTIL.getConnection();

    admin = connection.getAdmin();
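    // Disable the balancer so regions stay where the tests place them.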
    admin.setBalancerRunning(false, true);

    TEST_UTIL.waitUntilAllRegionsAssigned(TableName.META_TABLE_NAME);
    TEST_UTIL.waitUntilAllRegionsAssigned(TableName.NAMESPACE_TABLE_NAME);
  }

  @AfterClass
  public static void tearDownAfterClass() throws Exception {
    tableExecutorService.shutdown();
    hbfsckExecutorService.shutdown();
    admin.close();
    TEST_UTIL.shutdownMiniCluster();
  }

  @Before
  public void setUp() {
    EnvironmentEdgeManager.reset();
  }

  @Test (timeout=180000)
  public void testHBaseFsck() throws Exception {
    assertNoErrors(doFsck(conf, false));
    TableName table = TableName.valueOf("tableBadMetaAssign");
    HTableDescriptor desc = new HTableDescriptor(table);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, null);

    // We created 1 table, should be fine
    assertNoErrors(doFsck(conf, false));

    // Now let's mess it up and change the assignment in hbase:meta to
    // point to a different region server
    Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    Scan scan = new Scan();
    scan.setStartRow(Bytes.toBytes(table + ",,"));
    ResultScanner scanner = meta.getScanner(scan);
    HRegionInfo hri = null;

    Result res = scanner.next();
    ServerName currServer =
      ServerName.parseFrom(res.getValue(HConstants.CATALOG_FAMILY,
          HConstants.SERVER_QUALIFIER));
    long startCode = Bytes.toLong(res.getValue(HConstants.CATALOG_FAMILY,
        HConstants.STARTCODE_QUALIFIER));

    for (JVMClusterUtil.RegionServerThread rs :
        TEST_UTIL.getHBaseCluster().getRegionServerThreads()) {

      ServerName sn = rs.getRegionServer().getServerName();

      // When we find a diff RS, change the assignment and break
      if (!currServer.getHostAndPort().equals(sn.getHostAndPort()) ||
          startCode != sn.getStartcode()) {
        Put put = new Put(res.getRow());
        put.setDurability(Durability.SKIP_WAL);
        put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
          Bytes.toBytes(sn.getHostAndPort()));
        put.add(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
          Bytes.toBytes(sn.getStartcode()));
        meta.put(put);
        hri = MetaTableAccessor.getHRegionInfo(res);
        break;
      }
    }

    // Try to fix the data
    assertErrors(doFsck(conf, true), new ERROR_CODE[]{
        ERROR_CODE.SERVER_DOES_NOT_MATCH_META});

    TEST_UTIL.getHBaseCluster().getMaster()
      .getAssignmentManager().waitForAssignment(hri);

    // Should be fixed now
    assertNoErrors(doFsck(conf, false));

    // Sanity check: scan the fixed table to make sure it is readable.
    Table t = connection.getTable(table, tableExecutorService);
    ResultScanner s = t.getScanner(new Scan());
    s.close();
    t.close();

    scanner.close();
    meta.close();
  }

  @Test(timeout=180000)
  public void testFixAssignmentsWhenMETAinTransition() throws Exception {
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
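    // Simulate hbase:meta in transition: close the meta region, mark it
    // offline in the master, and delete its location from ZooKeeper.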
    admin.closeRegion(cluster.getServerHoldingMeta(), HRegionInfo.FIRST_META_REGIONINFO);
    regionStates.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
    new MetaTableLocator().deleteMetaLocation(cluster.getMaster().getZooKeeper());
    assertFalse(regionStates.isRegionOnline(HRegionInfo.FIRST_META_REGIONINFO));
    HBaseFsck hbck = doFsck(conf, true);
    assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.UNKNOWN, ERROR_CODE.NO_META_REGION,
        ERROR_CODE.NULL_META_REGION });
    assertNoErrors(doFsck(conf, false));
  }

  /**
   * Create a new region in META.
   */
  private HRegionInfo createRegion(final HTableDescriptor htd, byte[] startKey,
      byte[] endKey) throws IOException {
    Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
    MetaTableAccessor.addRegionToMeta(meta, hri);
    meta.close();
    return hri;
  }

  /**
   * Debugging method to dump the contents of meta.
   */
  private void dumpMeta(TableName tableName) throws IOException {
    List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
    for (byte[] row : metaRows) {
      LOG.info(Bytes.toString(row));
    }
  }

  /**
   * This method is used to undeploy a region -- close it and attempt to
   * remove its state from the Master.
   */
  private void undeployRegion(Connection conn, ServerName sn,
      HRegionInfo hri) throws IOException, InterruptedException {
    try {
      HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) conn, sn, hri);
      if (!hri.isMetaTable()) {
        admin.offline(hri.getRegionName());
      }
    } catch (IOException ioe) {
      LOG.warn("Got exception when attempting to offline region "
          + Bytes.toString(hri.getRegionName()), ioe);
    }
  }

  /**
   * Delete a region from assignments, meta, or completely from hdfs.
   * @param unassign if true unassign region if assigned
   * @param metaRow  if true remove region's row from META
   * @param hdfs if true remove region's dir in HDFS
   */
  private void deleteRegion(Configuration conf, final HTableDescriptor htd,
      byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
      boolean hdfs) throws IOException, InterruptedException {
    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
        HRegionInfo.DEFAULT_REPLICA_ID);
  }

  /**
   * Delete a region from assignments, meta, or completely from hdfs.
   * @param unassign if true unassign region if assigned
   * @param metaRow  if true remove region's row from META
   * @param hdfs if true remove region's dir in HDFS
   * @param regionInfoOnly if true remove a region dir's .regioninfo file
   * @param replicaId replica id
   */
  private void deleteRegion(Configuration conf, final HTableDescriptor htd,
      byte[] startKey, byte[] endKey, boolean unassign, boolean metaRow,
      boolean hdfs, boolean regionInfoOnly, int replicaId)
          throws IOException, InterruptedException {
    LOG.info("** Before delete:");
    dumpMeta(htd.getTableName());

    List<HRegionLocation> locations = tbl.getAllRegionLocations();
    for (HRegionLocation location : locations) {
      HRegionInfo hri = location.getRegionInfo();
      ServerName hsa = location.getServerName();
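      // Only operate on the region whose key range and replica id match.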
      if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
          && Bytes.compareTo(hri.getEndKey(), endKey) == 0
          && hri.getReplicaId() == replicaId) {

        LOG.info("RegionName: " + hri.getRegionNameAsString());
        byte[] deleteRow = hri.getRegionName();

        if (unassign) {
          LOG.info("Undeploying region " + hri + " from server " + hsa);
          undeployRegion(connection, hsa, hri);
        }

        if (regionInfoOnly) {
          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
          Path rootDir = FSUtils.getRootDir(conf);
          FileSystem fs = rootDir.getFileSystem(conf);
          Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
              hri.getEncodedName());
          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
          fs.delete(hriPath, true);
        }

        if (hdfs) {
          LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
          Path rootDir = FSUtils.getRootDir(conf);
          FileSystem fs = rootDir.getFileSystem(conf);
          Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
              hri.getEncodedName());
          HBaseFsck.debugLsr(conf, p);
          boolean success = fs.delete(p, true);
389           LOG.info("Deleted " + p + " sucessfully? " + success);
          HBaseFsck.debugLsr(conf, p);
        }

        if (metaRow) {
          try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
            Delete delete = new Delete(deleteRow);
            meta.delete(delete);
          }
        }
      }
      LOG.info(hri.toString() + hsa.toString());
    }

    TEST_UTIL.getMetaTableRows(htd.getTableName());
    LOG.info("*** After delete:");
    dumpMeta(htd.getTableName());
  }

  /**
   * Sets up a clean table before we start mucking with it.
   *
   * Sets {@code tbl}, which must be closed after the test.
   *
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  void setupTable(TableName tablename) throws Exception {
    setupTableWithRegionReplica(tablename, 1);
  }

  /**
   * Sets up a clean table with the given region replica count.
   *
   * Sets {@code tbl}, which must be closed after the test.
   *
   * @param tablename
   * @param replicaCount
   * @throws Exception
   */
  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
    HTableDescriptor desc = new HTableDescriptor(tablename);
    desc.setRegionReplication(replicaCount);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, SPLITS);

    tbl = (HTable) connection.getTable(tablename, tableExecutorService);
    List<Put> puts = new ArrayList<Put>();
    for (byte[] row : ROWKEYS) {
      Put p = new Put(row);
      p.add(FAM, Bytes.toBytes("val"), row);
      puts.add(p);
    }
    tbl.put(puts);
    tbl.flushCommits();
  }

  /**
   * Counts the number of rows, to verify whether data was lost.
   */
  int countRows() throws IOException {
    Scan s = new Scan();
    ResultScanner rs = tbl.getScanner(s);
    int i = 0;
    while (rs.next() != null) {
      i++;
    }
    return i;
  }

  /**
   * Deletes the table in preparation for the next test.
   *
   * @param tablename
   * @throws Exception
   */
  void cleanupTable(TableName tablename) throws Exception {
    if (tbl != null) {
      tbl.close();
      tbl = null;
    }

    ((ClusterConnection) connection).clearRegionCache();
    deleteTable(TEST_UTIL, tablename);
  }

  /**
   * This creates a clean table and confirms that the table is clean.
   */
  @Test (timeout=180000)
  public void testHBaseFsckClean() throws Exception {
    assertNoErrors(doFsck(conf, false));
    TableName table = TableName.valueOf("tableClean");
    try {
      HBaseFsck hbck = doFsck(conf, false);
      assertNoErrors(hbck);

      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // We created 1 table, should be fine
      hbck = doFsck(conf, false);
      assertNoErrors(hbck);
      assertEquals(0, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * Test thread pooling in the case where there are more regions than threads
   */
  @Test (timeout=180000)
  public void testHbckThreadpooling() throws Exception {
    TableName table =
        TableName.valueOf("tableDupeStartKey");
    try {
      // Create table with 4 regions
      setupTable(table);

      // limit number of threads to 1.
      Configuration newconf = new Configuration(conf);
      newconf.setInt("hbasefsck.numthreads", 1);
      assertNoErrors(doFsck(newconf, false));

      // We should pass without triggering a RejectedExecutionException
    } finally {
      cleanupTable(table);
    }
  }

  @Test (timeout=180000)
  public void testHbckFixOrphanTable() throws Exception {
    TableName table = TableName.valueOf("tableInfo");
    FileSystem fs = null;
    Path tableinfo = null;
    try {
      setupTable(table);

      Path hbaseTableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
      fs = hbaseTableDir.getFileSystem(conf);
      FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
      tableinfo = status.getPath();
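      // Move the .tableinfo file aside to simulate an orphan table.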
536       fs.rename(tableinfo, new Path("/.tableinfo"));
537 
      // hbck should report an error because .tableinfo is missing.
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_TABLEINFO_FILE });

      // fix OrphanTable with default .tableinfo (htd not yet cached on master)
      hbck = doFsck(conf, true);
      assertNoErrors(hbck);
      status = null;
      status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
      assertNotNull(status);

      HTableDescriptor htd = admin.getTableDescriptor(table);
      htd.setValue("NOT_DEFAULT", "true");
      admin.disableTable(table);
      admin.modifyTable(table, htd);
      admin.enableTable(table);
      fs.delete(status.getPath(), true);

      // fix OrphanTable with cache
      htd = admin.getTableDescriptor(table); // warms up cached htd on master
      hbck = doFsck(conf, true);
      assertNoErrors(hbck);
      status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
      assertNotNull(status);
      htd = admin.getTableDescriptor(table);
563       assertEquals(htd.getValue("NOT_DEFAULT"), "true");
    } finally {
      fs.rename(new Path("/.tableinfo"), tableinfo);
      cleanupTable(table);
    }
  }

  /**
   * This test makes sure that parallel instances of hbck are disallowed.
   *
   * @throws Exception
   */
  @Test (timeout=180000)
  public void testParallelHbck() throws Exception {
    final ExecutorService service;
    final Future<HBaseFsck> hbck1, hbck2;

    class RunHbck implements Callable<HBaseFsck> {
      boolean fail = true;
      @Override
      public HBaseFsck call() {
        Configuration c = new Configuration(conf);
        c.setInt("hbase.hbck.lockfile.attempts", 1);
        // HBASE-13574 found that in HADOOP-2.6 and later, file creation internally retries.
        // To avoid test flakiness, set a low max wait time.
588         c.setInt("hbase.hbck.lockfile.maxwaittime", 3);
589         try{
590           return doFsck(c, true); // Exclusive hbck only when fixing
591         } catch(Exception e){
592           if (e.getMessage().contains("Duplicate hbck")) {
593             fail = false;
594           }
595         }
596         // If we reach here, then an exception was caught
597         if (fail) fail();
598         return null;
599       }
600     }
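    // Kick off two concurrent hbck runs; only one should acquire the
    // exclusive hbck lock file and proceed in repair mode.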
    service = Executors.newFixedThreadPool(2);
    hbck1 = service.submit(new RunHbck());
    hbck2 = service.submit(new RunHbck());
    service.shutdown();
    // wait up to 15 seconds for both hbck calls to finish
    service.awaitTermination(15, TimeUnit.SECONDS);
    HBaseFsck h1 = hbck1.get();
    HBaseFsck h2 = hbck2.get();
    // Make sure only one of the calls was successful
    assert(h1 == null || h2 == null);
    if (h1 != null) {
      assert(h1.getRetCode() >= 0);
    }
    if (h2 != null) {
      assert(h2.getRetCode() >= 0);
    }
  }

  /**
   * This test makes sure that, with enough retries, both parallel instances
   * of hbck complete successfully.
   *
   * @throws Exception
   */
  @Test (timeout=180000)
  public void testParallelWithRetriesHbck() throws Exception {
    final ExecutorService service;
    final Future<HBaseFsck> hbck1, hbck2;

    // With the ExponentialBackoffPolicyWithLimit (starting with a 200 millisecond sleep time and
    // a max sleep time of 5 seconds), we can retry around 15 times within 80 seconds before
    // bailing out.
    //
    // Note: the reason to use 80 seconds is that in HADOOP-2.6 and later, file creation retries
    // for up to HdfsConstants.LEASE_SOFTLIMIT_PERIOD (60 seconds).  See HBASE-13574 for more
    // details.
    final int timeoutInSeconds = 80;
    final int sleepIntervalInMilliseconds = 200;
    final int maxSleepTimeInMilliseconds = 6000;
    final int maxRetryAttempts = 15;

    class RunHbck implements Callable<HBaseFsck> {

      @Override
      public HBaseFsck call() throws Exception {
        // Increase retry attempts to make sure the non-active hbck doesn't get starved
        Configuration c = new Configuration(conf);
        c.setInt("hbase.hbck.lockfile.maxwaittime", timeoutInSeconds);
        c.setInt("hbase.hbck.lockfile.attempt.sleep.interval", sleepIntervalInMilliseconds);
        c.setInt("hbase.hbck.lockfile.attempt.maxsleeptime", maxSleepTimeInMilliseconds);
        c.setInt("hbase.hbck.lockfile.attempts", maxRetryAttempts);
        return doFsck(c, false);
      }
    }

    service = Executors.newFixedThreadPool(2);
    hbck1 = service.submit(new RunHbck());
    hbck2 = service.submit(new RunHbck());
    service.shutdown();
    // wait for both hbck calls to finish
    service.awaitTermination(timeoutInSeconds * 2, TimeUnit.SECONDS);
    HBaseFsck h1 = hbck1.get();
    HBaseFsck h2 = hbck2.get();
    // Both should be successful
    assertNotNull(h1);
    assertNotNull(h2);
    assert(h1.getRetCode() >= 0);
    assert(h2.getRetCode() >= 0);
  }

  /**
   * This creates and fixes a bad table with regions that have a duplicate
   * start key.
   */
  @Test (timeout=180000)
  public void testDupeStartKey() throws Exception {
    TableName table =
        TableName.valueOf("tableDupeStartKey");
    try {
      setupTable(table);
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());

      // Now let's mess it up, by adding a region with a duplicate startkey
      HRegionInfo hriDupe =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("A2"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriDupe);
      ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
      TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
            ERROR_CODE.DUPE_STARTKEYS});
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.

      // fix the degenerate region.
      doFsck(conf, true);

      // check that the degenerate region is gone and no data loss
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /*
   * This creates a table with region_replica > 1 and verifies hbck runs
   * successfully
   */
  @Test (timeout=180000)
  public void testHbckWithRegionReplica() throws Exception {
    TableName table =
        TableName.valueOf("testHbckWithRegionReplica");
    try {
      setupTableWithRegionReplica(table, 2);
      TEST_UTIL.getHBaseAdmin().flush(table.getName());
      assertNoErrors(doFsck(conf, false));
    } finally {
      cleanupTable(table);
    }
  }

  @Test
  public void testHbckWithFewerReplica() throws Exception {
    TableName table =
        TableName.valueOf("testHbckWithFewerReplica");
    try {
      setupTableWithRegionReplica(table, 2);
      TEST_UTIL.getHBaseAdmin().flush(table.getName());
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, false, false, false, 1); // unassign one replica
      // check that problem exists
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_DEPLOYED});
      // fix the problem
      hbck = doFsck(conf, true);
      // run hbck again to make sure we don't see any errors
      hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{});
    } finally {
      cleanupTable(table);
    }
  }

  @Test
  public void testHbckWithExcessReplica() throws Exception {
    TableName table =
        TableName.valueOf("testHbckWithExcessReplica");
    try {
      setupTableWithRegionReplica(table, 2);
      TEST_UTIL.getHBaseAdmin().flush(table.getName());
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
      // the next few lines inject a location in meta for a replica, and then
      // asks the master to assign the replica (the meta needs to be injected
      // for the master to treat the request for assignment as valid; the master
      // checks the region is valid either from its memory or meta)
      HTable meta = new HTable(conf, TableName.META_TABLE_NAME);
      List<HRegionInfo> regions = TEST_UTIL.getHBaseAdmin().getTableRegions(table);
      byte[] startKey = Bytes.toBytes("B");
      byte[] endKey = Bytes.toBytes("C");
      byte[] metaKey = null;
      HRegionInfo newHri = null;
      for (HRegionInfo h : regions) {
        if (Bytes.compareTo(h.getStartKey(), startKey) == 0 &&
            Bytes.compareTo(h.getEndKey(), endKey) == 0 &&
            h.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
          metaKey = h.getRegionName();
          //create a hri with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
          newHri = RegionReplicaUtil.getRegionInfoForReplica(h, 2);
          break;
        }
      }
      Put put = new Put(metaKey);
      ServerName sn = TEST_UTIL.getHBaseAdmin().getClusterStatus().getServers()
          .toArray(new ServerName[0])[0];
      //add a location with replicaId as 2 (since we already have replicas with replicaid 0 and 1)
      MetaTableAccessor.addLocation(put, sn, sn.getStartcode(), -1, 2);
      meta.put(put);
      meta.flushCommits();
      // assign the new replica
      HBaseFsckRepair.fixUnassigned((HBaseAdmin) TEST_UTIL.getHBaseAdmin(), newHri);
      HBaseFsckRepair.waitUntilAssigned((HBaseAdmin) TEST_UTIL.getHBaseAdmin(), newHri);
      // now reset the meta row to its original value
      Delete delete = new Delete(metaKey);
      delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(2));
      delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(2));
      delete.deleteColumns(HConstants.CATALOG_FAMILY, MetaTableAccessor.getSeqNumColumn(2));
      meta.delete(delete);
      meta.flushCommits();
      meta.close();
      // check that problem exists
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{ERROR_CODE.NOT_IN_META});
      // fix the problem
      hbck = doFsck(conf, true);
      // run hbck again to make sure we don't see any errors
      hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[]{});
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * Get region info from local cluster.
   */
  Map<ServerName, List<String>> getDeployedHRIs(final HBaseAdmin admin) throws IOException {
    ClusterStatus status = admin.getClusterStatus();
    Collection<ServerName> regionServers = status.getServers();
    Map<ServerName, List<String>> mm =
        new HashMap<ServerName, List<String>>();
    for (ServerName hsi : regionServers) {
      AdminProtos.AdminService.BlockingInterface server = ((HConnection) connection).getAdmin(hsi);

      // list all online regions from this region server
      List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
      List<String> regionNames = new ArrayList<String>();
      for (HRegionInfo hri : regions) {
        regionNames.add(hri.getRegionNameAsString());
      }
      mm.put(hsi, regionNames);
    }
    return mm;
  }

  /**
   * Returns the HSI a region info is on.
   */
  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
      if (e.getValue().contains(hri.getRegionNameAsString())) {
        return e.getKey();
      }
    }
    return null;
  }

  /**
   * This creates and fixes a bad table with a region that is an exact
   * duplicate of an existing region (same start and end keys).
   */
  @Test (timeout=180000)
  public void testDupeRegion() throws Exception {
    TableName table =
        TableName.valueOf("tableDupeRegion");
    try {
      setupTable(table);
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());

      // Now let's mess it up, by adding a region with a duplicate startkey
      HRegionInfo hriDupe =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"));

      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriDupe);
      ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
      TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);

      // Yikes! The assignment manager can't tell the difference between two
      // different regions with the same start/end keys since it doesn't
      // differentiate on ts/regionId!  We actually need to recheck
      // deployments!
      while (findDeployedHSI(getDeployedHRIs((HBaseAdmin) admin), hriDupe) == null) {
        Thread.sleep(250);
      }

      LOG.debug("Finished assignment of dupe region");

      // TODO why is dupe region different from dupe start keys?
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DUPE_STARTKEYS,
            ERROR_CODE.DUPE_STARTKEYS});
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows()); // seems like the "bigger" region won.

      // fix the degenerate region.
      doFsck(conf, true);

      // check that the degenerate region is gone and no data loss
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a region that has startKey == endKey.
   */
  @Test (timeout=180000)
  public void testDegenerateRegions() throws Exception {
    TableName table = TableName.valueOf("tableDegenerateRegions");
    try {
      setupTable(table);
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());

      // Now let's mess it up, by adding a region with a duplicate startkey
      HRegionInfo hriDupe =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriDupe);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriDupe);
      ServerName server = regionStates.getRegionServerOfRegion(hriDupe);
      TEST_UTIL.assertRegionOnServer(hriDupe, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.DEGENERATE_REGION, ERROR_CODE.DUPE_STARTKEYS,
          ERROR_CODE.DUPE_STARTKEYS });
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // fix the degenerate region.
      doFsck(conf, true);

      // check that the degenerate region is gone and no data loss
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where a region is completely contained
   * by another region.
   */
  @Test (timeout=180000)
  public void testContainedRegionOverlap() throws Exception {
    TableName table =
        TableName.valueOf("tableContainedRegionOverlap");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap in the metadata
      HRegionInfo hriOverlap =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriOverlap);
      ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
      TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
      assertEquals(2, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // fix the problem.
      doFsck(conf, true);

      // verify that overlaps are fixed
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where an overlap group contains
   * 3 regions. Sets HBaseFsck.maxMerge to 2 to trigger sidelining of an
   * overlapped region. Messes with the meta data so that closeRegion and
   * offlineRegion throw exceptions.
   */
  @Test (timeout=180000)
  public void testSidelineOverlapRegion() throws Exception {
    TableName table =
        TableName.valueOf("testSidelineOverlapRegion");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap
      MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
      HMaster master = cluster.getMaster();
      HRegionInfo hriOverlap1 =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("AB"));
      master.assignRegion(hriOverlap1);
      master.getAssignmentManager().waitForAssignment(hriOverlap1);
      HRegionInfo hriOverlap2 =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("AB"), Bytes.toBytes("B"));
      master.assignRegion(hriOverlap2);
      master.getAssignmentManager().waitForAssignment(hriOverlap2);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.DUPE_STARTKEYS,
        ERROR_CODE.DUPE_STARTKEYS, ERROR_CODE.OVERLAP_IN_REGION_CHAIN});
      assertEquals(3, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // mess around the overlapped regions, to trigger NotServingRegionException
      Multimap<byte[], HbckInfo> overlapGroups = hbck.getOverlapGroups(table);
      ServerName serverName = null;
      byte[] regionName = null;
      for (HbckInfo hbi : overlapGroups.values()) {
        if ("A".equals(Bytes.toString(hbi.getStartKey()))
            && "B".equals(Bytes.toString(hbi.getEndKey()))) {
          regionName = hbi.getRegionName();

          // get an RS not serving the region to force bad assignment info into META.
          int k = cluster.getServerWith(regionName);
          for (int i = 0; i < 3; i++) {
            if (i != k) {
              HRegionServer rs = cluster.getRegionServer(i);
              serverName = rs.getServerName();
              break;
            }
          }

          HBaseFsckRepair.closeRegionSilentlyAndWait((HConnection) connection,
              cluster.getRegionServer(k).getServerName(), hbi.getHdfsHRI());
          admin.offline(regionName);
          break;
        }
      }

      assertNotNull(regionName);
      assertNotNull(serverName);
      try (Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService)) {
        Put put = new Put(regionName);
        put.add(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
            Bytes.toBytes(serverName.getHostAndPort()));
        meta.put(put);
      }

      // fix the problem.
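      // maxMerge is set to 2 so that the 3-region overlap group is sidelined
      // rather than merged.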
      HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
      fsck.connect();
      fsck.setDisplayFullReport(); // i.e. -details
      fsck.setTimeLag(0);
      fsck.setFixAssignments(true);
      fsck.setFixMeta(true);
      fsck.setFixHdfsHoles(true);
      fsck.setFixHdfsOverlaps(true);
      fsck.setFixHdfsOrphans(true);
      fsck.setFixVersionFile(true);
      fsck.setSidelineBigOverlaps(true);
      fsck.setMaxMerge(2);
      fsck.onlineHbck();
      fsck.close();

      // verify that overlaps are fixed and that there are fewer rows,
      // since one region is sidelined.
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertTrue(ROWKEYS.length > countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where a region is completely contained
   * by another region, and there is a hole (sort of like a bad split)
   */
  @Test (timeout=180000)
  public void testOverlapAndOrphan() throws Exception {
    TableName table =
        TableName.valueOf("tableOverlapAndOrphan");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap in the metadata
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
          Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
      TEST_UTIL.getHBaseAdmin().enableTable(table);

      HRegionInfo hriOverlap =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriOverlap);
      ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
      TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // fix the problem.
      doFsck(conf, true);

      // verify that overlaps are fixed
      HBaseFsck hbck2 = doFsck(conf, false);
      assertNoErrors(hbck2);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table where a region overlaps two regions --
   * its start key is contained in one region and its end key is contained in
   * yet another region.
   */
  @Test (timeout=180000)
  public void testCoveredStartKey() throws Exception {
    TableName table =
        TableName.valueOf("tableCoveredStartKey");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by creating an overlap in the metadata
      HRegionInfo hriOverlap =
          createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
      TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
          .waitForAssignment(hriOverlap);
      ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
      TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
          ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
      assertEquals(3, hbck.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());

      // fix the problem.
      doFsck(conf, true);

      // verify that overlaps are fixed
      HBaseFsck hbck2 = doFsck(conf, false);
      assertErrors(hbck2, new ERROR_CODE[0]);
      assertEquals(0, hbck2.getOverlapGroups(table).size());
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a missing region -- hole in meta
   * and data missing in the fs.
   */
  @Test (timeout=180000)
  public void testRegionHole() throws Exception {
    TableName table =
        TableName.valueOf("tableRegionHole");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, true, true);
      admin.enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length - 2, countRows()); // lost a region so lost a row
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a missing region -- hole in meta
   * and data present but .regioninfo missing (an orphan hdfs region) in the fs.
   */
  @Test (timeout=180000)
  public void testHDFSRegioninfoMissing() throws Exception {
    TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the meta data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
      TEST_UTIL.getHBaseAdmin().enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.ORPHAN_HDFS_REGION,
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a region that is missing meta and
   * not assigned to a region server.
   */
  @Test (timeout=180000)
  public void testNotInMetaOrDeployedHole() throws Exception {
    TableName table =
        TableName.valueOf("tableNotInMetaOrDeployedHole");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the meta data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), true, true, false); // don't rm from fs
      admin.enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      assertErrors(doFsck(conf, true), new ERROR_CODE[] {
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a hole in meta.
   */
  @Test (timeout=180000)
  public void testNotInMetaHole() throws Exception {
    TableName table =
        TableName.valueOf("tableNotInMetaHole");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // Mess it up by leaving a hole in the meta data
      admin.disableTable(table);
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), false, true, false); // don't rm from fs
      admin.enableTable(table);

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      assertErrors(doFsck(conf, true), new ERROR_CODE[] {
          ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a region that is in meta but has
   * no deployment and no data in hdfs.
   */
  @Test (timeout=180000)
  public void testNotInHdfs() throws Exception {
    TableName table =
        TableName.valueOf("tableNotInHdfs");
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());

      // make sure the data is in the regions; if it were only in the WAL there would be no data loss
      admin.flush(table);

      // Mess it up by leaving a hole in the hdfs data
      deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
          Bytes.toBytes("C"), false, false, true); // don't rm meta

      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
      // holes are separate from overlap groups
      assertEquals(0, hbck.getOverlapGroups(table).size());

      // fix hole
      doFsck(conf, true);

      // check that hole fixed
      assertNoErrors(doFsck(conf, false));
      assertEquals(ROWKEYS.length - 2, countRows());
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * This creates and fixes a bad table with a region that is in meta but has
   * no deployment and no data in hdfs. The table has region_replication set to 2.
   */
  @Test (timeout=180000)
  public void testNotInHdfsWithReplicas() throws Exception {
    TableName table =
        TableName.valueOf("tableNotInHdfs");
    HBaseAdmin admin = new HBaseAdmin(conf);
    try {
      HRegionInfo[] oldHris = new HRegionInfo[2];
      setupTableWithRegionReplica(table, 2);
      assertEquals(ROWKEYS.length, countRows());
      NavigableMap<HRegionInfo, ServerName> map =
          MetaScanner.allTableRegions(TEST_UTIL.getConnection(), tbl.getName());
      int i = 0;
      // store the HRIs of the regions we will mess up
      for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
        if (m.getKey().getStartKey().length > 0 &&
            m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
          LOG.debug("Initially server hosting " + m.getKey() + " is " + m.getValue());
          oldHris[i++] = m.getKey();
        }
      }
1358       // make sure data is in the region files
1359       TEST_UTIL.getHBaseAdmin().flush(table.getName());
1360 
1361       // Mess it up by leaving a hole in the hdfs data
1362       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1363           Bytes.toBytes("C"), false, false, true); // don't rm meta
1364 
1365       HBaseFsck hbck = doFsck(conf, false);
1366       assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1367 
1368       // fix hole
1369       doFsck(conf, true);
1370 
1371       // check that hole fixed
1372       assertNoErrors(doFsck(conf, false));
1373       assertEquals(ROWKEYS.length - 2, countRows());
1374 
1375       // the following code checks whether the old primary/secondary has
1376       // been unassigned and the new primary/secondary has been assigned
1377       i = 0;
1378       HRegionInfo[] newHris = new HRegionInfo[2];
1379       // get all table's regions from meta
1380       map = MetaScanner.allTableRegions(TEST_UTIL.getConnection(), tbl.getName());
1381       // get the HRIs of the new regions (hbck created new regions for fixing the hdfs mess-up)
1382       for (Map.Entry<HRegionInfo, ServerName> m : map.entrySet()) {
1383         if (m.getKey().getStartKey().length > 0 &&
1384             m.getKey().getStartKey()[0] == Bytes.toBytes("B")[0]) {
1385           newHris[i++] = m.getKey();
1386         }
1387       }
1388       // get all the online regions in the regionservers
1389       Collection<ServerName> servers = admin.getClusterStatus().getServers();
1390       Set<HRegionInfo> onlineRegions = new HashSet<HRegionInfo>();
1391       for (ServerName s : servers) {
1392         List<HRegionInfo> list = admin.getOnlineRegions(s);
1393         onlineRegions.addAll(list);
1394       }
1395       // the new HRIs must be a subset of the online regions
1396       assertTrue(onlineRegions.containsAll(Arrays.asList(newHris)));
1397       // the old HRIs must not be part of the set (removeAll would return false if
1398       // the set didn't change)
1399       assertFalse(onlineRegions.removeAll(Arrays.asList(oldHris)));
1400     } finally {
1401       cleanupTable(table);
1402       admin.close();
1403     }
1404   }
1405 
1406 
1407   /**
1408    * This creates entries in hbase:meta with no hdfs data.  This should cleanly
1409    * remove the table.
1410    */
1411   @Test (timeout=180000)
1412   public void testNoHdfsTable() throws Exception {
1413     TableName table = TableName.valueOf("NoHdfsTable");
1414     setupTable(table);
1415     assertEquals(ROWKEYS.length, countRows());
1416 
1417     // make sure data is in the region files; if it is only in the WAL there is no data loss
1418     admin.flush(table);
1419 
1420     // Mess it up by deleting hdfs dirs
1421     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
1422         Bytes.toBytes("A"), false, false, true); // don't rm meta
1423     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1424         Bytes.toBytes("B"), false, false, true); // don't rm meta
1425     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1426         Bytes.toBytes("C"), false, false, true); // don't rm meta
1427     deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
1428         Bytes.toBytes(""), false, false, true); // don't rm meta
1429 
1430     // also remove the table directory in hdfs
1431     deleteTableDir(table);
1432 
1433     HBaseFsck hbck = doFsck(conf, false);
1434     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS,
1435         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS,
1436         ERROR_CODE.NOT_IN_HDFS,});
1437     // holes are separate from overlap groups
1438     assertEquals(0, hbck.getOverlapGroups(table).size());
1439 
1440     // fix hole
1441     doFsck(conf, true); // detect dangling regions and remove those
1442 
1443     // check that hole fixed
1444     assertNoErrors(doFsck(conf, false));
1445     assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
1446   }
1447 
1448   public void deleteTableDir(TableName table) throws IOException {
1449     Path rootDir = FSUtils.getRootDir(conf);
1450     FileSystem fs = rootDir.getFileSystem(conf);
1451     Path p = FSUtils.getTableDir(rootDir, table);
1452     HBaseFsck.debugLsr(conf, p);
1453     boolean success = fs.delete(p, true);
1454     LOG.info("Deleted " + p + " sucessfully? " + success);
1455   }
1456 
1457   /**
1458    * When the hbase.version file is missing, hbck should fix the fault.
1459    */
1460   @Test (timeout=180000)
1461   public void testNoVersionFile() throws Exception {
1462     // delete the hbase.version file
1463     Path rootDir = FSUtils.getRootDir(conf);
1464     FileSystem fs = rootDir.getFileSystem(conf);
1465     Path versionFile = new Path(rootDir, HConstants.VERSION_FILE_NAME);
1466     fs.delete(versionFile, true);
1467 
1468     // test
1469     HBaseFsck hbck = doFsck(conf, false);
1470     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_VERSION_FILE });
1471     // fix hbase.version missing
1472     doFsck(conf, true);
1473 
1474     // no version file fixed
1475     assertNoErrors(doFsck(conf, false));
1476   }
1477 
1478   /**
1479    * A region must not be deployed while its table is disabled; hbck reports and fixes one that is.
1480    */
1481   @Test (timeout=180000)
1482   public void testRegionShouldNotBeDeployed() throws Exception {
1483     TableName table =
1484         TableName.valueOf("tableRegionShouldNotBeDeployed");
1485     try {
1486       LOG.info("Starting testRegionShouldNotBeDeployed.");
1487       MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
1488       assertTrue(cluster.waitForActiveAndReadyMaster());
1489 
1490 
1491       byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
1492           Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
1493       HTableDescriptor htdDisabled = new HTableDescriptor(table);
1494       htdDisabled.addFamily(new HColumnDescriptor(FAM));
1495 
1496       // Write the .tableinfo
1497       FSTableDescriptors fstd = new FSTableDescriptors(conf);
1498       fstd.createTableDescriptor(htdDisabled);
1499       List<HRegionInfo> disabledRegions =
1500           TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);
1501 
1502       // Let's just assign everything to first RS
1503       HRegionServer hrs = cluster.getRegionServer(0);
1504 
1505       // Create region files.
1506       admin.disableTable(table);
1507       admin.enableTable(table);
1508 
1509       // Disable the table and close its regions
1510       admin.disableTable(table);
1511       HRegionInfo region = disabledRegions.remove(0);
1512       byte[] regionName = region.getRegionName();
1513 
1514       // The region should not be assigned currently
1515       assertTrue(cluster.getServerWith(regionName) == -1);
1516 
1517       // Directly open a region on a region server.
1518       // If going through AM/ZK, the region won't be open.
1519       // Even if it is opened, the AM will close it, which causes
1520       // flakiness in this test.
1521       HRegion r = HRegion.openHRegion(
1522         region, htdDisabled, hrs.getWAL(region), conf);
1523       hrs.addToOnlineRegions(r);
1524 
1525       HBaseFsck hbck = doFsck(conf, false);
1526       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
1527 
1528       // fix this fault
1529       doFsck(conf, true);
1530 
1531       // check result
1532       assertNoErrors(doFsck(conf, false));
1533     } finally {
1534       admin.enableTable(table);
1535       cleanupTable(table);
1536     }
1537   }
1538 
1539   /**
1540    * This creates two tables, messes up both of them, and fixes them one by one.
1541    */
1542   @Test (timeout=180000)
1543   public void testFixByTable() throws Exception {
1544     TableName table1 =
1545         TableName.valueOf("testFixByTable1");
1546     TableName table2 =
1547         TableName.valueOf("testFixByTable2");
1548     try {
1549       setupTable(table1);
1550       // make sure data is in the region files; if it is only in the WAL there is no data loss
1551       admin.flush(table1);
1552       // Mess them up by leaving a hole in the hdfs data
1553       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1554         Bytes.toBytes("C"), false, false, true); // don't rm meta
1555 
1556       setupTable(table2);
1557       // make sure data is in the region files; if it is only in the WAL there is no data loss
1558       admin.flush(table2);
1559       // Mess them up by leaving a hole in the hdfs data
1560       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1561         Bytes.toBytes("C"), false, false, true); // don't rm meta
1562 
1563       HBaseFsck hbck = doFsck(conf, false);
1564       assertErrors(hbck, new ERROR_CODE[] {
1565         ERROR_CODE.NOT_IN_HDFS, ERROR_CODE.NOT_IN_HDFS});
1566 
1567       // fix hole in table 1
1568       doFsck(conf, true, table1);
1569       // check that hole in table 1 fixed
1570       assertNoErrors(doFsck(conf, false, table1));
1571       // check that hole in table 2 still there
1572       assertErrors(doFsck(conf, false, table2),
1573         new ERROR_CODE[] {ERROR_CODE.NOT_IN_HDFS});
1574 
1575       // fix hole in table 2
1576       doFsck(conf, true, table2);
1577       // check that hole in both tables fixed
1578       assertNoErrors(doFsck(conf, false));
1579       assertEquals(ROWKEYS.length - 2, countRows());
1580     } finally {
1581       cleanupTable(table1);
1582       cleanupTable(table2);
1583     }
1584   }
1585   /**
1586    * A split parent that is in meta and in hdfs, but not deployed.
1587    */
1588   @Test (timeout=180000)
1589   public void testLingeringSplitParent() throws Exception {
1590     TableName table =
1591         TableName.valueOf("testLingeringSplitParent");
1592     Table meta = null;
1593     try {
1594       setupTable(table);
1595       assertEquals(ROWKEYS.length, countRows());
1596 
1597       // make sure data is in the region files; if it is only in the WAL there is no data loss
1598       admin.flush(table);
1599       HRegionLocation location = tbl.getRegionLocation("B");
1600 
1601       // Delete one region from meta, but not hdfs, unassign it.
1602       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
1603         Bytes.toBytes("C"), true, true, false);
1604 
1605       // Create a new meta entry to fake it as a split parent.
1606       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1607       HRegionInfo hri = location.getRegionInfo();
1608 
1609       HRegionInfo a = new HRegionInfo(tbl.getName(),
1610         Bytes.toBytes("B"), Bytes.toBytes("BM"));
1611       HRegionInfo b = new HRegionInfo(tbl.getName(),
1612         Bytes.toBytes("BM"), Bytes.toBytes("C"));
1613 
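           // mark the parent offline and split, as the master would after a real split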
1614       hri.setOffline(true);
1615       hri.setSplit(true);
1616 
1617       MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
1618       meta.close();
1619       admin.flush(TableName.META_TABLE_NAME);
1620 
1621       HBaseFsck hbck = doFsck(conf, false);
1622       assertErrors(hbck, new ERROR_CODE[] {
1623         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1624 
1625       // regular repair cannot fix lingering split parent
1626       hbck = doFsck(conf, true);
1627       assertErrors(hbck, new ERROR_CODE[] {
1628         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1629       assertFalse(hbck.shouldRerun());
1630       hbck = doFsck(conf, false);
1631       assertErrors(hbck, new ERROR_CODE[] {
1632         ERROR_CODE.LINGERING_SPLIT_PARENT, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1633 
1634       // fix lingering split parent
1635       hbck = new HBaseFsck(conf, hbfsckExecutorService);
1636       hbck.connect();
1637       hbck.setDisplayFullReport(); // i.e. -details
1638       hbck.setTimeLag(0);
1639       hbck.setFixSplitParents(true);
1640       hbck.onlineHbck();
1641       assertTrue(hbck.shouldRerun());
1642       hbck.close();
1643 
1644       Get get = new Get(hri.getRegionName());
1645       Result result = meta.get(get);
1646       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1647         HConstants.SPLITA_QUALIFIER).isEmpty());
1648       assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
1649         HConstants.SPLITB_QUALIFIER).isEmpty());
1650       admin.flush(TableName.META_TABLE_NAME);
1651 
1652       // fix other issues
1653       doFsck(conf, true);
1654 
1655       // check that all are fixed
1656       assertNoErrors(doFsck(conf, false));
1657       assertEquals(ROWKEYS.length, countRows());
1658     } finally {
1659       cleanupTable(table);
1660       IOUtils.closeQuietly(meta);
1661     }
1662   }
1663 
1664   /**
1665    * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
1666    * valid cases where the daughter regions are present.
1667    */
1668   @Test (timeout=180000)
1669   public void testValidLingeringSplitParent() throws Exception {
1670     TableName table =
1671         TableName.valueOf("testValidLingeringSplitParent");
1672     Table meta = null;
1673     try {
1674       setupTable(table);
1675       assertEquals(ROWKEYS.length, countRows());
1676 
1677       // make sure data is in the region files; if it is only in the WAL there is no data loss
1678       admin.flush(table);
1679       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1680 
1681       meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1682       HRegionInfo hri = location.getRegionInfo();
1683 
1684       // do a regular split
1685       byte[] regionName = location.getRegionInfo().getRegionName();
1686       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1687       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1688 
1689       // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
1690       // for some time until children references are deleted. HBCK erroneously sees this as
1691       // overlapping regions
1692       HBaseFsck hbck = doFsck(
1693         conf, true, true, false, false, false, true, true, true, false, false, false, null);
1694       assertErrors(hbck, new ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
1695 
1696       // assert that the split hbase:meta entry is still there.
1697       Get get = new Get(hri.getRegionName());
1698       Result result = meta.get(get);
1699       assertNotNull(result);
1700       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1701 
1702       assertEquals(ROWKEYS.length, countRows());
1703 
1704       // assert that we still have the split regions
1705       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1706       assertNoErrors(doFsck(conf, false));
1707     } finally {
1708       cleanupTable(table);
1709       IOUtils.closeQuietly(meta);
1710     }
1711   }
1712 
1713   /**
1714    * A split crashed after the write to hbase:meta finished for the parent region, but
1715    * before the daughter regions were written (pre-HBASE-7721 codebase).
1716    */
1717   @Test(timeout=75000)
1718   public void testSplitDaughtersNotInMeta() throws Exception {
1719     TableName table = TableName.valueOf("testSplitDaughtersNotInMeta");
1720     Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
1721     try {
1722       setupTable(table);
1723       assertEquals(ROWKEYS.length, countRows());
1724 
1725       // make sure data is in the region files; if it is only in the WAL there is no data loss
1726       admin.flush(table);
1727       HRegionLocation location = tbl.getRegionLocation(Bytes.toBytes("B"));
1728 
1729       HRegionInfo hri = location.getRegionInfo();
1730 
1731       // Disable CatalogJanitor to prevent it from cleaning up the parent region
1732       // after split.
1733       admin.enableCatalogJanitor(false);
1734 
1735       // do a regular split
1736       byte[] regionName = location.getRegionInfo().getRegionName();
1737       admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
1738       TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
1739 
1740       PairOfSameType<HRegionInfo> daughters =
1741           MetaTableAccessor.getDaughterRegions(meta.get(new Get(regionName)));
1742 
1743       // Delete the daughter regions from meta, but not hdfs, and unassign them.
1744       Map<HRegionInfo, ServerName> hris = tbl.getRegionLocations();
1745       undeployRegion(connection, hris.get(daughters.getFirst()), daughters.getFirst());
1746       undeployRegion(connection, hris.get(daughters.getSecond()), daughters.getSecond());
1747 
1748       List<Delete> deletes = new ArrayList<>();
1749       deletes.add(new Delete(daughters.getFirst().getRegionName()));
1750       deletes.add(new Delete(daughters.getSecond().getRegionName()));
1751       meta.delete(deletes);
1752 
1753       // Remove daughters from regionStates
1754       RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
1755           getAssignmentManager().getRegionStates();
1756       regionStates.deleteRegion(daughters.getFirst());
1757       regionStates.deleteRegion(daughters.getSecond());
1758 
1759       HBaseFsck hbck = doFsck(conf, false);
1760       assertErrors(hbck,
1761           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1762               ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT
1763 
1764       // now fix it. The fix should not revert the region split, but add daughters to META
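           // only the first two fix flags are enabled (fixAssignments and fixMeta,
           // per the parameter order of HbckTestingUtil.doFsck), so no hdfs repairs run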
1765       hbck = doFsck(
1766         conf, true, true, false, false, false, false, false, false, false, false, false, null);
1767       assertErrors(hbck,
1768           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
1769               ERROR_CODE.HOLE_IN_REGION_CHAIN });
1770 
1771       // assert that the split hbase:meta entry is still there.
1772       Get get = new Get(hri.getRegionName());
1773       Result result = meta.get(get);
1774       assertNotNull(result);
1775       assertNotNull(MetaTableAccessor.getHRegionInfo(result));
1776 
1777       assertEquals(ROWKEYS.length, countRows());
1778 
1779       // assert that we still have the split regions
1780       assertEquals(tbl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions pre-split.
1781       assertNoErrors(doFsck(conf, false)); //should be fixed by now
1782     } finally {
1783       admin.enableCatalogJanitor(true);
1784       meta.close();
1785       cleanupTable(table);
1786     }
1787   }
1788 
1789   /**
1790    * This creates and fixes a bad table with a missing region which is the 1st region -- hole in
1791    * meta and data missing in the fs.
1792    */
1793   @Test(timeout=120000)
1794   public void testMissingFirstRegion() throws Exception {
1795     TableName table = TableName.valueOf("testMissingFirstRegion");
1796     try {
1797       setupTable(table);
1798       assertEquals(ROWKEYS.length, countRows());
1799 
1800       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1801       admin.disableTable(table);
1802       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
1803           true, true);
1804       admin.enableTable(table);
1805 
1806       HBaseFsck hbck = doFsck(conf, false);
1807       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
1808       // fix hole
1809       doFsck(conf, true);
1810       // check that hole fixed
1811       assertNoErrors(doFsck(conf, false));
1812     } finally {
1813       cleanupTable(table);
1814     }
1815   }
1816 
1817   /**
1818    * This creates and fixes a bad table with a region that is still deployed and present in
1819    * meta, but whose data is missing from hdfs.
1820    */
1821   @Test(timeout=120000)
1822   public void testRegionDeployedNotInHdfs() throws Exception {
1823     TableName table =
1824         TableName.valueOf("testSingleRegionDeployedNotInHdfs");
1825     try {
1826       setupTable(table);
1827       admin.flush(table);
1828 
1829       // Mess it up by deleting region dir
1830       deleteRegion(conf, tbl.getTableDescriptor(),
1831         HConstants.EMPTY_START_ROW, Bytes.toBytes("A"), false,
1832         false, true);
1833 
1834       HBaseFsck hbck = doFsck(conf, false);
1835       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
1836       // fix hole
1837       doFsck(conf, true);
1838       // check that hole fixed
1839       assertNoErrors(doFsck(conf, false));
1840     } finally {
1841       cleanupTable(table);
1842     }
1843   }
1844 
1845   /**
1846    * This creates and fixes a bad table with missing last region -- hole in meta and data missing in
1847    * the fs.
1848    */
1849   @Test(timeout=120000)
1850   public void testMissingLastRegion() throws Exception {
1851     TableName table =
1852         TableName.valueOf("testMissingLastRegion");
1853     try {
1854       setupTable(table);
1855       assertEquals(ROWKEYS.length, countRows());
1856 
1857       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
1858       admin.disableTable(table);
1859       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
1860           true, true);
1861       admin.enableTable(table);
1862 
1863       HBaseFsck hbck = doFsck(conf, false);
1864       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
1865       // fix hole
1866       doFsck(conf, true);
1867       // check that hole fixed
1868       assertNoErrors(doFsck(conf, false));
1869     } finally {
1870       cleanupTable(table);
1871     }
1872   }
1873 
1874   /**
1875    * Test that the -noHdfsChecking option can detect and fix assignment issues.
1876    */
1877   @Test (timeout=180000)
1878   public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
1879     TableName table =
1880         TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
1881     try {
1882       setupTable(table);
1883       assertEquals(ROWKEYS.length, countRows());
1884 
1885       // Mess it up by closing a region
1886       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1887         Bytes.toBytes("B"), true, false, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
1888 
1889       // verify there are no other errors
1890       HBaseFsck hbck = doFsck(conf, false);
1891       assertErrors(hbck, new ERROR_CODE[] {
1892         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1893 
1894       // verify that noHdfsChecking reports the same errors
1895       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1896       fsck.connect();
1897       fsck.setDisplayFullReport(); // i.e. -details
1898       fsck.setTimeLag(0);
1899       fsck.setCheckHdfs(false);
1900       fsck.onlineHbck();
1901       assertErrors(fsck, new ERROR_CODE[] {
1902         ERROR_CODE.NOT_DEPLOYED, ERROR_CODE.HOLE_IN_REGION_CHAIN});
1903       fsck.close();
1904 
1905       // verify that fixAssignments works fine with noHdfsChecking
1906       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1907       fsck.connect();
1908       fsck.setDisplayFullReport(); // i.e. -details
1909       fsck.setTimeLag(0);
1910       fsck.setCheckHdfs(false);
1911       fsck.setFixAssignments(true);
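           // the first run applies the fix and requests a rerun; the second run
           // should then come back clean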
1912       fsck.onlineHbck();
1913       assertTrue(fsck.shouldRerun());
1914       fsck.onlineHbck();
1915       assertNoErrors(fsck);
1916 
1917       assertEquals(ROWKEYS.length, countRows());
1918 
1919       fsck.close();
1920     } finally {
1921       cleanupTable(table);
1922     }
1923   }
1924 
1925   /**
1926    * Test that the -noHdfsChecking option can detect a region that is not in meta but deployed.
1927    * However, it cannot fix the problem without checking hdfs, because the region
1928    * info must be read from hdfs in order to patch meta.
1929    */
1930   @Test (timeout=180000)
1931   public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
1932     TableName table =
1933         TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
1934     try {
1935       setupTable(table);
1936       assertEquals(ROWKEYS.length, countRows());
1937 
1938       // Mess it up by deleting a region from the metadata
1939       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1940         Bytes.toBytes("B"), false, true, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
1941 
1942       // verify there are no other errors
1943       HBaseFsck hbck = doFsck(conf, false);
1944       assertErrors(hbck,
1945           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1946 
1947       // verify that noHdfsChecking reports the same errors
1948       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
1949       fsck.connect();
1950       fsck.setDisplayFullReport(); // i.e. -details
1951       fsck.setTimeLag(0);
1952       fsck.setCheckHdfs(false);
1953       fsck.onlineHbck();
1954       assertErrors(fsck,
1955           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1956       fsck.close();
1957 
1958       // verify that fixMeta doesn't work with noHdfsChecking
1959       fsck = new HBaseFsck(conf, hbfsckExecutorService);
1960       fsck.connect();
1961       fsck.setDisplayFullReport(); // i.e. -details
1962       fsck.setTimeLag(0);
1963       fsck.setCheckHdfs(false);
1964       fsck.setFixAssignments(true);
1965       fsck.setFixMeta(true);
1966       fsck.onlineHbck();
1967       assertFalse(fsck.shouldRerun());
1968       assertErrors(fsck,
1969           new ERROR_CODE[] { ERROR_CODE.NOT_IN_META, ERROR_CODE.HOLE_IN_REGION_CHAIN });
1970       fsck.close();
1971 
1972       // fix the cluster so other tests won't be impacted
1973       fsck = doFsck(conf, true);
1974       assertTrue(fsck.shouldRerun());
1975       fsck = doFsck(conf, true);
1976       assertNoErrors(fsck);
1977     } finally {
1978       cleanupTable(table);
1979     }
1980   }
1981 
1982   /**
1983    * Test that -fixHdfsHoles does not work with the -noHdfsChecking option,
1984    * and that -noHdfsChecking cannot detect an orphan hdfs region.
1985    */
1986   @Test (timeout=180000)
1987   public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
1988     TableName table =
1989         TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
1990     try {
1991       setupTable(table);
1992       assertEquals(ROWKEYS.length, countRows());
1993 
1994       // Mess it up by creating an overlap in the metadata
1995       admin.disableTable(table);
1996       deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
1997         Bytes.toBytes("B"), true, true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
1998       TEST_UTIL.getHBaseAdmin().enableTable(table);
1999 
2000       HRegionInfo hriOverlap =
2001           createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
2002       TEST_UTIL.getHBaseCluster().getMaster().assignRegion(hriOverlap);
2003       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager()
2004         .waitForAssignment(hriOverlap);
2005       ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
2006       TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
2007 
2008       HBaseFsck hbck = doFsck(conf, false);
2009       assertErrors(hbck, new ERROR_CODE[] {
2010         ERROR_CODE.ORPHAN_HDFS_REGION, ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2011         ERROR_CODE.HOLE_IN_REGION_CHAIN});
2012 
2013       // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
2014       HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
2015       fsck.connect();
2016       fsck.setDisplayFullReport(); // i.e. -details
2017       fsck.setTimeLag(0);
2018       fsck.setCheckHdfs(false);
2019       fsck.onlineHbck();
2020       assertErrors(fsck, new ERROR_CODE[] {
2021         ERROR_CODE.HOLE_IN_REGION_CHAIN});
2022       fsck.close();
2023 
2024       // verify that fixHdfsHoles doesn't work with noHdfsChecking
2025       fsck = new HBaseFsck(conf, hbfsckExecutorService);
2026       fsck.connect();
2027       fsck.setDisplayFullReport(); // i.e. -details
2028       fsck.setTimeLag(0);
2029       fsck.setCheckHdfs(false);
2030       fsck.setFixHdfsHoles(true);
2031       fsck.setFixHdfsOverlaps(true);
2032       fsck.setFixHdfsOrphans(true);
2033       fsck.onlineHbck();
2034       assertFalse(fsck.shouldRerun());
2035       assertErrors(fsck, new ERROR_CODE[] { ERROR_CODE.HOLE_IN_REGION_CHAIN});
2036       fsck.close();
2037     } finally {
2038       if (admin.isTableDisabled(table)) {
2039         admin.enableTable(table);
2040       }
2041       cleanupTable(table);
2042     }
2043   }
2044 
2045   /**
2046    * We don't have an easy way to verify that a flush completed, so we loop until we find a
2047    * legitimate hfile and return it.
2048    * @param fs filesystem holding the table's data
2049    * @param table table whose first region is scanned for an hfile
2050    * @return Path of a flushed hfile.
2051    * @throws IOException
2052    */
2053   Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
2054     Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2055     Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2056     Path famDir = new Path(regionDir, FAM_STR);
2057 
2058     // keep doing this until we get a legit hfile
2059     while (true) {
2060       FileStatus[] hfFss = fs.listStatus(famDir);
2061       if (hfFss.length == 0) {
2062         continue;
2063       }
2064       for (FileStatus hfs : hfFss) {
2065         if (!hfs.isDirectory()) {
2066           return hfs.getPath();
2067         }
2068       }
2069     }
2070   }
2071 
2072   /**
2073    * This creates a table and then corrupts an hfile.  Hbck should quarantine the file.
2074    */
2075   @Test(timeout=180000)
2076   public void testQuarantineCorruptHFile() throws Exception {
2077     TableName table = TableName.valueOf(name.getMethodName());
2078     try {
2079       setupTable(table);
2080       assertEquals(ROWKEYS.length, countRows());
2081       admin.flush(table); // flush is async.
2082 
2083       FileSystem fs = FileSystem.get(conf);
2084       Path hfile = getFlushedHFile(fs, table);
2085 
2086       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2087       admin.disableTable(table);
2088 
2089       // create new corrupt file called deadbeef (valid hfile name)
2090       Path corrupt = new Path(hfile.getParent(), "deadbeef");
2091       TestHFile.truncateFile(fs, hfile, corrupt);
2092       LOG.info("Created corrupted file " + corrupt);
2093       HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
2094 
2095       // we cannot enable here because enable never finished due to the corrupt region.
2096       HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
2097       assertEquals(res.getRetCode(), 0);
2098       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2099       assertEquals(hfcc.getHFilesChecked(), 5);
2100       assertEquals(hfcc.getCorrupted().size(), 1);
2101       assertEquals(hfcc.getFailures().size(), 0);
2102       assertEquals(hfcc.getQuarantined().size(), 1);
2103       assertEquals(hfcc.getMissing().size(), 0);
2104 
2105       // It's been fixed; verify that we can enable the table.
2106       admin.enableTable(table);
2107     } finally {
2108       cleanupTable(table);
2109     }
2110   }
2111 
2112   /**
2113    * Tests that use this method should have a timeout, because it could potentially wait forever.
2114   */
2115   private void doQuarantineTest(TableName table, HBaseFsck hbck, int check,
2116                                 int corrupt, int fail, int quar, int missing) throws Exception {
2117     try {
2118       setupTable(table);
2119       assertEquals(ROWKEYS.length, countRows());
2120       admin.flush(table); // flush is async.
2121 
2122       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2123       admin.disableTable(table);
2124 
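           // exec() parses the same arguments as the hbck command line, so this
           // exercises the -sidelineCorruptHFiles code path end to end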
2125       String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
2126           table.getNameAsString()};
2127       HBaseFsck res = hbck.exec(hbfsckExecutorService, args);
2128 
2129       HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
2130       assertEquals(hfcc.getHFilesChecked(), check);
2131       assertEquals(hfcc.getCorrupted().size(), corrupt);
2132       assertEquals(hfcc.getFailures().size(), fail);
2133       assertEquals(hfcc.getQuarantined().size(), quar);
2134       assertEquals(hfcc.getMissing().size(), missing);
2135 
2136       // it's been fixed; verify that we can enable the table
2137       admin.enableTableAsync(table);
2138       while (!admin.isTableEnabled(table)) {
2139         try {
2140           Thread.sleep(250);
2141         } catch (InterruptedException e) {
2142           e.printStackTrace();
2143           fail("Interrupted when trying to enable table " + table);
2144         }
2145       }
2146     } finally {
2147       cleanupTable(table);
2148     }
2149   }
2150 
2151   /**
2152    * This creates a table and simulates the race situation where a concurrent compaction or split
2153    * has removed an hfile after the corruption checker learned about it.
2154    */
2155   @Test(timeout=180000)
2156   public void testQuarantineMissingHFile() throws Exception {
2157     TableName table = TableName.valueOf(name.getMethodName());
2158 
2159     // inject a fault in the hfcc created.
2160     final FileSystem fs = FileSystem.get(conf);
2161     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2162       @Override
2163       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2164         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2165           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2166           @Override
2167           protected void checkHFile(Path p) throws IOException {
2168             if (attemptedFirstHFile.compareAndSet(false, true)) {
2169               assertTrue(fs.delete(p, true)); // make sure delete happened.
2170             }
2171             super.checkHFile(p);
2172           }
2173         };
2174       }
2175     };
2176     doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
2177     hbck.close();
2178   }
2179 
2180   /**
2181    * This creates a table and simulates the race situation where a concurrent compaction or split
2182    * has removed a colfam dir before the corruption checker got to it.
2183    */
2184   // Disabled because it fails sporadically. Is this test right? Timing-wise, there could be no
2185   // files in a column family on initial creation -- as suggested by Matteo.
2186   @Ignore @Test(timeout=180000)
2187   public void testQuarantineMissingFamdir() throws Exception {
2188     TableName table = TableName.valueOf(name.getMethodName());
2189     // inject a fault in the hfcc created.
2190     final FileSystem fs = FileSystem.get(conf);
2191     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2192       @Override
2193       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
2194         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2195           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2196           @Override
2197           protected void checkColFamDir(Path p) throws IOException {
2198             if (attemptedFirstHFile.compareAndSet(false, true)) {
2199               assertTrue(fs.delete(p, true)); // make sure delete happened.
2200             }
2201             super.checkColFamDir(p);
2202           }
2203         };
2204       }
2205     };
2206     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2207     hbck.close();
2208   }
2209 
2210   /**
2211    * This creates a table and simulates the race situation where a concurrent compaction or split
2212    * has removed a region dir before the corruption checker got to it.
2213    */
2214   @Test(timeout=180000)
2215   public void testQuarantineMissingRegionDir() throws Exception {
2216     TableName table = TableName.valueOf(name.getMethodName());
2217     // inject a fault in the hfcc created.
2218     final FileSystem fs = FileSystem.get(conf);
2219     HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
2220       @Override
2221       public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
2222       throws IOException {
2223         return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
2224           AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
2225           @Override
2226           protected void checkRegionDir(Path p) throws IOException {
2227             if (attemptedFirstHFile.compareAndSet(false, true)) {
2228               assertTrue(fs.delete(p, true)); // make sure delete happened.
2229             }
2230             super.checkRegionDir(p);
2231           }
2232         };
2233       }
2234     };
2235     doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
2236     hbck.close();
2237   }
2238 
2239   /**
2240    * Test fixing lingering reference file.
2241    */
2242   @Test (timeout=180000)
2243   public void testLingeringReferenceFile() throws Exception {
2244     TableName table =
2245         TableName.valueOf("testLingeringReferenceFile");
2246     try {
2247       setupTable(table);
2248       assertEquals(ROWKEYS.length, countRows());
2249 
2250       // Mess it up by creating a fake reference file
2251       FileSystem fs = FileSystem.get(conf);
2252       Path tableDir= FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
2253       Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
2254       Path famDir = new Path(regionDir, FAM_STR);
2255       Path fakeReferenceFile = new Path(famDir, "fbce357483ceea.12144538");
2256       fs.create(fakeReferenceFile);
2257 
2258       HBaseFsck hbck = doFsck(conf, false);
2259       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.LINGERING_REFERENCE_HFILE });
2260       // fix reference file
2261       doFsck(conf, true);
2262       // check that reference file fixed
2263       assertNoErrors(doFsck(conf, false));
2264     } finally {
2265       cleanupTable(table);
2266     }
2267   }
2268 
2269   /**
2270    * Test missing REGIONINFO_QUALIFIER in hbase:meta.
2271    */
2272   @Test (timeout=180000)
2273   public void testMissingRegionInfoQualifier() throws Exception {
2274     Connection connection = ConnectionFactory.createConnection(conf);
2275     TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
2276     try {
2277       setupTable(table);
2278 
2279       // Mess it up by removing the RegionInfo for one region.
2280       final List<Delete> deletes = new LinkedList<Delete>();
2281       Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
2282       MetaScanner.metaScan(connection, new MetaScanner.MetaScannerVisitor() {
2283 
2284         @Override
2285         public boolean processRow(Result rowResult) throws IOException {
2286           HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
2287           if (hri != null && !hri.getTable().isSystemTable()) {
2288             Delete delete = new Delete(rowResult.getRow());
2289             delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2290             deletes.add(delete);
2291           }
2292           return true;
2293         }
2294 
2295         @Override
2296         public void close() throws IOException {
2297         }
2298       });
2299       meta.delete(deletes);
2300 
2301       // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
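           // row keys in hbase:meta use the region name format:
           // <table>,<startKey>,<regionId>.<encodedName>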
2302       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2303         HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER, Bytes.toBytes("node1:60020")));
2304       meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66")).add(
2305         HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER, Bytes.toBytes(1362150791183L)));
2306       meta.close();
2307 
2308       HBaseFsck hbck = doFsck(conf, false);
2309       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2310 
2311       // fix the empty meta cell
2312       hbck = doFsck(conf, true);
2313 
2314       // check that the empty meta cell is gone
2315       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.EMPTY_META_CELL));
2316     } finally {
2317       cleanupTable(table);
2318     }
2319     connection.close();
2320   }
2321 
2322   /**
2323    * Test pluggable error reporter. It can be plugged in
2324    * from system property or configuration.
2325    */
2326   @Test (timeout=180000)
2327   public void testErrorReporter() throws Exception {
2328     try {
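           // a custom reporter is plugged in purely through configuration, e.g.
           //   conf.set("hbasefsck.errorreporter", MyReporter.class.getName());
           // where MyReporter (a hypothetical class) implements ErrorReporter,
           // as MockErrorReporter below does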
2329       MockErrorReporter.calledCount = 0;
2330       doFsck(conf, false);
2331       assertEquals(MockErrorReporter.calledCount, 0);
2332 
2333       conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
2334       doFsck(conf, false);
2335       assertTrue(MockErrorReporter.calledCount > 20);
2336     } finally {
2337       conf.set("hbasefsck.errorreporter",
2338         PrintingErrorReporter.class.getName());
2339       MockErrorReporter.calledCount = 0;
2340     }
2341   }
2342 
2343   static class MockErrorReporter implements ErrorReporter {
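         // every callback below simply bumps calledCount, so testErrorReporter can
         // assert that this pluggable reporter was actually invoked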
2344     static int calledCount = 0;
2345 
2346     @Override
2347     public void clear() {
2348       calledCount++;
2349     }
2350 
2351     @Override
2352     public void report(String message) {
2353       calledCount++;
2354     }
2355 
2356     @Override
2357     public void reportError(String message) {
2358       calledCount++;
2359     }
2360 
2361     @Override
2362     public void reportError(ERROR_CODE errorCode, String message) {
2363       calledCount++;
2364     }
2365 
2366     @Override
2367     public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
2368       calledCount++;
2369     }
2370 
2371     @Override
2372     public void reportError(ERROR_CODE errorCode,
2373         String message, TableInfo table, HbckInfo info) {
2374       calledCount++;
2375     }
2376 
2377     @Override
2378     public void reportError(ERROR_CODE errorCode, String message,
2379         TableInfo table, HbckInfo info1, HbckInfo info2) {
2380       calledCount++;
2381     }
2382 
2383     @Override
2384     public int summarize() {
2385       return ++calledCount;
2386     }
2387 
2388     @Override
2389     public void detail(String details) {
2390       calledCount++;
2391     }
2392 
2393     @Override
2394     public ArrayList<ERROR_CODE> getErrorList() {
2395       calledCount++;
2396       return new ArrayList<ERROR_CODE>();
2397     }
2398 
2399     @Override
2400     public void progress() {
2401       calledCount++;
2402     }
2403 
2404     @Override
2405     public void print(String message) {
2406       calledCount++;
2407     }
2408 
2409     @Override
2410     public void resetErrors() {
2411       calledCount++;
2412     }
2413 
2414     @Override
2415     public boolean tableHasErrors(TableInfo table) {
2416       calledCount++;
2417       return false;
2418     }
2419   }
2420 
2421   @Test(timeout=180000)
2422   public void testCheckTableLocks() throws Exception {
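         // inject a controllable clock so that table lock expiry can be driven
         // deterministically instead of sleeping through the real timeout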
2423     IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
2424     EnvironmentEdgeManager.injectEdge(edge);
2425     // check no errors
2426     HBaseFsck hbck = doFsck(conf, false);
2427     assertNoErrors(hbck);
2428 
2429     ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
2430     final TableName tableName = TableName.valueOf("foo");
2431 
2432     // obtain one lock
2433     final TableLockManager tableLockManager =
2434       TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
2435     TableLock writeLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2436     writeLock.acquire();
2437     hbck = doFsck(conf, false);
2438     assertNoErrors(hbck); // should not have expired, no problems
2439 
2440     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2441         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2442 
2443     hbck = doFsck(conf, false);
2444     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK});
2445 
2446     final CountDownLatch latch = new CountDownLatch(1);
2447     new Thread() {
2448       @Override
2449       public void run() {
2450         TableLock secondLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
2451         try {
2452           latch.countDown();
2453           secondLock.acquire();
2454         } catch (IOException ex) {
2455           fail();
2456         } catch (IllegalStateException ex) {
2457           return; // expected, since this will be reaped under us.
2458         }
2459         fail("should not have come here");
2460       };
2461     }.start();
2462 
2463     latch.await(); // wait until thread starts
2464     Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
2465 
2466     hbck = doFsck(conf, false);
2467     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK}); // still one expired, one not-expired
2468 
2469     edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
2470         TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
2471 
2472     hbck = doFsck(conf, false);
2473     assertErrors(hbck, new ERROR_CODE[] {ERROR_CODE.EXPIRED_TABLE_LOCK, ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
2474 
2475     conf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1); // reaping from ZKInterProcessWriteLock uses znode cTime,
2476                                                                  // which is not injectable through EnvironmentEdge
2477     Threads.sleep(10);
2478     hbck = doFsck(conf, true); // now fix both cases
2479 
2480     hbck = doFsck(conf, false);
2481     assertNoErrors(hbck);
2482 
2483     // ensure that locks are deleted
2484     writeLock = tableLockManager.writeLock(tableName, "should acquire without blocking");
2485     writeLock.acquire(); // this should not block.
2486     writeLock.release(); // release for clean state
2487     tableLockManager.tableDeleted(tableName);
2488   }
2489 
2490   /**
2491    * Test orphaned table ZNode (for table states)
2492    */
2493   @Test
2494   public void testOrphanedTableZNode() throws Exception {
2495     TableName table = TableName.valueOf("testOrphanedZKTableEntry");
2496 
2497     try {
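           // simulate an orphaned table ZNode by marking a table that was never
           // created as ENABLING in ZooKeeper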
2498       TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager().getTableStateManager()
2499       .setTableState(table, ZooKeeperProtos.Table.State.ENABLING);
2500 
2501       try {
2502         setupTable(table);
2503         Assert.fail(
2504           "Create table should fail when its ZNode has already existed with ENABLING state.");
2505       } catch(TableExistsException t) {
2506         //Expected exception
2507       }
2508       // The table setup was interrupted in some state that needs cleanup.
2509       try {
2510         cleanupTable(table);
2511       } catch (IOException e) {
2512         // Because the table creation failed, the cleanup is expected to
2513         // throw an exception. Ignore it and continue.
2514       }
2515 
2516       HBaseFsck hbck = doFsck(conf, false);
2517       assertTrue(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2518 
2519       // fix the orphaned ZK entry
2520       hbck = doFsck(conf, true);
2521 
2522       // check that the orphaned ZK table entry is gone.
2523       hbck = doFsck(conf, false);
2524       assertFalse(hbck.getErrors().getErrorList().contains(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY));
2525       // Now create table should succeed.
2526       setupTable(table);
2527     } finally {
2528       // This code can be reached either after the table was created successfully or after
2529       // the table setup failed in some unknown state. Therefore, cleanup can either succeed or fail.
2530       try {
2531         cleanupTable(table);
2532       } catch (IOException e) {
2533         // The cleanup may throw an exception if the table creation failed in some state.
2534         // Ignore this exception.
2535       }
2536     }
2537   }
2538 
2539   @Test (timeout=180000)
2540   public void testMetaOffline() throws Exception {
2541     // check no errors
2542     HBaseFsck hbck = doFsck(conf, false);
2543     assertNoErrors(hbck);
2544     deleteMetaRegion(conf, true, false, false);
2545     hbck = doFsck(conf, false);
2546     // ERROR_CODE.UNKNOWN shows up because we call reportError with a message describing the
2547     // hbase:meta inconsistency and whether we will be fixing it or not.
2548     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2549     hbck = doFsck(conf, true);
2550     assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NO_META_REGION, ERROR_CODE.UNKNOWN });
2551     hbck = doFsck(conf, false);
2552     assertNoErrors(hbck);
2553   }
2554 
2555   private void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
2556       boolean regionInfoOnly) throws IOException, InterruptedException {
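         // locate the hbase:meta region, then optionally unassign it and/or delete
         // its .regioninfo file or its whole region directory from hdfs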
2557     HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
2558         .getRegionLocation(HConstants.EMPTY_START_ROW);
2559     ServerName hsa = metaLocation.getServerName();
2560     HRegionInfo hri = metaLocation.getRegionInfo();
2561     if (unassign) {
2562       LOG.info("Undeploying meta region " + hri + " from server " + hsa);
2563       try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
2564         undeployRegion(unmanagedConnection, hsa, hri);
2565       }
2566     }
2567 
2568     if (regionInfoOnly) {
2569       LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
2570       Path rootDir = FSUtils.getRootDir(conf);
2571       FileSystem fs = rootDir.getFileSystem(conf);
2572       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2573           hri.getEncodedName());
2574       Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
2575       fs.delete(hriPath, true);
2576     }
2577 
2578     if (hdfs) {
2579       LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
2580       Path rootDir = FSUtils.getRootDir(conf);
2581       FileSystem fs = rootDir.getFileSystem(conf);
2582       Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
2583           hri.getEncodedName());
2584       HBaseFsck.debugLsr(conf, p);
2585       boolean success = fs.delete(p, true);
2586       LOG.info("Deleted " + p + " sucessfully? " + success);
2587       HBaseFsck.debugLsr(conf, p);
2588     }
2589   }
2590 
2591   @Test (timeout=180000)
2592   public void testTableWithNoRegions() throws Exception {
2593     // We might end up with empty regions in a table
2594     // see also testNoHdfsTable()
2595     TableName table =
2596         TableName.valueOf(name.getMethodName());
2597     try {
2598       // create table with one region
2599       HTableDescriptor desc = new HTableDescriptor(table);
2600       HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
2601       desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
2602       createTable(TEST_UTIL, desc, null);
2603       tbl = (HTable) connection.getTable(table, tableExecutorService);
2604 
2605       // Mess it up by leaving a hole in the assignment, meta, and hdfs data
2606       deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
2607           HConstants.EMPTY_END_ROW, false, false, true);
2608 
2609       HBaseFsck hbck = doFsck(conf, false);
2610       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_HDFS });
2611 
2612       // fix hole; run the repair twice, since the first pass may request a rerun
2613       doFsck(conf, true);
2614 
2615       doFsck(conf, true);
2616 
2617       // check that hole fixed
2618       assertNoErrors(doFsck(conf, false));
2619     } finally {
2620       cleanupTable(table);
2621     }
2622 
2623   }
2624 
2625   @Test (timeout=180000)
2626   public void testHbckAfterRegionMerge() throws Exception {
2627     TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
2628     Table meta = null;
2629     try {
2630       // disable CatalogJanitor
2631       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
2632       setupTable(table);
2633       assertEquals(ROWKEYS.length, countRows());
2634 
2635       // make sure data is in the region files; if it is only in the WAL there is no data loss
2636       admin.flush(table);
2637       HRegionInfo region1 = tbl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
2638       HRegionInfo region2 = tbl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();
2639 
2640       int regionCountBeforeMerge = tbl.getRegionLocations().size();
2641 
2642       assertNotEquals(region1, region2);
2643 
2644       // do a region merge
2645       admin.mergeRegions(region1.getEncodedNameAsBytes(),
2646           region2.getEncodedNameAsBytes(), false);
2647 
2648       // wait until region merged
2649       long timeout = System.currentTimeMillis() + 30 * 1000;
2650       while (true) {
2651         if (tbl.getRegionLocations().size() < regionCountBeforeMerge) {
2652           break;
2653         } else if (System.currentTimeMillis() > timeout) {
2654           fail("Time out waiting on region " + region1.getEncodedName()
2655               + " and " + region2.getEncodedName() + " be merged");
2656         }
2657         Thread.sleep(10);
2658       }
2659 
2660       assertEquals(ROWKEYS.length, countRows());
2661 
2662       HBaseFsck hbck = doFsck(conf, false);
2663       assertNoErrors(hbck); // no errors
2664 
2665     } finally {
2666       TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
2667       cleanupTable(table);
2668       IOUtils.closeQuietly(meta);
2669     }
2670   }
2671 
2672   @Test (timeout = 180000)
2673   public void testRegionBoundariesCheck() throws Exception {
2674     HBaseFsck hbck = doFsck(conf, false);
2675     assertNoErrors(hbck); // no errors
2676     try {
2677       hbck.checkRegionBoundaries();
2678     } catch (IllegalArgumentException e) {
2679       if (e.getMessage().endsWith("not a valid DFS filename.")) {
2680         fail("Table directory path is not valid." + e.getMessage());
2681       }
2682     }
2683   }
2684 
2685   @org.junit.Rule
2686   public TestName name = new TestName();
2687 
2688   @Test (timeout=180000)
2689   public void testReadOnlyProperty() throws Exception {
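         // a read-only run (fix == false) defaults to ignoring the pre-check
         // permission; a repair run does not, unless explicitly overridden below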
2690     HBaseFsck hbck = doFsck(conf, false);
2691     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2692       hbck.shouldIgnorePreCheckPermission());
2693 
2694     hbck = doFsck(conf, true);
2695     Assert.assertEquals("shouldIgnorePreCheckPermission", false,
2696       hbck.shouldIgnorePreCheckPermission());
2697 
2698     hbck = doFsck(conf, true);
2699     hbck.setIgnorePreCheckPermission(true);
2700     Assert.assertEquals("shouldIgnorePreCheckPermission", true,
2701       hbck.shouldIgnorePreCheckPermission());
2702   }
2703 
2704   @Test (timeout=180000)
2705   public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
2706     TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
2707     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
2708     try {
2709       HTableDescriptor desc = new HTableDescriptor(table);
2710       desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
2711       createTable(TEST_UTIL, desc, null);
2712       tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
2713       for (int i = 0; i < 5; i++) {
2714         Put p1 = new Put(("r" + i).getBytes());
2715         p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
2716         tbl.put(p1);
2717       }
2718       admin.flush(desc.getTableName());
2719       List<HRegion> regions = cluster.getRegions(desc.getTableName());
2720       int serverWith = cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
2721       HRegionServer regionServer = cluster.getRegionServer(serverWith);
2723       SplitTransactionImpl st = new SplitTransactionImpl(regions.get(0), Bytes.toBytes("r3"));
2724       st.prepare();
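      // Run the split only through the steps before the "point of no return": daughter
      // region files are created on disk but never committed to hbase:meta, which is the
      // inconsistency hbck should flag below.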
2725       st.stepsBeforePONR(regionServer, regionServer, false);
2726       AssignmentManager am = cluster.getMaster().getAssignmentManager();
2727       Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
2728       for (RegionState state : regionsInTransition.values()) {
2729         am.regionOffline(state.getRegion());
2730       }
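      // Delete the region's unassigned znode (silently ignoring its absence) so the
      // re-assignment below starts from a clean state.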
2731       ZKAssign.deleteNodeFailSilent(regionServer.getZooKeeper(), regions.get(0).getRegionInfo());
2732       Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
2733       regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
2734       am.assign(regionsMap);
2735       am.waitForAssignment(regions.get(0).getRegionInfo());
2736       HBaseFsck hbck = doFsck(conf, false);
2737       assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2738           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2739       // holes are separate from overlap groups
2740       assertEquals(0, hbck.getOverlapGroups(table).size());
2741 
2742       // fix hole
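      // (The long list of booleans below presumably maps positionally onto the fix flags of
      // this HbckTestingUtil#doFsck overload; only the second flag is enabled here.)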
2743       assertErrors(
2744         doFsck(
2745           conf, false, true, false, false, false, false, false, false, false, false, false, null),
2746         new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
2747           ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
2748 
2749       // check that hole fixed
2750       assertNoErrors(doFsck(conf, false));
2751       assertEquals(5, countRows());
2752     } finally {
2753       if (tbl != null) {
2754         tbl.close();
2755         tbl = null;
2756       }
2757       cleanupTable(table);
2758     }
2759   }
2760 
2761 
2762   public static class MasterSyncObserver extends BaseMasterObserver {
2763     volatile CountDownLatch tableCreationLatch = null;
2764     volatile CountDownLatch tableDeletionLatch = null;
2765 
2766     @Override
2767     public void postCreateTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
2768       HTableDescriptor desc, HRegionInfo[] regions) throws IOException {
2769       // The AccessController tests sometimes invoke postCreateTableHandler() directly.
2770       if (tableCreationLatch != null) {
2771         tableCreationLatch.countDown();
2772       }
2773     }
2774 
2775     @Override
2776     public void postDeleteTableHandler(final ObserverContext<MasterCoprocessorEnvironment> ctx,
2777         TableName tableName) throws IOException {
2779       // The AccessController tests sometimes invoke postDeleteTableHandler() directly.
2780       if (tableDeletionLatch != null) {
2781         tableDeletionLatch.countDown();
2782       }
2783     }
2784   }
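  // Illustrative note, an assumption not shown in this excerpt: for findCoprocessor() below
  // to locate MasterSyncObserver, the observer has to be registered on the master before the
  // mini cluster starts, typically via the master coprocessor configuration key, e.g.:
  //
  //   conf.setStrings(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
  //       MasterSyncObserver.class.getName());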
2785 
2786   public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
2787     byte [][] splitKeys) throws Exception {
2788     // NOTE: We need a latch because the admin operation is asynchronous, so the postOp
2789     // coprocessor method may be invoked after the admin call has already returned.
2790     MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
2791       .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
2792     observer.tableCreationLatch = new CountDownLatch(1);
2793     if (splitKeys != null) {
2794       admin.createTable(htd, splitKeys);
2795     } else {
2796       admin.createTable(htd);
2797     }
2798     observer.tableCreationLatch.await();
2799     observer.tableCreationLatch = null;
2800     testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
2801   }
2802 
2803   public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
2804     throws Exception {
2805     // NOTE: We need a latch because the admin operation is asynchronous, so the postOp
2806     // coprocessor method may be invoked after the admin call has already returned.
2807     MasterSyncObserver observer = (MasterSyncObserver)testUtil.getHBaseCluster().getMaster()
2808       .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
2809     observer.tableDeletionLatch = new CountDownLatch(1);
2810     try {
2811       admin.disableTable(tableName);
2812     } catch (Exception e) {
2813       LOG.debug("Table " + tableName + " is already disabled, so just deleting it.");
2814     }
2815     admin.deleteTable(tableName);
2816     observer.tableDeletionLatch.await();
2817     observer.tableDeletionLatch = null;
2818   }
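  // Illustrative usage sketch (the table and family names are assumptions): the two helpers
  // above pair a CountDownLatch with the asynchronous admin calls so a test can act on the
  // table as soon as the helper returns, e.g.:
  //
  //   HTableDescriptor htd = new HTableDescriptor(TableName.valueOf("exampleTable"));
  //   htd.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
  //   createTable(TEST_UTIL, htd, null);  // blocks until all regions are assigned
  //   deleteTable(TEST_UTIL, htd.getTableName());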
2819 }