/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.hadoop.hbase.util;
19
20 import java.io.Closeable;
21 import java.io.FileNotFoundException;
22 import java.io.IOException;
23 import java.io.InterruptedIOException;
24 import java.io.PrintWriter;
25 import java.io.StringWriter;
26 import java.net.InetAddress;
27 import java.net.URI;
28 import java.util.ArrayList;
29 import java.util.Arrays;
30 import java.util.Collection;
31 import java.util.Collections;
32 import java.util.Comparator;
33 import java.util.HashMap;
34 import java.util.HashSet;
35 import java.util.Iterator;
36 import java.util.List;
37 import java.util.Map;
38 import java.util.Map.Entry;
39 import java.util.Set;
40 import java.util.SortedMap;
41 import java.util.SortedSet;
42 import java.util.TreeMap;
43 import java.util.TreeSet;
44 import java.util.Vector;
45 import java.util.concurrent.Callable;
46 import java.util.concurrent.ConcurrentSkipListMap;
47 import java.util.concurrent.ExecutionException;
48 import java.util.concurrent.ExecutorService;
49 import java.util.concurrent.Executors;
50 import java.util.concurrent.Future;
51 import java.util.concurrent.FutureTask;
52 import java.util.concurrent.ScheduledThreadPoolExecutor;
53 import java.util.concurrent.TimeUnit;
54 import java.util.concurrent.TimeoutException;
55 import java.util.concurrent.atomic.AtomicBoolean;
56 import java.util.concurrent.atomic.AtomicInteger;
57
58 import com.google.common.base.Joiner;
59 import com.google.common.base.Preconditions;
60 import com.google.common.collect.ImmutableList;
61 import com.google.common.collect.Lists;
62 import com.google.common.collect.Multimap;
63 import com.google.common.collect.Ordering;
64 import com.google.common.collect.TreeMultimap;
65 import com.google.protobuf.ServiceException;
66
67 import org.apache.commons.lang.StringUtils;
68 import org.apache.commons.logging.Log;
69 import org.apache.commons.logging.LogFactory;
70 import org.apache.hadoop.hbase.classification.InterfaceAudience;
71 import org.apache.hadoop.hbase.classification.InterfaceStability;
72 import org.apache.hadoop.conf.Configuration;
73 import org.apache.hadoop.conf.Configured;
74 import org.apache.hadoop.fs.FSDataOutputStream;
75 import org.apache.hadoop.fs.FileStatus;
76 import org.apache.hadoop.fs.FileSystem;
77 import org.apache.hadoop.fs.Path;
78 import org.apache.hadoop.fs.permission.FsAction;
79 import org.apache.hadoop.fs.permission.FsPermission;
80 import org.apache.hadoop.hbase.Abortable;
81 import org.apache.hadoop.hbase.Cell;
82 import org.apache.hadoop.hbase.ClusterStatus;
83 import org.apache.hadoop.hbase.CoordinatedStateException;
84 import org.apache.hadoop.hbase.HBaseConfiguration;
85 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
86 import org.apache.hadoop.hbase.HColumnDescriptor;
87 import org.apache.hadoop.hbase.HConstants;
88 import org.apache.hadoop.hbase.HRegionInfo;
89 import org.apache.hadoop.hbase.HRegionLocation;
90 import org.apache.hadoop.hbase.HTableDescriptor;
91 import org.apache.hadoop.hbase.KeyValue;
92 import org.apache.hadoop.hbase.MasterNotRunningException;
93 import org.apache.hadoop.hbase.RegionLocations;
94 import org.apache.hadoop.hbase.ServerName;
95 import org.apache.hadoop.hbase.TableName;
96 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
97 import org.apache.hadoop.hbase.MetaTableAccessor;
100 import org.apache.hadoop.hbase.client.Admin;
101 import org.apache.hadoop.hbase.client.ClusterConnection;
102 import org.apache.hadoop.hbase.client.ConnectionFactory;
103 import org.apache.hadoop.hbase.client.Delete;
104 import org.apache.hadoop.hbase.client.Get;
105 import org.apache.hadoop.hbase.client.HBaseAdmin;
106 import org.apache.hadoop.hbase.client.HConnectable;
107 import org.apache.hadoop.hbase.client.HConnection;
108 import org.apache.hadoop.hbase.client.HConnectionManager;
109 import org.apache.hadoop.hbase.client.MetaScanner;
110 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitor;
111 import org.apache.hadoop.hbase.client.MetaScanner.MetaScannerVisitorBase;
112 import org.apache.hadoop.hbase.client.Put;
113 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
114 import org.apache.hadoop.hbase.client.Result;
115 import org.apache.hadoop.hbase.client.RowMutations;
116 import org.apache.hadoop.hbase.client.Table;
117 import org.apache.hadoop.hbase.io.FileLink;
118 import org.apache.hadoop.hbase.io.HFileLink;
119 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
120 import org.apache.hadoop.hbase.io.hfile.HFile;
121 import org.apache.hadoop.hbase.master.MasterFileSystem;
122 import org.apache.hadoop.hbase.master.RegionState;
123 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
124 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService.BlockingInterface;
125 import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
126 import org.apache.hadoop.hbase.regionserver.HRegion;
127 import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
128 import org.apache.hadoop.hbase.regionserver.StoreFileInfo;
129 import org.apache.hadoop.hbase.security.AccessDeniedException;
130 import org.apache.hadoop.hbase.security.UserProvider;
131 import org.apache.hadoop.hbase.util.Bytes.ByteArrayComparator;
132 import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
133 import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
134 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandler;
135 import org.apache.hadoop.hbase.util.hbck.TableIntegrityErrorHandlerImpl;
136 import org.apache.hadoop.hbase.util.hbck.TableLockChecker;
137 import org.apache.hadoop.hbase.wal.WALSplitter;
138 import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
139 import org.apache.hadoop.hbase.zookeeper.ZKTableStateClientSideReader;
140 import org.apache.hadoop.hbase.zookeeper.ZKTableStateManager;
141 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
142 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
143 import org.apache.hadoop.hdfs.protocol.AlreadyBeingCreatedException;
144 import org.apache.hadoop.io.IOUtils;
145 import org.apache.hadoop.ipc.RemoteException;
146 import org.apache.hadoop.security.UserGroupInformation;
147 import org.apache.hadoop.util.ReflectionUtils;
148 import org.apache.hadoop.util.Tool;
149 import org.apache.hadoop.util.ToolRunner;
150 import org.apache.zookeeper.KeeperException;
151
/**
 * HBaseFsck (hbck) is a tool for checking and repairing region consistency and
 * table integrity problems in a corrupted HBase.
 * <p>
 * Region consistency checks verify that hbase:meta, region deployment on the
 * region servers and the state of data in HDFS (.regioninfo files) all agree.
 * <p>
 * Table integrity checks verify that all possible row keys resolve to exactly
 * one region of a table.  This means there are no individual degenerate or
 * backwards regions, no holes between regions, and no overlapping regions.
 * <p>
 * The general repair strategy works in two phases:
 * <ol>
 * <li> Repair table integrity on HDFS (merge or fabricate regions).
 * <li> Repair region consistency with hbase:meta and region assignments.
 * </ol>
 * <p>
 * For table integrity repairs, the table's region directories are scanned for
 * .regioninfo files.  Each table's integrity is then verified.  If there are
 * orphan regions (regions with no .regioninfo file) or holes, new regions are
 * fabricated.  Backwards regions and empty degenerate (endkey==startkey)
 * regions are sidelined.  If there are overlapping regions, a new region is
 * created and all data is merged into it.
 * <p>
 * Region consistency requires three conditions: 1) a valid .regioninfo file is
 * present in the region's HDFS dir, 2) a valid row with .regioninfo data exists
 * in hbase:meta, and 3) the region is deployed only on the region server it was
 * assigned to, with the proper state known to the master.
 */
197 @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS)
198 @InterfaceStability.Evolving
199 public class HBaseFsck extends Configured implements Closeable {
200 public static final long DEFAULT_TIME_LAG = 60000;
201 public static final long DEFAULT_SLEEP_BEFORE_RERUN = 10000;
202 private static final int MAX_NUM_THREADS = 50;
203 private static boolean rsSupportsOffline = true;
204 private static final int DEFAULT_OVERLAPS_TO_SIDELINE = 2;
205 private static final int DEFAULT_MAX_MERGE = 5;
206 private static final String TO_BE_LOADED = "to_be_loaded";
207 private static final String HBCK_LOCK_FILE = "hbase-hbck.lock";
208 private static final int DEFAULT_MAX_LOCK_FILE_ATTEMPTS = 5;
209 private static final int DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL = 200;
210 private static final int DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME = 5000;

  // The timeout value has to be > HdfsConstants.LEASE_SOFTLIMIT_PERIOD.
  // In HADOOP-2.6 and later, the Namenode proxy is created with a custom RetryPolicy for
  // AlreadyBeingCreatedException, which implies a timeout on these operations of up to
  // HdfsConstants.LEASE_SOFTLIMIT_PERIOD (60 seconds).
215 private static final int DEFAULT_WAIT_FOR_LOCK_TIMEOUT = 80;
216 private static final int DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS = 5;
217 private static final int DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL = 200;
218 private static final int DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME = 5000;
219
220
221
222
223 private static final Log LOG = LogFactory.getLog(HBaseFsck.class.getName());
224 private ClusterStatus status;
225 private ClusterConnection connection;
226 private Admin admin;
227 private Table meta;
228
229 protected ExecutorService executor;
230 private long startMillis = EnvironmentEdgeManager.currentTime();
231 private HFileCorruptionChecker hfcc;
232 private int retcode = 0;
233 private Path HBCK_LOCK_PATH;
234 private FSDataOutputStream hbckOutFd;
235
236
237
238 private final AtomicBoolean hbckLockCleanup = new AtomicBoolean(false);
239
240
241
242
243 private static boolean details = false;
244 private long timelag = DEFAULT_TIME_LAG;
245 private static boolean disableSplitAndMerge = false;
246 private boolean fixAssignments = false;
247 private boolean fixMeta = false;
248 private boolean checkHdfs = true;
249 private boolean fixHdfsHoles = false;
250 private boolean fixHdfsOverlaps = false;
251 private boolean fixHdfsOrphans = false;
252 private boolean fixTableOrphans = false;
253 private boolean fixVersionFile = false;
254 private boolean fixSplitParents = false;
255 private boolean fixReferenceFiles = false;
256 private boolean fixHFileLinks = false;
257 private boolean fixEmptyMetaCells = false;
258 private boolean fixTableLocks = false;
259 private boolean fixTableZNodes = false;
260 private boolean fixAny = false;
261
262
263
264 private Set<TableName> tablesIncluded = new HashSet<TableName>();
265 private int maxMerge = DEFAULT_MAX_MERGE;
266 private int maxOverlapsToSideline = DEFAULT_OVERLAPS_TO_SIDELINE;
267 private boolean sidelineBigOverlaps = false;
268 private Path sidelineDir = null;
269
270 private boolean rerun = false;
271 private static boolean summary = false;
272 private boolean checkMetaOnly = false;
273 private boolean checkRegionBoundaries = false;
274 private boolean ignorePreCheckPermission = false;
275
276
277
278
279 final private ErrorReporter errors;
280 int fixes = 0;
281
282
283
284
285
286
287 private TreeMap<String, HbckInfo> regionInfoMap = new TreeMap<String, HbckInfo>();
288 private TreeSet<TableName> disabledTables =
289 new TreeSet<TableName>();
290
291 private Set<Result> emptyRegionInfoQualifiers = new HashSet<Result>();
292
293
294
295
296
297
298
299
300
301
302
303 private SortedMap<TableName, TableInfo> tablesInfo =
304 new ConcurrentSkipListMap<TableName, TableInfo>();
305
306
307
308
309 private List<HbckInfo> orphanHdfsDirs = Collections.synchronizedList(new ArrayList<HbckInfo>());
310
311 private Map<TableName, Set<String>> orphanTableDirs =
312 new HashMap<TableName, Set<String>>();
313
314 private Map<TableName, Set<String>> skippedRegions = new HashMap<TableName, Set<String>>();
315
316
317
318
319 private Set<TableName> orphanedTableZNodes = new HashSet<TableName>();
320 private final RetryCounterFactory lockFileRetryCounterFactory;
321 private final RetryCounterFactory createZNodeRetryCounterFactory;
322
323 private ZooKeeperWatcher zkw = null;
324 private String hbckEphemeralNodePath = null;
325 private boolean hbckZodeCreated = false;
326
327
328
329
330
331
332
333
334 public HBaseFsck(Configuration conf) throws MasterNotRunningException,
335 ZooKeeperConnectionException, IOException, ClassNotFoundException {
336 this(conf, createThreadPool(conf));
337 }
338
339 private static ExecutorService createThreadPool(Configuration conf) {
340 int numThreads = conf.getInt("hbasefsck.numthreads", MAX_NUM_THREADS);
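    // Use daemon threads so a hanging hbck work item cannot keep the JVM alive on exit.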
341 return new ScheduledThreadPoolExecutor(numThreads, Threads.newDaemonThreadFactory("hbasefsck"));
342 }
343
344
345
346
347
348
349
350
351
352
353
354 public HBaseFsck(Configuration conf, ExecutorService exec) throws MasterNotRunningException,
355 ZooKeeperConnectionException, IOException, ClassNotFoundException {
356 super(conf);
357 errors = getErrorReporter(getConf());
358 this.executor = exec;
359 lockFileRetryCounterFactory = new RetryCounterFactory(
360 getConf().getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS),
361 getConf().getInt(
362 "hbase.hbck.lockfile.attempt.sleep.interval", DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL),
363 getConf().getInt(
364 "hbase.hbck.lockfile.attempt.maxsleeptime", DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME));
365 createZNodeRetryCounterFactory = new RetryCounterFactory(
366 getConf().getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS),
367 getConf().getInt(
368 "hbase.hbck.createznode.attempt.sleep.interval",
369 DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL),
370 getConf().getInt(
371 "hbase.hbck.createznode.attempt.maxsleeptime",
372 DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME));
373 zkw = createZooKeeperWatcher();
374 }
375
376 private class FileLockCallable implements Callable<FSDataOutputStream> {
377 RetryCounter retryCounter;
378
379 public FileLockCallable(RetryCounter retryCounter) {
380 this.retryCounter = retryCounter;
381 }
382 @Override
383 public FSDataOutputStream call() throws IOException {
384 try {
385 FileSystem fs = FSUtils.getCurrentFileSystem(getConf());
386 FsPermission defaultPerms = FSUtils.getFilePermissions(fs, getConf(),
387 HConstants.DATA_FILE_UMASK_KEY);
388 Path tmpDir = new Path(FSUtils.getRootDir(getConf()), HConstants.HBASE_TEMP_DIRECTORY);
389 fs.mkdirs(tmpDir);
390 HBCK_LOCK_PATH = new Path(tmpDir, HBCK_LOCK_FILE);
391 final FSDataOutputStream out = createFileWithRetries(fs, HBCK_LOCK_PATH, defaultPerms);
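        // Record the creating host inside the lock file so a stale lock can be traced back.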
392 out.writeBytes(InetAddress.getLocalHost().toString());
393 out.flush();
394 return out;
395 } catch(RemoteException e) {
396 if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
397 return null;
398 } else {
399 throw e;
400 }
401 }
402 }
403
404 private FSDataOutputStream createFileWithRetries(final FileSystem fs,
405 final Path hbckLockFilePath, final FsPermission defaultPerms)
406 throws IOException {
407
408 IOException exception = null;
409 do {
410 try {
411 return FSUtils.create(fs, hbckLockFilePath, defaultPerms, false);
412 } catch (IOException ioe) {
413 LOG.info("Failed to create lock file " + hbckLockFilePath.getName()
414 + ", try=" + (retryCounter.getAttemptTimes() + 1) + " of "
415 + retryCounter.getMaxAttempts());
416 LOG.debug("Failed to create lock file " + hbckLockFilePath.getName(),
417 ioe);
418 try {
419 exception = ioe;
420 retryCounter.sleepUntilNextRetry();
421 } catch (InterruptedException ie) {
422 throw (InterruptedIOException) new InterruptedIOException(
423 "Can't create lock file " + hbckLockFilePath.getName())
424 .initCause(ie);
425 }
426 }
427 } while (retryCounter.shouldRetry());
428
429 throw exception;
430 }
431 }

  /**
   * This method maintains a lock using a file.  If the creation fails we return null.
   *
   * @return FSDataOutputStream for the newly opened lock file, or null if the
   *     lock could not be acquired (most likely another hbck instance holds it)
   */
439 private FSDataOutputStream checkAndMarkRunningHbck() throws IOException {
440 RetryCounter retryCounter = lockFileRetryCounterFactory.create();
441 FileLockCallable callable = new FileLockCallable(retryCounter);
442 ExecutorService executor = Executors.newFixedThreadPool(1);
443 FutureTask<FSDataOutputStream> futureTask = new FutureTask<FSDataOutputStream>(callable);
444 executor.execute(futureTask);
445 final int timeoutInSeconds = getConf().getInt(
446 "hbase.hbck.lockfile.maxwaittime", DEFAULT_WAIT_FOR_LOCK_TIMEOUT);
447 FSDataOutputStream stream = null;
448 try {
449 stream = futureTask.get(timeoutInSeconds, TimeUnit.SECONDS);
450 } catch (ExecutionException ee) {
451 LOG.warn("Encountered exception when opening lock file", ee);
452 } catch (InterruptedException ie) {
453 LOG.warn("Interrupted when opening lock file", ie);
454 Thread.currentThread().interrupt();
455 } catch (TimeoutException exception) {
456
      LOG.warn("Took more than " + timeoutInSeconds + " seconds to obtain the lock file");
458 futureTask.cancel(true);
459 } finally {
460 executor.shutdownNow();
461 }
462 return stream;
463 }

  /**
   * Delete the hbck lock file if this instance created it, retrying on failure.
   */
465 private void unlockHbck() {
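    // compareAndSet guarantees that only one caller (an explicit unlockHbck() or the
    // shutdown hook) actually closes and deletes the lock file.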
466 if (hbckLockCleanup.compareAndSet(true, false)) {
467 RetryCounter retryCounter = lockFileRetryCounterFactory.create();
468 do {
469 try {
470 IOUtils.closeStream(hbckOutFd);
471 FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()),
472 HBCK_LOCK_PATH, true);
473 LOG.info("Finishing hbck");
474 return;
475 } catch (IOException ioe) {
476 LOG.info("Failed to delete " + HBCK_LOCK_PATH + ", try="
477 + (retryCounter.getAttemptTimes() + 1) + " of "
478 + retryCounter.getMaxAttempts());
479 LOG.debug("Failed to delete " + HBCK_LOCK_PATH, ioe);
480 try {
481 retryCounter.sleepUntilNextRetry();
482 } catch (InterruptedException ie) {
483 Thread.currentThread().interrupt();
          LOG.warn("Interrupted while deleting lock file " +
              HBCK_LOCK_PATH);
486 return;
487 }
488 }
489 } while (retryCounter.shouldRetry());
490 }
491 }

  /**
   * To repair region consistency, one must call connect() in order to repair
   * online state.
   */
497 public void connect() throws IOException {
498
499
500 hbckOutFd = checkAndMarkRunningHbck();
501 if (hbckOutFd == null) {
502 setRetCode(-1);
      LOG.error("Another instance of hbck is running, exiting this instance. [If you are sure" +
          " no other instance is running, delete the lock file " +
          HBCK_LOCK_PATH + " and rerun the tool]");
506 throw new IOException("Duplicate hbck - Abort");
507 }
508
509
510 hbckLockCleanup.set(true);
511
512
513
514
515 Runtime.getRuntime().addShutdownHook(new Thread() {
516 @Override
517 public void run() {
518 IOUtils.closeStream(HBaseFsck.this);
519 cleanupHbckZnode();
520 unlockHbck();
521 }
522 });
523
524 LOG.info("Launching hbck");
525
526 connection = (ClusterConnection)ConnectionFactory.createConnection(getConf());
527 admin = connection.getAdmin();
528 meta = connection.getTable(TableName.META_TABLE_NAME);
529 status = admin.getClusterStatus();
530 }
531
532
533
534
535 private void loadDeployedRegions() throws IOException, InterruptedException {
536
537 Collection<ServerName> regionServers = status.getServers();
538 errors.print("Number of live region servers: " + regionServers.size());
539 if (details) {
540 for (ServerName rsinfo: regionServers) {
541 errors.print(" " + rsinfo.getServerName());
542 }
543 }
544
545
546 Collection<ServerName> deadRegionServers = status.getDeadServerNames();
547 errors.print("Number of dead region servers: " + deadRegionServers.size());
548 if (details) {
549 for (ServerName name: deadRegionServers) {
550 errors.print(" " + name);
551 }
552 }
553
554
555 errors.print("Master: " + status.getMaster());
556
557
558 Collection<ServerName> backupMasters = status.getBackupMasters();
559 errors.print("Number of backup masters: " + backupMasters.size());
560 if (details) {
561 for (ServerName name: backupMasters) {
562 errors.print(" " + name);
563 }
564 }
565
566 errors.print("Average load: " + status.getAverageLoad());
567 errors.print("Number of requests: " + status.getRequestsCount());
568 errors.print("Number of regions: " + status.getRegionsCount());
569
570 Map<String, RegionState> rits = status.getRegionsInTransition();
571 errors.print("Number of regions in transition: " + rits.size());
572 if (details) {
573 for (RegionState state: rits.values()) {
574 errors.print(" " + state.toDescriptiveString());
575 }
576 }
577
578
579 processRegionServers(regionServers);
580 }
581
582
583
584
585 private void clearState() {
586
587 fixes = 0;
588 regionInfoMap.clear();
589 emptyRegionInfoQualifiers.clear();
590 disabledTables.clear();
591 errors.clear();
592 tablesInfo.clear();
593 orphanHdfsDirs.clear();
594 skippedRegions.clear();
595 }

  /**
   * This repair method analyzes hbase data in hdfs and repairs it to satisfy
   * the table integrity rules.  HBase doesn't need to be online for this
   * operation to work.
   */
602 public void offlineHdfsIntegrityRepair() throws IOException, InterruptedException {
603
604 if (shouldCheckHdfs() && (shouldFixHdfsOrphans() || shouldFixHdfsHoles()
605 || shouldFixHdfsOverlaps() || shouldFixTableOrphans())) {
      LOG.info("Loading regioninfos from HDFS");
607
608 int maxIterations = getConf().getInt("hbase.hbck.integrityrepair.iterations.max", 3);
609 int curIter = 0;
610 do {
611 clearState();
612
613 restoreHdfsIntegrity();
614 curIter++;
615 } while (fixes > 0 && curIter <= maxIterations);
616
617
618
619 if (curIter > 2) {
620 if (curIter == maxIterations) {
        LOG.warn("Exiting integrity repairs after max " + curIter + " iterations. "
            + "Table integrity may not be fully repaired!");
623 } else {
624 LOG.info("Successfully exiting integrity repairs after " + curIter + " iterations");
625 }
626 }
627 }
628 }

  /**
   * This repair method requires the cluster to be online since it contacts
   * region servers and the masters.  It makes each region's state in HDFS, in
   * hbase:meta, and its deployment consistent.
   *
   * @return if &gt; 0, the number of errors detected; if &lt; 0, an unrecoverable
   *     error occurred; if 0, the cluster is consistent
   */
638 public int onlineConsistencyRepair() throws IOException, KeeperException,
639 InterruptedException {
640 clearState();
641
642
643 loadDeployedRegions();
644
645 recordMetaRegion();
646
647 if (!checkMetaRegion()) {
648 String errorMsg = "hbase:meta table is not consistent. ";
649 if (shouldFixAssignments()) {
650 errorMsg += "HBCK will try fixing it. Rerun once hbase:meta is back to consistent state.";
651 } else {
652 errorMsg += "Run HBCK with proper fix options to fix hbase:meta inconsistency.";
653 }
654 errors.reportError(errorMsg + " Exiting...");
655 return -2;
656 }
657
    LOG.info("Loading regioninfos from the hbase:meta table");
659 boolean success = loadMetaEntries();
660 if (!success) return -1;
661
662
663 reportEmptyMetaCells();
664
665
666 if (shouldFixEmptyMetaCells()) {
667 fixEmptyMetaCells();
668 }
669
670
671 if (!checkMetaOnly) {
672 reportTablesInFlux();
673 }
674
675
676 if (shouldCheckHdfs()) {
677 LOG.info("Loading region directories from HDFS");
678 loadHdfsRegionDirs();
679 LOG.info("Loading region information from HDFS");
680 loadHdfsRegionInfos();
681 }
682
683
684 loadDisabledTables();
685
686
687 fixOrphanTables();
688
689 LOG.info("Checking and fixing region consistency");
690
691
692 checkAndFixConsistency();
693
694
695 checkIntegrity();
696 return errors.getErrorList().size();
697 }

  /**
   * Maintains an ephemeral znode under the master maintenance znode so the
   * active master skips conflicting background work while hbck runs.
   *
   * @return true if creating the znode succeeds, false otherwise
   * @throws IOException if the znode cannot be created after all retries
   */
706 private boolean setMasterInMaintenanceMode() throws IOException {
707 RetryCounter retryCounter = createZNodeRetryCounterFactory.create();
708 hbckEphemeralNodePath = ZKUtil.joinZNode(
709 ZooKeeperWatcher.masterMaintZNode,
710 "hbck-" + Long.toString(EnvironmentEdgeManager.currentTime()));
711 do {
712 try {
713 hbckZodeCreated = ZKUtil.createEphemeralNodeAndWatch(zkw, hbckEphemeralNodePath, null);
714 if (hbckZodeCreated) {
715 break;
716 }
717 } catch (KeeperException e) {
718 if (retryCounter.getAttemptTimes() >= retryCounter.getMaxAttempts()) {
719 throw new IOException("Can't create znode " + hbckEphemeralNodePath, e);
720 }
721
722 }
723
      LOG.warn("Failed to create znode " + hbckEphemeralNodePath + ", try=" +
          (retryCounter.getAttemptTimes() + 1) + " of " + retryCounter.getMaxAttempts());
726
727 try {
728 retryCounter.sleepUntilNextRetry();
729 } catch (InterruptedException ie) {
730 throw (InterruptedIOException) new InterruptedIOException(
731 "Can't create znode " + hbckEphemeralNodePath).initCause(ie);
732 }
733 } while (retryCounter.shouldRetry());
734 return hbckZodeCreated;
735 }
736
737 private void cleanupHbckZnode() {
738 try {
739 if (zkw != null && hbckZodeCreated) {
740 ZKUtil.deleteNode(zkw, hbckEphemeralNodePath);
741 hbckZodeCreated = false;
742 }
743 } catch (KeeperException e) {
744
745 if (!e.code().equals(KeeperException.Code.NONODE)) {
746 LOG.warn("Delete HBCK znode " + hbckEphemeralNodePath + " failed ", e);
747 }
748 }
749 }

  /**
   * Contacts the master and region servers, performs the online consistency and
   * integrity checks (and any requested repairs), and prints a summary.
   *
   * @return 0 on success, non-zero if inconsistencies were found
   */
755 public int onlineHbck()
756 throws IOException, KeeperException, InterruptedException, ServiceException {
757
758 errors.print("Version: " + status.getHBaseVersion());
759 offlineHdfsIntegrityRepair();
760
761
762
763
764
765 if (!setMasterInMaintenanceMode()) {
      LOG.warn("HBCK is running while master is not in maintenance mode, you might see transient "
          + "errors. Please run HBCK multiple times to reduce the chance of transient errors.");
768 }
769
770 onlineConsistencyRepair();
771
772 if (checkRegionBoundaries) {
773 checkRegionBoundaries();
774 }
775
776 offlineReferenceFileRepair();
777 offlineHLinkFileRepair();
778
779 checkAndFixTableLocks();
780
781
782 checkAndFixOrphanedTableZNodes();
783
784
785 cleanupHbckZnode();
786
787
788 unlockHbck();
789
790
791 printTableSummary(tablesInfo);
792 return errors.summarize();
793 }
794
795 public static byte[] keyOnly (byte[] b) {
796 if (b == null)
797 return b;
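    // A KeyValue key starts with a 2-byte row length; copy out just the row bytes.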
798 int rowlength = Bytes.toShort(b, 0);
799 byte[] result = new byte[rowlength];
800 System.arraycopy(b, Bytes.SIZEOF_SHORT, result, 0, rowlength);
801 return result;
802 }
803
804 @Override
805 public void close() throws IOException {
806 try {
807 cleanupHbckZnode();
808 unlockHbck();
809 } catch (Exception io) {
810 LOG.warn(io);
811 } finally {
812 if (zkw != null) {
813 zkw.close();
814 zkw = null;
815 }
816 IOUtils.cleanup(null, admin, meta, connection);
817 }
818 }
819
820 private static class RegionBoundariesInformation {
821 public byte [] regionName;
822 public byte [] metaFirstKey;
823 public byte [] metaLastKey;
824 public byte [] storesFirstKey;
825 public byte [] storesLastKey;
826 @Override
827 public String toString () {
828 return "regionName=" + Bytes.toStringBinary(regionName) +
829 "\nmetaFirstKey=" + Bytes.toStringBinary(metaFirstKey) +
830 "\nmetaLastKey=" + Bytes.toStringBinary(metaLastKey) +
831 "\nstoresFirstKey=" + Bytes.toStringBinary(storesFirstKey) +
832 "\nstoresLastKey=" + Bytes.toStringBinary(storesLastKey);
833 }
834 }
835
836 public void checkRegionBoundaries() {
837 try {
838 ByteArrayComparator comparator = new ByteArrayComparator();
839 List<HRegionInfo> regions = MetaScanner.listAllRegions(getConf(), connection, false);
840 final RegionBoundariesInformation currentRegionBoundariesInformation =
841 new RegionBoundariesInformation();
842 Path hbaseRoot = FSUtils.getRootDir(getConf());
843 for (HRegionInfo regionInfo : regions) {
844 Path tableDir = FSUtils.getTableDir(hbaseRoot, regionInfo.getTable());
845 currentRegionBoundariesInformation.regionName = regionInfo.getRegionName();
846
847
848 Path path = new Path(tableDir, regionInfo.getEncodedName());
849 FileSystem fs = path.getFileSystem(getConf());
850 FileStatus[] files = fs.listStatus(path);
851
852 byte[] storeFirstKey = null;
853 byte[] storeLastKey = null;
854 for (FileStatus file : files) {
855 String fileName = file.getPath().toString();
856 fileName = fileName.substring(fileName.lastIndexOf("/") + 1);
857 if (!fileName.startsWith(".") && !fileName.endsWith("recovered.edits")) {
858 FileStatus[] storeFiles = fs.listStatus(file.getPath());
859
860 for (FileStatus storeFile : storeFiles) {
861 HFile.Reader reader = HFile.createReader(fs, storeFile.getPath(), new CacheConfig(
862 getConf()), getConf());
863 if ((reader.getFirstKey() != null)
864 && ((storeFirstKey == null) || (comparator.compare(storeFirstKey,
865 reader.getFirstKey()) > 0))) {
866 storeFirstKey = reader.getFirstKey();
867 }
868 if ((reader.getLastKey() != null)
869 && ((storeLastKey == null) || (comparator.compare(storeLastKey,
870 reader.getLastKey())) < 0)) {
871 storeLastKey = reader.getLastKey();
872 }
873 reader.close();
874 }
875 }
876 }
877 currentRegionBoundariesInformation.metaFirstKey = regionInfo.getStartKey();
878 currentRegionBoundariesInformation.metaLastKey = regionInfo.getEndKey();
879 currentRegionBoundariesInformation.storesFirstKey = keyOnly(storeFirstKey);
880 currentRegionBoundariesInformation.storesLastKey = keyOnly(storeLastKey);
881 if (currentRegionBoundariesInformation.metaFirstKey.length == 0)
882 currentRegionBoundariesInformation.metaFirstKey = null;
883 if (currentRegionBoundariesInformation.metaLastKey.length == 0)
884 currentRegionBoundariesInformation.metaLastKey = null;
885
886
887
888
889
890
891 boolean valid = true;
892
893 if ((currentRegionBoundariesInformation.storesFirstKey != null)
894 && (currentRegionBoundariesInformation.metaFirstKey != null)) {
895 valid = valid
896 && comparator.compare(currentRegionBoundariesInformation.storesFirstKey,
897 currentRegionBoundariesInformation.metaFirstKey) >= 0;
898 }
899
900 if ((currentRegionBoundariesInformation.storesLastKey != null)
901 && (currentRegionBoundariesInformation.metaLastKey != null)) {
902 valid = valid
903 && comparator.compare(currentRegionBoundariesInformation.storesLastKey,
904 currentRegionBoundariesInformation.metaLastKey) < 0;
905 }
906 if (!valid) {
907 errors.reportError(ERROR_CODE.BOUNDARIES_ERROR, "Found issues with regions boundaries",
908 tablesInfo.get(regionInfo.getTable()));
          LOG.warn("Region's boundaries are not aligned between stores and META for:");
910 LOG.warn(currentRegionBoundariesInformation);
911 }
912 }
913 } catch (IOException e) {
914 LOG.error(e);
915 }
916 }

  /**
   * Iterate through the list of all orphan/invalid regiondirs and adopt each one.
   */
921 private void adoptHdfsOrphans(Collection<HbckInfo> orphanHdfsDirs) throws IOException {
922 for (HbckInfo hi : orphanHdfsDirs) {
923 LOG.info("Attempting to handle orphan hdfs dir: " + hi.getHdfsRegionDir());
924 adoptHdfsOrphan(hi);
925 }

  /**
   * Orphaned regions are regions without a .regioninfo file in them.  We "adopt"
   * these orphans by creating a new region and moving the column families,
   * recovered edits and WALs into the new region dir.  The region's startkey and
   * endkey are determined by looking at all of the hfiles inside the column
   * families to identify the min and max keys.  The resulting region will likely
   * violate table integrity, but that is dealt with later by merging overlapping
   * regions.
   */
937 @SuppressWarnings("deprecation")
938 private void adoptHdfsOrphan(HbckInfo hi) throws IOException {
939 Path p = hi.getHdfsRegionDir();
940 FileSystem fs = p.getFileSystem(getConf());
941 FileStatus[] dirs = fs.listStatus(p);
942 if (dirs == null) {
      LOG.warn("Attempt to adopt orphan hdfs region skipped because no files present in " +
          p + ". This dir could probably be deleted.");
      return;
946 }
947
948 TableName tableName = hi.getTableName();
949 TableInfo tableInfo = tablesInfo.get(tableName);
950 Preconditions.checkNotNull(tableInfo, "Table '" + tableName + "' not present!");
951 HTableDescriptor template = tableInfo.getHTD();
952
953
954 Pair<byte[],byte[]> orphanRegionRange = null;
955 for (FileStatus cf : dirs) {
956 String cfName= cf.getPath().getName();
957
958 if (cfName.startsWith(".") || cfName.equals(HConstants.SPLIT_LOGDIR_NAME)) continue;
959
960 FileStatus[] hfiles = fs.listStatus(cf.getPath());
961 for (FileStatus hfile : hfiles) {
962 byte[] start, end;
963 HFile.Reader hf = null;
964 try {
965 CacheConfig cacheConf = new CacheConfig(getConf());
966 hf = HFile.createReader(fs, hfile.getPath(), cacheConf, getConf());
967 hf.loadFileInfo();
968 KeyValue startKv = KeyValue.createKeyValueFromKey(hf.getFirstKey());
969 start = startKv.getRow();
970 KeyValue endKv = KeyValue.createKeyValueFromKey(hf.getLastKey());
971 end = endKv.getRow();
972 } catch (IOException ioe) {
973 LOG.warn("Problem reading orphan file " + hfile + ", skipping");
974 continue;
975 } catch (NullPointerException ioe) {
976 LOG.warn("Orphan file " + hfile + " is possibly corrupted HFile, skipping");
977 continue;
978 } finally {
979 if (hf != null) {
980 hf.close();
981 }
982 }
983
984
985 if (orphanRegionRange == null) {
986
987 orphanRegionRange = new Pair<byte[], byte[]>(start, end);
988 } else {
989
990
991
992 if (Bytes.compareTo(orphanRegionRange.getFirst(), start) > 0) {
993 orphanRegionRange.setFirst(start);
994 }
995 if (Bytes.compareTo(orphanRegionRange.getSecond(), end) < 0 ) {
996 orphanRegionRange.setSecond(end);
997 }
998 }
999 }
1000 }
1001 if (orphanRegionRange == null) {
1002 LOG.warn("No data in dir " + p + ", sidelining data");
1003 fixes++;
1004 sidelineRegionDir(fs, hi);
1005 return;
1006 }
1007 LOG.info("Min max keys are : [" + Bytes.toString(orphanRegionRange.getFirst()) + ", " +
1008 Bytes.toString(orphanRegionRange.getSecond()) + ")");
1009
1010
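      // Region end keys are exclusive, so append a trailing zero byte to the largest
      // observed key to make sure the last row is covered by the fabricated region.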
1011 HRegionInfo hri = new HRegionInfo(template.getTableName(), orphanRegionRange.getFirst(),
1012 Bytes.add(orphanRegionRange.getSecond(), new byte[1]));
1013 LOG.info("Creating new region : " + hri);
1014 HRegion region = HBaseFsckRepair.createHDFSRegionDir(getConf(), hri, template);
1015 Path target = region.getRegionFileSystem().getRegionDir();
1016
1017
1018 mergeRegionDirs(target, hi);
1019 fixes++;
1020 }

  /**
   * Determines if there are table integrity errors in HDFS.  If there are errors
   * and the appropriate "fix" options are enabled, the method first corrects
   * orphan regions, turning them into legitimate regiondirs, and then reloads to
   * merge potentially overlapping regions.
   *
   * @return number of table integrity errors found
   */
1030 private int restoreHdfsIntegrity() throws IOException, InterruptedException {
1031
1032 LOG.info("Loading HBase regioninfo from HDFS...");
1033 loadHdfsRegionDirs();
1034
1035 int errs = errors.getErrorList().size();
1036
1037 tablesInfo = loadHdfsRegionInfos();
1038 checkHdfsIntegrity(false, false);
1039
1040 if (errors.getErrorList().size() == errs) {
1041 LOG.info("No integrity errors. We are done with this phase. Glorious.");
1042 return 0;
1043 }
1044
1045 if (shouldFixHdfsOrphans() && orphanHdfsDirs.size() > 0) {
1046 adoptHdfsOrphans(orphanHdfsDirs);
1047
1048 }
1049
1050
1051 if (shouldFixHdfsHoles()) {
1052 clearState();
1053 loadHdfsRegionDirs();
1054 tablesInfo = loadHdfsRegionInfos();
1055 tablesInfo = checkHdfsIntegrity(shouldFixHdfsHoles(), false);
1056 }
1057
1058
1059 if (shouldFixHdfsOverlaps()) {
1060
1061 clearState();
1062 loadHdfsRegionDirs();
1063 tablesInfo = loadHdfsRegionInfos();
1064 tablesInfo = checkHdfsIntegrity(false, shouldFixHdfsOverlaps());
1065 }
1066
1067 return errors.getErrorList().size();
1068 }

  /**
   * Scan all the store file names to find any lingering reference files, which
   * refer to non-existent files.  If the "fix" option is enabled, any lingering
   * reference file found is sidelined.
   * <p>
   * A lingering reference file prevents a region from opening and has to be
   * fixed before the cluster can start properly.
   */
1078 private void offlineReferenceFileRepair() throws IOException, InterruptedException {
1079 Configuration conf = getConf();
1080 Path hbaseRoot = FSUtils.getRootDir(conf);
1081 FileSystem fs = hbaseRoot.getFileSystem(conf);
1082 LOG.info("Computing mapping of all store files");
1083 Map<String, Path> allFiles = FSUtils.getTableStoreFilePathMap(fs, hbaseRoot,
1084 new FSUtils.ReferenceFileFilter(fs), executor, errors);
1085 errors.print("");
1086 LOG.info("Validating mapping using HDFS state");
1087 for (Path path: allFiles.values()) {
1088 Path referredToFile = StoreFileInfo.getReferredToFile(path);
1089 if (fs.exists(referredToFile)) continue;
1090
1091
1092 errors.reportError(ERROR_CODE.LINGERING_REFERENCE_HFILE,
1093 "Found lingering reference file " + path);
1094 if (!shouldFixReferenceFiles()) continue;
1095
1096
1097 boolean success = false;
1098 String pathStr = path.toString();

      // A reference file path looks like
      // ${hbase.rootdir}/data/namespace/table_name/region_id/family_name/referred_file.region_name
      // Walk up enough path components so that the sidelined copy keeps the same
      // namespace/table/region/family structure under the sideline directory.
1104 int index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR);
1105 for (int i = 0; index > 0 && i < 5; i++) {
1106 index = pathStr.lastIndexOf(Path.SEPARATOR_CHAR, index - 1);
1107 }
1108 if (index > 0) {
1109 Path rootDir = getSidelineDir();
1110 Path dst = new Path(rootDir, pathStr.substring(index + 1));
1111 fs.mkdirs(dst.getParent());
        LOG.info("Trying to sideline reference file "
            + path + " to " + dst);
1114 setShouldRerun();
1115
1116 success = fs.rename(path, dst);
1117 }
1118 if (!success) {
1119 LOG.error("Failed to sideline reference file " + path);
1120 }
1121 }
1122 }

  /**
   * Scan all the store file names to find any lingering HFileLink files, which
   * refer to non-existent files.  If the "fix" option is enabled, any lingering
   * HFileLink file found is sidelined, along with its back-reference.
   */
1129 private void offlineHLinkFileRepair() throws IOException, InterruptedException {
1130 Configuration conf = getConf();
1131 Path hbaseRoot = FSUtils.getRootDir(conf);
1132 FileSystem fs = hbaseRoot.getFileSystem(conf);
1133 LOG.info("Computing mapping of all link files");
1134 Map<String, Path> allFiles = FSUtils
1135 .getTableStoreFilePathMap(fs, hbaseRoot, new FSUtils.HFileLinkFilter(), executor, errors);
1136 errors.print("");
1137
1138 LOG.info("Validating mapping using HDFS state");
1139 for (Path path : allFiles.values()) {
1140
1141 HFileLink actualLink = HFileLink.buildFromHFileLinkPattern(conf, path);
1142 if (actualLink.exists(fs)) continue;
1143
1144
1145 errors.reportError(ERROR_CODE.LINGERING_HFILELINK, "Found lingering HFileLink " + path);
1146 if (!shouldFixHFileLinks()) continue;
1147
1148
1149 setShouldRerun();
1150
1151
1152
1153
1154 boolean success = sidelineFile(fs, hbaseRoot, path);
1155
1156 if (!success) {
1157 LOG.error("Failed to sideline HFileLink file " + path);
1158 }
1159
1160
1161
1162
1163 Path backRefPath = FileLink.getBackReferencesDir(HFileArchiveUtil
1164 .getStoreArchivePath(conf, HFileLink.getReferencedTableName(path.getName().toString()),
1165 HFileLink.getReferencedRegionName(path.getName().toString()),
1166 path.getParent().getName()),
1167 HFileLink.getReferencedHFileName(path.getName().toString()));
1168 success = sidelineFile(fs, hbaseRoot, backRefPath);
1169
1170 if (!success) {
1171 LOG.error("Failed to sideline HFileLink backreference file " + path);
1172 }
1173 }
1174 }
1175
1176 private boolean sidelineFile(FileSystem fs, Path hbaseRoot, Path path) throws IOException {
1177 URI uri = hbaseRoot.toUri().relativize(path.toUri());
1178 if (uri.isAbsolute()) return false;
1179 String relativePath = uri.getPath();
1180 Path rootDir = getSidelineDir();
1181 Path dst = new Path(rootDir, relativePath);
1182 boolean pathCreated = fs.mkdirs(dst.getParent());
1183 if (!pathCreated) {
1184 LOG.error("Failed to create path: " + dst.getParent());
1185 return false;
1186 }
1187 LOG.info("Trying to sideline file " + path + " to " + dst);
1188 return fs.rename(path, dst);
1189 }
1190
1191
1192
1193
1194 private void reportEmptyMetaCells() {
1195 errors.print("Number of empty REGIONINFO_QUALIFIER rows in hbase:meta: " +
1196 emptyRegionInfoQualifiers.size());
1197 if (details) {
1198 for (Result r: emptyRegionInfoQualifiers) {
1199 errors.print(" " + r);
1200 }
1201 }
1202 }
1203
1204
1205
1206
1207 private void reportTablesInFlux() {
1208 AtomicInteger numSkipped = new AtomicInteger(0);
1209 HTableDescriptor[] allTables = getTables(numSkipped);
1210 errors.print("Number of Tables: " + allTables.length);
1211 if (details) {
1212 if (numSkipped.get() > 0) {
1213 errors.detail("Number of Tables in flux: " + numSkipped.get());
1214 }
1215 for (HTableDescriptor td : allTables) {
1216 errors.detail(" Table: " + td.getTableName() + "\t" +
1217 (td.isReadOnly() ? "ro" : "rw") + "\t" +
1218 (td.isMetaRegion() ? "META" : " ") + "\t" +
1219 " families: " + td.getFamilies().size());
1220 }
1221 }
1222 }
1223
1224 public ErrorReporter getErrors() {
1225 return errors;
1226 }

  /**
   * Read the .regioninfo file from the file system for the given region, if
   * present, and cache the resulting HRegionInfo on the HbckInfo.
   */
1232 private void loadHdfsRegioninfo(HbckInfo hbi) throws IOException {
1233 Path regionDir = hbi.getHdfsRegionDir();
1234 if (regionDir == null) {
1235 if (hbi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
1236
1237 LOG.warn("No HDFS region dir found: " + hbi + " meta=" + hbi.metaEntry);
1238 }
1239 return;
1240 }
1241
1242 if (hbi.hdfsEntry.hri != null) {
1243
1244 return;
1245 }
1246
1247 FileSystem fs = FileSystem.get(getConf());
1248 HRegionInfo hri = HRegionFileSystem.loadRegionInfoFileContent(fs, regionDir);
1249 LOG.debug("HRegionInfo read: " + hri.toString());
1250 hbi.hdfsEntry.hri = hri;
1251 }
1252
1253
1254
1255
1256
1257 public static class RegionRepairException extends IOException {
1258 private static final long serialVersionUID = 1L;
1259 final IOException ioe;
1260 public RegionRepairException(String s, IOException ioe) {
1261 super(s);
1262 this.ioe = ioe;
1263 }
1264 }

  /**
   * Populate hbck info objects from the .regioninfo data loaded from the file
   * system and build the per-table TableInfo map.
   */
1269 private SortedMap<TableName, TableInfo> loadHdfsRegionInfos()
1270 throws IOException, InterruptedException {
1271 tablesInfo.clear();
1272
1273 Collection<HbckInfo> hbckInfos = regionInfoMap.values();
1274
1275
1276 List<WorkItemHdfsRegionInfo> hbis = new ArrayList<WorkItemHdfsRegionInfo>(hbckInfos.size());
1277 List<Future<Void>> hbiFutures;
1278
1279 for (HbckInfo hbi : hbckInfos) {
1280 WorkItemHdfsRegionInfo work = new WorkItemHdfsRegionInfo(hbi, this, errors);
1281 hbis.add(work);
1282 }
1283
1284
1285 hbiFutures = executor.invokeAll(hbis);
1286
1287 for(int i=0; i<hbiFutures.size(); i++) {
1288 WorkItemHdfsRegionInfo work = hbis.get(i);
1289 Future<Void> f = hbiFutures.get(i);
1290 try {
1291 f.get();
1292 } catch(ExecutionException e) {
1293 LOG.warn("Failed to read .regioninfo file for region " +
1294 work.hbi.getRegionNameAsString(), e.getCause());
1295 }
1296 }
1297
1298 Path hbaseRoot = FSUtils.getRootDir(getConf());
1299 FileSystem fs = hbaseRoot.getFileSystem(getConf());
1300
1301 for (HbckInfo hbi: hbckInfos) {
1302
1303 if (hbi.getHdfsHRI() == null) {
1304
1305 continue;
1306 }
1307
1308
1309
1310 TableName tableName = hbi.getTableName();
1311 if (tableName == null) {
1312
1313 LOG.warn("tableName was null for: " + hbi);
1314 continue;
1315 }
1316
1317 TableInfo modTInfo = tablesInfo.get(tableName);
1318 if (modTInfo == null) {
1319
1320 modTInfo = new TableInfo(tableName);
1321 tablesInfo.put(tableName, modTInfo);
1322 try {
1323 HTableDescriptor htd =
1324 FSTableDescriptors.getTableDescriptorFromFs(fs, hbaseRoot, tableName);
1325 modTInfo.htds.add(htd);
1326 } catch (IOException ioe) {
1327 if (!orphanTableDirs.containsKey(tableName)) {
1328 LOG.warn("Unable to read .tableinfo from " + hbaseRoot, ioe);
1329
1330 errors.reportError(ERROR_CODE.NO_TABLEINFO_FILE,
1331 "Unable to read .tableinfo from " + hbaseRoot + "/" + tableName);
1332 Set<String> columns = new HashSet<String>();
1333 orphanTableDirs.put(tableName, getColumnFamilyList(columns, hbi));
1334 }
1335 }
1336 }
1337 if (!hbi.isSkipChecks()) {
1338 modTInfo.addRegionInfo(hbi);
1339 }
1340 }
1341
1342 loadTableInfosForTablesWithNoRegion();
1343 errors.print("");
1344
1345 return tablesInfo;
1346 }
1347
1348
1349
1350
1351
1352
1353
1354
1355 private Set<String> getColumnFamilyList(Set<String> columns, HbckInfo hbi) throws IOException {
1356 Path regionDir = hbi.getHdfsRegionDir();
1357 FileSystem fs = regionDir.getFileSystem(getConf());
1358 FileStatus[] subDirs = fs.listStatus(regionDir, new FSUtils.FamilyDirFilter(fs));
1359 for (FileStatus subdir : subDirs) {
1360 String columnfamily = subdir.getPath().getName();
1361 columns.add(columnfamily);
1362 }
1363 return columns;
1364 }

  /**
   * Fabricate a .tableinfo file for the given table with the correct table name,
   * the supplied column family list, and default properties for both
   * {@link HTableDescriptor} and {@link HColumnDescriptor}.
   *
   * @return false if no column families were supplied, true otherwise
   */
1373 private boolean fabricateTableInfo(FSTableDescriptors fstd, TableName tableName,
1374 Set<String> columns) throws IOException {
1375 if (columns ==null || columns.isEmpty()) return false;
1376 HTableDescriptor htd = new HTableDescriptor(tableName);
    for (String columnfamily : columns) {
      htd.addFamily(new HColumnDescriptor(columnfamily));
1379 }
1380 fstd.createTableDescriptor(htd, true);
1381 return true;
1382 }

  /**
   * Delete hbase:meta rows that have an empty REGIONINFO_QUALIFIER cell.
   */
1388 public void fixEmptyMetaCells() throws IOException {
1389 if (shouldFixEmptyMetaCells() && !emptyRegionInfoQualifiers.isEmpty()) {
1390 LOG.info("Trying to fix empty REGIONINFO_QUALIFIER hbase:meta rows.");
1391 for (Result region : emptyRegionInfoQualifiers) {
1392 deleteMetaRegion(region.getRow());
1393 errors.getErrorList().remove(ERROR_CODE.EMPTY_META_CELL);
1394 }
1395 emptyRegionInfoQualifiers.clear();
1396 }
1397 }

  /**
   * Fix orphan tables by creating a .tableinfo file under the table dir: if a
   * table descriptor is cached, recover the .tableinfo from it; otherwise
   * fabricate a default .tableinfo file from the column families found on disk.
   */
1408 public void fixOrphanTables() throws IOException {
1409 if (shouldFixTableOrphans() && !orphanTableDirs.isEmpty()) {
1410
1411 List<TableName> tmpList = new ArrayList<TableName>();
1412 tmpList.addAll(orphanTableDirs.keySet());
1413 HTableDescriptor[] htds = getHTableDescriptors(tmpList);
1414 Iterator<Entry<TableName, Set<String>>> iter =
1415 orphanTableDirs.entrySet().iterator();
1416 int j = 0;
1417 int numFailedCase = 0;
1418 FSTableDescriptors fstd = new FSTableDescriptors(getConf());
1419 while (iter.hasNext()) {
1420 Entry<TableName, Set<String>> entry =
1421 iter.next();
1422 TableName tableName = entry.getKey();
1423 LOG.info("Trying to fix orphan table error: " + tableName);
1424 if (j < htds.length) {
1425 if (tableName.equals(htds[j].getTableName())) {
1426 HTableDescriptor htd = htds[j];
1427 LOG.info("fixing orphan table: " + tableName + " from cache");
1428 fstd.createTableDescriptor(htd, true);
1429 j++;
1430 iter.remove();
1431 }
1432 } else {
1433 if (fabricateTableInfo(fstd, tableName, entry.getValue())) {
1434 LOG.warn("fixing orphan table: " + tableName + " with a default .tableinfo file");
          LOG.warn("Strongly recommend modifying the HTableDescriptor if necessary for: " + tableName);
1436 iter.remove();
1437 } else {
          LOG.error("Unable to create default .tableinfo for " + tableName
              + " because column family information is missing");
1439 numFailedCase++;
1440 }
1441 }
1442 fixes++;
1443 }
1444
1445 if (orphanTableDirs.isEmpty()) {
1446
1447
1448 setShouldRerun();
        LOG.warn("Strongly recommend re-running hbck manually after all orphanTableDirs have been fixed");
1450 } else if (numFailedCase > 0) {
1451 LOG.error("Failed to fix " + numFailedCase
1452 + " OrphanTables with default .tableinfo files");
1453 }
1454
1455 }
1456
1457 orphanTableDirs.clear();
1458
1459 }

  /**
   * Creates a fresh, empty hbase:meta region on HDFS.  This borrows code from
   * the MasterFileSystem bootstrap; be sure to close the returned region when done.
   */
1466 private HRegion createNewMeta() throws IOException {
1467 Path rootdir = FSUtils.getRootDir(getConf());
1468 Configuration c = getConf();
1469 HRegionInfo metaHRI = new HRegionInfo(HRegionInfo.FIRST_META_REGIONINFO);
1470 HTableDescriptor metaDescriptor = new FSTableDescriptors(c).get(TableName.META_TABLE_NAME);
1471 MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, false);
1472 HRegion meta = HRegion.createHRegion(metaHRI, rootdir, c, metaDescriptor);
1473 MasterFileSystem.setInfoFamilyCachingForMeta(metaDescriptor, true);
1474 return meta;
1475 }

  /**
   * Generate the set of puts to add to the new meta.  This expects the tables to
   * be clean, with no overlaps or holes.  If there are any problems it returns null.
   *
   * @return a list of puts to apply in bulk, or null if the tables have problems
   */
1483 private ArrayList<Put> generatePuts(
1484 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1485 ArrayList<Put> puts = new ArrayList<Put>();
1486 boolean hasProblems = false;
1487 for (Entry<TableName, TableInfo> e : tablesInfo.entrySet()) {
1488 TableName name = e.getKey();
1489
1490
1491 if (name.compareTo(TableName.META_TABLE_NAME) == 0) {
1492 continue;
1493 }
1494
1495 TableInfo ti = e.getValue();
1496 for (Entry<byte[], Collection<HbckInfo>> spl : ti.sc.getStarts().asMap()
1497 .entrySet()) {
1498 Collection<HbckInfo> his = spl.getValue();
1499 int sz = his.size();
1500 if (sz != 1) {
1501
1502 LOG.error("Split starting at " + Bytes.toStringBinary(spl.getKey())
1503 + " had " + sz + " regions instead of exactly 1." );
1504 hasProblems = true;
1505 continue;
1506 }
1507
1508
1509 HbckInfo hi = his.iterator().next();
1510 HRegionInfo hri = hi.getHdfsHRI();
1511 Put p = MetaTableAccessor.makePutFromRegionInfo(hri);
1512 puts.add(p);
1513 }
1514 }
1515 return hasProblems ? null : puts;
1516 }
1517
1518
1519
1520
1521 private void suggestFixes(
1522 SortedMap<TableName, TableInfo> tablesInfo) throws IOException {
1523 logParallelMerge();
1524 for (TableInfo tInfo : tablesInfo.values()) {
1525 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1526 tInfo.checkRegionChain(handler);
1527 }

  /**
   * Rebuilds hbase:meta from the information in HDFS.  Depends on the
   * configuration passed into the hbck constructor to point at a particular
   * filesystem and root directory.
   *
   * @return true if successful, false if the attempt failed
   */
1537 public boolean rebuildMeta(boolean fix) throws IOException,
1538 InterruptedException {
1539
1540
1541
1542
1543
1544 LOG.info("Loading HBase regioninfo from HDFS...");
1545 loadHdfsRegionDirs();
1546
1547 int errs = errors.getErrorList().size();
1548 tablesInfo = loadHdfsRegionInfos();
1549 checkHdfsIntegrity(false, false);
1550
1551
1552 if (errors.getErrorList().size() != errs) {
1553
1554 while(true) {
1555 fixes = 0;
1556 suggestFixes(tablesInfo);
1557 errors.clear();
1558 loadHdfsRegionInfos();
1559 checkHdfsIntegrity(shouldFixHdfsHoles(), shouldFixHdfsOverlaps());
1560
1561 int errCount = errors.getErrorList().size();
1562
1563 if (fixes == 0) {
1564 if (errCount > 0) {
1565 return false;
1566 } else {
1567 break;
1568 }
1569 }
1570 }
1571 }
1572
1573
    LOG.info("HDFS regioninfos seem good. Sidelining old hbase:meta");
1575 Path backupDir = sidelineOldMeta();
1576
1577 LOG.info("Creating new hbase:meta");
1578 HRegion meta = createNewMeta();
1579
1580
1581 List<Put> puts = generatePuts(tablesInfo);
1582 if (puts == null) {
1583 LOG.fatal("Problem encountered when creating new hbase:meta entries. " +
1584 "You may need to restore the previously sidelined hbase:meta");
1585 return false;
1586 }
1587 meta.batchMutate(puts.toArray(new Put[puts.size()]));
1588 HRegion.closeHRegion(meta);
1589 LOG.info("Success! hbase:meta table rebuilt.");
1590 LOG.info("Old hbase:meta is moved into " + backupDir);
1591 return true;
1592 }
1593
1594
1595
1596
1597 private void logParallelMerge() {
1598 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
1599 LOG.info("Handling overlap merges in parallel. set hbasefsck.overlap.merge.parallel to" +
1600 " false to run serially.");
1601 } else {
1602 LOG.info("Handling overlap merges serially. set hbasefsck.overlap.merge.parallel to" +
1603 " true to run in parallel.");
1604 }
1605 }
1606
1607 private SortedMap<TableName, TableInfo> checkHdfsIntegrity(boolean fixHoles,
1608 boolean fixOverlaps) throws IOException {
1609 LOG.info("Checking HBase region split map from HDFS data...");
1610 logParallelMerge();
1611 for (TableInfo tInfo : tablesInfo.values()) {
1612 TableIntegrityErrorHandler handler;
1613 if (fixHoles || fixOverlaps) {
1614 handler = tInfo.new HDFSIntegrityFixer(tInfo, errors, getConf(),
1615 fixHoles, fixOverlaps);
1616 } else {
1617 handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
1618 }
1619 if (!tInfo.checkRegionChain(handler)) {
1620
1621 errors.report("Found inconsistency in table " + tInfo.getName());
1622 }
1623 }
1624 return tablesInfo;
1625 }
1626
1627 private Path getSidelineDir() throws IOException {
1628 if (sidelineDir == null) {
1629 Path hbaseDir = FSUtils.getRootDir(getConf());
1630 Path hbckDir = new Path(hbaseDir, HConstants.HBCK_SIDELINEDIR_NAME);
1631 sidelineDir = new Path(hbckDir, hbaseDir.getName() + "-"
1632 + startMillis);
1633 }
1634 return sidelineDir;
1635 }
1636
1637
1638
1639
1640 Path sidelineRegionDir(FileSystem fs, HbckInfo hi) throws IOException {
1641 return sidelineRegionDir(fs, null, hi);
1642 }

  /**
   * Sideline a region dir (instead of deleting it).
   *
   * @param parentDir if specified, the region is sidelined to a folder like
   *     {@literal .../parentDir/<table name>/<region name>} so that similar regions
   *     sidelined together can be bulk loaded back later; ignored if null
   */
1652 Path sidelineRegionDir(FileSystem fs,
1653 String parentDir, HbckInfo hi) throws IOException {
1654 TableName tableName = hi.getTableName();
1655 Path regionDir = hi.getHdfsRegionDir();
1656
1657 if (!fs.exists(regionDir)) {
1658 LOG.warn("No previous " + regionDir + " exists. Continuing.");
1659 return null;
1660 }
1661
1662 Path rootDir = getSidelineDir();
1663 if (parentDir != null) {
1664 rootDir = new Path(rootDir, parentDir);
1665 }
1666 Path sidelineTableDir= FSUtils.getTableDir(rootDir, tableName);
1667 Path sidelineRegionDir = new Path(sidelineTableDir, regionDir.getName());
1668 fs.mkdirs(sidelineRegionDir);
1669 boolean success = false;
1670 FileStatus[] cfs = fs.listStatus(regionDir);
1671 if (cfs == null) {
1672 LOG.info("Region dir is empty: " + regionDir);
1673 } else {
1674 for (FileStatus cf : cfs) {
1675 Path src = cf.getPath();
1676 Path dst = new Path(sidelineRegionDir, src.getName());
1677 if (fs.isFile(src)) {
1678
1679 success = fs.rename(src, dst);
1680 if (!success) {
1681 String msg = "Unable to rename file " + src + " to " + dst;
1682 LOG.error(msg);
1683 throw new IOException(msg);
1684 }
1685 continue;
1686 }
1687
1688
1689 fs.mkdirs(dst);
1690
1691 LOG.info("Sidelining files from " + src + " into containing region " + dst);
1692
1693
1694
1695
1696 FileStatus[] hfiles = fs.listStatus(src);
1697 if (hfiles != null && hfiles.length > 0) {
1698 for (FileStatus hfile : hfiles) {
1699 success = fs.rename(hfile.getPath(), dst);
1700 if (!success) {
1701 String msg = "Unable to rename file " + src + " to " + dst;
1702 LOG.error(msg);
1703 throw new IOException(msg);
1704 }
1705 }
1706 }
1707 LOG.debug("Sideline directory contents:");
1708 debugLsr(sidelineRegionDir);
1709 }
1710 }
1711
1712 LOG.info("Removing old region dir: " + regionDir);
1713 success = fs.delete(regionDir, true);
1714 if (!success) {
1715 String msg = "Unable to delete dir " + regionDir;
1716 LOG.error(msg);
1717 throw new IOException(msg);
1718 }
1719 return sidelineRegionDir;
1720 }
1721
1722
1723
1724
1725 void sidelineTable(FileSystem fs, TableName tableName, Path hbaseDir,
1726 Path backupHbaseDir) throws IOException {
1727 Path tableDir = FSUtils.getTableDir(hbaseDir, tableName);
1728 if (fs.exists(tableDir)) {
1729 Path backupTableDir= FSUtils.getTableDir(backupHbaseDir, tableName);
1730 fs.mkdirs(backupTableDir.getParent());
1731 boolean success = fs.rename(tableDir, backupTableDir);
1732 if (!success) {
1733 throw new IOException("Failed to move " + tableName + " from "
1734 + tableDir + " to " + backupTableDir);
1735 }
1736 } else {
1737 LOG.info("No previous " + tableName + " exists. Continuing.");
1738 }
1739 }
1740
1741
1742
1743
1744 Path sidelineOldMeta() throws IOException {
1745
1746 Path hbaseDir = FSUtils.getRootDir(getConf());
1747 FileSystem fs = hbaseDir.getFileSystem(getConf());
1748 Path backupDir = getSidelineDir();
1749 fs.mkdirs(backupDir);
1750
1751 try {
1752 sidelineTable(fs, TableName.META_TABLE_NAME, hbaseDir, backupDir);
1753 } catch (IOException e) {
1754 LOG.fatal("... failed to sideline meta. Currently in inconsistent state. To restore "
1755 + "try to rename hbase:meta in " + backupDir.getName() + " to "
1756 + hbaseDir.getName() + ".", e);
1757 throw e;
1758 }
1759 return backupDir;
1760 }

  /**
   * Load the set of disabled or disabling tables from ZooKeeper into the local
   * disabledTables set.
   */
1767 private void loadDisabledTables()
1768 throws ZooKeeperConnectionException, IOException {
1769 HConnectionManager.execute(new HConnectable<Void>(getConf()) {
1770 @Override
1771 public Void connect(HConnection connection) throws IOException {
1772 try {
1773 for (TableName tableName :
1774 ZKTableStateClientSideReader.getDisabledOrDisablingTables(zkw)) {
1775 disabledTables.add(tableName);
1776 }
1777 } catch (KeeperException ke) {
1778 throw new IOException(ke);
1779 } catch (InterruptedException e) {
1780 throw new InterruptedIOException();
1781 }
1782 return null;
1783 }
1784 });
1785 }
1786
1787
1788
1789
1790 private boolean isTableDisabled(HRegionInfo regionInfo) {
1791 return disabledTables.contains(regionInfo.getTable());
1792 }

  /**
   * Scan HDFS for all regions, recording their information into regionInfoMap.
   * Also verifies that the hbase.version file exists, recreating it if requested.
   */
1798 public void loadHdfsRegionDirs() throws IOException, InterruptedException {
1799 Path rootDir = FSUtils.getRootDir(getConf());
1800 FileSystem fs = rootDir.getFileSystem(getConf());
1801
1802
1803 List<FileStatus> tableDirs = Lists.newArrayList();
1804
1805 boolean foundVersionFile = fs.exists(new Path(rootDir, HConstants.VERSION_FILE_NAME));
1806
1807 List<Path> paths = FSUtils.getTableDirs(fs, rootDir);
1808 for (Path path : paths) {
1809 TableName tableName = FSUtils.getTableName(path);
1810 if ((!checkMetaOnly &&
1811 isTableIncluded(tableName)) ||
1812 tableName.equals(TableName.META_TABLE_NAME)) {
1813 tableDirs.add(fs.getFileStatus(path));
1814 }
1815 }
1816
1817
1818 if (!foundVersionFile) {
1819 errors.reportError(ERROR_CODE.NO_VERSION_FILE,
1820 "Version file does not exist in root dir " + rootDir);
1821 if (shouldFixVersionFile()) {
1822 LOG.info("Trying to create a new " + HConstants.VERSION_FILE_NAME
1823 + " file.");
1824 setShouldRerun();
1825 FSUtils.setVersion(fs, rootDir, getConf().getInt(
1826 HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000), getConf().getInt(
1827 HConstants.VERSION_FILE_WRITE_ATTEMPTS,
1828 HConstants.DEFAULT_VERSION_FILE_WRITE_ATTEMPTS));
1829 }
1830 }
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840 for (FileStatus tableDir : tableDirs) {
1841 LOG.debug("Loading region dirs from " +tableDir.getPath());
1842 WorkItemHdfsDir item = new WorkItemHdfsDir(fs, errors, tableDir);
1843 try {
1844 item.call();
1845 } catch (ExecutionException e) {
1846 LOG.warn("Could not completely load table dir " +
1847 tableDir.getPath(), e.getCause());
1848 }
1849 }
1850 errors.print("");
1851 }

  /**
   * Record the locations of the hbase:meta region replicas in regionInfoMap.
   */
1856 private boolean recordMetaRegion() throws IOException {
1857 RegionLocations rl = ((ClusterConnection)connection).locateRegion(TableName.META_TABLE_NAME,
1858 HConstants.EMPTY_START_ROW, false, false);
1859 if (rl == null) {
1860 errors.reportError(ERROR_CODE.NULL_META_REGION,
1861 "META region or some of its attributes are null.");
1862 return false;
1863 }
1864 for (HRegionLocation metaLocation : rl.getRegionLocations()) {
1865
1866 if (metaLocation == null || metaLocation.getRegionInfo() == null ||
1867 metaLocation.getHostname() == null) {
1868 errors.reportError(ERROR_CODE.NULL_META_REGION,
1869 "META region or some of its attributes are null.");
1870 return false;
1871 }
1872 ServerName sn = metaLocation.getServerName();
1873 MetaEntry m = new MetaEntry(metaLocation.getRegionInfo(), sn, EnvironmentEdgeManager.currentTime());
1874 HbckInfo hbckInfo = regionInfoMap.get(metaLocation.getRegionInfo().getEncodedName());
1875 if (hbckInfo == null) {
1876 regionInfoMap.put(metaLocation.getRegionInfo().getEncodedName(), new HbckInfo(m));
1877 } else {
1878 hbckInfo.metaEntry = m;
1879 }
1880 }
1881 return true;
1882 }
1883
1884 private ZooKeeperWatcher createZooKeeperWatcher() throws IOException {
1885 return new ZooKeeperWatcher(getConf(), "hbase Fsck", new Abortable() {
1886 @Override
1887 public void abort(String why, Throwable e) {
1888 LOG.error(why, e);
1889 System.exit(1);
1890 }
1891
1892 @Override
1893 public boolean isAborted() {
1894 return false;
1895 }
1896
1897 });
1898 }

  /**
   * Contact each regionserver in parallel and fetch metadata about the regions
   * it hosts.
   *
   * @param regionServerList the list of region servers to connect to
   */
1906 void processRegionServers(Collection<ServerName> regionServerList)
1907 throws IOException, InterruptedException {
1908
1909 List<WorkItemRegion> workItems = new ArrayList<WorkItemRegion>(regionServerList.size());
1910 List<Future<Void>> workFutures;
1911
1912
1913 for (ServerName rsinfo: regionServerList) {
1914 workItems.add(new WorkItemRegion(this, rsinfo, errors, connection));
1915 }
1916
1917 workFutures = executor.invokeAll(workItems);
1918
1919 for(int i=0; i<workFutures.size(); i++) {
1920 WorkItemRegion item = workItems.get(i);
1921 Future<Void> f = workFutures.get(i);
1922 try {
1923 f.get();
1924 } catch(ExecutionException e) {
1925 LOG.warn("Could not process regionserver " + item.rsinfo.getHostAndPort(),
1926 e.getCause());
1927 }
1928 }
1929 }
1930
1931
1932
1933
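/**
 * Check consistency of all regions that have been found in previous phases,
 * checking primary replicas first and then the non-default replicas.
 */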
1934 private void checkAndFixConsistency()
1935 throws IOException, KeeperException, InterruptedException {
1936
1937
1938 List<CheckRegionConsistencyWorkItem> workItems =
1939 new ArrayList<CheckRegionConsistencyWorkItem>(regionInfoMap.size());
1940 for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
1941 if (e.getValue().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
1942 workItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
1943 }
1944 }
1945 checkRegionConsistencyConcurrently(workItems);
1946
1947 boolean prevHdfsCheck = shouldCheckHdfs();
1948 setCheckHdfs(false);
1949
1950
1951 List<CheckRegionConsistencyWorkItem> replicaWorkItems =
1952 new ArrayList<CheckRegionConsistencyWorkItem>(regionInfoMap.size());
1953 for (java.util.Map.Entry<String, HbckInfo> e: regionInfoMap.entrySet()) {
1954 if (e.getValue().getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
1955 replicaWorkItems.add(new CheckRegionConsistencyWorkItem(e.getKey(), e.getValue()));
1956 }
1957 }
1958 checkRegionConsistencyConcurrently(replicaWorkItems);
1959 setCheckHdfs(prevHdfsCheck);
1960
1961
1962
1963
1964
1965 int terminateThreshold = getConf().getInt("hbase.hbck.skipped.regions.limit", 0);
1966 int numOfSkippedRegions = skippedRegions.size();
1967 if (numOfSkippedRegions > 0 && numOfSkippedRegions > terminateThreshold) {
1968 throw new IOException(numOfSkippedRegions
1969 + " region(s) could not be checked or repaired. See logs for detail.");
1970 }
1971 }
1972
1973
1974
1975
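/**
 * Check consistency of all regions using multiple threads concurrently.
 */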
1976 private void checkRegionConsistencyConcurrently(
1977 final List<CheckRegionConsistencyWorkItem> workItems)
1978 throws IOException, KeeperException, InterruptedException {
1979 if (workItems.isEmpty()) {
1980 return;
1981 }
1982
1983 List<Future<Void>> workFutures = executor.invokeAll(workItems);
1984 for (Future<Void> f : workFutures) {
1985 try {
1986 f.get();
1987 } catch (ExecutionException e1) {
1988 LOG.warn("Could not check region consistency", e1.getCause());
1989 if (e1.getCause() instanceof IOException) {
1990 throw (IOException)e1.getCause();
1991 } else if (e1.getCause() instanceof KeeperException) {
1992 throw (KeeperException)e1.getCause();
1993 } else if (e1.getCause() instanceof InterruptedException) {
1994 throw (InterruptedException)e1.getCause();
1995 } else {
1996 throw new IOException(e1.getCause());
1997 }
1998 }
1999 }
2000 }
2001
2002 class CheckRegionConsistencyWorkItem implements Callable<Void> {
2003 private final String key;
2004 private final HbckInfo hbi;
2005
2006 CheckRegionConsistencyWorkItem(String key, HbckInfo hbi) {
2007 this.key = key;
2008 this.hbi = hbi;
2009 }
2010
2011 @Override
2012 public synchronized Void call() throws Exception {
2013 try {
2014 checkRegionConsistency(key, hbi);
2015 } catch (Exception e) {
2016
2017
2018 LOG.warn("Unable to complete the check or repair of region '" + hbi.getRegionNameAsString()
2019 + "'.", e);
2020 if (hbi.getHdfsHRI().isMetaRegion()) {
2021 throw e;
2022 }
2023 LOG.warn("Skip region '" + hbi.getRegionNameAsString() + "'");
2024 addSkippedRegion(hbi);
2025 }
2026 return null;
2027 }
2028 }
2029
2030 private void addSkippedRegion(final HbckInfo hbi) {
2031 Set<String> skippedRegionNames = skippedRegions.get(hbi.getTableName());
2032 if (skippedRegionNames == null) {
2033 skippedRegionNames = new HashSet<String>();
2034 }
2035 skippedRegionNames.add(hbi.getRegionNameAsString());
2036 skippedRegions.put(hbi.getTableName(), skippedRegionNames);
2037 }
2038
2039 private void preCheckPermission() throws IOException, AccessDeniedException {
2040 if (shouldIgnorePreCheckPermission()) {
2041 return;
2042 }
2043
2044 Path hbaseDir = FSUtils.getRootDir(getConf());
2045 FileSystem fs = hbaseDir.getFileSystem(getConf());
2046 UserProvider userProvider = UserProvider.instantiate(getConf());
2047 UserGroupInformation ugi = userProvider.getCurrent().getUGI();
2048 FileStatus[] files = fs.listStatus(hbaseDir);
2049 for (FileStatus file : files) {
2050 try {
2051 FSUtils.checkAccess(ugi, file, FsAction.WRITE);
2052 } catch (AccessDeniedException ace) {
2053 LOG.warn("Got AccessDeniedException when checking permissions", ace);
2054 errors.reportError(ERROR_CODE.WRONG_USAGE, "Current user " + ugi.getUserName()
2055 + " does not have write perms to " + file.getPath()
2056 + ". Please rerun hbck as hdfs user " + file.getOwner());
2057 throw ace;
2058 }
2059 }
2060 }
2061
2062
2063
2064
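/**
 * Deletes the row for the given region from hbase:meta.
 */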
2065 private void deleteMetaRegion(HbckInfo hi) throws IOException {
2066 deleteMetaRegion(hi.metaEntry.getRegionName());
2067 }
2068
2069
2070
2071
2072 private void deleteMetaRegion(byte[] metaKey) throws IOException {
2073 Delete d = new Delete(metaKey);
2074 meta.delete(d);
2075 LOG.info("Deleted " + Bytes.toString(metaKey) + " from META" );
2076 }
2077
2078
2079
2080
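/**
 * Reset the split parent region info in hbase:meta: remove the SPLITA/SPLITB
 * columns and clear the offline and split flags.
 */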
2081 private void resetSplitParent(HbckInfo hi) throws IOException {
2082 RowMutations mutations = new RowMutations(hi.metaEntry.getRegionName());
2083 Delete d = new Delete(hi.metaEntry.getRegionName());
2084 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITA_QUALIFIER);
2085 d.deleteColumn(HConstants.CATALOG_FAMILY, HConstants.SPLITB_QUALIFIER);
2086 mutations.add(d);
2087
2088 HRegionInfo hri = new HRegionInfo(hi.metaEntry);
2089 hri.setOffline(false);
2090 hri.setSplit(false);
2091 Put p = MetaTableAccessor.makePutFromRegionInfo(hri);
2092 mutations.add(p);
2093
2094 meta.mutateRow(mutations);
2095 LOG.info("Reset split parent " + hi.metaEntry.getRegionNameAsString() + " in META" );
2096 }
2097
2098
2099
2100
2101
2102
2103
2104
2105
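/**
 * Permanently offline a region that should not be alive.  If the master does
 * not support the offline method, fall back to unassigning the region, in
 * which case the HMaster should be restarted after repairs.
 */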
2106 private void offline(byte[] regionName) throws IOException {
2107 String regionString = Bytes.toStringBinary(regionName);
2108 if (!rsSupportsOffline) {
2109 LOG.warn("Using unassign region " + regionString
2110 + " instead of using offline method, you should"
2111 + " restart HMaster after these repairs");
2112 admin.unassign(regionName, true);
2113 return;
2114 }
2115
2116
2117 try {
2118 LOG.info("Offlining region " + regionString);
2119 admin.offline(regionName);
2120 } catch (IOException ioe) {
2121 String notFoundMsg = "java.lang.NoSuchMethodException: " +
2122 "org.apache.hadoop.hbase.master.HMaster.offline([B)";
2123 if (ioe.getMessage().contains(notFoundMsg)) {
2124 LOG.warn("Using unassign region " + regionString
2125 + " instead of using offline method, you should"
2126 + " restart HMaster after these repairs");
2127 rsSupportsOffline = false;
2128 admin.unassign(regionName, true);
2129 return;
2130 }
2131 throw ioe;
2132 }
2133 }
2134
2135 private void undeployRegions(HbckInfo hi) throws IOException, InterruptedException {
2136 undeployRegionsForHbi(hi);
2137
2138 if (hi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
2139 return;
2140 }
2141 int numReplicas = admin.getTableDescriptor(hi.getTableName()).getRegionReplication();
2142 for (int i = 1; i < numReplicas; i++) {
2143 if (hi.getPrimaryHRIForDeployedReplica() == null) continue;
2144 HRegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica(
2145 hi.getPrimaryHRIForDeployedReplica(), i);
2146 HbckInfo h = regionInfoMap.get(hri.getEncodedName());
2147 if (h != null) {
2148 undeployRegionsForHbi(h);
2149
2150
2151 h.setSkipChecks(true);
2152 }
2153 }
2154 }
2155
2156 private void undeployRegionsForHbi(HbckInfo hi) throws IOException, InterruptedException {
2157 for (OnlineEntry rse : hi.deployedEntries) {
2158 LOG.debug("Undeploy region " + rse.hri + " from " + rse.hsa);
2159 try {
2160 HBaseFsckRepair.closeRegionSilentlyAndWait(connection, rse.hsa, rse.hri);
2161 offline(rse.hri.getRegionName());
2162 } catch (IOException ioe) {
2163 LOG.warn("Got exception when attempting to offline region "
2164 + Bytes.toString(rse.hri.getRegionName()), ioe);
2165 }
2166 }
2167 }
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
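/**
 * Attempts to undeploy a region from a region server based on the information
 * in hbase:meta.  Any operation that modifies the file system should make sure
 * the corresponding region is not deployed to prevent data races.
 */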
2181 private void closeRegion(HbckInfo hi) throws IOException, InterruptedException {
2182 if (hi.metaEntry == null && hi.hdfsEntry == null) {
2183 undeployRegions(hi);
2184 return;
2185 }
2186
2187
2188 Get get = new Get(hi.getRegionName());
2189 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
2190 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
2191 get.addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER);
2192
2193 if (hi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2194 int numReplicas = admin.getTableDescriptor(hi.getTableName()).getRegionReplication();
2195 for (int i = 0; i < numReplicas; i++) {
2196 get.addColumn(HConstants.CATALOG_FAMILY, MetaTableAccessor.getServerColumn(i));
2197 get.addColumn(HConstants.CATALOG_FAMILY, MetaTableAccessor.getStartCodeColumn(i));
2198 }
2199 }
2200 Result r = meta.get(get);
2201 RegionLocations rl = MetaTableAccessor.getRegionLocations(r);
2202 if (rl == null) {
2203 LOG.warn("Unable to close region " + hi.getRegionNameAsString() +
2204 " since hbase:meta does not have a handle to reach it");
2205 return;
2206 }
2207 for (HRegionLocation h : rl.getRegionLocations()) {
2208 ServerName serverName = h.getServerName();
2209 if (serverName == null) {
2210 errors.reportError("Unable to close region "
2211 + hi.getRegionNameAsString() + " because hbase:meta does not "
2212 + "have a handle to reach it.");
2213 continue;
2214 }
2215 HRegionInfo hri = h.getRegionInfo();
2216 if (hri == null) {
2217 LOG.warn("Unable to close region " + hi.getRegionNameAsString()
2218 + " because hbase:meta had invalid or missing "
2219 + HConstants.CATALOG_FAMILY_STR + ":"
2220 + Bytes.toString(HConstants.REGIONINFO_QUALIFIER)
2221 + " qualifier value.");
2222 continue;
2223 }
2224
2225 HBaseFsckRepair.closeRegionSilentlyAndWait(connection, serverName, hri);
2226 }
2227 }
2228
2229 private void tryAssignmentRepair(HbckInfo hbi, String msg) throws IOException,
2230 KeeperException, InterruptedException {
2231
2232 if (shouldFixAssignments()) {
2233 errors.print(msg);
2234 undeployRegions(hbi);
2235 setShouldRerun();
2236 HRegionInfo hri = hbi.getHdfsHRI();
2237 if (hri == null) {
2238 hri = hbi.metaEntry;
2239 }
2240 HBaseFsckRepair.fixUnassigned(admin, hri);
2241 HBaseFsckRepair.waitUntilAssigned(admin, hri);
2242
2243
2244 if (hbi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) return;
2245 int replicationCount = admin.getTableDescriptor(hri.getTable()).getRegionReplication();
2246 for (int i = 1; i < replicationCount; i++) {
2247 hri = RegionReplicaUtil.getRegionInfoForReplica(hri, i);
2248 HbckInfo h = regionInfoMap.get(hri.getEncodedName());
2249 if (h != null) {
2250 undeployRegions(h);
2251
2252
2253 h.setSkipChecks(true);
2254 }
2255 HBaseFsckRepair.fixUnassigned(admin, hri);
2256 HBaseFsckRepair.waitUntilAssigned(admin, hri);
2257 }
2258
2259 }
2260 }
2261
2262
2263
2264
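/**
 * Check a single region for consistency between hbase:meta, HDFS and its
 * deployment, and repair it if fixes are enabled.
 */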
2265 private void checkRegionConsistency(final String key, final HbckInfo hbi)
2266 throws IOException, KeeperException, InterruptedException {
2267
2268 if (hbi.isSkipChecks()) return;
2269 String descriptiveName = hbi.toString();
2270 boolean inMeta = hbi.metaEntry != null;
2271
2272 boolean inHdfs = !shouldCheckHdfs() || hbi.getHdfsRegionDir() != null;
2273 boolean hasMetaAssignment = inMeta && hbi.metaEntry.regionServer != null;
2274 boolean isDeployed = !hbi.deployedOn.isEmpty();
2275 boolean isMultiplyDeployed = hbi.deployedOn.size() > 1;
2276 boolean deploymentMatchesMeta =
2277 hasMetaAssignment && isDeployed && !isMultiplyDeployed &&
2278 hbi.metaEntry.regionServer.equals(hbi.deployedOn.get(0));
2279 boolean splitParent =
2280 (hbi.metaEntry == null) ? false : hbi.metaEntry.isSplit() && hbi.metaEntry.isOffline();
2281 boolean shouldBeDeployed = inMeta && !isTableDisabled(hbi.metaEntry);
2282 boolean recentlyModified = inHdfs &&
2283 hbi.getModTime() + timelag > EnvironmentEdgeManager.currentTime();
2284
2285
2286 if (hbi.containsOnlyHdfsEdits()) {
2287 return;
2288 }
2289 if (inMeta && inHdfs && isDeployed && deploymentMatchesMeta && shouldBeDeployed) {
2290 return;
2291 } else if (inMeta && inHdfs && !shouldBeDeployed && !isDeployed) {
2292 LOG.info("Region " + descriptiveName + " is in META, and in a disabled " +
2293 "table that is not deployed");
2294 return;
2295 } else if (recentlyModified) {
2296 LOG.warn("Region " + descriptiveName + " was recently modified -- skipping");
2297 return;
2298 }
2299
2300 else if (!inMeta && !inHdfs && !isDeployed) {
2301
2302 assert false : "Entry for region with no data";
2303 } else if (!inMeta && !inHdfs && isDeployed) {
2304 errors.reportError(ERROR_CODE.NOT_IN_META_HDFS, "Region "
2305 + descriptiveName + ", key=" + key + ", not on HDFS or in hbase:meta but " +
2306 "deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2307 if (shouldFixAssignments()) {
2308 undeployRegions(hbi);
2309 }
2310
2311 } else if (!inMeta && inHdfs && !isDeployed) {
2312 if (hbi.isMerged()) {
2313
2314
2315 hbi.setSkipChecks(true);
2316 LOG.info("Region " + descriptiveName
2317 + " was merged recently; its file(s) will be cleaned by CatalogJanitor later");
2318 return;
2319 }
2320 errors.reportError(ERROR_CODE.NOT_IN_META_OR_DEPLOYED, "Region "
2321 + descriptiveName + " on HDFS, but not listed in hbase:meta " +
2322 "or deployed on any region server");
2323
2324 if (shouldFixMeta()) {
2325 if (!hbi.isHdfsRegioninfoPresent()) {
2326 LOG.error("Region " + hbi.getHdfsHRI() + " could have been repaired"
2327 + " in table integrity repair phase if -fixHdfsOrphans was" +
2328 " used.");
2329 return;
2330 }
2331
2332 HRegionInfo hri = hbi.getHdfsHRI();
2333 TableInfo tableInfo = tablesInfo.get(hri.getTable());
2334
2335 for (HRegionInfo region : tableInfo.getRegionsFromMeta()) {
2336 if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
2337 && (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
2338 hri.getEndKey()) >= 0)
2339 && Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
2340 if(region.isSplit() || region.isOffline()) continue;
2341 Path regionDir = hbi.getHdfsRegionDir();
2342 FileSystem fs = regionDir.getFileSystem(getConf());
2343 List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
2344 for (Path familyDir : familyDirs) {
2345 List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
2346 for (Path referenceFilePath : referenceFilePaths) {
2347 Path parentRegionDir =
2348 StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
2349 if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
2350 LOG.warn(hri + " start and stop keys are in the range of " + region
2351 + ". The region might not be cleaned up from hdfs when region " + region
2352 + " split failed. Hence deleting from hdfs.");
2353 HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs,
2354 regionDir.getParent(), hri);
2355 return;
2356 }
2357 }
2358 }
2359 }
2360 }
2361
2362 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
2363 int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
2364 HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
2365 admin.getClusterStatus().getServers(), numReplicas);
2366
2367 tryAssignmentRepair(hbi, "Trying to reassign region...");
2368 }
2369
2370 } else if (!inMeta && inHdfs && isDeployed) {
2371 errors.reportError(ERROR_CODE.NOT_IN_META, "Region " + descriptiveName
2372 + " not in META, but deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2373 debugLsr(hbi.getHdfsRegionDir());
2374 if (hbi.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
2375
2376
2377
2378
2379 if (shouldFixAssignments()) {
2380 undeployRegionsForHbi(hbi);
2381 }
2382 }
2383 if (shouldFixMeta() && hbi.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2384 if (!hbi.isHdfsRegioninfoPresent()) {
2385 LOG.error("This should have been repaired in table integrity repair phase");
2386 return;
2387 }
2388
2389 LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
2390 int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
2391 HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
2392 admin.getClusterStatus().getServers(), numReplicas);
2393 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2394 }
2395
2396
2397 } else if (inMeta && inHdfs && !isDeployed && splitParent) {
2398
2399
2400 if (hbi.metaEntry.splitA != null && hbi.metaEntry.splitB != null) {
2401
2402 HbckInfo infoA = this.regionInfoMap.get(hbi.metaEntry.splitA.getEncodedName());
2403 HbckInfo infoB = this.regionInfoMap.get(hbi.metaEntry.splitB.getEncodedName());
2404 if (infoA != null && infoB != null) {
2405
2406 hbi.setSkipChecks(true);
2407 return;
2408 }
2409 }
2410 errors.reportError(ERROR_CODE.LINGERING_SPLIT_PARENT, "Region "
2411 + descriptiveName + " is a split parent in META, in HDFS, "
2412 + "and not deployed on any region server. This could be transient.");
2413 if (shouldFixSplitParents()) {
2414 setShouldRerun();
2415 resetSplitParent(hbi);
2416 }
2417 } else if (inMeta && !inHdfs && !isDeployed) {
2418 errors.reportError(ERROR_CODE.NOT_IN_HDFS_OR_DEPLOYED, "Region "
2419 + descriptiveName + " found in META, but not in HDFS "
2420 + "or deployed on any region server.");
2421 if (shouldFixMeta()) {
2422 deleteMetaRegion(hbi);
2423 }
2424 } else if (inMeta && !inHdfs && isDeployed) {
2425 errors.reportError(ERROR_CODE.NOT_IN_HDFS, "Region " + descriptiveName
2426 + " found in META, but not in HDFS, " +
2427 "and deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2428
2429
2430
2431 if (shouldFixAssignments()) {
2432 errors.print("Trying to fix unassigned region...");
2433 undeployRegions(hbi);
2434 }
2435 if (shouldFixMeta()) {
2436
2437 deleteMetaRegion(hbi);
2438 }
2439 } else if (inMeta && inHdfs && !isDeployed && shouldBeDeployed) {
2440 errors.reportError(ERROR_CODE.NOT_DEPLOYED, "Region " + descriptiveName
2441 + " not deployed on any region server.");
2442 tryAssignmentRepair(hbi, "Trying to fix unassigned region...");
2443 } else if (inMeta && inHdfs && isDeployed && !shouldBeDeployed) {
2444 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
2445 "Region " + descriptiveName + " should not be deployed according " +
2446 "to META, but is deployed on " + Joiner.on(", ").join(hbi.deployedOn));
2447 if (shouldFixAssignments()) {
2448 errors.print("Trying to close the region " + descriptiveName);
2449 setShouldRerun();
2450 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2451 }
2452 } else if (inMeta && inHdfs && isMultiplyDeployed) {
2453 errors.reportError(ERROR_CODE.MULTI_DEPLOYED, "Region " + descriptiveName
2454 + " is listed in hbase:meta on region server " + hbi.metaEntry.regionServer
2455 + " but is multiply assigned to region servers " +
2456 Joiner.on(", ").join(hbi.deployedOn));
2457
2458 if (shouldFixAssignments()) {
2459 errors.print("Trying to fix assignment error...");
2460 setShouldRerun();
2461 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2462 }
2463 } else if (inMeta && inHdfs && isDeployed && !deploymentMatchesMeta) {
2464 errors.reportError(ERROR_CODE.SERVER_DOES_NOT_MATCH_META, "Region "
2465 + descriptiveName + " listed in hbase:meta on region server " +
2466 hbi.metaEntry.regionServer + " but found on region server " +
2467 hbi.deployedOn.get(0));
2468
2469 if (shouldFixAssignments()) {
2470 errors.print("Trying to fix assignment error...");
2471 setShouldRerun();
2472 HBaseFsckRepair.fixMultiAssignment(connection, hbi.metaEntry, hbi.deployedOn);
2473 HBaseFsckRepair.waitUntilAssigned(admin, hbi.getHdfsHRI());
2474 }
2475 } else {
2476 errors.reportError(ERROR_CODE.UNKNOWN, "Region " + descriptiveName +
2477 " is in an unforeseen state:" +
2478 " inMeta=" + inMeta +
2479 " inHdfs=" + inHdfs +
2480 " isDeployed=" + isDeployed +
2481 " isMultiplyDeployed=" + isMultiplyDeployed +
2482 " deploymentMatchesMeta=" + deploymentMatchesMeta +
2483 " shouldBeDeployed=" + shouldBeDeployed);
2484 }
2485 }
2486
2487
2488
2489
2490
2491
2492
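/**
 * Checks tables integrity.  Goes over all regions and scans the tables,
 * collecting the pieces for each table and checking whether there are
 * missing, repeated or overlapping regions.
 */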
2493 SortedMap<TableName, TableInfo> checkIntegrity() throws IOException {
2494 tablesInfo = new TreeMap<TableName,TableInfo> ();
2495 LOG.debug("There are " + regionInfoMap.size() + " region info entries");
2496 for (HbckInfo hbi : regionInfoMap.values()) {
2497
2498 if (hbi.metaEntry == null) {
2499
2500 Path p = hbi.getHdfsRegionDir();
2501 if (p == null) {
2502 errors.report("No regioninfo in Meta or HDFS. " + hbi);
2503 }
2504
2505
2506 continue;
2507 }
2508 if (hbi.metaEntry.regionServer == null) {
2509 errors.detail("Skipping region because no region server: " + hbi);
2510 continue;
2511 }
2512 if (hbi.metaEntry.isOffline()) {
2513 errors.detail("Skipping region because it is offline: " + hbi);
2514 continue;
2515 }
2516 if (hbi.containsOnlyHdfsEdits()) {
2517 errors.detail("Skipping region because it only contains edits: " + hbi);
2518 continue;
2519 }
2520
2521
2522
2523
2524
2525
2526 if (hbi.deployedOn.size() == 0) continue;
2527
2528
2529 TableName tableName = hbi.metaEntry.getTable();
2530 TableInfo modTInfo = tablesInfo.get(tableName);
2531 if (modTInfo == null) {
2532 modTInfo = new TableInfo(tableName);
2533 }
2534 for (ServerName server : hbi.deployedOn) {
2535 modTInfo.addServer(server);
2536 }
2537
2538 if (!hbi.isSkipChecks()) {
2539 modTInfo.addRegionInfo(hbi);
2540 }
2541
2542 tablesInfo.put(tableName, modTInfo);
2543 }
2544
2545 loadTableInfosForTablesWithNoRegion();
2546
2547 logParallelMerge();
2548 for (TableInfo tInfo : tablesInfo.values()) {
2549 TableIntegrityErrorHandler handler = tInfo.new IntegrityFixSuggester(tInfo, errors);
2550 if (!tInfo.checkRegionChain(handler)) {
2551 errors.report("Found inconsistency in table " + tInfo.getName());
2552 }
2553 }
2554 return tablesInfo;
2555 }
2556
2557
2558
2559
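/**
 * Loads table infos for tables that have no regions reported in hbase:meta
 * but do have a table directory in HDFS.
 */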
2560 private void loadTableInfosForTablesWithNoRegion() throws IOException {
2561 Map<String, HTableDescriptor> allTables = new FSTableDescriptors(getConf()).getAll();
2562 for (HTableDescriptor htd : allTables.values()) {
2563 if (checkMetaOnly && !htd.isMetaTable()) {
2564 continue;
2565 }
2566
2567 TableName tableName = htd.getTableName();
2568 if (isTableIncluded(tableName) && !tablesInfo.containsKey(tableName)) {
2569 TableInfo tableInfo = new TableInfo(tableName);
2570 tableInfo.htds.add(htd);
2571 tablesInfo.put(htd.getTableName(), tableInfo);
2572 }
2573 }
2574 }
2575
2576
2577
2578
2579
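/**
 * Merge HDFS data by moving it from the contained HbckInfo into targetRegionDir.
 * @return number of file moves performed while merging the regions
 */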
2580 public int mergeRegionDirs(Path targetRegionDir, HbckInfo contained) throws IOException {
2581 int fileMoves = 0;
2582 String thread = Thread.currentThread().getName();
2583 LOG.debug("[" + thread + "] Contained region dir after close and pause");
2584 debugLsr(contained.getHdfsRegionDir());
2585
2586
2587 FileSystem fs = targetRegionDir.getFileSystem(getConf());
2588 FileStatus[] dirs = null;
2589 try {
2590 dirs = fs.listStatus(contained.getHdfsRegionDir());
2591 } catch (FileNotFoundException fnfe) {
2592
2593
2594 if (!fs.exists(contained.getHdfsRegionDir())) {
2595 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2596 + " is missing. Assuming already sidelined or moved.");
2597 } else {
2598 sidelineRegionDir(fs, contained);
2599 }
2600 return fileMoves;
2601 }
2602
2603 if (dirs == null) {
2604 if (!fs.exists(contained.getHdfsRegionDir())) {
2605 LOG.warn("[" + thread + "] HDFS region dir " + contained.getHdfsRegionDir()
2606 + " already sidelined.");
2607 } else {
2608 sidelineRegionDir(fs, contained);
2609 }
2610 return fileMoves;
2611 }
2612
2613 for (FileStatus cf : dirs) {
2614 Path src = cf.getPath();
2615 Path dst = new Path(targetRegionDir, src.getName());
2616
2617 if (src.getName().equals(HRegionFileSystem.REGION_INFO_FILE)) {
2618
2619 continue;
2620 }
2621
2622 if (src.getName().equals(HConstants.HREGION_OLDLOGDIR_NAME)) {
2623
2624 continue;
2625 }
2626
2627 LOG.info("[" + thread + "] Moving files from " + src + " into containing region " + dst);
2628
2629
2630
2631
2632 for (FileStatus hfile : fs.listStatus(src)) {
2633 boolean success = fs.rename(hfile.getPath(), dst);
2634 if (success) {
2635 fileMoves++;
2636 }
2637 }
2638 LOG.debug("[" + thread + "] Sideline directory contents:");
2639 debugLsr(targetRegionDir);
2640 }
2641
2642
2643 sidelineRegionDir(fs, contained);
2644 LOG.info("[" + thread + "] Sidelined region dir "+ contained.getHdfsRegionDir() + " into " +
2645 getSidelineDir());
2646 debugLsr(contained.getHdfsRegionDir());
2647
2648 return fileMoves;
2649 }
2650
2651
2652 static class WorkItemOverlapMerge implements Callable<Void> {
2653 private TableIntegrityErrorHandler handler;
2654 Collection<HbckInfo> overlapgroup;
2655
2656 WorkItemOverlapMerge(Collection<HbckInfo> overlapgroup, TableIntegrityErrorHandler handler) {
2657 this.handler = handler;
2658 this.overlapgroup = overlapgroup;
2659 }
2660
2661 @Override
2662 public Void call() throws Exception {
2663 handler.handleOverlapGroup(overlapgroup);
2664 return null;
2665 }
2666 };
2667
2668
2669
2670
2671
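/**
 * Maintains information about a particular table: its regions, the servers it
 * is deployed on, and any integrity problems found.
 */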
2672 public class TableInfo {
2673 TableName tableName;
2674 TreeSet <ServerName> deployedOn;
2675
2676
2677 final List<HbckInfo> backwards = new ArrayList<HbckInfo>();
2678
2679
2680 final Map<Path, HbckInfo> sidelinedRegions = new HashMap<Path, HbckInfo>();
2681
2682
2683 final RegionSplitCalculator<HbckInfo> sc = new RegionSplitCalculator<HbckInfo>(cmp);
2684
2685
2686 final Set<HTableDescriptor> htds = new HashSet<HTableDescriptor>();
2687
2688
2689 final Multimap<byte[], HbckInfo> overlapGroups =
2690 TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
2691
2692
2693 private ImmutableList<HRegionInfo> regionsFromMeta = null;
2694
2695 TableInfo(TableName name) {
2696 this.tableName = name;
2697 deployedOn = new TreeSet <ServerName>();
2698 }
2699
2700
2701
2702
2703 private HTableDescriptor getHTD() {
2704 if (htds.size() == 1) {
2705 return htds.iterator().next();
2706 } else {
2707 LOG.error("None/Multiple table descriptors found for table '"
2708 + tableName + "' regions: " + htds);
2709 }
2710 return null;
2711 }
2712
2713 public void addRegionInfo(HbckInfo hir) {
2714 if (Bytes.equals(hir.getEndKey(), HConstants.EMPTY_END_ROW)) {
2715
2716
2717 if (hir.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) sc.add(hir);
2718 return;
2719 }
2720
2721
2722 if (Bytes.compareTo(hir.getStartKey(), hir.getEndKey()) > 0) {
2723 errors.reportError(
2724 ERROR_CODE.REGION_CYCLE,
2725 String.format("The endkey for this region comes before the "
2726 + "startkey, startkey=%s, endkey=%s",
2727 Bytes.toStringBinary(hir.getStartKey()),
2728 Bytes.toStringBinary(hir.getEndKey())), this, hir);
2729 backwards.add(hir);
2730 return;
2731 }
2732
2733
2734
2735 if (hir.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) sc.add(hir);
2736 }
2737
2738 public void addServer(ServerName server) {
2739 this.deployedOn.add(server);
2740 }
2741
2742 public TableName getName() {
2743 return tableName;
2744 }
2745
2746 public int getNumRegions() {
2747 return sc.getStarts().size() + backwards.size();
2748 }
2749
2750 public synchronized ImmutableList<HRegionInfo> getRegionsFromMeta() {
2751
2752 if (regionsFromMeta == null) {
2753 List<HRegionInfo> regions = new ArrayList<HRegionInfo>();
2754 for (HbckInfo h : HBaseFsck.this.regionInfoMap.values()) {
2755 if (tableName.equals(h.getTableName())) {
2756 if (h.metaEntry != null) {
2757 regions.add((HRegionInfo) h.metaEntry);
2758 }
2759 }
2760 }
2761 regionsFromMeta = Ordering.natural().immutableSortedCopy(regions);
2762 }
2763
2764 return regionsFromMeta;
2765 }
2766
2767
2768 private class IntegrityFixSuggester extends TableIntegrityErrorHandlerImpl {
2769 ErrorReporter errors;
2770
2771 IntegrityFixSuggester(TableInfo ti, ErrorReporter errors) {
2772 this.errors = errors;
2773 setTableInfo(ti);
2774 }
2775
2776 @Override
2777 public void handleRegionStartKeyNotEmpty(HbckInfo hi) throws IOException{
2778 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2779 "First region should start with an empty key. You need to "
2780 + "create a new region and regioninfo in HDFS to plug the hole.",
2781 getTableInfo(), hi);
2782 }
2783
2784 @Override
2785 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2786 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2787 "Last region should end with an empty key. You need to "
2788 + "create a new region and regioninfo in HDFS to plug the hole.", getTableInfo());
2789 }
2790
2791 @Override
2792 public void handleDegenerateRegion(HbckInfo hi) throws IOException{
2793 errors.reportError(ERROR_CODE.DEGENERATE_REGION,
2794 "Region has the same start and end key.", getTableInfo(), hi);
2795 }
2796
2797 @Override
2798 public void handleDuplicateStartKeys(HbckInfo r1, HbckInfo r2) throws IOException{
2799 byte[] key = r1.getStartKey();
2800
2801 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2802 "Multiple regions have the same startkey: "
2803 + Bytes.toStringBinary(key), getTableInfo(), r1);
2804 errors.reportError(ERROR_CODE.DUPE_STARTKEYS,
2805 "Multiple regions have the same startkey: "
2806 + Bytes.toStringBinary(key), getTableInfo(), r2);
2807 }
2808
2809 @Override
2810 public void handleOverlapInRegionChain(HbckInfo hi1, HbckInfo hi2) throws IOException{
2811 errors.reportError(ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
2812 "There is an overlap in the region chain.",
2813 getTableInfo(), hi1, hi2);
2814 }
2815
2816 @Override
2817 public void handleHoleInRegionChain(byte[] holeStart, byte[] holeStop) throws IOException{
2818 errors.reportError(
2819 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2820 "There is a hole in the region chain between "
2821 + Bytes.toStringBinary(holeStart) + " and "
2822 + Bytes.toStringBinary(holeStop)
2823 + ". You need to create a new .regioninfo and region "
2824 + "dir in hdfs to plug the hole.");
2825 }
2826 };
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
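/**
 * This handler fixes integrity errors from the HDFS information.  There are
 * basically three classes of integrity problems: 1) holes, 2) overlaps and
 * 3) invalid regions.  It overrides the suggester's hole handling to create
 * new empty regions and delegates overlaps to the overlap-group merge repair.
 */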
2840 private class HDFSIntegrityFixer extends IntegrityFixSuggester {
2841 Configuration conf;
2842
2843 boolean fixOverlaps = true;
2844
2845 HDFSIntegrityFixer(TableInfo ti, ErrorReporter errors, Configuration conf,
2846 boolean fixHoles, boolean fixOverlaps) {
2847 super(ti, errors);
2848 this.conf = conf;
2849 this.fixOverlaps = fixOverlaps;
2850
2851 }
2852
2853
2854
2855
2856
2857
2858 @Override
2859 public void handleRegionStartKeyNotEmpty(HbckInfo next) throws IOException {
2860 errors.reportError(ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY,
2861 "First region should start with an empty key. Creating a new " +
2862 "region and regioninfo in HDFS to plug the hole.",
2863 getTableInfo(), next);
2864 HTableDescriptor htd = getTableInfo().getHTD();
2865
2866 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(),
2867 HConstants.EMPTY_START_ROW, next.getStartKey());
2868
2869
2870 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2871 LOG.info("Table region start key was not empty. Created new empty region: "
2872 + newRegion + " " +region);
2873 fixes++;
2874 }
2875
2876 @Override
2877 public void handleRegionEndKeyNotEmpty(byte[] curEndKey) throws IOException {
2878 errors.reportError(ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY,
2879 "Last region should end with an empty key. Creating a new "
2880 + "region and regioninfo in HDFS to plug the hole.", getTableInfo());
2881 HTableDescriptor htd = getTableInfo().getHTD();
2882
2883 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), curEndKey,
2884 HConstants.EMPTY_START_ROW);
2885
2886 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2887 LOG.info("Table region end key was not empty. Created new empty region: " + newRegion
2888 + " " + region);
2889 fixes++;
2890 }
2891
2892
2893
2894
2895
2896 @Override
2897 public void handleHoleInRegionChain(byte[] holeStartKey, byte[] holeStopKey) throws IOException {
2898 errors.reportError(
2899 ERROR_CODE.HOLE_IN_REGION_CHAIN,
2900 "There is a hole in the region chain between "
2901 + Bytes.toStringBinary(holeStartKey) + " and "
2902 + Bytes.toStringBinary(holeStopKey)
2903 + ". Creating a new regioninfo and region "
2904 + "dir in hdfs to plug the hole.");
2905 HTableDescriptor htd = getTableInfo().getHTD();
2906 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), holeStartKey, holeStopKey);
2907 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2908 LOG.info("Plugged hole by creating new empty region: " + newRegion + " " + region);
2909 fixes++;
2910 }
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923 @Override
2924 public void handleOverlapGroup(Collection<HbckInfo> overlap)
2925 throws IOException {
2926 Preconditions.checkNotNull(overlap);
2927 Preconditions.checkArgument(overlap.size() > 0);
2928
2929 if (!this.fixOverlaps) {
2930 LOG.warn("Not attempting to repair overlaps.");
2931 return;
2932 }
2933
2934 if (overlap.size() > maxMerge) {
2935 LOG.warn("Overlap group has " + overlap.size() + " overlapping " +
2936 "regions which is greater than " + maxMerge + ", the max number of regions to merge");
2937 if (sidelineBigOverlaps) {
2938
2939 sidelineBigOverlaps(overlap);
2940 }
2941 return;
2942 }
2943
2944 mergeOverlaps(overlap);
2945 }
2946
2947 void mergeOverlaps(Collection<HbckInfo> overlap)
2948 throws IOException {
2949 String thread = Thread.currentThread().getName();
2950 LOG.info("== [" + thread + "] Merging regions into one region: "
2951 + Joiner.on(",").join(overlap));
2952
2953 Pair<byte[], byte[]> range = null;
2954 for (HbckInfo hi : overlap) {
2955 if (range == null) {
2956 range = new Pair<byte[], byte[]>(hi.getStartKey(), hi.getEndKey());
2957 } else {
2958 if (RegionSplitCalculator.BYTES_COMPARATOR
2959 .compare(hi.getStartKey(), range.getFirst()) < 0) {
2960 range.setFirst(hi.getStartKey());
2961 }
2962 if (RegionSplitCalculator.BYTES_COMPARATOR
2963 .compare(hi.getEndKey(), range.getSecond()) > 0) {
2964 range.setSecond(hi.getEndKey());
2965 }
2966 }
2967
2968 LOG.debug("[" + thread + "] Closing region before moving data around: " + hi);
2969 LOG.debug("[" + thread + "] Contained region dir before close");
2970 debugLsr(hi.getHdfsRegionDir());
2971 try {
2972 LOG.info("[" + thread + "] Closing region: " + hi);
2973 closeRegion(hi);
2974 } catch (IOException ioe) {
2975 LOG.warn("[" + thread + "] Was unable to close region " + hi
2976 + ". Just continuing... ", ioe);
2977 } catch (InterruptedException e) {
2978 LOG.warn("[" + thread + "] Was unable to close region " + hi
2979 + ". Just continuing... ", e);
2980 }
2981
2982 try {
2983 LOG.info("[" + thread + "] Offlining region: " + hi);
2984 offline(hi.getRegionName());
2985 } catch (IOException ioe) {
2986 LOG.warn("[" + thread + "] Unable to offline region from master: " + hi
2987 + ". Just continuing... ", ioe);
2988 }
2989 }
2990
2991
2992 HTableDescriptor htd = getTableInfo().getHTD();
2993
2994 HRegionInfo newRegion = new HRegionInfo(htd.getTableName(), range.getFirst(),
2995 range.getSecond());
2996 HRegion region = HBaseFsckRepair.createHDFSRegionDir(conf, newRegion, htd);
2997 LOG.info("[" + thread + "] Created new empty container region: " +
2998 newRegion + " to contain regions: " + Joiner.on(",").join(overlap));
2999 debugLsr(region.getRegionFileSystem().getRegionDir());
3000
3001
3002 boolean didFix = false;
3003 Path target = region.getRegionFileSystem().getRegionDir();
3004 for (HbckInfo contained : overlap) {
3005 LOG.info("[" + thread + "] Merging " + contained + " into " + target );
3006 int merges = mergeRegionDirs(target, contained);
3007 if (merges > 0) {
3008 didFix = true;
3009 }
3010 }
3011 if (didFix) {
3012 fixes++;
3013 }
3014 }
3015
3016
3017
3018
3019
3020
3021
3022
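/**
 * Sideline some regions in a big overlap group so that it will have fewer
 * regions to merge.
 */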
3023 void sidelineBigOverlaps(
3024 Collection<HbckInfo> bigOverlap) throws IOException {
3025 int overlapsToSideline = bigOverlap.size() - maxMerge;
3026 if (overlapsToSideline > maxOverlapsToSideline) {
3027 overlapsToSideline = maxOverlapsToSideline;
3028 }
3029 List<HbckInfo> regionsToSideline =
3030 RegionSplitCalculator.findBigRanges(bigOverlap, overlapsToSideline);
3031 FileSystem fs = FileSystem.get(conf);
3032 for (HbckInfo regionToSideline: regionsToSideline) {
3033 try {
3034 LOG.info("Closing region: " + regionToSideline);
3035 closeRegion(regionToSideline);
3036 } catch (IOException ioe) {
3037 LOG.warn("Was unable to close region " + regionToSideline
3038 + ". Just continuing... ", ioe);
3039 } catch (InterruptedException e) {
3040 LOG.warn("Was unable to close region " + regionToSideline
3041 + ". Just continuing... ", e);
3042 }
3043
3044 try {
3045 LOG.info("Offlining region: " + regionToSideline);
3046 offline(regionToSideline.getRegionName());
3047 } catch (IOException ioe) {
3048 LOG.warn("Unable to offline region from master: " + regionToSideline
3049 + ". Just continuing... ", ioe);
3050 }
3051
3052 LOG.info("Before sidelining big overlapped region: " + regionToSideline.toString());
3053 Path sidelineRegionDir = sidelineRegionDir(fs, TO_BE_LOADED, regionToSideline);
3054 if (sidelineRegionDir != null) {
3055 sidelinedRegions.put(sidelineRegionDir, regionToSideline);
3056 LOG.info("After sidelining big overlapped region: "
3057 + regionToSideline.getRegionNameAsString()
3058 + " to " + sidelineRegionDir.toString());
3059 fixes++;
3060 }
3061 }
3062 }
3063 }
3064
3065
3066
3067
3068
3069
3070
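/**
 * Check the region chain (from hbase:meta) of this table, looking for holes,
 * overlaps and cycles.
 * @return false if there are errors
 */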
3071 public boolean checkRegionChain(TableIntegrityErrorHandler handler) throws IOException {
3072
3073
3074
3075 if (disabledTables.contains(this.tableName)) {
3076 return true;
3077 }
3078 int originalErrorsCount = errors.getErrorList().size();
3079 Multimap<byte[], HbckInfo> regions = sc.calcCoverage();
3080 SortedSet<byte[]> splits = sc.getSplits();
3081
3082 byte[] prevKey = null;
3083 byte[] problemKey = null;
3084
3085 if (splits.size() == 0) {
3086
3087 handler.handleHoleInRegionChain(HConstants.EMPTY_START_ROW, HConstants.EMPTY_END_ROW);
3088 }
3089
3090 for (byte[] key : splits) {
3091 Collection<HbckInfo> ranges = regions.get(key);
3092 if (prevKey == null && !Bytes.equals(key, HConstants.EMPTY_BYTE_ARRAY)) {
3093 for (HbckInfo rng : ranges) {
3094 handler.handleRegionStartKeyNotEmpty(rng);
3095 }
3096 }
3097
3098
3099 for (HbckInfo rng : ranges) {
3100
3101 byte[] endKey = rng.getEndKey();
3102 endKey = (endKey.length == 0) ? null : endKey;
3103 if (Bytes.equals(rng.getStartKey(), endKey)) {
3104 handler.handleDegenerateRegion(rng);
3105 }
3106 }
3107
3108 if (ranges.size() == 1) {
3109
3110 if (problemKey != null) {
3111 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
3112 }
3113 problemKey = null;
3114 } else if (ranges.size() > 1) {
3115
3116
3117 if (problemKey == null) {
3118
3119 LOG.warn("Naming new problem group: " + Bytes.toStringBinary(key));
3120 problemKey = key;
3121 }
3122 overlapGroups.putAll(problemKey, ranges);
3123
3124
3125 ArrayList<HbckInfo> subRange = new ArrayList<HbckInfo>(ranges);
3126
3127 for (HbckInfo r1 : ranges) {
3128 if (r1.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) continue;
3129 subRange.remove(r1);
3130 for (HbckInfo r2 : subRange) {
3131 if (r2.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) continue;
3132 if (Bytes.compareTo(r1.getStartKey(), r2.getStartKey()) == 0) {
3133 handler.handleDuplicateStartKeys(r1, r2);
3134 } else {
3135
3136 handler.handleOverlapInRegionChain(r1, r2);
3137 }
3138 }
3139 }
3140
3141 } else if (ranges.size() == 0) {
3142 if (problemKey != null) {
3143 LOG.warn("reached end of problem group: " + Bytes.toStringBinary(key));
3144 }
3145 problemKey = null;
3146
3147 byte[] holeStopKey = sc.getSplits().higher(key);
3148
3149 if (holeStopKey != null) {
3150
3151 handler.handleHoleInRegionChain(key, holeStopKey);
3152 }
3153 }
3154 prevKey = key;
3155 }
3156
3157
3158
3159 if (prevKey != null) {
3160 handler.handleRegionEndKeyNotEmpty(prevKey);
3161 }
3162
3163
3164 if (getConf().getBoolean("hbasefsck.overlap.merge.parallel", true)) {
3165 boolean ok = handleOverlapsParallel(handler, prevKey);
3166 if (!ok) {
3167 return false;
3168 }
3169 } else {
3170 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
3171 handler.handleOverlapGroup(overlap);
3172 }
3173 }
3174
3175 if (details) {
3176
3177 errors.print("---- Table '" + this.tableName
3178 + "': region split map");
3179 dump(splits, regions);
3180 errors.print("---- Table '" + this.tableName
3181 + "': overlap groups");
3182 dumpOverlapProblems(overlapGroups);
3183 errors.print("There are " + overlapGroups.keySet().size()
3184 + " overlap groups with " + overlapGroups.size()
3185 + " overlapping regions");
3186 }
3187 if (!sidelinedRegions.isEmpty()) {
3188 LOG.warn("Sidelined big overlapped regions, please bulk load them!");
3189 errors.print("---- Table '" + this.tableName
3190 + "': sidelined big overlapped regions");
3191 dumpSidelinedRegions(sidelinedRegions);
3192 }
3193 return errors.getErrorList().size() == originalErrorsCount;
3194 }
3195
3196 private boolean handleOverlapsParallel(TableIntegrityErrorHandler handler, byte[] prevKey)
3197 throws IOException {
3198
3199
3200 List<WorkItemOverlapMerge> merges = new ArrayList<WorkItemOverlapMerge>(overlapGroups.size());
3201 List<Future<Void>> rets;
3202 for (Collection<HbckInfo> overlap : overlapGroups.asMap().values()) {
3203
3204 merges.add(new WorkItemOverlapMerge(overlap, handler));
3205 }
3206 try {
3207 rets = executor.invokeAll(merges);
3208 } catch (InterruptedException e) {
3209 LOG.error("Overlap merges were interrupted", e);
3210 return false;
3211 }
3212 for (int i = 0; i < merges.size(); i++) {
3213 WorkItemOverlapMerge work = merges.get(i);
3214 Future<Void> f = rets.get(i);
3215 try {
3216 f.get();
3217 } catch (ExecutionException e) {
3218 LOG.warn("Failed to merge overlap group " + work, e.getCause());
3219 } catch (InterruptedException e) {
3220 LOG.error("Waiting for overlap merges was interrupted", e);
3221 return false;
3222 }
3223 }
3224 return true;
3225 }
3226
3227
3228
3229
3230
3231
3232
3233 void dump(SortedSet<byte[]> splits, Multimap<byte[], HbckInfo> regions) {
3234
3235 StringBuilder sb = new StringBuilder();
3236 for (byte[] k : splits) {
3237 sb.setLength(0);
3238 sb.append(Bytes.toStringBinary(k) + ":\t");
3239 for (HbckInfo r : regions.get(k)) {
3240 sb.append("[ "+ r.toString() + ", "
3241 + Bytes.toStringBinary(r.getEndKey())+ "]\t");
3242 }
3243 errors.print(sb.toString());
3244 }
3245 }
3246 }
3247
3248 public void dumpOverlapProblems(Multimap<byte[], HbckInfo> regions) {
3249
3250
3251 for (byte[] k : regions.keySet()) {
3252 errors.print(Bytes.toStringBinary(k) + ":");
3253 for (HbckInfo r : regions.get(k)) {
3254 errors.print("[ " + r.toString() + ", "
3255 + Bytes.toStringBinary(r.getEndKey()) + "]");
3256 }
3257 errors.print("----");
3258 }
3259 }
3260
3261 public void dumpSidelinedRegions(Map<Path, HbckInfo> regions) {
3262 for (Map.Entry<Path, HbckInfo> entry: regions.entrySet()) {
3263 TableName tableName = entry.getValue().getTableName();
3264 Path path = entry.getKey();
3265 errors.print("This sidelined region dir should be bulk loaded: "
3266 + path.toString());
3267 errors.print("Bulk load command looks like: "
3268 + "hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles "
3269 + path.toUri().getPath() + " "+ tableName);
3270 }
3271 }
3272
3273 public Multimap<byte[], HbckInfo> getOverlapGroups(
3274 TableName table) {
3275 TableInfo ti = tablesInfo.get(table);
3276 return ti.overlapGroups;
3277 }
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
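/**
 * Return descriptors of user-space tables whose first region has not been
 * modified within the configured timelag; more recently modified tables are
 * counted in numSkipped.
 */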
3288 HTableDescriptor[] getTables(AtomicInteger numSkipped) {
3289 List<TableName> tableNames = new ArrayList<TableName>();
3290 long now = EnvironmentEdgeManager.currentTime();
3291
3292 for (HbckInfo hbi : regionInfoMap.values()) {
3293 MetaEntry info = hbi.metaEntry;
3294
3295
3296
3297 if (info != null && info.getStartKey().length == 0 && !info.isMetaRegion()) {
3298 if (info.modTime + timelag < now) {
3299 tableNames.add(info.getTable());
3300 } else {
3301 numSkipped.incrementAndGet();
3302 }
3303 }
3304 }
3305 return getHTableDescriptors(tableNames);
3306 }
3307
3308 HTableDescriptor[] getHTableDescriptors(List<TableName> tableNames) {
3309 HTableDescriptor[] htd = new HTableDescriptor[0];
3310 Admin admin = null;
3311 try {
3312 LOG.info("getHTableDescriptors: tableNames=" + tableNames);
3313 admin = new HBaseAdmin(getConf());
3314 htd = admin.getTableDescriptorsByTableName(tableNames);
3315 } catch (IOException e) {
3316 LOG.debug("Exception getting table descriptors", e);
3317 } finally {
3318 if (admin != null) {
3319 try {
3320 admin.close();
3321 } catch (IOException e) {
3322 LOG.debug("Exception closing HBaseAdmin", e);
3323 }
3324 }
3325 }
3326 return htd;
3327 }
3328
3329
3330
3331
3332
3333
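/**
 * Gets the entry in regionInfoMap corresponding to the given encoded region
 * name.  If the region has not been seen yet, a new entry is added and returned.
 */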
3334 private synchronized HbckInfo getOrCreateInfo(String name) {
3335 HbckInfo hbi = regionInfoMap.get(name);
3336 if (hbi == null) {
3337 hbi = new HbckInfo(null);
3338 regionInfoMap.put(name, hbi);
3339 }
3340 return hbi;
3341 }
3342
3343 private void checkAndFixTableLocks() throws IOException {
3344 TableLockChecker checker = new TableLockChecker(zkw, errors);
3345 checker.checkTableLocks();
3346
3347 if (this.fixTableLocks) {
3348 checker.fixExpiredTableLocks();
3349 }
3350 }
3351
3352
3353
3354
3355
3356
3357
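/**
 * Check whether an orphaned ENABLING table ZNode exists (a table present in
 * ZooKeeper but not in hbase:meta) and fix it if requested.
 */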
3358 private void checkAndFixOrphanedTableZNodes()
3359 throws IOException, KeeperException, InterruptedException {
3360 Set<TableName> enablingTables = ZKTableStateClientSideReader.getEnablingTables(zkw);
3361 String msg;
3362 TableInfo tableInfo;
3363
3364 for (TableName tableName : enablingTables) {
3365
3366 tableInfo = tablesInfo.get(tableName);
3367 if (tableInfo != null) {
3368
3369 continue;
3370 }
3371
3372 msg = "Table " + tableName + " not found in hbase:meta. Orphaned table ZNode found.";
3373 LOG.warn(msg);
3374 orphanedTableZNodes.add(tableName);
3375 errors.reportError(ERROR_CODE.ORPHANED_ZK_TABLE_ENTRY, msg);
3376 }
3377
3378 if (orphanedTableZNodes.size() > 0 && this.fixTableZNodes) {
3379 ZKTableStateManager zkTableStateMgr = new ZKTableStateManager(zkw);
3380
3381 for (TableName tableName : orphanedTableZNodes) {
3382 try {
3383
3384
3385
3386
3387 zkTableStateMgr.setTableState(tableName, ZooKeeperProtos.Table.State.DISABLED);
3388 } catch (CoordinatedStateException e) {
3389
3390 LOG.error(
3391 "Got a CoordinatedStateException while fixing the ENABLING table znode " + tableName,
3392 e);
3393 }
3394 }
3395 }
3396 }
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
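/**
 * Check the deployment of the hbase:meta region replicas: each configured
 * replica should be assigned to exactly one region server.  Missing, multiply
 * assigned or excess replicas are repaired when fixes are enabled.
 * @return true if no problem was found
 */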
3407 boolean checkMetaRegion() throws IOException, KeeperException, InterruptedException {
3408 Map<Integer, HbckInfo> metaRegions = new HashMap<Integer, HbckInfo>();
3409 for (HbckInfo value : regionInfoMap.values()) {
3410 if (value.metaEntry != null && value.metaEntry.isMetaRegion()) {
3411 metaRegions.put(value.getReplicaId(), value);
3412 }
3413 }
3414 int metaReplication = admin.getTableDescriptor(TableName.META_TABLE_NAME)
3415 .getRegionReplication();
3416 boolean noProblem = true;
3417
3418
3419 for (int i = 0; i < metaReplication; i++) {
3420 HbckInfo metaHbckInfo = metaRegions.remove(i);
3421 List<ServerName> servers = new ArrayList<ServerName>();
3422 if (metaHbckInfo != null) {
3423 servers = metaHbckInfo.deployedOn;
3424 }
3425 if (servers.size() != 1) {
3426 noProblem = false;
3427 if (servers.size() == 0) {
3428 assignMetaReplica(i);
3429 } else if (servers.size() > 1) {
3430 errors
3431 .reportError(ERROR_CODE.MULTI_META_REGION, "hbase:meta, replicaId " +
3432 metaHbckInfo.getReplicaId() + " is found on more than one region server.");
3433 if (shouldFixAssignments()) {
3434 errors.print("Trying to fix a problem with hbase:meta, replicaId " +
3435 metaHbckInfo.getReplicaId() +"..");
3436 setShouldRerun();
3437
3438 HBaseFsckRepair.fixMultiAssignment(connection, metaHbckInfo.metaEntry, servers);
3439 }
3440 }
3441 }
3442 }
3443
3444 for (Map.Entry<Integer, HbckInfo> entry : metaRegions.entrySet()) {
3445 noProblem = false;
3446 errors.reportError(ERROR_CODE.SHOULD_NOT_BE_DEPLOYED,
3447 "hbase:meta replicas are deployed in excess. Configured " + metaReplication +
3448 ", deployed " + metaRegions.size());
3449 if (shouldFixAssignments()) {
3450 errors.print("Trying to undeploy excess replica, replicaId: " + entry.getKey() +
3451 " of hbase:meta..");
3452 setShouldRerun();
3453 unassignMetaReplica(entry.getValue());
3454 }
3455 }
3456
3457
3458 return noProblem;
3459 }
3460
3461 private void unassignMetaReplica(HbckInfo hi) throws IOException, InterruptedException,
3462 KeeperException {
3463 undeployRegions(hi);
3464 ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(hi.metaEntry.getReplicaId()));
3465 }
3466
3467 private void assignMetaReplica(int replicaId)
3468 throws IOException, KeeperException, InterruptedException {
3469 errors.reportError(ERROR_CODE.NO_META_REGION, "hbase:meta, replicaId " +
3470 replicaId +" is not found on any region.");
3471 if (shouldFixAssignments()) {
3472 errors.print("Trying to fix a problem with hbase:meta..");
3473 setShouldRerun();
3474
3475 HRegionInfo h = RegionReplicaUtil.getRegionInfoForReplica(
3476 HRegionInfo.FIRST_META_REGIONINFO, replicaId);
3477 HBaseFsckRepair.fixUnassigned(admin, h);
3478 HBaseFsckRepair.waitUntilAssigned(admin, h);
3479 }
3480 }
3481
3482
3483
3484
3485
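/**
 * Scan hbase:meta, adding all regions found to the regionInfo map.
 */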
3486 boolean loadMetaEntries() throws IOException {
3487 MetaScannerVisitor visitor = new MetaScannerVisitorBase() {
3488 int countRecord = 1;
3489
3490
3491 final Comparator<Cell> comp = new Comparator<Cell>() {
3492 @Override
3493 public int compare(Cell k1, Cell k2) {
3494 return Long.compare(k1.getTimestamp(), k2.getTimestamp());
3495 }
3496 };
3497
3498 @Override
3499 public boolean processRow(Result result) throws IOException {
3500 try {
3501
3502
3503 long ts = Collections.max(result.listCells(), comp).getTimestamp();
3504 RegionLocations rl = MetaTableAccessor.getRegionLocations(result);
3505 if (rl == null) {
3506 emptyRegionInfoQualifiers.add(result);
3507 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
3508 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
3509 return true;
3510 }
3511 ServerName sn = null;
3512 if (rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID) == null ||
3513 rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID).getRegionInfo() == null) {
3514 emptyRegionInfoQualifiers.add(result);
3515 errors.reportError(ERROR_CODE.EMPTY_META_CELL,
3516 "Empty REGIONINFO_QUALIFIER found in hbase:meta");
3517 return true;
3518 }
3519 HRegionInfo hri = rl.getRegionLocation(HRegionInfo.DEFAULT_REPLICA_ID).getRegionInfo();
3520 if (!(isTableIncluded(hri.getTable())
3521 || hri.isMetaRegion())) {
3522 return true;
3523 }
3524 PairOfSameType<HRegionInfo> daughters = HRegionInfo.getDaughterRegions(result);
3525 for (HRegionLocation h : rl.getRegionLocations()) {
3526 if (h == null || h.getRegionInfo() == null) {
3527 continue;
3528 }
3529 sn = h.getServerName();
3530 hri = h.getRegionInfo();
3531
3532 MetaEntry m = null;
3533 if (hri.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
3534 m = new MetaEntry(hri, sn, ts, daughters.getFirst(), daughters.getSecond());
3535 } else {
3536 m = new MetaEntry(hri, sn, ts, null, null);
3537 }
3538 HbckInfo previous = regionInfoMap.get(hri.getEncodedName());
3539 if (previous == null) {
3540 regionInfoMap.put(hri.getEncodedName(), new HbckInfo(m));
3541 } else if (previous.metaEntry == null) {
3542 previous.metaEntry = m;
3543 } else {
3544 throw new IOException("Two entries in hbase:meta are the same: " + previous);
3545 }
3546 }
3547 PairOfSameType<HRegionInfo> mergeRegions = HRegionInfo.getMergeRegions(result);
3548 for (HRegionInfo mergeRegion : new HRegionInfo[] {
3549 mergeRegions.getFirst(), mergeRegions.getSecond() }) {
3550 if (mergeRegion != null) {
3551
3552 HbckInfo hbInfo = getOrCreateInfo(mergeRegion.getEncodedName());
3553 hbInfo.setMerged(true);
3554 }
3555 }
3556
3557
3558 if (countRecord % 100 == 0) {
3559 errors.progress();
3560 }
3561 countRecord++;
3562 return true;
3563 } catch (RuntimeException e) {
3564 LOG.error("Result=" + result);
3565 throw e;
3566 }
3567 }
3568 };
3569 if (!checkMetaOnly) {
3570
3571 MetaScanner.metaScan(connection, visitor);
3572 }
3573
3574 errors.print("");
3575 return true;
3576 }
3577
3578
3579
3580
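/**
 * Stores the regioninfo entries scanned from hbase:meta.
 */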
3581 static class MetaEntry extends HRegionInfo {
3582 ServerName regionServer;
3583 long modTime;
3584 HRegionInfo splitA, splitB;
3585
3586 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime) {
3587 this(rinfo, regionServer, modTime, null, null);
3588 }
3589
3590 public MetaEntry(HRegionInfo rinfo, ServerName regionServer, long modTime,
3591 HRegionInfo splitA, HRegionInfo splitB) {
3592 super(rinfo);
3593 this.regionServer = regionServer;
3594 this.modTime = modTime;
3595 this.splitA = splitA;
3596 this.splitB = splitB;
3597 }
3598
3599 @Override
3600 public boolean equals(Object o) {
3601 boolean superEq = super.equals(o);
3602 if (!superEq) {
3603 return superEq;
3604 }
3605
3606 MetaEntry me = (MetaEntry) o;
3607 if (!regionServer.equals(me.regionServer)) {
3608 return false;
3609 }
3610 return (modTime == me.modTime);
3611 }
3612
3613 @Override
3614 public int hashCode() {
3615 int hash = Arrays.hashCode(getRegionName());
3616 hash ^= getRegionId();
3617 hash ^= Arrays.hashCode(getStartKey());
3618 hash ^= Arrays.hashCode(getEndKey());
3619 hash ^= Boolean.valueOf(isOffline()).hashCode();
3620 hash ^= getTable().hashCode();
3621 if (regionServer != null) {
3622 hash ^= regionServer.hashCode();
3623 }
3624 hash ^= modTime;
3625 return hash;
3626 }
3627 }
3628
3629
3630
3631
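/**
 * Stores the regioninfo entries found in HDFS.
 */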
3632 static class HdfsEntry {
3633 HRegionInfo hri;
3634 Path hdfsRegionDir = null;
3635 long hdfsRegionDirModTime = 0;
3636 boolean hdfsRegioninfoFilePresent = false;
3637 boolean hdfsOnlyEdits = false;
3638 }
3639
3640
3641
3642
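/**
 * Stores the regioninfo retrieved from online region servers.
 */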
3643 static class OnlineEntry {
3644 HRegionInfo hri;
3645 ServerName hsa;
3646
3647 @Override
3648 public String toString() {
3649 return hsa.toString() + ";" + hri.getRegionNameAsString();
3650 }
3651 }
3652
3653
3654
3655
3656
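/**
 * Maintains information about a particular region, gathered from hbase:meta,
 * HDFS and the deployed state on region servers.
 */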
3657 public static class HbckInfo implements KeyRange {
3658 private MetaEntry metaEntry = null;
3659 private HdfsEntry hdfsEntry = null;
3660 private List<OnlineEntry> deployedEntries = Lists.newArrayList();
3661 private List<ServerName> deployedOn = Lists.newArrayList();
3662 private boolean skipChecks = false;
3663 private boolean isMerged = false;
3664 private int deployedReplicaId = HRegionInfo.DEFAULT_REPLICA_ID;
3665 private HRegionInfo primaryHRIForDeployedReplica = null;
3666
3667 HbckInfo(MetaEntry metaEntry) {
3668 this.metaEntry = metaEntry;
3669 }
3670
3671 public int getReplicaId() {
3672 if (metaEntry != null) return metaEntry.getReplicaId();
3673 return deployedReplicaId;
3674 }
3675
3676 public synchronized void addServer(HRegionInfo hri, ServerName server) {
3677 OnlineEntry rse = new OnlineEntry();
3678 rse.hri = hri;
3679 rse.hsa = server;
3680 this.deployedEntries.add(rse);
3681 this.deployedOn.add(server);
3682
3683 this.deployedReplicaId = hri.getReplicaId();
3684 this.primaryHRIForDeployedReplica =
3685 RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
3686 }
3687
3688 @Override
3689 public synchronized String toString() {
3690 StringBuilder sb = new StringBuilder();
3691 sb.append("{ meta => ");
3692 sb.append((metaEntry != null)? metaEntry.getRegionNameAsString() : "null");
3693 sb.append( ", hdfs => " + getHdfsRegionDir());
3694 sb.append( ", deployed => " + Joiner.on(", ").join(deployedEntries));
3695 sb.append( ", replicaId => " + getReplicaId());
3696 sb.append(" }");
3697 return sb.toString();
3698 }
3699
3700 @Override
3701 public byte[] getStartKey() {
3702 if (this.metaEntry != null) {
3703 return this.metaEntry.getStartKey();
3704 } else if (this.hdfsEntry != null) {
3705 return this.hdfsEntry.hri.getStartKey();
3706 } else {
3707 LOG.error("Entry " + this + " has no meta or hdfs region start key.");
3708 return null;
3709 }
3710 }
3711
3712 @Override
3713 public byte[] getEndKey() {
3714 if (this.metaEntry != null) {
3715 return this.metaEntry.getEndKey();
3716 } else if (this.hdfsEntry != null) {
3717 return this.hdfsEntry.hri.getEndKey();
3718 } else {
3719 LOG.error("Entry " + this + " has no meta or hdfs region end key.");
3720 return null;
3721 }
3722 }
3723
3724 public TableName getTableName() {
3725 if (this.metaEntry != null) {
3726 return this.metaEntry.getTable();
3727 } else if (this.hdfsEntry != null) {
3728
3729
3730 Path tableDir = this.hdfsEntry.hdfsRegionDir.getParent();
3731 return FSUtils.getTableName(tableDir);
3732 } else {
3733
3734 for (OnlineEntry e : deployedEntries) {
3735 return e.hri.getTable();
3736 }
3737 return null;
3738 }
3739 }
3740
3741 public String getRegionNameAsString() {
3742 if (metaEntry != null) {
3743 return metaEntry.getRegionNameAsString();
3744 } else if (hdfsEntry != null) {
3745 if (hdfsEntry.hri != null) {
3746 return hdfsEntry.hri.getRegionNameAsString();
3747 }
3748 } else {
3749
3750 for (OnlineEntry e : deployedEntries) {
3751 return e.hri.getRegionNameAsString();
3752 }
3753 }
3754 return null;
3755 }
3756
3757 public byte[] getRegionName() {
3758 if (metaEntry != null) {
3759 return metaEntry.getRegionName();
3760 } else if (hdfsEntry != null) {
3761 return hdfsEntry.hri.getRegionName();
3762 } else {
        // return the info from the first online/deployed hri
3764 for (OnlineEntry e : deployedEntries) {
3765 return e.hri.getRegionName();
3766 }
3767 return null;
3768 }
3769 }
3770
3771 public HRegionInfo getPrimaryHRIForDeployedReplica() {
3772 return primaryHRIForDeployedReplica;
3773 }
3774
3775 Path getHdfsRegionDir() {
3776 if (hdfsEntry == null) {
3777 return null;
3778 }
3779 return hdfsEntry.hdfsRegionDir;
3780 }
3781
3782 boolean containsOnlyHdfsEdits() {
3783 if (hdfsEntry == null) {
3784 return false;
3785 }
3786 return hdfsEntry.hdfsOnlyEdits;
3787 }
3788
3789 boolean isHdfsRegioninfoPresent() {
3790 if (hdfsEntry == null) {
3791 return false;
3792 }
3793 return hdfsEntry.hdfsRegioninfoFilePresent;
3794 }
3795
3796 long getModTime() {
3797 if (hdfsEntry == null) {
3798 return 0;
3799 }
3800 return hdfsEntry.hdfsRegionDirModTime;
3801 }
3802
3803 HRegionInfo getHdfsHRI() {
3804 if (hdfsEntry == null) {
3805 return null;
3806 }
3807 return hdfsEntry.hri;
3808 }
3809
3810 public void setSkipChecks(boolean skipChecks) {
3811 this.skipChecks = skipChecks;
3812 }
3813
3814 public boolean isSkipChecks() {
3815 return skipChecks;
3816 }
3817
3818 public void setMerged(boolean isMerged) {
3819 this.isMerged = isMerged;
3820 }
3821
3822 public boolean isMerged() {
3823 return this.isMerged;
3824 }
3825 }
3826
3827 final static Comparator<HbckInfo> cmp = new Comparator<HbckInfo>() {
3828 @Override
3829 public int compare(HbckInfo l, HbckInfo r) {
3830 if (l == r) {
        // same instance
3832 return 0;
3833 }
3834
3835 int tableCompare = l.getTableName().compareTo(r.getTableName());
3836 if (tableCompare != 0) {
3837 return tableCompare;
3838 }
3839
3840 int startComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3841 l.getStartKey(), r.getStartKey());
3842 if (startComparison != 0) {
3843 return startComparison;
3844 }
3845
      // an empty end key means "last region", so normalize it to null for the comparison
3847 byte[] endKey = r.getEndKey();
3848 endKey = (endKey.length == 0) ? null : endKey;
3849 byte[] endKey2 = l.getEndKey();
3850 endKey2 = (endKey2.length == 0) ? null : endKey2;
3851 int endComparison = RegionSplitCalculator.BYTES_COMPARATOR.compare(
3852 endKey2, endKey);
3853
3854 if (endComparison != 0) {
3855 return endComparison;
3856 }
3857
      // Use the regionId from HDFS as the final tiebreaker;
      // entries without an hdfsEntry sort after those that have one.
3860 if (l.hdfsEntry == null && r.hdfsEntry == null) {
3861 return 0;
3862 }
3863 if (l.hdfsEntry == null && r.hdfsEntry != null) {
3864 return 1;
3865 }
3866
3867 if (r.hdfsEntry == null) {
3868 return -1;
3869 }
3870
      // compare region ids as longs to avoid int overflow
      return Long.compare(l.hdfsEntry.hri.getRegionId(), r.hdfsEntry.hri.getRegionId());
3872 }
3873 };
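  // Hedged usage sketch (not from this file): the comparator above orders HbckInfo entries by
  // table name, then start key, then end key, with the HDFS regionId as the final tiebreaker.
  //   List<HbckInfo> infos = new ArrayList<HbckInfo>(someRegionInfoMap.values()); // hypothetical map
  //   Collections.sort(infos, cmp);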
3874
  /**
   * Prints summary of all tables found on the system.
   */
3878 private void printTableSummary(SortedMap<TableName, TableInfo> tablesInfo) {
3879 StringBuilder sb = new StringBuilder();
3880 int numOfSkippedRegions;
3881 errors.print("Summary:");
3882 for (TableInfo tInfo : tablesInfo.values()) {
3883 numOfSkippedRegions = (skippedRegions.containsKey(tInfo.getName())) ?
3884 skippedRegions.get(tInfo.getName()).size() : 0;
3885
3886 if (errors.tableHasErrors(tInfo)) {
3887 errors.print("Table " + tInfo.getName() + " is inconsistent.");
      } else if (numOfSkippedRegions > 0) {
        errors.print("Table " + tInfo.getName() + " is okay (with "
            + numOfSkippedRegions + " skipped regions).");
      } else {
3893 errors.print("Table " + tInfo.getName() + " is okay.");
3894 }
3895 errors.print(" Number of regions: " + tInfo.getNumRegions());
3896 if (numOfSkippedRegions > 0) {
3897 Set<String> skippedRegionStrings = skippedRegions.get(tInfo.getName());
3898 System.out.println(" Number of skipped regions: " + numOfSkippedRegions);
3899 System.out.println(" List of skipped regions:");
3900 for(String sr : skippedRegionStrings) {
3901 System.out.println(" " + sr);
3902 }
3903 }
3904 sb.setLength(0);
3905 sb.append(" Deployed on: ");
3906 for (ServerName server : tInfo.deployedOn) {
3907 sb.append(" " + server.toString());
3908 }
3909 errors.print(sb.toString());
3910 }
3911 }
3912
3913 static ErrorReporter getErrorReporter(
3914 final Configuration conf) throws ClassNotFoundException {
    Class<? extends ErrorReporter> reporter =
        conf.getClass("hbasefsck.errorreporter", PrintingErrorReporter.class, ErrorReporter.class);
3916 return ReflectionUtils.newInstance(reporter, conf);
3917 }
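  // A hedged configuration example (not from this file): the reporter implementation is pluggable
  // through the "hbasefsck.errorreporter" key; MyQuietReporter is a hypothetical implementation.
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass("hbasefsck.errorreporter", MyQuietReporter.class, ErrorReporter.class);
  //   ErrorReporter reporter = getErrorReporter(conf);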
3918
3919 public interface ErrorReporter {
3920 enum ERROR_CODE {
3921 UNKNOWN, NO_META_REGION, NULL_META_REGION, NO_VERSION_FILE, NOT_IN_META_HDFS, NOT_IN_META,
3922 NOT_IN_META_OR_DEPLOYED, NOT_IN_HDFS_OR_DEPLOYED, NOT_IN_HDFS, SERVER_DOES_NOT_MATCH_META, NOT_DEPLOYED,
3923 MULTI_DEPLOYED, SHOULD_NOT_BE_DEPLOYED, MULTI_META_REGION, RS_CONNECT_FAILURE,
3924 FIRST_REGION_STARTKEY_NOT_EMPTY, LAST_REGION_ENDKEY_NOT_EMPTY, DUPE_STARTKEYS,
3925 HOLE_IN_REGION_CHAIN, OVERLAP_IN_REGION_CHAIN, REGION_CYCLE, DEGENERATE_REGION,
3926 ORPHAN_HDFS_REGION, LINGERING_SPLIT_PARENT, NO_TABLEINFO_FILE, LINGERING_REFERENCE_HFILE,
3927 LINGERING_HFILELINK, WRONG_USAGE, EMPTY_META_CELL, EXPIRED_TABLE_LOCK,
3928 ORPHANED_ZK_TABLE_ENTRY, BOUNDARIES_ERROR
3929 }
3930 void clear();
3931 void report(String message);
3932 void reportError(String message);
3933 void reportError(ERROR_CODE errorCode, String message);
3934 void reportError(ERROR_CODE errorCode, String message, TableInfo table);
3935 void reportError(ERROR_CODE errorCode, String message, TableInfo table, HbckInfo info);
3936 void reportError(
3937 ERROR_CODE errorCode,
3938 String message,
3939 TableInfo table,
3940 HbckInfo info1,
3941 HbckInfo info2
3942 );
3943 int summarize();
3944 void detail(String details);
3945 ArrayList<ERROR_CODE> getErrorList();
3946 void progress();
3947 void print(String message);
3948 void resetErrors();
3949 boolean tableHasErrors(TableInfo table);
3950 }
3951
3952 static class PrintingErrorReporter implements ErrorReporter {
3953 public int errorCount = 0;
3954 private int showProgress;
3955
3956 private static final int progressThreshold = 100;
3957
3958 Set<TableInfo> errorTables = new HashSet<TableInfo>();
3959
    // the accumulated error codes; exposed through getErrorList() (used e.g. by tests)
3961 private ArrayList<ERROR_CODE> errorList = new ArrayList<ERROR_CODE>();
3962
3963 @Override
3964 public void clear() {
3965 errorTables.clear();
3966 errorList.clear();
3967 errorCount = 0;
3968 }
3969
3970 @Override
3971 public synchronized void reportError(ERROR_CODE errorCode, String message) {
3972 if (errorCode == ERROR_CODE.WRONG_USAGE) {
3973 System.err.println(message);
3974 return;
3975 }
3976
3977 errorList.add(errorCode);
3978 if (!summary) {
3979 System.out.println("ERROR: " + message);
3980 }
3981 errorCount++;
3982 showProgress = 0;
3983 }
3984
3985 @Override
3986 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
3987 errorTables.add(table);
3988 reportError(errorCode, message);
3989 }
3990
3991 @Override
3992 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
3993 HbckInfo info) {
3994 errorTables.add(table);
3995 String reference = "(region " + info.getRegionNameAsString() + ")";
3996 reportError(errorCode, reference + " " + message);
3997 }
3998
3999 @Override
4000 public synchronized void reportError(ERROR_CODE errorCode, String message, TableInfo table,
4001 HbckInfo info1, HbckInfo info2) {
4002 errorTables.add(table);
4003 String reference = "(regions " + info1.getRegionNameAsString()
4004 + " and " + info2.getRegionNameAsString() + ")";
4005 reportError(errorCode, reference + " " + message);
4006 }
4007
4008 @Override
4009 public synchronized void reportError(String message) {
4010 reportError(ERROR_CODE.UNKNOWN, message);
4011 }
4012
    /**
     * Report error information, but do not increment the error count.  Intended for cases
     * where the actual error would have been reported previously.
     * @param message
     */
4018 @Override
4019 public synchronized void report(String message) {
4020 if (! summary) {
4021 System.out.println("ERROR: " + message);
4022 }
4023 showProgress = 0;
4024 }
4025
4026 @Override
4027 public synchronized int summarize() {
4028 System.out.println(Integer.toString(errorCount) +
4029 " inconsistencies detected.");
4030 if (errorCount == 0) {
4031 System.out.println("Status: OK");
4032 return 0;
4033 } else {
4034 System.out.println("Status: INCONSISTENT");
4035 return -1;
4036 }
4037 }
4038
4039 @Override
4040 public ArrayList<ERROR_CODE> getErrorList() {
4041 return errorList;
4042 }
4043
4044 @Override
4045 public synchronized void print(String message) {
4046 if (!summary) {
4047 System.out.println(message);
4048 }
4049 }
4050
4051 @Override
4052 public boolean tableHasErrors(TableInfo table) {
4053 return errorTables.contains(table);
4054 }
4055
4056 @Override
4057 public void resetErrors() {
4058 errorCount = 0;
4059 }
4060
4061 @Override
4062 public synchronized void detail(String message) {
4063 if (details) {
4064 System.out.println(message);
4065 }
4066 showProgress = 0;
4067 }
4068
4069 @Override
4070 public synchronized void progress() {
4071 if (showProgress++ == progressThreshold) {
4072 if (!summary) {
4073 System.out.print(".");
4074 }
4075 showProgress = 0;
4076 }
4077 }
4078 }
4079
  /**
   * Contact a region server and get all information from it
   */
4083 static class WorkItemRegion implements Callable<Void> {
4084 private HBaseFsck hbck;
4085 private ServerName rsinfo;
4086 private ErrorReporter errors;
4087 private HConnection connection;
4088
4089 WorkItemRegion(HBaseFsck hbck, ServerName info,
4090 ErrorReporter errors, HConnection connection) {
4091 this.hbck = hbck;
4092 this.rsinfo = info;
4093 this.errors = errors;
4094 this.connection = connection;
4095 }
4096
4097 @Override
4098 public synchronized Void call() throws IOException {
4099 errors.progress();
4100 try {
4101 BlockingInterface server = connection.getAdmin(rsinfo);
        // list all online regions from this region server
4104 List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
4105 regions = filterRegions(regions);
4106
4107 if (details) {
4108 errors.detail("RegionServer: " + rsinfo.getServerName() +
4109 " number of regions: " + regions.size());
4110 for (HRegionInfo rinfo: regions) {
4111 errors.detail(" " + rinfo.getRegionNameAsString() +
4112 " id: " + rinfo.getRegionId() +
4113 " encoded_name: " + rinfo.getEncodedName() +
4114 " start: " + Bytes.toStringBinary(rinfo.getStartKey()) +
4115 " end: " + Bytes.toStringBinary(rinfo.getEndKey()));
4116 }
4117 }
4118
        // check to see if the existence of this region matches the region in META
4120 for (HRegionInfo r:regions) {
4121 HbckInfo hbi = hbck.getOrCreateInfo(r.getEncodedName());
4122 hbi.addServer(r, rsinfo);
4123 }
4124 } catch (IOException e) {
4125 errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "RegionServer: " + rsinfo.getServerName() +
4126 " Unable to fetch region information. " + e);
4127 throw e;
4128 }
4129 return null;
4130 }
4131
4132 private List<HRegionInfo> filterRegions(List<HRegionInfo> regions) {
4133 List<HRegionInfo> ret = Lists.newArrayList();
4134 for (HRegionInfo hri : regions) {
4135 if (hri.isMetaTable() || (!hbck.checkMetaOnly
4136 && hbck.isTableIncluded(hri.getTable()))) {
4137 ret.add(hri);
4138 }
4139 }
4140 return ret;
4141 }
4142 }
4143
  /**
   * Contact hdfs and get all information about the specified table directory into
   * regioninfo list.
   */
4148 class WorkItemHdfsDir implements Callable<Void> {
4149 private FileStatus tableDir;
4150 private ErrorReporter errors;
4151 private FileSystem fs;
4152
4153 WorkItemHdfsDir(FileSystem fs, ErrorReporter errors,
4154 FileStatus status) {
4155 this.fs = fs;
4156 this.tableDir = status;
4157 this.errors = errors;
4158 }
4159
4160 @Override
4161 public synchronized Void call() throws InterruptedException, ExecutionException {
4162 final Vector<Exception> exceptions = new Vector<Exception>();
4163 try {
        // A hedged, minimal reconstruction of the per-region scan (the original implementation
        // may differ, e.g. by parallelizing the walk): visit <rootdir>/<table>/* and record an
        // HdfsEntry for every region directory found.
        for (FileStatus regionDir : fs.listStatus(tableDir.getPath())) {
          errors.progress();
          String encodedName = regionDir.getPath().getName();
          // directories whose names are not hexadecimal are not region dirs -- skip them
          if (!encodedName.toLowerCase().matches("[0-9a-f]+")) {
            continue;
          }
          LOG.debug("Loading region info from hdfs: " + regionDir.getPath());
          HbckInfo hbi = getOrCreateInfo(encodedName);
          HdfsEntry he = new HdfsEntry();
          synchronized (hbi) {
            he.hdfsRegionDir = regionDir.getPath();
            he.hdfsRegionDirModTime = regionDir.getModificationTime();
            // ".regioninfo" is the region info file written into every region directory
            he.hdfsRegioninfoFilePresent = fs.exists(new Path(he.hdfsRegionDir, ".regioninfo"));
            // flag regions that contain nothing but recovered edits (e.g. leftovers of a split)
            he.hdfsOnlyEdits = true;
            for (FileStatus subDir : fs.listStatus(regionDir.getPath())) {
              String name = subDir.getPath().getName();
              if (!name.startsWith(".") && !name.equals("recovered.edits")) {
                he.hdfsOnlyEdits = false;
                break;
              }
            }
            hbi.hdfsEntry = he;
          }
        }
      } catch (IOException e) {
        // remember the failure so that it can be surfaced to the thread joining on this item
        errors.reportError(ERROR_CODE.RS_CONNECT_FAILURE, "Table Directory: "
            + tableDir.getPath().getName()
            + " Unable to fetch all HDFS region information. " + e);
        exceptions.add(e);
      }
      if (!exceptions.isEmpty()) {
        // rethrow the first collected failure, matching the declared ExecutionException
        throw new ExecutionException("First exception in WorkItemHdfsDir",
            exceptions.firstElement());
      }
      return null;
    }
  }

  /**
   * Loads the HRegionInfo (.regioninfo file) for a region directory found on HDFS,
   * flagging the directory as an orphan when the file cannot be read.
   */
4266 static class WorkItemHdfsRegionInfo implements Callable<Void> {
4267 private HbckInfo hbi;
4268 private HBaseFsck hbck;
4269 private ErrorReporter errors;
4270
4271 WorkItemHdfsRegionInfo(HbckInfo hbi, HBaseFsck hbck, ErrorReporter errors) {
4272 this.hbi = hbi;
4273 this.hbck = hbck;
4274 this.errors = errors;
4275 }
4276
4277 @Override
4278 public synchronized Void call() throws IOException {
      // only load entries that haven't been loaded yet.
4280 if (hbi.getHdfsHRI() == null) {
4281 try {
4282 errors.progress();
4283 hbck.loadHdfsRegioninfo(hbi);
4284 } catch (IOException ioe) {
4285 String msg = "Orphan region in HDFS: Unable to load .regioninfo from table "
4286 + hbi.getTableName() + " in hdfs dir "
4287 + hbi.getHdfsRegionDir()
4288 + "! It may be an invalid format or version file. Treating as "
4289 + "an orphaned regiondir.";
4290 errors.reportError(ERROR_CODE.ORPHAN_HDFS_REGION, msg);
4291 try {
4292 hbck.debugLsr(hbi.getHdfsRegionDir());
4293 } catch (IOException ioe2) {
4294 LOG.error("Unable to read directory " + hbi.getHdfsRegionDir(), ioe2);
4295 throw ioe2;
4296 }
4297 hbck.orphanHdfsDirs.add(hbi);
4298 throw ioe;
4299 }
4300 }
4301 return null;
4302 }
4303 };
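  // Hedged usage sketch (not from this file): these callables are typically submitted in
  // parallel on the shared executor and joined afterwards, along the lines of:
  //   List<WorkItemHdfsRegionInfo> items = ...;   // one item per HbckInfo missing its HRI
  //   for (Future<Void> f : executor.invokeAll(items)) {
  //     f.get();                                  // surfaces any .regioninfo load failure
  //   }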
4304
  /**
   * Display the full report from fsck. This displays all live and dead region
   * servers, and all known regions.
   */
4309 public static void setDisplayFullReport() {
4310 details = true;
4311 }
4312
  /**
   * Set summary mode.
   * Print only a summary of the tables and status (OK or INCONSISTENT).
   */
4317 void setSummary() {
4318 summary = true;
4319 }
4320
  /**
   * Set hbase:meta check mode.
   * Print only info about hbase:meta table deployment/state.
   */
4325 void setCheckMetaOnly() {
4326 checkMetaOnly = true;
4327 }
4328
  /**
   * Set region boundaries check mode.
   */
4332 void setRegionBoundariesCheck() {
4333 checkRegionBoundaries = true;
4334 }
4335
  /**
   * Set table locks fix mode.
   * Delete table locks held for a long time.
   */
4340 public void setFixTableLocks(boolean shouldFix) {
4341 fixTableLocks = shouldFix;
4342 fixAny |= shouldFix;
4343 }
4344
  /**
   * Set orphaned table ZNodes fix mode.
   * Sets the table state to disabled in the ZooKeeper node if the table does not exist.
   */
4349 public void setFixTableZNodes(boolean shouldFix) {
4350 fixTableZNodes = shouldFix;
4351 fixAny |= shouldFix;
4352 }
4353
  /**
   * Mark that something was fixed, so that the caller knows to rerun fsck
   * afterwards to verify that the fix worked.
   */
4360 void setShouldRerun() {
4361 rerun = true;
4362 }
4363
4364 boolean shouldRerun() {
4365 return rerun;
4366 }
4367
  /**
   * Fix inconsistencies found by fsck. This should try to fix errors (if any)
   * found by the fsck utility.
   */
4372 public void setFixAssignments(boolean shouldFix) {
4373 fixAssignments = shouldFix;
4374 fixAny |= shouldFix;
4375 }
4376
4377 boolean shouldFixAssignments() {
4378 return fixAssignments;
4379 }
4380
4381 public void setFixMeta(boolean shouldFix) {
4382 fixMeta = shouldFix;
4383 fixAny |= shouldFix;
4384 }
4385
4386 boolean shouldFixMeta() {
4387 return fixMeta;
4388 }
4389
4390 public void setFixEmptyMetaCells(boolean shouldFix) {
4391 fixEmptyMetaCells = shouldFix;
4392 fixAny |= shouldFix;
4393 }
4394
4395 boolean shouldFixEmptyMetaCells() {
4396 return fixEmptyMetaCells;
4397 }
4398
4399 public void setCheckHdfs(boolean checking) {
4400 checkHdfs = checking;
4401 }
4402
4403 boolean shouldCheckHdfs() {
4404 return checkHdfs;
4405 }
4406
4407 public void setFixHdfsHoles(boolean shouldFix) {
4408 fixHdfsHoles = shouldFix;
4409 fixAny |= shouldFix;
4410 }
4411
4412 boolean shouldFixHdfsHoles() {
4413 return fixHdfsHoles;
4414 }
4415
4416 public void setFixTableOrphans(boolean shouldFix) {
4417 fixTableOrphans = shouldFix;
4418 fixAny |= shouldFix;
4419 }
4420
4421 boolean shouldFixTableOrphans() {
4422 return fixTableOrphans;
4423 }
4424
4425 public void setFixHdfsOverlaps(boolean shouldFix) {
4426 fixHdfsOverlaps = shouldFix;
4427 fixAny |= shouldFix;
4428 }
4429
4430 boolean shouldFixHdfsOverlaps() {
4431 return fixHdfsOverlaps;
4432 }
4433
4434 public void setFixHdfsOrphans(boolean shouldFix) {
4435 fixHdfsOrphans = shouldFix;
4436 fixAny |= shouldFix;
4437 }
4438
4439 boolean shouldFixHdfsOrphans() {
4440 return fixHdfsOrphans;
4441 }
4442
4443 public void setFixVersionFile(boolean shouldFix) {
4444 fixVersionFile = shouldFix;
4445 fixAny |= shouldFix;
4446 }
4447
4448 public boolean shouldFixVersionFile() {
4449 return fixVersionFile;
4450 }
4451
4452 public void setSidelineBigOverlaps(boolean sbo) {
4453 this.sidelineBigOverlaps = sbo;
4454 }
4455
4456 public boolean shouldSidelineBigOverlaps() {
4457 return sidelineBigOverlaps;
4458 }
4459
4460 public void setFixSplitParents(boolean shouldFix) {
4461 fixSplitParents = shouldFix;
4462 fixAny |= shouldFix;
4463 }
4464
4465 boolean shouldFixSplitParents() {
4466 return fixSplitParents;
4467 }
4468
4469 public void setFixReferenceFiles(boolean shouldFix) {
4470 fixReferenceFiles = shouldFix;
4471 fixAny |= shouldFix;
4472 }
4473
4474 boolean shouldFixReferenceFiles() {
4475 return fixReferenceFiles;
4476 }
4477
4478 public void setFixHFileLinks(boolean shouldFix) {
4479 fixHFileLinks = shouldFix;
4480 fixAny |= shouldFix;
4481 }
4482
4483 boolean shouldFixHFileLinks() {
4484 return fixHFileLinks;
4485 }
4486
4487 public boolean shouldIgnorePreCheckPermission() {
4488 return !fixAny || ignorePreCheckPermission;
4489 }
4490
4491 public void setIgnorePreCheckPermission(boolean ignorePreCheckPermission) {
4492 this.ignorePreCheckPermission = ignorePreCheckPermission;
4493 }
4494
  /**
   * @param mm maximum number of regions to merge into a single region when fixing overlaps.
   */
4498 public void setMaxMerge(int mm) {
4499 this.maxMerge = mm;
4500 }
4501
4502 public int getMaxMerge() {
4503 return maxMerge;
4504 }
4505
4506 public void setMaxOverlapsToSideline(int mo) {
4507 this.maxOverlapsToSideline = mo;
4508 }
4509
4510 public int getMaxOverlapsToSideline() {
4511 return maxOverlapsToSideline;
4512 }
4513
  /**
   * Only check/fix tables specified by the list.
   * An empty list means all tables are included.
   */
4518 boolean isTableIncluded(TableName table) {
4519 return (tablesIncluded.size() == 0) || tablesIncluded.contains(table);
4520 }
4521
4522 public void includeTable(TableName table) {
4523 tablesIncluded.add(table);
4524 }
4525
4526 Set<TableName> getIncludedTables() {
4527 return new HashSet<TableName>(tablesIncluded);
4528 }
4529
  /**
   * We are interested in only those tables that have not changed their state in
   * hbase:meta during the last few seconds specified by the time lag.
   * @param seconds the time lag, in seconds
   */
4535 public void setTimeLag(long seconds) {
4536 timelag = seconds * 1000;
4537 }
4538
  /**
   * @param sidelineDir HDFS path to sideline data to
   */
4543 public void setSidelineDir(String sidelineDir) {
4544 this.sidelineDir = new Path(sidelineDir);
4545 }
4546
4547 protected HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles) throws IOException {
4548 return new HFileCorruptionChecker(getConf(), executor, sidelineCorruptHFiles);
4549 }
4550
4551 public HFileCorruptionChecker getHFilecorruptionChecker() {
4552 return hfcc;
4553 }
4554
4555 public void setHFileCorruptionChecker(HFileCorruptionChecker hfcc) {
4556 this.hfcc = hfcc;
4557 }
4558
4559 public void setRetCode(int code) {
4560 this.retcode = code;
4561 }
4562
4563 public int getRetCode() {
4564 return retcode;
4565 }
4566
4567 protected HBaseFsck printUsageAndExit() {
4568 StringWriter sw = new StringWriter(2048);
4569 PrintWriter out = new PrintWriter(sw);
4570 out.println("Usage: fsck [opts] {only tables}");
4571 out.println(" where [opts] are:");
4572 out.println(" -help Display help options (this)");
4573 out.println(" -details Display full report of all regions.");
4574 out.println(" -timelag <timeInSeconds> Process only regions that " +
4575 " have not experienced any metadata updates in the last " +
4576 " <timeInSeconds> seconds.");
4577 out.println(" -sleepBeforeRerun <timeInSeconds> Sleep this many seconds" +
4578 " before checking if the fix worked if run with -fix");
4579 out.println(" -summary Print only summary of the tables and status.");
4580 out.println(" -metaonly Only check the state of the hbase:meta table.");
4581 out.println(" -sidelineDir <hdfs://> HDFS path to backup existing meta.");
    out.println(" -boundaries Verify that region boundaries are the same between META and store files.");
4583 out.println("");
4584 out.println(" Metadata Repair options: (expert features, use with caution!)");
    out.println(" -fix Try to fix region assignments. This is for backwards compatibility.");
4586 out.println(" -fixAssignments Try to fix region assignments. Replaces the old -fix");
4587 out.println(" -fixMeta Try to fix meta problems. This assumes HDFS region info is good.");
4588 out.println(" -noHdfsChecking Don't load/check region info from HDFS."
4589 + " Assumes hbase:meta region info is good. Won't check/fix any HDFS issue, e.g. hole, orphan, or overlap");
4590 out.println(" -fixHdfsHoles Try to fix region holes in hdfs.");
4591 out.println(" -fixHdfsOrphans Try to fix region dirs with no .regioninfo file in hdfs");
4592 out.println(" -fixTableOrphans Try to fix table dirs with no .tableinfo file in hdfs (online mode only)");
4593 out.println(" -fixHdfsOverlaps Try to fix region overlaps in hdfs.");
4594 out.println(" -fixVersionFile Try to fix missing hbase.version file in hdfs.");
4595 out.println(" -maxMerge <n> When fixing region overlaps, allow at most <n> regions to merge. (n=" + DEFAULT_MAX_MERGE +" by default)");
    out.println(" -sidelineBigOverlaps When fixing region overlaps, allow sidelining big overlaps");
4597 out.println(" -maxOverlapsToSideline <n> When fixing region overlaps, allow at most <n> regions to sideline per group. (n=" + DEFAULT_OVERLAPS_TO_SIDELINE +" by default)");
4598 out.println(" -fixSplitParents Try to force offline split parents to be online.");
4599 out.println(" -ignorePreCheckPermission ignore filesystem permission pre-check");
4600 out.println(" -fixReferenceFiles Try to offline lingering reference store files");
4601 out.println(" -fixHFileLinks Try to offline lingering HFileLinks");
4602 out.println(" -fixEmptyMetaCells Try to fix hbase:meta entries not referencing any region"
4603 + " (empty REGIONINFO_QUALIFIER rows)");
4604
4605 out.println("");
4606 out.println(" Datafile Repair options: (expert features, use with caution!)");
    out.println(" -checkCorruptHFiles Check all HFiles by opening them to make sure they are valid");
    out.println(" -sidelineCorruptHFiles Quarantine corrupted HFiles. Implies -checkCorruptHFiles.");
4609
4610 out.println("");
4611 out.println(" Metadata Repair shortcuts");
4612 out.println(" -repair Shortcut for -fixAssignments -fixMeta -fixHdfsHoles " +
4613 "-fixHdfsOrphans -fixHdfsOverlaps -fixVersionFile -sidelineBigOverlaps " +
4614 "-fixReferenceFiles -fixHFileLinks -fixTableLocks -fixOrphanedTableZnodes");
4615
4616 out.println(" -repairHoles Shortcut for -fixAssignments -fixMeta -fixHdfsHoles");
4617
4618 out.println("");
4619 out.println(" Table lock options");
4620 out.println(" -fixTableLocks Deletes table locks held for a long time (hbase.table.lock.expire.ms, 10min by default)");
4621
4622 out.println("");
4623 out.println(" Table Znode options");
    out.println(" -fixOrphanedTableZnodes Set table state in ZNode to disabled if the table does not exist");
4625
4626 out.flush();
4627 errors.reportError(ERROR_CODE.WRONG_USAGE, sw.toString());
4628
4629 setRetCode(-2);
4630 return this;
4631 }
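  // Hedged examples of typical command lines (the "hbase hbck" launcher is the usual entry point;
  // table names and option combinations below are illustrative only):
  //   hbase hbck                        -- report-only consistency check of all tables
  //   hbase hbck -details TableFoo      -- full region report restricted to table "TableFoo"
  //   hbase hbck -repairHoles           -- conservative repair of holes in the region chain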
4632
4633
  /**
   * Main program
   *
   * @param args
   * @throws Exception
   */
4639 public static void main(String[] args) throws Exception {
    // point the default FileSystem at the HBase root dir before running the tool
4641 Configuration conf = HBaseConfiguration.create();
4642 Path hbasedir = FSUtils.getRootDir(conf);
4643 URI defaultFs = hbasedir.getFileSystem(conf).getUri();
4644 FSUtils.setFsDefault(conf, new Path(defaultFs));
4645 int ret = ToolRunner.run(new HBaseFsckTool(conf), args);
4646 System.exit(ret);
4647 }
4648
  /**
   * This is a Tool wrapper that gathers -Dxxx=yyy configuration settings from the command line.
   */
4652 static class HBaseFsckTool extends Configured implements Tool {
4653 HBaseFsckTool(Configuration conf) { super(conf); }
4654 @Override
4655 public int run(String[] args) throws Exception {
4656 HBaseFsck hbck = new HBaseFsck(getConf());
4657 hbck.exec(hbck.executor, args);
4658 hbck.close();
4659 return hbck.getRetCode();
4660 }
4661 };
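  // Because the tool runs through ToolRunner, generic -D options are parsed from the command
  // line before hbck's own flags. A hedged example (host name is illustrative only):
  //   hbase hbck -Dhbase.zookeeper.quorum=zk1.example.com -details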
4662
4663
4664 public HBaseFsck exec(ExecutorService exec, String[] args) throws KeeperException, IOException,
4665 ServiceException, InterruptedException {
4666 long sleepBeforeRerun = DEFAULT_SLEEP_BEFORE_RERUN;
4667
4668 boolean checkCorruptHFiles = false;
4669 boolean sidelineCorruptHFiles = false;
4670
    // Process command-line args.
4672 for (int i = 0; i < args.length; i++) {
4673 String cmd = args[i];
4674 if (cmd.equals("-help") || cmd.equals("-h")) {
4675 return printUsageAndExit();
4676 } else if (cmd.equals("-details")) {
4677 setDisplayFullReport();
4678 } else if (cmd.equals("-timelag")) {
4679 if (i == args.length - 1) {
4680 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -timelag needs a value.");
4681 return printUsageAndExit();
4682 }
4683 try {
4684 long timelag = Long.parseLong(args[i+1]);
4685 setTimeLag(timelag);
4686 } catch (NumberFormatException e) {
4687 errors.reportError(ERROR_CODE.WRONG_USAGE, "-timelag needs a numeric value.");
4688 return printUsageAndExit();
4689 }
4690 i++;
4691 } else if (cmd.equals("-sleepBeforeRerun")) {
4692 if (i == args.length - 1) {
4693 errors.reportError(ERROR_CODE.WRONG_USAGE,
4694 "HBaseFsck: -sleepBeforeRerun needs a value.");
4695 return printUsageAndExit();
4696 }
4697 try {
4698 sleepBeforeRerun = Long.parseLong(args[i+1]);
4699 } catch (NumberFormatException e) {
4700 errors.reportError(ERROR_CODE.WRONG_USAGE, "-sleepBeforeRerun needs a numeric value.");
4701 return printUsageAndExit();
4702 }
4703 i++;
4704 } else if (cmd.equals("-sidelineDir")) {
4705 if (i == args.length - 1) {
4706 errors.reportError(ERROR_CODE.WRONG_USAGE, "HBaseFsck: -sidelineDir needs a value.");
4707 return printUsageAndExit();
4708 }
4709 i++;
4710 setSidelineDir(args[i]);
4711 } else if (cmd.equals("-fix")) {
4712 errors.reportError(ERROR_CODE.WRONG_USAGE,
4713 "This option is deprecated, please use -fixAssignments instead.");
4714 setFixAssignments(true);
4715 } else if (cmd.equals("-fixAssignments")) {
4716 setFixAssignments(true);
4717 } else if (cmd.equals("-fixMeta")) {
4718 setFixMeta(true);
4719 } else if (cmd.equals("-noHdfsChecking")) {
4720 setCheckHdfs(false);
4721 } else if (cmd.equals("-fixHdfsHoles")) {
4722 setFixHdfsHoles(true);
4723 } else if (cmd.equals("-fixHdfsOrphans")) {
4724 setFixHdfsOrphans(true);
4725 } else if (cmd.equals("-fixTableOrphans")) {
4726 setFixTableOrphans(true);
4727 } else if (cmd.equals("-fixHdfsOverlaps")) {
4728 setFixHdfsOverlaps(true);
4729 } else if (cmd.equals("-fixVersionFile")) {
4730 setFixVersionFile(true);
4731 } else if (cmd.equals("-sidelineBigOverlaps")) {
4732 setSidelineBigOverlaps(true);
4733 } else if (cmd.equals("-fixSplitParents")) {
4734 setFixSplitParents(true);
4735 } else if (cmd.equals("-ignorePreCheckPermission")) {
4736 setIgnorePreCheckPermission(true);
4737 } else if (cmd.equals("-checkCorruptHFiles")) {
4738 checkCorruptHFiles = true;
4739 } else if (cmd.equals("-sidelineCorruptHFiles")) {
4740 sidelineCorruptHFiles = true;
4741 } else if (cmd.equals("-fixReferenceFiles")) {
4742 setFixReferenceFiles(true);
4743 } else if (cmd.equals("-fixHFileLinks")) {
4744 setFixHFileLinks(true);
4745 } else if (cmd.equals("-fixEmptyMetaCells")) {
4746 setFixEmptyMetaCells(true);
4747 } else if (cmd.equals("-repair")) {
        // -repair is a shortcut that enables the full set of repair options, including the
        // riskier HDFS overlap merging; use with caution on a live cluster.
4750 setFixHdfsHoles(true);
4751 setFixHdfsOrphans(true);
4752 setFixMeta(true);
4753 setFixAssignments(true);
4754 setFixHdfsOverlaps(true);
4755 setFixVersionFile(true);
4756 setSidelineBigOverlaps(true);
4757 setFixSplitParents(false);
4758 setCheckHdfs(true);
4759 setFixReferenceFiles(true);
4760 setFixHFileLinks(true);
4761 setFixTableLocks(true);
4762 setFixTableZNodes(true);
4763 } else if (cmd.equals("-repairHoles")) {
        // -repairHoles is the conservative subset: it plugs holes but leaves overlaps,
        // HDFS orphans and big-overlap sidelining alone.
4765 setFixHdfsHoles(true);
4766 setFixHdfsOrphans(false);
4767 setFixMeta(true);
4768 setFixAssignments(true);
4769 setFixHdfsOverlaps(false);
4770 setSidelineBigOverlaps(false);
4771 setFixSplitParents(false);
4772 setCheckHdfs(true);
4773 } else if (cmd.equals("-maxOverlapsToSideline")) {
4774 if (i == args.length - 1) {
4775 errors.reportError(ERROR_CODE.WRONG_USAGE,
4776 "-maxOverlapsToSideline needs a numeric value argument.");
4777 return printUsageAndExit();
4778 }
4779 try {
4780 int maxOverlapsToSideline = Integer.parseInt(args[i+1]);
4781 setMaxOverlapsToSideline(maxOverlapsToSideline);
4782 } catch (NumberFormatException e) {
4783 errors.reportError(ERROR_CODE.WRONG_USAGE,
4784 "-maxOverlapsToSideline needs a numeric value argument.");
4785 return printUsageAndExit();
4786 }
4787 i++;
4788 } else if (cmd.equals("-maxMerge")) {
4789 if (i == args.length - 1) {
4790 errors.reportError(ERROR_CODE.WRONG_USAGE,
4791 "-maxMerge needs a numeric value argument.");
4792 return printUsageAndExit();
4793 }
4794 try {
4795 int maxMerge = Integer.parseInt(args[i+1]);
4796 setMaxMerge(maxMerge);
4797 } catch (NumberFormatException e) {
4798 errors.reportError(ERROR_CODE.WRONG_USAGE,
4799 "-maxMerge needs a numeric value argument.");
4800 return printUsageAndExit();
4801 }
4802 i++;
4803 } else if (cmd.equals("-summary")) {
4804 setSummary();
4805 } else if (cmd.equals("-metaonly")) {
4806 setCheckMetaOnly();
4807 } else if (cmd.equals("-boundaries")) {
4808 setRegionBoundariesCheck();
4809 } else if (cmd.equals("-fixTableLocks")) {
4810 setFixTableLocks(true);
4811 } else if (cmd.equals("-fixOrphanedTableZnodes")) {
4812 setFixTableZNodes(true);
4813 } else if (cmd.startsWith("-")) {
4814 errors.reportError(ERROR_CODE.WRONG_USAGE, "Unrecognized option:" + cmd);
4815 return printUsageAndExit();
4816 } else {
4817 includeTable(TableName.valueOf(cmd));
4818 errors.print("Allow checking/fixes for table: " + cmd);
4819 }
4820 }
4821
4822 errors.print("HBaseFsck command line options: " + StringUtils.join(args, " "));
4823
    // pre-check whether the current user has FS write permission or not
4825 try {
4826 preCheckPermission();
4827 } catch (AccessDeniedException ace) {
4828 Runtime.getRuntime().exit(-1);
4829 } catch (IOException ioe) {
4830 Runtime.getRuntime().exit(-1);
4831 }
4832
    // do the real work of hbck
4834 connect();
4835
4836 try {
      // if corrupt file mode is on, first fix them since they may be opened later
4838 if (checkCorruptHFiles || sidelineCorruptHFiles) {
4839 LOG.info("Checking all hfiles for corruption");
4840 HFileCorruptionChecker hfcc = createHFileCorruptionChecker(sidelineCorruptHFiles);
4841 setHFileCorruptionChecker(hfcc);
4842 Collection<TableName> tables = getIncludedTables();
4843 Collection<Path> tableDirs = new ArrayList<Path>();
4844 Path rootdir = FSUtils.getRootDir(getConf());
4845 if (tables.size() > 0) {
4846 for (TableName t : tables) {
4847 tableDirs.add(FSUtils.getTableDir(rootdir, t));
4848 }
4849 } else {
4850 tableDirs = FSUtils.getTableDirs(FSUtils.getCurrentFileSystem(getConf()), rootdir);
4851 }
4852 hfcc.checkTables(tableDirs);
4853 hfcc.report(errors);
4854 }
4855
      // check and fix table integrity, region consistency.
4857 int code = onlineHbck();
4858 setRetCode(code);
4859
      // If we have tried to fix something, re-run hbck (without the fixers this time)
      // to verify that the repairs actually took effect.
4863 if (shouldRerun()) {
4864 try {
4865 LOG.info("Sleeping " + sleepBeforeRerun + "ms before re-checking after fix...");
4866 Thread.sleep(sleepBeforeRerun);
4867 } catch (InterruptedException ie) {
4868 LOG.warn("Interrupted while sleeping");
4869 return this;
4870 }
        // disable the fix options for the verification pass
4872 setFixAssignments(false);
4873 setFixMeta(false);
4874 setFixHdfsHoles(false);
4875 setFixHdfsOverlaps(false);
4876 setFixVersionFile(false);
4877 setFixTableOrphans(false);
4878 errors.resetErrors();
4879 code = onlineHbck();
4880 setRetCode(code);
4881 }
4882 } finally {
4883 IOUtils.cleanup(null, this);
4884 }
4885 return this;
4886 }
4887
  /**
   * ls -r for debugging purposes
   */
4891 void debugLsr(Path p) throws IOException {
4892 debugLsr(getConf(), p, errors);
4893 }
4894
  /**
   * ls -r for debugging purposes
   */
4898 public static void debugLsr(Configuration conf,
4899 Path p) throws IOException {
4900 debugLsr(conf, p, new PrintingErrorReporter());
4901 }
4902
  /**
   * ls -r for debugging purposes
   */
4906 public static void debugLsr(Configuration conf,
4907 Path p, ErrorReporter errors) throws IOException {
4908 if (!LOG.isDebugEnabled() || p == null) {
4909 return;
4910 }
4911 FileSystem fs = p.getFileSystem(conf);
4912
4913 if (!fs.exists(p)) {
4914
4915 return;
4916 }
4917 errors.print(p.toString());
4918
4919 if (fs.isFile(p)) {
4920 return;
4921 }
4922
4923 if (fs.getFileStatus(p).isDirectory()) {
4924 FileStatus[] fss= fs.listStatus(p);
4925 for (FileStatus status : fss) {
4926 debugLsr(conf, status.getPath(), errors);
4927 }
4928 }
4929 }
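  // Hedged example: with DEBUG logging enabled for this class, a region directory can be listed
  // recursively while investigating an orphan; the path below is illustrative only.
  //   Configuration conf = HBaseConfiguration.create();
  //   HBaseFsck.debugLsr(conf, new Path("/hbase/data/default/t1"));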
4930 }