View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.master.procedure;
20  
21  import java.io.IOException;
22  import java.util.concurrent.CountDownLatch;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.conf.Configuration;
27  import org.apache.hadoop.fs.FileSystem;
28  import org.apache.hadoop.fs.Path;
29  import org.apache.hadoop.hbase.HBaseTestingUtility;
30  import org.apache.hadoop.hbase.HRegionInfo;
31  import org.apache.hadoop.hbase.HTableDescriptor;
32  import org.apache.hadoop.hbase.MiniHBaseCluster;
33  import org.apache.hadoop.hbase.TableName;
34  import org.apache.hadoop.hbase.master.HMaster;
35  import org.apache.hadoop.hbase.procedure2.Procedure;
36  import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
37  import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
38  import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility.TestProcedure;
39  import org.apache.hadoop.hbase.procedure2.store.ProcedureStore;
40  import org.apache.hadoop.hbase.procedure2.store.wal.WALProcedureStore;
41  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.CreateTableState;
42  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.DeleteTableState;
43  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.DisableTableState;
44  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.EnableTableState;
45  import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.TruncateTableState;
46  import org.apache.hadoop.hbase.testclassification.LargeTests;
47  import org.apache.hadoop.hbase.util.Bytes;
48  import org.apache.hadoop.hbase.util.FSUtils;
49  import org.apache.hadoop.hbase.util.ModifyRegionUtils;
50  import org.junit.After;
51  import org.junit.Before;
52  import org.junit.Ignore;
53  import org.junit.Test;
54  import org.junit.experimental.categories.Category;
55  import org.mockito.Mockito;
56  
57  import static org.junit.Assert.assertEquals;
58  import static org.junit.Assert.assertTrue;
59  import static org.junit.Assert.fail;
60  
61  @Category(LargeTests.class)
62  public class TestMasterFailoverWithProcedures {
63    private static final Log LOG = LogFactory.getLog(TestMasterFailoverWithProcedures.class);
64  
65    protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
66  
67    private static void setupConf(Configuration conf) {
68      // don't waste time retrying with the roll, the test is already slow enough.
69      conf.setInt("hbase.procedure.store.wal.max.retries.before.roll", 1);
70      conf.setInt("hbase.procedure.store.wal.wait.before.roll", 0);
71      conf.setInt("hbase.procedure.store.wal.max.roll.retries", 1);
72      conf.setInt("hbase.procedure.store.wal.sync.failure.roll.max", 1);
73    }
74  
75    @Before
76    public void setup() throws Exception {
77      setupConf(UTIL.getConfiguration());
78      UTIL.startMiniCluster(2, 1);
79  
80      final ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
81      ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, false);
82      ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, false);
83    }
84  
85    @After
86    public void tearDown() throws Exception {
87      try {
88        UTIL.shutdownMiniCluster();
89      } catch (Exception e) {
90        LOG.warn("failure shutting down cluster", e);
91      }
92    }
93  
94    @Test(timeout=60000)
95    @Ignore
96    public void testWalRecoverLease() throws Exception {
97      final ProcedureStore masterStore = getMasterProcedureExecutor().getStore();
98      assertTrue("expected WALStore for this test", masterStore instanceof WALProcedureStore);
99  
100     HMaster firstMaster = UTIL.getHBaseCluster().getMaster();
101     // Abort Latch for the master store
102     final CountDownLatch masterStoreAbort = new CountDownLatch(1);
103     masterStore.registerListener(new ProcedureStore.ProcedureStoreListener() {
104       @Override
105       public void postSync() {}
106 
107       @Override
108       public void abortProcess() {
109         LOG.debug("Abort store of Master");
110         masterStoreAbort.countDown();
111       }
112     });
113 
114     // startup a fake master the new WAL store will take the lease
115     // and the active master should abort.
116     HMaster backupMaster3 = Mockito.mock(HMaster.class);
117     Mockito.doReturn(firstMaster.getConfiguration()).when(backupMaster3).getConfiguration();
118     Mockito.doReturn(true).when(backupMaster3).isActiveMaster();
119     final WALProcedureStore backupStore3 = new WALProcedureStore(firstMaster.getConfiguration(),
120         firstMaster.getMasterFileSystem().getFileSystem(),
121         ((WALProcedureStore)masterStore).getWALDir(),
122         new MasterProcedureEnv.WALStoreLeaseRecovery(backupMaster3));
123     // Abort Latch for the test store
124     final CountDownLatch backupStore3Abort = new CountDownLatch(1);
125     backupStore3.registerListener(new ProcedureStore.ProcedureStoreListener() {
126       @Override
127       public void postSync() {}
128 
129       @Override
130       public void abortProcess() {
131         LOG.debug("Abort store of backupMaster3");
132         backupStore3Abort.countDown();
133         backupStore3.stop(true);
134       }
135     });
136     backupStore3.start(1);
137     backupStore3.recoverLease();
138 
139     // Try to trigger a command on the master (WAL lease expired on the active one)
140     HTableDescriptor htd = MasterProcedureTestingUtility.createHTD(TableName.valueOf("mtb"), "f");
141     HRegionInfo[] regions = ModifyRegionUtils.createHRegionInfos(htd, null);
142     LOG.debug("submit proc");
143     try {
144       getMasterProcedureExecutor().submitProcedure(
145         new CreateTableProcedure(getMasterProcedureExecutor().getEnvironment(), htd, regions));
146       fail("expected RuntimeException 'sync aborted'");
147     } catch (RuntimeException e) {
148       LOG.info("got " + e.getMessage());
149     }
150     LOG.debug("wait master store abort");
151     masterStoreAbort.await();
152 
153     // Now the real backup master should start up
154     LOG.debug("wait backup master to startup");
155     waitBackupMaster(UTIL, firstMaster);
156     assertEquals(true, firstMaster.isStopped());
157 
158     // wait the store in here to abort (the test will fail due to timeout if it doesn't)
159     LOG.debug("wait the store to abort");
160     backupStore3.getStoreTracker().setDeleted(1, false);
161     try {
162       backupStore3.delete(1);
163       fail("expected RuntimeException 'sync aborted'");
164     } catch (RuntimeException e) {
165       LOG.info("got " + e.getMessage());
166     }
167     backupStore3Abort.await();
168   }
169 
170   /**
171    * Tests proper fencing in case the current WAL store is fenced
172    */
173   @Test
174   public void testWALfencingWithoutWALRolling() throws IOException {
175     testWALfencing(false);
176   }
177 
178   /**
179    * Tests proper fencing in case the current WAL store does not receive writes until after the
180    * new WAL does a couple of WAL rolls.
181    */
182   @Test
183   public void testWALfencingWithWALRolling() throws IOException {
184     testWALfencing(true);
185   }
186 
187   public void testWALfencing(boolean walRolls) throws IOException {
188     final ProcedureStore procStore = getMasterProcedureExecutor().getStore();
189     assertTrue("expected WALStore for this test", procStore instanceof WALProcedureStore);
190 
191     HMaster firstMaster = UTIL.getHBaseCluster().getMaster();
192 
193     // cause WAL rolling after a delete in WAL:
194     firstMaster.getConfiguration().setLong("hbase.procedure.store.wal.roll.threshold", 1);
195 
196     HMaster backupMaster3 = Mockito.mock(HMaster.class);
197     Mockito.doReturn(firstMaster.getConfiguration()).when(backupMaster3).getConfiguration();
198     Mockito.doReturn(true).when(backupMaster3).isActiveMaster();
199     final WALProcedureStore procStore2 = new WALProcedureStore(firstMaster.getConfiguration(),
200         firstMaster.getMasterFileSystem().getFileSystem(),
201         ((WALProcedureStore)procStore).getWALDir(),
202         new MasterProcedureEnv.WALStoreLeaseRecovery(backupMaster3));
203 
204     // start a second store which should fence the first one out
205     LOG.info("Starting new WALProcedureStore");
206     procStore2.start(1);
207     procStore2.recoverLease();
208 
209     // before writing back to the WAL store, optionally do a couple of WAL rolls (which causes
210     // to delete the old WAL files).
211     if (walRolls) {
212       LOG.info("Inserting into second WALProcedureStore, causing WAL rolls");
213       for (int i = 0; i < 512; i++) {
214         // insert something to the second store then delete it, causing a WAL roll(s)
215         Procedure proc2 = new TestProcedure(i);
216         procStore2.insert(proc2, null);
217         procStore2.delete(proc2.getProcId()); // delete the procedure so that the WAL is removed later
218       }
219     }
220 
221     // Now, insert something to the first store, should fail.
222     // If the store does a WAL roll and continue with another logId without checking higher logIds
223     // it will incorrectly succeed.
224     LOG.info("Inserting into first WALProcedureStore");
225     try {
226       procStore.insert(new TestProcedure(11), null);
227       fail("Inserting into Procedure Store should have failed");
228     } catch (Exception ex) {
229       LOG.info("Received expected exception", ex);
230     }
231   }
232 
233   // ==========================================================================
234   //  Test Create Table
235   // ==========================================================================
236   @Test(timeout=60000)
237   public void testCreateWithFailover() throws Exception {
238     // TODO: Should we try every step? (master failover takes long time)
239     // It is already covered by TestCreateTableProcedure
240     // but without the master restart, only the executor/store is restarted.
241     // Without Master restart we may not find bug in the procedure code
242     // like missing "wait" for resources to be available (e.g. RS)
243     testCreateWithFailoverAtStep(CreateTableState.CREATE_TABLE_ASSIGN_REGIONS.ordinal());
244   }
245 
246   private void testCreateWithFailoverAtStep(final int step) throws Exception {
247     final TableName tableName = TableName.valueOf("testCreateWithFailoverAtStep" + step);
248 
249     // create the table
250     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
251     ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, true);
252     ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, true);
253 
254     // Start the Create procedure && kill the executor
255     byte[][] splitKeys = null;
256     HTableDescriptor htd = MasterProcedureTestingUtility.createHTD(tableName, "f1", "f2");
257     HRegionInfo[] regions = ModifyRegionUtils.createHRegionInfos(htd, splitKeys);
258     long procId = procExec.submitProcedure(
259       new CreateTableProcedure(procExec.getEnvironment(), htd, regions));
260     testRecoveryAndDoubleExecution(UTIL, procId, step, CreateTableState.values());
261 
262     MasterProcedureTestingUtility.validateTableCreation(
263       UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2");
264   }
265 
266   // ==========================================================================
267   //  Test Delete Table
268   // ==========================================================================
269   @Test(timeout=60000)
270   public void testDeleteWithFailover() throws Exception {
271     // TODO: Should we try every step? (master failover takes long time)
272     // It is already covered by TestDeleteTableProcedure
273     // but without the master restart, only the executor/store is restarted.
274     // Without Master restart we may not find bug in the procedure code
275     // like missing "wait" for resources to be available (e.g. RS)
276     testDeleteWithFailoverAtStep(DeleteTableState.DELETE_TABLE_UNASSIGN_REGIONS.ordinal());
277   }
278 
279   private void testDeleteWithFailoverAtStep(final int step) throws Exception {
280     final TableName tableName = TableName.valueOf("testDeleteWithFailoverAtStep" + step);
281 
282     // create the table
283     byte[][] splitKeys = null;
284     HRegionInfo[] regions = MasterProcedureTestingUtility.createTable(
285       getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2");
286     Path tableDir = FSUtils.getTableDir(getRootDir(), tableName);
287     MasterProcedureTestingUtility.validateTableCreation(
288       UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2");
289     UTIL.getHBaseAdmin().disableTable(tableName);
290 
291     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
292     ProcedureTestingUtility.setKillBeforeStoreUpdate(procExec, true);
293     ProcedureTestingUtility.setToggleKillBeforeStoreUpdate(procExec, true);
294 
295     // Start the Delete procedure && kill the executor
296     long procId = procExec.submitProcedure(
297       new DeleteTableProcedure(procExec.getEnvironment(), tableName));
298     testRecoveryAndDoubleExecution(UTIL, procId, step, DeleteTableState.values());
299 
300     MasterProcedureTestingUtility.validateTableDeletion(
301       UTIL.getHBaseCluster().getMaster(), tableName, regions, "f1", "f2");
302   }
303 
304   // ==========================================================================
305   //  Test Truncate Table
306   // ==========================================================================
307   @Test(timeout=90000)
308   public void testTruncateWithFailover() throws Exception {
309     // TODO: Should we try every step? (master failover takes long time)
310     // It is already covered by TestTruncateTableProcedure
311     // but without the master restart, only the executor/store is restarted.
312     // Without Master restart we may not find bug in the procedure code
313     // like missing "wait" for resources to be available (e.g. RS)
314     testTruncateWithFailoverAtStep(true, TruncateTableState.TRUNCATE_TABLE_ADD_TO_META.ordinal());
315   }
316 
317   private void testTruncateWithFailoverAtStep(final boolean preserveSplits, final int step)
318       throws Exception {
319     final TableName tableName = TableName.valueOf("testTruncateWithFailoverAtStep" + step);
320 
321     // create the table
322     final String[] families = new String[] { "f1", "f2" };
323     final byte[][] splitKeys = new byte[][] {
324       Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")
325     };
326     HRegionInfo[] regions = MasterProcedureTestingUtility.createTable(
327       getMasterProcedureExecutor(), tableName, splitKeys, families);
328     // load and verify that there are rows in the table
329     MasterProcedureTestingUtility.loadData(
330       UTIL.getConnection(), tableName, 100, splitKeys, families);
331     assertEquals(100, UTIL.countRows(tableName));
332     // disable the table
333     UTIL.getHBaseAdmin().disableTable(tableName);
334 
335     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
336     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
337 
338     // Start the Truncate procedure && kill the executor
339     long procId = procExec.submitProcedure(
340       new TruncateTableProcedure(procExec.getEnvironment(), tableName, preserveSplits));
341     testRecoveryAndDoubleExecution(UTIL, procId, step, TruncateTableState.values());
342 
343     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, false);
344     UTIL.waitUntilAllRegionsAssigned(tableName);
345 
346     // validate the table regions and layout
347     regions = UTIL.getHBaseAdmin().getTableRegions(tableName).toArray(new HRegionInfo[0]);
348     if (preserveSplits) {
349       assertEquals(1 + splitKeys.length, regions.length);
350     } else {
351       assertEquals(1, regions.length);
352     }
353     MasterProcedureTestingUtility.validateTableCreation(
354       UTIL.getHBaseCluster().getMaster(), tableName, regions, families);
355 
356     // verify that there are no rows in the table
357     assertEquals(0, UTIL.countRows(tableName));
358 
359     // verify that the table is read/writable
360     MasterProcedureTestingUtility.loadData(
361       UTIL.getConnection(), tableName, 50, splitKeys, families);
362     assertEquals(50, UTIL.countRows(tableName));
363   }
364 
365   // ==========================================================================
366   //  Test Disable Table
367   // ==========================================================================
368   @Test(timeout=60000)
369   public void testDisableTableWithFailover() throws Exception {
370     // TODO: Should we try every step? (master failover takes long time)
371     // It is already covered by TestDisableTableProcedure
372     // but without the master restart, only the executor/store is restarted.
373     // Without Master restart we may not find bug in the procedure code
374     // like missing "wait" for resources to be available (e.g. RS)
375     testDisableTableWithFailoverAtStep(
376       DisableTableState.DISABLE_TABLE_MARK_REGIONS_OFFLINE.ordinal());
377   }
378 
379   private void testDisableTableWithFailoverAtStep(final int step) throws Exception {
380     final TableName tableName = TableName.valueOf("testDisableTableWithFailoverAtStep" + step);
381 
382     // create the table
383     final byte[][] splitKeys = new byte[][] {
384       Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")
385     };
386     MasterProcedureTestingUtility.createTable(
387       getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2");
388 
389     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
390     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
391 
392     // Start the Delete procedure && kill the executor
393     long procId = procExec.submitProcedure(
394       new DisableTableProcedure(procExec.getEnvironment(), tableName, false));
395     testRecoveryAndDoubleExecution(UTIL, procId, step, DisableTableState.values());
396 
397     MasterProcedureTestingUtility.validateTableIsDisabled(
398       UTIL.getHBaseCluster().getMaster(), tableName);
399   }
400 
401   // ==========================================================================
402   //  Test Enable Table
403   // ==========================================================================
404   @Test(timeout=60000)
405   public void testEnableTableWithFailover() throws Exception {
406     // TODO: Should we try every step? (master failover takes long time)
407     // It is already covered by TestEnableTableProcedure
408     // but without the master restart, only the executor/store is restarted.
409     // Without Master restart we may not find bug in the procedure code
410     // like missing "wait" for resources to be available (e.g. RS)
411     testEnableTableWithFailoverAtStep(
412       EnableTableState.ENABLE_TABLE_MARK_REGIONS_ONLINE.ordinal());
413   }
414 
415   private void testEnableTableWithFailoverAtStep(final int step) throws Exception {
416     final TableName tableName = TableName.valueOf("testEnableTableWithFailoverAtStep" + step);
417 
418     // create the table
419     final byte[][] splitKeys = new byte[][] {
420       Bytes.toBytes("a"), Bytes.toBytes("b"), Bytes.toBytes("c")
421     };
422     MasterProcedureTestingUtility.createTable(
423       getMasterProcedureExecutor(), tableName, splitKeys, "f1", "f2");
424     UTIL.getHBaseAdmin().disableTable(tableName);
425 
426     ProcedureExecutor<MasterProcedureEnv> procExec = getMasterProcedureExecutor();
427     ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdate(procExec, true);
428 
429     // Start the Delete procedure && kill the executor
430     long procId = procExec.submitProcedure(
431       new EnableTableProcedure(procExec.getEnvironment(), tableName, false));
432     testRecoveryAndDoubleExecution(UTIL, procId, step, EnableTableState.values());
433 
434     MasterProcedureTestingUtility.validateTableIsEnabled(
435       UTIL.getHBaseCluster().getMaster(), tableName);
436   }
437 
438   // ==========================================================================
439   //  Test Helpers
440   // ==========================================================================
441   public static <TState> void testRecoveryAndDoubleExecution(final HBaseTestingUtility testUtil,
442       final long procId, final int lastStepBeforeFailover, TState[] states) throws Exception {
443     ProcedureExecutor<MasterProcedureEnv> procExec =
444       testUtil.getHBaseCluster().getMaster().getMasterProcedureExecutor();
445     ProcedureTestingUtility.waitProcedure(procExec, procId);
446 
447     for (int i = 0; i < lastStepBeforeFailover; ++i) {
448       LOG.info("Restart "+ i +" exec state: " + states[i]);
449       ProcedureTestingUtility.assertProcNotYetCompleted(procExec, procId);
450       ProcedureTestingUtility.restart(procExec);
451       ProcedureTestingUtility.waitProcedure(procExec, procId);
452     }
453     ProcedureTestingUtility.assertProcNotYetCompleted(procExec, procId);
454 
455     LOG.info("Trigger master failover");
456     masterFailover(testUtil);
457 
458     procExec = testUtil.getHBaseCluster().getMaster().getMasterProcedureExecutor();
459     ProcedureTestingUtility.waitProcedure(procExec, procId);
460     ProcedureTestingUtility.assertProcNotFailed(procExec, procId);
461   }
462 
463   // ==========================================================================
464   //  Master failover utils
465   // ==========================================================================
466   public static void masterFailover(final HBaseTestingUtility testUtil)
467       throws Exception {
468     MiniHBaseCluster cluster = testUtil.getMiniHBaseCluster();
469 
470     // Kill the master
471     HMaster oldMaster = cluster.getMaster();
472     cluster.killMaster(cluster.getMaster().getServerName());
473 
474     // Wait the secondary
475     waitBackupMaster(testUtil, oldMaster);
476   }
477 
478   public static void waitBackupMaster(final HBaseTestingUtility testUtil,
479       final HMaster oldMaster) throws Exception {
480     MiniHBaseCluster cluster = testUtil.getMiniHBaseCluster();
481 
482     HMaster newMaster = cluster.getMaster();
483     while (newMaster == null || newMaster == oldMaster) {
484       Thread.sleep(250);
485       newMaster = cluster.getMaster();
486     }
487 
488     while (!(newMaster.isActiveMaster() && newMaster.isInitialized())) {
489       Thread.sleep(250);
490     }
491   }
492 
493   // ==========================================================================
494   //  Helpers
495   // ==========================================================================
496   private MasterProcedureEnv getMasterProcedureEnv() {
497     return getMasterProcedureExecutor().getEnvironment();
498   }
499 
500   private ProcedureExecutor<MasterProcedureEnv> getMasterProcedureExecutor() {
501     return UTIL.getHBaseCluster().getMaster().getMasterProcedureExecutor();
502   }
503 
504   private FileSystem getFileSystem() {
505     return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getFileSystem();
506   }
507 
508   private Path getRootDir() {
509     return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getRootDir();
510   }
511 
512   private Path getTempDir() {
513     return UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getTempDir();
514   }
515 }