View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import static org.junit.Assert.assertTrue;
22  
23  import java.io.IOException;
24  import java.util.ArrayList;
25  import java.util.List;
26  import java.util.concurrent.CountDownLatch;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.fs.FileSystem;
32  import org.apache.hadoop.fs.Path;
33  import org.apache.hadoop.hbase.Cell;
34  import org.apache.hadoop.hbase.CellScanner;
35  import org.apache.hadoop.hbase.ChoreService;
36  import org.apache.hadoop.hbase.CoordinatedStateManager;
37  import org.apache.hadoop.hbase.HBaseTestingUtility;
38  import org.apache.hadoop.hbase.HConstants;
39  import org.apache.hadoop.hbase.HTableDescriptor;
40  import org.apache.hadoop.hbase.Server;
41  import org.apache.hadoop.hbase.ServerName;
42  import org.apache.hadoop.hbase.TableName;
43  import org.apache.hadoop.hbase.client.ClusterConnection;
44  import org.apache.hadoop.hbase.client.Durability;
45  import org.apache.hadoop.hbase.client.Put;
46  import org.apache.hadoop.hbase.regionserver.wal.DamagedWALException;
47  import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
48  import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
49  import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
50  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
51  import org.apache.hadoop.hbase.testclassification.MediumTests;
52  import org.apache.hadoop.hbase.util.Bytes;
53  import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
54  import org.apache.hadoop.hbase.util.Threads;
55  import org.apache.hadoop.hbase.wal.WAL;
56  import org.apache.hadoop.hbase.wal.WALKey;
57  import org.apache.hadoop.hbase.wal.WALProvider.Writer;
58  import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
59  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
60  import org.junit.After;
61  import org.junit.Before;
62  import org.junit.Ignore;
63  import org.junit.Rule;
64  import org.junit.Test;
65  import org.junit.experimental.categories.Category;
66  import org.junit.rules.TestName;
67  import org.mockito.Mockito;
68  
69  /**
70   * Testing for lock up of WAL subsystem.
71   * Copied from TestHRegion.
72   */
73  @Category({MediumTests.class})
74  public class TestWALLockup {
75    private static final Log LOG = LogFactory.getLog(TestWALLockup.class);
76    @Rule public TestName name = new TestName();
77  
78    private static final String COLUMN_FAMILY = "MyCF";
79    private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
80  
81    HRegion region = null;
82    // Do not run unit tests in parallel (? Why not?  It don't work?  Why not?  St.Ack)
83    private static HBaseTestingUtility TEST_UTIL;
84    private static Configuration CONF ;
85    private String dir;
86  
87    // Test names
88    protected TableName tableName;
89  
90    @Before
91    public void setup() throws IOException {
92      TEST_UTIL = HBaseTestingUtility.createLocalHTU();
93      CONF = TEST_UTIL.getConfiguration();
94      // Disable block cache.
95      CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
96      dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
97      tableName = TableName.valueOf(name.getMethodName());
98    }
99  
100   @After
101   public void tearDown() throws Exception {
102     EnvironmentEdgeManagerTestHelper.reset();
103     LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
104     TEST_UTIL.cleanupTestDir();
105   }
106 
107   String getName() {
108     return name.getMethodName();
109   }
110 
111   /**
112    * Reproduce locking up that happens when we get an inopportune sync during setup for
113    * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
114    * it is locked up.
115    * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
116    * set up a dodgy WAL that will throw an exception when we go to append to it.
117    */
118   @Ignore @Test (timeout=30000)
119   public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
120     // A WAL that we can have throw exceptions when a flag is set.
121     class DodgyFSLog extends FSHLog {
122       // Set this when want the WAL to start throwing exceptions.
123       volatile boolean throwException = false;
124 
125       // Latch to hold up processing until after another operation has had time to run.
126       CountDownLatch latch = new CountDownLatch(1);
127 
128       public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
129       throws IOException {
130         super(fs, root, logDir, conf);
131       }
132 
133       @Override
134       protected void afterCreatingZigZagLatch() {
135         // If throwException set, then append will throw an exception causing the WAL to be
136         // rolled. We'll come in here. Hold up processing until a sync can get in before
137         // the zigzag has time to complete its setup and get its own sync in. This is what causes
138         // the lock up we've seen in production.
139         if (throwException) {
140           try {
141             LOG.info("LATCHED");
142             this.latch.await();
143           } catch (InterruptedException e) {
144             // TODO Auto-generated catch block
145             e.printStackTrace();
146           }
147         }
148       }
149 
150       @Override
151       protected void beforeWaitOnSafePoint() {
152         if (throwException) {
153           LOG.info("COUNTDOWN");
154           // Don't countdown latch until someone waiting on it otherwise, the above
155           // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
156           // be stuck; test won't go down
157           while (this.latch.getCount() <= 0) Threads.sleep(1);
158           this.latch.countDown();
159         }
160       }
161 
162       @Override
163       protected Writer createWriterInstance(Path path) throws IOException {
164         final Writer w = super.createWriterInstance(path);
165         return new Writer() {
166           @Override
167           public void close() throws IOException {
168             w.close();
169           }
170 
171           @Override
172           public void sync() throws IOException {
173             if (throwException) {
174               throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
175             }
176             w.sync();
177           }
178 
179           @Override
180           public void append(Entry entry) throws IOException {
181             if (throwException) {
182               throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
183             }
184             w.append(entry);
185           }
186 
187           @Override
188           public long getLength() throws IOException {
189             return w.getLength();
190           }
191         };
192       }
193     }
194 
195     // Mocked up server and regionserver services. Needed below.
196     Server server = Mockito.mock(Server.class);
197     Mockito.when(server.getConfiguration()).thenReturn(CONF);
198     Mockito.when(server.isStopped()).thenReturn(false);
199     Mockito.when(server.isAborted()).thenReturn(false);
200     RegionServerServices services = Mockito.mock(RegionServerServices.class);
201 
202     // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
203     FileSystem fs = FileSystem.get(CONF);
204     Path rootDir = new Path(dir + getName());
205     DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
206     Path originalWAL = dodgyWAL.getCurrentFileName();
207     // I need a log roller running.
208     LogRoller logRoller = new LogRoller(server, services);
209     logRoller.addWAL(dodgyWAL);
210     // There is no 'stop' once a logRoller is running.. it just dies.
211     logRoller.start();
212     // Now get a region and start adding in edits.
213     HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
214     final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
215     byte [] bytes = Bytes.toBytes(getName());
216     try {
217       // First get something into memstore. Make a Put and then pull the Cell out of it. Will
218       // manage append and sync carefully in below to manufacture hang. We keep adding same
219       // edit. WAL subsystem doesn't care.
220       Put put = new Put(bytes);
221       put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
222       WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName());
223       WALEdit edit = new WALEdit();
224       List<Cell> cells = new ArrayList<Cell>();
225       for (CellScanner cs = put.cellScanner(); cs.advance();) {
226         edit.add(cs.current());
227         cells.add(cs.current());
228       }
229       // Put something in memstore and out in the WAL. Do a big number of appends so we push
230       // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
231       for (int i = 0; i < 1000; i++) {
232         dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true,
233           cells);
234       }
235       // Set it so we start throwing exceptions.
236       dodgyWAL.throwException = true;
237       // This append provokes a WAL roll.
238       dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true, cells);
239       boolean exception = false;
240       try {
241         dodgyWAL.sync();
242       } catch (Exception e) {
243         exception = true;
244       }
245       assertTrue("Did not get sync exception", exception);
246 
247       // Get a memstore flush going too so we have same hung profile as up in the issue over
248       // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
249       // by the zigzaglatch waiting on syncs to come home.
250       Thread t = new Thread ("flusher") {
251         public void run() {
252           try {
253             region.flush(false);
254           } catch (IOException e) {
255             // TODO Auto-generated catch block
256             e.printStackTrace();
257           }
258         };
259       };
260       t.setDaemon(true);
261       t.start();
262       // Wait till it gets into flushing. It will get stuck on getSequenceId. Then proceed.
263       while (!region.writestate.flushing) Threads.sleep(1);
264       // Now assert I got a new WAL file put in place even though loads of errors above.
265       assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
266       // Can I append to it?
267       dodgyWAL.throwException = false;
268       region.put(put);
269     } finally {
270       // To stop logRoller, its server has to say it is stopped.
271       Mockito.when(server.isStopped()).thenReturn(true);
272       if (logRoller != null) logRoller.interrupt();
273       if (region != null) region.close();
274       if (dodgyWAL != null) dodgyWAL.close();
275     }
276   }
277 
278   /**
279    * Reproduce locking up that happens when there's no further syncs after
280    * append fails, and causing an isolated sync then infinite wait. See
281    * HBASE-16960. If below is broken, we will see this test timeout because it
282    * is locked up.
283    * <p/>
284    * Steps for reproduce:<br/>
285    * 1. Trigger server abort through dodgyWAL1<br/>
286    * 2. Add a {@link DummyWALActionsListener} to dodgyWAL2 to cause ringbuffer
287    * event handler thread sleep for a while thus keeping {@code endOfBatch}
288    * false<br/>
289    * 3. Publish a sync then an append which will throw exception, check whether
290    * the sync could return
291    */
292   @Test(timeout = 20000)
293   public void testLockup16960() throws IOException {
294     // A WAL that we can have throw exceptions when a flag is set.
295     class DodgyFSLog extends FSHLog {
296       // Set this when want the WAL to start throwing exceptions.
297       volatile boolean throwException = false;
298 
299       public DodgyFSLog(FileSystem fs, Path root, String logDir,
300           Configuration conf) throws IOException {
301         super(fs, root, logDir, conf);
302       }
303 
304       @Override
305       protected Writer createWriterInstance(Path path) throws IOException {
306         final Writer w = super.createWriterInstance(path);
307         return new Writer() {
308           @Override
309           public void close() throws IOException {
310             w.close();
311           }
312 
313           @Override
314           public void sync() throws IOException {
315             if (throwException) {
316               throw new IOException(
317                   "FAKE! Failed to replace a bad datanode...SYNC");
318             }
319             w.sync();
320           }
321 
322           @Override
323           public void append(Entry entry) throws IOException {
324             if (throwException) {
325               throw new IOException(
326                   "FAKE! Failed to replace a bad datanode...APPEND");
327             }
328             w.append(entry);
329           }
330 
331           @Override
332           public long getLength() throws IOException {
333             return w.getLength();
334           }
335         };
336       }
337 
338       @Override
339       public byte[][] rollWriter(boolean force) throws FailedLogCloseException,
340           IOException {
341         if (throwException) {
342           throw new FailedLogCloseException("testLockup16960");
343         }
344         return super.rollWriter(force);
345       }
346     }
347 
348     // Mocked up server and regionserver services. Needed below.
349     Server server = new DummyServer(CONF, ServerName.valueOf(
350         "hostname1.example.org", 1234, 1L).toString());
351     RegionServerServices services = Mockito.mock(RegionServerServices.class);
352 
353     CONF.setLong("hbase.regionserver.hlog.sync.timeout", 10000);
354 
355     // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL,
356     // go ahead with test.
357     FileSystem fs = FileSystem.get(CONF);
358     Path rootDir = new Path(dir + getName());
359     DodgyFSLog dodgyWAL1 = new DodgyFSLog(fs, rootDir, getName(), CONF);
360 
361     Path rootDir2 = new Path(dir + getName() + "2");
362     final DodgyFSLog dodgyWAL2 = new DodgyFSLog(fs, rootDir2, getName() + "2",
363         CONF);
364     // Add a listener to force ringbuffer event handler sleep for a while
365     dodgyWAL2.registerWALActionsListener(new DummyWALActionsListener());
366 
367     // I need a log roller running.
368     LogRoller logRoller = new LogRoller(server, services);
369     logRoller.addWAL(dodgyWAL1);
370     logRoller.addWAL(dodgyWAL2);
371     // There is no 'stop' once a logRoller is running.. it just dies.
372     logRoller.start();
373     // Now get a region and start adding in edits.
374     HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
375     final HRegion region = initHRegion(tableName, null, null, dodgyWAL1);
376     byte[] bytes = Bytes.toBytes(getName());
377 
378     try {
379       Put put = new Put(bytes);
380       put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
381       WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
382           htd.getTableName());
383       WALEdit edit = new WALEdit();
384       List<Cell> cells = new ArrayList<Cell>();
385       for (CellScanner cs = put.cellScanner(); cs.advance();) {
386         edit.add(cs.current());
387         cells.add(cs.current());
388       }
389 
390       LOG.info("SET throwing of exception on append");
391       dodgyWAL1.throwException = true;
392       // This append provokes a WAL roll request
393       dodgyWAL1.append(htd, region.getRegionInfo(), key, edit,
394           region.getSequenceId(), true, cells);
395       boolean exception = false;
396       try {
397         dodgyWAL1.sync();
398       } catch (Exception e) {
399         exception = true;
400       }
401       assertTrue("Did not get sync exception", exception);
402 
403       // LogRoller call dodgyWAL1.rollWriter get FailedLogCloseException and
404       // cause server abort.
405       try {
406         // wait LogRoller exit.
407         Thread.sleep(50);
408       } catch (InterruptedException e) {
409         e.printStackTrace();
410       }
411 
412       final CountDownLatch latch = new CountDownLatch(1);
413 
414       // make RingBufferEventHandler sleep 1s, so the following sync
415       // endOfBatch=false
416       key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
417           TableName.valueOf("sleep"));
418       dodgyWAL2.append(htd, region.getRegionInfo(), key, edit,
419           region.getSequenceId(), true, cells);
420 
421       Thread t = new Thread("Sync") {
422         public void run() {
423           try {
424             dodgyWAL2.sync();
425           } catch (IOException e) {
426             LOG.info("In sync", e);
427           }
428           latch.countDown();
429           LOG.info("Sync exiting");
430         };
431       };
432       t.setDaemon(true);
433       t.start();
434       try {
435         // make sure sync have published.
436         Thread.sleep(100);
437       } catch (InterruptedException e1) {
438         e1.printStackTrace();
439       }
440       // make append throw DamagedWALException
441       key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(),
442           TableName.valueOf("DamagedWALException"));
443       dodgyWAL2.append(htd, region.getRegionInfo(), key, edit,
444           region.getSequenceId(), true, cells);
445 
446       while (latch.getCount() > 0) {
447         Threads.sleep(100);
448       }
449       assertTrue(server.isAborted());
450     } finally {
451       if (logRoller != null) {
452         logRoller.interrupt();
453       }
454       try {
455         if (region != null) {
456           region.close();
457         }
458         if (dodgyWAL1 != null) {
459           dodgyWAL1.close();
460         }
461         if (dodgyWAL2 != null) {
462           dodgyWAL2.close();
463         }
464       } catch (Exception e) {
465         LOG.info("On way out", e);
466       }
467     }
468   }
469 
470   static class DummyServer implements Server {
471     private Configuration conf;
472     private String serverName;
473     private boolean isAborted = false;
474 
475     public DummyServer(Configuration conf, String serverName) {
476       this.conf = conf;
477       this.serverName = serverName;
478     }
479 
480     @Override
481     public Configuration getConfiguration() {
482       return conf;
483     }
484 
485     @Override
486     public ZooKeeperWatcher getZooKeeper() {
487       return null;
488     }
489 
490     @Override
491     public CoordinatedStateManager getCoordinatedStateManager() {
492       return null;
493     }
494 
495     @Override
496     public ClusterConnection getConnection() {
497       return null;
498     }
499 
500     @Override
501     public MetaTableLocator getMetaTableLocator() {
502       return null;
503     }
504 
505     @Override
506     public ServerName getServerName() {
507       return ServerName.valueOf(this.serverName);
508     }
509 
510     @Override
511     public void abort(String why, Throwable e) {
512       LOG.info("Aborting " + serverName);
513       this.isAborted = true;
514     }
515 
516     @Override
517     public boolean isAborted() {
518       return this.isAborted;
519     }
520 
521     @Override
522     public void stop(String why) {
523       this.isAborted = true;
524     }
525 
526     @Override
527     public boolean isStopped() {
528       return this.isAborted;
529     }
530 
531     @Override
532     public ChoreService getChoreService() {
533       return null;
534     }
535 
536   }
537 
538   static class DummyWALActionsListener extends WALActionsListener.Base {
539 
540     @Override
541     public void visitLogEntryBeforeWrite(HTableDescriptor htd, WALKey logKey,
542         WALEdit logEdit) throws IOException {
543       if (logKey.getTablename().getNameAsString().equalsIgnoreCase("sleep")) {
544         try {
545           Thread.sleep(1000);
546         } catch (InterruptedException e) {
547           e.printStackTrace();
548         }
549       }
550       if (logKey.getTablename().getNameAsString()
551           .equalsIgnoreCase("DamagedWALException")) {
552         throw new DamagedWALException("Failed appending");
553       }
554     }
555 
556   }
557 
558   /**
559    * @return A region on which you must call
560    *         {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} when done.
561    */
562   public HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, WAL wal)
563   throws IOException {
564     return TEST_UTIL.createLocalHRegion(tableName.getName(), startKey, stopKey,
565       getName(), CONF, false, Durability.SYNC_WAL,
566       wal, COLUMN_FAMILY_BYTES);
567   }
568 }