View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  
22  import static org.junit.Assert.assertTrue;
23  
24  import java.io.IOException;
25  import java.util.concurrent.CountDownLatch;
26  import java.util.concurrent.TimeUnit;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.fs.FileSystem;
32  import org.apache.hadoop.fs.Path;
33  import org.apache.hadoop.hbase.CellScanner;
34  import org.apache.hadoop.hbase.HBaseTestingUtility;
35  import org.apache.hadoop.hbase.HConstants;
36  import org.apache.hadoop.hbase.HTableDescriptor;
37  import org.apache.hadoop.hbase.Server;
38  import org.apache.hadoop.hbase.TableName;
39  import org.apache.hadoop.hbase.client.Durability;
40  import org.apache.hadoop.hbase.client.Put;
41  import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
42  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
43  import org.apache.hadoop.hbase.testclassification.MediumTests;
44  import org.apache.hadoop.hbase.util.Bytes;
45  import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
46  import org.apache.hadoop.hbase.util.Threads;
47  import org.apache.hadoop.hbase.wal.WAL;
48  import org.apache.hadoop.hbase.wal.WALKey;
49  import org.apache.hadoop.hbase.wal.WALProvider.Writer;
50  import org.junit.After;
51  import org.junit.Before;
52  import org.junit.Rule;
53  import org.junit.Test;
54  import org.junit.experimental.categories.Category;
55  import org.junit.rules.TestName;
56  import org.mockito.Mockito;
57  
58  /**
59   * Testing for lock up of WAL subsystem.
60   * Copied from TestHRegion.
61   */
62  @Category({MediumTests.class})
63  public class TestWALLockup {
64    private static final Log LOG = LogFactory.getLog(TestWALLockup.class);
65    @Rule public TestName name = new TestName();
66  
67    private static final String COLUMN_FAMILY = "MyCF";
68    private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
69  
70    HRegion region = null;
71    // Do not run unit tests in parallel (? Why not?  It don't work?  Why not?  St.Ack)
72    private static HBaseTestingUtility TEST_UTIL;
73    private static Configuration CONF ;
74    private String dir;
75  
76    // Test names
77    protected TableName tableName;
78  
79    @Before
80    public void setup() throws IOException {
81      TEST_UTIL = HBaseTestingUtility.createLocalHTU();
82      CONF = TEST_UTIL.getConfiguration();
83      // Disable block cache.
84      CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
85      dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
86      tableName = TableName.valueOf(name.getMethodName());
87    }
88  
89    @After
90    public void tearDown() throws Exception {
91      EnvironmentEdgeManagerTestHelper.reset();
92      LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
93      TEST_UTIL.cleanupTestDir();
94    }
95  
96    String getName() {
97      return name.getMethodName();
98    }
99  
100   /**
101    * Reproduce locking up that happens when we get an inopportune sync during setup for
102    * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
103    * it is locked up.
104    * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
105    * set up a dodgy WAL that will throw an exception when we go to append to it.
106    */
107   @Test (timeout=20000)
108   public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
109     // A WAL that we can have throw exceptions when a flag is set.
110     class DodgyFSLog extends FSHLog {
111       // Set this when want the WAL to start throwing exceptions.
112       volatile boolean throwException = false;
113 
114       // Latch to hold up processing until after another operation has had time to run.
115       CountDownLatch latch = new CountDownLatch(1);
116 
117       public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
118       throws IOException {
119         super(fs, root, logDir, conf);
120       }
121 
122       @Override
123       protected void afterCreatingZigZagLatch() {
124         // If throwException set, then append will throw an exception causing the WAL to be
125         // rolled. We'll come in here. Hold up processing until a sync can get in before
126         // the zigzag has time to complete its setup and get its own sync in. This is what causes
127         // the lock up we've seen in production.
128         if (throwException) {
129           try {
130             LOG.info("LATCHED");
131             // So, timing can have it that the test can run and the bad flush below happens
132             // before we get here. In this case, we'll be stuck waiting on this latch but there
133             // is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
134             // because all WALs have rolled. In this case, just give up on test.
135             if (!this.latch.await(5, TimeUnit.SECONDS)) {
136               LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
137             }
138           } catch (InterruptedException e) {
139             // TODO Auto-generated catch block
140             e.printStackTrace();
141           }
142         }
143       }
144 
145       @Override
146       protected void beforeWaitOnSafePoint() {
147         if (throwException) {
148           LOG.info("COUNTDOWN");
149           // Don't countdown latch until someone waiting on it otherwise, the above
150           // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
151           // be stuck; test won't go down
152           while (this.latch.getCount() <= 0) Threads.sleep(1);
153           this.latch.countDown();
154         }
155       }
156 
157       @Override
158       protected Writer createWriterInstance(Path path) throws IOException {
159         final Writer w = super.createWriterInstance(path);
160         return new Writer() {
161           @Override
162           public void close() throws IOException {
163             w.close();
164           }
165 
166           @Override
167           public void sync() throws IOException {
168             if (throwException) {
169               throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
170             }
171             w.sync();
172           }
173 
174           @Override
175           public void append(Entry entry) throws IOException {
176             if (throwException) {
177               throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
178             }
179             w.append(entry);
180           }
181 
182           @Override
183           public long getLength() throws IOException {
184             return w.getLength();
185           }
186         };
187       }
188     }
189 
190     // Mocked up server and regionserver services. Needed below.
191     Server server = Mockito.mock(Server.class);
192     Mockito.when(server.getConfiguration()).thenReturn(CONF);
193     Mockito.when(server.isStopped()).thenReturn(false);
194     Mockito.when(server.isAborted()).thenReturn(false);
195     RegionServerServices services = Mockito.mock(RegionServerServices.class);
196 
197     // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
198     FileSystem fs = FileSystem.get(CONF);
199     Path rootDir = new Path(dir + getName());
200     DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
201     Path originalWAL = dodgyWAL.getCurrentFileName();
202     // I need a log roller running.
203     LogRoller logRoller = new LogRoller(server, services);
204     logRoller.addWAL(dodgyWAL);
205     // There is no 'stop' once a logRoller is running.. it just dies.
206     logRoller.start();
207     // Now get a region and start adding in edits.
208     HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME);
209     final HRegion region = initHRegion(tableName, null, null, dodgyWAL);
210     byte [] bytes = Bytes.toBytes(getName());
211     try {
212       // First get something into memstore. Make a Put and then pull the Cell out of it. Will
213       // manage append and sync carefully in below to manufacture hang. We keep adding same
214       // edit. WAL subsystem doesn't care.
215       Put put = new Put(bytes);
216       put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
217       WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName());
218       WALEdit edit = new WALEdit();
219       CellScanner CellScanner = put.cellScanner();
220       assertTrue(CellScanner.advance());
221       edit.add(CellScanner.current());
222       // Put something in memstore and out in the WAL. Do a big number of appends so we push
223       // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
224       for (int i = 0; i < 1000; i++) {
225         region.put(put);
226       }
227       // Set it so we start throwing exceptions.
228       LOG.info("SET throwing of exception on append");
229       dodgyWAL.throwException = true;
230       // This append provokes a WAL roll request
231       dodgyWAL.append(htd, region.getRegionInfo(), key, edit, true);
232       boolean exception = false;
233       try {
234         dodgyWAL.sync();
235       } catch (Exception e) {
236         exception = true;
237       }
238       assertTrue("Did not get sync exception", exception);
239 
240       // Get a memstore flush going too so we have same hung profile as up in the issue over
241       // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
242       // by the zigzaglatch waiting on syncs to come home.
243       Thread t = new Thread ("Flusher") {
244         public void run() {
245           try {
246             if (region.getMemstoreSize() <= 0) {
247               throw new IOException("memstore size=" + region.getMemstoreSize());
248             }
249             region.flush(false);
250           } catch (IOException e) {
251             // Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
252             // when roll completes.
253             LOG.info("In flush", e);
254           }
255           LOG.info("Exiting");
256         };
257       };
258       t.setDaemon(true);
259       t.start();
260       // Wait until 
261       while (dodgyWAL.latch.getCount() > 0) Threads.sleep(1);
262       // Now assert I got a new WAL file put in place even though loads of errors above.
263       assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
264       // Can I append to it?
265       dodgyWAL.throwException = false;
266       try {
267         region.put(put);
268       } catch (Exception e) {
269         LOG.info("In the put", e);
270       }
271     } finally {
272       // To stop logRoller, its server has to say it is stopped.
273       Mockito.when(server.isStopped()).thenReturn(true);
274       if (logRoller != null) logRoller.interrupt();
275       try {
276         if (region != null) region.close();
277         if (dodgyWAL != null) dodgyWAL.close();
278       } catch (Exception e) {
279         LOG.info("On way out", e);
280       }
281     }
282   }
283 
284   /**
285    * @return A region on which you must call
286    *         {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} when done.
287    */
288   public HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, WAL wal)
289   throws IOException {
290     return TEST_UTIL.createLocalHRegion(tableName.getName(), startKey, stopKey,
291       getName(), CONF, false, Durability.SYNC_WAL,
292       wal, COLUMN_FAMILY_BYTES);
293   }
294 }