View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.apache.hadoop.hbase.HBaseTestingUtility;
26  import org.apache.hadoop.hbase.HConstants;
27  import org.apache.hadoop.hbase.HRegionInfo;
28  import org.apache.hadoop.hbase.TableName;
29  import org.apache.hadoop.hbase.client.Durability;
30  import org.apache.hadoop.hbase.client.HTable;
31  import org.apache.hadoop.hbase.client.Put;
32  import org.apache.hadoop.hbase.client.RegionLocator;
33  import org.apache.hadoop.hbase.client.Result;
34  import org.apache.hadoop.hbase.client.ResultScanner;
35  import org.apache.hadoop.hbase.client.Scan;
36  import org.apache.hadoop.hbase.client.Table;
37  import org.apache.hadoop.hbase.testclassification.LargeTests;
38  import org.apache.hadoop.hbase.util.Bytes;
39  import org.junit.AfterClass;
40  import org.junit.Assert;
41  import org.junit.Before;
42  import org.junit.BeforeClass;
43  import org.junit.Ignore;
44  import org.junit.Test;
45  import org.junit.experimental.categories.Category;
46  
47  /**
48   * Test transitions of state across the master.  Sets up the cluster once and
49   * then runs a couple of tests.
50   */
51  @Category(LargeTests.class)
52  public class TestMasterTransitions {
53    private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class);
54    private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
55    private static final TableName TABLENAME = TableName.valueOf("master_transitions");
56    private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"),
57      Bytes.toBytes("b"), Bytes.toBytes("c")};
58  
59    /**
60     * Start up a mini cluster and put a small table of many empty regions into it.
61     * @throws Exception
62     */
63    @BeforeClass public static void beforeAllTests() throws Exception {
64      TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true);
65      TEST_UTIL.startMiniCluster(2);
66      // Create a table of three families.  This will assign a region.
67      TEST_UTIL.createMultiRegionTable(TABLENAME, FAMILIES);
68      HTable t = (HTable) TEST_UTIL.getConnection().getTable(TABLENAME);
69      int countOfRegions = -1;
70      try (RegionLocator r = t.getRegionLocator()) {
71        countOfRegions = r.getStartKeys().length;
72      }
73      TEST_UTIL.waitUntilAllRegionsAssigned(TABLENAME);
74      addToEachStartKey(countOfRegions);
75      t.close();
76    }
77  
78    @AfterClass public static void afterAllTests() throws Exception {
79      TEST_UTIL.shutdownMiniCluster();
80    }
81  
82    @Before public void setup() throws IOException {
83      TEST_UTIL.ensureSomeRegionServersAvailable(2);
84    }
85  
86    /**
87     * Listener for regionserver events testing hbase-2428 (Infinite loop of
88     * region closes if hbase:meta region is offline).  In particular, listen
89     * for the close of the 'metaServer' and when it comes in, requeue it with a
90     * delay as though there were an issue processing the shutdown.  As part of
91     * the requeuing,  send over a close of a region on 'otherServer' so it comes
92     * into a master that has its meta region marked as offline.
93     */
94    /*
95    static class HBase2428Listener implements RegionServerOperationListener {
96      // Map of what we've delayed so we don't do do repeated delays.
97      private final Set<RegionServerOperation> postponed =
98        new CopyOnWriteArraySet<RegionServerOperation>();
99      private boolean done = false;;
100     private boolean metaShutdownReceived = false;
101     private final HServerAddress metaAddress;
102     private final MiniHBaseCluster cluster;
103     private final int otherServerIndex;
104     private final HRegionInfo hri;
105     private int closeCount = 0;
106     static final int SERVER_DURATION = 3 * 1000;
107     static final int CLOSE_DURATION = 1 * 1000;
108 
109     HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress,
110         final HRegionInfo closingHRI, final int otherServerIndex) {
111       this.cluster = c;
112       this.metaAddress = metaAddress;
113       this.hri = closingHRI;
114       this.otherServerIndex = otherServerIndex;
115     }
116 
117     @Override
118     public boolean process(final RegionServerOperation op) throws IOException {
119       // If a regionserver shutdown and its of the meta server, then we want to
120       // delay the processing of the shutdown and send off a close of a region on
121       // the 'otherServer.
122       boolean result = true;
123       if (op instanceof ProcessServerShutdown) {
124         ProcessServerShutdown pss = (ProcessServerShutdown)op;
125         if (pss.getDeadServerAddress().equals(this.metaAddress)) {
126           // Don't postpone more than once.
127           if (!this.postponed.contains(pss)) {
128             // Close some region.
129             this.cluster.addMessageToSendRegionServer(this.otherServerIndex,
130               new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri,
131               Bytes.toBytes("Forcing close in test")));
132             this.postponed.add(pss);
133             // Put off the processing of the regionserver shutdown processing.
134             pss.setDelay(SERVER_DURATION);
135             this.metaShutdownReceived = true;
136             // Return false.  This will add this op to the delayed queue.
137             result = false;
138           }
139         }
140       } else {
141         // Have the close run frequently.
142         if (isWantedCloseOperation(op) != null) {
143           op.setDelay(CLOSE_DURATION);
144           // Count how many times it comes through here.
145           this.closeCount++;
146         }
147       }
148       return result;
149     }
150 
151     public void processed(final RegionServerOperation op) {
152       if (isWantedCloseOperation(op) != null) return;
153       this.done = true;
154     }
155 */
156     /*
157      * @param op
158      * @return Null if not the wanted ProcessRegionClose, else <code>op</code>
159      * cast as a ProcessRegionClose.
160      */
161   /*
162     private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) {
163       // Count every time we get a close operation.
164       if (op instanceof ProcessRegionClose) {
165         ProcessRegionClose c = (ProcessRegionClose)op;
166         if (c.regionInfo.equals(hri)) {
167           return c;
168         }
169       }
170       return null;
171     }
172 
173     boolean isDone() {
174       return this.done;
175     }
176 
177     boolean isMetaShutdownReceived() {
178       return metaShutdownReceived;
179     }
180 
181     int getCloseCount() {
182       return this.closeCount;
183     }
184 
185     @Override
186     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
187       return true;
188     }
189   }
190 */
191   /**
192    * In 2428, the meta region has just been set offline and then a close comes
193    * in.
194    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a>
195    */
196   @Ignore @Test  (timeout=300000) public void testRegionCloseWhenNoMetaHBase2428()
197   throws Exception {
198     /*
199     LOG.info("Running testRegionCloseWhenNoMetaHBase2428");
200     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
201     final HMaster master = cluster.getMaster();
202     int metaIndex = cluster.getServerWithMeta();
203     // Figure the index of the server that is not server the hbase:meta
204     int otherServerIndex = -1;
205     for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) {
206       if (i == metaIndex) continue;
207       otherServerIndex = i;
208       break;
209     }
210     final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex);
211     final HRegionServer metaHRS = cluster.getRegionServer(metaIndex);
212 
213     // Get a region out on the otherServer.
214     final HRegionInfo hri =
215       otherServer.getOnlineRegions().iterator().next().getRegionInfo();
216 
217     // Add our RegionServerOperationsListener
218     HBase2428Listener listener = new HBase2428Listener(cluster,
219       metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex);
220     master.getRegionServerOperationQueue().
221       registerRegionServerOperationListener(listener);
222     try {
223       // Now close the server carrying meta.
224       cluster.abortRegionServer(metaIndex);
225 
226       // First wait on receipt of meta server shutdown message.
227       while(!listener.metaShutdownReceived) Threads.sleep(100);
228       while(!listener.isDone()) Threads.sleep(10);
229       // We should not have retried the close more times than it took for the
230       // server shutdown message to exit the delay queue and get processed
231       // (Multiple by two to add in some slop in case of GC or something).
232       assertTrue(listener.getCloseCount() > 1);
233       assertTrue(listener.getCloseCount() <
234         ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2));
235 
236       // Assert the closed region came back online
237       assertRegionIsBackOnline(hri);
238     } finally {
239       master.getRegionServerOperationQueue().
240         unregisterRegionServerOperationListener(listener);
241     }
242     */
243   }
244 
245   /**
246    * Test adding in a new server before old one on same host+port is dead.
247    * Make the test more onerous by having the server under test carry the meta.
248    * If confusion between old and new, purportedly meta never comes back.  Test
249    * that meta gets redeployed.
250    */
251   @Ignore @Test (timeout=300000) public void testAddingServerBeforeOldIsDead2413()
252   throws IOException {
253     /*
254     LOG.info("Running testAddingServerBeforeOldIsDead2413");
255     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
256     int count = count();
257     int metaIndex = cluster.getServerWithMeta();
258     MiniHBaseClusterRegionServer metaHRS =
259       (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex);
260     int port = metaHRS.getServerInfo().getServerAddress().getPort();
261     Configuration c = TEST_UTIL.getConfiguration();
262     String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0");
263     try {
264       LOG.info("KILLED=" + metaHRS);
265       metaHRS.kill();
266       c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port));
267       // Try and start new regionserver.  It might clash with the old
268       // regionserver port so keep trying to get past the BindException.
269       HRegionServer hrs = null;
270       while (true) {
271         try {
272           hrs = cluster.startRegionServer().getRegionServer();
273           break;
274         } catch (IOException e) {
275           if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) {
276             InvocationTargetException ee = (InvocationTargetException)e.getCause();
277             if (ee.getCause() != null && ee.getCause() instanceof BindException) {
278               LOG.info("BindException; retrying: " + e.toString());
279             }
280           }
281         }
282       }
283       LOG.info("STARTED=" + hrs);
284       // Wait until he's been given at least 3 regions before we go on to try
285       // and count rows in table.
286       while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100);
287       LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() +
288         " regions");
289       assertEquals(count, count());
290     } finally {
291       c.set(HConstants.REGIONSERVER_PORT, oldPort);
292     }
293     */
294   }
295 
296   /**
297    * HBase2482 is about outstanding region openings.  If any are outstanding
298    * when a regionserver goes down, then they'll never deploy.  They'll be
299    * stuck in the regions-in-transition list for ever.  This listener looks
300    * for a region opening HMsg and if its from the server passed on construction,
301    * then we kill it.  It also looks out for a close message on the victim
302    * server because that signifies start of the fireworks.
303    */
304   /*
305   static class HBase2482Listener implements RegionServerOperationListener {
306     private final HRegionServer victim;
307     private boolean abortSent = false;
308     // We closed regions on new server.
309     private volatile boolean closed = false;
310     // Copy of regions on new server
311     private final Collection<HRegion> copyOfOnlineRegions;
312     // This is the region that was in transition on the server we aborted. Test
313     // passes if this region comes back online successfully.
314     private HRegionInfo regionToFind;
315 
316     HBase2482Listener(final HRegionServer victim) {
317       this.victim = victim;
318       // Copy regions currently open on this server so I can notice when
319       // there is a close.
320       this.copyOfOnlineRegions =
321         this.victim.getCopyOfOnlineRegionsSortedBySize().values();
322     }
323 
324     @Override
325     public boolean process(HServerInfo serverInfo, HMsg incomingMsg) {
326       if (!victim.getServerInfo().equals(serverInfo) ||
327           this.abortSent || !this.closed) {
328         return true;
329       }
330       if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true;
331       // Save the region that is in transition so can test later it came back.
332       this.regionToFind = incomingMsg.getRegionInfo();
333       String msg = "ABORTING " + this.victim + " because got a " +
334         HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " +
335         incomingMsg.getRegionInfo().getRegionNameAsString();
336       this.victim.abort(msg);
337       this.abortSent = true;
338       return true;
339     }
340 
341     @Override
342     public boolean process(RegionServerOperation op) throws IOException {
343       return true;
344     }
345 
346     @Override
347     public void processed(RegionServerOperation op) {
348       if (this.closed || !(op instanceof ProcessRegionClose)) return;
349       ProcessRegionClose close = (ProcessRegionClose)op;
350       for (HRegion r: this.copyOfOnlineRegions) {
351         if (r.getRegionInfo().equals(close.regionInfo)) {
352           // We've closed one of the regions that was on the victim server.
353           // Now can start testing for when all regions are back online again
354           LOG.info("Found close of " +
355             r.getRegionInfo().getRegionNameAsString() +
356             "; setting close happened flag");
357           this.closed = true;
358           break;
359         }
360       }
361     }
362   }
363 */
364   /**
365    * In 2482, a RS with an opening region on it dies.  The said region is then
366    * stuck in the master's regions-in-transition and never leaves it.  This
367    * test works by bringing up a new regionserver, waiting for the load
368    * balancer to give it some regions.  Then, we close all on the new server.
369    * After sending all the close messages, we send the new regionserver the
370    * special blocking message so it can not process any more messages.
371    * Meantime reopening of the just-closed regions is backed up on the new
372    * server.  Soon as master gets an opening region from the new regionserver,
373    * we kill it.  We then wait on all regions to come back on line.  If bug
374    * is fixed, this should happen soon as the processing of the killed server is
375    * done.
376    * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a>
377    */
378   @Ignore @Test (timeout=300000) public void testKillRSWithOpeningRegion2482()
379   throws Exception {
380     /*
381     LOG.info("Running testKillRSWithOpeningRegion2482");
382     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
383     if (cluster.getLiveRegionServerThreads().size() < 2) {
384       // Need at least two servers.
385       cluster.startRegionServer();
386     }
387     // Count how many regions are online.  They need to be all back online for
388     // this test to succeed.
389     int countOfMetaRegions = countOfMetaRegions();
390     // Add a listener on the server.
391     HMaster m = cluster.getMaster();
392     // Start new regionserver.
393     MiniHBaseClusterRegionServer hrs =
394       (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer();
395     LOG.info("Started new regionserver: " + hrs.toString());
396     // Wait until has some regions before proceeding.  Balancer will give it some.
397     int minimumRegions =
398       countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2);
399     while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100);
400     // Set the listener only after some regions have been opened on new server.
401     HBase2482Listener listener = new HBase2482Listener(hrs);
402     m.getRegionServerOperationQueue().
403       registerRegionServerOperationListener(listener);
404     try {
405       // Go close all non-catalog regions on this new server
406       closeAllNonCatalogRegions(cluster, hrs);
407       // After all closes, add blocking message before the region opens start to
408       // come in.
409       cluster.addMessageToSendRegionServer(hrs,
410         new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER));
411       // Wait till one of the above close messages has an effect before we start
412       // wait on all regions back online.
413       while (!listener.closed) Threads.sleep(100);
414       LOG.info("Past close");
415       // Make sure the abort server message was sent.
416       while(!listener.abortSent) Threads.sleep(100);
417       LOG.info("Past abort send; waiting on all regions to redeploy");
418       // Now wait for regions to come back online.
419       assertRegionIsBackOnline(listener.regionToFind);
420     } finally {
421       m.getRegionServerOperationQueue().
422         unregisterRegionServerOperationListener(listener);
423     }
424     */
425   }
426 
427   /*
428    * @return Count of all non-catalog regions on the designated server
429    */
430 /*
431   private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster,
432     final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs)
433   throws IOException {
434     int countOfRegions = 0;
435     for (HRegion r: hrs.getOnlineRegions()) {
436       if (r.getRegionInfo().isMetaRegion()) continue;
437       cluster.addMessageToSendRegionServer(hrs,
438         new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo()));
439       LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() +
440         " on " + hrs.toString());
441       countOfRegions++;
442     }
443     return countOfRegions;
444   }
445 
446   private void assertRegionIsBackOnline(final HRegionInfo hri)
447   throws IOException {
448     // Region should have an entry in its startkey because of addRowToEachRegion.
449     byte [] row = getStartKey(hri);
450     HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
451     Get g =  new Get(row);
452     assertTrue((t.get(g)).size() > 0);
453   }
454 
455   /*
456    * @return Count of regions in meta table.
457    * @throws IOException
458    */
459   /*
460   private static int countOfMetaRegions()
461   throws IOException {
462     HTable meta = new HTable(TEST_UTIL.getConfiguration(),
463       HConstants.META_TABLE_NAME);
464     int rows = 0;
465     Scan scan = new Scan();
466     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
467     ResultScanner s = meta.getScanner(scan);
468     for (Result r = null; (r = s.next()) != null;) {
469       byte [] b =
470         r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
471       if (b == null || b.length <= 0) break;
472       rows++;
473     }
474     s.close();
475     return rows;
476   }
477 */
478   /*
479    * Add to each of the regions in hbase:meta a value.  Key is the startrow of the
480    * region (except its 'aaa' for first region).  Actual value is the row name.
481    * @param expected
482    * @return
483    * @throws IOException
484    */
485   private static int addToEachStartKey(final int expected) throws IOException {
486     Table t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME);
487     Table meta = new HTable(TEST_UTIL.getConfiguration(),
488         TableName.META_TABLE_NAME);
489     int rows = 0;
490     Scan scan = new Scan();
491     scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
492     ResultScanner s = meta.getScanner(scan);
493     for (Result r = null; (r = s.next()) != null;) {
494       HRegionInfo hri = HRegionInfo.getHRegionInfo(r);
495       if (hri == null) break;
496       if (!hri.getTable().equals(TABLENAME)) {
497         continue;
498       }
499 
500       // If start key, add 'aaa'.
501       if(!hri.getTable().equals(TABLENAME)) {
502         continue;
503       }
504       byte [] row = getStartKey(hri);
505       Put p = new Put(row);
506       p.setDurability(Durability.SKIP_WAL);
507       p.add(getTestFamily(), getTestQualifier(), row);
508       t.put(p);
509       rows++;
510     }
511     s.close();
512     Assert.assertEquals(expected, rows);
513     t.close();
514     meta.close();
515     return rows;
516   }
517 
518   /*
519    * @param hri
520    * @return Start key for hri (If start key is '', then return 'aaa'.
521    */
522   private static byte [] getStartKey(final HRegionInfo hri) {
523     return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())?
524         Bytes.toBytes("aaa"): hri.getStartKey();
525   }
526 
527   private static byte [] getTestFamily() {
528     return FAMILIES[0];
529   }
530 
531   private static byte [] getTestQualifier() {
532     return getTestFamily();
533   }
534 }