View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.RegionTransition;
39  import org.apache.hadoop.hbase.HTableDescriptor;
40  import org.apache.hadoop.hbase.MetaTableAccessor;
41  import org.apache.hadoop.hbase.Server;
42  import org.apache.hadoop.hbase.ServerLoad;
43  import org.apache.hadoop.hbase.ServerName;
44  import org.apache.hadoop.hbase.TableName;
45  import org.apache.hadoop.hbase.TableStateManager;
46  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
47  import org.apache.hadoop.hbase.MetaTableAccessor;
48  import org.apache.hadoop.hbase.master.RegionState.State;
49  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
50  import org.apache.hadoop.hbase.util.Bytes;
51  import org.apache.hadoop.hbase.util.FSUtils;
52  import org.apache.hadoop.hbase.util.Pair;
53  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
54  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
55  import org.apache.zookeeper.KeeperException;
56  
57  import com.google.common.annotations.VisibleForTesting;
58  import com.google.common.base.Preconditions;
59  
60  /**
61   * Region state accountant. It holds the states of all regions in memory.
62   * In the normal scenario, it should match the meta table and the true region states.
63   *
64   * This map is used by AssignmentManager to track region states.
65   */
66  @InterfaceAudience.Private
67  public class RegionStates {
68    private static final Log LOG = LogFactory.getLog(RegionStates.class);
69  
70    /**
71     * Regions currently in transition.
72     */
73    final HashMap<String, RegionState> regionsInTransition =
74      new HashMap<String, RegionState>();
75  
76    /**
77     * Region encoded name to state map.
78     * All the regions should be in this map.
79     */
80    private final Map<String, RegionState> regionStates =
81      new HashMap<String, RegionState>();
82  
83    /**
84     * Holds the mapping of table -> (encoded region name -> region state)
85     */
86    private final Map<TableName, Map<String, RegionState>> regionStatesTableIndex =
87        new HashMap<TableName, Map<String, RegionState>>();
88  
89    /**
90     * Server to regions assignment map.
91     * Contains the set of regions currently assigned to a given server.
92     */
93    private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
94      new HashMap<ServerName, Set<HRegionInfo>>();
95  
96    /**
97     * Maintains the mapping from the default region to the replica regions.
98     */
99    private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
100     new HashMap<HRegionInfo, Set<HRegionInfo>>();
101 
102   /**
103    * Region to server assignment map.
104    * Contains the server a given region is currently assigned to.
105    */
106   private final TreeMap<HRegionInfo, ServerName> regionAssignments =
107     new TreeMap<HRegionInfo, ServerName>();
108 
109   /**
110    * Encoded region name to server assignment map for re-assignment
111    * purpose. Contains the server a given region is last known assigned
112    * to, which has not completed log splitting, so not assignable.
113    * If a region is currently assigned, this server info in this
114    * map should be the same as that in regionAssignments.
115    * However the info in regionAssignments is cleared when the region
116    * is offline while the info in lastAssignments is cleared when
117    * the region is closed or the server is dead and processed.
118    */
119   private final HashMap<String, ServerName> lastAssignments =
120     new HashMap<String, ServerName>();
121 
122   /**
123    * Encoded region name to server assignment map for the
124    * purpose to clean up serverHoldings when a region is online
125    * on a new server. When the region is offline from the previous
126    * server, we cleaned up regionAssignments so that it has the
127    * latest assignment map. But we didn't clean up serverHoldings
128    * to match the meta. We need this map to find out the old server
129    * whose serverHoldings needs cleanup, given a moved region.
130    */
131   private final HashMap<String, ServerName> oldAssignments =
132     new HashMap<String, ServerName>();
133 
134   /**
135    * Map a host port pair string to the latest start code
136    * of a region server which is known to be dead. It is dead
137    * to us, but server manager may not know it yet.
138    */
139   private final HashMap<String, Long> deadServers =
140     new HashMap<String, Long>();
141 
142   /**
143    * Maps a dead server to the time when its log split was done.
144    * Since log splitting is not ordered, we have to remember
145    * all processed instances. The map is cleaned up based
146    * on a configured time. By default, we assume a dead
147    * server should be done with log splitting in two hours.
148    */
149   private final HashMap<ServerName, Long> processedServers =
150     new HashMap<ServerName, Long>();
151   private long lastProcessedServerCleanTime;
152 
153   private final TableStateManager tableStateManager;
154   private final RegionStateStore regionStateStore;
155   private final ServerManager serverManager;
156   private final Server server;
157 
158   // The maximum time to keep a log split info in region states map
159   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
160   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
161 
162   RegionStates(final Server master, final TableStateManager tableStateManager,
163       final ServerManager serverManager, final RegionStateStore regionStateStore) {
164     this.tableStateManager = tableStateManager;
165     this.regionStateStore = regionStateStore;
166     this.serverManager = serverManager;
167     this.server = master;
168   }
169 
170   /**
171    * @return an unmodifiable the region assignment map
172    */
173   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
174     return Collections.unmodifiableMap(regionAssignments);
175   }
176 
177   /**
178    * Return the replicas (including default) for the regions grouped by ServerName
179    * @param regions
180    * @return a pair containing the groupings as a map
181    */
182   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
183     Collection<HRegionInfo> regions) {
184     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
185     for (HRegionInfo region : regions) {
186       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
187       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
188       if (allReplicas != null) {
189         for (HRegionInfo hri : allReplicas) {
190           ServerName server = regionAssignments.get(hri);
191           if (server != null) {
192             List<HRegionInfo> regionsOnServer = map.get(server);
193             if (regionsOnServer == null) {
194               regionsOnServer = new ArrayList<HRegionInfo>(1);
195               map.put(server, regionsOnServer);
196             }
197             regionsOnServer.add(hri);
198           }
199         }
200       }
201     }
202     return map;
203   }
204 
205   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
206     return regionAssignments.get(hri);
207   }
208 
209   /**
210    * Get regions in transition and their states
211    */
212   @SuppressWarnings("unchecked")
213   public synchronized Map<String, RegionState> getRegionsInTransition() {
214     return (Map<String, RegionState>)regionsInTransition.clone();
215   }
216 
217   /**
218    * @return True if specified region in transition.
219    */
220   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
221     return regionsInTransition.containsKey(hri.getEncodedName());
222   }
223 
224   /**
225    * @return True if specified region in transition.
226    */
227   public synchronized boolean isRegionInTransition(final String encodedName) {
228     return regionsInTransition.containsKey(encodedName);
229   }
230 
231   /**
232    * @return True if any region in transition.
233    */
234   public synchronized boolean isRegionsInTransition() {
235     return !regionsInTransition.isEmpty();
236   }
237 
238   /**
239    * @return True if hbase:meta table region is in transition.
240    */
241   public synchronized boolean isMetaRegionInTransition() {
242     for (RegionState state : regionsInTransition.values()) {
243       if (state.getRegion().isMetaRegion()) return true;
244     }
245     return false;
246   }
247 
248   /**
249    * @return True if specified region assigned, and not in transition.
250    */
251   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
252     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
253   }
254 
255   /**
256    * @return True if specified region offline/closed, but not in transition.
257    * If the region is not in the map, it is offline to us too.
258    */
259   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
260     return getRegionState(hri) == null || (!isRegionInTransition(hri)
261       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
262   }
263 
264   /**
265    * @return True if specified region is in one of the specified states.
266    */
267   public boolean isRegionInState(
268       final HRegionInfo hri, final State... states) {
269     return isRegionInState(hri.getEncodedName(), states);
270   }
271 
272   /**
273    * @return True if specified region is in one of the specified states.
274    */
275   public boolean isRegionInState(
276       final String encodedName, final State... states) {
277     RegionState regionState = getRegionState(encodedName);
278     return isOneOfStates(regionState, states);
279   }
280 
281   /**
282    * Wait for the state map to be updated by assignment manager.
283    */
284   public synchronized void waitForUpdate(
285       final long timeout) throws InterruptedException {
286     this.wait(timeout);
287   }
288 
289   /**
290    * Get region transition state
291    */
292   public RegionState getRegionTransitionState(final HRegionInfo hri) {
293     return getRegionTransitionState(hri.getEncodedName());
294   }
295 
296   /**
297    * Get region transition state
298    */
299   public synchronized RegionState
300       getRegionTransitionState(final String encodedName) {
301     return regionsInTransition.get(encodedName);
302   }
303 
304   /**
305    * Add a list of regions to RegionStates. If a region is split
306    * and offline, its state will be SPLIT. Otherwise, its state will
307    * be OFFLINE. Region already in RegionStates will be skipped.
308    */
309   public void createRegionStates(
310       final List<HRegionInfo> hris) {
311     for (HRegionInfo hri: hris) {
312       createRegionState(hri);
313     }
314   }
315 
316   /**
317    * Add a region to RegionStates. If the region is split
318    * and offline, its state will be SPLIT. Otherwise, its state will
319    * be OFFLINE. If it is already in RegionStates, this call has
320    * no effect, and the original state is returned.
321    */
322   public RegionState createRegionState(final HRegionInfo hri) {
323     return createRegionState(hri, null, null, null);
324   }
325 
326   /**
327    * Add a region to RegionStates with the specified state.
328    * If the region is already in RegionStates, this call has
329    * no effect, and the original state is returned.
330    *
331    * @param hri the region info to create a state for
332    * @param newState the state to the region in set to
333    * @param serverName the server the region is transitioning on
334    * @param lastHost the last server that hosts the region
335    * @return the current state
336    */
337   public synchronized RegionState createRegionState(final HRegionInfo hri,
338       State newState, ServerName serverName, ServerName lastHost) {
339     if (newState == null || (newState == State.OPEN && serverName == null)) {
340       newState =  State.OFFLINE;
341     }
342     if (hri.isOffline() && hri.isSplit()) {
343       newState = State.SPLIT;
344       serverName = null;
345     }
346     String encodedName = hri.getEncodedName();
347     RegionState regionState = regionStates.get(encodedName);
348     if (regionState != null) {
349       LOG.warn("Tried to create a state for a region already in RegionStates, "
350         + "used existing: " + regionState + ", ignored new: " + newState);
351     } else {
352       regionState = new RegionState(hri, newState, serverName);
353       putRegionState(regionState);
354       if (newState == State.OPEN) {
355         if (!serverName.equals(lastHost)) {
356           LOG.warn("Open region's last host " + lastHost
357             + " should be the same as the current one " + serverName
358             + ", ignored the last and used the current one");
359           lastHost = serverName;
360         }
361         lastAssignments.put(encodedName, lastHost);
362         regionAssignments.put(hri, lastHost);
363       } else if (!regionState.isUnassignable()) {
364         regionsInTransition.put(encodedName, regionState);
365       }
366       if (lastHost != null && newState != State.SPLIT) {
367         addToServerHoldings(lastHost, hri);
368         if (newState != State.OPEN) {
369           oldAssignments.put(encodedName, lastHost);
370         }
371       }
372     }
373     return regionState;
374   }
375 
376   private RegionState putRegionState(RegionState regionState) {
377     HRegionInfo hri = regionState.getRegion();
378     String encodedName = hri.getEncodedName();
379     TableName table = hri.getTable();
380     RegionState oldState = regionStates.put(encodedName, regionState);
381     Map<String, RegionState> map = regionStatesTableIndex.get(table);
382     if (map == null) {
383       map = new HashMap<String, RegionState>();
384       regionStatesTableIndex.put(table, map);
385     }
386     map.put(encodedName, regionState);
387     return oldState;
388   }
389 
390   /**
391    * Set the region state to CLOSED
392    */
393   public RegionState setRegionStateTOCLOSED(
394       final byte[] regionName,
395       final ServerName serverName) {
396     HRegionInfo regionInfo = getRegionInfo(regionName);
397     return setRegionStateTOCLOSED(regionInfo, serverName);
398   }
399 
400   /**
401    * Set the region state to CLOSED
402    */
403   public RegionState setRegionStateTOCLOSED(
404       final HRegionInfo regionInfo,
405       final ServerName serverName) {
406     ServerName sn = serverName;
407     if (sn == null) {
408       RegionState regionState = getRegionState(regionInfo.getEncodedName());
409       if (regionState != null) {
410         sn = regionState.getServerName();
411       }
412       // TODO: if sn is null, should we dig into
413       // lastAssignments.get(regionInfo.getEncodedName() to get the server name?
414       // For now, I just keep the same logic that works in the past
415     }
416     // We have to make sure that the last region server is set to be the same as the
417     // current RS.  If we don't do that, we could run into situation that both AM and SSH
418     // think other would do the assignment work; at the end, neither does the work and
419     // region remains RIT.
420     // See HBASE-13330 and HBASE-17023
421     setLastRegionServerOfRegion(sn, regionInfo.getEncodedName());
422     return updateRegionState(regionInfo, State.CLOSED, sn);
423   }
424 
425   /**
426    * Update a region state. It will be put in transition if not already there.
427    */
428   public RegionState updateRegionState(
429       final HRegionInfo hri, final State state) {
430     RegionState regionState = getRegionState(hri.getEncodedName());
431     return updateRegionState(hri, state,
432       regionState == null ? null : regionState.getServerName());
433   }
434 
435   /**
436    * Update a region state. It will be put in transition if not already there.
437    *
438    * If we can't find the region info based on the region name in
439    * the transition, log a warning and return null.
440    */
441   public RegionState updateRegionState(
442       final RegionTransition transition, final State state) {
443     byte [] regionName = transition.getRegionName();
444     HRegionInfo regionInfo = getRegionInfo(regionName);
445     if (regionInfo == null) {
446       String prettyRegionName = HRegionInfo.prettyPrint(
447         HRegionInfo.encodeRegionName(regionName));
448       LOG.warn("Failed to find region " + prettyRegionName
449         + " in updating its state to " + state
450         + " based on region transition " + transition);
451       return null;
452     }
453     return updateRegionState(regionInfo, state,
454       transition.getServerName());
455   }
456 
457   /**
458    * Transition a region state to OPEN from OPENING/PENDING_OPEN
459    */
460   public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
461       final RegionTransition transition, final RegionState fromState, final ServerName sn) {
462     if(fromState.isPendingOpenOrOpeningOnServer(sn)){
463       return updateRegionState(transition, State.OPEN);
464     }
465     return null;
466   }
467 
468   /**
469    * Update a region state. It will be put in transition if not already there.
470    */
471   public RegionState updateRegionState(
472       final HRegionInfo hri, final State state, final ServerName serverName) {
473     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
474   }
475 
476   public void regionOnline(
477       final HRegionInfo hri, final ServerName serverName) {
478     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
479   }
480 
481   /**
482    * A region is online, won't be in transition any more.
483    * We can't confirm it is really online on specified region server
484    * because it hasn't been put in region server's online region list yet.
485    */
486   public void regionOnline(final HRegionInfo hri,
487       final ServerName serverName, long openSeqNum) {
488     String encodedName = hri.getEncodedName();
489     if (!serverManager.isServerOnline(serverName)) {
490       // This is possible if the region server dies before master gets a
491       // chance to handle ZK event in time. At this time, if the dead server
492       // is already processed by SSH, we should ignore this event.
493       // If not processed yet, ignore and let SSH deal with it.
494       LOG.warn("Ignored, " + encodedName
495         + " was opened on a dead server: " + serverName);
496       return;
497     }
498     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
499 
500     synchronized (this) {
501       regionsInTransition.remove(encodedName);
502       ServerName oldServerName = regionAssignments.put(hri, serverName);
503       if (!serverName.equals(oldServerName)) {
504         if (LOG.isDebugEnabled()) {
505           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName + " " + hri);
506         } else {
507           LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
508         }
509         addToServerHoldings(serverName, hri);
510         addToReplicaMapping(hri);
511         if (oldServerName == null) {
512           oldServerName = oldAssignments.remove(encodedName);
513         }
514         if (oldServerName != null
515             && !oldServerName.equals(serverName)
516             && serverHoldings.containsKey(oldServerName)) {
517           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
518           removeFromServerHoldings(oldServerName, hri);
519         }
520       }
521     }
522   }
523 
524   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
525     Set<HRegionInfo> regions = serverHoldings.get(serverName);
526     if (regions == null) {
527       regions = new HashSet<HRegionInfo>();
528       serverHoldings.put(serverName, regions);
529     }
530     regions.add(hri);
531   }
532 
533   private void addToReplicaMapping(HRegionInfo hri) {
534     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
535     Set<HRegionInfo> replicas =
536         defaultReplicaToOtherReplicas.get(defaultReplica);
537     if (replicas == null) {
538       replicas = new HashSet<HRegionInfo>();
539       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
540     }
541     replicas.add(hri);
542   }
543 
544   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
545     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
546     oldRegions.remove(hri);
547     if (oldRegions.isEmpty()) {
548       serverHoldings.remove(serverName);
549     }
550   }
551 
552   private void removeFromReplicaMapping(HRegionInfo hri) {
553     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
554     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
555     if (replicas != null) {
556       replicas.remove(hri);
557       if (replicas.isEmpty()) {
558         defaultReplicaToOtherReplicas.remove(defaultReplica);
559       }
560     }
561   }
562 
563   /**
564    * A dead server's wals have been split so that all the regions
565    * used to be open on it can be safely assigned now. Mark them assignable.
566    */
567   public synchronized void logSplit(final ServerName serverName) {
568     for (Iterator<Map.Entry<String, ServerName>> it
569         = lastAssignments.entrySet().iterator(); it.hasNext();) {
570       Map.Entry<String, ServerName> e = it.next();
571       if (e.getValue().equals(serverName)) {
572         it.remove();
573       }
574     }
575     long now = System.currentTimeMillis();
576     if (LOG.isDebugEnabled()) {
577       LOG.debug("Adding to processed servers " + serverName);
578     }
579     processedServers.put(serverName, Long.valueOf(now));
580     Configuration conf = server.getConfiguration();
581     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
582     // Doesn't have to be very accurate about the clean up time
583     if (now > lastProcessedServerCleanTime + obsoleteTime) {
584       lastProcessedServerCleanTime = now;
585       long cutoff = now - obsoleteTime;
586       for (Iterator<Map.Entry<ServerName, Long>> it
587           = processedServers.entrySet().iterator(); it.hasNext();) {
588         Map.Entry<ServerName, Long> e = it.next();
589         if (e.getValue().longValue() < cutoff) {
590           if (LOG.isDebugEnabled()) {
591             LOG.debug("Removed from processed servers " + e.getKey());
592           }
593           it.remove();
594         }
595       }
596     }
597   }
598 
599   /**
600    * Log split is done for a given region, so it is assignable now.
601    */
602   public void logSplit(final HRegionInfo region) {
603     clearLastAssignment(region);
604   }
605 
606   public synchronized void clearLastAssignment(final HRegionInfo region) {
607     lastAssignments.remove(region.getEncodedName());
608   }
609 
610   /**
611    * A region is offline, won't be in transition any more.
612    */
613   public void regionOffline(final HRegionInfo hri) {
614     regionOffline(hri, null);
615   }
616 
617   /**
618    * A region is offline, won't be in transition any more. Its state
619    * should be the specified expected state, which can only be
620    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
621    */
622   public void regionOffline(
623       final HRegionInfo hri, final State expectedState) {
624     Preconditions.checkArgument(expectedState == null
625       || RegionState.isUnassignable(expectedState),
626         "Offlined region should not be " + expectedState);
627     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
628       // Remove it from all region maps
629       deleteRegion(hri);
630       return;
631     }
632     State newState =
633       expectedState == null ? State.OFFLINE : expectedState;
634     updateRegionState(hri, newState);
635     String encodedName = hri.getEncodedName();
636     synchronized (this) {
637       regionsInTransition.remove(encodedName);
638       ServerName oldServerName = regionAssignments.remove(hri);
639       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
640         if (newState == State.MERGED || newState == State.SPLIT
641             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
642               ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
643           // Offline the region only if it's merged/split, or the table is disabled/disabling.
644           // Otherwise, offline it from this server only when it is online on a different server.
645           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
646           removeFromServerHoldings(oldServerName, hri);
647           removeFromReplicaMapping(hri);
648         } else {
649           // Need to remember it so that we can offline it from this
650           // server when it is online on a different server.
651           oldAssignments.put(encodedName, oldServerName);
652         }
653       }
654     }
655   }
656 
657   /**
658    * A server is offline, all regions on it are dead.
659    */
660   public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
661     // Offline all regions on this server not already in transition.
662     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
663     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
664     // Offline regions outside the loop and synchronized block to avoid
665     // ConcurrentModificationException and deadlock in case of meta anassigned,
666     // but RegionState a blocked.
667     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
668     synchronized (this) {
669       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
670       if (assignedRegions == null) {
671         assignedRegions = new HashSet<HRegionInfo>();
672       }
673 
674       for (HRegionInfo region : assignedRegions) {
675         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
676         if (isRegionOnline(region)) {
677           regionsToOffline.add(region);
678         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
679           LOG.debug("Offline splitting/merging region " + getRegionState(region));
680           try {
681             // Delete the ZNode if exists
682             ZKAssign.deleteNodeFailSilent(watcher, region);
683             regionsToOffline.add(region);
684           } catch (KeeperException ke) {
685             server.abort("Unexpected ZK exception deleting node " + region, ke);
686           }
687         }
688       }
689 
690       for (RegionState state : regionsInTransition.values()) {
691         HRegionInfo hri = state.getRegion();
692         if (assignedRegions.contains(hri)) {
693           // Region is open on this region server, but in transition.
694           // This region must be moving away from this server, or splitting/merging.
695           // SSH will handle it, either skip assigning, or re-assign.
696           LOG.info("Transitioning " + state + " will be handled by SSH for " + sn);
697         } else if (sn.equals(state.getServerName())) {
698           // Region is in transition on this region server, and this
699           // region is not open on this server. So the region must be
700           // moving to this server from another one (i.e. opening or
701           // pending open on this server, was open on another one.
702           // Offline state is also kind of pending open if the region is in
703           // transition. The region could be in failed_close state too if we have
704           // tried several times to open it while this region server is not reachable)
705           if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
706             LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
707             rits.add(hri);
708           } else if(state.isSplittingNew() || state.isMergingNew()) {
709             regionsToCleanIfNoMetaEntry.add(state.getRegion());
710           } else {
711             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
712           }
713         }
714       }
715       this.notifyAll();
716     }
717 
718     for (HRegionInfo hri : regionsToOffline) {
719       regionOffline(hri);
720     }
721 
722     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
723     return rits;
724   }
725 
726   /**
727    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
728    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
729    */
730   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
731     if (hris.isEmpty()) return;
732     for (HRegionInfo hri: hris) {
733       try {
734         // This is RPC to meta table. It is done while we have a synchronize on
735         // regionstates. No progress will be made if meta is not available at this time.
736         // This is a cleanup task. Not critical.
737         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
738             null) {
739           regionOffline(hri);
740           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
741         }
742       } catch (IOException e) {
743         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
744       }
745     }
746   }
747 
748   /**
749    * Gets the online regions of the specified table.
750    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
751    * Only returns <em>online</em> regions.  If a region on this table has been
752    * closed during a disable, etc., it will be included in the returned list.
753    * So, the returned list may not necessarily be ALL regions in this table, its
754    * all the ONLINE regions in the table.
755    * @param tableName
756    * @return Online regions from <code>tableName</code>
757    */
758   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
759     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
760     // boundary needs to have table's name but regionID 0 so that it is sorted
761     // before all table's regions.
762     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
763     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
764       if(!hri.getTable().equals(tableName)) break;
765       tableRegions.add(hri);
766     }
767     return tableRegions;
768   }
769 
770   /**
771    * Gets current state of all regions of the table.
772    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
773    * Method guaranteed to return keys for all states
774    * in {@link org.apache.hadoop.hbase.master.RegionState.State}
775    *
776    * @param tableName
777    * @return Online regions from <code>tableName</code>
778    */
779   public synchronized Map<RegionState.State, List<HRegionInfo>>
780   getRegionByStateOfTable(TableName tableName) {
781     Map<RegionState.State, List<HRegionInfo>> tableRegions =
782         new HashMap<State, List<HRegionInfo>>();
783     for (State state : State.values()) {
784       tableRegions.put(state, new ArrayList<HRegionInfo>());
785     }
786     Map<String, RegionState> indexMap = regionStatesTableIndex.get(tableName);
787     if (indexMap == null)
788       return tableRegions;
789     for (RegionState regionState : indexMap.values()) {
790       tableRegions.get(regionState.getState()).add(regionState.getRegion());
791     }
792     return tableRegions;
793   }
794 
795   /**
796    * Wait on region to clear regions-in-transition.
797    * <p>
798    * If the region isn't in transition, returns immediately.  Otherwise, method
799    * blocks until the region is out of transition.
800    */
801   public synchronized void waitOnRegionToClearRegionsInTransition(
802       final HRegionInfo hri) throws InterruptedException {
803     if (!isRegionInTransition(hri)) return;
804 
805     while(!server.isStopped() && isRegionInTransition(hri)) {
806       RegionState rs = getRegionState(hri);
807       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
808       waitForUpdate(100);
809     }
810 
811     if (server.isStopped()) {
812       LOG.info("Giving up wait on region in " +
813         "transition because stoppable.isStopped is set");
814     }
815   }
816 
817   /**
818    * A table is deleted. Remove its regions from all internal maps.
819    * We loop through all regions assuming we don't delete tables too much.
820    */
821   public void tableDeleted(final TableName tableName) {
822     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
823     synchronized (this) {
824       for (RegionState state: regionStates.values()) {
825         HRegionInfo region = state.getRegion();
826         if (region.getTable().equals(tableName)) {
827           regionsToDelete.add(region);
828         }
829       }
830     }
831     for (HRegionInfo region: regionsToDelete) {
832       deleteRegion(region);
833     }
834   }
835 
836   /**
837    * Get a copy of all regions assigned to a server
838    */
839   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
840     Set<HRegionInfo> regions = serverHoldings.get(serverName);
841     if (regions == null) return null;
842     return new HashSet<HRegionInfo>(regions);
843   }
844 
845   /**
846    * Remove a region from all state maps.
847    */
848   @VisibleForTesting
849   public synchronized void deleteRegion(final HRegionInfo hri) {
850     String encodedName = hri.getEncodedName();
851     regionsInTransition.remove(encodedName);
852     regionStates.remove(encodedName);
853     TableName table = hri.getTable();
854     Map<String, RegionState> indexMap = regionStatesTableIndex.get(table);
855     indexMap.remove(encodedName);
856     if (indexMap.size() == 0)
857       regionStatesTableIndex.remove(table);
858     lastAssignments.remove(encodedName);
859     ServerName sn = regionAssignments.remove(hri);
860     if (sn != null) {
861       Set<HRegionInfo> regions = serverHoldings.get(sn);
862       regions.remove(hri);
863     }
864   }
865 
866   /**
867    * Checking if a region was assigned to a server which is not online now.
868    * If so, we should hold re-assign this region till SSH has split its wals.
869    * Once logs are split, the last assignment of this region will be reset,
870    * which means a null last assignment server is ok for re-assigning.
871    *
872    * A region server could be dead but we don't know it yet. We may
873    * think it's online falsely. Therefore if a server is online, we still
874    * need to confirm it reachable and having the expected start code.
875    */
876   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
877     ServerName server = lastAssignments.get(encodedName);
878     return isServerDeadAndNotProcessed(server);
879   }
880 
881   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
882     if (server == null) return false;
883     if (serverManager.isServerOnline(server)) {
884       String hostAndPort = server.getHostAndPort();
885       long startCode = server.getStartcode();
886       Long deadCode = deadServers.get(hostAndPort);
887       if (deadCode == null || startCode > deadCode.longValue()) {
888         if (serverManager.isServerReachable(server)) {
889           return false;
890         }
891         // The size of deadServers won't grow unbounded.
892         deadServers.put(hostAndPort, Long.valueOf(startCode));
893       }
894       // Watch out! If the server is not dead, the region could
895       // remain unassigned. That's why ServerManager#isServerReachable
896       // should use some retry.
897       //
898       // We cache this info since it is very unlikely for that
899       // instance to come back up later on. We don't want to expire
900       // the server since we prefer to let it die naturally.
901       LOG.warn("Couldn't reach online server " + server);
902     }
903     // Now, we know it's dead. Check if it's processed
904     return !processedServers.containsKey(server);
905   }
906 
907  /**
908    * Get the last region server a region was on for purpose of re-assignment,
909    * i.e. should the re-assignment be held back till log split is done?
910    */
911   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
912     return lastAssignments.get(encodedName);
913   }
914 
915   synchronized void setLastRegionServerOfRegions(
916       final ServerName serverName, final List<HRegionInfo> regionInfos) {
917     for (HRegionInfo hri: regionInfos) {
918       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
919     }
920   }
921 
922   synchronized void setLastRegionServerOfRegion(
923       final ServerName serverName, final String encodedName) {
924     lastAssignments.put(encodedName, serverName);
925   }
926 
927   void splitRegion(HRegionInfo p,
928       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
929 
930     regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p));
931     synchronized (this) {
932       // After PONR, split is considered to be done.
933       // Update server holdings to be aligned with the meta.
934       Set<HRegionInfo> regions = serverHoldings.get(sn);
935       if (regions == null) {
936         throw new IllegalStateException(sn + " should host some regions");
937       }
938       regions.remove(p);
939       regions.add(a);
940       regions.add(b);
941     }
942   }
943 
944   void mergeRegions(HRegionInfo p,
945       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
946     regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a));
947     synchronized (this) {
948       // After PONR, merge is considered to be done.
949       // Update server holdings to be aligned with the meta.
950       Set<HRegionInfo> regions = serverHoldings.get(sn);
951       if (regions == null) {
952         throw new IllegalStateException(sn + " should host some regions");
953       }
954       regions.remove(a);
955       regions.remove(b);
956       regions.add(p);
957     }
958   }
959 
960   private int getRegionReplication(HRegionInfo r) throws IOException {
961     if (tableStateManager != null) {
962       HTableDescriptor htd = ((MasterServices)server).getTableDescriptors().get(r.getTable());
963       if (htd != null) {
964         return htd.getRegionReplication();
965       }
966     }
967     return 1;
968   }
969 
970   /**
971    * At cluster clean re/start, mark all user regions closed except those of tables
972    * that are excluded, such as disabled/disabling/enabling tables. All user regions
973    * and their previous locations are returned.
974    */
975   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
976     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
977     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
978     for(RegionState state: regionStates.values()) {
979       HRegionInfo hri = state.getRegion();
980       if (state.isSplit() || hri.isSplit()) {
981         continue;
982       }
983       TableName tableName = hri.getTable();
984       if (!TableName.META_TABLE_NAME.equals(tableName)
985           && (noExcludeTables || !excludedTables.contains(tableName))) {
986         toBeClosed.add(hri);
987       }
988     }
989     Map<HRegionInfo, ServerName> allUserRegions =
990       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
991     for (HRegionInfo hri: toBeClosed) {
992       RegionState regionState = updateRegionState(hri, State.CLOSED);
993       allUserRegions.put(hri, regionState.getServerName());
994     }
995     return allUserRegions;
996   }
997 
998   /**
999    * Compute the average load across all region servers.
1000    * Currently, this uses a very naive computation - just uses the number of
1001    * regions being served, ignoring stats about number of requests.
1002    * @return the average load
1003    */
1004   protected synchronized double getAverageLoad() {
1005     int numServers = 0, totalLoad = 0;
1006     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1007       Set<HRegionInfo> regions = e.getValue();
1008       ServerName serverName = e.getKey();
1009       int regionCount = regions.size();
1010       if (serverManager.isServerOnline(serverName)) {
1011         totalLoad += regionCount;
1012         numServers++;
1013       }
1014     }
1015     if (numServers > 1) {
1016       // The master region server holds only a couple regions.
1017       // Don't consider this server in calculating the average load
1018       // if there are other region servers to avoid possible confusion.
1019       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
1020       if (hris != null) {
1021         totalLoad -= hris.size();
1022         numServers--;
1023       }
1024     }
1025     return numServers == 0 ? 0.0 :
1026       (double)totalLoad / (double)numServers;
1027   }
1028 
1029   /**
1030    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
1031    * Can't let out original since it can change and at least the load balancer
1032    * wants to iterate this exported list.  We need to synchronize on regions
1033    * since all access to this.servers is under a lock on this.regions.
1034    *
1035    * @return A clone of current assignments by table.
1036    */
1037   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
1038       getAssignmentsByTable() {
1039     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
1040       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
1041     synchronized (this) {
1042       if (!server.getConfiguration().getBoolean(
1043             HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, false)) {
1044         Map<ServerName, List<HRegionInfo>> svrToRegions =
1045           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1046         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1047           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1048         }
1049         result.put(TableName.valueOf(HConstants.ENSEMBLE_TABLE_NAME), svrToRegions);
1050       } else {
1051         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1052           for (HRegionInfo hri: e.getValue()) {
1053             if (hri.isMetaRegion()) continue;
1054             TableName tablename = hri.getTable();
1055             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
1056             if (svrToRegions == null) {
1057               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1058               result.put(tablename, svrToRegions);
1059             }
1060             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
1061             if (regions == null) {
1062               regions = new ArrayList<HRegionInfo>();
1063               svrToRegions.put(e.getKey(), regions);
1064             }
1065             regions.add(hri);
1066           }
1067         }
1068       }
1069     }
1070 
1071     Map<ServerName, ServerLoad>
1072       onlineSvrs = serverManager.getOnlineServers();
1073     // Take care of servers w/o assignments, and remove servers in draining mode
1074     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
1075     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
1076       for (ServerName svr: onlineSvrs.keySet()) {
1077         if (!map.containsKey(svr)) {
1078           map.put(svr, new ArrayList<HRegionInfo>());
1079         }
1080       }
1081       map.keySet().removeAll(drainingServers);
1082     }
1083     return result;
1084   }
1085 
1086   protected RegionState getRegionState(final HRegionInfo hri) {
1087     return getRegionState(hri.getEncodedName());
1088   }
1089 
1090   /**
1091    * Returns a clone of region assignments per server
1092    * @return a Map of ServerName to a List of HRegionInfo's
1093    */
1094   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
1095     Map<ServerName, List<HRegionInfo>> regionsByServer =
1096         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
1097     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
1098       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
1099     }
1100     return regionsByServer;
1101   }
1102 
1103   protected synchronized RegionState getRegionState(final String encodedName) {
1104     return regionStates.get(encodedName);
1105   }
1106 
1107   /**
1108    * Get the HRegionInfo from cache, if not there, from the hbase:meta table
1109    * @param  regionName
1110    * @return HRegionInfo for the region
1111    */
1112   @SuppressWarnings("deprecation")
1113   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1114     String encodedName = HRegionInfo.encodeRegionName(regionName);
1115     RegionState regionState = getRegionState(encodedName);
1116     if (regionState != null) {
1117       return regionState.getRegion();
1118     }
1119 
1120     try {
1121       Pair<HRegionInfo, ServerName> p =
1122         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1123       HRegionInfo hri = p == null ? null : p.getFirst();
1124       if (hri != null) {
1125         createRegionState(hri);
1126       }
1127       return hri;
1128     } catch (IOException e) {
1129       server.abort("Aborting because error occoured while reading "
1130         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1131       return null;
1132     }
1133   }
1134 
1135   static boolean isOneOfStates(RegionState regionState, State... states) {
1136     State s = regionState != null ? regionState.getState() : null;
1137     for (State state: states) {
1138       if (s == state) return true;
1139     }
1140     return false;
1141   }
1142 
1143   /**
1144    * Update a region state. It will be put in transition if not already there.
1145    */
1146   private RegionState updateRegionState(final HRegionInfo hri,
1147       final State state, final ServerName serverName, long openSeqNum) {
1148     if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
1149       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1150         + " on " + serverName + ", set to " + state);
1151     }
1152 
1153     String encodedName = hri.getEncodedName();
1154     RegionState regionState = new RegionState(
1155       hri, state, System.currentTimeMillis(), serverName);
1156     RegionState oldState = getRegionState(encodedName);
1157     if (!regionState.equals(oldState)) {
1158       LOG.info("Transition " + oldState + " to " + regionState);
1159       // Persist region state before updating in-memory info, if needed
1160       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1161     }
1162 
1163     synchronized (this) {
1164       regionsInTransition.put(encodedName, regionState);
1165       putRegionState(regionState);
1166 
1167       // For these states, region should be properly closed.
1168       // There should be no log splitting issue.
1169       if ((state == State.CLOSED || state == State.MERGED
1170           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1171         ServerName last = lastAssignments.get(encodedName);
1172         if (last.equals(serverName)) {
1173           lastAssignments.remove(encodedName);
1174         } else {
1175           LOG.warn(encodedName + " moved to " + state + " on "
1176             + serverName + ", expected " + last);
1177         }
1178       }
1179 
1180       // Once a region is opened, record its last assignment right away.
1181       if (serverName != null && state == State.OPEN) {
1182         ServerName last = lastAssignments.get(encodedName);
1183         if (!serverName.equals(last)) {
1184           lastAssignments.put(encodedName, serverName);
1185           if (last != null && isServerDeadAndNotProcessed(last)) {
1186             LOG.warn(encodedName + " moved to " + serverName
1187               + ", while it's previous host " + last
1188               + " is dead but not processed yet");
1189           }
1190         }
1191       }
1192 
1193       // notify the change
1194       this.notifyAll();
1195     }
1196     return regionState;
1197   }
1198 }