View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  
19  package org.apache.hadoop.hbase.chaos.actions;
20  
21  import java.io.IOException;
22  import java.util.ArrayList;
23  import java.util.Collection;
24  import java.util.LinkedList;
25  import java.util.List;
26  
27  import org.apache.commons.lang.math.RandomUtils;
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  import org.apache.hadoop.hbase.ClusterStatus;
31  import org.apache.hadoop.hbase.HBaseCluster;
32  import org.apache.hadoop.hbase.HRegionInfo;
33  import org.apache.hadoop.hbase.IntegrationTestingUtility;
34  import org.apache.hadoop.hbase.ServerLoad;
35  import org.apache.hadoop.hbase.ServerName;
36  import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
37  import org.apache.hadoop.hbase.client.Admin;
38  import org.apache.hadoop.hbase.client.HBaseAdmin;
39  import org.apache.hadoop.hbase.util.Bytes;
40  
41  /**
42   * A (possibly mischievous) action that the ChaosMonkey can perform.
43   */
44  public class Action {
45  
46    public static final String KILL_MASTER_TIMEOUT_KEY =
47        "hbase.chaosmonkey.action.killmastertimeout";
48    public static final String START_MASTER_TIMEOUT_KEY =
49        "hbase.chaosmonkey.action.startmastertimeout";
50    public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
51    public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
52  
53    protected static Log LOG = LogFactory.getLog(Action.class);
54  
55    protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
56    protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
57    protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
58    protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
59  
60    protected ActionContext context;
61    protected HBaseCluster cluster;
62    protected ClusterStatus initialStatus;
63    protected ServerName[] initialServers;
64  
65    protected long killMasterTimeout;
66    protected long startMasterTimeout;
67    protected long killRsTimeout;
68    protected long startRsTimeout;
69  
70    public void init(ActionContext context) throws IOException {
71      this.context = context;
72      cluster = context.getHBaseCluster();
73      initialStatus = cluster.getInitialClusterStatus();
74      Collection<ServerName> regionServers = initialStatus.getServers();
75      initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
76  
77      killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
78          KILL_MASTER_TIMEOUT_DEFAULT);
79      startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
80          START_MASTER_TIMEOUT_DEFAULT);
81      killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
82      startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
83    }
84  
85    public void perform() throws Exception { }
86  
87    /** Returns current region servers - active master */
88    protected ServerName[] getCurrentServers() throws IOException {
89      ClusterStatus clusterStatus = cluster.getClusterStatus();
90      Collection<ServerName> regionServers = clusterStatus.getServers();
91      int count = regionServers == null ? 0 : regionServers.size();
92      if (count <= 0) {
93        return new ServerName [] {};
94      }
95      ServerName master = clusterStatus.getMaster();
96      if (master == null || !regionServers.contains(master)) {
97        return regionServers.toArray(new ServerName[count]);
98      }
99      if (count == 1) {
100       return new ServerName [] {};
101     }
102     ArrayList<ServerName> tmp = new ArrayList<ServerName>(count);
103     tmp.addAll(regionServers);
104     tmp.remove(master);
105     return tmp.toArray(new ServerName[count-1]);
106   }
107 
108   protected void killMaster(ServerName server) throws IOException {
109     LOG.info("Killing master:" + server);
110     cluster.killMaster(server);
111     cluster.waitForMasterToStop(server, killMasterTimeout);
112     LOG.info("Killed master server:" + server);
113   }
114 
115   protected void startMaster(ServerName server) throws IOException {
116     LOG.info("Starting master:" + server.getHostname());
117     cluster.startMaster(server.getHostname(), server.getPort());
118     cluster.waitForActiveAndReadyMaster(startMasterTimeout);
119     LOG.info("Started master: " + server);
120   }
121 
122   protected void killRs(ServerName server) throws IOException {
123     LOG.info("Killing region server:" + server);
124     cluster.killRegionServer(server);
125     cluster.waitForRegionServerToStop(server, killRsTimeout);
126     LOG.info("Killed region server:" + server + ". Reported num of rs:"
127         + cluster.getClusterStatus().getServersSize());
128   }
129 
130   protected void startRs(ServerName server) throws IOException {
131     LOG.info("Starting region server:" + server.getHostname());
132     cluster.startRegionServer(server.getHostname(), server.getPort());
133     cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
134     LOG.info("Started region server:" + server + ". Reported num of rs:"
135         + cluster.getClusterStatus().getServersSize());
136   }
137 
138   protected void unbalanceRegions(ClusterStatus clusterStatus,
139       List<ServerName> fromServers, List<ServerName> toServers,
140       double fractionOfRegions) throws Exception {
141     List<byte[]> victimRegions = new LinkedList<byte[]>();
142     for (ServerName server : fromServers) {
143       ServerLoad serverLoad = clusterStatus.getLoad(server);
144       // Ugh.
145       List<byte[]> regions = new LinkedList<byte[]>(serverLoad.getRegionsLoad().keySet());
146       int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
147       LOG.debug("Removing " + victimRegionCount + " regions from " + server.getServerName());
148       for (int i = 0; i < victimRegionCount; ++i) {
149         int victimIx = RandomUtils.nextInt(regions.size());
150         String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
151         victimRegions.add(Bytes.toBytes(regionId));
152       }
153     }
154 
155     LOG.info("Moving " + victimRegions.size() + " regions from " + fromServers.size()
156         + " servers to " + toServers.size() + " different servers");
157     Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
158     for (byte[] victimRegion : victimRegions) {
159       int targetIx = RandomUtils.nextInt(toServers.size());
160       admin.move(victimRegion, Bytes.toBytes(toServers.get(targetIx).getServerName()));
161     }
162   }
163 
164   protected void forceBalancer() throws Exception {
165     Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
166     boolean result = false;
167     try {
168       result = admin.balancer();
169     } catch (Exception e) {
170       LOG.warn("Got exception while doing balance ", e);
171     }
172     if (!result) {
173       LOG.error("Balancer didn't succeed");
174     }
175   }
176 
177   /**
178    * Context for Action's
179    */
180   public static class ActionContext {
181     private IntegrationTestingUtility util;
182 
183     public ActionContext(IntegrationTestingUtility util) {
184       this.util = util;
185     }
186 
187     public IntegrationTestingUtility getHBaseIntegrationTestingUtility() {
188       return util;
189     }
190 
191     public HBaseCluster getHBaseCluster() {
192       return util.getHBaseClusterInterface();
193     }
194   }
195 }