1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.hadoop.hbase.chaos.actions;
20
21 import java.io.IOException;
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.LinkedList;
25 import java.util.List;
26
27 import org.apache.commons.lang.math.RandomUtils;
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30 import org.apache.hadoop.hbase.ClusterStatus;
31 import org.apache.hadoop.hbase.HBaseCluster;
32 import org.apache.hadoop.hbase.HRegionInfo;
33 import org.apache.hadoop.hbase.IntegrationTestingUtility;
34 import org.apache.hadoop.hbase.ServerLoad;
35 import org.apache.hadoop.hbase.ServerName;
36 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
37 import org.apache.hadoop.hbase.client.Admin;
38 import org.apache.hadoop.hbase.client.HBaseAdmin;
39 import org.apache.hadoop.hbase.util.Bytes;
40
41
42
43
44 public class Action {
45
46 public static final String KILL_MASTER_TIMEOUT_KEY =
47 "hbase.chaosmonkey.action.killmastertimeout";
48 public static final String START_MASTER_TIMEOUT_KEY =
49 "hbase.chaosmonkey.action.startmastertimeout";
50 public static final String KILL_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.killrstimeout";
51 public static final String START_RS_TIMEOUT_KEY = "hbase.chaosmonkey.action.startrstimeout";
52
53 protected static Log LOG = LogFactory.getLog(Action.class);
54
55 protected static final long KILL_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
56 protected static final long START_MASTER_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
57 protected static final long KILL_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
58 protected static final long START_RS_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
59
60 protected ActionContext context;
61 protected HBaseCluster cluster;
62 protected ClusterStatus initialStatus;
63 protected ServerName[] initialServers;
64
65 protected long killMasterTimeout;
66 protected long startMasterTimeout;
67 protected long killRsTimeout;
68 protected long startRsTimeout;
69
70 public void init(ActionContext context) throws IOException {
71 this.context = context;
72 cluster = context.getHBaseCluster();
73 initialStatus = cluster.getInitialClusterStatus();
74 Collection<ServerName> regionServers = initialStatus.getServers();
75 initialServers = regionServers.toArray(new ServerName[regionServers.size()]);
76
77 killMasterTimeout = cluster.getConf().getLong(KILL_MASTER_TIMEOUT_KEY,
78 KILL_MASTER_TIMEOUT_DEFAULT);
79 startMasterTimeout = cluster.getConf().getLong(START_MASTER_TIMEOUT_KEY,
80 START_MASTER_TIMEOUT_DEFAULT);
81 killRsTimeout = cluster.getConf().getLong(KILL_RS_TIMEOUT_KEY, KILL_RS_TIMEOUT_DEFAULT);
82 startRsTimeout = cluster.getConf().getLong(START_RS_TIMEOUT_KEY, START_RS_TIMEOUT_DEFAULT);
83 }
84
85 public void perform() throws Exception { }
86
87
88 protected ServerName[] getCurrentServers() throws IOException {
89 ClusterStatus clusterStatus = cluster.getClusterStatus();
90 Collection<ServerName> regionServers = clusterStatus.getServers();
91 int count = regionServers == null ? 0 : regionServers.size();
92 if (count <= 0) {
93 return new ServerName [] {};
94 }
95 ServerName master = clusterStatus.getMaster();
96 if (master == null || !regionServers.contains(master)) {
97 return regionServers.toArray(new ServerName[count]);
98 }
99 if (count == 1) {
100 return new ServerName [] {};
101 }
102 ArrayList<ServerName> tmp = new ArrayList<ServerName>(count);
103 tmp.addAll(regionServers);
104 tmp.remove(master);
105 return tmp.toArray(new ServerName[count-1]);
106 }
107
108 protected void killMaster(ServerName server) throws IOException {
109 LOG.info("Killing master:" + server);
110 cluster.killMaster(server);
111 cluster.waitForMasterToStop(server, killMasterTimeout);
112 LOG.info("Killed master server:" + server);
113 }
114
115 protected void startMaster(ServerName server) throws IOException {
116 LOG.info("Starting master:" + server.getHostname());
117 cluster.startMaster(server.getHostname(), server.getPort());
118 cluster.waitForActiveAndReadyMaster(startMasterTimeout);
119 LOG.info("Started master: " + server);
120 }
121
122 protected void killRs(ServerName server) throws IOException {
123 LOG.info("Killing region server:" + server);
124 cluster.killRegionServer(server);
125 cluster.waitForRegionServerToStop(server, killRsTimeout);
126 LOG.info("Killed region server:" + server + ". Reported num of rs:"
127 + cluster.getClusterStatus().getServersSize());
128 }
129
130 protected void startRs(ServerName server) throws IOException {
131 LOG.info("Starting region server:" + server.getHostname());
132 cluster.startRegionServer(server.getHostname(), server.getPort());
133 cluster.waitForRegionServerToStart(server.getHostname(), server.getPort(), startRsTimeout);
134 LOG.info("Started region server:" + server + ". Reported num of rs:"
135 + cluster.getClusterStatus().getServersSize());
136 }
137
138 protected void unbalanceRegions(ClusterStatus clusterStatus,
139 List<ServerName> fromServers, List<ServerName> toServers,
140 double fractionOfRegions) throws Exception {
141 List<byte[]> victimRegions = new LinkedList<byte[]>();
142 for (ServerName server : fromServers) {
143 ServerLoad serverLoad = clusterStatus.getLoad(server);
144
145 List<byte[]> regions = new LinkedList<byte[]>(serverLoad.getRegionsLoad().keySet());
146 int victimRegionCount = (int)Math.ceil(fractionOfRegions * regions.size());
147 LOG.debug("Removing " + victimRegionCount + " regions from " + server.getServerName());
148 for (int i = 0; i < victimRegionCount; ++i) {
149 int victimIx = RandomUtils.nextInt(regions.size());
150 String regionId = HRegionInfo.encodeRegionName(regions.remove(victimIx));
151 victimRegions.add(Bytes.toBytes(regionId));
152 }
153 }
154
155 LOG.info("Moving " + victimRegions.size() + " regions from " + fromServers.size()
156 + " servers to " + toServers.size() + " different servers");
157 Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
158 for (byte[] victimRegion : victimRegions) {
159 int targetIx = RandomUtils.nextInt(toServers.size());
160 admin.move(victimRegion, Bytes.toBytes(toServers.get(targetIx).getServerName()));
161 }
162 }
163
164 protected void forceBalancer() throws Exception {
165 Admin admin = this.context.getHBaseIntegrationTestingUtility().getHBaseAdmin();
166 boolean result = false;
167 try {
168 result = admin.balancer();
169 } catch (Exception e) {
170 LOG.warn("Got exception while doing balance ", e);
171 }
172 if (!result) {
173 LOG.error("Balancer didn't succeed");
174 }
175 }
176
177
178
179
180 public static class ActionContext {
181 private IntegrationTestingUtility util;
182
183 public ActionContext(IntegrationTestingUtility util) {
184 this.util = util;
185 }
186
187 public IntegrationTestingUtility getHBaseIntegrationTestingUtility() {
188 return util;
189 }
190
191 public HBaseCluster getHBaseCluster() {
192 return util.getHBaseClusterInterface();
193 }
194 }
195 }