View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.util;
19  
20  import java.lang.management.GarbageCollectorMXBean;
21  import java.lang.management.ManagementFactory;
22  import java.util.List;
23  import java.util.Map;
24  import java.util.Set;
25  
26  import org.apache.commons.logging.Log;
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.hadoop.hbase.classification.InterfaceAudience;
29  import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
30  import org.apache.hadoop.conf.Configuration;
31  
32  import com.google.common.base.Joiner;
33  import com.google.common.base.Preconditions;
34  import com.google.common.base.Stopwatch;
35  import com.google.common.collect.Lists;
36  import com.google.common.collect.Maps;
37  import com.google.common.collect.Sets;
38  
39  /**
40   * Class which sets up a simple thread which runs in a loop sleeping
41   * for a short interval of time. If the sleep takes significantly longer
42   * than its target time, it implies that the JVM or host machine has
43   * paused processing, which may cause other problems. If such a pause is
44   * detected, the thread logs a message.
45   * The original JvmPauseMonitor is:
46   * ${hadoop-common-project}/hadoop-common/src/main/java/org/apache/hadoop/util/
47   * JvmPauseMonitor.java
48   * r1503806 | cmccabe | 2013-07-17 01:48:24 +0800 (Wed, 17 Jul 2013) | 1 line
49   * HADOOP-9618.  thread which detects GC pauses(Todd Lipcon)
50   */
51  @InterfaceAudience.Private
52  public class JvmPauseMonitor {
53    private static final Log LOG = LogFactory.getLog(JvmPauseMonitor.class);
54  
55    /** The target sleep time */
56    private static final long SLEEP_INTERVAL_MS = 500;
57    
58    /** log WARN if we detect a pause longer than this threshold */
59    private final long warnThresholdMs;
60    public static final String WARN_THRESHOLD_KEY =
61        "jvm.pause.warn-threshold.ms";
62    private static final long WARN_THRESHOLD_DEFAULT = 10000;
63    
64    /** log INFO if we detect a pause longer than this threshold */
65    private final long infoThresholdMs;
66    public static final String INFO_THRESHOLD_KEY =
67        "jvm.pause.info-threshold.ms";
68    private static final long INFO_THRESHOLD_DEFAULT = 1000;
69  
70    private Thread monitorThread;
71    private volatile boolean shouldRun = true;
72    private JvmPauseMonitorSource metricsSource;
73  
74    public JvmPauseMonitor(Configuration conf) {
75      this(conf, null);
76    }
77  
78    public JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) {
79      this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
80      this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
81      this.metricsSource = metricsSource;
82    }
83    
84    public void start() {
85      Preconditions.checkState(monitorThread == null, "Already started");
86      monitorThread = new Thread(new Monitor());
87      monitorThread.setDaemon(true);
88      monitorThread.setName("JvmPauseMonitor");
89      monitorThread.start();
90    }
91  
92    public void stop() {
93      shouldRun = false;
94      monitorThread.interrupt();
95      try {
96        monitorThread.join();
97      } catch (InterruptedException e) {
98        Thread.currentThread().interrupt();
99      }
100   }
101   
102   private String formatMessage(long extraSleepTime, List<String> gcDiffs) {
103     String ret = "Detected pause in JVM or host machine (eg GC): " + "pause of approximately "
104         + extraSleepTime + "ms\n";
105     if (gcDiffs.isEmpty()) {
106       ret += "No GCs detected";
107     } else {
108       ret += Joiner.on("\n").join(gcDiffs);
109     }
110     return ret;
111   }
112   
113   private Map<String, GcTimes> getGcTimes() {
114     Map<String, GcTimes> map = Maps.newHashMap();
115     List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
116     for (GarbageCollectorMXBean gcBean : gcBeans) {
117       map.put(gcBean.getName(), new GcTimes(gcBean));
118     }
119     return map;
120   }
121 
122   private static class GcTimes {
123     private GcTimes(GarbageCollectorMXBean gcBean) {
124       gcCount = gcBean.getCollectionCount();
125       gcTimeMillis = gcBean.getCollectionTime();
126     }
127 
128     private GcTimes(long count, long time) {
129       this.gcCount = count;
130       this.gcTimeMillis = time;
131     }
132 
133     private GcTimes subtract(GcTimes other) {
134       return new GcTimes(this.gcCount - other.gcCount, this.gcTimeMillis - other.gcTimeMillis);
135     }
136 
137     @Override
138     public String toString() {
139       return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
140     }
141 
142     private long gcCount;
143     private long gcTimeMillis;
144   }
145 
146   private class Monitor implements Runnable {
147     @Override
148     public void run() {
149       Stopwatch sw = new Stopwatch();
150       Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
151       while (shouldRun) {
152         sw.reset().start();
153         try {
154           Thread.sleep(SLEEP_INTERVAL_MS);
155         } catch (InterruptedException ie) {
156           return;
157         }
158 
159         long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS;
160         Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
161 
162         if (extraSleepTime > infoThresholdMs) {
163           Set<String> gcBeanNames = Sets.intersection(gcTimesAfterSleep.keySet(),
164             gcTimesBeforeSleep.keySet());
165           List<String> gcDiffs = Lists.newArrayList();
166           for (String name : gcBeanNames) {
167             GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
168             if (diff.gcCount != 0) {
169               gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
170             }
171           }
172 
173           updateMetrics(extraSleepTime, !gcDiffs.isEmpty());
174 
175           if (extraSleepTime > warnThresholdMs) {
176             LOG.warn(formatMessage(extraSleepTime, gcDiffs));
177           } else {
178             LOG.info(formatMessage(extraSleepTime, gcDiffs));
179           }
180         }
181         gcTimesBeforeSleep = gcTimesAfterSleep;
182       }
183     }
184   }
185 
186   public void updateMetrics(long sleepTime, boolean gcDetected) {
187     if (metricsSource != null) {
188       if (sleepTime > warnThresholdMs) {
189         metricsSource.incWarnThresholdExceeded(1);
190       } else {
191         metricsSource.incInfoThresholdExceeded(1);
192       }
193       if (gcDetected) {
194         metricsSource.updatePauseTimeWithGc(sleepTime);
195       } else {
196         metricsSource.updatePauseTimeWithoutGc(sleepTime);
197       }
198     }
199   }
200 
201   public JvmPauseMonitorSource getMetricsSource() {
202     return metricsSource;
203   }
204 
205   public void setMetricsSource(JvmPauseMonitorSource metricsSource) {
206     this.metricsSource = metricsSource;
207   }
208 
209   /**
210    * Simple 'main' to facilitate manual testing of the pause monitor.
211    * 
212    * This main function just leaks memory into a list. Running this class
213    * with a 1GB heap will very quickly go into "GC hell" and result in
214    * log messages about the GC pauses.
215    */
216   public static void main(String []args) throws Exception {
217     new JvmPauseMonitor(new Configuration()).start();
218     List<String> list = Lists.newArrayList();
219     int i = 0;
220     while (true) {
221       list.add(String.valueOf(i++));
222     }
223   }
224 }