1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InterruptedIOException;
25  import java.io.UnsupportedEncodingException;
26  import java.lang.reflect.Constructor;
27  import java.text.ParseException;
28  import java.util.AbstractList;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.Comparator;
34  import java.util.HashMap;
35  import java.util.HashSet;
36  import java.util.Iterator;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Map.Entry;
40  import java.util.NavigableMap;
41  import java.util.NavigableSet;
42  import java.util.RandomAccess;
43  import java.util.Set;
44  import java.util.TreeMap;
45  import java.util.concurrent.Callable;
46  import java.util.concurrent.CompletionService;
47  import java.util.concurrent.ConcurrentHashMap;
48  import java.util.concurrent.ConcurrentMap;
49  import java.util.concurrent.ConcurrentSkipListMap;
50  import java.util.concurrent.CountDownLatch;
51  import java.util.concurrent.ExecutionException;
52  import java.util.concurrent.ExecutorCompletionService;
53  import java.util.concurrent.ExecutorService;
54  import java.util.concurrent.Executors;
55  import java.util.concurrent.Future;
56  import java.util.concurrent.FutureTask;
57  import java.util.concurrent.ThreadFactory;
58  import java.util.concurrent.ThreadPoolExecutor;
59  import java.util.concurrent.TimeUnit;
60  import java.util.concurrent.TimeoutException;
61  import java.util.concurrent.atomic.AtomicBoolean;
62  import java.util.concurrent.atomic.AtomicInteger;
63  import java.util.concurrent.atomic.AtomicLong;
64  import java.util.concurrent.locks.Lock;
65  import java.util.concurrent.locks.ReentrantReadWriteLock;
66  
67  import org.apache.commons.lang.RandomStringUtils;
68  import org.apache.commons.logging.Log;
69  import org.apache.commons.logging.LogFactory;
70  import org.apache.hadoop.conf.Configuration;
71  import org.apache.hadoop.fs.FileStatus;
72  import org.apache.hadoop.fs.FileSystem;
73  import org.apache.hadoop.fs.Path;
74  import org.apache.hadoop.hbase.Cell;
75  import org.apache.hadoop.hbase.CellScanner;
76  import org.apache.hadoop.hbase.CellUtil;
77  import org.apache.hadoop.hbase.CompoundConfiguration;
78  import org.apache.hadoop.hbase.DoNotRetryIOException;
79  import org.apache.hadoop.hbase.DroppedSnapshotException;
80  import org.apache.hadoop.hbase.HBaseConfiguration;
81  import org.apache.hadoop.hbase.HBaseIOException;
82  import org.apache.hadoop.hbase.HColumnDescriptor;
83  import org.apache.hadoop.hbase.HConstants;
84  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
85  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
86  import org.apache.hadoop.hbase.HRegionInfo;
87  import org.apache.hadoop.hbase.HTableDescriptor;
88  import org.apache.hadoop.hbase.KeyValue;
89  import org.apache.hadoop.hbase.KeyValueUtil;
90  import org.apache.hadoop.hbase.NamespaceDescriptor;
91  import org.apache.hadoop.hbase.NotServingRegionException;
92  import org.apache.hadoop.hbase.RegionTooBusyException;
93  import org.apache.hadoop.hbase.TableName;
94  import org.apache.hadoop.hbase.Tag;
95  import org.apache.hadoop.hbase.TagType;
96  import org.apache.hadoop.hbase.UnknownScannerException;
97  import org.apache.hadoop.hbase.backup.HFileArchiver;
98  import org.apache.hadoop.hbase.classification.InterfaceAudience;
99  import org.apache.hadoop.hbase.client.Append;
100 import org.apache.hadoop.hbase.client.Delete;
101 import org.apache.hadoop.hbase.client.Durability;
102 import org.apache.hadoop.hbase.client.Get;
103 import org.apache.hadoop.hbase.client.Increment;
104 import org.apache.hadoop.hbase.client.IsolationLevel;
105 import org.apache.hadoop.hbase.client.Mutation;
106 import org.apache.hadoop.hbase.client.Put;
107 import org.apache.hadoop.hbase.client.RegionReplicaUtil;
108 import org.apache.hadoop.hbase.client.Result;
109 import org.apache.hadoop.hbase.client.RowMutations;
110 import org.apache.hadoop.hbase.client.Scan;
111 import org.apache.hadoop.hbase.conf.ConfigurationManager;
112 import org.apache.hadoop.hbase.conf.PropagatingConfigurationObserver;
113 import org.apache.hadoop.hbase.coprocessor.RegionObserver;
114 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
115 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
116 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
117 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
118 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
119 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
120 import org.apache.hadoop.hbase.filter.FilterWrapper;
121 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
122 import org.apache.hadoop.hbase.io.HeapSize;
123 import org.apache.hadoop.hbase.io.TimeRange;
124 import org.apache.hadoop.hbase.io.hfile.BlockCache;
125 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
126 import org.apache.hadoop.hbase.io.hfile.HFile;
127 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
128 import org.apache.hadoop.hbase.ipc.CoprocessorRpcUtils;
129 import org.apache.hadoop.hbase.ipc.RpcCallContext;
130 import org.apache.hadoop.hbase.ipc.RpcServer;
131 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
132 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
133 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
134 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
135 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
136 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
137 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.RegionLoad;
138 import org.apache.hadoop.hbase.protobuf.generated.ClusterStatusProtos.StoreSequenceId;
139 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
140 import org.apache.hadoop.hbase.protobuf.generated.WALProtos;
141 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
142 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor;
143 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.FlushAction;
144 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FlushDescriptor.StoreFlushDescriptor;
145 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor;
146 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.RegionEventDescriptor.EventType;
147 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.StoreDescriptor;
148 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
149 import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
150 import org.apache.hadoop.hbase.regionserver.ScannerContext.NextState;
151 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
152 import org.apache.hadoop.hbase.regionserver.compactions.CompactionThroughputController;
153 import org.apache.hadoop.hbase.regionserver.compactions.CompactionThroughputControllerFactory;
154 import org.apache.hadoop.hbase.regionserver.compactions.NoLimitCompactionThroughputController;
155 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
156 import org.apache.hadoop.hbase.regionserver.wal.MetricsWAL;
157 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
158 import org.apache.hadoop.hbase.regionserver.wal.ReplayHLogKey;
159 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
160 import org.apache.hadoop.hbase.regionserver.wal.WALUtil;
161 import org.apache.hadoop.hbase.security.User;
162 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
163 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
164 import org.apache.hadoop.hbase.util.ByteStringer;
165 import org.apache.hadoop.hbase.util.Bytes;
166 import org.apache.hadoop.hbase.util.CancelableProgressable;
167 import org.apache.hadoop.hbase.util.ClassSize;
168 import org.apache.hadoop.hbase.util.CompressionTest;
169 import org.apache.hadoop.hbase.util.Counter;
170 import org.apache.hadoop.hbase.util.EncryptionTest;
171 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
172 import org.apache.hadoop.hbase.util.FSTableDescriptors;
173 import org.apache.hadoop.hbase.util.FSUtils;
174 import org.apache.hadoop.hbase.util.HashedBytes;
175 import org.apache.hadoop.hbase.util.Pair;
176 import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
177 import org.apache.hadoop.hbase.util.Threads;
178 import org.apache.hadoop.hbase.wal.WAL;
179 import org.apache.hadoop.hbase.wal.WALFactory;
180 import org.apache.hadoop.hbase.wal.WALKey;
181 import org.apache.hadoop.hbase.wal.WALSplitter;
182 import org.apache.hadoop.hbase.wal.WALSplitter.MutationReplay;
183 import org.apache.hadoop.io.MultipleIOException;
184 import org.apache.hadoop.util.StringUtils;
185 import org.apache.htrace.Trace;
186 import org.apache.htrace.TraceScope;
187 
188 import com.google.common.annotations.VisibleForTesting;
189 import com.google.common.base.Optional;
190 import com.google.common.base.Preconditions;
191 import com.google.common.collect.Lists;
192 import com.google.common.collect.Maps;
193 import com.google.common.io.Closeables;
194 import com.google.protobuf.ByteString;
195 import com.google.protobuf.Descriptors;
196 import com.google.protobuf.Message;
197 import com.google.protobuf.RpcCallback;
198 import com.google.protobuf.RpcController;
199 import com.google.protobuf.Service;
200 import com.google.protobuf.TextFormat;
201 
202 @InterfaceAudience.Private
203 public class HRegion implements HeapSize, PropagatingConfigurationObserver, Region {
204   public static final Log LOG = LogFactory.getLog(HRegion.class);
205 
206   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
207     "hbase.hregion.scan.loadColumnFamiliesOnDemand";
208 
209   /**
210    * Longest time we'll wait on a sequenceid.
211    * The sequenceid comes out of the WAL subsystem. The WAL subsystem can go bad, or a test
212    * might use it without cleaning up previous usage properly; generally, a WAL roll is needed.
213    * Key to use to change the default of 30000ms.
214    */
215   private final int maxWaitForSeqId;
216   private static final String MAX_WAIT_FOR_SEQ_ID_KEY = "hbase.hregion.max.wait.for.sequenceid.ms";
217   private static final int DEFAULT_MAX_WAIT_FOR_SEQ_ID = 30000;
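  // Illustrative only, not part of the original source: the 30000ms default above can be raised
  // through the configuration key, e.g.
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setInt("hbase.hregion.max.wait.for.sequenceid.ms", 60000); // assumption: 60s instead of 30s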
218 
219   /**
220    * This is the global default value for durability. All tables/mutations not
221    * defining a durability or using USE_DEFAULT will default to this value.
222    */
223   private static final Durability DEFAULT_DURABILITY = Durability.SYNC_WAL;
224 
225   final AtomicBoolean closed = new AtomicBoolean(false);
226   /* Closing can take some time; use the closing flag if there is stuff we don't
227    * want to do while in closing state; e.g. offering this region up to the
228    * master as a region to close if the carrying regionserver is overloaded.
229    * Once set, it is never cleared.
230    */
231   final AtomicBoolean closing = new AtomicBoolean(false);
232 
233   /**
234    * The max sequence id of flushed data on this region. There is no edit in memory that is
235    * less than this sequence id.
236    */
237   private volatile long maxFlushedSeqId = HConstants.NO_SEQNUM;
238 
239   /**
240    * Record the sequence id of last flush operation. Can be in advance of
241    * {@link #maxFlushedSeqId} when flushing a single column family. In this case,
242    * {@link #maxFlushedSeqId} will be older than the oldest edit in memory.
243    */
244   private volatile long lastFlushOpSeqId = HConstants.NO_SEQNUM;
245   /**
246    * Region scoped edit sequence Id. Edits to this region are GUARANTEED to appear in the WAL
247    * file in this sequence id's order; i.e. edit #2 will be in the WAL after edit #1.
248    * Its default value is -1L. This default is used as a marker to indicate
249    * that the region hasn't opened yet. Once it is opened, it is set to the derived
250    * #openSeqNum, the largest sequence id of all hfiles opened under this Region.
251    *
252    * <p>Control of this sequence is handed off to the WAL implementation.  It is responsible
253    * for tagging edits with the correct sequence id since it is responsible for getting the
254    * edits into the WAL files. It controls updating the sequence id value.  DO NOT UPDATE IT
255    * OUTSIDE OF THE WAL.  The value you get will not be what you think it is.
256    */
257   private final AtomicLong sequenceId = new AtomicLong(-1L);
258 
259   /**
260    * The sequence id of the last replayed open region event from the primary region. This is used
261    * to skip edits that are older than this id, since replayed edits may arrive out of order
262    * from replication.
263    */
264   protected volatile long lastReplayedOpenRegionSeqId = -1L;
265   protected volatile long lastReplayedCompactionSeqId = -1L;
266 
267   //////////////////////////////////////////////////////////////////////////////
268   // Members
269   //////////////////////////////////////////////////////////////////////////////
270 
271   // map from a locked row to the context for that lock including:
272   // - CountDownLatch for threads waiting on that row
273   // - the thread that owns the lock (allow reentrancy)
274   // - reference count of (reentrant) locks held by the thread
275   // - the row itself
276   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
277       new ConcurrentHashMap<HashedBytes, RowLockContext>();
278 
279   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
280       Bytes.BYTES_RAWCOMPARATOR);
281 
282   // TODO: account for each registered handler in HeapSize computation
283   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
284 
285   public final AtomicLong memstoreSize = new AtomicLong(0);
286 
287   // Debug possible data loss due to WAL off
288   final Counter numMutationsWithoutWAL = new Counter();
289   final Counter dataInMemoryWithoutWAL = new Counter();
290 
291   // Debug why CAS operations are taking a while.
292   final Counter checkAndMutateChecksPassed = new Counter();
293   final Counter checkAndMutateChecksFailed = new Counter();
294 
295   // Number of requests
296   final Counter readRequestsCount = new Counter();
297   final Counter writeRequestsCount = new Counter();
298 
299   // Number of requests blocked by memstore size.
300   private final Counter blockedRequestsCount = new Counter();
301 
302   // Compaction counters
303   final AtomicLong compactionsFinished = new AtomicLong(0L);
304   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
305   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
306 
307   private final WAL wal;
308   private final HRegionFileSystem fs;
309   protected final Configuration conf;
310   private final Configuration baseConf;
311   private final KeyValue.KVComparator comparator;
312   private final int rowLockWaitDuration;
313   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
314 
315   // The internal wait duration to acquire a lock before read/update
316   // from the region. It is not per row. The purpose of this wait time
317   // is to avoid waiting a long time while the region is busy, so that
318   // we can release the IPC handler soon enough to improve the
319   // availability of the region server. It can be adjusted by
320   // tuning configuration "hbase.busy.wait.duration".
321   final long busyWaitDuration;
322   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
323 
324   // If updating multiple rows in one call, wait longer,
325   // i.e. waiting for busyWaitDuration * # of rows. However,
326   // we can limit the max multiplier.
327   final int maxBusyWaitMultiplier;
328 
329   // Max busy wait duration. There is no point in waiting longer than the RPC
330   // purge timeout, at which point the RPC call will be terminated by the RPC engine.
331   final long maxBusyWaitDuration;
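  // Rough sketch, not part of the original source, of how the busy-wait settings above combine
  // for a multi-row update when the region is busy:
  //   waitMs = Math.min(busyWaitDuration * Math.min(rowCount, maxBusyWaitMultiplier),
  //                     maxBusyWaitDuration);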
332 
333   // negative number indicates infinite timeout
334   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
335   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
336 
337   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
338 
339   /**
340    * The sequence ID that was encountered when this region was opened.
341    */
342   private long openSeqNum = HConstants.NO_SEQNUM;
343 
344   /**
345    * The default setting for whether to enable on-demand CF loading for
346    * scan requests to this region. Requests can override it.
347    */
348   private boolean isLoadingCfsOnDemandDefault = false;
349 
350   private final AtomicInteger majorInProgress = new AtomicInteger(0);
351   private final AtomicInteger minorInProgress = new AtomicInteger(0);
352 
353   //
354   // Context: During replay we want to ensure that we do not lose any data. So, we
355   // have to be conservative in how we replay wals. For each store, we calculate
356   // the maxSeqId up to which the store was flushed. We then skip the edits which
357   // are equal to or lower than maxSeqId for each store.
358   // The following map is populated when opening the region.
359   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
360 
361   /** Saved state from replaying prepare flush cache */
362   private PrepareFlushResult prepareFlushResult = null;
363 
364   /**
365    * Config setting for whether to allow writes when a region is in recovering or not.
366    */
367   private boolean disallowWritesInRecovering = false;
368 
369   // when a region is in recovering state, it can only accept writes, not reads
370   private volatile boolean isRecovering = false;
371 
372   private volatile Optional<ConfigurationManager> configurationManager;
373 
374   /**
375    * @return The smallest mvcc readPoint across all the scanners in this
376    * region. Writes older than this readPoint are included in every
377    * read operation.
378    */
379   public long getSmallestReadPoint() {
380     long minimumReadPoint;
381     // We need to ensure that while we are calculating the smallestReadPoint
382     // no new RegionScanners can grab a readPoint that we are unaware of.
383     // We achieve this by synchronizing on the scannerReadPoints object.
384     synchronized(scannerReadPoints) {
385       minimumReadPoint = mvcc.memstoreReadPoint();
386 
387       for (Long readPoint: this.scannerReadPoints.values()) {
388         if (readPoint < minimumReadPoint) {
389           minimumReadPoint = readPoint;
390         }
391       }
392     }
393     return minimumReadPoint;
394   }
395 
396   /*
397    * Data structure of write state flags used to coordinate flushes,
398    * compactions and closes.
399    */
400   static class WriteState {
401     // Set while a memstore flush is happening.
402     volatile boolean flushing = false;
403     // Set when a flush has been requested.
404     volatile boolean flushRequested = false;
405     // Number of compactions running.
406     volatile int compacting = 0;
407     // Cleared in close. Once cleared, the region cannot compact or flush again.
408     volatile boolean writesEnabled = true;
409     // Set if region is read-only
410     volatile boolean readOnly = false;
411     // whether the reads are enabled. This is different from readOnly, because readOnly is
412     // static in the lifetime of the region, while readsEnabled is dynamic
413     volatile boolean readsEnabled = true;
414 
415     /**
416      * Set flags that make this region read-only.
417      *
418      * @param onOff flip value for region r/o setting
419      */
420     synchronized void setReadOnly(final boolean onOff) {
421       this.writesEnabled = !onOff;
422       this.readOnly = onOff;
423     }
424 
425     boolean isReadOnly() {
426       return this.readOnly;
427     }
428 
429     boolean isFlushRequested() {
430       return this.flushRequested;
431     }
432 
433     void setReadsEnabled(boolean readsEnabled) {
434       this.readsEnabled = readsEnabled;
435     }
436 
437     static final long HEAP_SIZE = ClassSize.align(
438         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
439   }
440 
441   /**
442    * Objects from this class are created when flushing to describe all the different states
443    * that the flush can end up in. The Result enum describes those states. The sequence id should only
444    * be specified if the flush was successful, and the failure message should only be specified
445    * if it didn't flush.
446    */
447   public static class FlushResultImpl implements FlushResult {
448     final Result result;
449     final String failureReason;
450     final long flushSequenceId;
451     final boolean wroteFlushWalMarker;
452 
453     /**
454      * Convenience constructor to use when the flush is successful; the failure message is set to
455      * null.
456      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
457      * @param flushSequenceId Generated sequence id that comes right after the edits in the
458      *                        memstores.
459      */
460     FlushResultImpl(Result result, long flushSequenceId) {
461       this(result, flushSequenceId, null, false);
462       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
463           .FLUSHED_COMPACTION_NEEDED;
464     }
465 
466     /**
467      * Convenience constructor to use when we cannot flush.
468      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
469      * @param failureReason Reason why we couldn't flush.
470      */
471     FlushResultImpl(Result result, String failureReason, boolean wroteFlushMarker) {
472       this(result, -1, failureReason, wroteFlushMarker);
473       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
474     }
475 
476     /**
477      * Constructor with all the parameters.
478      * @param result Any value of the Result enum.
479      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
480      * @param failureReason Reason why we couldn't flush, or null.
481      */
482     FlushResultImpl(Result result, long flushSequenceId, String failureReason,
483       boolean wroteFlushMarker) {
484       this.result = result;
485       this.flushSequenceId = flushSequenceId;
486       this.failureReason = failureReason;
487       this.wroteFlushWalMarker = wroteFlushMarker;
488     }
489 
490     /**
491      * Convenience method, the equivalent of checking if result is
492      * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
493      * @return true if the memstores were flushed, else false.
494      */
495     @Override
496     public boolean isFlushSucceeded() {
497       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
498           .FLUSHED_COMPACTION_NEEDED;
499     }
500 
501     /**
502      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
503      * @return True if the flush requested a compaction, else false (a false return does not even imply a flush happened).
504      */
505     @Override
506     public boolean isCompactionNeeded() {
507       return result == Result.FLUSHED_COMPACTION_NEEDED;
508     }
509 
510     @Override
511     public String toString() {
512       return new StringBuilder()
513         .append("flush result:").append(result).append(", ")
514         .append("failureReason:").append(failureReason).append(",")
515         .append("flush seq id").append(flushSequenceId).toString();
516     }
517 
518     @Override
519     public Result getResult() {
520       return result;
521     }
522   }
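  // Illustrative usage, not part of the original source; Region#flush(boolean) is assumed here
  // as the entry point that yields a FlushResult:
  //   Region.FlushResult res = region.flush(true);
  //   if (res.isFlushSucceeded() && res.isCompactionNeeded()) {
  //     // ask the compaction machinery to look at this region
  //   }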
523 
524   /** A result object from prepare flush cache stage */
525   @VisibleForTesting
526   static class PrepareFlushResult {
527     final FlushResult result; // indicating a failure result from prepare
528     final TreeMap<byte[], StoreFlushContext> storeFlushCtxs;
529     final TreeMap<byte[], List<Path>> committedFiles;
530     final TreeMap<byte[], Long> storeFlushableSize;
531     final long startTime;
532     final long flushOpSeqId;
533     final long flushedSeqId;
534     final long totalFlushableSize;
535 
536     /** Constructs an early exit case */
537     PrepareFlushResult(FlushResult result, long flushSeqId) {
538       this(result, null, null, null, Math.max(0, flushSeqId), 0, 0, 0);
539     }
540 
541     /** Constructs a successful prepare flush result */
542     PrepareFlushResult(
543       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
544       TreeMap<byte[], List<Path>> committedFiles,
545       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
546       long flushedSeqId, long totalFlushableSize) {
547       this(null, storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
548         flushSeqId, flushedSeqId, totalFlushableSize);
549     }
550 
551     private PrepareFlushResult(
552       FlushResult result,
553       TreeMap<byte[], StoreFlushContext> storeFlushCtxs,
554       TreeMap<byte[], List<Path>> committedFiles,
555       TreeMap<byte[], Long> storeFlushableSize, long startTime, long flushSeqId,
556       long flushedSeqId, long totalFlushableSize) {
557       this.result = result;
558       this.storeFlushCtxs = storeFlushCtxs;
559       this.committedFiles = committedFiles;
560       this.storeFlushableSize = storeFlushableSize;
561       this.startTime = startTime;
562       this.flushOpSeqId = flushSeqId;
563       this.flushedSeqId = flushedSeqId;
564       this.totalFlushableSize = totalFlushableSize;
565     }
566 
567     public FlushResult getResult() {
568       return this.result;
569     }
570   }
571 
572   /**
573    * A class that tracks exceptions that have been observed in one batch. Not thread safe.
574    */
575   static class ObservedExceptionsInBatch {
576     private boolean wrongRegion = false;
577     private boolean failedSanityCheck = false;
578     private boolean wrongFamily = false;
579 
580     /**
581      * @return If a {@link WrongRegionException} has been observed.
582      */
583     boolean hasSeenWrongRegion() {
584       return wrongRegion;
585     }
586 
587     /**
588      * Records that a {@link WrongRegionException} has been observed.
589      */
590     void sawWrongRegion() {
591       wrongRegion = true;
592     }
593 
594     /**
595      * @return If a {@link FailedSanityCheckException} has been observed.
596      */
597     boolean hasSeenFailedSanityCheck() {
598       return failedSanityCheck;
599     }
600 
601     /**
602      * Records that a {@link FailedSanityCheckException} has been observed.
603      */
604     void sawFailedSanityCheck() {
605       failedSanityCheck = true;
606     }
607 
608     /**
609      * @return If a {@link NoSuchColumnFamilyException} has been observed.
610      */
611     boolean hasSeenNoSuchFamily() {
612       return wrongFamily;
613     }
614 
615     /**
616      * Records that a {@link NoSuchColumnFamilyException} has been observed.
617      */
618     void sawNoSuchFamily() {
619       wrongFamily = true;
620     }
621   }
622 
623   final WriteState writestate = new WriteState();
624 
625   long memstoreFlushSize;
626   final long timestampSlop;
627   final long rowProcessorTimeout;
628 
629   // Last flush time for each Store. Useful when we are flushing per column family.
630   private final ConcurrentMap<Store, Long> lastStoreFlushTimeMap =
631       new ConcurrentHashMap<Store, Long>();
632 
633   final RegionServerServices rsServices;
634   private RegionServerAccounting rsAccounting;
635   private long flushCheckInterval;
636   // flushPerChanges is used to prevent too many changes accumulating in the memstore
637   private long flushPerChanges;
638   private long blockingMemStoreSize;
639   final long threadWakeFrequency;
640   // Used to guard closes
641   final ReentrantReadWriteLock lock =
642     new ReentrantReadWriteLock();
643 
644   // Stop updates lock
645   private final ReentrantReadWriteLock updatesLock =
646     new ReentrantReadWriteLock();
647   private boolean splitRequest;
648   private byte[] explicitSplitPoint = null;
649 
650   private final MultiVersionConsistencyControl mvcc =
651       new MultiVersionConsistencyControl();
652 
653   // Coprocessor host
654   private RegionCoprocessorHost coprocessorHost;
655 
656   private HTableDescriptor htableDescriptor = null;
657   private RegionSplitPolicy splitPolicy;
658   private FlushPolicy flushPolicy;
659 
660   private final MetricsRegion metricsRegion;
661   private final MetricsRegionWrapperImpl metricsRegionWrapper;
662   private final Durability durability;
663   private final boolean regionStatsEnabled;
664 
665   /**
666    * HRegion constructor. This constructor should only be used for testing and
667    * extensions.  Instances of HRegion should be instantiated with the
668    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
669    *
670    * @param tableDir qualified path of directory where region should be located,
671    * usually the table directory.
672    * @param wal The WAL is the outbound log for any updates to the HRegion.
673    * The wal file is a logfile from the previous execution that's
674    * custom-computed for this HRegion. The HRegionServer computes and sorts the
675    * appropriate wal info for this HRegion. If there is a previous wal file
676    * (implying that the HRegion has been written-to before), then read it from
677    * the supplied path.
678    * @param fs is the filesystem.
679    * @param confParam is global configuration settings.
680    * @param regionInfo - HRegionInfo that describes the region
682    * @param htd the table descriptor
683    * @param rsServices reference to {@link RegionServerServices} or null
684    */
685   @Deprecated
686   public HRegion(final Path tableDir, final WAL wal, final FileSystem fs,
687       final Configuration confParam, final HRegionInfo regionInfo,
688       final HTableDescriptor htd, final RegionServerServices rsServices) {
689     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
690       wal, confParam, htd, rsServices);
691   }
692 
693   /**
694    * HRegion constructor. This constructor should only be used for testing and
695    * extensions.  Instances of HRegion should be instantiated with the
696    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
697    *
698    * @param fs is the filesystem.
699    * @param wal The WAL is the outbound log for any updates to the HRegion.
700    * The wal file is a logfile from the previous execution that's
701    * custom-computed for this HRegion. The HRegionServer computes and sorts the
702    * appropriate wal info for this HRegion. If there is a previous wal file
703    * (implying that the HRegion has been written-to before), then read it from
704    * the supplied path.
705    * @param confParam is global configuration settings.
706    * @param htd the table descriptor
707    * @param rsServices reference to {@link RegionServerServices} or null
708    */
709   public HRegion(final HRegionFileSystem fs, final WAL wal, final Configuration confParam,
710       final HTableDescriptor htd, final RegionServerServices rsServices) {
711     if (htd == null) {
712       throw new IllegalArgumentException("Need table descriptor");
713     }
714 
715     if (confParam instanceof CompoundConfiguration) {
716       throw new IllegalArgumentException("Need original base configuration");
717     }
718 
719     this.comparator = fs.getRegionInfo().getComparator();
720     this.wal = wal;
721     this.fs = fs;
722 
723     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
724     this.baseConf = confParam;
725     this.conf = new CompoundConfiguration()
726       .add(confParam)
727       .addStringMap(htd.getConfiguration())
728       .addWritableMap(htd.getValues());
729     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
730         DEFAULT_CACHE_FLUSH_INTERVAL);
731     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
732     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
733       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
734           + MAX_FLUSH_PER_CHANGES);
735     }
736     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
737                     DEFAULT_ROWLOCK_WAIT_DURATION);
738 
739     this.maxWaitForSeqId = conf.getInt(MAX_WAIT_FOR_SEQ_ID_KEY, DEFAULT_MAX_WAIT_FOR_SEQ_ID);
740     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
741     this.htableDescriptor = htd;
742     this.rsServices = rsServices;
743     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
744     setHTableSpecificConf();
745     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
746 
747     this.busyWaitDuration = conf.getLong(
748       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
749     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
750     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
751       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
752         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
753         + maxBusyWaitMultiplier + "). Their product should be positive");
754     }
755     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
756       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
757 
758     /*
759      * timestamp.slop provides a server-side constraint on the timestamp. This
760      * assumes that you base your TS around currentTimeMillis(). In this case,
761      * throw an error to the user if the user-specified TS is newer than now +
762      * slop. LATEST_TIMESTAMP == don't use this functionality
763      */
764     this.timestampSlop = conf.getLong(
765         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
766         HConstants.LATEST_TIMESTAMP);
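    // Illustrative only, not part of the original source: with a slop of, say, 2000ms, a mutation
    // whose timestamp is more than two seconds ahead of the server clock fails the sanity check,
    // e.g. after
    //   conf.setLong("hbase.hregion.keyvalue.timestamp.slop.millisecs", 2000L); // assumed tuning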
767 
768     /**
769      * Timeout for the processing time in processRowsWithLocks().
770      * Use -1 to switch off the time bound.
771      */
772     this.rowProcessorTimeout = conf.getLong(
773         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
774     this.durability = htd.getDurability() == Durability.USE_DEFAULT
775         ? DEFAULT_DURABILITY
776         : htd.getDurability();
777     if (rsServices != null) {
778       this.rsAccounting = this.rsServices.getRegionServerAccounting();
779       // don't initialize coprocessors if not running within a regionserver
780       // TODO: revisit if coprocessors should load in other cases
781       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
782       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
783       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
784 
785       Map<String, Region> recoveringRegions = rsServices.getRecoveringRegions();
786       String encodedName = getRegionInfo().getEncodedName();
787       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
788         this.isRecovering = true;
789         recoveringRegions.put(encodedName, this);
790       }
791     } else {
792       this.metricsRegionWrapper = null;
793       this.metricsRegion = null;
794     }
795     if (LOG.isDebugEnabled()) {
796       // Write out region name as string and its encoded name.
797       LOG.debug("Instantiated " + this);
798     }
799 
800     // by default, we allow writes against a region when it's in recovering state
801     this.disallowWritesInRecovering =
802         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
803           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
804     configurationManager = Optional.absent();
805 
806     // disable stats tracking for system tables, but check the config for everything else
807     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
808         NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ?
809           false :
810           conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
811               HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
812   }
813 
814   void setHTableSpecificConf() {
815     if (this.htableDescriptor == null) return;
816     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
817 
818     if (flushSize <= 0) {
819       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
820         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
821     }
822     this.memstoreFlushSize = flushSize;
823     this.blockingMemStoreSize = this.memstoreFlushSize *
824         conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
825                 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
826   }
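  // Worked example, illustrative and not part of the original source, assuming the common
  // defaults of a 128MB flush size and a block multiplier of 4:
  //   memstoreFlushSize    = 128L * 1024 * 1024;     // 134217728 bytes
  //   blockingMemStoreSize = memstoreFlushSize * 4;  // 536870912 bytes; writes block above this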
827 
828   /**
829    * Initialize this region.
830    * Used only by tests and SplitTransaction to reopen the region.
831    * You should use createHRegion() or openHRegion() instead.
832    * @return What the next sequence (edit) id should be.
833    * @throws IOException e
834    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
835    */
836   @Deprecated
837   public long initialize() throws IOException {
838     return initialize(null);
839   }
840 
841   /**
842    * Initialize this region.
843    *
844    * @param reporter Tickle every so often if initialize is taking a while.
845    * @return What the next sequence (edit) id should be.
846    * @throws IOException e
847    */
848   private long initialize(final CancelableProgressable reporter) throws IOException {
849     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
850     long nextSeqId = -1;
851     try {
852       nextSeqId = initializeRegionInternals(reporter, status);
853       return nextSeqId;
854     } finally {
855       // nextSeqid will be -1 if the initialization fails.
856       // Otherwise, it will be at least 0.
857       if (nextSeqId == -1) {
858         status.abort("Exception during region " + getRegionInfo().getRegionNameAsString() +
859           " initialization.");
860       }
861     }
862   }
863 
864   private long initializeRegionInternals(final CancelableProgressable reporter,
865       final MonitoredTask status) throws IOException {
866     if (coprocessorHost != null) {
867       status.setStatus("Running coprocessor pre-open hook");
868       coprocessorHost.preOpen();
869     }
870 
871     // Write HRI to a file in case we need to recover hbase:meta
872     // Only the primary replica should write .regioninfo
873     if (this.getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
874       status.setStatus("Writing region info on filesystem");
875       fs.checkRegionInfoOnFilesystem();
876     } else {
877       if (LOG.isDebugEnabled()) {
878         LOG.debug("Skipping creation of .regioninfo file for " + this.getRegionInfo());
879       }
880     }
881 
882     // Initialize all the HStores
883     status.setStatus("Initializing all the Stores");
884     long maxSeqId = initializeRegionStores(reporter, status, false);
885     this.lastReplayedOpenRegionSeqId = maxSeqId;
886 
887     this.writestate.setReadOnly(ServerRegionReplicaUtil.isReadOnly(this));
888     this.writestate.flushRequested = false;
889     this.writestate.compacting = 0;
890 
891     if (this.writestate.writesEnabled) {
892       // Remove temporary data left over from old regions
893       status.setStatus("Cleaning up temporary data from old regions");
894       fs.cleanupTempDir();
895     }
896 
897     if (this.writestate.writesEnabled) {
898       status.setStatus("Cleaning up detritus from prior splits");
899       // Get rid of any splits or merges that were lost in-progress.  Clean out
900       // these directories here on open.  We may be opening a region that was
901       // being split but we crashed in the middle of it all.
902       fs.cleanupAnySplitDetritus();
903       fs.cleanupMergesDir();
904     }
905 
906     // Initialize split policy
907     this.splitPolicy = RegionSplitPolicy.create(this, conf);
908 
909     // Initialize flush policy
910     this.flushPolicy = FlushPolicyFactory.create(this, conf);
911 
912     long lastFlushTime = EnvironmentEdgeManager.currentTime();
913     for (Store store: stores.values()) {
914       this.lastStoreFlushTimeMap.put(store, lastFlushTime);
915     }
916 
917     // Use maximum of log sequenceid or that which was found in stores
918     // (particularly if no recovered edits, seqid will be -1).
919     long nextSeqid = maxSeqId;
920 
921     // In distributedLogReplay mode, we don't know the last change sequence number because region
922     // is opened before recovery completes. So we add a safety bumper to avoid new sequence
923     // numbers overlapping sequence numbers that are already in use.
924     if (this.writestate.writesEnabled) {
925       nextSeqid = WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs
926           .getRegionDir(), nextSeqid, (this.isRecovering ? (this.flushPerChanges + 10000000) : 1));
927     } else {
928       nextSeqid++;
929     }
930 
931     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
932       "; next sequenceid=" + nextSeqid);
933 
934     // A region can be reopened if it failed a split; reset flags
935     this.closing.set(false);
936     this.closed.set(false);
937 
938     if (coprocessorHost != null) {
939       status.setStatus("Running coprocessor post-open hooks");
940       coprocessorHost.postOpen();
941     }
942 
943     status.markComplete("Region opened successfully");
944     return nextSeqid;
945   }
946 
947   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status,
948       boolean warmupOnly)
949       throws IOException {
950 
951     // Load in all the HStores.
952 
953     long maxSeqId = -1;
954     // initialized to -1 so that we pick up MemstoreTS from column families
955     long maxMemstoreTS = -1;
956 
957     if (!htableDescriptor.getFamilies().isEmpty()) {
958       // initialize the thread pool for opening stores in parallel.
959       ThreadPoolExecutor storeOpenerThreadPool =
960         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
961       CompletionService<HStore> completionService =
962         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
963 
964       // initialize each store in parallel
965       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
966         status.setStatus("Instantiating store for column family " + family);
967         completionService.submit(new Callable<HStore>() {
968           @Override
969           public HStore call() throws IOException {
970             return instantiateHStore(family);
971           }
972         });
973       }
974       boolean allStoresOpened = false;
975       try {
976         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
977           Future<HStore> future = completionService.take();
978           HStore store = future.get();
979           this.stores.put(store.getFamily().getName(), store);
980 
981           long storeMaxSequenceId = store.getMaxSequenceId();
982           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
983               storeMaxSequenceId);
984           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
985             maxSeqId = storeMaxSequenceId;
986           }
987           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
988           if (maxStoreMemstoreTS > maxMemstoreTS) {
989             maxMemstoreTS = maxStoreMemstoreTS;
990           }
991         }
992         allStoresOpened = true;
993       } catch (InterruptedException e) {
994         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
995       } catch (ExecutionException e) {
996         throw new IOException(e.getCause());
997       } finally {
998         storeOpenerThreadPool.shutdownNow();
999         if (!allStoresOpened) {
1000           // something went wrong, close all opened stores
1001           LOG.error("Could not initialize all stores for the region=" + this);
1002           for (Store store : this.stores.values()) {
1003             try {
1004               store.close();
1005             } catch (IOException e) {
1006               LOG.warn(e.getMessage());
1007             }
1008           }
1009         }
1010       }
1011     }
1012     if (ServerRegionReplicaUtil.shouldReplayRecoveredEdits(this) && !warmupOnly) {
1013       // Recover any edits if available.
1014       maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
1015           this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
1016     }
1017     maxSeqId = Math.max(maxSeqId, maxMemstoreTS + 1);
1018     mvcc.initialize(maxSeqId);
1019     return maxSeqId;
1020   }
1021 
1022   private void initializeWarmup(final CancelableProgressable reporter) throws IOException {
1023     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
1024 
1025     // Initialize all the HStores
1026     status.setStatus("Warming up all the Stores");
1027     initializeRegionStores(reporter, status, true);
1028   }
1029 
1030   /**
1031    * @return Map of StoreFiles by column family
1032    */
1033   private NavigableMap<byte[], List<Path>> getStoreFiles() {
1034     NavigableMap<byte[], List<Path>> allStoreFiles =
1035       new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
1036     for (Store store: getStores()) {
1037       Collection<StoreFile> storeFiles = store.getStorefiles();
1038       if (storeFiles == null) continue;
1039       List<Path> storeFileNames = new ArrayList<Path>();
1040       for (StoreFile storeFile: storeFiles) {
1041         storeFileNames.add(storeFile.getPath());
1042       }
1043       allStoreFiles.put(store.getFamily().getName(), storeFileNames);
1044     }
1045     return allStoreFiles;
1046   }
1047 
1048   private void writeRegionOpenMarker(WAL wal, long openSeqId) throws IOException {
1049     Map<byte[], List<Path>> storeFiles = getStoreFiles();
1050     RegionEventDescriptor regionOpenDesc = ProtobufUtil.toRegionEventDescriptor(
1051       RegionEventDescriptor.EventType.REGION_OPEN, getRegionInfo(), openSeqId,
1052       getRegionServerServices().getServerName(), storeFiles);
1053     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionOpenDesc,
1054       getSequenceId());
1055   }
1056 
1057   private void writeRegionCloseMarker(WAL wal) throws IOException {
1058     Map<byte[], List<Path>> storeFiles = getStoreFiles();
1059     RegionEventDescriptor regionEventDesc = ProtobufUtil.toRegionEventDescriptor(
1060       RegionEventDescriptor.EventType.REGION_CLOSE, getRegionInfo(), getSequenceId().get(),
1061       getRegionServerServices().getServerName(), storeFiles);
1062     WALUtil.writeRegionEventMarker(wal, getTableDesc(), getRegionInfo(), regionEventDesc,
1063       getSequenceId());
1064 
1065     // Store SeqId in HDFS when a region closes
1066     // The existence check on the region folder is needed because many tests delete the table
1067     // folder while the table is still online.
1068     if (this.fs.getFileSystem().exists(this.fs.getRegionDir())) {
1069       WALSplitter.writeRegionSequenceIdFile(this.fs.getFileSystem(), this.fs.getRegionDir(),
1070         getSequenceId().get(), 0);
1071     }
1072   }
1073 
1074   /**
1075    * @return True if this region has references.
1076    */
1077   public boolean hasReferences() {
1078     for (Store store : this.stores.values()) {
1079       if (store.hasReferences()) return true;
1080     }
1081     return false;
1082   }
1083 
1084   @Override
1085   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
1086     HDFSBlocksDistribution hdfsBlocksDistribution =
1087       new HDFSBlocksDistribution();
1088     synchronized (this.stores) {
1089       for (Store store : this.stores.values()) {
1090         Collection<StoreFile> storeFiles = store.getStorefiles();
1091         if (storeFiles == null) continue;
1092         for (StoreFile sf : storeFiles) {
1093           HDFSBlocksDistribution storeFileBlocksDistribution =
1094             sf.getHDFSBlockDistribution();
1095           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
1096         }
1097       }
1098     }
1099     return hdfsBlocksDistribution;
1100   }
1101 
1102   /**
1103    * This is a helper function to compute HDFS block distribution on demand
1104    * @param conf configuration
1105    * @param tableDescriptor HTableDescriptor of the table
1106    * @param regionInfo HRegionInfo describing the region
1107    * @return The HDFS blocks distribution for the given region.
1108    * @throws IOException
1109    */
1110   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1111       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
1112     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
1113     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
1114   }
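  // Illustrative usage, not part of the original source: a caller such as the balancer can turn
  // the returned distribution into a locality score for a host (hostName is an assumed variable):
  //   HDFSBlocksDistribution dist =
  //       HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
  //   float locality = dist.getBlockLocalityIndex(hostName);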
1115 
1116   /**
1117    * This is a helper function to compute HDFS block distribution on demand
1118    * @param conf configuration
1119    * @param tableDescriptor HTableDescriptor of the table
1120    * @param regionInfo HRegionInfo describing the region
1121    * @param tablePath the table directory
1122    * @return The HDFS blocks distribution for the given region.
1123    * @throws IOException
1124    */
1125   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
1126       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
1127       throws IOException {
1128     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
1129     FileSystem fs = tablePath.getFileSystem(conf);
1130 
1131     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
1132     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
1133       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
1134       if (storeFiles == null) continue;
1135       for (StoreFileInfo storeFileInfo : storeFiles) {
1136         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
1137       }
1138     }
1139     return hdfsBlocksDistribution;
1140   }
1141 
1142   /**
1143    * Increase the size of the memstore in this region and the size of the global
1144    * memstore.
1145    * @return the new size of the memstore in this region
1146    */
1147   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
1148     if (this.rsAccounting != null) {
1149       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
1150     }
1151     long size = this.memstoreSize.addAndGet(memStoreSize);
1152     // It is extremely bad if we make memstoreSize negative. Log as much info on the offending
1153     // caller as possible. (memStoreSize might be a negative value already -- freeing memory)
1154     // Only perform this call for the primary replica (not the secondaries)
1155     if (HRegionInfo.DEFAULT_REPLICA_ID == this.getRegionInfo().getReplicaId() && size < 0) {
1156       LOG.error("Asked to modify this region's (" + this.toString()
1157       + ") memstoreSize to a negative value which is incorrect. Current memstoreSize="
1158       + (size-memStoreSize) + ", delta=" + memStoreSize, new Exception());
1159     }
1160     return size;
1161   }
1162 
1163   @Override
1164   public HRegionInfo getRegionInfo() {
1165     return this.fs.getRegionInfo();
1166   }
1167 
1168   /**
1169    * @return Instance of {@link RegionServerServices} used by this HRegion.
1170    * Can be null.
1171    */
1172   RegionServerServices getRegionServerServices() {
1173     return this.rsServices;
1174   }
1175 
1176   @Override
1177   public long getReadRequestsCount() {
1178     return readRequestsCount.get();
1179   }
1180 
1181   @Override
1182   public void updateReadRequestsCount(long i) {
1183     readRequestsCount.add(i);
1184   }
1185 
1186   @Override
1187   public long getWriteRequestsCount() {
1188     return writeRequestsCount.get();
1189   }
1190 
1191   @Override
1192   public void updateWriteRequestsCount(long i) {
1193     writeRequestsCount.add(i);
1194   }
1195 
1196   @Override
1197   public long getMemstoreSize() {
1198     return memstoreSize.get();
1199   }
1200 
1201   @Override
1202   public long getNumMutationsWithoutWAL() {
1203     return numMutationsWithoutWAL.get();
1204   }
1205 
1206   @Override
1207   public long getDataInMemoryWithoutWAL() {
1208     return dataInMemoryWithoutWAL.get();
1209   }
1210 
1211   @Override
1212   public long getBlockedRequestsCount() {
1213     return blockedRequestsCount.get();
1214   }
1215 
1216   @Override
1217   public long getCheckAndMutateChecksPassed() {
1218     return checkAndMutateChecksPassed.get();
1219   }
1220 
1221   @Override
1222   public long getCheckAndMutateChecksFailed() {
1223     return checkAndMutateChecksFailed.get();
1224   }
1225 
1226   @Override
1227   public MetricsRegion getMetrics() {
1228     return metricsRegion;
1229   }
1230 
1231   @Override
1232   public boolean isClosed() {
1233     return this.closed.get();
1234   }
1235 
1236   @Override
1237   public boolean isClosing() {
1238     return this.closing.get();
1239   }
1240 
1241   @Override
1242   public boolean isReadOnly() {
1243     return this.writestate.isReadOnly();
1244   }
1245 
1246   /**
1247    * Set the recovering state of the current region.
1248    */
1249   public void setRecovering(boolean newState) {
1250     boolean wasRecovering = this.isRecovering;
1251     // before we flip the recovering switch (enabling reads) we should write the region open
1252     // event to WAL if needed
1253     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
1254         && wasRecovering && !newState) {
1255 
1256       // force a flush only if region replication is set up for this region. Otherwise no need.
1257       boolean forceFlush = getTableDesc().getRegionReplication() > 1;
1258 
1259       // force a flush first
1260       MonitoredTask status = TaskMonitor.get().createStatus(
1261         "Flushing region " + this + " because recovery is finished");
1262       try {
1263         if (forceFlush) {
1264           internalFlushcache(status);
1265         }
1266 
1267         status.setStatus("Writing region open event marker to WAL because recovery is finished");
1268         try {
1269           long seqId = openSeqNum;
1270           // obtain a new seqId because we possibly have writes and flushes on top of openSeqNum
1271           if (wal != null) {
1272             seqId = getNextSequenceId(wal);
1273           }
1274           writeRegionOpenMarker(wal, seqId);
1275         } catch (IOException e) {
1276           // We cannot rethrow this exception since we are being called from the zk thread. The
1277           // region has already opened. In this case we log the error, but continue
1278           LOG.warn(getRegionInfo().getEncodedName() + " : was not able to write region opening "
1279               + "event to WAL, continueing", e);
1280         }
1281       } catch (IOException ioe) {
1282         // Distributed log replay semantics do not necessarily require a flush, since the replayed
1283         // data is already written again in the WAL. So a failed flush should be fine.
1284         LOG.warn(getRegionInfo().getEncodedName() + " : was not able to flush, "
1285             + "continuing", ioe);
1286       } finally {
1287         status.cleanup();
1288       }
1289     }
1290 
1291     this.isRecovering = newState;
1292     if (wasRecovering && !isRecovering) {
1293       // Call only when wal replay is over.
1294       coprocessorHost.postLogReplay();
1295     }
1296   }
1297 
1298   @Override
1299   public boolean isRecovering() {
1300     return this.isRecovering;
1301   }
1302 
1303   @Override
1304   public boolean isAvailable() {
1305     return !isClosed() && !isClosing();
1306   }
1307 
1308   /** @return true if region is splittable */
1309   public boolean isSplittable() {
1310     return isAvailable() && !hasReferences();
1311   }
1312 
1313   /**
1314    * @return true if region is mergeable
1315    */
1316   public boolean isMergeable() {
1317     if (!isAvailable()) {
1318       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1319           + " is not mergeable because it is closing or closed");
1320       return false;
1321     }
1322     if (hasReferences()) {
1323       LOG.debug("Region " + getRegionInfo().getRegionNameAsString()
1324           + " is not mergeable because it has references");
1325       return false;
1326     }
1327 
1328     return true;
1329   }
1330 
1331   public boolean areWritesEnabled() {
1332     synchronized(this.writestate) {
1333       return this.writestate.writesEnabled;
1334     }
1335   }
1336 
1337    public MultiVersionConsistencyControl getMVCC() {
1338      return mvcc;
1339    }
1340 
1341    @Override
1342    public long getMaxFlushedSeqId() {
1343      return maxFlushedSeqId;
1344    }
1345 
1346    @Override
1347    public long getReadpoint(IsolationLevel isolationLevel) {
1348      if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1349        // This scan can read even uncommitted transactions
1350        return Long.MAX_VALUE;
1351      }
1352      return mvcc.memstoreReadPoint();
1353    }
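       // Usage sketch for the read point above: a client-side Scan can opt out of MVCC
       // filtering by requesting READ_UNCOMMITTED isolation (assumed here to be exposed via
       // org.apache.hadoop.hbase.client.Scan#setIsolationLevel), which maps to the
       // Long.MAX_VALUE read point returned by this method; 'table' is an assumed client Table.
       //
       //   Scan scan = new Scan();
       //   scan.setIsolationLevel(IsolationLevel.READ_UNCOMMITTED); // read point becomes Long.MAX_VALUE
       //   ResultScanner scanner = table.getScanner(scan);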
1354 
1355    @Override
1356    public boolean isLoadingCfsOnDemandDefault() {
1357      return this.isLoadingCfsOnDemandDefault;
1358    }
1359 
1360   /**
1361    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1362    * service any more calls.
1363    *
1364    * <p>This method could take some time to execute, so don't call it from a
1365    * time-sensitive thread.
1366    *
1367    * @return Map of all the storage files that the HRegion's component
1368    * HStores make use of, keyed by column family name. Can be null if the
1369    * region is already closed or if it is judged that it should not close.
1370    *
1371    * @throws IOException e
1372    * @throws DroppedSnapshotException Thrown when replay of wal is required
1373    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1374    * caller MUST abort after this.
1375    */
1376   public Map<byte[], List<StoreFile>> close() throws IOException {
1377     return close(false);
1378   }
1379 
1380   private final Object closeLock = new Object();
1381 
1382   /** Conf key for the periodic flush interval */
1383   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1384       "hbase.regionserver.optionalcacheflushinterval";
1385   /** Default interval for the memstore flush */
1386   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000; // 1 hour
1387   public static final int META_CACHE_FLUSH_INTERVAL = 300000; // 5 minutes
1388 
1389   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1390   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1391       "hbase.regionserver.flush.per.changes";
1392   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1393   /**
1394    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes of
1395    * overhead. Therefore, even 1G (one billion) empty KVs would occupy at least 20GB of memstore for a single region.
1396    */
1397   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
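       // Tuning sketch: the flush triggers above are plain Configuration keys, so they can be
       // adjusted per cluster; the values below are illustrative only, not recommendations.
       //
       //   Configuration conf = HBaseConfiguration.create();
       //   // flush memstores that have seen no flush for 30 minutes instead of the default hour
       //   conf.setInt(HRegion.MEMSTORE_PERIODIC_FLUSH_INTERVAL, 30 * 60 * 1000);
       //   // force a flush after 10 million changes rather than DEFAULT_FLUSH_PER_CHANGES
       //   conf.setLong(HRegion.MEMSTORE_FLUSH_PER_CHANGES, 10000000L);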
1398 
1399   /**
1400    * Close down this HRegion.  Flush the cache unless abort parameter is true,
1401    * shut down each HStore, don't service any more calls.
1402    *
1403    * This method could take some time to execute, so don't call it from a
1404    * time-sensitive thread.
1405    *
1406    * @param abort true if server is aborting (only during testing)
1407    * @return Map of all the storage files that the HRegion's component
1408    * HStores make use of, keyed by column family name. Can be null if
1409    * we are not to close at this time or we are already closed.
1410    *
1411    * @throws IOException e
1412    * @throws DroppedSnapshotException Thrown when replay of wal is required
1413    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1414    * caller MUST abort after this.
1415    */
1416   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1417     // Only allow one thread to close at a time. Serialize closers so that
1418     // concurrent attempts to close queue up behind one another.
1419     MonitoredTask status = TaskMonitor.get().createStatus(
1420         "Closing region " + this +
1421         (abort ? " due to abort" : ""));
1422 
1423     status.setStatus("Waiting for close lock");
1424     try {
1425       synchronized (closeLock) {
1426         return doClose(abort, status);
1427       }
1428     } finally {
1429       status.cleanup();
1430     }
1431   }
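       // Caller sketch, assuming the caller owns this HRegion and can abort the hosting server:
       // per the javadoc above, DroppedSnapshotException means the WAL must be replayed, so the
       // caller is expected to abort rather than keep serving. The 'server' abort hook below is
       // hypothetical and stands in for whatever shutdown mechanism the caller uses.
       //
       //   try {
       //     Map<byte[], List<StoreFile>> closedFiles = region.close(false);
       //     if (closedFiles == null) {
       //       // already closed, or the region judged that it should not close right now
       //     }
       //   } catch (DroppedSnapshotException dse) {
       //     server.abort("Replay of WAL required. Forcing server shutdown", dse);
       //   }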
1432 
1433   /**
1434    * Exposed for some very specific unit tests.
1435    */
1436   @VisibleForTesting
1437   public void setClosing(boolean closing) {
1438     this.closing.set(closing);
1439   }
1440 
1441   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1442       throws IOException {
1443     if (isClosed()) {
1444       LOG.warn("Region " + this + " already closed");
1445       return null;
1446     }
1447 
1448     if (coprocessorHost != null) {
1449       status.setStatus("Running coprocessor pre-close hooks");
1450       this.coprocessorHost.preClose(abort);
1451     }
1452 
1453     status.setStatus("Disabling compacts and flushes for region");
1454     boolean canFlush = true;
1455     synchronized (writestate) {
1456       // Disable compacting and flushing by background threads for this
1457       // region.
1458       canFlush = !writestate.readOnly;
1459       writestate.writesEnabled = false;
1460       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1461       waitForFlushesAndCompactions();
1462     }
1463     // If we were not just flushing, is it worth doing a preflush...one
1464     // that will clear out the bulk of the memstore before we put up
1465     // the close flag?
1466     if (!abort && worthPreFlushing() && canFlush) {
1467       status.setStatus("Pre-flushing region before close");
1468       LOG.info("Running close preflush of " + getRegionInfo().getRegionNameAsString());
1469       try {
1470         internalFlushcache(status);
1471       } catch (IOException ioe) {
1472         // Failed to flush the region. Keep going.
1473         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1474       }
1475     }
1476 
1477     // block waiting for the lock for closing
1478     lock.writeLock().lock();
1479     this.closing.set(true);
1480     status.setStatus("Disabling writes for close");
1481     try {
1482       if (this.isClosed()) {
1483         status.abort("Already got closed by another process");
1484         // SplitTransaction handles the null
1485         return null;
1486       }
1487       LOG.debug("Updates disabled for region " + this);
1488       // Don't flush the cache if we are aborting
1489       if (!abort && canFlush) {
1490         int flushCount = 0;
1491         while (this.memstoreSize.get() > 0) {
1492           try {
1493             if (flushCount++ > 0) {
1494               int actualFlushes = flushCount - 1;
1495               if (actualFlushes > 5) {
1496                 // If we tried 5 times and are unable to clear memory, abort
1497                 // so we do not lose data
1498                 throw new DroppedSnapshotException("Failed clearing memory after " +
1499                   actualFlushes + " attempts on region: " +
1500                     Bytes.toStringBinary(getRegionInfo().getRegionName()));
1501               }
1502               LOG.info("Running extra flush, " + actualFlushes +
1503                 " (carrying snapshot?) " + this);
1504             }
1505             internalFlushcache(status);
1506           } catch (IOException ioe) {
1507             status.setStatus("Failed flush " + this + ", putting online again");
1508             synchronized (writestate) {
1509               writestate.writesEnabled = true;
1510             }
1511             // Have to throw to upper layers.  I can't abort server from here.
1512             throw ioe;
1513           }
1514         }
1515       }
1516 
1517       Map<byte[], List<StoreFile>> result =
1518         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1519       if (!stores.isEmpty()) {
1520         // initialize the thread pool for closing stores in parallel.
1521         ThreadPoolExecutor storeCloserThreadPool =
1522           getStoreOpenAndCloseThreadPool("StoreCloserThread-" +
1523             getRegionInfo().getRegionNameAsString());
1524         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1525           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1526 
1527         // close each store in parallel
1528         for (final Store store : stores.values()) {
1529           long flushableSize = store.getFlushableSize();
1530           if (!(abort || flushableSize == 0 || writestate.readOnly)) {
1531             getRegionServerServices().abort("Assertion failed while closing store "
1532                 + getRegionInfo().getRegionNameAsString() + " " + store
1533                 + ". flushableSize expected=0, actual= " + flushableSize
1534                 + ". Current memstoreSize=" + getMemstoreSize() + ". Maybe a coprocessor "
1535                 + "operation failed and left the memstore in a partially updated state.", null);
1536           }
1537           completionService
1538               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1539                 @Override
1540                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1541                   return new Pair<byte[], Collection<StoreFile>>(
1542                     store.getFamily().getName(), store.close());
1543                 }
1544               });
1545         }
1546         try {
1547           for (int i = 0; i < stores.size(); i++) {
1548             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1549             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1550             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1551             if (familyFiles == null) {
1552               familyFiles = new ArrayList<StoreFile>();
1553               result.put(storeFiles.getFirst(), familyFiles);
1554             }
1555             familyFiles.addAll(storeFiles.getSecond());
1556           }
1557         } catch (InterruptedException e) {
1558           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1559         } catch (ExecutionException e) {
1560           throw new IOException(e.getCause());
1561         } finally {
1562           storeCloserThreadPool.shutdownNow();
1563         }
1564       }
1565 
1566       status.setStatus("Writing region close event to WAL");
1567       if (!abort && wal != null && getRegionServerServices() != null && !writestate.readOnly) {
1568         writeRegionCloseMarker(wal);
1569       }
1570 
1571       this.closed.set(true);
1572       if (!canFlush) {
1573         addAndGetGlobalMemstoreSize(-memstoreSize.get());
1574       } else if (memstoreSize.get() != 0) {
1575         LOG.error("Memstore size is " + memstoreSize.get());
1576       }
1577       if (coprocessorHost != null) {
1578         status.setStatus("Running coprocessor post-close hooks");
1579         this.coprocessorHost.postClose(abort);
1580       }
1581       if (this.metricsRegion != null) {
1582         this.metricsRegion.close();
1583       }
1584       if (this.metricsRegionWrapper != null) {
1585         Closeables.closeQuietly(this.metricsRegionWrapper);
1586       }
1587       status.markComplete("Closed");
1588       LOG.info("Closed " + this);
1589       return result;
1590     } finally {
1591       lock.writeLock().unlock();
1592     }
1593   }
1594 
1595   @Override
1596   public void waitForFlushesAndCompactions() {
1597     synchronized (writestate) {
1598       if (this.writestate.readOnly) {
1599         // we should not wait for replayed flushes if we are read only (for example in case the
1600         // region is a secondary replica).
1601         return;
1602       }
1603       boolean interrupted = false;
1604       try {
1605         while (writestate.compacting > 0 || writestate.flushing) {
1606           LOG.debug("waiting for " + writestate.compacting + " compactions"
1607             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1608           try {
1609             writestate.wait();
1610           } catch (InterruptedException iex) {
1611             // essentially ignore and propagate the interrupt back up
1612             LOG.warn("Interrupted while waiting");
1613             interrupted = true;
1614           }
1615         }
1616       } finally {
1617         if (interrupted) {
1618           Thread.currentThread().interrupt();
1619         }
1620       }
1621     }
1622   }
1623 
1624   public void waitForFlushes() {
1625     synchronized (writestate) {
1626       if (this.writestate.readOnly) {
1627         // we should not wait for replayed flushes if we are read only (for example in case the
1628         // region is a secondary replica).
1629         return;
1630       }
1631       if (!writestate.flushing) return;
1632       long start = System.currentTimeMillis();
1633       boolean interrupted = false;
1634       try {
1635         while (writestate.flushing) {
1636           LOG.debug("waiting for cache flush to complete for region " + this);
1637           try {
1638             writestate.wait();
1639           } catch (InterruptedException iex) {
1640             // essentially ignore and propagate the interrupt back up
1641             LOG.warn("Interrupted while waiting");
1642             interrupted = true;
1643           }
1644         }
1645       } finally {
1646         if (interrupted) {
1647           Thread.currentThread().interrupt();
1648         }
1649       }
1650       long duration = System.currentTimeMillis() - start;
1651       LOG.debug("Waited " + duration + " ms for flush to complete");
1652     }
1653   }
1654   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1655       final String threadNamePrefix) {
1656     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1657     int maxThreads = Math.min(numStores,
1658         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1659             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1660     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1661   }
1662 
1663   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1664       final String threadNamePrefix) {
1665     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1666     int maxThreads = Math.max(1,
1667         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1668             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1669             / numStores);
1670     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1671   }
1672 
1673   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1674       final String threadNamePrefix) {
1675     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1676       new ThreadFactory() {
1677         private int count = 1;
1678 
1679         @Override
1680         public Thread newThread(Runnable r) {
1681           return new Thread(r, threadNamePrefix + "-" + count++);
1682         }
1683       });
1684   }
1685 
1686   /**
1687    * @return True if it's worth doing a flush before we put up the close flag.
1688    */
1689   private boolean worthPreFlushing() {
1690     return this.memstoreSize.get() >
1691       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1692   }
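       // The 5MB threshold above is read from the "hbase.hregion.preclose.flush.size" key (there
       // is no named constant for it); a deployment that expects large memstores at close time
       // could raise it, for example:
       //
       //   conf.setLong("hbase.hregion.preclose.flush.size", 32L * 1024 * 1024); // illustrative value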
1693 
1694   //////////////////////////////////////////////////////////////////////////////
1695   // HRegion accessors
1696   //////////////////////////////////////////////////////////////////////////////
1697 
1698   @Override
1699   public HTableDescriptor getTableDesc() {
1700     return this.htableDescriptor;
1701   }
1702 
1703   /** @return WAL in use for this region */
1704   public WAL getWAL() {
1705     return this.wal;
1706   }
1707 
1708   /**
1709    * A split takes the config from the parent region & passes it to the daughter
1710    * region's constructor. If 'conf' was passed, you would end up using the HTD
1711    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1712    * to the daughter regions to avoid this tricky dedupe problem.
1713    * @return Configuration object
1714    */
1715   Configuration getBaseConf() {
1716     return this.baseConf;
1717   }
1718 
1719   /** @return {@link FileSystem} being used by this region */
1720   public FileSystem getFilesystem() {
1721     return fs.getFileSystem();
1722   }
1723 
1724   /** @return the {@link HRegionFileSystem} used by this region */
1725   public HRegionFileSystem getRegionFileSystem() {
1726     return this.fs;
1727   }
1728 
1729   @Override
1730   public long getEarliestFlushTimeForAllStores() {
1731     return lastStoreFlushTimeMap.isEmpty() ? Long.MAX_VALUE : Collections.min(lastStoreFlushTimeMap
1732         .values());
1733   }
1734 
1735   @Override
1736   public long getOldestHfileTs(boolean majorCompactionOnly) throws IOException {
1737     long result = Long.MAX_VALUE;
1738     for (Store store : getStores()) {
1739       Collection<StoreFile> storeFiles = store.getStorefiles();
1740       if (storeFiles == null) continue;
1741       for (StoreFile file : storeFiles) {
1742         StoreFile.Reader sfReader = file.getReader();
1743         if (sfReader == null) continue;
1744         HFile.Reader reader = sfReader.getHFileReader();
1745         if (reader == null) continue;
1746         if (majorCompactionOnly) {
1747           byte[] val = reader.loadFileInfo().get(StoreFile.MAJOR_COMPACTION_KEY);
1748           // skip files that were not produced by a major compaction
1749           if (val == null || !Bytes.toBoolean(val)) {
1750             continue;
1751           }
1752         }
1753         result = Math.min(result, reader.getFileContext().getFileCreateTime());
1754       }
1755     }
1756     return result == Long.MAX_VALUE ? 0 : result;
1757   }
1758 
1759   RegionLoad.Builder setCompleteSequenceId(RegionLoad.Builder regionLoadBldr) {
1760     long lastFlushOpSeqIdLocal = this.lastFlushOpSeqId;
1761     byte[] encodedRegionName = this.getRegionInfo().getEncodedNameAsBytes();
1762     regionLoadBldr.clearStoreCompleteSequenceId();
1763     for (byte[] familyName : this.stores.keySet()) {
1764       long oldestUnflushedSeqId = this.wal.getEarliestMemstoreSeqNum(encodedRegionName, familyName);
1765       // no oldestUnflushedSeqId means no data has been written to the store since the last flush, so we use
1766       // lastFlushOpSeqId as complete sequence id for the store.
1767       regionLoadBldr.addStoreCompleteSequenceId(StoreSequenceId
1768           .newBuilder()
1769           .setFamilyName(ByteString.copyFrom(familyName))
1770           .setSequenceId(
1771             oldestUnflushedSeqId < 0 ? lastFlushOpSeqIdLocal : oldestUnflushedSeqId - 1).build());
1772     }
1773     return regionLoadBldr.setCompleteSequenceId(getMaxFlushedSeqId());
1774   }
1775 
1776   //////////////////////////////////////////////////////////////////////////////
1777   // HRegion maintenance.
1778   //
1779   // These methods are meant to be called periodically by the HRegionServer for
1780   // upkeep.
1781   //////////////////////////////////////////////////////////////////////////////
1782 
1783   /** @return size of the largest HStore. */
1784   public long getLargestHStoreSize() {
1785     long size = 0;
1786     for (Store h : stores.values()) {
1787       long storeSize = h.getSize();
1788       if (storeSize > size) {
1789         size = storeSize;
1790       }
1791     }
1792     return size;
1793   }
1794 
1795   /**
1796    * @return KeyValue Comparator
1797    */
1798   public KeyValue.KVComparator getComparator() {
1799     return this.comparator;
1800   }
1801 
1802   /*
1803    * Do preparation for pending compaction.
1804    * @throws IOException
1805    */
1806   protected void doRegionCompactionPrep() throws IOException {
1807   }
1808 
1809   @Override
1810   public void triggerMajorCompaction() throws IOException {
1811     for (Store s : getStores()) {
1812       s.triggerMajorCompaction();
1813     }
1814   }
1815 
1816   @Override
1817   public void compact(final boolean majorCompaction) throws IOException {
1818     if (majorCompaction) {
1819       triggerMajorCompaction();
1820     }
1821     for (Store s : getStores()) {
1822       CompactionContext compaction = s.requestCompaction();
1823       if (compaction != null) {
1824         CompactionThroughputController controller = null;
1825         if (rsServices != null) {
1826           controller = CompactionThroughputControllerFactory.create(rsServices, conf);
1827         }
1828         if (controller == null) {
1829           controller = NoLimitCompactionThroughputController.INSTANCE;
1830         }
1831         compact(compaction, s, controller, null);
1832       }
1833     }
1834   }
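       // Operator sketch: outside of tests, a major compaction is usually requested through the
       // client Admin API rather than by invoking this method directly (assumed here to be
       // org.apache.hadoop.hbase.client.Admin#majorCompact, which only queues the request):
       //
       //   try (Connection conn = ConnectionFactory.createConnection(conf);
       //        Admin admin = conn.getAdmin()) {
       //     admin.majorCompact(TableName.valueOf("my_table"));
       //   }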
1835 
1836   /**
1837    * This is a helper function that compacts all the stores synchronously.
1838    * It is used by utilities and testing.
1839    *
1840    * @throws IOException e
1841    */
1842   public void compactStores() throws IOException {
1843     for (Store s : getStores()) {
1844       CompactionContext compaction = s.requestCompaction();
1845       if (compaction != null) {
1846         compact(compaction, s, NoLimitCompactionThroughputController.INSTANCE, null);
1847       }
1848     }
1849   }
1850 
1851   /**
1852    * This is a helper function that compacts the given store.
1853    * It is used by utilities and testing.
1854    *
1855    * @throws IOException e
1856    */
1857   @VisibleForTesting
1858   void compactStore(byte[] family, CompactionThroughputController throughputController)
1859       throws IOException {
1860     Store s = getStore(family);
1861     CompactionContext compaction = s.requestCompaction();
1862     if (compaction != null) {
1863       compact(compaction, s, throughputController, null);
1864     }
1865   }
1866 
1867   /*
1868    * Called by compaction thread and after region is opened to compact the
1869    * HStores if necessary.
1870    *
1871    * <p>This operation could block for a long time, so don't call it from a
1872    * time-sensitive thread.
1873    *
1874    * Note that no locking is necessary at this level because compaction only
1875    * conflicts with a region split, and that cannot happen because the region
1876    * server does them sequentially and not in parallel.
1877    *
1878    * @param compaction Compaction details, obtained by requestCompaction()
1879    * @param throughputController
1880    * @return whether the compaction completed
1881    */
1882   public boolean compact(CompactionContext compaction, Store store,
1883       CompactionThroughputController throughputController) throws IOException {
1884     return compact(compaction, store, throughputController, null);
1885   }
1886 
1887   public boolean compact(CompactionContext compaction, Store store,
1888       CompactionThroughputController throughputController, User user) throws IOException {
1889     assert compaction != null && compaction.hasSelection();
1890     assert !compaction.getRequest().getFiles().isEmpty();
1891     if (this.closing.get() || this.closed.get()) {
1892       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1893       store.cancelRequestedCompaction(compaction);
1894       return false;
1895     }
1896     MonitoredTask status = null;
1897     boolean requestNeedsCancellation = true;
1898     /*
1899      * We are trying to remove / relax the region read lock for compaction.
1900      * Let's see what are the potential race conditions among the operations (user scan,
1901      * region split, region close and region bulk load).
1902      * 
1903      *  user scan ---> region read lock
1904      *  region split --> region close first --> region write lock
1905      *  region close --> region write lock
1906      *  region bulk load --> region write lock
1907      *  
1908      * read lock is compatible with read lock. ---> no problem with user scan/read
1909      * region bulk load does not cause problem for compaction (no consistency problem, store lock
1910      *  will help the store file accounting).
1911      * They can run almost concurrently at the region level.
1912      * 
1913      * The only remaining race condition is between the region close and compaction.
1914      * So we will evaluate, below, how region close interferes with compaction if compaction does
1915      * not acquire region read lock.
1916      * 
1917      * Here are the steps for compaction:
1918      * 1. obtain list of StoreFile's
1919      * 2. create StoreFileScanner's based on list from #1
1920      * 3. perform compaction and save resulting files under tmp dir
1921      * 4. swap in compacted files
1922      * 
1923      * #1 is guarded by store lock. This patch does not change this --> no worse or better
1924      * For #2, we obtain smallest read point (for region) across all the Scanners (for both default
1925      * compactor and stripe compactor).
1926      * The read points are for user scans. Region keeps the read points for all currently open
1927      * user scanners.
1928      * Compaction needs to know the smallest read point so that during re-write of the hfiles,
1929      * it can remove the mvcc points for the cells if their mvccs are older than the smallest
1930      * since they are not needed anymore.
1931      * This will not conflict with compaction.
1932      * For #3, it can be performed in parallel to other operations.
1933      * For #4 bulk load and compaction don't conflict with each other on the region level
1934      *   (for multi-family atomicity).
1935      * Region close and compaction are guarded pretty well by the 'writestate'.
1936      * In HRegion#doClose(), we have :
1937      * synchronized (writestate) {
1938      *   // Disable compacting and flushing by background threads for this
1939      *   // region.
1940      *   canFlush = !writestate.readOnly;
1941      *   writestate.writesEnabled = false;
1942      *   LOG.debug("Closing " + this + ": disabling compactions & flushes");
1943      *   waitForFlushesAndCompactions();
1944      * }
1945      * waitForFlushesAndCompactions() would wait for writestate.compacting to come down to 0.
1946      * and in HRegion.compact()
1947      *  try {
1948      *    synchronized (writestate) {
1949      *    if (writestate.writesEnabled) {
1950      *      wasStateSet = true;
1951      *      ++writestate.compacting;
1952      *    } else {
1953      *      String msg = "NOT compacting region " + this + ". Writes disabled.";
1954      *      LOG.info(msg);
1955      *      status.abort(msg);
1956      *      return false;
1957      *    }
1958      *  }
1959      * Also in compactor.performCompaction():
1960      * check periodically to see if a system stop is requested
1961      * if (closeCheckInterval > 0) {
1962      *   bytesWritten += len;
1963      *   if (bytesWritten > closeCheckInterval) {
1964      *     bytesWritten = 0;
1965      *     if (!store.areWritesEnabled()) {
1966      *       progress.cancel();
1967      *       return false;
1968      *     }
1969      *   }
1970      * }
1971      */
1972     try {
1973       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1974       if (stores.get(cf) != store) {
1975         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1976             + " has been re-instantiated, cancelling this compaction request. "
1977             + "It may be caused by the rollback of a split transaction");
1978         return false;
1979       }
1980 
1981       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1982       if (this.closed.get()) {
1983         String msg = "Skipping compaction on " + this + " because closed";
1984         LOG.debug(msg);
1985         status.abort(msg);
1986         return false;
1987       }
1988       boolean wasStateSet = false;
1989       try {
1990         synchronized (writestate) {
1991           if (writestate.writesEnabled) {
1992             wasStateSet = true;
1993             ++writestate.compacting;
1994           } else {
1995             String msg = "NOT compacting region " + this + ". Writes disabled.";
1996             LOG.info(msg);
1997             status.abort(msg);
1998             return false;
1999           }
2000         }
2001         LOG.info("Starting compaction on " + store + " in region " + this
2002             + (compaction.getRequest().isOffPeak() ? " as an off-peak compaction" : ""));
2003         doRegionCompactionPrep();
2004         try {
2005           status.setStatus("Compacting store " + store);
2006           // We no longer need to cancel the request on the way out of this
2007           // method because Store#compact will clean up unconditionally
2008           requestNeedsCancellation = false;
2009           store.compact(compaction, throughputController, user);
2010         } catch (InterruptedIOException iioe) {
2011           String msg = "compaction interrupted";
2012           LOG.info(msg, iioe);
2013           status.abort(msg);
2014           return false;
2015         }
2016       } finally {
2017         if (wasStateSet) {
2018           synchronized (writestate) {
2019             --writestate.compacting;
2020             if (writestate.compacting <= 0) {
2021               writestate.notifyAll();
2022             }
2023           }
2024         }
2025       }
2026       status.markComplete("Compaction complete");
2027       return true;
2028     } finally {
2029       if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
2030       if (status != null) status.cleanup();
2031     }
2032   }
2033 
2034   @Override
2035   public FlushResult flush(boolean force) throws IOException {
2036     return flushcache(force, false);
2037   }
2038 
2039   /**
2040    * Flush the cache.
2041    *
2042    * When this method is called the cache will be flushed unless:
2043    * <ol>
2044    *   <li>the cache is empty</li>
2045    *   <li>the region is closed</li>
2046    *   <li>a flush is already in progress</li>
2047    *   <li>writes are disabled</li>
2048    * </ol>
2049    *
2050    * <p>This method may block for some time, so it should not be called from a
2051    * time-sensitive thread.
2052    * @param forceFlushAllStores whether we want to flush all stores
2053    * @param writeFlushRequestWalMarker whether to write the flush request marker to WAL
2054    * @return whether the flush succeeded and whether the region needs compacting
2055    *
2056    * @throws IOException general io exceptions
2057    * @throws DroppedSnapshotException Thrown when replay of wal is required
2058    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
2059    * caller MUST abort after this.
2060    */
2061   public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
2062       throws IOException {
2063     // fail-fast instead of waiting on the lock
2064     if (this.closing.get()) {
2065       String msg = "Skipping flush on " + this + " because closing";
2066       LOG.debug(msg);
2067       return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2068     }
2069     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
2070     status.setStatus("Acquiring readlock on region");
2071     // block waiting for the lock for flushing cache
2072     lock.readLock().lock();
2073     try {
2074       if (this.closed.get()) {
2075         String msg = "Skipping flush on " + this + " because closed";
2076         LOG.debug(msg);
2077         status.abort(msg);
2078         return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2079       }
2080       if (coprocessorHost != null) {
2081         status.setStatus("Running coprocessor pre-flush hooks");
2082         coprocessorHost.preFlush();
2083       }
2084       // TODO: this should be managed within memstore with the snapshot, updated only after flush
2085       // successful
2086       if (numMutationsWithoutWAL.get() > 0) {
2087         numMutationsWithoutWAL.set(0);
2088         dataInMemoryWithoutWAL.set(0);
2089       }
2090       synchronized (writestate) {
2091         if (!writestate.flushing && writestate.writesEnabled) {
2092           this.writestate.flushing = true;
2093         } else {
2094           if (LOG.isDebugEnabled()) {
2095             LOG.debug("NOT flushing memstore for region " + this
2096                 + ", flushing=" + writestate.flushing + ", writesEnabled="
2097                 + writestate.writesEnabled);
2098           }
2099           String msg = "Not flushing since "
2100               + (writestate.flushing ? "already flushing"
2101               : "writes not enabled");
2102           status.abort(msg);
2103           return new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false);
2104         }
2105       }
2106 
2107       try {
2108         Collection<Store> specificStoresToFlush =
2109             forceFlushAllStores ? stores.values() : flushPolicy.selectStoresToFlush();
2110         FlushResult fs = internalFlushcache(specificStoresToFlush,
2111           status, writeFlushRequestWalMarker);
2112 
2113         if (coprocessorHost != null) {
2114           status.setStatus("Running post-flush coprocessor hooks");
2115           coprocessorHost.postFlush();
2116         }
2117 
2118         status.markComplete("Flush successful");
2119         return fs;
2120       } finally {
2121         synchronized (writestate) {
2122           writestate.flushing = false;
2123           this.writestate.flushRequested = false;
2124           writestate.notifyAll();
2125         }
2126       }
2127     } finally {
2128       lock.readLock().unlock();
2129       status.cleanup();
2130     }
2131   }
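       // Caller sketch: in-process callers (tests, flush handlers) normally go through the
       // flush(boolean) wrapper above. The FlushResult.Result values are the ones used in this
       // file; getResult() is assumed to expose the Result the flush finished with:
       //
       //   FlushResult fr = region.flush(true); // force-flush all stores
       //   if (fr.getResult() == FlushResult.Result.CANNOT_FLUSH) {
       //     // region was closing/closed or writes were disabled; nothing was flushed
       //   }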
2132 
2133   /**
2134    * Should the store be flushed because it is old enough.
2135    * <p>
2136    * Every FlushPolicy should call this to determine whether a store is old enough to flush (unless
2137    * the policy always flushes all stores). Otherwise the {@link #shouldFlush()} method would always
2138    * return true, which would generate a lot of flush requests.
2139    */
2140   boolean shouldFlushStore(Store store) {
2141     long maxFlushedSeqId =
2142         this.wal.getEarliestMemstoreSeqNum(getRegionInfo().getEncodedNameAsBytes(), store
2143             .getFamily().getName()) - 1;
2144     if (maxFlushedSeqId > 0 && maxFlushedSeqId + flushPerChanges < sequenceId.get()) {
2145       if (LOG.isDebugEnabled()) {
2146         LOG.debug("Column Family: " + store.getColumnFamilyName() + " of region " + this
2147             + " will be flushed because its max flushed seqId(" + maxFlushedSeqId
2148             + ") is far away from current(" + sequenceId.get() + "), max allowed is "
2149             + flushPerChanges);
2150       }
2151       return true;
2152     }
2153     if (flushCheckInterval <= 0) {
2154       return false;
2155     }
2156     long now = EnvironmentEdgeManager.currentTime();
2157     if (store.timeOfOldestEdit() < now - flushCheckInterval) {
2158       if (LOG.isDebugEnabled()) {
2159         LOG.debug("Column Family: " + store.getColumnFamilyName() + " of region " + this
2160             + " will be flushed because time of its oldest edit (" + store.timeOfOldestEdit()
2161             + ") is far away from now(" + now + "), max allowed is " + flushCheckInterval);
2162       }
2163       return true;
2164     }
2165     return false;
2166   }
2167 
2168   /**
2169    * Should the memstore be flushed now
2170    */
2171   boolean shouldFlush() {
2172     // This is a rough measure.
2173     if (this.maxFlushedSeqId > 0
2174           && (this.maxFlushedSeqId + this.flushPerChanges < this.sequenceId.get())) {
2175       return true;
2176     }
2177     long modifiedFlushCheckInterval = flushCheckInterval;
2178     if (getRegionInfo().isMetaRegion() &&
2179         getRegionInfo().getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID) {
2180       modifiedFlushCheckInterval = META_CACHE_FLUSH_INTERVAL;
2181     }
2182     if (modifiedFlushCheckInterval <= 0) { //disabled
2183       return false;
2184     }
2185     long now = EnvironmentEdgeManager.currentTime();
2186     // if we flushed in the recent past, we don't need to do it again now
2187     if ((now - getEarliestFlushTimeForAllStores() < modifiedFlushCheckInterval)) {
2188       return false;
2189     }
2190     //since we didn't flush in the recent past, flush now if certain conditions
2191     //are met. Return true on first such memstore hit.
2192     for (Store s : getStores()) {
2193       if (s.timeOfOldestEdit() < now - modifiedFlushCheckInterval) {
2194         // we have an old enough edit in the memstore, flush
2195         return true;
2196       }
2197     }
2198     return false;
2199   }
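       // Worked example of the time-based check above, assuming flushCheckInterval defaults to
       // DEFAULT_CACHE_FLUSH_INTERVAL (3600000 ms): a store whose oldest edit is more than an hour
       // old makes shouldFlush() return true, while the meta region's default replica uses the
       // tighter META_CACHE_FLUSH_INTERVAL (300000 ms, i.e. 5 minutes).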
2200 
2201   /**
2202    * Flushing all stores.
2203    *
2204    * @see #internalFlushcache(Collection, MonitoredTask, boolean)
2205    */
2206   private FlushResult internalFlushcache(MonitoredTask status)
2207       throws IOException {
2208     return internalFlushcache(stores.values(), status, false);
2209   }
2210 
2211   /**
2212    * Flushing given stores.
2213    *
2214    * @see #internalFlushcache(WAL, long, Collection, MonitoredTask, boolean)
2215    */
2216   private FlushResult internalFlushcache(final Collection<Store> storesToFlush,
2217       MonitoredTask status, boolean writeFlushWalMarker) throws IOException {
2218     return internalFlushcache(this.wal, HConstants.NO_SEQNUM, storesToFlush,
2219         status, writeFlushWalMarker);
2220   }
2221 
2222   /**
2223    * Flush the memstore. Flushing the memstore is a little tricky. We have a lot
2224    * of updates in the memstore, all of which have also been written to the wal.
2225    * We need to write those updates in the memstore out to disk, while being
2226    * able to process reads/writes as much as possible during the flush
2227    * operation.
2228    * <p>
2229    * This method may block for some time. Every time you call it, we up the
2230    * region's sequence id even if we don't flush; i.e. the returned sequence id
2231    * will be at least one larger than the last edit applied to this region. The
2232    * returned id does not refer to an actual edit. The returned id can be used
2233    * for, say, installing a bulk loaded file just ahead of the last hfile that was
2234    * the result of this flush, etc.
2235    *
2236    * @param wal
2237    *          Null if we're NOT to go via wal.
2238    * @param myseqid
2239    *          The seqid to use if <code>wal</code> is null writing out flush
2240    *          file.
2241    * @param storesToFlush
2242    *          The list of stores to flush.
2243    * @return object describing the flush's state
2244    * @throws IOException
2245    *           general io exceptions
2246    * @throws DroppedSnapshotException
2247    *           Thrown when replay of wal is required because a Snapshot was not
2248    *           properly persisted.
2249    */
2250   protected FlushResult internalFlushcache(final WAL wal, final long myseqid,
2251       final Collection<Store> storesToFlush, MonitoredTask status, boolean writeFlushWalMarker)
2252           throws IOException {
2253     PrepareFlushResult result
2254       = internalPrepareFlushCache(wal, myseqid, storesToFlush, status, writeFlushWalMarker);
2255     if (result.result == null) {
2256       return internalFlushCacheAndCommit(wal, status, result, storesToFlush);
2257     } else {
2258       return result.result; // early exit due to failure from prepare stage
2259     }
2260   }
2261 
2262   protected PrepareFlushResult internalPrepareFlushCache(
2263       final WAL wal, final long myseqid, final Collection<Store> storesToFlush,
2264       MonitoredTask status, boolean writeFlushWalMarker)
2265           throws IOException {
2266 
2267     if (this.rsServices != null && this.rsServices.isAborted()) {
2268       // Don't flush when server aborting, it's unsafe
2269       throw new IOException("Aborting flush because server is aborted...");
2270     }
2271     final long startTime = EnvironmentEdgeManager.currentTime();
2272     // If nothing to flush, return, but we need to safely update the region sequence id
2273     if (this.memstoreSize.get() <= 0) {
2274       // Take an update lock because we are about to change the sequence id and we want the sequence id
2275       // to be at the border of the empty memstore.
2276       MultiVersionConsistencyControl.WriteEntry writeEntry = null;
2277       this.updatesLock.writeLock().lock();
2278       try {
2279         if (this.memstoreSize.get() <= 0) {
2280           // Presume that if there are still no edits in the memstore, then there are no edits for
2281           // this region out in the WAL subsystem so no need to do any trickery clearing out
2282           // edits in the WAL system. Up the sequence number so the resulting flush id is for
2283           // sure just beyond the last appended region edit (useful as a marker when bulk loading,
2284           // etc.)
2285           // wal can be null when replaying edits.
2286           if (wal != null) {
2287             writeEntry = mvcc.beginMemstoreInsert();
2288             long flushOpSeqId = getNextSequenceId(wal);
2289             FlushResult flushResult = new FlushResultImpl(
2290               FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, flushOpSeqId, "Nothing to flush",
2291               writeFlushRequestMarkerToWAL(wal, writeFlushWalMarker));
2292             writeEntry.setWriteNumber(flushOpSeqId);
2293             mvcc.waitForPreviousTransactionsComplete(writeEntry);
2294             writeEntry = null;
2295             return new PrepareFlushResult(flushResult, myseqid);
2296           } else {
2297             return new PrepareFlushResult(
2298               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
2299                 "Nothing to flush", false),
2300               myseqid);
2301           }
2302         }
2303       } finally {
2304         this.updatesLock.writeLock().unlock();
2305         if (writeEntry != null) {
2306           mvcc.advanceMemstore(writeEntry);
2307         }
2308       }
2309     }
2310 
2311     if (LOG.isInfoEnabled()) {
2312       LOG.info("Started memstore flush for " + this + ", current region memstore size "
2313           + StringUtils.byteDesc(this.memstoreSize.get()) + ", and " + storesToFlush.size() + "/"
2314           + stores.size() + " column families' memstores are being flushed."
2315           + ((wal != null) ? "" : "; wal is null, using passed sequenceid=" + myseqid));
2316       // only log when we are not flushing all stores.
2317       if (this.stores.size() > storesToFlush.size()) {
2318         for (Store store: storesToFlush) {
2319           LOG.info("Flushing Column Family: " + store.getColumnFamilyName()
2320               + " which was occupying "
2321               + StringUtils.byteDesc(store.getMemStoreSize()) + " of memstore.");
2322         }
2323       }
2324     }
2325     // Stop updates while we snapshot the memstores of all of this region's stores. We only have
2326     // to do this for a moment.  It is quick. We also set the memstore size to zero here before we
2327     // allow updates again so its value will represent the size of the updates received
2328     // during flush
2329     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
2330     // We have to take an update lock during snapshot, or else a write could end up in both snapshot
2331     // and memstore (which would make atomic row operations difficult)
2332     status.setStatus("Obtaining lock to block concurrent updates");
2333     // block waiting for the lock for internal flush
2334     this.updatesLock.writeLock().lock();
2335     status.setStatus("Preparing to flush by snapshotting stores in " +
2336       getRegionInfo().getEncodedName());
2337     long totalFlushableSizeOfFlushableStores = 0;
2338 
2339     Set<byte[]> flushedFamilyNames = new HashSet<byte[]>();
2340     for (Store store: storesToFlush) {
2341       flushedFamilyNames.add(store.getFamily().getName());
2342     }
2343 
2344     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
2345       = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
2346     TreeMap<byte[], List<Path>> committedFiles = new TreeMap<byte[], List<Path>>(
2347         Bytes.BYTES_COMPARATOR);
2348     TreeMap<byte[], Long> storeFlushableSize
2349         = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
2350     // The sequence id of this flush operation which is used to log FlushMarker and pass to
2351     // createFlushContext to use as the store file's sequence id.
2352     long flushOpSeqId = HConstants.NO_SEQNUM;
2353     // The max flushed sequence id after this flush operation. Used as completeSequenceId which is
2354     // passed to HMaster.
2355     long flushedSeqId = HConstants.NO_SEQNUM;
2356     byte[] encodedRegionName = getRegionInfo().getEncodedNameAsBytes();
2357 
2358     long trxId = 0;
2359     try {
2360       try {
2361         mvcc.waitForPreviousTransactionsComplete();
2362         writeEntry = mvcc.beginMemstoreInsert();
2363         if (wal != null) {
2364           Long earliestUnflushedSequenceIdForTheRegion =
2365               wal.startCacheFlush(encodedRegionName, flushedFamilyNames);
2366           if (earliestUnflushedSequenceIdForTheRegion == null) {
2367             // This should never happen. This is how startCacheFlush signals flush cannot proceed.
2368             String msg = this.getRegionInfo().getEncodedName() + " flush aborted; WAL closing.";
2369             status.setStatus(msg);
2370             return new PrepareFlushResult(
2371               new FlushResultImpl(FlushResult.Result.CANNOT_FLUSH, msg, false),
2372               myseqid);
2373           }
2374           flushOpSeqId = getNextSequenceId(wal);
2375           // Back up 1, minus 1 from oldest sequence id in memstore to get last 'flushed' edit
2376           flushedSeqId =
2377             earliestUnflushedSequenceIdForTheRegion.longValue() == HConstants.NO_SEQNUM?
2378               flushOpSeqId: earliestUnflushedSequenceIdForTheRegion.longValue() - 1;
2379         } else {
2380           // use the provided sequence Id as WAL is not being used for this flush.
2381           flushedSeqId = flushOpSeqId = myseqid;
2382         }
2383 
2384         for (Store s : storesToFlush) {
2385           totalFlushableSizeOfFlushableStores += s.getFlushableSize();
2386           storeFlushCtxs.put(s.getFamily().getName(), s.createFlushContext(flushOpSeqId));
2387           committedFiles.put(s.getFamily().getName(), null); // for writing stores to WAL
2388           storeFlushableSize.put(s.getFamily().getName(), s.getFlushableSize());
2389         }
2390 
2391         // write the snapshot start to WAL
2392         if (wal != null && !writestate.readOnly) {
2393           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.START_FLUSH,
2394             getRegionInfo(), flushOpSeqId, committedFiles);
2395           // no sync. Sync is below where we do not hold the updates lock
2396           trxId = WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2397             desc, sequenceId, false);
2398         }
2399 
2400         // Prepare flush (take a snapshot)
2401         for (StoreFlushContext flush : storeFlushCtxs.values()) {
2402           flush.prepare();
2403         }
2404       } catch (IOException ex) {
2405         if (wal != null) {
2406           if (trxId > 0) { // check whether we have already written START_FLUSH to WAL
2407             try {
2408               FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2409                 getRegionInfo(), flushOpSeqId, committedFiles);
2410               WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2411                 desc, sequenceId, false);
2412             } catch (Throwable t) {
2413               LOG.warn("Received unexpected exception trying to write ABORT_FLUSH marker to WAL:" +
2414                   StringUtils.stringifyException(t));
2415               // ignore this since we will be aborting the RS with DSE.
2416             }
2417           }
2418           // we have called wal.startCacheFlush(), now we have to abort it
2419           wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2420           throw ex; // let upper layers deal with it.
2421         }
2422       } finally {
2423         this.updatesLock.writeLock().unlock();
2424       }
2425       String s = "Finished memstore snapshotting " + this +
2426         ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSizeOfFlushableStores;
2427       status.setStatus(s);
2428       if (LOG.isTraceEnabled()) LOG.trace(s);
2429       // sync unflushed WAL changes
2430       // see HBASE-8208 for details
2431       if (wal != null) {
2432         try {
2433           wal.sync(); // ensure that flush marker is sync'ed
2434         } catch (IOException ioe) {
2435           wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2436           throw ioe;
2437         }
2438       }
2439 
2440       // wait for all in-progress transactions to commit to WAL before
2441       // we can start the flush. This prevents
2442       // uncommitted transactions from being written into HFiles.
2443       // We have to block before we start the flush, otherwise keys that
2444       // were removed via a rollbackMemstore could be written to Hfiles.
2445       writeEntry.setWriteNumber(flushOpSeqId);
2446       mvcc.waitForPreviousTransactionsComplete(writeEntry);
2447       // set w to null to prevent mvcc.advanceMemstore from being called again inside finally block
2448       writeEntry = null;
2449     } finally {
2450       if (writeEntry != null) {
2451         // in case of failure just mark current writeEntry as complete
2452         mvcc.advanceMemstore(writeEntry);
2453       }
2454     }
2455     return new PrepareFlushResult(storeFlushCtxs, committedFiles, storeFlushableSize, startTime,
2456       flushOpSeqId, flushedSeqId, totalFlushableSizeOfFlushableStores);
2457   }
2458 
2459   /**
2460    * Writes a marker to WAL indicating a flush is requested but cannot be completed for various
2461    * reasons. Ignores exceptions from WAL. Returns whether the write succeeded.
2462    * @param wal
2463    * @return whether WAL write was successful
2464    */
2465   private boolean writeFlushRequestMarkerToWAL(WAL wal, boolean writeFlushWalMarker) {
2466     if (writeFlushWalMarker && wal != null && !writestate.readOnly) {
2467       FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.CANNOT_FLUSH,
2468         getRegionInfo(), -1, new TreeMap<byte[], List<Path>>());
2469       try {
2470         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2471           desc, sequenceId, true);
2472         return true;
2473       } catch (IOException e) {
2474         LOG.warn(getRegionInfo().getEncodedName() + " : "
2475             + "Received exception while trying to write the flush request to wal", e);
2476       }
2477     }
2478     return false;
2479   }
2480 
2481   protected FlushResult internalFlushCacheAndCommit(
2482         final WAL wal, MonitoredTask status, final PrepareFlushResult prepareResult,
2483         final Collection<Store> storesToFlush)
2484     throws IOException {
2485 
2486     // prepare flush context is carried via PrepareFlushResult
2487     TreeMap<byte[], StoreFlushContext> storeFlushCtxs = prepareResult.storeFlushCtxs;
2488     TreeMap<byte[], List<Path>> committedFiles = prepareResult.committedFiles;
2489     long startTime = prepareResult.startTime;
2490     long flushOpSeqId = prepareResult.flushOpSeqId;
2491     long flushedSeqId = prepareResult.flushedSeqId;
2492     long totalFlushableSizeOfFlushableStores = prepareResult.totalFlushableSize;
2493 
2494     String s = "Flushing stores of " + this;
2495     status.setStatus(s);
2496     if (LOG.isTraceEnabled()) LOG.trace(s);
2497 
2498     // Any failure from here on out will be catastrophic, requiring a server
2499     // restart so wal content can be replayed and put back into the memstore.
2500     // Otherwise, the snapshot content, while backed up in the wal, will not
2501     // be part of the running server's state.
2502     boolean compactionRequested = false;
2503     long flushedOutputFileSize = 0;
2504     try {
2505       // A.  Flush memstore to all the HStores.
2506       // Keep running vector of all store files that includes both old and the
2507       // just-made new flush store file. The new flushed file is still in the
2508       // tmp directory.
2509 
2510       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2511         flush.flushCache(status);
2512       }
2513 
2514       // Switch snapshot (in memstore) -> new hfile (thus causing
2515       // all the store scanners to reset/reseek).
2516       Iterator<Store> it = storesToFlush.iterator();
2517       // stores.values() and storeFlushCtxs have same order
2518       for (StoreFlushContext flush : storeFlushCtxs.values()) {
2519         boolean needsCompaction = flush.commit(status);
2520         if (needsCompaction) {
2521           compactionRequested = true;
2522         }
2523         byte[] storeName = it.next().getFamily().getName();
2524         List<Path> storeCommittedFiles = flush.getCommittedFiles();
2525         committedFiles.put(storeName, storeCommittedFiles);
2526         // The flush committed no files, indicating the flush was empty or was canceled
2527         if (storeCommittedFiles == null || storeCommittedFiles.isEmpty()) {
2528           totalFlushableSizeOfFlushableStores -= prepareResult.storeFlushableSize.get(storeName);
2529         }
2530         flushedOutputFileSize += flush.getOutputFileSize();
2531       }
2532       storeFlushCtxs.clear();
2533 
2534       // Set down the memstore size by amount of flush.
2535       this.addAndGetGlobalMemstoreSize(-totalFlushableSizeOfFlushableStores);
2536 
2537       if (wal != null) {
2538         // write flush marker to WAL. If fail, we should throw DroppedSnapshotException
2539         FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.COMMIT_FLUSH,
2540           getRegionInfo(), flushOpSeqId, committedFiles);
2541         WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2542           desc, sequenceId, true);
2543       }
2544     } catch (Throwable t) {
2545       // An exception here means that the snapshot was not persisted.
2546       // The wal needs to be replayed so its content is restored to memstore.
2547       // Currently, only a server restart will do this.
2548       // We used to only catch IOEs but it's possible that we'd get other
2549       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
2550       // all and sundry.
2551       if (wal != null) {
2552         try {
2553           FlushDescriptor desc = ProtobufUtil.toFlushDescriptor(FlushAction.ABORT_FLUSH,
2554             getRegionInfo(), flushOpSeqId, committedFiles);
2555           WALUtil.writeFlushMarker(wal, this.htableDescriptor, getRegionInfo(),
2556             desc, sequenceId, false);
2557         } catch (Throwable ex) {
2558           LOG.warn(getRegionInfo().getEncodedName() + " : "
2559               + "Received unexpected exception trying to write ABORT_FLUSH marker to WAL:"
2560               + StringUtils.stringifyException(ex));
2561           // ignore this since we will be aborting the RS with DSE.
2562         }
2563         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2564       }
2565       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
2566           Bytes.toStringBinary(getRegionInfo().getRegionName()));
2567       dse.initCause(t);
2568       status.abort("Flush failed: " + StringUtils.stringifyException(t));
2569 
2570       // Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
2571       // However, since we may have the region read lock, we cannot call close(true) here since
2572       // we cannot promote to a write lock. Instead we are setting closing so that all other region
2573       // operations except for close will be rejected.
2574       this.closing.set(true);
2575 
2576       if (rsServices != null) {
2577         // This is a safeguard against the case where the caller fails to explicitly handle aborting
2578         rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
2579       }
2580 
2581       throw dse;
2582     }
2583 
2584     // If we get to here, the HStores have been written.
2585     if (wal != null) {
2586       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
2587     }
2588 
2589     // Record latest flush time
2590     for (Store store: storesToFlush) {
2591       this.lastStoreFlushTimeMap.put(store, startTime);
2592     }
2593 
2594     // Update the oldest unflushed sequence id for region.
2595     this.maxFlushedSeqId = flushedSeqId;
2596 
2597     // Record flush operation sequence id.
2598     this.lastFlushOpSeqId = flushOpSeqId;
2599 
2600     // C. Finally notify anyone waiting on memstore to clear:
2601     // e.g. checkResources().
2602     synchronized (this) {
2603       notifyAll(); // FindBugs NN_NAKED_NOTIFY
2604     }
2605 
2606     long time = EnvironmentEdgeManager.currentTime() - startTime;
2607     long memstoresize = this.memstoreSize.get();
2608     String msg = "Finished memstore flush of ~"
2609         + StringUtils.byteDesc(totalFlushableSizeOfFlushableStores) + "/"
2610         + totalFlushableSizeOfFlushableStores + ", currentsize="
2611         + StringUtils.byteDesc(memstoresize) + "/" + memstoresize
2612         + " for region " + this + " in " + time + "ms, sequenceid="
2613         + flushOpSeqId +  ", compaction requested=" + compactionRequested
2614         + ((wal == null) ? "; wal=null" : "");
2615     LOG.info(msg);
2616     status.setStatus(msg);
2617 
2618     if (rsServices != null && rsServices.getMetrics() != null) {
2619       rsServices.getMetrics().updateFlush(
2620         getTableDesc().getTableName().getNameAsString(),
2621         time, // 'time' already holds the elapsed flush duration in ms
2622         totalFlushableSizeOfFlushableStores, flushedOutputFileSize);
2623     }
2624 
2625     return new FlushResultImpl(compactionRequested ?
2626         FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
2627           FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushOpSeqId);
2628   }
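  // A minimal caller-side sketch of how the DroppedSnapshotException thrown above is meant to be
  // handled: the memstore contents can only be recovered by replaying the wal, so the only safe
  // reaction is to abort the region server. 'server' stands in for an assumed
  // RegionServerServices-style handle and 'region.flushcache()' for the flush entry point
  // referred to in the comments above.
  //
  //   try {
  //     region.flushcache();
  //   } catch (DroppedSnapshotException dse) {
  //     server.abort("Replay of WAL required. Forcing server shutdown", dse);
  //   }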
2629 
2630   /**
2631    * Method to safely get the next sequence number.
2632    * @return Next sequence number unassociated with any actual edit.
2633    * @throws IOException
2634    */
2635   @VisibleForTesting
2636   protected long getNextSequenceId(final WAL wal) throws IOException {
2637     // TODO: For review. Putting an empty edit in to get a sequenceid out will not work if the
2638     // WAL is banjaxed... if it has gotten an exception and the WAL has not yet been rolled or
2639     // aborted. In this case, we'll just get stuck here. For now, until HBASE-12751, just have
2640     // a timeout. May happen in tests after we tightened the semantic via HBASE-14317.
2641     // Also, getSequenceId blocks on a latch; there is no global list of outstanding latches.
2642     WALKey key = this.appendEmptyEdit(wal, null);
2643     return key.getSequenceId(maxWaitForSeqId);
2644   }
2645 
2646   //////////////////////////////////////////////////////////////////////////////
2647   // get() methods for client use.
2648   //////////////////////////////////////////////////////////////////////////////
2649 
2650   @Override
2651   public Result getClosestRowBefore(final byte [] row, final byte [] family) throws IOException {
2652     if (coprocessorHost != null) {
2653       Result result = new Result();
2654       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
2655         return result;
2656       }
2657     }
2658     // look across all the HStores for this region and determine what the
2659     // closest key is across all column families, since the data may be sparse
2660     checkRow(row, "getClosestRowBefore");
2661     startRegionOperation(Operation.GET);
2662     this.readRequestsCount.increment();
2663     try {
2664       Store store = getStore(family);
2665       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
2666       Cell key = store.getRowKeyAtOrBefore(row);
2667       Result result = null;
2668       if (key != null) {
2669         Get get = new Get(CellUtil.cloneRow(key));
2670         get.addFamily(family);
2671         result = get(get);
2672       }
2673       if (coprocessorHost != null) {
2674         coprocessorHost.postGetClosestRowBefore(row, family, result);
2675       }
2676       return result;
2677     } finally {
2678       closeRegionOperation(Operation.GET);
2679     }
2680   }
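  // A small worked example of getClosestRowBefore above, assuming hypothetical rows "r10" and
  // "r30" with data in a family named 'cf':
  //
  //   Result r = region.getClosestRowBefore(Bytes.toBytes("r20"), Bytes.toBytes("cf"));
  //   // r contains the cells of row "r10" for family 'cf'; it is null when no row at or
  //   // before the requested row exists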
2681 
2682   @Override
2683   public RegionScanner getScanner(Scan scan) throws IOException {
2684    return getScanner(scan, null);
2685   }
2686 
2687   protected RegionScanner getScanner(Scan scan,
2688       List<KeyValueScanner> additionalScanners) throws IOException {
2689     startRegionOperation(Operation.SCAN);
2690     try {
2691       // Verify families are all valid
2692       if (!scan.hasFamilies()) {
2693         // Adding all families to scanner
2694         for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
2695           scan.addFamily(family);
2696         }
2697       } else {
2698         for (byte [] family : scan.getFamilyMap().keySet()) {
2699           checkFamily(family);
2700         }
2701       }
2702       return instantiateRegionScanner(scan, additionalScanners);
2703     } finally {
2704       closeRegionOperation(Operation.SCAN);
2705     }
2706   }
2707 
2708   protected RegionScanner instantiateRegionScanner(Scan scan,
2709       List<KeyValueScanner> additionalScanners) throws IOException {
2710     if (scan.isReversed()) {
2711       if (scan.getFilter() != null) {
2712         scan.getFilter().setReversed(true);
2713       }
2714       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2715     }
2716     return new RegionScannerImpl(scan, additionalScanners, this);
2717   }
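  // A minimal usage sketch for the scanner path above, assuming a 'region' handle and a family
  // named 'cf'; the reversed case is routed to ReversedRegionScannerImpl as shown in
  // instantiateRegionScanner.
  //
  //   Scan scan = new Scan();
  //   scan.addFamily(Bytes.toBytes("cf"));
  //   scan.setReversed(true);                      // handled by instantiateRegionScanner
  //   RegionScanner scanner = region.getScanner(scan);
  //   try {
  //     List<Cell> cells = new ArrayList<Cell>();
  //     boolean moreRows;
  //     do {
  //       cells.clear();
  //       moreRows = scanner.next(cells);          // fills 'cells' with the next row's cells
  //     } while (moreRows);
  //   } finally {
  //     scanner.close();
  //   }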
2718 
2719   @Override
2720   public void prepareDelete(Delete delete) throws IOException {
2721     // Check to see if this is a whole-row delete (no families specified)
2722     if(delete.getFamilyCellMap().isEmpty()){
2723       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2724         // Don't eat the timestamp
2725         delete.addFamily(family, delete.getTimeStamp());
2726       }
2727     } else {
2728       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2729         if(family == null) {
2730           throw new NoSuchColumnFamilyException("Empty family is invalid");
2731         }
2732         checkFamily(family);
2733       }
2734     }
2735   }
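  // Illustrative sketch, assuming a hypothetical row key: prepareDelete above turns a Delete that
  // names no families into a whole-row delete by adding every family of the table, keeping the
  // Delete's own timestamp.
  //
  //   Delete d = new Delete(Bytes.toBytes("row1"));   // no families specified
  //   region.prepareDelete(d);
  //   // d now carries a family-delete marker for each column family, stamped with d.getTimeStamp()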
2736 
2737   @Override
2738   public void delete(Delete delete) throws IOException {
2739     checkReadOnly();
2740     checkResources();
2741     startRegionOperation(Operation.DELETE);
2742     try {
2743       delete.getRow();
2744       // All edits for the given row (across all column families) must happen atomically.
2745       doBatchMutate(delete);
2746     } finally {
2747       closeRegionOperation(Operation.DELETE);
2748     }
2749   }
2750 
2751   /**
2752    * Row key needed by the method below.
2753    */
2754   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2755 
2756   /**
2757    * This is used only by unit tests. Not required to be a public API.
2758    * @param familyMap map of family to edits for the given family.
2759    * @throws IOException
2760    */
2761   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2762       Durability durability) throws IOException {
2763     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2764     delete.setFamilyCellMap(familyMap);
2765     delete.setDurability(durability);
2766     doBatchMutate(delete);
2767   }
2768 
2769   @Override
2770   public void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2771       byte[] byteNow) throws IOException {
2772     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2773 
2774       byte[] family = e.getKey();
2775       List<Cell> cells = e.getValue();
2776       assert cells instanceof RandomAccess;
2777 
2778       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2779       int listSize = cells.size();
2780       for (int i=0; i < listSize; i++) {
2781         Cell cell = cells.get(i);
2782         //  Check if time is LATEST, change to time of most recent addition if so
2783         //  This is expensive.
2784         if (cell.getTimestamp() == HConstants.LATEST_TIMESTAMP && CellUtil.isDeleteType(cell)) {
2785           byte[] qual = CellUtil.cloneQualifier(cell);
2786           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2787 
2788           Integer count = kvCount.get(qual);
2789           if (count == null) {
2790             kvCount.put(qual, 1);
2791           } else {
2792             kvCount.put(qual, count + 1);
2793           }
2794           count = kvCount.get(qual);
2795 
2796           Get get = new Get(CellUtil.cloneRow(cell));
2797           get.setMaxVersions(count);
2798           get.addColumn(family, qual);
2799           if (coprocessorHost != null) {
2800             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2801                 byteNow, get)) {
2802               updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2803             }
2804           } else {
2805             updateDeleteLatestVersionTimeStamp(cell, get, count, byteNow);
2806           }
2807         } else {
2808           CellUtil.updateLatestStamp(cell, byteNow, 0);
2809         }
2810       }
2811     }
2812   }
2813 
2814   void updateDeleteLatestVersionTimeStamp(Cell cell, Get get, int count, byte[] byteNow)
2815       throws IOException {
2816     List<Cell> result = get(get, false);
2817 
2818     if (result.size() < count) {
2819       // Nothing to delete
2820       CellUtil.updateLatestStamp(cell, byteNow, 0);
2821       return;
2822     }
2823     if (result.size() > count) {
2824       throw new RuntimeException("Unexpected size: " + result.size());
2825     }
2826     Cell getCell = result.get(count - 1);
2827     CellUtil.setTimestamp(cell, getCell.getTimestamp());
2828   }
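  // Worked example for the two methods above, assuming hypothetical existing cells: a delete of
  // the "latest version" of a column that carries no explicit timestamp is pinned to the
  // timestamp of the newest existing cell, so it cannot also mask writes that arrive later.
  //
  //   // existing cells for row1, cf:q  ->  ts=100 ("v1") and ts=200 ("v2")
  //   Delete d = new Delete(Bytes.toBytes("row1"));
  //   d.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"));   // no timestamp: LATEST_TIMESTAMP
  //   // prepareDeleteTimestamps issues a Get for the newest version and rewrites the delete
  //   // marker's timestamp to 200, so only "v2" is masked; a later put at ts=300 stays visible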
2829 
2830   @Override
2831   public void put(Put put) throws IOException {
2832     checkReadOnly();
2833 
2834     // Do a rough check that we have resources to accept a write.  The check is
2835     // 'rough' in that between the resource check and the call to obtain a
2836     // read lock, resources may run out.  For now, the thought is that this
2837     // will be extremely rare; we'll deal with it when it happens.
2838     checkResources();
2839     startRegionOperation(Operation.PUT);
2840     try {
2841       // All edits for the given row (across all column families) must happen atomically.
2842       doBatchMutate(put);
2843     } finally {
2844       closeRegionOperation(Operation.PUT);
2845     }
2846   }
2847 
2848   /**
2849    * Struct-like class that tracks the progress of a batch operation,
2850    * accumulating status codes and tracking the index at which processing
2851    * is proceeding.
2852    */
2853   private abstract static class BatchOperationInProgress<T> {
2854     T[] operations;
2855     int nextIndexToProcess = 0;
2856     OperationStatus[] retCodeDetails;
2857     WALEdit[] walEditsFromCoprocessors;
2858 
2859     public BatchOperationInProgress(T[] operations) {
2860       this.operations = operations;
2861       this.retCodeDetails = new OperationStatus[operations.length];
2862       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2863       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2864     }
2865 
2866     public abstract Mutation getMutation(int index);
2867     public abstract long getNonceGroup(int index);
2868     public abstract long getNonce(int index);
2869     /** This method is potentially expensive and should only be used for non-replay CP path. */
2870     public abstract Mutation[] getMutationsForCoprocs();
2871     public abstract boolean isInReplay();
2872     public abstract long getReplaySequenceId();
2873 
2874     public boolean isDone() {
2875       return nextIndexToProcess == operations.length;
2876     }
2877   }
2878 
2879   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2880     private long nonceGroup;
2881     private long nonce;
2882     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2883       super(operations);
2884       this.nonceGroup = nonceGroup;
2885       this.nonce = nonce;
2886     }
2887 
2888     @Override
2889     public Mutation getMutation(int index) {
2890       return this.operations[index];
2891     }
2892 
2893     @Override
2894     public long getNonceGroup(int index) {
2895       return nonceGroup;
2896     }
2897 
2898     @Override
2899     public long getNonce(int index) {
2900       return nonce;
2901     }
2902 
2903     @Override
2904     public Mutation[] getMutationsForCoprocs() {
2905       return this.operations;
2906     }
2907 
2908     @Override
2909     public boolean isInReplay() {
2910       return false;
2911     }
2912 
2913     @Override
2914     public long getReplaySequenceId() {
2915       return 0;
2916     }
2917   }
2918 
2919   private static class ReplayBatch extends BatchOperationInProgress<MutationReplay> {
2920     private long replaySeqId = 0;
2921     public ReplayBatch(MutationReplay[] operations, long seqId) {
2922       super(operations);
2923       this.replaySeqId = seqId;
2924     }
2925 
2926     @Override
2927     public Mutation getMutation(int index) {
2928       return this.operations[index].mutation;
2929     }
2930 
2931     @Override
2932     public long getNonceGroup(int index) {
2933       return this.operations[index].nonceGroup;
2934     }
2935 
2936     @Override
2937     public long getNonce(int index) {
2938       return this.operations[index].nonce;
2939     }
2940 
2941     @Override
2942     public Mutation[] getMutationsForCoprocs() {
2943       assert false;
2944       throw new RuntimeException("Should not be called for replay batch");
2945     }
2946 
2947     @Override
2948     public boolean isInReplay() {
2949       return true;
2950     }
2951 
2952     @Override
2953     public long getReplaySequenceId() {
2954       return this.replaySeqId;
2955     }
2956   }
2957 
2958   @Override
2959   public OperationStatus[] batchMutate(Mutation[] mutations, long nonceGroup, long nonce)
2960       throws IOException {
2961     // As it stands, this is used for two things:
2962     //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
2963     //  * coprocessor calls (see ex. BulkDeleteEndpoint).
2964     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2965     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2966   }
2967 
2968   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2969     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2970   }
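  // A minimal caller-side sketch for the batchMutate() overloads above, assuming a 'region'
  // handle; family, qualifier and row names are placeholders.
  //
  //   Put put = new Put(Bytes.toBytes("row1"));
  //   put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
  //   Delete delete = new Delete(Bytes.toBytes("row2"));
  //   OperationStatus[] statuses = region.batchMutate(new Mutation[] { put, delete });
  //   for (OperationStatus status : statuses) {
  //     if (status.getOperationStatusCode() != OperationStatusCode.SUCCESS) {
  //       // e.g. BAD_FAMILY or SANITY_CHECK_FAILURE; see doMiniBatchMutation below
  //     }
  //   }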
2971 
2972   @Override
2973   public OperationStatus[] batchReplay(MutationReplay[] mutations, long replaySeqId)
2974       throws IOException {
2975     if (!RegionReplicaUtil.isDefaultReplica(getRegionInfo())
2976         && replaySeqId < lastReplayedOpenRegionSeqId) {
2977       // if it is a secondary replica we should ignore these entries silently
2978       // since they are coming out of order
2979       if (LOG.isTraceEnabled()) {
2980         LOG.trace(getRegionInfo().getEncodedName() + " : "
2981           + "Skipping " + mutations.length + " mutations with replaySeqId=" + replaySeqId
2982           + " which is < than lastReplayedOpenRegionSeqId=" + lastReplayedOpenRegionSeqId);
2983         for (MutationReplay mut : mutations) {
2984           LOG.trace(getRegionInfo().getEncodedName() + " : Skipping : " + mut.mutation);
2985         }
2986       }
2987 
2988       OperationStatus[] statuses = new OperationStatus[mutations.length];
2989       for (int i = 0; i < statuses.length; i++) {
2990         statuses[i] = OperationStatus.SUCCESS;
2991       }
2992       return statuses;
2993     }
2994     return batchMutate(new ReplayBatch(mutations, replaySeqId));
2995   }
2996 
2997   /**
2998    * Perform a batch of mutations.
2999    * It supports only Put and Delete mutations; any other mutation type is marked as a failure and skipped.
3000    * @param batchOp contains the list of mutations
3001    * @return an array of OperationStatus which internally contains the
3002    *         OperationStatusCode and the exceptionMessage if any.
3003    * @throws IOException
3004    */
3005   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
3006     boolean initialized = false;
3007     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
3008     startRegionOperation(op);
3009     try {
3010       while (!batchOp.isDone()) {
3011         if (!batchOp.isInReplay()) {
3012           checkReadOnly();
3013         }
3014         checkResources();
3015 
3016         if (!initialized) {
3017           this.writeRequestsCount.add(batchOp.operations.length);
3018           if (!batchOp.isInReplay()) {
3019             doPreMutationHook(batchOp);
3020           }
3021           initialized = true;
3022         }
3023         doMiniBatchMutation(batchOp);
3024         long newSize = this.getMemstoreSize();
3025         if (isFlushSize(newSize)) {
3026           requestFlush();
3027         }
3028       }
3029     } finally {
3030       closeRegionOperation(op);
3031     }
3032     return batchOp.retCodeDetails;
3033   }
3034 
3035 
3036   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
3037       throws IOException {
3038     /* Run coprocessor pre hook outside of locks to avoid deadlock */
3039     WALEdit walEdit = new WALEdit();
3040     if (coprocessorHost != null) {
3041       for (int i = 0 ; i < batchOp.operations.length; i++) {
3042         Mutation m = batchOp.getMutation(i);
3043         if (m instanceof Put) {
3044           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
3045             // pre hook says skip this Put
3046             // mark as success and skip in doMiniBatchMutation
3047             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3048           }
3049         } else if (m instanceof Delete) {
3050           Delete curDel = (Delete) m;
3051           if (curDel.getFamilyCellMap().isEmpty()) {
3052             // handle deleting a row case
3053             prepareDelete(curDel);
3054           }
3055           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
3056             // pre hook says skip this Delete
3057             // mark as success and skip in doMiniBatchMutation
3058             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3059           }
3060         } else {
3061           // If a mutation other than a Put or Delete (e.g. an Append) is passed in with the
3062           // Puts and Deletes in batchMutate, mark its return code as a failure so that it
3063           // will not be considered in doMiniBatchMutation.
3064           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
3065               "Put/Delete mutations only supported in batchMutate() now");
3066         }
3067         if (!walEdit.isEmpty()) {
3068           batchOp.walEditsFromCoprocessors[i] = walEdit;
3069           walEdit = new WALEdit();
3070         }
3071       }
3072     }
3073   }
3074 
3075   @SuppressWarnings("unchecked")
3076   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
3077     boolean isInReplay = batchOp.isInReplay();
3078     // variable to note if all Put items are for the same CF -- metrics related
3079     boolean putsCfSetConsistent = true;
3080     //The set of columnFamilies first seen for Put.
3081     Set<byte[]> putsCfSet = null;
3082     // variable to note if all Delete items are for the same CF -- metrics related
3083     boolean deletesCfSetConsistent = true;
3084     //The set of columnFamilies first seen for Delete.
3085     Set<byte[]> deletesCfSet = null;
3086 
3087     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
3088     WALEdit walEdit = new WALEdit(isInReplay);
3089     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
3090     long txid = 0;
3091     boolean doRollBackMemstore = false;
3092     boolean locked = false;
3093 
3094     /** Keep track of the locks we hold so we can release them in finally clause */
3095     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
3096     // reference family maps directly so coprocessors can mutate them if desired
3097     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
3098     List<Cell> memstoreCells = new ArrayList<Cell>();
3099     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
3100     int firstIndex = batchOp.nextIndexToProcess;
3101     int lastIndexExclusive = firstIndex;
3102     boolean success = false;
3103     int noOfPuts = 0, noOfDeletes = 0;
3104     WALKey walKey = null;
3105     long mvccNum = 0;
3106     long addedSize = 0;
3107     final ObservedExceptionsInBatch observedExceptions = new ObservedExceptionsInBatch();
3108     try {
3109       // ------------------------------------
3110       // STEP 1. Try to acquire as many locks as we can, and ensure
3111       // we acquire at least one.
3112       // ----------------------------------
3113       int numReadyToWrite = 0;
3114       long now = EnvironmentEdgeManager.currentTime();
3115       while (lastIndexExclusive < batchOp.operations.length) {
3116         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
3117         boolean isPutMutation = mutation instanceof Put;
3118 
3119         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
3120         // store the family map reference to allow for mutations
3121         familyMaps[lastIndexExclusive] = familyMap;
3122 
3123         // skip anything that "ran" already
3124         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
3125             != OperationStatusCode.NOT_RUN) {
3126           lastIndexExclusive++;
3127           continue;
3128         }
3129 
3130         try {
3131           checkAndPrepareMutation(mutation, batchOp.isInReplay(), familyMap, now);
3132         } catch (NoSuchColumnFamilyException nscf) {
3133           final String msg = "No such column family in batch mutation. ";
3134           if (observedExceptions.hasSeenNoSuchFamily()) {
3135             LOG.warn(msg + nscf.getMessage());
3136           } else {
3137             LOG.warn(msg, nscf);
3138             observedExceptions.sawNoSuchFamily();
3139           }
3140           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3141               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
3142           lastIndexExclusive++;
3143           continue;
3144         } catch (FailedSanityCheckException fsce) {
3145           final String msg = "Batch Mutation did not pass sanity check. ";
3146           if (observedExceptions.hasSeenFailedSanityCheck()) {
3147             LOG.warn(msg + fsce.getMessage());
3148           } else {
3149             LOG.warn(msg, fsce);
3150             observedExceptions.sawFailedSanityCheck();
3151           }
3152           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3153               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
3154           lastIndexExclusive++;
3155           continue;
3156         } catch (WrongRegionException we) {
3157           final String msg = "Batch mutation had a row that does not belong to this region. ";
3158           if (observedExceptions.hasSeenWrongRegion()) {
3159             LOG.warn(msg + we.getMessage());
3160           } else {
3161             LOG.warn(msg, we);
3162             observedExceptions.sawWrongRegion();
3163           }
3164           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
3165               OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
3166           lastIndexExclusive++;
3167           continue;
3168         }
3169 
3170         // If we haven't got any rows in our batch, we should block to
3171         // get the next one.
3172         boolean shouldBlock = numReadyToWrite == 0;
3173         RowLock rowLock = null;
3174         try {
3175           rowLock = getRowLockInternal(mutation.getRow(), shouldBlock);
3176         } catch (IOException ioe) {
3177           LOG.warn("Failed getting lock in batch put, row="
3178             + Bytes.toStringBinary(mutation.getRow()), ioe);
3179         }
3180         if (rowLock == null) {
3181           // We failed to grab another lock
3182           break; // stop acquiring more rows for this batch
3183         } else {
3184           acquiredRowLocks.add(rowLock);
3185         }
3186 
3187         lastIndexExclusive++;
3188         numReadyToWrite++;
3189 
3190         if (isPutMutation) {
3191           // If column families stay consistent throughout all of the
3192           // individual puts then metrics can be reported as a multiput across
3193           // column families in the first put.
3194           if (putsCfSet == null) {
3195             putsCfSet = mutation.getFamilyCellMap().keySet();
3196           } else {
3197             putsCfSetConsistent = putsCfSetConsistent
3198                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
3199           }
3200         } else {
3201           if (deletesCfSet == null) {
3202             deletesCfSet = mutation.getFamilyCellMap().keySet();
3203           } else {
3204             deletesCfSetConsistent = deletesCfSetConsistent
3205                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
3206           }
3207         }
3208       }
3209 
3210       // we should record the timestamp only after we have acquired the rowLock,
3211       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
3212       now = EnvironmentEdgeManager.currentTime();
3213       byte[] byteNow = Bytes.toBytes(now);
3214 
3215       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
3216       if (numReadyToWrite <= 0) return 0L;
3217 
3218       // We've now grabbed as many mutations off the list as we can
3219 
3220       // ------------------------------------
3221       // STEP 2. Update any LATEST_TIMESTAMP timestamps
3222       // ----------------------------------
3223       for (int i = firstIndex; !isInReplay && i < lastIndexExclusive; i++) {
3224         // skip invalid
3225         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3226             != OperationStatusCode.NOT_RUN) continue;
3227 
3228         Mutation mutation = batchOp.getMutation(i);
3229         if (mutation instanceof Put) {
3230           updateCellTimestamps(familyMaps[i].values(), byteNow);
3231           noOfPuts++;
3232         } else {
3233           prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
3234           noOfDeletes++;
3235         }
3236         rewriteCellTags(familyMaps[i], mutation);
3237       }
3238 
3239       lock(this.updatesLock.readLock(), numReadyToWrite);
3240       locked = true;
3241       if(isInReplay) {
3242         mvccNum = batchOp.getReplaySequenceId();
3243       } else {
3244         mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
3245       }
3246       //
3247       // ------------------------------------
3248       // Acquire the latest mvcc number
3249       // ----------------------------------
3250       writeEntry = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
3251 
3252       // calling the pre CP hook for batch mutation
3253       if (!isInReplay && coprocessorHost != null) {
3254         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3255           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3256           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3257         if (coprocessorHost.preBatchMutate(miniBatchOp)) {
3258           return 0L;
3259         } else {
3260           for (int i = firstIndex; i < lastIndexExclusive; i++) {
3261             if (batchOp.retCodeDetails[i].getOperationStatusCode() != OperationStatusCode.NOT_RUN) {
3262               // lastIndexExclusive was incremented above.
3263               continue;
3264             }
3265             // we pass (i - firstIndex) below since the call expects a relative index
3266             Mutation[] cpMutations = miniBatchOp.getOperationsFromCoprocessors(i - firstIndex);
3267             if (cpMutations == null) {
3268               continue;
3269             }
3270             // Else Coprocessor added more Mutations corresponding to the Mutation at this index.
3271             for (int j = 0; j < cpMutations.length; j++) {
3272               Mutation cpMutation = cpMutations[j]; // use the array fetched above with the relative index
3273               Map<byte[], List<Cell>> cpFamilyMap = cpMutation.getFamilyCellMap();
3274               checkAndPrepareMutation(cpMutation, isInReplay, cpFamilyMap, now);
3275 
3276               // Acquire row locks. If not, the whole batch will fail.
3277               acquiredRowLocks.add(getRowLock(cpMutation.getRow(), true));
3278 
3279               if (cpMutation.getDurability() == Durability.SKIP_WAL) {
3280                 recordMutationWithoutWal(cpFamilyMap);
3281               }
3282 
3283               // Returned mutations from coprocessor correspond to the Mutation at index i. We can
3284               // directly add the cells from those mutations to the familyMaps of this mutation.
3285               mergeFamilyMaps(familyMaps[i], cpFamilyMap); // will get added to the memstore later
3286             }
3287           }
3288         }
3289       }
3290 
3291       // ------------------------------------
3292       // STEP 3. Write back to memstore
3293       // Write to memstore. It is ok to write to memstore
3294       // first without updating the WAL because we do not roll
3295       // forward the memstore MVCC. The MVCC will be moved up when
3296       // the complete operation is done. These changes are not yet
3297       // visible to scanners till we update the MVCC. The MVCC is
3298       // moved only when the sync is complete.
3299       // ----------------------------------
3300       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3301         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3302             != OperationStatusCode.NOT_RUN) {
3303           continue;
3304         }
3305         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
3306         addedSize += applyFamilyMapToMemstore(familyMaps[i], mvccNum, memstoreCells, isInReplay);
3307       }
3308 
3309       // ------------------------------------
3310       // STEP 4. Build WAL edit
3311       // ----------------------------------
3312       Durability durability = Durability.USE_DEFAULT;
3313       for (int i = firstIndex; i < lastIndexExclusive; i++) {
3314         // Skip puts that were determined to be invalid during preprocessing
3315         if (batchOp.retCodeDetails[i].getOperationStatusCode()
3316             != OperationStatusCode.NOT_RUN) {
3317           continue;
3318         }
3319         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
3320 
3321         Mutation m = batchOp.getMutation(i);
3322         Durability tmpDur = getEffectiveDurability(m.getDurability());
3323         if (tmpDur.ordinal() > durability.ordinal()) {
3324           durability = tmpDur;
3325         }
3326         if (tmpDur == Durability.SKIP_WAL) {
3327           recordMutationWithoutWal(m.getFamilyCellMap());
3328           continue;
3329         }
3330 
3331         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
3332         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
3333         // Given how nonces are originally written, these should be contiguous.
3334         // They don't have to be, it will still work, just write more WALEdits than needed.
3335         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
3336           if (walEdit.size() > 0) {
3337             assert isInReplay;
3338             if (!isInReplay) {
3339               throw new IOException("Multiple nonces per batch and not in replay");
3340             }
3341             // txid should always increase, so having the one from the last call is ok.
3342             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3343             walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3344               this.htableDescriptor.getTableName(), now, m.getClusterIds(),
3345               currentNonceGroup, currentNonce);
3346             txid = this.wal.append(this.htableDescriptor,  this.getRegionInfo(),  walKey,
3347               walEdit, getSequenceId(), true, null);
3348             walEdit = new WALEdit(isInReplay);
3349             walKey = null;
3350           }
3351           currentNonceGroup = nonceGroup;
3352           currentNonce = nonce;
3353         }
3354 
3355         // Add WAL edits by CP
3356         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
3357         if (fromCP != null) {
3358           for (Cell cell : fromCP.getCells()) {
3359             walEdit.add(cell);
3360           }
3361         }
3362         addFamilyMapToWALEdit(familyMaps[i], walEdit);
3363       }
3364 
3365       // -------------------------
3366       // STEP 5. Append the final edit to WAL. Do not sync wal.
3367       // -------------------------
3368       Mutation mutation = batchOp.getMutation(firstIndex);
3369       if (isInReplay) {
3370         // use wal key from the original
3371         walKey = new ReplayHLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3372           this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3373           mutation.getClusterIds(), currentNonceGroup, currentNonce);
3374         long replaySeqId = batchOp.getReplaySequenceId();
3375         walKey.setOrigLogSeqNum(replaySeqId);
3376 
3377         // ensure that the sequence id of the region is at least as big as orig log seq id
3378         while (true) {
3379           long seqId = getSequenceId().get();
3380           if (seqId >= replaySeqId) break;
3381           if (getSequenceId().compareAndSet(seqId, replaySeqId)) break;
3382         }
3383       }
3384       if (walEdit.size() > 0) {
3385         if (!isInReplay) {
3386           // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
3387           walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
3388               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
3389               mutation.getClusterIds(), currentNonceGroup, currentNonce);
3390         }
3391 
3392         txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit,
3393           getSequenceId(), true, memstoreCells);
3394       }
3395       if (walKey == null){
3396         // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
3397         walKey = this.appendEmptyEdit(this.wal, memstoreCells);
3398       }
3399 
3400       // -------------------------------
3401       // STEP 6. Release row locks, etc.
3402       // -------------------------------
3403       if (locked) {
3404         this.updatesLock.readLock().unlock();
3405         locked = false;
3406       }
3407       releaseRowLocks(acquiredRowLocks);
3408 
3409       // -------------------------
3410       // STEP 7. Sync wal.
3411       // -------------------------
3412       if (txid != 0) {
3413         syncOrDefer(txid, durability);
3414       }
3415 
3416       doRollBackMemstore = false;
3417       // update memstore size
3418       this.addAndGetGlobalMemstoreSize(addedSize);
3419 
3420       // calling the post CP hook for batch mutation
3421       if (!isInReplay && coprocessorHost != null) {
3422         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3423           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3424           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3425         coprocessorHost.postBatchMutate(miniBatchOp);
3426       }
3427 
3428 
3429       // ------------------------------------------------------------------
3430       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
3431       // ------------------------------------------------------------------
3432       if (writeEntry != null) {
3433         mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
3434         writeEntry = null;
3435       }
3436 
3437       // ------------------------------------
3438       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
3439       // synced so that the coprocessor contract is adhered to.
3440       // ------------------------------------
3441       if (!isInReplay && coprocessorHost != null) {
3442         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3443           // only for successful puts
3444           if (batchOp.retCodeDetails[i].getOperationStatusCode()
3445               != OperationStatusCode.SUCCESS) {
3446             continue;
3447           }
3448           Mutation m = batchOp.getMutation(i);
3449           if (m instanceof Put) {
3450             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
3451           } else {
3452             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
3453           }
3454         }
3455       }
3456 
3457       success = true;
3458       return addedSize;
3459     } finally {
3460       // if the wal sync was unsuccessful, remove keys from memstore
3461       if (doRollBackMemstore) {
3462         rollbackMemstore(memstoreCells);
3463         if (writeEntry != null) mvcc.cancelMemstoreInsert(writeEntry);
3464       } else {
3465         if (writeEntry != null) {
3466           mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
3467         }
3468       }
3469 
3470       if (locked) {
3471         this.updatesLock.readLock().unlock();
3472       }
3473       releaseRowLocks(acquiredRowLocks);
3474 
3475       // See if the column families were consistent through the whole thing.
3476       // If they were then keep them; if they were not then pass a null.
3477       // A null will be treated as unknown.
3478       // The total time taken may cover both Puts and Deletes, so split the time
3479       // between puts and deletes based on how many of each the batch contained.
3480 
3481       if (noOfPuts > 0) {
3482         // There were some Puts in the batch.
3483         if (this.metricsRegion != null) {
3484           this.metricsRegion.updatePut();
3485         }
3486       }
3487       if (noOfDeletes > 0) {
3488         // There were some Deletes in the batch.
3489         if (this.metricsRegion != null) {
3490           this.metricsRegion.updateDelete();
3491         }
3492       }
3493       if (!success) {
3494         for (int i = firstIndex; i < lastIndexExclusive; i++) {
3495           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
3496             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
3497           }
3498         }
3499       }
3500       if (coprocessorHost != null && !batchOp.isInReplay()) {
3501         // call the coprocessor hook to do any finalization steps
3502         // after the put is done
3503         MiniBatchOperationInProgress<Mutation> miniBatchOp =
3504           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
3505           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
3506         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
3507       }
3508 
3509       batchOp.nextIndexToProcess = lastIndexExclusive;
3510     }
3511   }
3512 
3513   private void mergeFamilyMaps(Map<byte[], List<Cell>> familyMap,
3514       Map<byte[], List<Cell>> toBeMerged) {
3515     for (Map.Entry<byte[], List<Cell>> entry : toBeMerged.entrySet()) {
3516       List<Cell> cells = familyMap.get(entry.getKey());
3517       if (cells == null) {
3518         familyMap.put(entry.getKey(), entry.getValue());
3519       } else {
3520         cells.addAll(entry.getValue());
3521       }
3522     }
3523   }
3524 
3525   /**
3526    * Returns effective durability from the passed durability and
3527    * the table descriptor.
3528    */
3529   protected Durability getEffectiveDurability(Durability d) {
3530     return d == Durability.USE_DEFAULT ? this.durability : d;
3531   }
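  // Quick illustration of the resolution rule above, assuming a table whose default durability is
  // ASYNC_WAL:
  //
  //   getEffectiveDurability(Durability.USE_DEFAULT)  ->  ASYNC_WAL  (falls back to the table)
  //   getEffectiveDurability(Durability.SKIP_WAL)     ->  SKIP_WAL   (the mutation's choice wins)
  //   getEffectiveDurability(Durability.SYNC_WAL)     ->  SYNC_WAL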
3532 
3533   //TODO, Think that gets/puts and deletes should be refactored a bit so that
3534   //the getting of the lock happens before, so that you would just pass it into
3535   //the methods. So in the case of checkAndMutate you could just do lockRow,
3536   //get, put, unlockRow or something
3537 
3538   @Override
3539   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
3540       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
3541       boolean writeToWAL)
3542   throws IOException{
3543     checkReadOnly();
3544     //TODO, add check for value length or maybe even better move this to the
3545     //client if this becomes a global setting
3546     checkResources();
3547     boolean isPut = w instanceof Put;
3548     if (!isPut && !(w instanceof Delete))
3549       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
3550           "be Put or Delete");
3551     if (!Bytes.equals(row, w.getRow())) {
3552       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
3553           "getRow must match the passed row");
3554     }
3555 
3556     startRegionOperation();
3557     try {
3558       Get get = new Get(row);
3559       checkFamily(family);
3560       get.addColumn(family, qualifier);
3561 
3562       // Lock row - note that doBatchMutate will relock this row if called
3563       RowLock rowLock = getRowLock(get.getRow());
3564       // wait for all previous transactions to complete (with lock held)
3565       mvcc.waitForPreviousTransactionsComplete();
3566       try {
3567         if (this.getCoprocessorHost() != null) {
3568           Boolean processed = null;
3569           if (w instanceof Put) {
3570             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
3571                 qualifier, compareOp, comparator, (Put) w);
3572           } else if (w instanceof Delete) {
3573             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
3574                 qualifier, compareOp, comparator, (Delete) w);
3575           }
3576           if (processed != null) {
3577             return processed;
3578           }
3579         }
3580         List<Cell> result = get(get, false);
3581 
3582         boolean valueIsNull = comparator.getValue() == null ||
3583           comparator.getValue().length == 0;
3584         boolean matches = false;
3585         long cellTs = 0;
3586         if (result.size() == 0 && valueIsNull) {
3587           matches = true;
3588         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3589             valueIsNull) {
3590           matches = true;
3591           cellTs = result.get(0).getTimestamp();
3592         } else if (result.size() == 1 && !valueIsNull) {
3593           Cell kv = result.get(0);
3594           cellTs = kv.getTimestamp();
3595           int compareResult = comparator.compareTo(kv.getValueArray(),
3596               kv.getValueOffset(), kv.getValueLength());
3597           switch (compareOp) {
3598           case LESS:
3599             matches = compareResult < 0;
3600             break;
3601           case LESS_OR_EQUAL:
3602             matches = compareResult <= 0;
3603             break;
3604           case EQUAL:
3605             matches = compareResult == 0;
3606             break;
3607           case NOT_EQUAL:
3608             matches = compareResult != 0;
3609             break;
3610           case GREATER_OR_EQUAL:
3611             matches = compareResult >= 0;
3612             break;
3613           case GREATER:
3614             matches = compareResult > 0;
3615             break;
3616           default:
3617             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3618           }
3619         }
3620         // If it matches, apply the new Put or the new Delete.
3621         if (matches) {
3622           // We have acquired the row lock already. If the system clock is NOT monotonically
3623           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
3624           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
3625           // there is no way to pass the cellTs. See HBASE-14054.
3626           long now = EnvironmentEdgeManager.currentTime();
3627           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
3628           byte[] byteTs = Bytes.toBytes(ts);
3629 
3630           if (w instanceof Put) {
3631             updateCellTimestamps(w.getFamilyCellMap().values(), byteTs);
3632           }
3633           // else delete is not needed since it already does a second get, and sets the timestamp
3634           // from get (see prepareDeleteTimestamps).
3635 
3636           // All edits for the given row (across all column families) must
3637           // happen atomically.
3638           doBatchMutate(w);
3639           this.checkAndMutateChecksPassed.increment();
3640           return true;
3641         }
3642         this.checkAndMutateChecksFailed.increment();
3643         return false;
3644       } finally {
3645         rowLock.release();
3646       }
3647     } finally {
3648       closeRegionOperation();
3649     }
3650   }
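  // A minimal usage sketch for checkAndMutate above, assuming a 'region' handle: apply a Put only
  // when the current value of cf:q equals "expected". Row, family, qualifier and values are
  // placeholders.
  //
  //   byte[] row = Bytes.toBytes("row1");
  //   byte[] cf = Bytes.toBytes("cf");
  //   byte[] q = Bytes.toBytes("q");
  //   Put put = new Put(row);
  //   put.addColumn(cf, q, Bytes.toBytes("newValue"));
  //   boolean applied = region.checkAndMutate(row, cf, q, CompareOp.EQUAL,
  //       new BinaryComparator(Bytes.toBytes("expected")), put, true);
  //   // 'applied' is false when the stored value did not match the expected value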
3651 
3652   //TODO, Think that gets/puts and deletes should be refactored a bit so that
3653   //the getting of the lock happens before, so that you would just pass it into
3654   //the methods. So in the case of checkAndMutate you could just do lockRow,
3655   //get, put, unlockRow or something
3656 
3657   @Override
3658   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
3659       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
3660       boolean writeToWAL) throws IOException {
3661     checkReadOnly();
3662     //TODO, add check for value length or maybe even better move this to the
3663     //client if this becomes a global setting
3664     checkResources();
3665 
3666     startRegionOperation();
3667     try {
3668       Get get = new Get(row);
3669       checkFamily(family);
3670       get.addColumn(family, qualifier);
3671 
3672       // Lock row - note that doBatchMutate will relock this row if called
3673       RowLock rowLock = getRowLock(get.getRow());
3674       // wait for all previous transactions to complete (with lock held)
3675       mvcc.waitForPreviousTransactionsComplete();
3676       try {
3677         List<Cell> result = get(get, false);
3678 
3679         boolean valueIsNull = comparator.getValue() == null ||
3680             comparator.getValue().length == 0;
3681         boolean matches = false;
3682         long cellTs = 0;
3683         if (result.size() == 0 && valueIsNull) {
3684           matches = true;
3685         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
3686             valueIsNull) {
3687           matches = true;
3688           cellTs = result.get(0).getTimestamp();
3689         } else if (result.size() == 1 && !valueIsNull) {
3690           Cell kv = result.get(0);
3691           cellTs = kv.getTimestamp();
3692           int compareResult = comparator.compareTo(kv.getValueArray(),
3693               kv.getValueOffset(), kv.getValueLength());
3694           switch (compareOp) {
3695           case LESS:
3696             matches = compareResult < 0;
3697             break;
3698           case LESS_OR_EQUAL:
3699             matches = compareResult <= 0;
3700             break;
3701           case EQUAL:
3702             matches = compareResult == 0;
3703             break;
3704           case NOT_EQUAL:
3705             matches = compareResult != 0;
3706             break;
3707           case GREATER_OR_EQUAL:
3708             matches = compareResult >= 0;
3709             break;
3710           case GREATER:
3711             matches = compareResult > 0;
3712             break;
3713           default:
3714             throw new RuntimeException("Unknown Compare op " + compareOp.name());
3715           }
3716         }
3717         // If it matches, apply the new Put or the new Delete.
3718         if (matches) {
3719           // We have acquired the row lock already. If the system clock is NOT monotonically
3720           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
3721           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
3722           // there is no way to pass the cellTs. See HBASE-14054.
3723           long now = EnvironmentEdgeManager.currentTime();
3724           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
3725           byte[] byteTs = Bytes.toBytes(ts);
3726 
3727           for (Mutation w : rm.getMutations()) {
3728             if (w instanceof Put) {
3729               updateCellTimestamps(w.getFamilyCellMap().values(), byteTs);
3730             }
3731             // else delete is not needed since it already does a second get, and sets the timestamp
3732             // from get (see prepareDeleteTimestamps).
3733           }
3734 
3735           // All edits for the given row (across all column families) must
3736           // happen atomically.
3737           mutateRow(rm);
3738           this.checkAndMutateChecksPassed.increment();
3739           return true;
3740         }
3741         this.checkAndMutateChecksFailed.increment();
3742         return false;
3743       } finally {
3744         rowLock.release();
3745       }
3746     } finally {
3747       closeRegionOperation();
3748     }
3749   }
3750 
3751   private void doBatchMutate(Mutation mutation) throws IOException {
3752     // Currently this is only called for puts and deletes, so no nonces.
3753     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation });
3754     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3755       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3756     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3757       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3758     }
3759   }
3760 
3761   /**
3762    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3763    * working snapshot directory.
3764    *
3765    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3766    * arg.  (In the future other cancellable HRegion methods could eventually add a
3767    * {@link ForeignExceptionSnare}, or we could do something fancier).
3768    *
3769    * @param desc snapshot description object
3770    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3771    *   bail out.  This is allowed to be null and will just be ignored in that case.
3772    * @throws IOException if there is an external or internal error causing the snapshot to fail
3773    */
3774   public void addRegionToSnapshot(SnapshotDescription desc,
3775       ForeignExceptionSnare exnSnare) throws IOException {
3776     Path rootDir = FSUtils.getRootDir(conf);
3777     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3778 
3779     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3780             snapshotDir, desc, exnSnare);
3781     manifest.addRegion(this);
3782   }
3783 
3784   @Override
3785   public void updateCellTimestamps(final Iterable<List<Cell>> cellItr, final byte[] now)
3786       throws IOException {
3787     for (List<Cell> cells: cellItr) {
3788       if (cells == null) continue;
3789       assert cells instanceof RandomAccess;
3790       int listSize = cells.size();
3791       for (int i = 0; i < listSize; i++) {
3792         CellUtil.updateLatestStamp(cells.get(i), now, 0);
3793       }
3794     }
3795   }
3796 
3797   /**
3798    * Possibly rewrite incoming cell tags.
3799    */
3800   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3801     // Check if we have any work to do and early out otherwise
3802     // Update these checks as more logic is added here
3803 
3804     if (m.getTTL() == Long.MAX_VALUE) {
3805       return;
3806     }
3807 
3808     // From this point we know we have some work to do
3809 
3810     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3811       List<Cell> cells = e.getValue();
3812       assert cells instanceof RandomAccess;
3813       int listSize = cells.size();
3814       for (int i = 0; i < listSize; i++) {
3815         Cell cell = cells.get(i);
3816         List<Tag> newTags = Tag.carryForwardTags(null, cell);
3817         newTags = carryForwardTTLTag(newTags, m);
3818 
3819         // Rewrite the cell with the updated set of tags
3820         cells.set(i, new KeyValue(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
3821           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
3822           cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(),
3823           cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
3824           cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
3825           newTags));
3826       }
3827     }
3828   }
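  // Illustrative sketch, assuming a hypothetical Put: a per-mutation TTL set via Mutation#setTTL
  // is carried into every cell as a TTL tag, which is why rewriteCellTags rebuilds each cell.
  //
  //   Put put = new Put(Bytes.toBytes("row1"));
  //   put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
  //   put.setTTL(60000L);   // milliseconds
  //   // during doMiniBatchMutation, rewriteCellTags replaces each cell with a copy whose tag list
  //   // carries the TTL tag; mutations with TTL == Long.MAX_VALUE are left untouched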
3829 
3830   /*
3831    * Check that we have the resources to support an update.
3832    *
3833    * We throw RegionTooBusyException if we are above the memstore limit
3834    * and expect the client to retry using some kind of backoff.
3835    */
3836   private void checkResources() throws RegionTooBusyException {
3837     // If catalog region, do not impose resource constraints or block updates.
3838     if (this.getRegionInfo().isMetaRegion()) return;
3839 
3840     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3841       blockedRequestsCount.increment();
3842       requestFlush();
3843       throw new RegionTooBusyException("Above memstore limit, " +
3844           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3845           this.getRegionInfo().getRegionNameAsString()) +
3846           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3847           this.getRegionServerServices().getServerName()) +
3848           ", memstoreSize=" + memstoreSize.get() +
3849           ", blockingMemStoreSize=" + blockingMemStoreSize);
3850     }
3851   }
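
  // A minimal sketch (not part of this class) of the client-side backoff the comment above
  // expects when RegionTooBusyException is thrown. doPut() is a hypothetical placeholder for
  // whatever operation hit the busy region; real HBase clients normally get this behaviour
  // from their configured retry policy rather than hand-written loops.
  //
  //   int attempts = 0;
  //   long pauseMs = 100;                          // initial pause
  //   while (true) {
  //     try {
  //       doPut();                                 // hypothetical operation against the region
  //       break;
  //     } catch (RegionTooBusyException e) {
  //       if (++attempts >= 10) {
  //         throw e;                               // give up after a bounded number of retries
  //       }
  //       Thread.sleep(pauseMs);                   // InterruptedException handling omitted
  //       pauseMs = Math.min(pauseMs * 2, 10000);  // exponential backoff, capped at 10 seconds
  //     }
  //   }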
3852 
3853   /**
3854    * @throws IOException Throws exception if region is in read-only mode.
3855    */
3856   protected void checkReadOnly() throws IOException {
3857     if (isReadOnly()) {
3858       throw new DoNotRetryIOException("region is read only");
3859     }
3860   }
3861 
3862   protected void checkReadsEnabled() throws IOException {
3863     if (!this.writestate.readsEnabled) {
3864       throw new IOException(getRegionInfo().getEncodedName()
3865         + ": The region's reads are disabled. Cannot serve the request");
3866     }
3867   }
3868 
3869   public void setReadsEnabled(boolean readsEnabled) {
3870     if (readsEnabled && !this.writestate.readsEnabled) {
3871       LOG.info(getRegionInfo().getEncodedName() + " : Enabling reads for region.");
3872     }
3873     this.writestate.setReadsEnabled(readsEnabled);
3874   }
3875 
3876   /**
3877    * Add updates first to the wal and then add values to memstore.
3878    * Warning: Assumption is that the caller has a lock on the passed-in row.
3879    * @param edits Cell updates by column
3880    * @throws IOException
3881    */
3882   private void put(final byte [] row, byte [] family, List<Cell> edits)
3883   throws IOException {
3884     NavigableMap<byte[], List<Cell>> familyMap;
3885     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3886 
3887     familyMap.put(family, edits);
3888     Put p = new Put(row);
3889     p.setFamilyCellMap(familyMap);
3890     doBatchMutate(p);
3891   }
3892 
3893   /**
3894    * Atomically apply the given map of family->edits to the memstore.
3895    * This handles the consistency control on its own, but the caller
3896    * should already have locked updatesLock.readLock(). This also does
3897    * <b>not</b> check the families for validity.
3898    *
3899    * @param familyMap Map of kvs per family
3900    * @param mvccNum the MVCC write number (sequence id) to stamp onto each added cell
3901    * @param memstoreCells output list that collects the cells actually added to the
3902    *        memstore
3903    * @param isInReplay true when adding replayed KVs into memstore
3904    * @return the additional memory usage of the memstore caused by the
3905    * new entries.
3906    * @throws IOException
3907    */
3908   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
3909     long mvccNum, List<Cell> memstoreCells, boolean isInReplay) throws IOException {
3910     long size = 0;
3911 
3912     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3913       byte[] family = e.getKey();
3914       List<Cell> cells = e.getValue();
3915       assert cells instanceof RandomAccess;
3916       Store store = getStore(family);
3917       int listSize = cells.size();
3918       for (int i=0; i < listSize; i++) {
3919         Cell cell = cells.get(i);
3920         CellUtil.setSequenceId(cell, mvccNum);
3921         Pair<Long, Cell> ret = store.add(cell);
3922         size += ret.getFirst();
3923         memstoreCells.add(ret.getSecond());
3924         if(isInReplay) {
3925           // set memstore newly added cells with replay mvcc number
3926           CellUtil.setSequenceId(ret.getSecond(), mvccNum);
3927         }
3928       }
3929     }
3930 
3931     return size;
3932   }
3933 
3934   /**
3935    * Remove all the keys listed in the map from the memstore. This method is
3936    * called when a Put/Delete has updated memstore but subsequently fails to update
3937    * the wal. This method is then invoked to rollback the memstore.
3938    */
3939   private void rollbackMemstore(List<Cell> memstoreCells) {
3940     int kvsRolledback = 0;
3941 
3942     for (Cell cell : memstoreCells) {
3943       byte[] family = CellUtil.cloneFamily(cell);
3944       Store store = getStore(family);
3945       store.rollback(cell);
3946       kvsRolledback++;
3947     }
3948     LOG.debug("rollbackMemstore rolled back " + kvsRolledback);
3949   }
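
  // Illustrative sketch (not part of this class) of the failure path that rollbackMemstore
  // supports: cells are first applied to the memstore (collecting the added cells), and if the
  // subsequent WAL update fails, the collected cells are removed again so memstore and WAL stay
  // consistent. applyFamilyMapToMemstore and rollbackMemstore are the methods defined above;
  // syncWal() is a placeholder for the real WAL append/sync step.
  //
  //   List<Cell> memstoreCells = new ArrayList<Cell>();
  //   long addedSize = applyFamilyMapToMemstore(familyMap, mvccNum, memstoreCells, false);
  //   try {
  //     syncWal();                          // placeholder for the WAL append + sync step
  //   } catch (IOException e) {
  //     rollbackMemstore(memstoreCells);    // undo the memstore changes made above
  //     throw e;
  //   }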
3950 
3951   @Override
3952   public void checkFamilies(Collection<byte[]> families) throws NoSuchColumnFamilyException {
3953     for (byte[] family : families) {
3954       checkFamily(family);
3955     }
3956   }
3957 
3958   private void checkAndPrepareMutation(Mutation mutation, boolean replay,
3959       final Map<byte[], List<Cell>> familyMap, final long now)
3960           throws IOException {
3961     if (mutation instanceof Put) {
3962       // Check the families in the put. If bad, skip this one.
3963       if (replay) {
3964         removeNonExistentColumnFamilyForReplay(familyMap);
3965       } else {
3966         checkFamilies(familyMap.keySet());
3967       }
3968       checkTimestamps(mutation.getFamilyCellMap(), now);
3969     } else {
3970       prepareDelete((Delete)mutation);
3971     }
3972     checkRow(mutation.getRow(), "doMiniBatchMutation");
3973   }
3974 
3975   /**
3976    * During replay, there could exist column families which were removed between the region
3977    * server failure and the replay.
3978    */
3979   private void removeNonExistentColumnFamilyForReplay(
3980       final Map<byte[], List<Cell>> familyMap) {
3981     List<byte[]> nonExistentList = null;
3982     for (byte[] family : familyMap.keySet()) {
3983       if (!this.htableDescriptor.hasFamily(family)) {
3984         if (nonExistentList == null) {
3985           nonExistentList = new ArrayList<byte[]>();
3986         }
3987         nonExistentList.add(family);
3988       }
3989     }
3990     if (nonExistentList != null) {
3991       for (byte[] family : nonExistentList) {
3992         // Perhaps schema was changed between crash and replay
3993         LOG.info("No family for " + Bytes.toString(family) + ", omitting from replay.");
3994         familyMap.remove(family);
3995       }
3996     }
3997   }
3998 
3999   @Override
4000   public void checkTimestamps(final Map<byte[], List<Cell>> familyMap, long now)
4001       throws FailedSanityCheckException {
4002     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
4003       return;
4004     }
4005     long maxTs = now + timestampSlop;
4006     for (List<Cell> kvs : familyMap.values()) {
4007       assert kvs instanceof RandomAccess;
4008       int listSize  = kvs.size();
4009       for (int i=0; i < listSize; i++) {
4010         Cell cell = kvs.get(i);
4011         // see if the user-supplied TS is too far in the future; LATEST_TIMESTAMP means server-assigned
4012         long ts = cell.getTimestamp();
4013         if (ts != HConstants.LATEST_TIMESTAMP && ts > maxTs) {
4014           throw new FailedSanityCheckException("Timestamp for KV out of range "
4015               + cell + " (too.new=" + timestampSlop + ")");
4016         }
4017       }
4018     }
4019   }
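
  // Illustrative sketch (not part of this class): the arithmetic behind the slop check above.
  // A cell is rejected only when it carries an explicit (non-LATEST) timestamp that lies more
  // than timestampSlop milliseconds in the server's future. The slop typically comes from
  // configuration (commonly "hbase.hregion.keyvalue.timestamp.slop.millisecs"; treat the key
  // name here as an assumption).
  //
  //   static boolean tooFarInFuture(long cellTs, long serverNow, long slopMs) {
  //     if (cellTs == HConstants.LATEST_TIMESTAMP) {
  //       return false;                  // server will assign its own timestamp
  //     }
  //     return cellTs > serverNow + slopMs;
  //   }
  //
  //   // Example: serverNow = 1000000, slopMs = 2000 -> ts 1003000 is rejected, ts 1001500 is accepted.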
4020 
4021   /**
4022    * Append the given map of family->edits to a WALEdit data structure.
4023    * This does not write to the WAL itself.
4024    * @param familyMap map of family->edits
4025    * @param walEdit the destination entry to append into
4026    */
4027   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
4028       WALEdit walEdit) {
4029     for (List<Cell> edits : familyMap.values()) {
4030       assert edits instanceof RandomAccess;
4031       int listSize = edits.size();
4032       for (int i=0; i < listSize; i++) {
4033         Cell cell = edits.get(i);
4034         walEdit.add(cell);
4035       }
4036     }
4037   }
4038 
4039   private void requestFlush() {
4040     if (this.rsServices == null) {
4041       return;
4042     }
4043     synchronized (writestate) {
4044       if (this.writestate.isFlushRequested()) {
4045         return;
4046       }
4047       writestate.flushRequested = true;
4048     }
4049     // Make request outside of synchronize block; HBASE-818.
4050     this.rsServices.getFlushRequester().requestFlush(this, false);
4051     if (LOG.isDebugEnabled()) {
4052       LOG.debug("Flush requested on " + this);
4053     }
4054   }
4055 
4056   /*
4057    * @param size the current memstore size, in bytes
4058    * @return True if size is over the flush threshold
4059    */
4060   private boolean isFlushSize(final long size) {
4061     return size > this.memstoreFlushSize;
4062   }
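
  // A minimal sketch (not part of this class) of how the two memstore thresholds used in this
  // file typically relate. Crossing the flush threshold requests a flush; crossing the larger
  // blocking threshold (see checkResources above) rejects further updates. The key names and
  // default values below are commonly used but should be treated as assumptions here.
  //
  //   long flushSize = conf.getLong("hbase.hregion.memstore.flush.size", 128L * 1024 * 1024);
  //   long blockingSize = flushSize * conf.getLong("hbase.hregion.memstore.block.multiplier", 4);
  //
  //   if (memstoreSize > blockingSize) {
  //     // checkResources(): throw RegionTooBusyException and expect the client to back off
  //   } else if (memstoreSize > flushSize) {
  //     // isFlushSize(): requestFlush() to run an asynchronous flush
  //   }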
4063 
4064   /**
4065    * Read the edits put under this region by wal splitting process.  Put
4066    * the recovered edits back up into this region.
4067    *
4068    * <p>We can ignore any wal message that has a sequence ID that's equal to or
4069    * lower than minSeqId.  (Because we know such messages are already
4070    * reflected in the HFiles.)
4071    *
4072    * <p>While this is running we are putting pressure on memory yet we are
4073    * outside of our usual accounting because we are not yet an onlined region
4074    * (this stuff is being run as part of Region initialization).  This means
4075    * that if we're up against global memory limits, we'll not be flagged to flush
4076    * because we are not online. We can't be flushed by usual mechanisms anyways;
4077    * we're not yet online so our relative sequenceids are not yet aligned with
4078    * WAL sequenceids -- not till we come up online, post processing of split
4079    * edits.
4080    *
4081    * <p>But to help relieve memory pressure, at least manage our own heap size
4082    * flushing if we are in excess of per-region limits.  Flushing, though, we have
4083    * to be careful and avoid using the regionserver/wal sequenceid.  It's running
4084    * on a different timeline from what's going on in here in this region context, so
4085    * if we crashed replaying these edits, but in the midst had a flush that used the
4086    * regionserver wal with a sequenceid in excess of what's going on in here
4087    * in this region and with its split editlogs, then we could miss edits the
4088    * next time we go to recover. So, we have to flush inline, using seqids that
4089    * make sense in this single region context only -- until we come online.
4090    *
4091    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
4092    * the maxSeqId for the store to be applied, else it is skipped.
4093    * @return the sequence id of the last edit added to this region out of the
4094    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
4095    * @throws UnsupportedEncodingException
4096    * @throws IOException
4097    */
4098   protected long replayRecoveredEditsIfAny(final Path regiondir,
4099       Map<byte[], Long> maxSeqIdInStores,
4100       final CancelableProgressable reporter, final MonitoredTask status)
4101       throws IOException {
4102     long minSeqIdForTheRegion = -1;
4103     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
4104       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
4105         minSeqIdForTheRegion = maxSeqIdInStore;
4106       }
4107     }
4108     long seqid = minSeqIdForTheRegion;
4109 
4110     FileSystem fs = this.fs.getFileSystem();
4111     NavigableSet<Path> files = WALSplitter.getSplitEditFilesSorted(fs, regiondir);
4112     if (LOG.isDebugEnabled()) {
4113       LOG.debug("Found " + (files == null ? 0 : files.size())
4114         + " recovered edits file(s) under " + regiondir);
4115     }
4116 
4117     if (files == null || files.isEmpty()) return seqid;
4118 
4119     for (Path edits: files) {
4120       if (edits == null || !fs.exists(edits)) {
4121         LOG.warn("Null or non-existent edits file: " + edits);
4122         continue;
4123       }
4124       if (isZeroLengthThenDelete(fs, edits)) continue;
4125 
4126       long maxSeqId;
4127       String fileName = edits.getName();
4128       maxSeqId = Math.abs(Long.parseLong(fileName));
4129       if (maxSeqId <= minSeqIdForTheRegion) {
4130         if (LOG.isDebugEnabled()) {
4131           String msg = "Maximum sequenceid for this wal is " + maxSeqId
4132             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
4133             + ", skipped the whole file, path=" + edits;
4134           LOG.debug(msg);
4135         }
4136         continue;
4137       }
4138 
4139       try {
4140         // replay the edits. Replay can return -1 if everything is skipped, only update
4141         // if seqId is greater
4142         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
4143       } catch (IOException e) {
4144         boolean skipErrors = conf.getBoolean(
4145             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
4146             conf.getBoolean(
4147                 "hbase.skip.errors",
4148                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
4149         if (conf.get("hbase.skip.errors") != null) {
4150           LOG.warn(
4151               "The property 'hbase.skip.errors' has been deprecated. Please use " +
4152               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
4153         }
4154         if (skipErrors) {
4155           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4156           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
4157               + "=true so continuing. Renamed " + edits +
4158               " as " + p, e);
4159         } else {
4160           throw e;
4161         }
4162       }
4163     }
4164     // The edits size added into rsAccounting during this replaying will not
4165     // be required any more. So just clear it.
4166     if (this.rsAccounting != null) {
4167       this.rsAccounting.clearRegionReplayEditsSize(getRegionInfo().getRegionName());
4168     }
4169     if (seqid > minSeqIdForTheRegion) {
4170       // Then we added some edits to memory. Flush and cleanup split edit files.
4171       internalFlushcache(null, seqid, stores.values(), status, false);
4172     }
4173     // Now delete the content of recovered edits.  We're done w/ them.
4174     if (files.size() > 0 && this.conf.getBoolean("hbase.region.archive.recovered.edits", false)) {
4175       // For debugging data loss issues!
4176       // If this flag is set, make use of the hfile archiving by making recovered.edits a fake
4177       // column family. Have to fake out file type too by casting our recovered.edits as storefiles
4178       String fakeFamilyName = WALSplitter.getRegionDirRecoveredEditsDir(regiondir).getName();
4179       Set<StoreFile> fakeStoreFiles = new HashSet<StoreFile>(files.size());
4180       for (Path file: files) {
4181         fakeStoreFiles.add(new StoreFile(getRegionFileSystem().getFileSystem(), file, this.conf,
4182           null, null));
4183       }
4184       getRegionFileSystem().removeStoreFiles(fakeFamilyName, fakeStoreFiles);
4185     } else {
4186       for (Path file: files) {
4187         if (!fs.delete(file, false)) {
4188           LOG.error("Failed delete of " + file);
4189         } else {
4190           LOG.debug("Deleted recovered.edits file=" + file);
4191         }
4192       }
4193     }
4194     return seqid;
4195   }
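
  // Illustrative sketch (not part of this class): the whole-file skipping rule used above. A
  // recovered-edits file is named for the highest sequence id it can contain; if that id is
  // not larger than the smallest per-store max sequence id, nothing in the file can be newer
  // than what is already in the HFiles, so the file is skipped entirely. (Edits that do get
  // replayed are still filtered per store against maxSeqIdInStores.)
  //
  //   static boolean shouldReplay(Path editsFile, Map<byte[], Long> maxSeqIdInStores) {
  //     long minSeqIdForRegion = Long.MAX_VALUE;
  //     for (long storeMaxSeqId : maxSeqIdInStores.values()) {
  //       minSeqIdForRegion = Math.min(minSeqIdForRegion, storeMaxSeqId);
  //     }
  //     long maxSeqIdInFile = Math.abs(Long.parseLong(editsFile.getName()));
  //     return maxSeqIdInFile > minSeqIdForRegion;
  //   }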
4196 
4197   /*
4198    * @param edits File of recovered edits.
4199    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in wal
4200    * must be larger than this to be replayed for each store.
4201    * @param reporter
4202    * @return the sequence id of the last edit added to this region out of the
4203    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
4204    * @throws IOException
4205    */
4206   private long replayRecoveredEdits(final Path edits,
4207       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
4208     throws IOException {
4209     String msg = "Replaying edits from " + edits;
4210     LOG.info(msg);
4211     MonitoredTask status = TaskMonitor.get().createStatus(msg);
4212     FileSystem fs = this.fs.getFileSystem();
4213 
4214     status.setStatus("Opening recovered edits");
4215     WAL.Reader reader = null;
4216     try {
4217       reader = WALFactory.createReader(fs, edits, conf);
4218       long currentEditSeqId = -1;
4219       long currentReplaySeqId = -1;
4220       long firstSeqIdInLog = -1;
4221       long skippedEdits = 0;
4222       long editsCount = 0;
4223       long intervalEdits = 0;
4224       WAL.Entry entry;
4225       Store store = null;
4226       boolean reportedOnce = false;
4227       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
4228 
4229       try {
4230         // How many edits seen before we check elapsed time
4231         int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
4232         // How often to send a progress report (default 1/2 master timeout)
4233         int period = this.conf.getInt("hbase.hstore.report.period", 300000);
4234         long lastReport = EnvironmentEdgeManager.currentTime();
4235 
4236         while ((entry = reader.next()) != null) {
4237           WALKey key = entry.getKey();
4238           WALEdit val = entry.getEdit();
4239 
4240           if (ng != null) { // may be null when nonces are disabled, or in some tests
4241             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
4242           }
4243 
4244           if (reporter != null) {
4245             intervalEdits += val.size();
4246             if (intervalEdits >= interval) {
4247               // Number of edits interval reached
4248               intervalEdits = 0;
4249               long cur = EnvironmentEdgeManager.currentTime();
4250               if (lastReport + period <= cur) {
4251                 status.setStatus("Replaying edits..." +
4252                     " skipped=" + skippedEdits +
4253                     " edits=" + editsCount);
4254                 // Timeout reached
4255                 if(!reporter.progress()) {
4256                   msg = "Progressable reporter failed, stopping replay";
4257                   LOG.warn(msg);
4258                   status.abort(msg);
4259                   throw new IOException(msg);
4260                 }
4261                 reportedOnce = true;
4262                 lastReport = cur;
4263               }
4264             }
4265           }
4266 
4267           if (firstSeqIdInLog == -1) {
4268             firstSeqIdInLog = key.getLogSeqNum();
4269           }
4270           if (currentEditSeqId > key.getLogSeqNum()) {
4271             // when this condition is true, it means we have a serious defect because we need to
4272             // maintain increasing SeqId for WAL edits per region
4273             LOG.error(getRegionInfo().getEncodedName() + " : "
4274                  + "Found decreasing SeqId. PreId=" + currentEditSeqId + " key=" + key
4275                 + "; edit=" + val);
4276           } else {
4277             currentEditSeqId = key.getLogSeqNum();
4278           }
4279           currentReplaySeqId = (key.getOrigLogSeqNum() > 0) ?
4280             key.getOrigLogSeqNum() : currentEditSeqId;
4281 
4282           // Start coprocessor replay here. The coprocessor is for each WALEdit
4283           // instead of a KeyValue.
4284           if (coprocessorHost != null) {
4285             status.setStatus("Running pre-WAL-restore hook in coprocessors");
4286             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
4287               // if bypass this wal entry, ignore it ...
4288               continue;
4289             }
4290           }
4291           boolean checkRowWithinBoundary = false;
4292           // Check this edit is for this region.
4293           if (!Bytes.equals(key.getEncodedRegionName(),
4294               this.getRegionInfo().getEncodedNameAsBytes())) {
4295             checkRowWithinBoundary = true;
4296           }
4297 
4298           boolean flush = false;
4299           for (Cell cell: val.getCells()) {
4300             // Check this edit is for me. Also, guard against writing the special
4301             // METACOLUMN info such as HBASE::CACHEFLUSH entries
4302             if (CellUtil.matchingFamily(cell, WALEdit.METAFAMILY)) {
4303               // if region names don't match, skip replaying compaction marker
4304               if (!checkRowWithinBoundary) {
4305                 //this is a special edit, we should handle it
4306                 CompactionDescriptor compaction = WALEdit.getCompaction(cell);
4307                 if (compaction != null) {
4308                   //replay the compaction
4309                   replayWALCompactionMarker(compaction, false, true, Long.MAX_VALUE);
4310                 }
4311               }
4312               skippedEdits++;
4313               continue;
4314             }
4315             // Figure which store the edit is meant for.
4316             if (store == null || !CellUtil.matchingFamily(cell, store.getFamily().getName())) {
4317               store = getStore(cell);
4318             }
4319             if (store == null) {
4320               // This should never happen.  Perhaps schema was changed between
4321               // crash and redeploy?
4322               LOG.warn("No family for " + cell);
4323               skippedEdits++;
4324               continue;
4325             }
4326             if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
4327               cell.getRowArray(), cell.getRowOffset(), cell.getRowLength())) {
4328               LOG.warn("Row of " + cell + " is not within region boundary");
4329               skippedEdits++;
4330               continue;
4331             }
4332             // Now, figure if we should skip this edit.
4333             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
4334                 .getName())) {
4335               skippedEdits++;
4336               continue;
4337             }
4338             CellUtil.setSequenceId(cell, currentReplaySeqId);
4339 
4340             // Once we are over the limit, restoreEdit will keep returning true to
4341             // flush -- but don't flush until we've played all the kvs that make up
4342             // the WALEdit.
4343             flush |= restoreEdit(store, cell);
4344             editsCount++;
4345           }
4346           if (flush) {
4347             internalFlushcache(null, currentEditSeqId, stores.values(), status, false);
4348           }
4349 
4350           if (coprocessorHost != null) {
4351             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
4352           }
4353         }
4354       } catch (EOFException eof) {
4355         Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4356         msg = "Encountered EOF. Most likely due to Master failure during " +
4357             "wal splitting, so we have this data in another edit.  " +
4358             "Continuing, but renaming " + edits + " as " + p;
4359         LOG.warn(msg, eof);
4360         status.abort(msg);
4361       } catch (IOException ioe) {
4362         // If the IOE resulted from bad file format,
4363         // then this problem is idempotent and retrying won't help
4364         if (ioe.getCause() instanceof ParseException) {
4365           Path p = WALSplitter.moveAsideBadEditsFile(fs, edits);
4366           msg = "File corruption encountered!  " +
4367               "Continuing, but renaming " + edits + " as " + p;
4368           LOG.warn(msg, ioe);
4369           status.setStatus(msg);
4370         } else {
4371           status.abort(StringUtils.stringifyException(ioe));
4372           // other IO errors may be transient (bad network connection,
4373           // checksum exception on one datanode, etc).  throw & retry
4374           throw ioe;
4375         }
4376       }
4377       if (reporter != null && !reportedOnce) {
4378         reporter.progress();
4379       }
4380       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
4381         ", firstSequenceIdInLog=" + firstSeqIdInLog +
4382         ", maxSequenceIdInLog=" + currentEditSeqId + ", path=" + edits;
4383       status.markComplete(msg);
4384       LOG.debug(msg);
4385       return currentEditSeqId;
4386     } finally {
4387       status.cleanup();
4388       if (reader != null) {
4389          reader.close();
4390       }
4391     }
4392   }
4393 
4394   /**
4395    * Call to complete a compaction. Its for the case where we find in the WAL a compaction
4396    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
4397    * See HBASE-2331.
4398    */
4399   void replayWALCompactionMarker(CompactionDescriptor compaction, boolean pickCompactionFiles,
4400       boolean removeFiles, long replaySeqId)
4401       throws IOException {
4402     try {
4403       checkTargetRegion(compaction.getEncodedRegionName().toByteArray(),
4404         "Compaction marker from WAL ", compaction);
4405     } catch (WrongRegionException wre) {
4406       if (RegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4407         // skip the compaction marker since it is not for this region
4408         return;
4409       }
4410       throw wre;
4411     }
4412 
4413     synchronized (writestate) {
4414       if (replaySeqId < lastReplayedOpenRegionSeqId) {
4415         LOG.warn(getRegionInfo().getEncodedName() + " : "
4416             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4417             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4418             + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4419         return;
4420       }
4421       if (replaySeqId < lastReplayedCompactionSeqId) {
4422         LOG.warn(getRegionInfo().getEncodedName() + " : "
4423             + "Skipping replaying compaction event :" + TextFormat.shortDebugString(compaction)
4424             + " because its sequence id " + replaySeqId + " is smaller than this region's "
4425             + "lastReplayedCompactionSeqId of " + lastReplayedCompactionSeqId);
4426         return;
4427       } else {
4428         lastReplayedCompactionSeqId = replaySeqId;
4429       }
4430 
4431       if (LOG.isDebugEnabled()) {
4432         LOG.debug(getRegionInfo().getEncodedName() + " : "
4433             + "Replaying compaction marker " + TextFormat.shortDebugString(compaction)
4434             + " with seqId=" + replaySeqId + " and lastReplayedOpenRegionSeqId="
4435             + lastReplayedOpenRegionSeqId);
4436       }
4437 
4438       startRegionOperation(Operation.REPLAY_EVENT);
4439       try {
4440         Store store = this.getStore(compaction.getFamilyName().toByteArray());
4441         if (store == null) {
4442           LOG.warn(getRegionInfo().getEncodedName() + " : "
4443               + "Found Compaction WAL edit for deleted family:"
4444               + Bytes.toString(compaction.getFamilyName().toByteArray()));
4445           return;
4446         }
4447         store.replayCompactionMarker(compaction, pickCompactionFiles, removeFiles);
4448         logRegionFiles();
4449       } catch (FileNotFoundException ex) {
4450         LOG.warn(getRegionInfo().getEncodedName() + " : "
4451             + "At least one of the store files in compaction: "
4452             + TextFormat.shortDebugString(compaction)
4453             + " doesn't exist any more. Skip loading the file(s)", ex);
4454       } finally {
4455         closeRegionOperation(Operation.REPLAY_EVENT);
4456       }
4457     }
4458   }
4459 
4460   void replayWALFlushMarker(FlushDescriptor flush, long replaySeqId) throws IOException {
4461     checkTargetRegion(flush.getEncodedRegionName().toByteArray(),
4462       "Flush marker from WAL ", flush);
4463 
4464     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4465       return; // if primary nothing to do
4466     }
4467 
4468     if (LOG.isDebugEnabled()) {
4469       LOG.debug(getRegionInfo().getEncodedName() + " : "
4470           + "Replaying flush marker " + TextFormat.shortDebugString(flush));
4471     }
4472 
4473     startRegionOperation(Operation.REPLAY_EVENT); // use region close lock to guard against close
4474     try {
4475       FlushAction action = flush.getAction();
4476       switch (action) {
4477       case START_FLUSH:
4478         replayWALFlushStartMarker(flush);
4479         break;
4480       case COMMIT_FLUSH:
4481         replayWALFlushCommitMarker(flush);
4482         break;
4483       case ABORT_FLUSH:
4484         replayWALFlushAbortMarker(flush);
4485         break;
4486       case CANNOT_FLUSH:
4487         replayWALFlushCannotFlushMarker(flush, replaySeqId);
4488         break;
4489       default:
4490         LOG.warn(getRegionInfo().getEncodedName() + " : " +
4491           "Received a flush event with unknown action, ignoring. " +
4492           TextFormat.shortDebugString(flush));
4493         break;
4494       }
4495 
4496       logRegionFiles();
4497     } finally {
4498       closeRegionOperation(Operation.REPLAY_EVENT);
4499     }
4500   }
4501 
4502   /** Replay the flush marker from primary region by creating a corresponding snapshot of
4503    * the store memstores, only if the memstores do not have a higher seqId from an earlier wal
4504    * edit (because the events may be coming out of order).
4505    */
4506   @VisibleForTesting
4507   PrepareFlushResult replayWALFlushStartMarker(FlushDescriptor flush) throws IOException {
4508     long flushSeqId = flush.getFlushSequenceNumber();
4509 
4510     HashSet<Store> storesToFlush = new HashSet<Store>();
4511     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4512       byte[] family = storeFlush.getFamilyName().toByteArray();
4513       Store store = getStore(family);
4514       if (store == null) {
4515         LOG.warn(getRegionInfo().getEncodedName() + " : "
4516           + "Received a flush start marker from primary, but the family is not found. Ignoring"
4517           + " StoreFlushDescriptor:" + TextFormat.shortDebugString(storeFlush));
4518         continue;
4519       }
4520       storesToFlush.add(store);
4521     }
4522 
4523     MonitoredTask status = TaskMonitor.get().createStatus("Preparing flush " + this);
4524 
4525     // we will use writestate as a coarse-grain lock for all the replay events
4526     // (flush, compaction, region open etc)
4527     synchronized (writestate) {
4528       try {
4529         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4530           LOG.warn(getRegionInfo().getEncodedName() + " : "
4531               + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4532               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4533               + " of " + lastReplayedOpenRegionSeqId);
4534           return null;
4535         }
4536         if (numMutationsWithoutWAL.get() > 0) {
4537           numMutationsWithoutWAL.set(0);
4538           dataInMemoryWithoutWAL.set(0);
4539         }
4540 
4541         if (!writestate.flushing) {
4542           // we do not have an active snapshot and a corresponding this.prepareFlushResult. This means
4543           // we can just snapshot our memstores and continue as normal.
4544 
4545           // invoke internalPrepareFlushCache. Pass null as the wal since we do not want the flush events in the wal
4546           PrepareFlushResult prepareResult = internalPrepareFlushCache(null,
4547             flushSeqId, storesToFlush, status, false);
4548           if (prepareResult.result == null) {
4549             // save the PrepareFlushResult so that we can use it later from commit flush
4550             this.writestate.flushing = true;
4551             this.prepareFlushResult = prepareResult;
4552             status.markComplete("Flush prepare successful");
4553             if (LOG.isDebugEnabled()) {
4554               LOG.debug(getRegionInfo().getEncodedName() + " : "
4555                   + " Prepared flush with seqId:" + flush.getFlushSequenceNumber());
4556             }
4557           } else {
4558             // special case empty memstore. We will still save the flush result in this case, since
4559             // our memstore is empty, but the primary is still flushing
4560             if (prepareResult.getResult().getResult() ==
4561                   FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
4562               this.writestate.flushing = true;
4563               this.prepareFlushResult = prepareResult;
4564               if (LOG.isDebugEnabled()) {
4565                 LOG.debug(getRegionInfo().getEncodedName() + " : "
4566                   + " Prepared empty flush with seqId:" + flush.getFlushSequenceNumber());
4567               }
4568             }
4569             status.abort("Flush prepare failed with " + prepareResult.result);
4570             // nothing much to do. prepare flush failed because of some reason.
4571           }
4572           return prepareResult;
4573         } else {
4574           // we already have an active snapshot.
4575           if (flush.getFlushSequenceNumber() == this.prepareFlushResult.flushOpSeqId) {
4576             // They define the same flush. Log and continue.
4577             LOG.warn(getRegionInfo().getEncodedName() + " : "
4578                 + "Received a flush prepare marker with the same seqId: "
4579                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4580                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4581             // ignore
4582           } else if (flush.getFlushSequenceNumber() < this.prepareFlushResult.flushOpSeqId) {
4583             // We received a flush with a smaller seqNum than what we have prepared. We can only
4584             // ignore this prepare flush request.
4585             LOG.warn(getRegionInfo().getEncodedName() + " : "
4586                 + "Received a flush prepare marker with a smaller seqId: "
4587                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4588                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4589             // ignore
4590           } else {
4591             // We received a flush with a larger seqNum than what we have prepared
4592             LOG.warn(getRegionInfo().getEncodedName() + " : "
4593                 + "Received a flush prepare marker with a larger seqId: "
4594                 + flush.getFlushSequenceNumber() + " before clearing the previous one with seqId: "
4595                 + prepareFlushResult.flushOpSeqId + ". Ignoring");
4596             // We do not have multiple active snapshots in the memstore or a way to merge current
4597             // memstore snapshot with the contents and resnapshot for now. We cannot take
4598             // another snapshot and drop the previous one because that will cause temporary
4599             // data loss in the secondary. So we ignore this for now, deferring the resolution
4600             // to happen when we see the corresponding flush commit marker. If we have a memstore
4601             // snapshot with x, and later received another prepare snapshot with y (where x < y),
4602             // when we see flush commit for y, we will drop snapshot for x, and can also drop all
4603             // the memstore edits if everything in memstore is < y. This is the usual case for
4604             // RS crash + recovery where we might see consecutive prepare flush wal markers.
4605             // Otherwise, this will cause more memory to be used in secondary replica until a
4606             // further prepare + commit flush is seen and replayed.
4607           }
4608         }
4609       } finally {
4610         status.cleanup();
4611         writestate.notifyAll();
4612       }
4613     }
4614     return null;
4615   }
4616 
4617   @VisibleForTesting
4618   void replayWALFlushCommitMarker(FlushDescriptor flush) throws IOException {
4619     MonitoredTask status = TaskMonitor.get().createStatus("Committing flush " + this);
4620 
4621     // check whether we have the memstore snapshot with the corresponding seqId. Replay to
4622     // secondary region replicas are in order, except for when the region moves or the
4623     // region server crashes. In those cases, we may receive replay requests out of order from
4624     // the original seqIds.
4625     synchronized (writestate) {
4626       try {
4627         if (flush.getFlushSequenceNumber() < lastReplayedOpenRegionSeqId) {
4628           LOG.warn(getRegionInfo().getEncodedName() + " : "
4629             + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4630             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4631             + " of " + lastReplayedOpenRegionSeqId);
4632           return;
4633         }
4634 
4635         if (writestate.flushing) {
4636           PrepareFlushResult prepareFlushResult = this.prepareFlushResult;
4637           if (flush.getFlushSequenceNumber() == prepareFlushResult.flushOpSeqId) {
4638             if (LOG.isDebugEnabled()) {
4639               LOG.debug(getRegionInfo().getEncodedName() + " : "
4640                   + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4641                   + " and a previous prepared snapshot was found");
4642             }
4643             // This is the regular case where we received commit flush after prepare flush
4644             // corresponding to the same seqId.
4645             replayFlushInStores(flush, prepareFlushResult, true);
4646 
4647             // Set down the memstore size by amount of flush.
4648             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4649 
4650             this.prepareFlushResult = null;
4651             writestate.flushing = false;
4652           } else if (flush.getFlushSequenceNumber() < prepareFlushResult.flushOpSeqId) {
4653             // This should not happen normally. However, lets be safe and guard against these cases
4654             // we received a flush commit with a smaller seqId than what we have prepared
4655             // we will pick the flush file up from this commit (if we have not seen it), but we
4656             // will not drop the memstore
4657             LOG.warn(getRegionInfo().getEncodedName() + " : "
4658                 + "Received a flush commit marker with smaller seqId: "
4659                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: "
4660                 + prepareFlushResult.flushOpSeqId + ". Picking up new file, but not dropping"
4661                 + " prepared memstore snapshot");
4662             replayFlushInStores(flush, prepareFlushResult, false);
4663 
4664             // snapshot is not dropped, so memstore sizes should not be decremented
4665             // we still have the prepared snapshot, flushing should still be true
4666           } else {
4667             // This should not happen normally. However, lets be safe and guard against these cases
4668             // we received a flush commit with a larger seqId than what we have prepared
4669             // we will pick the flush file for this. We will also obtain the updates lock and
4670             // look for contents of the memstore to see whether we have edits after this seqId.
4671             // If not, we will drop all the memstore edits and the snapshot as well.
4672             LOG.warn(getRegionInfo().getEncodedName() + " : "
4673                 + "Received a flush commit marker with larger seqId: "
4674                 + flush.getFlushSequenceNumber() + " than what we have prepared with seqId: " +
4675                 prepareFlushResult.flushOpSeqId + ". Picking up new file and dropping prepared"
4676                 + " memstore snapshot");
4677 
4678             replayFlushInStores(flush, prepareFlushResult, true);
4679 
4680             // Set down the memstore size by amount of flush.
4681             this.addAndGetGlobalMemstoreSize(-prepareFlushResult.totalFlushableSize);
4682 
4683             // Inspect the memstore contents to see whether the memstore contains only edits
4684             // with seqId smaller than the flush seqId. If so, we can discard those edits.
4685             dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4686 
4687             this.prepareFlushResult = null;
4688             writestate.flushing = false;
4689           }
4690           // If we were waiting for observing a flush or region opening event for not showing
4691           // partial data after a secondary region crash, we can allow reads now. We can only make
4692           // sure that we are not showing partial data (for example skipping some previous edits)
4693           // until we observe a full flush start and flush commit. So if we were not able to find
4694           // a previous flush we will not enable reads now.
4695           this.setReadsEnabled(true);
4696         } else {
4697           LOG.warn(getRegionInfo().getEncodedName() + " : "
4698               + "Received a flush commit marker with seqId:" + flush.getFlushSequenceNumber()
4699               + ", but no previous prepared snapshot was found");
4700           // There is no corresponding prepare snapshot from before.
4701           // We will pick up the new flushed file
4702           replayFlushInStores(flush, null, false);
4703 
4704           // Inspect the memstore contents to see whether the memstore contains only edits
4705           // with seqId smaller than the flush seqId. If so, we can discard those edits.
4706           dropMemstoreContentsForSeqId(flush.getFlushSequenceNumber(), null);
4707         }
4708 
4709         status.markComplete("Flush commit successful");
4710 
4711         // Update the last flushed sequence id for region.
4712         this.maxFlushedSeqId = flush.getFlushSequenceNumber();
4713 
4714         // advance the mvcc read point so that the new flushed file is visible.
4715         // there may be some in-flight transactions, but they won't be made visible since they are
4716         // either greater than flush seq number or they were already dropped via flush.
4717         // TODO: If we are using FlushAllStoresPolicy, then this can make edits visible from other
4718         // stores while they are still in flight because the flush commit marker will not contain
4719         // flushes from ALL stores.
4720         getMVCC().advanceMemstoreReadPointIfNeeded(flush.getFlushSequenceNumber());
4721 
4722       } catch (FileNotFoundException ex) {
4723         LOG.warn(getRegionInfo().getEncodedName() + " : "
4724             + "At least one of the store files in flush: " + TextFormat.shortDebugString(flush)
4725             + " doesn't exist any more. Skip loading the file(s)", ex);
4726       }
4727       finally {
4728         status.cleanup();
4729         writestate.notifyAll();
4730       }
4731     }
4732 
4733     // C. Finally notify anyone waiting on memstore to clear:
4734     // e.g. checkResources().
4735     synchronized (this) {
4736       notifyAll(); // FindBugs NN_NAKED_NOTIFY
4737     }
4738   }
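
  // Illustrative sketch (not part of this class): a compact restatement of the three branches
  // above for a flush COMMIT marker arriving while a prepared snapshot exists on a secondary
  // replica. The enum and helper are hypothetical names used only for the summary.
  //
  //   enum CommitAction {
  //     REPLAY_AND_DROP_SNAPSHOT,              // commitSeqId == preparedSeqId: the normal case
  //     REPLAY_KEEP_SNAPSHOT,                  // commitSeqId <  preparedSeqId: pick up files only
  //     REPLAY_DROP_SNAPSHOT_AND_STALE_EDITS   // commitSeqId >  preparedSeqId: also drop old edits
  //   }
  //
  //   static CommitAction decide(long commitSeqId, long preparedSeqId) {
  //     if (commitSeqId == preparedSeqId) {
  //       return CommitAction.REPLAY_AND_DROP_SNAPSHOT;
  //     } else if (commitSeqId < preparedSeqId) {
  //       return CommitAction.REPLAY_KEEP_SNAPSHOT;
  //     } else {
  //       return CommitAction.REPLAY_DROP_SNAPSHOT_AND_STALE_EDITS;
  //     }
  //   }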
4739 
4740   /**
4741    * Replays the given flush descriptor by opening the flush files in stores and dropping the
4742    * memstore snapshots if requested.
4743    * @param flush
4744    * @param prepareFlushResult
4745    * @param dropMemstoreSnapshot
4746    * @throws IOException
4747    */
4748   private void replayFlushInStores(FlushDescriptor flush, PrepareFlushResult prepareFlushResult,
4749       boolean dropMemstoreSnapshot)
4750       throws IOException {
4751     for (StoreFlushDescriptor storeFlush : flush.getStoreFlushesList()) {
4752       byte[] family = storeFlush.getFamilyName().toByteArray();
4753       Store store = getStore(family);
4754       if (store == null) {
4755         LOG.warn(getRegionInfo().getEncodedName() + " : "
4756             + "Received a flush commit marker from primary, but the family is not found. "
4757             + "Ignoring StoreFlushDescriptor:" + storeFlush);
4758         continue;
4759       }
4760       List<String> flushFiles = storeFlush.getFlushOutputList();
4761       StoreFlushContext ctx = null;
4762       long startTime = EnvironmentEdgeManager.currentTime();
4763       if (prepareFlushResult == null || prepareFlushResult.storeFlushCtxs == null) {
4764         ctx = store.createFlushContext(flush.getFlushSequenceNumber());
4765       } else {
4766         ctx = prepareFlushResult.storeFlushCtxs.get(family);
4767         startTime = prepareFlushResult.startTime;
4768       }
4769 
4770       if (ctx == null) {
4771         LOG.warn(getRegionInfo().getEncodedName() + " : "
4772             + "Unexpected: flush commit marker received from store "
4773             + Bytes.toString(family) + " but no associated flush context. Ignoring");
4774         continue;
4775       }
4776 
4777       ctx.replayFlush(flushFiles, dropMemstoreSnapshot); // replay the flush
4778 
4779       // Record latest flush time
4780       this.lastStoreFlushTimeMap.put(store, startTime);
4781     }
4782   }
4783 
4784   /**
4785    * Drops the memstore contents after replaying a flush descriptor or region open event,
4786    * if the memstore edits have seqNums smaller than the given seq id.
4787    * @param seqId edits with seqNums smaller than this sequence id may be dropped
4788    * @throws IOException
4789    */
4790   private long dropMemstoreContentsForSeqId(long seqId, Store store) throws IOException {
4791     long totalFreedSize = 0;
4792     this.updatesLock.writeLock().lock();
4793     try {
4794       mvcc.waitForPreviousTransactionsComplete();
4795       long currentSeqId = getSequenceId().get();
4796       if (seqId >= currentSeqId) {
4797         // then we can drop the memstore contents since everything is below this seqId
4798         LOG.info(getRegionInfo().getEncodedName() + " : "
4799             + "Dropping memstore contents as well since replayed flush seqId: "
4800             + seqId + " is greater than current seqId:" + currentSeqId);
4801 
4802         // Prepare flush (take a snapshot) and then abort (drop the snapshot)
4803         if (store == null ) {
4804           for (Store s : stores.values()) {
4805             totalFreedSize += doDropStoreMemstoreContentsForSeqId(s, currentSeqId);
4806           }
4807         } else {
4808           totalFreedSize += doDropStoreMemstoreContentsForSeqId(store, currentSeqId);
4809         }
4810       } else {
4811         LOG.info(getRegionInfo().getEncodedName() + " : "
4812             + "Not dropping memstore contents since replayed flush seqId: "
4813             + seqId + " is smaller than current seqId:" + currentSeqId);
4814       }
4815     } finally {
4816       this.updatesLock.writeLock().unlock();
4817     }
4818     return totalFreedSize;
4819   }
4820 
4821   private long doDropStoreMemstoreContentsForSeqId(Store s, long currentSeqId) throws IOException {
4822     long snapshotSize = s.getFlushableSize();
4823     this.addAndGetGlobalMemstoreSize(-snapshotSize);
4824     StoreFlushContext ctx = s.createFlushContext(currentSeqId);
4825     ctx.prepare();
4826     ctx.abort();
4827     return snapshotSize;
4828   }
4829 
4830   private void replayWALFlushAbortMarker(FlushDescriptor flush) {
4831     // nothing to do for now. A flush abort will cause a RS abort which means that the region
4832     // will be opened somewhere else later. We will see the region open event soon, and replaying
4833     // that will drop the snapshot
4834   }
4835 
4836   private void replayWALFlushCannotFlushMarker(FlushDescriptor flush, long replaySeqId) {
4837     synchronized (writestate) {
4838       if (this.lastReplayedOpenRegionSeqId > replaySeqId) {
4839         LOG.warn(getRegionInfo().getEncodedName() + " : "
4840           + "Skipping replaying flush event :" + TextFormat.shortDebugString(flush)
4841           + " because its sequence id " + replaySeqId + " is smaller than this region's "
4842           + "lastReplayedOpenRegionSeqId of " + lastReplayedOpenRegionSeqId);
4843         return;
4844       }
4845 
4846       // If we were waiting for observing a flush or region opening event for not showing partial
4847       // data after a secondary region crash, we can allow reads now. This event means that the
4848       // primary was not able to flush because memstore is empty when we requested flush. By the
4849       // time we observe this, we are guaranteed to have up to date seqId with our previous
4850       // assignment.
4851       this.setReadsEnabled(true);
4852     }
4853   }
4854 
4855   @VisibleForTesting
4856   PrepareFlushResult getPrepareFlushResult() {
4857     return prepareFlushResult;
4858   }
4859 
4860   void replayWALRegionEventMarker(RegionEventDescriptor regionEvent) throws IOException {
4861     checkTargetRegion(regionEvent.getEncodedRegionName().toByteArray(),
4862       "RegionEvent marker from WAL ", regionEvent);
4863 
4864     startRegionOperation(Operation.REPLAY_EVENT);
4865     try {
4866       if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4867         return; // if primary nothing to do
4868       }
4869 
4870       if (regionEvent.getEventType() == EventType.REGION_CLOSE) {
4871         // nothing to do on REGION_CLOSE for now.
4872         return;
4873       }
4874       if (regionEvent.getEventType() != EventType.REGION_OPEN) {
4875         LOG.warn(getRegionInfo().getEncodedName() + " : "
4876             + "Unknown region event received, ignoring :"
4877             + TextFormat.shortDebugString(regionEvent));
4878         return;
4879       }
4880 
4881       if (LOG.isDebugEnabled()) {
4882         LOG.debug(getRegionInfo().getEncodedName() + " : "
4883           + "Replaying region open event marker " + TextFormat.shortDebugString(regionEvent));
4884       }
4885 
4886       // we will use writestate as a coarse-grain lock for all the replay events
4887       synchronized (writestate) {
4888         // Replication can deliver events out of order when primary region moves or the region
4889         // server crashes, since there is no coordination between replication of different wal files
4890         // belonging to different region servers. We have to safeguard against this case by using
4891         // region open event's seqid. Since this is the first event that the region puts (after
4892         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
4893         // smaller than this seqId
4894         if (this.lastReplayedOpenRegionSeqId <= regionEvent.getLogSequenceNumber()) {
4895           this.lastReplayedOpenRegionSeqId = regionEvent.getLogSequenceNumber();
4896         } else {
4897           LOG.warn(getRegionInfo().getEncodedName() + " : "
4898             + "Skipping replaying region event :" + TextFormat.shortDebugString(regionEvent)
4899             + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
4900             + " of " + lastReplayedOpenRegionSeqId);
4901           return;
4902         }
4903 
4904         // region open lists all the files that the region has at the time of the opening. Just pick
4905         // all the files and drop prepared flushes and empty memstores
4906         for (StoreDescriptor storeDescriptor : regionEvent.getStoresList()) {
4907           // stores of primary may be different now
4908           byte[] family = storeDescriptor.getFamilyName().toByteArray();
4909           Store store = getStore(family);
4910           if (store == null) {
4911             LOG.warn(getRegionInfo().getEncodedName() + " : "
4912                 + "Received a region open marker from primary, but the family is not found. "
4913                 + "Ignoring. StoreDescriptor:" + storeDescriptor);
4914             continue;
4915           }
4916 
4917           long storeSeqId = store.getMaxSequenceId();
4918           List<String> storeFiles = storeDescriptor.getStoreFileList();
4919           try {
4920             store.refreshStoreFiles(storeFiles); // replace the files with the new ones
4921           } catch (FileNotFoundException ex) {
4922             LOG.warn(getRegionInfo().getEncodedName() + " : "
4923                     + "At least one of the store files: " + storeFiles
4924                     + " doesn't exist any more. Skip loading the file(s)", ex);
4925             continue;
4926           }
4927           if (store.getMaxSequenceId() != storeSeqId) {
4928             // Record latest flush time if we picked up new files
4929             lastStoreFlushTimeMap.put(store, EnvironmentEdgeManager.currentTime());
4930           }
4931 
4932           if (writestate.flushing) {
4933             // only drop memstore snapshots if they are smaller than last flush for the store
4934             if (this.prepareFlushResult.flushOpSeqId <= regionEvent.getLogSequenceNumber()) {
4935               StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
4936                   null : this.prepareFlushResult.storeFlushCtxs.get(family);
4937               if (ctx != null) {
4938                 long snapshotSize = store.getFlushableSize();
4939                 ctx.abort();
4940                 this.addAndGetGlobalMemstoreSize(-snapshotSize);
4941                 this.prepareFlushResult.storeFlushCtxs.remove(family);
4942               }
4943             }
4944           }
4945 
4946           // Drop the memstore contents if they are now smaller than the latest seen flushed file
4947           dropMemstoreContentsForSeqId(regionEvent.getLogSequenceNumber(), store);
4948           if (storeSeqId > this.maxFlushedSeqId) {
4949             this.maxFlushedSeqId = storeSeqId;
4950           }
4951         }
4952 
4953         // if all stores ended up dropping their snapshots, we can safely drop the
4954         // prepareFlushResult
4955         dropPrepareFlushIfPossible();
4956 
4957         // advance the mvcc read point so that the new flushed file is visible.
4958         // there may be some in-flight transactions, but they won't be made visible since they are
4959         // either greater than flush seq number or they were already dropped via flush.
4960         getMVCC().advanceMemstoreReadPointIfNeeded(this.maxFlushedSeqId);
4961 
4962         // If we were waiting for observing a flush or region opening event for not showing partial
4963         // data after a secondary region crash, we can allow reads now.
4964         this.setReadsEnabled(true);
4965 
4966         // C. Finally notify anyone waiting on memstore to clear:
4967         // e.g. checkResources().
4968         synchronized (this) {
4969           notifyAll(); // FindBugs NN_NAKED_NOTIFY
4970         }
4971       }
4972       logRegionFiles();
4973     } finally {
4974       closeRegionOperation(Operation.REPLAY_EVENT);
4975     }
4976   }
4977 
4978   void replayWALBulkLoadEventMarker(WALProtos.BulkLoadDescriptor bulkLoadEvent) throws IOException {
4979     checkTargetRegion(bulkLoadEvent.getEncodedRegionName().toByteArray(),
4980       "BulkLoad marker from WAL ", bulkLoadEvent);
4981 
4982     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
4983       return; // if primary nothing to do
4984     }
4985 
4986     if (LOG.isDebugEnabled()) {
4987       LOG.debug(getRegionInfo().getEncodedName() + " : "
4988               +  "Replaying bulkload event marker " + TextFormat.shortDebugString(bulkLoadEvent));
4989     }
4990     // check if multiple families involved
4991     boolean multipleFamilies = false;
4992     byte[] family = null;
4993     for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
4994       byte[] fam = storeDescriptor.getFamilyName().toByteArray();
4995       if (family == null) {
4996         family = fam;
4997       } else if (!Bytes.equals(family, fam)) {
4998         multipleFamilies = true;
4999         break;
5000       }
5001     }
5002 
5003     startBulkRegionOperation(multipleFamilies);
5004     try {
5005       // we will use writestate as a coarse-grain lock for all the replay events
5006       synchronized (writestate) {
5007         // Replication can deliver events out of order when primary region moves or the region
5008         // server crashes, since there is no coordination between replication of different wal files
5009         // belonging to different region servers. We have to safeguard against this case by using
5010         // region open event's seqid. Since this is the first event that the region puts (after
5011         // possibly flushing recovered.edits), after seeing this event, we can ignore every edit
5012         // smaller than this seqId
5013         if (bulkLoadEvent.getBulkloadSeqNum() >= 0
5014             && this.lastReplayedOpenRegionSeqId >= bulkLoadEvent.getBulkloadSeqNum()) {
5015           LOG.warn(getRegionInfo().getEncodedName() + " : "
5016               + "Skipping replaying bulkload event :"
5017               + TextFormat.shortDebugString(bulkLoadEvent)
5018               + " because its sequence id is smaller than this region's lastReplayedOpenRegionSeqId"
5019               + " =" + lastReplayedOpenRegionSeqId);
5020 
5021           return;
5022         }
5023 
5024         for (StoreDescriptor storeDescriptor : bulkLoadEvent.getStoresList()) {
5025           // stores of primary may be different now
5026           family = storeDescriptor.getFamilyName().toByteArray();
5027           Store store = getStore(family);
5028           if (store == null) {
5029             LOG.warn(getRegionInfo().getEncodedName() + " : "
5030                     + "Received a bulk load marker from primary, but the family is not found. "
5031                     + "Ignoring. StoreDescriptor:" + storeDescriptor);
5032             continue;
5033           }
5034 
5035           List<String> storeFiles = storeDescriptor.getStoreFileList();
5036           for (String storeFile : storeFiles) {
5037             StoreFileInfo storeFileInfo = null;
5038             try {
5039               storeFileInfo = fs.getStoreFileInfo(Bytes.toString(family), storeFile);
5040               store.bulkLoadHFile(storeFileInfo);
5041             } catch(FileNotFoundException ex) {
5042               LOG.warn(getRegionInfo().getEncodedName() + " : "
5043                       + ((storeFileInfo != null) ? storeFileInfo.toString() :
5044                             (new Path(Bytes.toString(family), storeFile)).toString())
5045                       + " doesn't exist any more. Skip loading the file");
5046             }
5047           }
5048         }
5049       }
5050       if (bulkLoadEvent.getBulkloadSeqNum() > 0) {
5051         getMVCC().advanceMemstoreReadPointIfNeeded(bulkLoadEvent.getBulkloadSeqNum());
5052       }
5053     } finally {
5054       closeBulkRegionOperation();
5055     }
5056   }
5057 
5058   /**
5059    * If all stores ended up dropping their snapshots, we can safely drop the prepareFlushResult
5060    */
5061   private void dropPrepareFlushIfPossible() {
5062     if (writestate.flushing) {
5063       boolean canDrop = true;
5064       if (prepareFlushResult.storeFlushCtxs != null) {
5065         for (Entry<byte[], StoreFlushContext> entry
5066             : prepareFlushResult.storeFlushCtxs.entrySet()) {
5067           Store store = getStore(entry.getKey());
5068           if (store == null) {
5069             continue;
5070           }
5071           if (store.getSnapshotSize() > 0) {
5072             canDrop = false;
5073             break;
5074           }
5075         }
5076       }
5077 
5078       // this means that all the stores in the region have finished flushing, but the WAL marker
5079       // may not have been written or we did not receive it yet.
5080       if (canDrop) {
5081         writestate.flushing = false;
5082         this.prepareFlushResult = null;
5083       }
5084     }
5085   }
5086 
5087   @Override
5088   public boolean refreshStoreFiles() throws IOException {
5089     if (ServerRegionReplicaUtil.isDefaultReplica(this.getRegionInfo())) {
5090       return false; // if primary nothing to do
5091     }
5092 
5093     if (LOG.isDebugEnabled()) {
5094       LOG.debug(getRegionInfo().getEncodedName() + " : "
5095           + "Refreshing store files to see whether we can free up memstore");
5096     }
5097 
5098     long totalFreedSize = 0;
5099 
5100     long smallestSeqIdInStores = Long.MAX_VALUE;
5101 
5102     startRegionOperation(); // obtain region close lock
5103     try {
5104       synchronized (writestate) {
5105         for (Store store : getStores()) {
5106           // TODO: some stores might see new data from flush, while others do not which
5107           // MIGHT break atomic edits across column families.
5108           long maxSeqIdBefore = store.getMaxSequenceId();
5109 
5110           // refresh the store files. This is similar to observing a region open wal marker.
5111           store.refreshStoreFiles();
5112 
5113           long storeSeqId = store.getMaxSequenceId();
5114           if (storeSeqId < smallestSeqIdInStores) {
5115             smallestSeqIdInStores = storeSeqId;
5116           }
5117 
5118           // see whether we can drop the memstore or the snapshot
5119           if (storeSeqId > maxSeqIdBefore) {
5120 
5121             if (writestate.flushing) {
5122               // only drop memstore snapshots if they are smaller than last flush for the store
5123               if (this.prepareFlushResult.flushOpSeqId <= storeSeqId) {
5124                 StoreFlushContext ctx = this.prepareFlushResult.storeFlushCtxs == null ?
5125                     null : this.prepareFlushResult.storeFlushCtxs.get(store.getFamily().getName());
5126                 if (ctx != null) {
5127                   long snapshotSize = store.getFlushableSize();
5128                   ctx.abort();
5129                   this.addAndGetGlobalMemstoreSize(-snapshotSize);
5130                   this.prepareFlushResult.storeFlushCtxs.remove(store.getFamily().getName());
5131                   totalFreedSize += snapshotSize;
5132                 }
5133               }
5134             }
5135 
5136             // Drop the memstore contents if they are now smaller than the latest seen flushed file
5137             totalFreedSize += dropMemstoreContentsForSeqId(storeSeqId, store);
5138           }
5139         }
5140 
5141         // if all stores ended up dropping their snapshots, we can safely drop the
5142         // prepareFlushResult
5143         dropPrepareFlushIfPossible();
5144 
5145         // advance the mvcc read point so that the new flushed files are visible.
5146         // there may be some in-flight transactions, but they won't be made visible since they are
5147         // either greater than flush seq number or they were already picked up via flush.
5148         for (Store s : getStores()) {
5149           getMVCC().advanceMemstoreReadPointIfNeeded(s.getMaxMemstoreTS());
5150         }
5151 
5152         // smallestSeqIdInStores is the seqId that we have a corresponding hfile for. We can safely
5153         // skip all edits that are to be replayed in the future and that have a smaller seqId
5154         // than this. We update lastReplayedOpenRegionSeqId so that we can skip all edits whose
5155         // data we have already picked up from the flushed files
5156         if (this.lastReplayedOpenRegionSeqId < smallestSeqIdInStores) {
5157           this.lastReplayedOpenRegionSeqId = smallestSeqIdInStores;
5158         }
5159       }
5160       // C. Finally notify anyone waiting on memstore to clear:
5161       // e.g. checkResources().
5162       synchronized (this) {
5163         notifyAll(); // FindBugs NN_NAKED_NOTIFY
5164       }
5165       return totalFreedSize > 0;
5166     } finally {
5167       closeRegionOperation();
5168     }
5169   }
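
       // A minimal usage sketch of refreshStoreFiles(), assuming an already-opened secondary
       // replica HRegion named "region" (the name is illustrative):
       //
       //   if (region.refreshStoreFiles()) {
       //     // newly flushed or compacted files were picked up and some memstore memory was freed
       //   }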
5170 
5171   private void logRegionFiles() {
5172     if (LOG.isTraceEnabled()) {
5173       LOG.trace(getRegionInfo().getEncodedName() + " : Store files for region: ");
5174       for (Store s : stores.values()) {
5175         Collection<StoreFile> storeFiles = s.getStorefiles();
5176         if (storeFiles == null) continue;
5177         for (StoreFile sf : storeFiles) {
5178           LOG.trace(getRegionInfo().getEncodedName() + " : " + sf);
5179         }
5180       }
5181     }
5182   }
5183 
5184   /** Checks whether the given encodedRegionName is either equal to this region's encoded name,
5185    * or is the encoded name of the primary region for the range served by this secondary replica.
5186    */
5187   private void checkTargetRegion(byte[] encodedRegionName, String exceptionMsg, Object payload)
5188       throws WrongRegionException {
5189     if (Bytes.equals(this.getRegionInfo().getEncodedNameAsBytes(), encodedRegionName)) {
5190       return;
5191     }
5192 
5193     if (!RegionReplicaUtil.isDefaultReplica(this.getRegionInfo()) &&
5194         Bytes.equals(encodedRegionName,
5195           this.fs.getRegionInfoForFS().getEncodedNameAsBytes())) {
5196       return;
5197     }
5198 
5199     throw new WrongRegionException(exceptionMsg + payload
5200       + " targeted for region " + Bytes.toStringBinary(encodedRegionName)
5201       + " does not match this region: " + this.getRegionInfo());
5202   }
5203 
5204   /**
5205    * Used by tests
5206    * @param s Store to add the edit to.
5207    * @param cell Cell to add.
5208    * @return True if we should flush.
5209    */
5210   protected boolean restoreEdit(final Store s, final Cell cell) {
5211     long kvSize = s.add(cell).getFirst();
5212     if (this.rsAccounting != null) {
5213       rsAccounting.addAndGetRegionReplayEditsSize(getRegionInfo().getRegionName(), kvSize);
5214     }
5215     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
5216   }
5217 
5218   /*
5219    * @param fs
5220    * @param p File to check.
5221    * @return True if file was zero-length (and if so, we'll delete it in here).
5222    * @throws IOException
5223    */
5224   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
5225       throws IOException {
5226     FileStatus stat = fs.getFileStatus(p);
5227     if (stat.getLen() > 0) return false;
5228     LOG.warn("File " + p + " is zero-length, deleting.");
5229     fs.delete(p, false);
5230     return true;
5231   }
5232 
5233   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
5234     if (family.isMobEnabled()) {
5235       if (HFile.getFormatVersion(this.conf) < HFile.MIN_FORMAT_VERSION_WITH_TAGS) {
5236         throw new IOException("A minimum HFile version of "
5237             + HFile.MIN_FORMAT_VERSION_WITH_TAGS
5238             + " is required for MOB feature. Consider setting " + HFile.FORMAT_VERSION_KEY
5239             + " accordingly.");
5240       }
5241       return new HMobStore(this, family, this.conf);
5242     }
5243     return new HStore(this, family, this.conf);
5244   }
5245 
5246   @Override
5247   public Store getStore(final byte[] column) {
5248     return this.stores.get(column);
5249   }
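
       // A minimal usage sketch, assuming a region with a column family named "cf" (the family
       // name is illustrative):
       //
       //   Store store = region.getStore(Bytes.toBytes("cf"));
       //   if (store != null) {
       //     long maxSeqId = store.getMaxSequenceId();
       //   }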
5250 
5251   /**
5252    * Return the HStore instance for the given cell. Does not do any copy: as the number of
5253    * stores is limited, we iterate over them.
5254    */
5255   private Store getStore(Cell cell) {
5256     for (Map.Entry<byte[], Store> famStore : stores.entrySet()) {
5257       if (Bytes.equals(
5258           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
5259           famStore.getKey(), 0, famStore.getKey().length)) {
5260         return famStore.getValue();
5261       }
5262     }
5263 
5264     return null;
5265   }
5266 
5267   @Override
5268   public List<Store> getStores() {
5269     List<Store> list = new ArrayList<Store>(stores.size());
5270     list.addAll(stores.values());
5271     return list;
5272   }
5273 
5274   @Override
5275   public List<String> getStoreFileList(final byte [][] columns)
5276     throws IllegalArgumentException {
5277     List<String> storeFileNames = new ArrayList<String>();
5278     synchronized(closeLock) {
5279       for(byte[] column : columns) {
5280         Store store = this.stores.get(column);
5281         if (store == null) {
5282           throw new IllegalArgumentException("No column family : " +
5283               new String(column) + " available");
5284         }
5285         Collection<StoreFile> storeFiles = store.getStorefiles();
5286         if (storeFiles == null) continue;
5287         for (StoreFile storeFile: storeFiles) {
5288           storeFileNames.add(storeFile.getPath().toString());
5289         }
5290 
5291         logRegionFiles();
5292       }
5293     }
5294     return storeFileNames;
5295   }
5296 
5297   //////////////////////////////////////////////////////////////////////////////
5298   // Support code
5299   //////////////////////////////////////////////////////////////////////////////
5300 
5301   /** Make sure this is a valid row for the HRegion */
5302   void checkRow(final byte [] row, String op) throws IOException {
5303     if (!rowIsInRange(getRegionInfo(), row)) {
5304       throw new WrongRegionException("Requested row out of range for " +
5305           op + " on HRegion " + this + ", startKey='" +
5306           Bytes.toStringBinary(getRegionInfo().getStartKey()) + "', getEndKey()='" +
5307           Bytes.toStringBinary(getRegionInfo().getEndKey()) + "', row='" +
5308           Bytes.toStringBinary(row) + "'");
5309     }
5310   }
5311 
5312   @Override
5313   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
5314     startRegionOperation();
5315     try {
5316       return getRowLockInternal(row, waitForLock);
5317     } finally {
5318       closeRegionOperation();
5319     }
5320   }
5321 
5322   /**
5323    * A version of getRowLock(byte[], boolean) to use when a region operation has already been
5324    * started (the calling thread has already acquired the region-close-guard lock).
5325    */
5326   protected RowLock getRowLockInternal(byte[] row, boolean waitForLock) throws IOException {
5327     HashedBytes rowKey = new HashedBytes(row);
5328     RowLockContext rowLockContext = new RowLockContext(rowKey);
5329 
5330     // loop until we acquire the row lock (unless !waitForLock)
5331     while (true) {
5332       RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
5333       if (existingContext == null) {
5334         // Row is not already locked by any thread, use newly created context.
5335         break;
5336       } else if (existingContext.ownedByCurrentThread()) {
5337         // Row is already locked by current thread, reuse existing context instead.
5338         rowLockContext = existingContext;
5339         break;
5340       } else {
5341         if (!waitForLock) {
5342           return null;
5343         }
5344         TraceScope traceScope = null;
5345         try {
5346           if (Trace.isTracing()) {
5347             traceScope = Trace.startSpan("HRegion.getRowLockInternal");
5348           }
5349           // Row is already locked by some other thread, give up or wait for it
5350           if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
5351             if(traceScope != null) {
5352               traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
5353             }
5354             throw new IOException("Timed out waiting for lock for row: " + rowKey);
5355           }
5356           rowLockContext.setThreadName(Thread.currentThread().getName());
5357           if (traceScope != null) traceScope.close();
5358           traceScope = null;
5359         } catch (InterruptedException ie) {
5360           LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
5361           InterruptedIOException iie = new InterruptedIOException();
5362           iie.initCause(ie);
5363           throw iie;
5364         } finally {
5365           if (traceScope != null) traceScope.close();
5366         }
5367       }
5368     }
5369 
5370     // allocate new lock for this thread
5371     return rowLockContext.newLock();
5372   }
5373 
5374   /**
5375    * Acquires a lock on the given row.
5376    * The same thread may acquire multiple locks on the same row.
5377    * @return the acquired row lock
5378    * @throws IOException if the lock could not be acquired after waiting
5379    */
5380   public RowLock getRowLock(byte[] row) throws IOException {
5381     return getRowLock(row, true);
5382   }
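
       // A minimal usage sketch, assuming "region" is this HRegion and "row" is a row key it
       // serves (names are illustrative):
       //
       //   RowLock lock = region.getRowLock(row);   // blocks; throws IOException on timeout
       //   try {
       //     // do the row-scoped work while holding the lock
       //   } finally {
       //     lock.release();
       //   }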
5383 
5384   @Override
5385   public void releaseRowLocks(List<RowLock> rowLocks) {
5386     if (rowLocks != null) {
5387       for (RowLock rowLock : rowLocks) {
5388         rowLock.release();
5389       }
5390       rowLocks.clear();
5391     }
5392   }
5393 
5394   /**
5395    * Determines whether multiple column families are present
5396    * Precondition: familyPaths is not null
5397    *
5398    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
5399    */
5400   private static boolean hasMultipleColumnFamilies(Collection<Pair<byte[], String>> familyPaths) {
5401     boolean multipleFamilies = false;
5402     byte[] family = null;
5403     for (Pair<byte[], String> pair : familyPaths) {
5404       byte[] fam = pair.getFirst();
5405       if (family == null) {
5406         family = fam;
5407       } else if (!Bytes.equals(family, fam)) {
5408         multipleFamilies = true;
5409         break;
5410       }
5411     }
5412     return multipleFamilies;
5413   }
5414 
5415   @Override
5416   public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths, boolean assignSeqId,
5417       BulkLoadListener bulkLoadListener) throws IOException {
5418     return bulkLoadHFiles(familyPaths, assignSeqId, bulkLoadListener, false);
5419   }
5420 
5421   @Override
5422   public Map<byte[], List<Path>> bulkLoadHFiles(Collection<Pair<byte[], String>> familyPaths,
5423       boolean assignSeqId, BulkLoadListener bulkLoadListener, boolean copyFile) throws IOException {
5424     long seqId = -1;
5425     Map<byte[], List<Path>> storeFiles = new TreeMap<byte[], List<Path>>(Bytes.BYTES_COMPARATOR);
5426     Map<String, Long> storeFilesSizes = new HashMap<String, Long>();
5427     Preconditions.checkNotNull(familyPaths);
5428     // we need writeLock for multi-family bulk load
5429     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
5430     boolean isSuccessful = false;
5431     try {
5432       this.writeRequestsCount.increment();
5433 
5434       // A split may have happened after the split keys were gathered but before the
5435       // HRegion's write lock was taken. We need to validate each HFile against the region
5436       // before attempting to bulk load any of them
5437       List<IOException> ioes = new ArrayList<IOException>();
5438       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
5439       for (Pair<byte[], String> p : familyPaths) {
5440         byte[] familyName = p.getFirst();
5441         String path = p.getSecond();
5442 
5443         Store store = getStore(familyName);
5444         if (store == null) {
5445           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
5446               "No such column family " + Bytes.toStringBinary(familyName));
5447           ioes.add(ioe);
5448         } else {
5449           try {
5450             store.assertBulkLoadHFileOk(new Path(path));
5451           } catch (WrongRegionException wre) {
5452             // recoverable (file doesn't fit in region)
5453             failures.add(p);
5454           } catch (IOException ioe) {
5455             // unrecoverable (hdfs problem)
5456             ioes.add(ioe);
5457           }
5458         }
5459       }
5460 
5461       // validation failed because of some sort of IO problem.
5462       if (ioes.size() != 0) {
5463         IOException e = MultipleIOException.createIOException(ioes);
5464         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
5465         throw e;
5466       }
5467 
5468       // validation failed, bail out before doing anything permanent.
5469       if (failures.size() != 0) {
5470         StringBuilder list = new StringBuilder();
5471         for (Pair<byte[], String> p : failures) {
5472           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
5473           .append(p.getSecond());
5474         }
5475         // problem when validating
5476         LOG.warn("There was a recoverable bulk load failure likely due to a" +
5477             " split.  These (family, HFile) pairs were not loaded: " + list);
5478         return null;
5479       }
5480 
5481       // We need to assign a sequential ID that's in between two memstores in order to preserve
5482       // the guarantee that all the edits lower than the highest sequential ID from all the
5483       // HFiles are flushed on disk. See HBASE-10958.  The sequence id returned when we flush is
5484       // guaranteed to be one beyond the file made when we flushed (or if nothing to flush, it is
5485       // a sequence id that we can be sure is beyond the last hfile written).
5486       if (assignSeqId) {
5487         FlushResult fs = flushcache(true, false);
5488         if (fs.isFlushSucceeded()) {
5489           seqId = ((FlushResultImpl)fs).flushSequenceId;
5490         } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
5491           seqId = ((FlushResultImpl)fs).flushSequenceId;
5492         } else if (fs.getResult() == FlushResult.Result.CANNOT_FLUSH) {
5493           // CANNOT_FLUSH may mean that a flush is already on-going
5494           // we need to wait for that flush to complete
5495           waitForFlushes();
5496         } else {
5497           throw new IOException("Could not bulk load with an assigned sequential ID because the "+
5498               "flush didn't run. Reason for not flushing: " + ((FlushResultImpl)fs).failureReason);
5499         }
5500       }
5501 
5502       Map<byte[], List<Pair<Path, Path>>> familyWithFinalPath =
5503           new TreeMap<>(Bytes.BYTES_COMPARATOR);
5504       for (Pair<byte[], String> p : familyPaths) {
5505         byte[] familyName = p.getFirst();
5506         String path = p.getSecond();
5507         Store store = getStore(familyName);
5508         if (!familyWithFinalPath.containsKey(familyName)) {
5509           familyWithFinalPath.put(familyName, new ArrayList<Pair<Path, Path>>());
5510         }
5511         List<Pair<Path, Path>> lst = familyWithFinalPath.get(familyName);
5512         try {
5513           String finalPath = path;
5514           if (bulkLoadListener != null) {
5515             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path, copyFile);
5516           }
5517           Pair<Path, Path> pair = ((HStore)store).preBulkLoadHFile(finalPath, seqId);
5518           lst.add(pair);
5519         } catch (IOException ioe) {
5520           // A failure here can cause an atomicity violation that we currently
5521           // cannot recover from since it is likely a failed HDFS operation.
5522 
5523           LOG.error("There was a partial failure due to IO when attempting to" +
5524               " load " + Bytes.toString(p.getFirst()) + " : " + p.getSecond(), ioe);
5525           if (bulkLoadListener != null) {
5526             try {
5527               bulkLoadListener.failedBulkLoad(familyName, path);
5528             } catch (Exception ex) {
5529               LOG.error("Error while calling failedBulkLoad for family " +
5530                   Bytes.toString(familyName) + " with path " + path, ex);
5531             }
5532           }
5533           throw ioe;
5534         }
5535       }
5536 
5537       if (this.getCoprocessorHost() != null) {
5538         for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
5539           this.getCoprocessorHost().preCommitStoreFile(entry.getKey(), entry.getValue());
5540         }
5541       }
5542       for (Map.Entry<byte[], List<Pair<Path, Path>>> entry : familyWithFinalPath.entrySet()) {
5543         byte[] familyName = entry.getKey();
5544         for (Pair<Path, Path> p : entry.getValue()) {
5545           String path = p.getFirst().toString();
5546           Path commitedStoreFile = p.getSecond();
5547           Store store = getStore(familyName);
5548           try {
5549             store.bulkLoadHFile(familyName, path, commitedStoreFile);
5550             // Note the size of the store file
5551             try {
5552               FileSystem fs = commitedStoreFile.getFileSystem(baseConf);
5553               storeFilesSizes.put(commitedStoreFile.getName(), fs.getFileStatus(commitedStoreFile)
5554                   .getLen());
5555             } catch (IOException e) {
5556               LOG.warn("Failed to find the size of hfile " + commitedStoreFile);
5557               storeFilesSizes.put(commitedStoreFile.getName(), 0L);
5558             }
5559 
5560             if(storeFiles.containsKey(familyName)) {
5561               storeFiles.get(familyName).add(commitedStoreFile);
5562             } else {
5563               List<Path> storeFileNames = new ArrayList<Path>();
5564               storeFileNames.add(commitedStoreFile);
5565               storeFiles.put(familyName, storeFileNames);
5566             }
5567             if (bulkLoadListener != null) {
5568               bulkLoadListener.doneBulkLoad(familyName, path);
5569             }
5570           } catch (IOException ioe) {
5571             // A failure here can cause an atomicity violation that we currently
5572             // cannot recover from since it is likely a failed HDFS operation.
5573 
5574             // TODO Need a better story for reverting partial failures due to HDFS.
5575             LOG.error("There was a partial failure due to IO when attempting to" +
5576                 " load " + p.getFirst() + " : " + p.getSecond(), ioe);
5577             if (bulkLoadListener != null) {
5578               try {
5579                 bulkLoadListener.failedBulkLoad(familyName, path);
5580               } catch (Exception ex) {
5581                 LOG.error("Error while calling failedBulkLoad for family " +
5582                     Bytes.toString(familyName) + " with path " + path, ex);
5583               }
5584             }
5585             throw ioe;
5586           }
5587         }
5588 
5589       }
5590 
5591       isSuccessful = true;
5592     } finally {
5593       if (wal != null && !storeFiles.isEmpty()) {
5594         // write a bulk load event marker for the hfiles that were loaded, even if not all were
5595         try {
5596           WALProtos.BulkLoadDescriptor loadDescriptor = ProtobufUtil.toBulkLoadDescriptor(
5597               this.getRegionInfo().getTable(),
5598               ByteStringer.wrap(this.getRegionInfo().getEncodedNameAsBytes()), storeFiles,
5599               storeFilesSizes, seqId);
5600           WALUtil.writeBulkLoadMarkerAndSync(wal, this.htableDescriptor, getRegionInfo(),
5601               loadDescriptor, sequenceId);
5602         } catch (IOException ioe) {
5603           if (this.rsServices != null) {
5604             // Have to abort region server because some hfiles has been loaded but we can't write
5605             // the event into WAL
5606             isSuccessful = false;
5607             this.rsServices.abort("Failed to write bulk load event into WAL.", ioe);
5608           }
5609         }
5610       }
5611 
5612       closeBulkRegionOperation();
5613     }
5614     return isSuccessful ? storeFiles : null;
5615   }
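
       // A minimal usage sketch, assuming two pre-written hfiles for family "cf" whose paths are
       // held in the Strings hfile1 and hfile2 (all names are illustrative):
       //
       //   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
       //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"), hfile1));
       //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"), hfile2));
       //   Map<byte[], List<Path>> committed = region.bulkLoadHFiles(familyPaths, true, null);
       //   if (committed == null) {
       //     // validation failed (for example a file no longer fits this region after a split)
       //   }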
5616 
5617   @Override
5618   public boolean equals(Object o) {
5619     return o instanceof HRegion && Bytes.equals(getRegionInfo().getRegionName(),
5620                                                 ((HRegion) o).getRegionInfo().getRegionName());
5621   }
5622 
5623   @Override
5624   public int hashCode() {
5625     return Bytes.hashCode(getRegionInfo().getRegionName());
5626   }
5627 
5628   @Override
5629   public String toString() {
5630     return getRegionInfo().getRegionNameAsString();
5631   }
5632 
5633   /**
5634    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
5635    */
5636   class RegionScannerImpl implements RegionScanner {
5637     // Package local for testability
5638     KeyValueHeap storeHeap = null;
5639     /** Heap of key-values that are not essential for the provided filters and are thus read
5640      * on demand, if on-demand column family loading is enabled.*/
5641     KeyValueHeap joinedHeap = null;
5642     /**
5643      * If the joined heap data gathering is interrupted due to scan limits, this will
5644      * contain the row for which we are populating the values.*/
5645     protected Cell joinedContinuationRow = null;
5646     protected final byte[] stopRow;
5647     private final FilterWrapper filter;
5648     private ScannerContext defaultScannerContext;
5649     protected int isScan;
5650     private boolean filterClosed = false;
5651     private long readPt;
5652     private long maxResultSize;
5653     protected HRegion region;
5654 
5655     @Override
5656     public HRegionInfo getRegionInfo() {
5657       return region.getRegionInfo();
5658     }
5659 
5660     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
5661         throws IOException {
5662 
5663       this.region = region;
5664       this.maxResultSize = scan.getMaxResultSize();
5665       if (scan.hasFilter()) {
5666         this.filter = new FilterWrapper(scan.getFilter());
5667       } else {
5668         this.filter = null;
5669       }
5670 
5671       /**
5672        * By default, calls to next/nextRaw must enforce the batch limit. Thus, construct a default
5673        * scanner context that can be used to enforce the batch limit in the event that a
5674        * ScannerContext is not specified during an invocation of next/nextRaw
5675        */
5676       defaultScannerContext = ScannerContext.newBuilder().setBatchLimit(scan.getBatch()).build();
5677 
5678       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
5679         this.stopRow = null;
5680       } else {
5681         this.stopRow = scan.getStopRow();
5682       }
5683       // If we are doing a get, we want to be [startRow,endRow] normally
5684       // it is [startRow,endRow) and if startRow=endRow we get nothing.
5685       this.isScan = scan.isGetScan() ? -1 : 0;
5686 
5687       // synchronize on scannerReadPoints so that nobody calculates
5688       // getSmallestReadPoint, before scannerReadPoints is updated.
5689       IsolationLevel isolationLevel = scan.getIsolationLevel();
5690       synchronized(scannerReadPoints) {
5691         this.readPt = getReadpoint(isolationLevel);
5692         scannerReadPoints.put(this, this.readPt);
5693       }
5694 
5695       // Here we separate all scanners into two lists - scanner that provide data required
5696       // by the filter to operate (scanners list) and all others (joinedScanners list).
5697       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5698       List<KeyValueScanner> joinedScanners
5699         = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
5700       // Store all already instantiated scanners for exception handling
5701       List<KeyValueScanner> instantiatedScanners = new ArrayList<KeyValueScanner>();
5702       if (additionalScanners != null && !additionalScanners.isEmpty()) {
5703         scanners.addAll(additionalScanners);
5704         instantiatedScanners.addAll(additionalScanners);
5705       }
5706 
5707       try {
5708         for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
5709           scan.getFamilyMap().entrySet()) {
5710           Store store = stores.get(entry.getKey());
5711           KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
5712           if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
5713               || this.filter.isFamilyEssential(entry.getKey())) {
5714             scanners.add(scanner);
5715           } else {
5716             joinedScanners.add(scanner);
5717           }
5718         }
5719         initializeKVHeap(scanners, joinedScanners, region);
5720       } catch (Throwable t) {
5721         throw handleException(instantiatedScanners, t);
5722       }
5723     }
5724 
5725     protected void initializeKVHeap(List<KeyValueScanner> scanners,
5726         List<KeyValueScanner> joinedScanners, HRegion region)
5727         throws IOException {
5728       this.storeHeap = new KeyValueHeap(scanners, region.comparator);
5729       if (!joinedScanners.isEmpty()) {
5730         this.joinedHeap = new KeyValueHeap(joinedScanners, region.comparator);
5731       }
5732     }
5733 
5734     private IOException handleException(List<KeyValueScanner> instantiatedScanners,
5735         Throwable t) {
5736       scannerReadPoints.remove(this);
5737       if (storeHeap != null) {
5738         storeHeap.close();
5739         storeHeap = null;
5740         if (joinedHeap != null) {
5741           joinedHeap.close();
5742           joinedHeap = null;
5743         }
5744       } else {
5745         for (KeyValueScanner scanner : instantiatedScanners) {
5746           scanner.close();
5747         }
5748       }
5749       return t instanceof IOException ? (IOException) t : new IOException(t);
5750     }
5751 
5752     @Override
5753     public long getMaxResultSize() {
5754       return maxResultSize;
5755     }
5756 
5757     @Override
5758     public long getMvccReadPoint() {
5759       return this.readPt;
5760     }
5761 
5762     @Override
5763     public int getBatch() {
5764       return this.defaultScannerContext.getBatchLimit();
5765     }
5766 
5767     /**
5768      * Reset both the filter and the old filter.
5769      *
5770      * @throws IOException in case a filter raises an I/O exception.
5771      */
5772     protected void resetFilters() throws IOException {
5773       if (filter != null) {
5774         filter.reset();
5775       }
5776     }
5777 
5778     @Override
5779     public boolean next(List<Cell> outResults)
5780         throws IOException {
5781       // apply the batching limit by default
5782       return next(outResults, defaultScannerContext);
5783     }
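
         // A minimal scan-loop sketch, assuming "scanner" is a RegionScanner for this region
         // (for example obtained via getScanner(scan); names are illustrative):
         //
         //   List<Cell> cells = new ArrayList<Cell>();
         //   boolean moreRows;
         //   do {
         //     cells.clear();
         //     moreRows = scanner.next(cells);   // batch limit from Scan#setBatch applies by default
         //     // process "cells" for the current row (may be empty if the row was filtered out)
         //   } while (moreRows);
         //   scanner.close();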
5784 
5785     @Override
5786     public synchronized boolean next(List<Cell> outResults, ScannerContext scannerContext) throws IOException {
5787       if (this.filterClosed) {
5788         throw new UnknownScannerException("Scanner was closed (timed out?) " +
5789             "after we renewed it. Could be caused by a very slow scanner " +
5790             "or a lengthy garbage collection");
5791       }
5792       startRegionOperation(Operation.SCAN);
5793       readRequestsCount.increment();
5794       try {
5795         return nextRaw(outResults, scannerContext);
5796       } finally {
5797         closeRegionOperation(Operation.SCAN);
5798       }
5799     }
5800 
5801     @Override
5802     public boolean nextRaw(List<Cell> outResults) throws IOException {
5803       // Use the RegionScanner's context by default
5804       return nextRaw(outResults, defaultScannerContext);
5805     }
5806 
5807     @Override
5808     public boolean nextRaw(List<Cell> outResults, ScannerContext scannerContext)
5809         throws IOException {
5810       if (storeHeap == null) {
5811         // scanner is closed
5812         throw new UnknownScannerException("Scanner was closed");
5813       }
5814       boolean moreValues;
5815       if (outResults.isEmpty()) {
5816         // Usually outResults is empty. This is true when next is called
5817         // to handle scan or get operation.
5818         moreValues = nextInternal(outResults, scannerContext);
5819       } else {
5820         List<Cell> tmpList = new ArrayList<Cell>();
5821         moreValues = nextInternal(tmpList, scannerContext);
5822         outResults.addAll(tmpList);
5823       }
5824 
5825       // If the size limit was reached it means a partial Result is being returned. Returning a
5826       // partial Result means that we should not reset the filters; filters should only be reset in
5827       // between rows
5828       if (!scannerContext.partialResultFormed()) resetFilters();
5829 
5830       if (isFilterDoneInternal()) {
5831         moreValues = false;
5832       }
5833       return moreValues;
5834     }
5835 
5836     /**
5837      * @return true if more cells exist after this batch, false if scanner is done
5838      */
5839     private boolean populateFromJoinedHeap(List<Cell> results, ScannerContext scannerContext)
5840             throws IOException {
5841       assert joinedContinuationRow != null;
5842       boolean moreValues =
5843           populateResult(results, this.joinedHeap, scannerContext,
5844           joinedContinuationRow.getRowArray(), joinedContinuationRow.getRowOffset(),
5845           joinedContinuationRow.getRowLength());
5846 
5847       if (!scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
5848         // We are done with this row, reset the continuation.
5849         joinedContinuationRow = null;
5850       }
5851       // As the data is obtained from two independent heaps, we need to
5852       // ensure that result list is sorted, because Result relies on that.
5853       Collections.sort(results, comparator);
5854       return moreValues;
5855     }
5856 
5857     /**
5858      * Fetches records with currentRow into the results list, until the next row, the batchLimit
5859      * (if not -1), or the remainingResultSize (if not -1) is reached
5860      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
5861      * @param scannerContext
5862      * @param currentRow Byte array with key we are fetching.
5863      * @param offset offset for currentRow
5864      * @param length length for currentRow
5865      * @return state of last call to {@link KeyValueHeap#next()}
5866      */
5867     private boolean populateResult(List<Cell> results, KeyValueHeap heap,
5868         ScannerContext scannerContext, byte[] currentRow, int offset, short length)
5869         throws IOException {
5870       Cell nextKv;
5871       boolean moreCellsInRow = false;
5872       boolean tmpKeepProgress = scannerContext.getKeepProgress();
5873       // Scanning between column families and thus the scope is between cells
5874       LimitScope limitScope = LimitScope.BETWEEN_CELLS;
5875       do {
5876         // We want to maintain any progress that is made towards the limits while scanning across
5877         // different column families. To do this, we toggle the keep progress flag on during calls
5878         // to the StoreScanner to ensure that any progress made thus far is not wiped away.
5879         scannerContext.setKeepProgress(true);
5880         heap.next(results, scannerContext);
5881         scannerContext.setKeepProgress(tmpKeepProgress);
5882 
5883         nextKv = heap.peek();
5884         moreCellsInRow = moreCellsInRow(nextKv, currentRow, offset, length);
5885 
5886         if (scannerContext.checkBatchLimit(limitScope)) {
5887           return scannerContext.setScannerState(NextState.BATCH_LIMIT_REACHED).hasMoreValues();
5888         } else if (scannerContext.checkSizeLimit(limitScope)) {
5889           ScannerContext.NextState state =
5890               moreCellsInRow ? NextState.SIZE_LIMIT_REACHED_MID_ROW : NextState.SIZE_LIMIT_REACHED;
5891           return scannerContext.setScannerState(state).hasMoreValues();
5892         } else if (scannerContext.checkTimeLimit(limitScope)) {
5893           ScannerContext.NextState state =
5894               moreCellsInRow ? NextState.TIME_LIMIT_REACHED_MID_ROW : NextState.TIME_LIMIT_REACHED;
5895           return scannerContext.setScannerState(state).hasMoreValues();
5896         }
5897       } while (moreCellsInRow);
5898 
5899       return nextKv != null;
5900     }
5901 
5902     /**
5903      * Based on the nextKv in the heap, and the current row, decide whether or not there are more
5904      * cells to be read in the heap. If the row of the nextKv in the heap matches the current row
5905      * then there are more cells to be read in the row.
5906      * @param nextKv
5907      * @param currentRow
5908      * @param offset
5909      * @param length
5910      * @return true When there are more cells in the row to be read
5911      */
5912     private boolean moreCellsInRow(final Cell nextKv, byte[] currentRow, int offset,
5913         short length) {
5914       return nextKv != null && CellUtil.matchingRow(nextKv, currentRow, offset, length);
5915     }
5916 
5917     /*
5918      * @return True if a filter rules the scanner is over, done.
5919      */
5920     @Override
5921     public synchronized boolean isFilterDone() throws IOException {
5922       return isFilterDoneInternal();
5923     }
5924 
5925     private boolean isFilterDoneInternal() throws IOException {
5926       return this.filter != null && this.filter.filterAllRemaining();
5927     }
5928 
5929     private boolean nextInternal(List<Cell> results, ScannerContext scannerContext)
5930         throws IOException {
5931       if (!results.isEmpty()) {
5932         throw new IllegalArgumentException("First parameter should be an empty list");
5933       }
5934       if (scannerContext == null) {
5935         throw new IllegalArgumentException("Scanner context cannot be null");
5936       }
5937       RpcCallContext rpcCall = RpcServer.getCurrentCall();
5938 
5939       // Save the initial progress from the Scanner context in these local variables. The progress
5940       // may need to be reset a few times if rows are being filtered out so we save the initial
5941       // progress.
5942       int initialBatchProgress = scannerContext.getBatchProgress();
5943       long initialSizeProgress = scannerContext.getSizeProgress();
5944       long initialTimeProgress = scannerContext.getTimeProgress();
5945 
5946       // The loop here is used only when, at some point during the call to next(), we determine
5947       // that due to effects of filters or otherwise, we have an empty row in the result.
5948       // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
5949       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
5950       // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow).
5951       while (true) {
5952         // Starting to scan a new row. Reset the scanner progress according to whether or not
5953         // progress should be kept.
5954         if (scannerContext.getKeepProgress()) {
5955           // Progress should be kept. Reset to initial values seen at start of method invocation.
5956           scannerContext
5957               .setProgress(initialBatchProgress, initialSizeProgress, initialTimeProgress);
5958         } else {
5959           scannerContext.clearProgress();
5960         }
5961 
5962         if (rpcCall != null) {
5963           // If a user specifies a too-restrictive or too-slow scanner, the
5964           // client might time out and disconnect while the server side
5965           // is still processing the request. We should abort aggressively
5966           // in that case.
5967           long afterTime = rpcCall.disconnectSince();
5968           if (afterTime >= 0) {
5969             throw new CallerDisconnectedException(
5970                 "Aborting on region " + getRegionInfo().getRegionNameAsString() + ", call " +
5971                     this + " after " + afterTime + " ms, since " +
5972                     "caller disconnected");
5973           }
5974         }
5975 
5976         // Let's see what we have in the storeHeap.
5977         Cell current = this.storeHeap.peek();
5978 
5979         byte[] currentRow = null;
5980         int offset = 0;
5981         short length = 0;
5982         if (current != null) {
5983           currentRow = current.getRowArray();
5984           offset = current.getRowOffset();
5985           length = current.getRowLength();
5986         }
5987 
5988         boolean stopRow = isStopRow(currentRow, offset, length);
5989         // When hasFilterRow is true, it means that all the cells for a particular row must be
5990         // read before a filtering decision can be made. This means that filters for which
5991         // hasFilterRow is true run the risk of encountering out-of-memory errors when they are
5992         // applied to a table that has very large rows.
5993         boolean hasFilterRow = this.filter != null && this.filter.hasFilterRow();
5994 
5995         // If filter#hasFilterRow is true, partial results are not allowed since allowing them
5996         // would prevent the filters from being evaluated. Thus, if it is true, change the
5997         // scope of any limits that could potentially create partial results to
5998         // LimitScope.BETWEEN_ROWS so that those limits are not reached mid-row
5999         if (hasFilterRow) {
6000           if (LOG.isTraceEnabled()) {
6001             LOG.trace("filter#hasFilterRow is true which prevents partial results from being "
6002                 + "formed. Changing scope of limits that may create partials");
6003           }
6004           scannerContext.setSizeLimitScope(LimitScope.BETWEEN_ROWS);
6005           scannerContext.setTimeLimitScope(LimitScope.BETWEEN_ROWS);
6006         }
6007 
6008         // Check if we were getting data from the joinedHeap and hit the limit.
6009         // If not, then it's main path - getting results from storeHeap.
6010         if (joinedContinuationRow == null) {
6011           // First, check if we are at a stop row. If so, there are no more results.
6012           if (stopRow) {
6013             if (hasFilterRow) {
6014               filter.filterRowCells(results);
6015             }
6016             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6017           }
6018 
6019           // Check if rowkey filter wants to exclude this row. If so, loop to next.
6020           // Technically, if we hit limits before on this row, we don't need this call.
6021           if (filterRowKey(currentRow, offset, length)) {
6022             boolean moreRows = nextRow(currentRow, offset, length);
6023             if (!moreRows) {
6024               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6025             }
6026             results.clear();
6027             continue;
6028           }
6029 
6030           // Ok, we are good, let's try to get some results from the main heap.
6031           populateResult(results, this.storeHeap, scannerContext, currentRow, offset, length);
6032 
6033           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6034             if (hasFilterRow) {
6035               throw new IncompatibleFilterException(
6036                   "Filter whose hasFilterRow() returns true is incompatible with scans that must "
6037                       + "stop mid-row because of a limit. ScannerContext:" + scannerContext);
6038             }
6039             return true;
6040           }
6041 
6042           Cell nextKv = this.storeHeap.peek();
6043           stopRow = nextKv == null ||
6044               isStopRow(nextKv.getRowArray(), nextKv.getRowOffset(), nextKv.getRowLength());
6045           // save whether the row was empty before filters were applied to it.
6046           final boolean isEmptyRow = results.isEmpty();
6047 
6048           // We have the part of the row necessary for filtering (all of it, usually).
6049           // First filter with the filterRow(List).
6050           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
6051           if (hasFilterRow) {
6052             ret = filter.filterRowCellsWithRet(results);
6053 
6054             // We don't know how the results have changed after being filtered. Must set progress
6055             // according to contents of results now. However, a change in the results should not
6056             // affect the time progress. Thus preserve whatever time progress has been made
6057             long timeProgress = scannerContext.getTimeProgress();
6058             if (scannerContext.getKeepProgress()) {
6059               scannerContext.setProgress(initialBatchProgress, initialSizeProgress,
6060                 initialTimeProgress);
6061             } else {
6062               scannerContext.clearProgress();
6063             }
6064             scannerContext.setTimeProgress(timeProgress);
6065             scannerContext.incrementBatchProgress(results.size());
6066             for (Cell cell : results) {
6067               scannerContext.incrementSizeProgress(CellUtil.estimatedHeapSizeOfWithoutTags(cell));
6068             }
6069           }
6070 
6071           if ((isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE) || filterRow()) {
6072             results.clear();
6073             boolean moreRows = nextRow(currentRow, offset, length);
6074             if (!moreRows) {
6075               return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6076             }
6077 
6078             // This row was totally filtered out; if this is NOT the last row,
6079             // we should continue on. Otherwise, there is nothing else to do.
6080             if (!stopRow) continue;
6081             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6082           }
6083 
6084           // Ok, we are done with storeHeap for this row.
6085           // Now we may need to fetch additional, non-essential data into row.
6086           // These values are not needed for filter to work, so we postpone their
6087           // fetch to (possibly) reduce amount of data loads from disk.
6088           if (this.joinedHeap != null) {
6089             boolean mayHaveData = joinedHeapMayHaveData(currentRow, offset, length);
6090             if (mayHaveData) {
6091               joinedContinuationRow = current;
6092               populateFromJoinedHeap(results, scannerContext);
6093 
6094               if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6095                 return true;
6096               }
6097             }
6098           }
6099         } else {
6100           // Populating from the joined heap was stopped by limits, populate some more.
6101           populateFromJoinedHeap(results, scannerContext);
6102           if (scannerContext.checkAnyLimitReached(LimitScope.BETWEEN_CELLS)) {
6103             return true;
6104           }
6105         }
6106         // We may have just called populateFromJoinedHeap and hit the limits. If that is
6107         // the case, we need to call it again on the next next() invocation.
6108         if (joinedContinuationRow != null) {
6109           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6110         }
6111 
6112         // Finally, we are done with both joinedHeap and storeHeap.
6113         // Double check to prevent empty rows from appearing in result. It could be
6114         // the case when SingleColumnValueExcludeFilter is used.
6115         if (results.isEmpty()) {
6116           boolean moreRows = nextRow(currentRow, offset, length);
6117           if (!moreRows) {
6118             return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6119           }
6120           if (!stopRow) continue;
6121         }
6122 
6123         // We are done. Return the result.
6124         if (stopRow) {
6125           return scannerContext.setScannerState(NextState.NO_MORE_VALUES).hasMoreValues();
6126         } else {
6127           return scannerContext.setScannerState(NextState.MORE_VALUES).hasMoreValues();
6128         }
6129       }
6130     }
6131 
6132     /**
6133      * @param currentRow
6134      * @param offset
6135      * @param length
6136      * @return true when the joined heap may have data for the current row
6137      * @throws IOException
6138      */
6139     private boolean joinedHeapMayHaveData(byte[] currentRow, int offset, short length)
6140         throws IOException {
6141       Cell nextJoinedKv = joinedHeap.peek();
6142       boolean matchCurrentRow =
6143           nextJoinedKv != null && CellUtil.matchingRow(nextJoinedKv, currentRow, offset, length);
6144       boolean matchAfterSeek = false;
6145 
6146       // If the next value in the joined heap does not match the current row, try to seek to the
6147       // correct row
6148       if (!matchCurrentRow) {
6149         Cell firstOnCurrentRow = KeyValueUtil.createFirstOnRow(currentRow, offset, length);
6150         boolean seekSuccessful = this.joinedHeap.requestSeek(firstOnCurrentRow, true, true);
6151         matchAfterSeek =
6152             seekSuccessful && joinedHeap.peek() != null
6153                 && CellUtil.matchingRow(joinedHeap.peek(), currentRow, offset, length);
6154       }
6155 
6156       return matchCurrentRow || matchAfterSeek;
6157     }
6158 
6159     /**
6160      * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
6161      * both the filterRow and filterRow(List<KeyValue> kvs) functions. Code written for 0.94 or
6162      * older may not implement hasFilterRow as HBASE-6429 expects, because in 0.94 hasFilterRow()
6163      * only returns true when filterRow(List<KeyValue> kvs) is overridden, not filterRow().
6164      * Without this check, filterRow() would be skipped.
6165      */
6166     private boolean filterRow() throws IOException {
6167       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
6168       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
6169       return filter != null && (!filter.hasFilterRow())
6170           && filter.filterRow();
6171     }
6172 
6173     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
6174       return filter != null
6175           && filter.filterRowKey(row, offset, length);
6176     }
6177 
6178     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
6179       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
6180       Cell next;
6181       while ((next = this.storeHeap.peek()) != null &&
6182              CellUtil.matchingRow(next, currentRow, offset, length)) {
6183         this.storeHeap.next(MOCKED_LIST);
6184       }
6185       resetFilters();
6186       // Calling the hook in CP which allows it to do a fast forward
6187       return this.region.getCoprocessorHost() == null
6188           || this.region.getCoprocessorHost()
6189               .postScannerFilterRow(this, currentRow, offset, length);
6190     }
6191 
6192     protected boolean isStopRow(byte[] currentRow, int offset, short length) {
6193       return currentRow == null ||
6194           (stopRow != null &&
6195           comparator.compareRows(stopRow, 0, stopRow.length,
6196             currentRow, offset, length) <= isScan);
6197     }
6198 
6199     @Override
6200     public synchronized void close() {
6201       if (storeHeap != null) {
6202         storeHeap.close();
6203         storeHeap = null;
6204       }
6205       if (joinedHeap != null) {
6206         joinedHeap.close();
6207         joinedHeap = null;
6208       }
6209       // no need to synchronize here.
6210       scannerReadPoints.remove(this);
6211       this.filterClosed = true;
6212     }
6213 
6214     KeyValueHeap getStoreHeapForTesting() {
6215       return storeHeap;
6216     }
6217 
6218     @Override
6219     public synchronized boolean reseek(byte[] row) throws IOException {
6220       if (row == null) {
6221         throw new IllegalArgumentException("Row cannot be null.");
6222       }
6223       boolean result = false;
6224       startRegionOperation();
6225       try {
6226         KeyValue kv = KeyValueUtil.createFirstOnRow(row);
6227         // use request seek to make use of the lazy seek option. See HBASE-5520
6228         result = this.storeHeap.requestSeek(kv, true, true);
6229         if (this.joinedHeap != null) {
6230           result = this.joinedHeap.requestSeek(kv, true, true) || result;
6231         }
6232       } finally {
6233         closeRegionOperation();
6234       }
6235       return result;
6236     }
6237   }
6238 
6239   // Utility methods
6240   /**
6241    * A utility method to create new instances of HRegion based on the
6242    * {@link HConstants#REGION_IMPL} configuration property.
6243    * @param tableDir qualified path of directory where region should be located,
6244    * usually the table directory.
6245    * @param wal The WAL is the outbound log for any updates to the HRegion.
6246    * The wal file is a logfile from the previous execution that's
6247    * custom-computed for this HRegion. The HRegionServer computes and sorts the
6248    * appropriate wal info for this HRegion. If there is a previous file
6249    * (implying that the HRegion has been written-to before), then read it from
6250    * the supplied path.
6251    * @param fs is the filesystem.
6252    * @param conf is global configuration settings.
6253    * @param regionInfo HRegionInfo that describes the region
6255    * @param htd the table descriptor
6256    * @return the new instance
6257    */
6258   static HRegion newHRegion(Path tableDir, WAL wal, FileSystem fs,
6259       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
6260       RegionServerServices rsServices) {
6261     try {
6262       @SuppressWarnings("unchecked")
6263       Class<? extends HRegion> regionClass =
6264           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
6265 
6266       Constructor<? extends HRegion> c =
6267           regionClass.getConstructor(Path.class, WAL.class, FileSystem.class,
6268               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
6269               RegionServerServices.class);
6270 
6271       return c.newInstance(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
6272     } catch (Throwable e) {
6273       // todo: what should I throw here?
6274       throw new IllegalStateException("Could not instantiate a region instance.", e);
6275     }
6276   }
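
  /*
   * A small usage sketch (an assumption for illustration, not part of the original
   * source): the REGION_IMPL property lets a deployment substitute its own HRegion
   * subclass, provided it declares the same seven-argument constructor that
   * newHRegion looks up reflectively.  MyCustomRegion is a hypothetical class name.
   *
   *   conf.setClass(HConstants.REGION_IMPL, MyCustomRegion.class, HRegion.class);
   *   HRegion region = HRegion.newHRegion(tableDir, wal, fs, conf, regionInfo, htd, rsServices);
   */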
6277 
6278   /**
6279    * Convenience method creating new HRegions. Used by createTable and by the
6280    * bootstrap code in the HMaster constructor.
6281    * Note, this method creates an {@link WAL} for the created region. It
6282    * needs to be closed explicitly.  Use {@link HRegion#getWAL()} to get
6283    * access.  <b>When done with a region created using this method, you will
6284    * need to explicitly close the {@link WAL} it created too; it will not be
6285    * done for you.  Not closing the wal will leave at least a daemon thread
6286    * running.</b>  Call {@link #closeHRegion(HRegion)} and it will do
6287    * necessary cleanup for you.
6288    * @param info Info for region to create.
6289    * @param rootDir Root directory for HBase instance
6290    * @return new HRegion
6291    *
6292    * @throws IOException
6293    */
6294   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6295       final Configuration conf, final HTableDescriptor hTableDescriptor)
6296   throws IOException {
6297     return createHRegion(info, rootDir, conf, hTableDescriptor, null);
6298   }
6299 
6300   /**
6301    * This will do the necessary cleanup a call to
6302    * {@link #createHRegion(HRegionInfo, Path, Configuration, HTableDescriptor)}
6303    * requires.  This method will close the region and then close its
6304    * associated {@link WAL} file.  You can still use it after calling the other
6305    * createHRegion variant, the one that takes a {@link WAL} instance, but be
6306    * aware that it will then call {@link WAL#close()} on the {@link WAL} the
6307    * HRegion was carrying.
6308    * @throws IOException
6309    */
6310   public static void closeHRegion(final HRegion r) throws IOException {
6311     if (r == null) return;
6312     r.close();
6313     if (r.getWAL() == null) return;
6314     r.getWAL().close();
6315   }
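
  /**
   * A minimal illustrative sketch, not part of the original API: it shows the
   * create/close contract documented above.  createHRegion makes its own WAL,
   * and closeHRegion closes both the region and that WAL.  The method name and
   * body here are hypothetical.
   */
  private static void exampleCreateThenClose(final HRegionInfo hri, final Path rootDir,
      final Configuration conf, final HTableDescriptor htd) throws IOException {
    HRegion region = createHRegion(hri, rootDir, conf, htd);
    try {
      // ... populate or verify the freshly created region here ...
    } finally {
      // Closes the region and the WAL createHRegion made for it; skipping this
      // leaves at least one daemon thread running.
      closeHRegion(region);
    }
  }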
6316 
6317   /**
6318    * Convenience method creating new HRegions. Used by createTable.
6319    * The {@link WAL} for the created region needs to be closed explicitly.
6320    * Use {@link HRegion#getWAL()} to get access.
6321    *
6322    * @param info Info for region to create.
6323    * @param rootDir Root directory for HBase instance
6324    * @param wal shared WAL
6325    * @param initialize - true to initialize the region
6326    * @return new HRegion
6327    *
6328    * @throws IOException
6329    */
6330   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6331                                       final Configuration conf,
6332                                       final HTableDescriptor hTableDescriptor,
6333                                       final WAL wal,
6334                                       final boolean initialize)
6335       throws IOException {
6336     return createHRegion(info, rootDir, conf, hTableDescriptor,
6337         wal, initialize, false);
6338   }
6339 
6340   /**
6341    * Convenience method creating new HRegions. Used by createTable.
6342    * The {@link WAL} for the created region needs to be closed
6343    * explicitly, if it is not null.
6344    * Use {@link HRegion#getWAL()} to get access.
6345    *
6346    * @param info Info for region to create.
6347    * @param rootDir Root directory for HBase instance
6348    * @param wal shared WAL
6349    * @param initialize - true to initialize the region
6350    * @param ignoreWAL - true to skip creating a new WAL when <code>wal</code> is null; mostly for createTable
6351    * @return new HRegion
6352    * @throws IOException
6353    */
6354   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6355                                       final Configuration conf,
6356                                       final HTableDescriptor hTableDescriptor,
6357                                       final WAL wal,
6358                                       final boolean initialize, final boolean ignoreWAL)
6359       throws IOException {
6360       Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6361       return createHRegion(info, rootDir, tableDir, conf, hTableDescriptor, wal, initialize,
6362           ignoreWAL);
6363   }
6364 
6365   /**
6366    * Convenience method creating new HRegions. Used by createTable.
6367    * The {@link WAL} for the created region needs to be closed
6368    * explicitly, if it is not null.
6369    * Use {@link HRegion#getWAL()} to get access.
6370    *
6371    * @param info Info for region to create.
6372    * @param rootDir Root directory for HBase instance
6373    * @param tableDir table directory
6374    * @param wal shared WAL
6375    * @param initialize - true to initialize the region
6376    * @param ignoreWAL - true to skip creating a new WAL when <code>wal</code> is null; mostly for createTable
6377    * @return new HRegion
6378    * @throws IOException
6379    */
6380   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir, final Path tableDir,
6381                                       final Configuration conf,
6382                                       final HTableDescriptor hTableDescriptor,
6383                                       final WAL wal,
6384                                       final boolean initialize, final boolean ignoreWAL)
6385       throws IOException {
6386     LOG.info("creating HRegion " + info.getTable().getNameAsString()
6387         + " HTD == " + hTableDescriptor
6388         + " RootDir = " + rootDir);
6389     FileSystem fs = FileSystem.get(conf);
6390     HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
6391     WAL effectiveWAL = wal;
6392     if (wal == null && !ignoreWAL) {
6393       // TODO HBASE-11983 There'll be no roller for this wal?
6394       // The WAL subsystem will use the default rootDir rather than the passed in rootDir
6395       // unless I pass along via the conf.
6396       Configuration confForWAL = new Configuration(conf);
6397       FSUtils.setRootDir(confForWAL, rootDir);
6398       effectiveWAL = (new WALFactory(confForWAL,
6399           Collections.<WALActionsListener>singletonList(new MetricsWAL()),
6400           "hregion-" + RandomStringUtils.randomNumeric(8))).
6401             getWAL(info.getEncodedNameAsBytes());
6402     }
6403     HRegion region = HRegion.newHRegion(tableDir,
6404         effectiveWAL, fs, conf, info, hTableDescriptor, null);
6405     if (initialize) {
6406       // If initializing, set the sequenceId. It is also required by WALPerformanceEvaluation when
6407       // verifying the WALEdits.
6408       region.setSequenceId(region.initialize(null));
6409     }
6410     return region;
6411   }
6412 
6413   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
6414                                       final Configuration conf,
6415                                       final HTableDescriptor hTableDescriptor,
6416                                       final WAL wal)
6417     throws IOException {
6418     return createHRegion(info, rootDir, conf, hTableDescriptor, wal, true);
6419   }
6420 
6421 
6422   /**
6423    * Open a Region.
6424    * @param info Info for region to be opened.
6425    * @param wal WAL for region to use. This method will call
6426    * WAL#setSequenceNumber(long) passing the result of the call to
6427    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6428    * up.  HRegionServer does this every time it opens a new region.
6429    * @return new HRegion
6430    *
6431    * @throws IOException
6432    */
6433   public static HRegion openHRegion(final HRegionInfo info,
6434       final HTableDescriptor htd, final WAL wal,
6435       final Configuration conf)
6436   throws IOException {
6437     return openHRegion(info, htd, wal, conf, null, null);
6438   }
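
  /**
   * A minimal illustrative sketch, not part of the original API: open an existing
   * region against a shared WAL and close the region when finished.  The shared
   * WAL belongs to the caller and is deliberately not closed here.  The method
   * name and body are hypothetical.
   */
  private static void exampleOpenThenClose(final HRegionInfo hri, final HTableDescriptor htd,
      final WAL sharedWal, final Configuration conf) throws IOException {
    HRegion region = openHRegion(hri, htd, sharedWal, conf);
    try {
      // ... serve reads and writes against the opened region ...
    } finally {
      region.close(); // closes the region only; the shared WAL stays open
    }
  }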
6439 
6440   /**
6441    * Open a Region.
6442    * @param info Info for region to be opened
6443    * @param htd the table descriptor
6444    * @param wal WAL for region to use. This method will call
6445    * WAL#setSequenceNumber(long) passing the result of the call to
6446    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6447    * up.  HRegionServer does this every time it opens a new region.
6448    * @param conf The Configuration object to use.
6449    * @param rsServices An interface we can request flushes against.
6450    * @param reporter An interface we can report progress against.
6451    * @return new HRegion
6452    *
6453    * @throws IOException
6454    */
6455   public static HRegion openHRegion(final HRegionInfo info,
6456     final HTableDescriptor htd, final WAL wal, final Configuration conf,
6457     final RegionServerServices rsServices,
6458     final CancelableProgressable reporter)
6459   throws IOException {
6460     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
6461   }
6462 
6463   /**
6464    * Open a Region.
6465    * @param rootDir Root directory for HBase instance
6466    * @param info Info for region to be opened.
6467    * @param htd the table descriptor
6468    * @param wal WAL for region to use. This method will call
6469    * WAL#setSequenceNumber(long) passing the result of the call to
6470    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6471    * up.  HRegionServer does this every time it opens a new region.
6472    * @param conf The Configuration object to use.
6473    * @return new HRegion
6474    * @throws IOException
6475    */
6476   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
6477       final HTableDescriptor htd, final WAL wal, final Configuration conf)
6478   throws IOException {
6479     return openHRegion(rootDir, info, htd, wal, conf, null, null);
6480   }
6481 
6482   /**
6483    * Open a Region.
6484    * @param rootDir Root directory for HBase instance
6485    * @param info Info for region to be opened.
6486    * @param htd the table descriptor
6487    * @param wal WAL for region to use. This method will call
6488    * WAL#setSequenceNumber(long) passing the result of the call to
6489    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6490    * up.  HRegionServer does this every time it opens a new region.
6491    * @param conf The Configuration object to use.
6492    * @param rsServices An interface we can request flushes against.
6493    * @param reporter An interface we can report progress against.
6494    * @return new HRegion
6495    * @throws IOException
6496    */
6497   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
6498       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6499       final RegionServerServices rsServices,
6500       final CancelableProgressable reporter)
6501   throws IOException {
6502     FileSystem fs = null;
6503     if (rsServices != null) {
6504       fs = rsServices.getFileSystem();
6505     }
6506     if (fs == null) {
6507       fs = FileSystem.get(conf);
6508     }
6509     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
6510   }
6511 
6512   /**
6513    * Open a Region.
6514    * @param conf The Configuration object to use.
6515    * @param fs Filesystem to use
6516    * @param rootDir Root directory for HBase instance
6517    * @param info Info for region to be opened.
6518    * @param htd the table descriptor
6519    * @param wal WAL for region to use. This method will call
6520    * WAL#setSequenceNumber(long) passing the result of the call to
6521    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6522    * up.  HRegionServer does this every time it opens a new region.
6523    * @return new HRegion
6524    * @throws IOException
6525    */
6526   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6527       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal)
6528       throws IOException {
6529     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
6530   }
6531 
6532   /**
6533    * Open a Region.
6534    * @param conf The Configuration object to use.
6535    * @param fs Filesystem to use
6536    * @param rootDir Root directory for HBase instance
6537    * @param info Info for region to be opened.
6538    * @param htd the table descriptor
6539    * @param wal WAL for region to use. This method will call
6540    * WAL#setSequenceNumber(long) passing the result of the call to
6541    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6542    * up.  HRegionServer does this every time it opens a new region.
6543    * @param rsServices An interface we can request flushes against.
6544    * @param reporter An interface we can report progress against.
6545    * @return new HRegion
6546    * @throws IOException
6547    */
6548   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6549       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final WAL wal,
6550       final RegionServerServices rsServices, final CancelableProgressable reporter)
6551       throws IOException {
6552     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6553     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
6554   }
6555 
6556   /**
6557    * Open a Region.
6558    * @param conf The Configuration object to use.
6559    * @param fs Filesystem to use
6560    * @param rootDir Root directory for HBase instance
6561    * @param info Info for region to be opened.
6562    * @param htd the table descriptor
6563    * @param wal WAL for region to use. This method will call
6564    * WAL#setSequenceNumber(long) passing the result of the call to
6565    * HRegion#getMinSequenceId() to ensure the wal id is properly kept
6566    * up.  HRegionServer does this every time it opens a new region.
6567    * @param rsServices An interface we can request flushes against.
6568    * @param reporter An interface we can report progress against.
6569    * @return new HRegion
6570    * @throws IOException
6571    */
6572   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
6573       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd,
6574       final WAL wal, final RegionServerServices rsServices,
6575       final CancelableProgressable reporter)
6576       throws IOException {
6577     if (info == null) throw new NullPointerException("Passed region info is null");
6578     if (LOG.isDebugEnabled()) {
6579       LOG.debug("Opening region: " + info);
6580     }
6581     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6582     return r.openHRegion(reporter);
6583   }
6584 
6585 
6586   /**
6587    * Useful when reopening a closed region (normally for unit tests)
6588    * @param other original object
6589    * @param reporter An interface we can report progress against.
6590    * @return new HRegion
6591    * @throws IOException
6592    */
6593   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
6594       throws IOException {
6595     HRegionFileSystem regionFs = other.getRegionFileSystem();
6596     HRegion r = newHRegion(regionFs.getTableDir(), other.getWAL(), regionFs.getFileSystem(),
6597         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
6598     return r.openHRegion(reporter);
6599   }
6600 
6601   public static Region openHRegion(final Region other, final CancelableProgressable reporter)
6602         throws IOException {
6603     return openHRegion((HRegion)other, reporter);
6604   }
6605 
6606   /**
6607    * Open HRegion.
6608    * Calls initialize and sets sequenceId.
6609    * @return Returns <code>this</code>
6610    * @throws IOException
6611    */
6612   protected HRegion openHRegion(final CancelableProgressable reporter)
6613   throws IOException {
6614     // Refuse to open the region if we are missing local compression support
6615     checkCompressionCodecs();
6616     // Refuse to open the region if encryption configuration is incorrect or
6617     // codec support is missing
6618     checkEncryption();
6619     // Refuse to open the region if a required class cannot be loaded
6620     checkClassLoading();
6621     this.openSeqNum = initialize(reporter);
6622     this.setSequenceId(openSeqNum);
6623     if (wal != null && getRegionServerServices() != null && !writestate.readOnly
6624         && !isRecovering) {
6625       // Only write the region open event marker to WAL if (1) we are not read-only
6626       // (2) dist log replay is off or we are not recovering. In case region is
6627       // recovering, the open event will be written at setRecovering(false)
6628       writeRegionOpenMarker(wal, openSeqNum);
6629     }
6630     return this;
6631   }
6632 
6633   public static void warmupHRegion(final HRegionInfo info,
6634       final HTableDescriptor htd, final WAL wal, final Configuration conf,
6635       final RegionServerServices rsServices,
6636       final CancelableProgressable reporter)
6637       throws IOException {
6638 
6639     if (info == null) throw new NullPointerException("Passed region info is null");
6640 
6641     if (LOG.isDebugEnabled()) {
6642       LOG.debug("Warming up region: " + info);
6643     }
6644 
6645     Path rootDir = FSUtils.getRootDir(conf);
6646     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
6647 
6648     FileSystem fs = null;
6649     if (rsServices != null) {
6650       fs = rsServices.getFileSystem();
6651     }
6652     if (fs == null) {
6653       fs = FileSystem.get(conf);
6654     }
6655 
6656     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
6657     r.initializeWarmup(reporter);
6658     r.close();
6659   }
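
  /*
   * A one-line usage sketch (an assumption, not from the original source): a region
   * server that is about to be assigned a region can warm it up ahead of time; the
   * region is initialized just enough to prime store metadata and is closed again
   * without ever being taken online.
   *
   *   HRegion.warmupHRegion(hri, htd, sharedWal, conf, rsServices, null);
   */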
6660 
6661 
6662   private void checkCompressionCodecs() throws IOException {
6663     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6664       CompressionTest.testCompression(fam.getCompression());
6665       CompressionTest.testCompression(fam.getCompactionCompression());
6666     }
6667   }
6668 
6669   private void checkEncryption() throws IOException {
6670     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
6671       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
6672     }
6673   }
6674 
6675   private void checkClassLoading() throws IOException {
6676     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
6677     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
6678   }
6679 
6680   /**
6681    * Create a daughter region from given a temp directory with the region data.
6682    * @param hri Spec. for daughter region to open.
6683    * @throws IOException
6684    */
6685   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
6686     // Move the files from the temporary .splits to the final /table/region directory
6687     fs.commitDaughterRegion(hri);
6688 
6689     // Create the daughter HRegion instance
6690     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(), fs.getFileSystem(),
6691         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
6692     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
6693     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
6694     return r;
6695   }
6696 
6697   /**
6698    * Create a merged region given a temp directory with the region data.
6699    * @param region_b another merging region
6700    * @return merged HRegion
6701    * @throws IOException
6702    */
6703   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
6704       final HRegion region_b) throws IOException {
6705     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getWAL(),
6706         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
6707         this.getTableDesc(), this.rsServices);
6708     r.readRequestsCount.set(this.getReadRequestsCount()
6709         + region_b.getReadRequestsCount());
6710     r.writeRequestsCount.set(this.getWriteRequestsCount()
6711         + region_b.getWriteRequestsCount());
6712 
6713     this.fs.commitMergedRegion(mergedRegionInfo);
6714     return r;
6715   }
6716 
6717   /**
6718    * Inserts a new region's meta information into the passed
6719    * <code>meta</code> region. Used by the HMaster bootstrap code adding
6720    * new table to hbase:meta table.
6721    *
6722    * @param meta hbase:meta HRegion to be updated
6723    * @param r HRegion to add to <code>meta</code>
6724    *
6725    * @throws IOException
6726    */
6727   // TODO remove since only test and merge use this
6728   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
6729     meta.checkResources();
6730     // The row key is the region name
6731     byte[] row = r.getRegionInfo().getRegionName();
6732     final long now = EnvironmentEdgeManager.currentTime();
6733     final List<Cell> cells = new ArrayList<Cell>(2);
6734     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6735       HConstants.REGIONINFO_QUALIFIER, now,
6736       r.getRegionInfo().toByteArray()));
6737     // Set into the root table the version of the meta table.
6738     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
6739       HConstants.META_VERSION_QUALIFIER, now,
6740       Bytes.toBytes(HConstants.META_VERSION)));
6741     meta.put(row, HConstants.CATALOG_FAMILY, cells);
6742   }
6743 
6744   /**
6745    * Computes the Path of the HRegion
6746    *
6747    * @param tabledir qualified path for table
6748    * @param name ENCODED region name
6749    * @return Path of HRegion directory
6750    */
6751   @Deprecated
6752   public static Path getRegionDir(final Path tabledir, final String name) {
6753     return new Path(tabledir, name);
6754   }
6755 
6756   /**
6757    * Computes the Path of the HRegion
6758    *
6759    * @param rootdir qualified path of HBase root directory
6760    * @param info HRegionInfo for the region
6761    * @return qualified path of region directory
6762    */
6763   @Deprecated
6764   @VisibleForTesting
6765   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
6766     return new Path(
6767       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
6768   }
6769 
6770   /**
6771    * Determines if the specified row is within the row range specified by the
6772    * specified HRegionInfo
6773    *
6774    * @param info HRegionInfo that specifies the row range
6775    * @param row row to be checked
6776    * @return true if the row is within the range specified by the HRegionInfo
6777    */
6778   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
6779     return ((info.getStartKey().length == 0) ||
6780         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
6781         ((info.getEndKey().length == 0) ||
6782             (Bytes.compareTo(info.getEndKey(), row) > 0));
6783   }
6784 
6785   public static boolean rowIsInRange(HRegionInfo info, final byte [] row, final int offset,
6786       final short length) {
6787     return ((info.getStartKey().length == 0) ||
6788         (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
6789           row, offset, length) <= 0)) &&
6790         ((info.getEndKey().length == 0) ||
6791           (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
6792   }
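
  /*
   * An illustrative sketch (an assumption, not from the original source) of the
   * range semantics above: the start key is inclusive, the end key is exclusive,
   * and an empty key means the range is unbounded on that side.
   *
   *   HRegionInfo hri = new HRegionInfo(TableName.valueOf("t"),
   *       Bytes.toBytes("b"), Bytes.toBytes("d"));
   *   rowIsInRange(hri, Bytes.toBytes("b"));   // true  -- start key is inclusive
   *   rowIsInRange(hri, Bytes.toBytes("c"));   // true
   *   rowIsInRange(hri, Bytes.toBytes("d"));   // false -- end key is exclusive
   */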
6793 
6794   /**
6795    * Merge two HRegions.  The regions must be adjacent and must not overlap.
6796    *
6797    * @return new merged HRegion
6798    * @throws IOException
6799    */
6800   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
6801   throws IOException {
6802     HRegion a = srcA;
6803     HRegion b = srcB;
6804 
6805     // Make sure that srcA comes first; important for key-ordering during
6806     // write of the merged file.
6807     if (srcA.getRegionInfo().getStartKey() == null) {
6808       if (srcB.getRegionInfo().getStartKey() == null) {
6809         throw new IOException("Cannot merge two regions with null start key");
6810       }
6811       // A's start key is null but B's isn't. Assume A comes before B
6812     } else if ((srcB.getRegionInfo().getStartKey() == null) ||
6813       (Bytes.compareTo(srcA.getRegionInfo().getStartKey(),
6814         srcB.getRegionInfo().getStartKey()) > 0)) {
6815       a = srcB;
6816       b = srcA;
6817     }
6818 
6819     if (!(Bytes.compareTo(a.getRegionInfo().getEndKey(),
6820         b.getRegionInfo().getStartKey()) == 0)) {
6821       throw new IOException("Cannot merge non-adjacent regions");
6822     }
6823     return merge(a, b);
6824   }
6825 
6826   /**
6827    * Merge two regions whether they are adjacent or not.
6828    *
6829    * @param a region a
6830    * @param b region b
6831    * @return new merged region
6832    * @throws IOException
6833    */
6834   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
6835     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
6836       throw new IOException("Regions do not belong to the same table");
6837     }
6838 
6839     FileSystem fs = a.getRegionFileSystem().getFileSystem();
6840     // Make sure each region's cache is empty
6841     a.flush(true);
6842     b.flush(true);
6843 
6844     // Compact each region so we only have one store file per family
6845     a.compact(true);
6846     if (LOG.isDebugEnabled()) {
6847       LOG.debug("Files for region: " + a);
6848       a.getRegionFileSystem().logFileSystemState(LOG);
6849     }
6850     b.compact(true);
6851     if (LOG.isDebugEnabled()) {
6852       LOG.debug("Files for region: " + b);
6853       b.getRegionFileSystem().logFileSystemState(LOG);
6854     }
6855 
6856     RegionMergeTransactionImpl rmt = new RegionMergeTransactionImpl(a, b, true);
6857     if (!rmt.prepare(null)) {
6858       throw new IOException("Unable to merge regions " + a + " and " + b);
6859     }
6860     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
6861     LOG.info("starting merge of regions: " + a + " and " + b
6862         + " into new region " + mergedRegionInfo.getRegionNameAsString()
6863         + " with start key <"
6864         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
6865         + "> and end key <"
6866         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
6867     HRegion dstRegion;
6868     try {
6869       dstRegion = (HRegion)rmt.execute(null, null);
6870     } catch (IOException ioe) {
6871       rmt.rollback(null, null);
6872       throw new IOException("Failed merging region " + a + " and " + b
6873           + "; the merge was rolled back", ioe);
6874     }
6875     dstRegion.compact(true);
6876 
6877     if (LOG.isDebugEnabled()) {
6878       LOG.debug("Files for new region");
6879       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
6880     }
6881 
6882     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
6883       throw new IOException("Merged region " + dstRegion
6884           + " still has references after the compaction, is compaction canceled?");
6885     }
6886 
6887     // Archiving the 'A' region
6888     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
6889     // Archiving the 'B' region
6890     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
6891 
6892     LOG.info("merge completed. New region is " + dstRegion);
6893     return dstRegion;
6894   }
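
  /*
   * A minimal usage sketch (an assumption, not from the original source): both
   * regions must belong to the same table and be open in this JVM.  merge()
   * flushes and compacts each region, builds the merged region, and archives
   * the two originals; mergeAdjacent() additionally insists the key ranges abut.
   *
   *   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);
   */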
6895 
6896   @Override
6897   public Result get(final Get get) throws IOException {
6898     checkRow(get.getRow(), "Get");
6899     // Verify families are all valid
6900     if (get.hasFamilies()) {
6901       for (byte [] family: get.familySet()) {
6902         checkFamily(family);
6903       }
6904     } else { // Adding all families to scanner
6905       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
6906         get.addFamily(family);
6907       }
6908     }
6909     List<Cell> results = get(get, true);
6910     boolean stale = this.getRegionInfo().getReplicaId() != 0;
6911     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null, stale);
6912   }
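
  /**
   * A small illustrative sketch, not part of the original API: a direct,
   * server-side read through the method above.  The family and qualifier
   * names are hypothetical placeholders.
   */
  private Result exampleGet(final byte[] row) throws IOException {
    Get get = new Get(row);
    get.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"));
    // Takes the same path as above: family checks, a one-row scan, and CP hooks.
    return get(get);
  }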
6913 
6914   @Override
6915   public List<Cell> get(Get get, boolean withCoprocessor) throws IOException {
6916 
6917     List<Cell> results = new ArrayList<Cell>();
6918 
6919     // pre-get CP hook
6920     if (withCoprocessor && (coprocessorHost != null)) {
6921        if (coprocessorHost.preGet(get, results)) {
6922          return results;
6923        }
6924     }
6925     long before =  EnvironmentEdgeManager.currentTime();
6926     Scan scan = new Scan(get);
6927 
6928     RegionScanner scanner = null;
6929     try {
6930       scanner = getScanner(scan);
6931       scanner.next(results);
6932     } finally {
6933       if (scanner != null)
6934         scanner.close();
6935     }
6936 
6937     // post-get CP hook
6938     if (withCoprocessor && (coprocessorHost != null)) {
6939       coprocessorHost.postGet(get, results);
6940     }
6941 
6942     metricsUpdateForGet(results, before);
6943 
6944     return results;
6945   }
6946 
6947   void metricsUpdateForGet(List<Cell> results, long before) {
6948     if (this.metricsRegion != null) {
6949       this.metricsRegion.updateGet(EnvironmentEdgeManager.currentTime() - before);
6950     }
6951 
6952   }
6953 
6954   @Override
6955   public void mutateRow(RowMutations rm) throws IOException {
6956     // Don't need nonces here - RowMutations only supports puts and deletes
6957     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
6958   }
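
  /**
   * A small illustrative sketch, not part of the original API: a put and a
   * delete applied atomically to one row through the method above.  Column
   * names are hypothetical placeholders.
   */
  private void exampleMutateRow(final byte[] row) throws IOException {
    Put put = new Put(row);
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("new"), Bytes.toBytes("value"));
    Delete delete = new Delete(row);
    delete.addColumns(Bytes.toBytes("cf"), Bytes.toBytes("old"));
    RowMutations rm = new RowMutations(row);
    rm.add(put);
    rm.add(delete);
    mutateRow(rm); // both mutations commit atomically under the row lock
  }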
6959 
6960   /**
6961    * Perform atomic mutations within the region w/o nonces.
6962    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
6963    */
6964   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6965       Collection<byte[]> rowsToLock) throws IOException {
6966     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
6967   }
6968 
6969   /**
6970    * Perform atomic mutations within the region.
6971    * @param mutations The list of mutations to perform.
6972    * <code>mutations</code> can contain operations for multiple rows.
6973    * Caller has to ensure that all rows are contained in this region.
6974    * @param rowsToLock Rows to lock.  If multiple rows are locked, care should
6975    * be taken that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
6976    * @param nonceGroup Optional nonce group of the operation (client Id)
6977    * @param nonce Optional nonce of the operation (unique random id to ensure
6978    * "more idempotence")
6979    * @throws IOException
6980    */
6981   @Override
6982   public void mutateRowsWithLocks(Collection<Mutation> mutations,
6983       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
6984     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
6985     processRowsWithLocks(proc, -1, nonceGroup, nonce);
6986   }
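
  /**
   * A small illustrative sketch, not part of the original API: atomically update
   * two different rows of this region.  As the javadoc above notes, the rows to
   * lock are sorted first so locks are always taken in the same order.  Row and
   * column names are hypothetical placeholders.
   */
  private void exampleMultiRowMutation(final byte[] rowA, final byte[] rowB) throws IOException {
    List<byte[]> rowsToLock = new ArrayList<byte[]>(Arrays.asList(rowA, rowB));
    Collections.sort(rowsToLock, Bytes.BYTES_COMPARATOR);
    List<Mutation> mutations = new ArrayList<Mutation>();
    mutations.add(new Put(rowA).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("1")));
    mutations.add(new Put(rowB).addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("2")));
    mutateRowsWithLocks(mutations, rowsToLock);
  }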
6987 
6988   /**
6989    * @return the current load statistics for the region
6990    */
6991   public ClientProtos.RegionLoadStats getRegionStats() {
6992     if (!regionStatsEnabled) {
6993       return null;
6994     }
6995     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
6996     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
6997         .memstoreFlushSize)));
6998     stats.setHeapOccupancy((int) (rsServices.getHeapMemoryManager().getHeapOccupancyPercent() * 100));
6999     return stats.build();
7000   }
7001 
7002   @Override
7003   public void processRowsWithLocks(RowProcessor<?,?> processor) throws IOException {
7004     processRowsWithLocks(processor, rowProcessorTimeout, HConstants.NO_NONCE,
7005       HConstants.NO_NONCE);
7006   }
7007 
7008   @Override
7009   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
7010       throws IOException {
7011     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
7012   }
7013 
7014   @Override
7015   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
7016       long nonceGroup, long nonce) throws IOException {
7017 
7018     for (byte[] row : processor.getRowsToLock()) {
7019       checkRow(row, "processRowsWithLocks");
7020     }
7021     if (!processor.readOnly()) {
7022       checkReadOnly();
7023     }
7024     checkResources();
7025 
7026     startRegionOperation();
7027     WALEdit walEdit = new WALEdit();
7028 
7029     // 1. Run pre-process hook
7030     try {
7031       processor.preProcess(this, walEdit);
7032     } catch (IOException e) {
7033       closeRegionOperation();
7034       throw e;
7035     }
7036     // Short circuit the read only case
7037     if (processor.readOnly()) {
7038       try {
7039         long now = EnvironmentEdgeManager.currentTime();
7040         doProcessRowWithTimeout(
7041             processor, now, this, null, null, timeout);
7042         processor.postProcess(this, walEdit, true);
7043       } finally {
7044         closeRegionOperation();
7045       }
7046       return;
7047     }
7048 
7049     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
7050     boolean locked;
7051     boolean walSyncSuccessful = false;
7052     List<RowLock> acquiredRowLocks;
7053     long addedSize = 0;
7054     List<Mutation> mutations = new ArrayList<Mutation>();
7055     List<Cell> memstoreCells = new ArrayList<Cell>();
7056     Collection<byte[]> rowsToLock = processor.getRowsToLock();
7057     long mvccNum = 0;
7058     WALKey walKey = null;
7059     try {
7060       // 2. Acquire the row lock(s)
7061       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
7062       for (byte[] row : rowsToLock) {
7063         // Attempt to lock all involved rows, throw if any lock times out
7064         acquiredRowLocks.add(getRowLock(row));
7065       }
7066       // 3. Region lock
7067       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
7068       locked = true;
7069       // Get a mvcc write number
7070       mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
7071 
7072       long now = EnvironmentEdgeManager.currentTime();
7073       try {
7074         // 4. Let the processor scan the rows, generate mutations and add
7075         //    waledits
7076         doProcessRowWithTimeout(
7077             processor, now, this, mutations, walEdit, timeout);
7078 
7079         if (!mutations.isEmpty()) {
7080           // 5. Start mvcc transaction
7081           writeEntry = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
7082           // 6. Call the preBatchMutate hook
7083           processor.preBatchMutate(this, walEdit);
7084           // 7. Apply to memstore
7085           for (Mutation m : mutations) {
7086             // Handle any tag based cell features
7087             rewriteCellTags(m.getFamilyCellMap(), m);
7088 
7089             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
7090               Cell cell = cellScanner.current();
7091               CellUtil.setSequenceId(cell, mvccNum);
7092               Store store = getStore(cell);
7093               if (store == null) {
7094                 checkFamily(CellUtil.cloneFamily(cell));
7095                 // unreachable
7096               }
7097               Pair<Long, Cell> ret = store.add(cell);
7098               addedSize += ret.getFirst();
7099               memstoreCells.add(ret.getSecond());
7100             }
7101           }
7102 
7103           long txid = 0;
7104           // 8. Append no sync
7105           if (!walEdit.isEmpty()) {
7106             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7107             walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
7108               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, now,
7109               processor.getClusterIds(), nonceGroup, nonce);
7110             txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
7111               walKey, walEdit, getSequenceId(), true, memstoreCells);
7112           }
7113           if(walKey == null){
7114             // since we use wal sequence Id as mvcc, for SKIP_WAL changes we need a "faked" WALEdit
7115             // to get a sequence id assigned which is done by FSWALEntry#stampRegionSequenceId
7116             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
7117           }
7118           // 9. Release region lock
7119           if (locked) {
7120             this.updatesLock.readLock().unlock();
7121             locked = false;
7122           }
7123 
7124           // 10. Release row lock(s)
7125           releaseRowLocks(acquiredRowLocks);
7126 
7127           // 11. Sync edit log
7128           if (txid != 0) {
7129             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
7130           }
7131           walSyncSuccessful = true;
7132           // 12. call postBatchMutate hook
7133           processor.postBatchMutate(this);
7134         }
7135       } finally {
7136         // TODO: Make this method look like all other methods that are doing append/sync and
7137         // memstore rollback such as append and doMiniBatchMutation. Currently it is a little
7138         // different. Make them all share same code!
7139         if (!mutations.isEmpty() && !walSyncSuccessful) {
7140           String row = processor.getRowsToLock().isEmpty() ? "" :
7141             " for row(s):" + StringUtils.byteToHexString(processor.getRowsToLock().iterator().next())
7142             + "...";
7143           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
7144               " memstore keyvalues" + row);
7145           for (Mutation m : mutations) {
7146             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
7147               Cell cell = cellScanner.current();
7148               getStore(cell).rollback(cell);
7149             }
7150           }
7151           if (writeEntry != null) {
7152             mvcc.cancelMemstoreInsert(writeEntry);
7153             writeEntry = null;
7154           }
7155         }
7156         // 13. Roll mvcc forward
7157         if (writeEntry != null) {
7158           mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
7159         }
7160         if (locked) {
7161           this.updatesLock.readLock().unlock();
7162         }
7163         // release locks if some were acquired but another timed out
7164         releaseRowLocks(acquiredRowLocks);
7165       }
7166 
7167       // 14. Run post-process hook
7168       processor.postProcess(this, walEdit, walSyncSuccessful);
7169 
7170     } finally {
7171       closeRegionOperation();
7172       if (!mutations.isEmpty() &&
7173           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
7174         requestFlush();
7175       }
7176     }
7177   }
7178 
7179   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
7180                                        final long now,
7181                                        final HRegion region,
7182                                        final List<Mutation> mutations,
7183                                        final WALEdit walEdit,
7184                                        final long timeout) throws IOException {
7185     // Short circuit the no time bound case.
7186     if (timeout < 0) {
7187       try {
7188         processor.process(now, region, mutations, walEdit);
7189       } catch (IOException e) {
7190         String row = processor.getRowsToLock().isEmpty() ? "" :
7191           " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
7192         LOG.warn("RowProcessor:" + processor.getClass().getName() +
7193             " throws Exception" + row, e);
7194         throw e;
7195       }
7196       return;
7197     }
7198 
7199     // Case with time bound
7200     FutureTask<Void> task =
7201       new FutureTask<Void>(new Callable<Void>() {
7202         @Override
7203         public Void call() throws IOException {
7204           try {
7205             processor.process(now, region, mutations, walEdit);
7206             return null;
7207           } catch (IOException e) {
7208             String row = processor.getRowsToLock().isEmpty() ? "" :
7209               " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
7210             LOG.warn("RowProcessor:" + processor.getClass().getName() +
7211                 " throws Exception" + row, e);
7212             throw e;
7213           }
7214         }
7215       });
7216     rowProcessorExecutor.execute(task);
7217     try {
7218       task.get(timeout, TimeUnit.MILLISECONDS);
7219     } catch (TimeoutException te) {
7220       String row = processor.getRowsToLock().isEmpty() ? "" :
7221         " on row(s):" + Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...";
7222       LOG.error("RowProcessor timeout:" + timeout + " ms" + row);
7223       throw new IOException(te);
7224     } catch (Exception e) {
7225       throw new IOException(e);
7226     }
7227   }
7228 
7229   public Result append(Append append) throws IOException {
7230     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
7231   }
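
  /**
   * A small illustrative sketch, not part of the original API: append bytes to
   * an existing column value through the method above.  Column names are
   * hypothetical placeholders.
   */
  private Result exampleAppend(final byte[] row) throws IOException {
    Append append = new Append(row);
    append.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("-suffix"));
    // Read-modify-write under the row lock; returns the combined value unless
    // the client asked for no results.
    return append(append);
  }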
7232 
7233   // TODO: There's a lot of boiler plate code identical to increment.
7234   // We should refactor append and increment as local get-mutate-put
7235   // transactions, so all stores only go through one code path for puts.
7236 
7237   @Override
7238   public Result append(Append append, long nonceGroup, long nonce) throws IOException {
7239     byte[] row = append.getRow();
7240     checkRow(row, "append");
7241     boolean flush = false;
7242     Durability durability = getEffectiveDurability(append.getDurability());
7243     boolean writeToWAL = durability != Durability.SKIP_WAL;
7244     WALEdit walEdits = null;
7245     List<Cell> allKVs = new ArrayList<Cell>(append.size());
7246     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
7247     long size = 0;
7248     long txid = 0;
7249 
7250     checkReadOnly();
7251     checkResources();
7252     // Lock row
7253     startRegionOperation(Operation.APPEND);
7254     this.writeRequestsCount.increment();
7255     long mvccNum = 0;
7256     WriteEntry writeEntry = null;
7257     WALKey walKey = null;
7258     RowLock rowLock = null;
7259     List<Cell> memstoreCells = new ArrayList<Cell>();
7260     boolean doRollBackMemstore = false;
7261     try {
7262       rowLock = getRowLock(row);
7263       try {
7264         lock(this.updatesLock.readLock());
7265         try {
7266           // wait for all prior MVCC transactions to finish - while we hold the row lock
7267           // (so that we are guaranteed to see the latest state)
7268           mvcc.waitForPreviousTransactionsComplete();
7269           if (this.coprocessorHost != null) {
7270             Result r = this.coprocessorHost.preAppendAfterRowLock(append);
7271             if(r!= null) {
7272               return r;
7273             }
7274           }
7275           // now start my own transaction
7276           mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
7277           writeEntry = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
7278           long now = EnvironmentEdgeManager.currentTime();
7279           // Process each family
7280           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
7281 
7282             Store store = stores.get(family.getKey());
7283             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
7284 
7285             // Sort the cells so that they match the order that they
7286             // appear in the Get results. Otherwise, we won't be able to
7287             // find the existing values if the cells are not specified
7288             // in order by the client since cells are in an array list.
7289             Collections.sort(family.getValue(), store.getComparator());
7290             // Get previous values for all columns in this family
7291             Get get = new Get(row);
7292             for (Cell cell : family.getValue()) {
7293               get.addColumn(family.getKey(), CellUtil.cloneQualifier(cell));
7294             }
7295             List<Cell> results = get(get, false);
7296 
7297             // Iterate the input columns and update existing values if they were
7298             // found, otherwise add new column initialized to the append value
7299 
7300             // Avoid as much copying as possible. We may need to rewrite and
7301             // consolidate tags. Bytes are only copied once.
7302             // Would be nice if KeyValue had scatter/gather logic
7303             int idx = 0;
7304             for (Cell cell : family.getValue()) {
7305               Cell newCell;
7306               Cell oldCell = null;
7307               if (idx < results.size()
7308                   && CellUtil.matchingQualifier(results.get(idx), cell)) {
7309                 oldCell = results.get(idx);
7310                 long ts = Math.max(now, oldCell.getTimestamp());
7311 
7312                 // Process cell tags
7313                 List<Tag> tags = Tag.carryForwardTags(null, oldCell);
7314                 tags = Tag.carryForwardTags(tags, cell);
7315                 tags = carryForwardTTLTag(tags, append);
7316 
7317                 // Rebuild tags
7318                 byte[] tagBytes = Tag.fromList(tags);
7319 
7320                 // allocate an empty cell once
7321                 newCell = new KeyValue(row.length, cell.getFamilyLength(),
7322                     cell.getQualifierLength(), ts, KeyValue.Type.Put,
7323                     oldCell.getValueLength() + cell.getValueLength(),
7324                     tagBytes == null? 0: tagBytes.length);
7325                 // copy in row, family, and qualifier
7326                 System.arraycopy(cell.getRowArray(), cell.getRowOffset(),
7327                   newCell.getRowArray(), newCell.getRowOffset(), cell.getRowLength());
7328                 System.arraycopy(cell.getFamilyArray(), cell.getFamilyOffset(),
7329                   newCell.getFamilyArray(), newCell.getFamilyOffset(),
7330                   cell.getFamilyLength());
7331                 System.arraycopy(cell.getQualifierArray(), cell.getQualifierOffset(),
7332                   newCell.getQualifierArray(), newCell.getQualifierOffset(),
7333                   cell.getQualifierLength());
7334                 // copy in the value
7335                 System.arraycopy(oldCell.getValueArray(), oldCell.getValueOffset(),
7336                   newCell.getValueArray(), newCell.getValueOffset(),
7337                   oldCell.getValueLength());
7338                 System.arraycopy(cell.getValueArray(), cell.getValueOffset(),
7339                   newCell.getValueArray(),
7340                   newCell.getValueOffset() + oldCell.getValueLength(),
7341                   cell.getValueLength());
7342                 // Copy in tag data
7343                 if (tagBytes != null) {
7344                   System.arraycopy(tagBytes, 0, newCell.getTagsArray(), newCell.getTagsOffset(),
7345                     tagBytes.length);
7346                 }
7347                 idx++;
7348               } else {
7349                 // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP
7350                 CellUtil.updateLatestStamp(cell, now);
7351 
7352                 // Cell TTL handling
7353 
7354                 if (append.getTTL() != Long.MAX_VALUE) {
7355                   // Add the new TTL tag
7356                   newCell = new KeyValue(cell.getRowArray(), cell.getRowOffset(),
7357                       cell.getRowLength(),
7358                     cell.getFamilyArray(), cell.getFamilyOffset(),
7359                       cell.getFamilyLength(),
7360                     cell.getQualifierArray(), cell.getQualifierOffset(),
7361                       cell.getQualifierLength(),
7362                     cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
7363                     cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
7364                     carryForwardTTLTag(append));
7365                 } else {
7366                   newCell = cell;
7367                 }
7368               }
7369 
7370               CellUtil.setSequenceId(newCell, mvccNum);
7371               // Give coprocessors a chance to update the new cell
7372               if (coprocessorHost != null) {
7373                 newCell = coprocessorHost.postMutationBeforeWAL(RegionObserver.MutationType.APPEND,
7374                     append, oldCell, newCell);
7375               }
7376               kvs.add(newCell);
7377 
7378               // Append update to WAL
7379               if (writeToWAL) {
7380                 if (walEdits == null) {
7381                   walEdits = new WALEdit();
7382                 }
7383                 walEdits.add(newCell);
7384               }
7385             }
7386 
7387             //store the kvs to the temporary memstore before writing WAL
7388             tempMemstore.put(store, kvs);
7389           }
7390 
7391           //Actually write to Memstore now
7392           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
7393             Store store = entry.getKey();
7394             if (store.getFamily().getMaxVersions() == 1) {
7395               // upsert if VERSIONS for this CF == 1
7396               size += store.upsert(entry.getValue(), getSmallestReadPoint());
7397               memstoreCells.addAll(entry.getValue());
7398             } else {
7399               // otherwise keep older versions around
7400               for (Cell cell: entry.getValue()) {
7401                 Pair<Long, Cell> ret = store.add(cell);
7402                 size += ret.getFirst();
7403                 memstoreCells.add(ret.getSecond());
7404                 doRollBackMemstore = true;
7405               }
7406             }
7407             allKVs.addAll(entry.getValue());
7408           }
7409 
7410           // Actually write to WAL now
7411           if (writeToWAL) {
7412             // Using default cluster id, as this can only happen in the originating
7413             // cluster. A slave cluster receives the final value (not the delta)
7414             // as a Put.
7415             // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
7416             walKey = new HLogKey(getRegionInfo().getEncodedNameAsBytes(),
7417               this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, nonceGroup, nonce);
7418             txid = this.wal.append(this.htableDescriptor, getRegionInfo(), walKey, walEdits,
7419               this.sequenceId, true, memstoreCells);
7420           } else {
7421             recordMutationWithoutWal(append.getFamilyCellMap());
7422           }
7423           if (walKey == null) {
7424             // Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
7425             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
7426           }
7427           size = this.addAndGetGlobalMemstoreSize(size);
7428           flush = isFlushSize(size);
7429         } finally {
7430           this.updatesLock.readLock().unlock();
7431         }
7432       } finally {
7433         rowLock.release();
7434         rowLock = null;
7435       }
7436       // sync the transaction log outside the rowlock
7437       if(txid != 0){
7438         syncOrDefer(txid, durability);
7439       }
7440       doRollBackMemstore = false;
7441     } finally {
7442       if (rowLock != null) {
7443         rowLock.release();
7444       }
7445       // if the wal sync was unsuccessful, remove keys from memstore
7446       if (doRollBackMemstore) {
7447         rollbackMemstore(memstoreCells);
7448         if (writeEntry != null) mvcc.cancelMemstoreInsert(writeEntry);
7449       } else if (writeEntry != null) {
7450         mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
7451       }
7452 
7453       closeRegionOperation(Operation.APPEND);
7454     }
7455 
7456     if (this.metricsRegion != null) {
7457       this.metricsRegion.updateAppend();
7458     }
7459 
7460     if (flush) {
7461       // Request a cache flush. Do it outside update lock.
7462       requestFlush();
7463     }
7464 
7465 
7466     return append.isReturnResults() ? Result.create(allKVs) : null;
7467   }
7468 
7469   public Result increment(Increment increment) throws IOException {
7470     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
7471   }
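
  /**
   * A small illustrative sketch, not part of the original API: bump a counter
   * column by one through the method above.  Column names are hypothetical
   * placeholders.
   */
  private Result exampleIncrement(final byte[] row) throws IOException {
    Increment increment = new Increment(row);
    increment.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("counter"), 1L);
    // Read-modify-write under the row lock; the new totals come back as a Result.
    return increment(increment);
  }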
7472 
7473   // TODO: There's a lot of boiler plate code identical to append.
7474   // We should refactor append and increment as local get-mutate-put
7475   // transactions, so all stores only go through one code path for puts.
7476 
7477   @Override
7478   public Result increment(Increment increment, long nonceGroup, long nonce)
7479   throws IOException {
7480     checkReadOnly();
7481     checkResources();
7482     checkRow(increment.getRow(), "increment");
7483     checkFamilies(increment.getFamilyCellMap().keySet());
7484     startRegionOperation(Operation.INCREMENT);
7485     this.writeRequestsCount.increment();
7486     try {
7487       return doIncrement(increment, nonceGroup, nonce);
7488     } finally {
7489       if (this.metricsRegion != null) this.metricsRegion.updateIncrement();
7490       closeRegionOperation(Operation.INCREMENT);
7491     }
7492   }
7493 
7494   private Result doIncrement(Increment increment, long nonceGroup, long nonce) throws IOException {
7495     RowLock rowLock = null;
7496     WriteEntry writeEntry = null;
7497     WALKey walKey = null;
7498     boolean doRollBackMemstore = false;
7499     long accumulatedResultSize = 0;
7500     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
7501     List<Cell> memstoreCells = new ArrayList<Cell>();
7502     Durability effectiveDurability = getEffectiveDurability(increment.getDurability());
7503     try {
7504       rowLock = getRowLock(increment.getRow());
7505       long txid = 0;
7506       try {
7507         lock(this.updatesLock.readLock());
7508         try {
7509           // Wait for all prior MVCC transactions to finish - while we hold the row lock
7510           // (so that we are guaranteed to see the latest increment)
7511           this.mvcc.waitForPreviousTransactionsComplete();
7512           if (this.coprocessorHost != null) {
7513             Result r = this.coprocessorHost.preIncrementAfterRowLock(increment);
7514             if (r != null) return r;
7515           }
7516           // Now start my own transaction
7517           long mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
7518           writeEntry = this.mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
7519 
7520           // Process increments a Store/family at a time.
7521           long now = EnvironmentEdgeManager.currentTime();
7522           final boolean writeToWAL = effectiveDurability != Durability.SKIP_WAL;
7523           WALEdit walEdits = null;
7524           for (Map.Entry<byte [], List<Cell>> entry: increment.getFamilyCellMap().entrySet()) {
7525             byte [] columnFamilyName = entry.getKey();
7526             List<Cell> increments = entry.getValue();
7527             Store store = this.stores.get(columnFamilyName);
7528             // Do increment for this store; be sure to 'sort' the increments first so they
7529             // match the order in which we get back the current Cells when we do the Get.
7530             List<Cell> results = applyIncrementsToColumnFamily(increment, columnFamilyName,
7531                 sort(increments, store.getComparator()), now, mvccNum, allKVs, null);
7532             if (!results.isEmpty()) {
7533               // Prepare WAL updates
7534               if (writeToWAL) {
7535                 // Handmade loop on arraylist is faster than enhanced for-loop.
7536                 // See http://developer.android.com/training/articles/perf-tips.html
7537                 int resultsSize = results.size();
7538                 for (int i = 0; i < resultsSize; i++) {
7539                   if (walEdits == null) walEdits = new WALEdit();
7540                   walEdits.add(results.get(i));
7541                 }
7542               }
7543               // Now write to this Store's memstore.
7544               if (store.getFamily().getMaxVersions() == 1) {
7545                 // Upsert if VERSIONS for this CF == 1
7546                 accumulatedResultSize += store.upsert(results, getSmallestReadPoint());
7547                 memstoreCells.addAll(results);
7548                 // TODO: St.Ack 20151222 Why no rollback in this case?
7549               } else {
7550                 // Otherwise keep older versions around
7551                 for (Cell cell: results) {
7552                   Pair<Long, Cell> ret = store.add(cell);
7553                   accumulatedResultSize += ret.getFirst();
7554                   memstoreCells.add(ret.getSecond());
7555                   doRollBackMemstore = true;
7556                 }
7557               }
7558             }
7559           }
7560 
7561           // Actually write to WAL now
7562           if (walEdits != null && !walEdits.isEmpty()) {
7563             if (writeToWAL) {
7564               // Using default cluster id, as this can only happen in the originating cluster.
7565               // A slave cluster receives the final value (not the delta) as a Put. We use HLogKey
7566               // here instead of WALKey directly to support legacy coprocessors.
7567               walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
7568                 this.htableDescriptor.getTableName(), WALKey.NO_SEQUENCE_ID, nonceGroup, nonce);
7569               txid = this.wal.append(this.htableDescriptor, this.getRegionInfo(),
7570                 walKey, walEdits, getSequenceId(), true, memstoreCells);
7571             } else {
7572               recordMutationWithoutWal(increment.getFamilyCellMap());
7573             }
7574           }
7575           if (walKey == null) {
7576             // Append a faked WALEdit in order for SKIP_WAL updates to get mvccNum assigned
7577             walKey = this.appendEmptyEdit(this.wal, memstoreCells);
7578           }
7579         } finally {
7580           this.updatesLock.readLock().unlock();
7581         }
7582       } finally {
7583         rowLock.release();
7584         rowLock = null;
7585       }
7586       // sync the transaction log outside the rowlock
7587       if (txid != 0) syncOrDefer(txid, effectiveDurability);
7588       doRollBackMemstore = false;
7589     } finally {
7590       if (rowLock != null) rowLock.release();
7591       // if the wal sync was unsuccessful, remove keys from memstore
7592       if (doRollBackMemstore) rollbackMemstore(memstoreCells);
7593       if (writeEntry != null) mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
7594     }
7595     // Request a cache flush.  Do it outside update lock.
7596     if (isFlushSize(this.addAndGetGlobalMemstoreSize(accumulatedResultSize))) requestFlush();
7597     return increment.isReturnResults() ? Result.create(allKVs) : null;
7598   }
7599 
7600   /**
7601    * @return Sorted list of <code>cells</code> using <code>comparator</code>
7602    */
7603   private static List<Cell> sort(List<Cell> cells, final Comparator<Cell> comparator) {
7604     Collections.sort(cells, comparator);
7605     return cells;
7606   }
7607 
7608   /**
7609    * Apply increments to a column family.
7610    * @param sortedIncrements The passed in increments to apply MUST be sorted so that they match
7611    * the order that they appear in the Get results (get results will be sorted on return).
7612    * Otherwise, we won't be able to find the existing values if the cells are not specified in
7613    * order by the client since cells are in an array list.
7614    * @param isolation Isolation level to use when running the 'get'. Pass null for default.
7615    * @return Resulting increments after <code>sortedIncrements</code> have been applied to current
7616    * values (if any -- else passed increment is the final result).
7617    * @throws IOException
7618    */
7619   private List<Cell> applyIncrementsToColumnFamily(Increment increment, byte[] columnFamilyName,
7620       List<Cell> sortedIncrements, long now, long mvccNum, List<Cell> allKVs,
7621       final IsolationLevel isolation)
7622   throws IOException {
7623     List<Cell> results = new ArrayList<Cell>(sortedIncrements.size());
7624     byte [] row = increment.getRow();
7625     // Get previous values for all columns in this family
7626     List<Cell> currentValues =
7627         getIncrementCurrentValue(increment, columnFamilyName, sortedIncrements, isolation);
7628     // Iterate the input columns and update existing values if they were found, otherwise
7629     // add new column initialized to the increment amount
7630     int idx = 0;
7631     for (int i = 0; i < sortedIncrements.size(); i++) {
7632       Cell inc = sortedIncrements.get(i);
7633       long incrementAmount = getLongValue(inc);
7634       // If increment amount == 0, then don't write this Increment to the WAL.
7635       boolean writeBack = (incrementAmount != 0);
7636       // Carry forward any tags that might have been added by a coprocessor.
7637       List<Tag> tags = Tag.carryForwardTags(inc);
7638 
7639       Cell currentValue = null;
7640       long ts = now;
7641       if (idx < currentValues.size() && CellUtil.matchingQualifier(currentValues.get(idx), inc)) {
7642         currentValue = currentValues.get(idx);
7643         ts = Math.max(now, currentValue.getTimestamp());
7644         incrementAmount += getLongValue(currentValue);
7645         // Carry forward all tags
7646         tags = Tag.carryForwardTags(tags, currentValue);
7647         if (i < (sortedIncrements.size() - 1) &&
7648             !CellUtil.matchingQualifier(inc, sortedIncrements.get(i + 1))) idx++;
7649       }
7650 
7651       // Append new incremented KeyValue to list
7652       byte [] qualifier = CellUtil.cloneQualifier(inc);
7653       byte [] incrementAmountInBytes = Bytes.toBytes(incrementAmount);
7654       tags = carryForwardTTLTag(tags, increment);
7655 
7656       Cell newValue = new KeyValue(row, 0, row.length,
7657         columnFamilyName, 0, columnFamilyName.length,
7658         qualifier, 0, qualifier.length,
7659         ts, KeyValue.Type.Put,
7660         incrementAmountInBytes, 0, incrementAmountInBytes.length,
7661         tags);
7662 
7663       // Don't set an mvcc if none specified. The mvcc may be assigned later in case where we
7664       // write the memstore AFTER we sync our edit to the log.
7665       if (mvccNum != MultiVersionConsistencyControl.NO_WRITE_NUMBER) {
7666         CellUtil.setSequenceId(newValue, mvccNum);
7667       }
7668 
7669       // Give coprocessors a chance to update the new cell
7670       if (coprocessorHost != null) {
7671         newValue = coprocessorHost.postMutationBeforeWAL(
7672             RegionObserver.MutationType.INCREMENT, increment, currentValue, newValue);
7673       }
7674       allKVs.add(newValue);
7675       if (writeBack) {
7676         results.add(newValue);
7677       }
7678     }
7679     return results;
7680   }
7681 
7682   /**
7683    * @return The long value held in the passed in <code>cell</code>
7684    * @throws DoNotRetryIOException if the cell's value is not exactly {@link Bytes#SIZEOF_LONG} bytes wide
7685    */
7686   private static long getLongValue(final Cell cell) throws DoNotRetryIOException {
7687     int len = cell.getValueLength();
7688     if (len != Bytes.SIZEOF_LONG) {
7689       // throw DoNotRetryIOException instead of IllegalArgumentException
7690       throw new DoNotRetryIOException("Field is not a long, it's " + len + " bytes wide");
7691     }
7692     return Bytes.toLong(cell.getValueArray(), cell.getValueOffset(), len);
7693   }
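       // For illustration only (a sketch, not used by this class): counter values are plain
       // 8-byte big-endian longs, e.g.
       //   byte [] encoded = Bytes.toBytes(42L);   // always Bytes.SIZEOF_LONG == 8 bytes
       //   long decoded = Bytes.toLong(encoded);   // 42
       // A cell whose value has any other width trips the DoNotRetryIOException above.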
7694 
7695   /**
7696    * Do a specific Get on passed <code>columnFamily</code> and column qualifiers
7697    * from <code>increments</code> only.
7698    * @param increment
7699    * @param columnFamily
7700    * @param increments
7701    * @return The current Cells to apply the increments to
7702    * @throws IOException
7703    */
7704   private List<Cell> getIncrementCurrentValue(final Increment increment, byte [] columnFamily,
7705       final List<Cell> increments, final IsolationLevel isolation)
7706   throws IOException {
7707     Get get = new Get(increment.getRow());
7708     if (isolation != null) get.setIsolationLevel(isolation);
7709     for (Cell cell: increments) {
7710       get.addColumn(columnFamily, CellUtil.cloneQualifier(cell));
7711     }
7712     TimeRange tr = increment.getTimeRange();
7713     get.setTimeRange(tr.getMin(), tr.getMax());
7714     return get(get, false);
7715   }
7716 
7717   private static List<Tag> carryForwardTTLTag(final Mutation mutation) {
7718     return carryForwardTTLTag(null, mutation);
7719   }
7720 
7721   /**
7722    * @return The passed in <code>tagsOrNull</code> with the mutation's TTL tag appended, if the mutation carries one
7723    */
7724   private static List<Tag> carryForwardTTLTag(final List<Tag> tagsOrNull,
7725       final Mutation mutation) {
7726     long ttl = mutation.getTTL();
7727     if (ttl == Long.MAX_VALUE) return tagsOrNull;
7728     List<Tag> tags = tagsOrNull;
7729     // If we are creating the list in here, we are the last thing checked, so this TTL tag will
7730     // be the only entry; size the list at '1' (as was done in earlier versions of the
7731     // tag-handling code).
7732     if (tags == null) tags = new ArrayList<Tag>(1);
7733     tags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(ttl)));
7734     return tags;
7735   }
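       // Client-side sketch of how a per-mutation TTL typically gets here (names are illustrative):
       //   Increment inc = new Increment(row);
       //   inc.addColumn(family, qualifier, 1L);
       //   inc.setTTL(TimeUnit.HOURS.toMillis(1));   // surfaces as mutation.getTTL() above
       // When no TTL was set, Mutation#getTTL() returns Long.MAX_VALUE and no tag is added.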
7736 
7737   //
7738   // New HBASE-880 Helpers
7739   //
7740 
7741   private void checkFamily(final byte [] family)
7742   throws NoSuchColumnFamilyException {
7743     if (!this.htableDescriptor.hasFamily(family)) {
7744       throw new NoSuchColumnFamilyException("Column family " +
7745           Bytes.toString(family) + " does not exist in region " + this
7746           + " in table " + this.htableDescriptor);
7747     }
7748   }
7749 
7750   public static final long FIXED_OVERHEAD = ClassSize.align(
7751       ClassSize.OBJECT +
7752       ClassSize.ARRAY +
7753       45 * ClassSize.REFERENCE + 3 * Bytes.SIZEOF_INT +
7754       (14 * Bytes.SIZEOF_LONG) +
7755       5 * Bytes.SIZEOF_BOOLEAN);
7756 
7757   // woefully out of date - currently missing:
7758   // 1 x HashMap - coprocessorServiceHandlers
7759   // 6 x Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
7760   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
7761   //   writeRequestsCount
7762   // 1 x HRegion$WriteState - writestate
7763   // 1 x RegionCoprocessorHost - coprocessorHost
7764   // 1 x RegionSplitPolicy - splitPolicy
7765   // 1 x MetricsRegion - metricsRegion
7766   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
7767   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
7768       ClassSize.OBJECT + // closeLock
7769       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
7770       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
7771       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
7772       WriteState.HEAP_SIZE + // writestate
7773       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
7774       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
7775       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
7776       + ClassSize.TREEMAP // maxSeqIdInStores
7777       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
7778       ;
7779 
7780   @Override
7781   public long heapSize() {
7782     long heapSize = DEEP_OVERHEAD;
7783     for (Store store : this.stores.values()) {
7784       heapSize += store.heapSize();
7785     }
7786     // this does not take into account row locks, recent flushes, mvcc entries, and more
7787     return heapSize;
7788   }
7789 
7790   /*
7791    * This method calls System.exit.
7792    * @param message Message to print out.  May be null.
7793    */
7794   private static void printUsageAndExit(final String message) {
7795     if (message != null && message.length() > 0) System.out.println(message);
7796     System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]");
7797     System.out.println("Options:");
7798     System.out.println(" major_compact  Pass this option to major compact " +
7799       "passed region.");
7800     System.out.println("Default outputs scan of passed region.");
7801     System.exit(1);
7802   }
7803 
7804   @Override
7805   public boolean registerService(Service instance) {
7806     /*
7807      * No stacking of instances is allowed for a single service name
7808      */
7809     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
7810     String serviceName = CoprocessorRpcUtils.getServiceName(serviceDesc);
7811     if (coprocessorServiceHandlers.containsKey(serviceName)) {
7812       LOG.error("Coprocessor service " + serviceName +
7813               " already registered, rejecting request from " + instance
7814       );
7815       return false;
7816     }
7817 
7818     coprocessorServiceHandlers.put(serviceName, instance);
7819     if (LOG.isDebugEnabled()) {
7820       LOG.debug("Registered coprocessor service: region=" +
7821           Bytes.toStringBinary(getRegionInfo().getRegionName()) +
7822           " service=" + serviceName);
7823     }
7824     return true;
7825   }
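       // Sketch of the usual caller: a region coprocessor registering its generated endpoint
       // ('MyServiceProtos.MyService' is a hypothetical name, not a real class):
       //   region.registerService(new MyServiceProtos.MyService() { ... });
       // A second registration under the same service name is rejected by the check above.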
7826 
7827   @Override
7828   public Message execService(RpcController controller, CoprocessorServiceCall call)
7829       throws IOException {
7830     String serviceName = call.getServiceName();
7831     String methodName = call.getMethodName();
7832     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
7833       throw new UnknownProtocolException(null,
7834           "No registered coprocessor service found for name "+serviceName+
7835           " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7836     }
7837 
7838     Service service = coprocessorServiceHandlers.get(serviceName);
7839     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
7840     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
7841     if (methodDesc == null) {
7842       throw new UnknownProtocolException(service.getClass(),
7843           "Unknown method "+methodName+" called on service "+serviceName+
7844               " in region "+Bytes.toStringBinary(getRegionInfo().getRegionName()));
7845     }
7846 
7847     Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType();
7848     ProtobufUtil.mergeFrom(builder, call.getRequest());
7849     Message request = builder.build();
7850 
7851     if (coprocessorHost != null) {
7852       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
7853     }
7854 
7855     final Message.Builder responseBuilder =
7856         service.getResponsePrototype(methodDesc).newBuilderForType();
7857     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
7858       @Override
7859       public void run(Message message) {
7860         if (message != null) {
7861           responseBuilder.mergeFrom(message);
7862         }
7863       }
7864     });
7865 
7866     if (coprocessorHost != null) {
7867       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
7868     }
7869 
7870     return responseBuilder.build();
7871   }
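       // Client-side sketch of what ends up driving execService (stub and message names are
       // hypothetical; only Table#coprocessorService(byte[]) is real API):
       //   CoprocessorRpcChannel channel = table.coprocessorService(row);
       //   MyServiceProtos.MyService.BlockingInterface stub =
       //       MyServiceProtos.MyService.newBlockingStub(channel);
       //   MyResponse resp = stub.myMethod(null, MyRequest.getDefaultInstance());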
7872 
7873   /*
7874    * Process table.
7875    * Do major compaction or list content.
7876    * @throws IOException
7877    */
7878   private static void processTable(final FileSystem fs, final Path p,
7879       final WALFactory walFactory, final Configuration c,
7880       final boolean majorCompact)
7881   throws IOException {
7882     HRegion region;
7883     FSTableDescriptors fst = new FSTableDescriptors(c);
7884     // Currently expects tables have one region only.
7885     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
7886       final WAL wal = walFactory.getMetaWAL(
7887           HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes());
7888       region = HRegion.newHRegion(p, wal, fs, c,
7889         HRegionInfo.FIRST_META_REGIONINFO, fst.get(TableName.META_TABLE_NAME), null);
7890     } else {
7891       throw new IOException("Not a known catalog table: " + p.toString());
7892     }
7893     try {
7894       region.initialize(null);
7895       if (majorCompact) {
7896         region.compact(true);
7897       } else {
7898         // Default behavior
7899         Scan scan = new Scan();
7900         // scan.addFamily(HConstants.CATALOG_FAMILY);
7901         RegionScanner scanner = region.getScanner(scan);
7902         try {
7903           List<Cell> kvs = new ArrayList<Cell>();
7904           boolean done;
7905           do {
7906             kvs.clear();
7907             done = scanner.next(kvs);
7908             if (kvs.size() > 0) LOG.info(kvs);
7909           } while (done);
7910         } finally {
7911           scanner.close();
7912         }
7913       }
7914     } finally {
7915       region.close();
7916     }
7917   }
7918 
7919   boolean shouldForceSplit() {
7920     return this.splitRequest;
7921   }
7922 
7923   byte[] getExplicitSplitPoint() {
7924     return this.explicitSplitPoint;
7925   }
7926 
7927   void forceSplit(byte[] sp) {
7928     // This HRegion will go away after the forced split is successful
7929     // But if a forced split fails, we need to clear forced split.
7930     this.splitRequest = true;
7931     if (sp != null) {
7932       this.explicitSplitPoint = sp;
7933     }
7934   }
7935 
7936   void clearSplit() {
7937     this.splitRequest = false;
7938     this.explicitSplitPoint = null;
7939   }
7940 
7941   /**
7942    * Give the region a chance to prepare before it is split.
7943    */
7944   protected void prepareToSplit() {
7945     // nothing
7946   }
7947 
7948   /**
7949    * Return the split point. A null return value indicates the region isn't splittable.
7950    * If the split point isn't explicitly specified, this method goes over the stores
7951    * to find the best split point. Currently the criterion for the best split point
7952    * is based on the size of the store.
7953    */
7954   public byte[] checkSplit() {
7955     // Can't split META
7956     if (this.getRegionInfo().isMetaTable() ||
7957         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
7958       if (shouldForceSplit()) {
7959         LOG.warn("Cannot split meta region in HBase 0.20 and above");
7960       }
7961       return null;
7962     }
7963 
7964     // Can't split region which is in recovering state
7965     if (this.isRecovering()) {
7966       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
7967       return null;
7968     }
7969 
7970     if (!splitPolicy.shouldSplit()) {
7971       return null;
7972     }
7973 
7974     byte[] ret = splitPolicy.getSplitPoint();
7975 
7976     if (ret != null) {
7977       try {
7978         checkRow(ret, "calculated split");
7979       } catch (IOException e) {
7980         LOG.error("Ignoring invalid split", e);
7981         return null;
7982       }
7983     }
7984     return ret;
7985   }
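       // Operator-side sketch: a forced split with an explicit point typically reaches forceSplit()
       // above via the Admin API (table name and split key are illustrative):
       //   admin.split(TableName.valueOf("t1"), Bytes.toBytes("splitkey"));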
7986 
7987   /**
7988    * @return The priority that this region should have in the compaction queue
7989    */
7990   public int getCompactPriority() {
7991     int count = Integer.MAX_VALUE;
7992     for (Store store : stores.values()) {
7993       count = Math.min(count, store.getCompactPriority());
7994     }
7995     return count;
7996   }
7997 
7998 
7999   /** @return the coprocessor host */
8000   @Override
8001   public RegionCoprocessorHost getCoprocessorHost() {
8002     return coprocessorHost;
8003   }
8004 
8005   /** @param coprocessorHost the new coprocessor host */
8006   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
8007     this.coprocessorHost = coprocessorHost;
8008   }
8009 
8010   @Override
8011   public void startRegionOperation() throws IOException {
8012     startRegionOperation(Operation.ANY);
8013   }
8014 
8015   @Override
8016   public void startRegionOperation(Operation op) throws IOException {
8017     switch (op) {
8018     case GET:  // read operations
8019     case SCAN:
8020       checkReadsEnabled();
8021     case INCREMENT: // write operations
8022     case APPEND:
8023     case SPLIT_REGION:
8024     case MERGE_REGION:
8025     case PUT:
8026     case DELETE:
8027     case BATCH_MUTATE:
8028     case COMPACT_REGION:
8029       // when a region is in recovering state, no read, split or merge is allowed
8030       if (isRecovering() && (this.disallowWritesInRecovering ||
8031               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
8032         throw new RegionInRecoveryException(getRegionInfo().getRegionNameAsString() +
8033           " is recovering; cannot take reads");
8034       }
8035       break;
8036     default:
8037       break;
8038     }
8039     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
8040         || op == Operation.COMPACT_REGION) {
8041       // split, merge or compact region doesn't need to check the closing/closed state or lock the
8042       // region
8043       return;
8044     }
8045     if (this.closing.get()) {
8046       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
8047     }
8048     lock(lock.readLock());
8049     if (this.closed.get()) {
8050       lock.readLock().unlock();
8051       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
8052     }
8053     try {
8054       if (coprocessorHost != null) {
8055         coprocessorHost.postStartRegionOperation(op);
8056       }
8057     } catch (Exception e) {
8058       lock.readLock().unlock();
8059       throw new IOException(e);
8060     }
8061   }
8062 
8063   @Override
8064   public void closeRegionOperation() throws IOException {
8065     closeRegionOperation(Operation.ANY);
8066   }
8067 
8068   /**
8069    * Closes the lock. This needs to be called in the finally block corresponding
8070    * to the try block of {@link #startRegionOperation(Operation)}
8071    * @throws IOException
8072    */
8073   public void closeRegionOperation(Operation operation) throws IOException {
8074     lock.readLock().unlock();
8075     if (coprocessorHost != null) {
8076       coprocessorHost.postCloseRegionOperation(operation);
8077     }
8078   }
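       // Canonical pairing (sketch; Operation.X stands in for a real Operation value, mirroring
       // how increment() and append() use it above):
       //   startRegionOperation(Operation.X);
       //   try {
       //     ... do the work ...
       //   } finally {
       //     closeRegionOperation(Operation.X);
       //   }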
8079 
8080   /**
8081    * This method needs to be called before any public call that reads or
8082    * modifies stores in bulk. It has to be called just before a try.
8083    * #closeBulkRegionOperation needs to be called in the try's finally block.
8084    * Acquires a write lock (or a read lock if <code>writeLockNeeded</code> is false) and checks if the region is closing or closed.
8085    * @throws NotServingRegionException when the region is closing or closed
8086    * @throws RegionTooBusyException if failed to get the lock in time
8087    * @throws InterruptedIOException if interrupted while waiting for a lock
8088    */
8089   private void startBulkRegionOperation(boolean writeLockNeeded)
8090       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
8091     if (this.closing.get()) {
8092       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closing");
8093     }
8094     if (writeLockNeeded) lock(lock.writeLock());
8095     else lock(lock.readLock());
8096     if (this.closed.get()) {
8097       if (writeLockNeeded) lock.writeLock().unlock();
8098       else lock.readLock().unlock();
8099       throw new NotServingRegionException(getRegionInfo().getRegionNameAsString() + " is closed");
8100     }
8101   }
8102 
8103   /**
8104    * Closes the lock. This needs to be called in the finally block corresponding
8105    * to the try block of #startBulkRegionOperation
8106    */
8107   private void closeBulkRegionOperation(){
8108     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
8109     else lock.readLock().unlock();
8110   }
8111 
8112   /**
8113    * Update counters for the number of mutations without WAL and the size of possible data loss.
8114    * This information is exposed by the region server metrics.
8115    */
8116   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
8117     numMutationsWithoutWAL.increment();
8118     if (numMutationsWithoutWAL.get() <= 1) {
8119       LOG.info("writing data to region " + this +
8120                " with WAL disabled. Data may be lost in the event of a crash.");
8121     }
8122 
8123     long mutationSize = 0;
8124     for (List<Cell> cells: familyMap.values()) {
8125       assert cells instanceof RandomAccess;
8126       int listSize = cells.size();
8127       for (int i=0; i < listSize; i++) {
8128         Cell cell = cells.get(i);
8129         // TODO we need include tags length also here.
8130         mutationSize += KeyValueUtil.keyLength(cell) + cell.getValueLength();
8131       }
8132     }
8133 
8134     dataInMemoryWithoutWAL.add(mutationSize);
8135   }
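       // Client-side sketch of the case this accounts for (names are illustrative): a mutation
       // that skips the WAL, e.g.
       //   Put p = new Put(row);
       //   p.addColumn(family, qualifier, value);
       //   p.setDurability(Durability.SKIP_WAL);   // counted here instead of being synced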
8136 
8137   private void lock(final Lock lock)
8138       throws RegionTooBusyException, InterruptedIOException {
8139     lock(lock, 1);
8140   }
8141 
8142   /**
8143    * Try to acquire a lock.  Throw RegionTooBusyException
8144    * if failed to get the lock in time. Throw InterruptedIOException
8145    * if interrupted while waiting for the lock.
8146    */
8147   private void lock(final Lock lock, final int multiplier)
8148       throws RegionTooBusyException, InterruptedIOException {
8149     try {
8150       final long waitTime = Math.min(maxBusyWaitDuration,
8151           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
8152       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
8153         throw new RegionTooBusyException(
8154             "failed to get a lock in " + waitTime + " ms. " +
8155                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
8156                 this.getRegionInfo().getRegionNameAsString()) +
8157                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
8158                 this.getRegionServerServices().getServerName()));
8159       }
8160     } catch (InterruptedException ie) {
8161       LOG.info("Interrupted while waiting for a lock");
8162       InterruptedIOException iie = new InterruptedIOException();
8163       iie.initCause(ie);
8164       throw iie;
8165     }
8166   }
8167 
8168   /**
8169    * Calls sync with the given transaction ID if the region's table is not
8170    * deferring it.
8171    * @param txid the transaction to sync up to
8172    * @throws IOException If anything goes wrong with DFS
8173    */
8174   private void syncOrDefer(long txid, Durability durability) throws IOException {
8175     if (this.getRegionInfo().isMetaRegion()) {
8176       this.wal.sync(txid);
8177     } else {
8178       switch(durability) {
8179       case USE_DEFAULT:
8180         // do what table defaults to
8181         if (shouldSyncWAL()) {
8182           this.wal.sync(txid);
8183         }
8184         break;
8185       case SKIP_WAL:
8186         // nothing to do
8187         break;
8188       case ASYNC_WAL:
8189         // nothing to do
8190         break;
8191       case SYNC_WAL:
8192       case FSYNC_WAL:
8193         // sync the WAL edit (SYNC and FSYNC treated the same for now)
8194         this.wal.sync(txid);
8195         break;
8196       }
8197     }
8198   }
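       // For USE_DEFAULT above, the table-level durability decides (see shouldSyncWAL()). A sketch
       // of how that default is typically configured (table name is illustrative):
       //   HTableDescriptor htd = new HTableDescriptor(TableName.valueOf("t1"));
       //   htd.setDurability(Durability.ASYNC_WAL);   // per-table default; mutations may override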
8199 
8200   /**
8201    * Check whether we should sync the WAL based on the table's durability settings
8202    */
8203   private boolean shouldSyncWAL() {
8204     return durability.ordinal() > Durability.ASYNC_WAL.ordinal();
8205   }
8206 
8207   /**
8208    * A mocked list implementation - discards all updates.
8209    */
8210   private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
8211 
8212     @Override
8213     public void add(int index, Cell element) {
8214       // do nothing
8215     }
8216 
8217     @Override
8218     public boolean addAll(int index, Collection<? extends Cell> c) {
8219       return false; // this list is never changed as a result of an update
8220     }
8221 
8222     @Override
8223     public KeyValue get(int index) {
8224       throw new UnsupportedOperationException();
8225     }
8226 
8227     @Override
8228     public int size() {
8229       return 0;
8230     }
8231   };
8232 
8233   /**
8234    * Facility for dumping and compacting catalog tables.
8235    * Only does catalog tables since these are the only tables whose schema
8236    * we know for sure.  For usage run:
8237    * <pre>
8238    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
8239    * </pre>
8240    * @throws IOException
8241    */
8242   public static void main(String[] args) throws IOException {
8243     if (args.length < 1) {
8244       printUsageAndExit(null);
8245     }
8246     boolean majorCompact = false;
8247     if (args.length > 1) {
8248       if (!args[1].toLowerCase().startsWith("major")) {
8249         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
8250       }
8251       majorCompact = true;
8252     }
8253     final Path tableDir = new Path(args[0]);
8254     final Configuration c = HBaseConfiguration.create();
8255     final FileSystem fs = FileSystem.get(c);
8256     final Path logdir = new Path(c.get("hbase.tmp.dir"));
8257     final String logname = "wal" + FSUtils.getTableName(tableDir) + System.currentTimeMillis();
8258 
8259     final Configuration walConf = new Configuration(c);
8260     FSUtils.setRootDir(walConf, logdir);
8261     final WALFactory wals = new WALFactory(walConf, null, logname);
8262     try {
8263       processTable(fs, tableDir, wals, c, majorCompact);
8264     } finally {
8265        wals.close();
8266        // TODO: is this still right?
8267        BlockCache bc = new CacheConfig(c).getBlockCache();
8268        if (bc != null) bc.shutdown();
8269     }
8270   }
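       // Example invocations (the catalog table dir path is illustrative):
       //   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion /hbase/data/hbase/meta
       //   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion /hbase/data/hbase/meta major_compact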
8271 
8272   @Override
8273   public long getOpenSeqNum() {
8274     return this.openSeqNum;
8275   }
8276 
8277   @Override
8278   public Map<byte[], Long> getMaxStoreSeqId() {
8279     return this.maxSeqIdInStores;
8280   }
8281 
8282   @Override
8283   public long getOldestSeqIdOfStore(byte[] familyName) {
8284     return wal.getEarliestMemstoreSeqNum(getRegionInfo()
8285         .getEncodedNameAsBytes(), familyName);
8286   }
8287 
8288   @Override
8289   public CompactionState getCompactionState() {
8290     boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
8291     return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
8292         : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
8293   }
8294 
8295   public void reportCompactionRequestStart(boolean isMajor){
8296     (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
8297   }
8298 
8299   public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) {
8300     int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
8301 
8302     // metrics
8303     compactionsFinished.incrementAndGet();
8304     compactionNumFilesCompacted.addAndGet(numFiles);
8305     compactionNumBytesCompacted.addAndGet(filesSizeCompacted);
8306 
8307     assert newValue >= 0;
8308   }
8309 
8310   /**
8311    * Do not change this sequence id. See {@link #sequenceId} comment.
8312    * @return sequenceId
8313    */
8314   @VisibleForTesting
8315   public AtomicLong getSequenceId() {
8316     return this.sequenceId;
8317   }
8318 
8319   /**
8320    * Sets this region's sequenceId.
8321    * @param value new value
8322    */
8323   private void setSequenceId(long value) {
8324     this.sequenceId.set(value);
8325   }
8326   
8327   public ConcurrentHashMap<HashedBytes, RowLockContext> getLockedRows() {
8328     return lockedRows;
8329   }
8330 
8331   @VisibleForTesting class RowLockContext {
8332     private final HashedBytes row;
8333     private final CountDownLatch latch = new CountDownLatch(1);
8334     private final Thread thread;
8335     private int lockCount = 0;
8336     private String threadName;
8337     
8338     RowLockContext(HashedBytes row) {
8339       this.row = row;
8340       this.thread = Thread.currentThread();
8341     }
8342 
8343     boolean ownedByCurrentThread() {
8344       return thread == Thread.currentThread();
8345     }
8346 
8347     RowLock newLock() {
8348       lockCount++;
8349       RowLockImpl rl = new RowLockImpl();
8350       rl.setContext(this);
8351       return rl;
8352     }
8353 
8354     void releaseLock() {
8355       if (!ownedByCurrentThread()) {
8356         throw new IllegalArgumentException("Lock held by thread: " + thread
8357           + " cannot be released by different thread: " + Thread.currentThread());
8358       }
8359       lockCount--;
8360       if (lockCount == 0) {
8361         // no remaining locks by the thread, unlock and allow other threads to access
8362         RowLockContext existingContext = lockedRows.remove(row);
8363         if (existingContext != this) {
8364           throw new RuntimeException(
8365               "Internal row lock state inconsistent, should not happen, row: " + row);
8366         }
8367         latch.countDown();
8368       }
8369     }
8370     
8371     public void setThreadName(String threadName) {
8372       this.threadName = threadName;
8373     }
8374     
8375 
8376     @Override
8377     public String toString() {
8378       return "RowLockContext{" +
8379           "row=" + row +
8380           ", count=" + lockCount +
8381           ", threadName=" + threadName +
8382           '}';
8383     }
8384   }
8385 
8386   public static class RowLockImpl implements RowLock {
8387     private RowLockContext context;
8388     private boolean released = false;
8389 
8390     @VisibleForTesting
8391     public RowLockContext getContext() {
8392       return context;
8393     }
8394 
8395     @VisibleForTesting
8396     public void setContext(RowLockContext context) {
8397       this.context = context;
8398     }
8399 
8400     @Override
8401     public void release() {
8402       if (!released) {
8403         context.releaseLock();
8404       }
8405       released = true;
8406     }
8407   }
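       // Typical in-class usage of these row locks (sketch; mirrors doIncrement() above):
       //   RowLock rowLock = getRowLock(row);
       //   try {
       //     ... mutate the row under the lock ...
       //   } finally {
       //     rowLock.release();
       //   }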
8408 
8409   /**
8410    * Append a faked WALEdit in order to get a long sequence number; the WAL syncer will just ignore
8411    * the WALEdit append later.
8412    * @param wal
8413    * @param cells list of Cells inserted into memstore. Those Cells are passed in order to
8414    *        be updated with the right mvcc values (their WAL sequence number)
8415    * @return The key used for the faked append; nothing is synced and no real edit is appended.
8416    * @throws IOException
8417    */
8418   private WALKey appendEmptyEdit(final WAL wal, List<Cell> cells) throws IOException {
8419     // we use HLogKey here instead of WALKey directly to support legacy coprocessors.
8420     @SuppressWarnings("deprecation")
8421     WALKey key = new HLogKey(getRegionInfo().getEncodedNameAsBytes(), getRegionInfo().getTable(),
8422       WALKey.NO_SEQUENCE_ID, 0, null, HConstants.NO_NONCE, HConstants.NO_NONCE);
8423     // Call append but with an empty WALEdit.  The returned sequence id will not be associated
8424     // with any edit and we can be sure it went in after all outstanding appends.
8425     wal.append(getTableDesc(), getRegionInfo(), key, WALEdit.EMPTY_WALEDIT, getSequenceId(), false,
8426       cells);
8427     return key;
8428   }
8429 
8430   /**
8431    * {@inheritDoc}
8432    */
8433   @Override
8434   public void onConfigurationChange(Configuration conf) {
8435     // Do nothing for now.
8436   }
8437 
8438   /**
8439    * {@inheritDoc}
8440    */
8441   @Override
8442   public void registerChildren(ConfigurationManager manager) {
8443     configurationManager = Optional.of(manager);
8444     for (Store s : this.stores.values()) {
8445       configurationManager.get().registerObserver(s);
8446     }
8447   }
8448 
8449   /**
8450    * {@inheritDoc}
8451    */
8452   @Override
8453   public void deregisterChildren(ConfigurationManager manager) {
8454     for (Store s : this.stores.values()) {
8455       configurationManager.get().deregisterObserver(s);
8456     }
8457   }
8458 
8459   /**
8460    * @return split policy for this region.
8461    */
8462   public RegionSplitPolicy getSplitPolicy() {
8463     return this.splitPolicy;
8464   }
8465 
8466   public long getMemstoreFlushSize() {
8467     return this.memstoreFlushSize;
8468   }
8469 }