diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateRegionProcedure.java index ef11e68217a5..a6985a4edb91 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/TruncateRegionProcedure.java @@ -21,9 +21,11 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseIOException; import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.UnknownRegionException; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.master.MasterCoprocessorHost; import org.apache.hadoop.hbase.master.MasterFileSystem; +import org.apache.hadoop.hbase.master.RegionState.State; import org.apache.hadoop.hbase.master.assignment.RegionStateNode; import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -101,6 +103,9 @@ assert getRegion().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID || isFailed() setNextState(TruncateRegionState.TRUNCATE_REGION_MAKE_OFFLINE); break; case TRUNCATE_REGION_MAKE_OFFLINE: + // Recovery snapshots release the region lock between states, so re-check that the + // original region still exists before creating an unassign TRSP for it. + checkOnline(env, getRegion()); addChildProcedure(createUnAssignProcedures(env)); setNextState(TruncateRegionState.TRUNCATE_REGION_REMOVE); break; @@ -132,8 +137,7 @@ assert getRegion().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID || isFailed() } private void createRegionOnFileSystem(final MasterProcedureEnv env) throws IOException { - RegionStateNode regionNode = - env.getAssignmentManager().getRegionStates().getRegionStateNode(getRegion()); + RegionStateNode regionNode = getExistingRegionStateNode(env); regionNode.lock(); try { final MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem(); @@ -146,8 +150,7 @@ private void createRegionOnFileSystem(final MasterProcedureEnv env) throws IOExc } private void deleteRegionFromFileSystem(final MasterProcedureEnv env) throws IOException { - RegionStateNode regionNode = - env.getAssignmentManager().getRegionStates().getRegionStateNode(getRegion()); + RegionStateNode regionNode = getExistingRegionStateNode(env); regionNode.lock(); try { final MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem(); @@ -178,9 +181,12 @@ protected void rollbackState(final MasterProcedureEnv env, final TruncateRegionS case TRUNCATE_REGION_MAKE_OFFLINE: RegionStateNode regionNode = env.getAssignmentManager().getRegionStates().getRegionStateNode(getRegion()); - if (regionNode == null) { - // Region was unassigned by state TRUNCATE_REGION_MAKE_OFFLINE. - // So Assign it back + if ( + regionNode != null + && regionNode.isInState(State.CLOSED, State.OFFLINE, State.ABNORMALLY_CLOSED) + ) { + // Region was unassigned by state TRUNCATE_REGION_MAKE_OFFLINE. Assign it back only if it + // still exists. addChildProcedure(createAssignProcedures(env)); } return; @@ -190,6 +196,16 @@ protected void rollbackState(final MasterProcedureEnv env, final TruncateRegionS } } + private RegionStateNode getExistingRegionStateNode(final MasterProcedureEnv env) + throws UnknownRegionException { + RegionStateNode regionNode = + env.getAssignmentManager().getRegionStates().getRegionStateNode(getRegion()); + if (regionNode == null) { + throw new UnknownRegionException("No RegionState found for " + getRegion().getEncodedName()); + } + return regionNode; + } + @Override protected void completionCleanup(final MasterProcedureEnv env) { releaseSyncLatch(); diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedureWithRecovery.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedureWithRecovery.java index 15023d48f247..32afd69adf44 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedureWithRecovery.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedureWithRecovery.java @@ -19,8 +19,13 @@ import static org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil.insertData; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseIOException; @@ -29,6 +34,7 @@ import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.SnapshotDescription; import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.master.assignment.MergeTableRegionsProcedure; import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; @@ -199,6 +205,113 @@ public void testRecoverySnapshotAndRestore() throws Exception { UTIL.getAdmin().deleteTable(restoredTableName); } + @Test + public void testMakeOfflineFailsIfRegionWasMergedAwayDuringRecoverySnapshotGap() + throws Exception { + final TableName tableName = TableName.valueOf(name.getMethodName()); + final String[] families = new String[] { "f1", "f2" }; + final ProcedureExecutor procExec = getMasterProcedureExecutor(); + + final byte[][] splitKeys = new byte[][] { Bytes.toBytes("30"), Bytes.toBytes("60") }; + MasterProcedureTestingUtility.createTable(procExec, tableName, splitKeys, families); + + MasterProcedureEnv environment = procExec.getEnvironment(); + RegionInfo regionToTruncate = environment.getAssignmentManager().getAssignedRegions().stream() + .filter(r -> tableName.getNameAsString().equals(r.getTable().getNameAsString())) + .min((o1, o2) -> Bytes.compareTo(o1.getStartKey(), o2.getStartKey())).get(); + TestableTruncateRegionProcedure procedure = + new TestableTruncateRegionProcedure(environment, regionToTruncate); + + mergeRegionContainingTarget(procExec, tableName, regionToTruncate); + assertNull( + environment.getAssignmentManager().getRegionStates().getRegionStateNode(regionToTruncate)); + + procedure.executeStateForTest(environment, TruncateRegionState.TRUNCATE_REGION_MAKE_OFFLINE); + + assertTrue("Truncate should fail once the merged-away parent region is missing", + procedure.isFailed()); + assertNull("MAKE_OFFLINE must not recreate the removed RegionStateNode", + environment.getAssignmentManager().getRegionStates().getRegionStateNode(regionToTruncate)); + assertFalse("MAKE_OFFLINE must not schedule child procedures after the region disappears", + procedure.hasPendingChildrenForTest()); + } + + @Test + public void testRollbackDoesNotRecreateRegionMergedAwayDuringRecoverySnapshotGap() + throws Exception { + final TableName tableName = TableName.valueOf(name.getMethodName()); + final String[] families = new String[] { "f1", "f2" }; + final ProcedureExecutor procExec = getMasterProcedureExecutor(); + + final byte[][] splitKeys = new byte[][] { Bytes.toBytes("30"), Bytes.toBytes("60") }; + MasterProcedureTestingUtility.createTable(procExec, tableName, splitKeys, families); + + MasterProcedureEnv environment = procExec.getEnvironment(); + RegionInfo regionToTruncate = environment.getAssignmentManager().getAssignedRegions().stream() + .filter(r -> tableName.getNameAsString().equals(r.getTable().getNameAsString())) + .min((o1, o2) -> Bytes.compareTo(o1.getStartKey(), o2.getStartKey())).get(); + TestableTruncateRegionProcedure procedure = + new TestableTruncateRegionProcedure(environment, regionToTruncate); + + mergeRegionContainingTarget(procExec, tableName, regionToTruncate); + assertNull( + environment.getAssignmentManager().getRegionStates().getRegionStateNode(regionToTruncate)); + + procedure.rollbackStateForTest(environment, TruncateRegionState.TRUNCATE_REGION_MAKE_OFFLINE); + + assertNull("Rollback must not recreate the removed RegionStateNode", + environment.getAssignmentManager().getRegionStates().getRegionStateNode(regionToTruncate)); + assertFalse("Rollback must not schedule assign children for a removed region", + procedure.hasPendingChildrenForTest()); + } + + private void mergeRegionContainingTarget(ProcedureExecutor procExec, + TableName tableName, RegionInfo regionToTruncate) throws Exception { + List regions = UTIL.getAdmin().getRegions(tableName).stream() + .sorted(RegionInfo.COMPARATOR).collect(Collectors.toList()); + RegionInfo adjacentRegionToMerge = + regions.stream().filter(region -> !region.equals(regionToTruncate)).findFirst() + .orElseThrow(() -> new AssertionError("No adjacent region available for merge")); + long mergeProcId = + procExec.submitProcedure(new MergeTableRegionsProcedure(procExec.getEnvironment(), + new RegionInfo[] { regionToTruncate, adjacentRegionToMerge }, false)); + ProcedureTestingUtility.waitProcedure(procExec, mergeProcId); + ProcedureTestingUtility.assertProcNotFailed(procExec, mergeProcId); + } + + public static class TestableTruncateRegionProcedure extends TruncateRegionProcedure { + private boolean childScheduled; + + public TestableTruncateRegionProcedure() { + super(); + } + + public TestableTruncateRegionProcedure(MasterProcedureEnv env, RegionInfo region) + throws HBaseIOException { + super(env, region); + } + + public Flow executeStateForTest(MasterProcedureEnv env, TruncateRegionState state) + throws InterruptedException { + return super.executeFromState(env, state); + } + + public void rollbackStateForTest(MasterProcedureEnv env, TruncateRegionState state) + throws IOException { + super.rollbackState(env, state); + } + + @Override + protected > void addChildProcedure(T... subProcedure) { + childScheduled = true; + super.addChildProcedure(subProcedure); + } + + public boolean hasPendingChildrenForTest() { + return childScheduled; + } + } + public static class FailingTruncateRegionProcedure extends TruncateRegionProcedure { private boolean failOnce = false;