diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java index 9ad3c792345e..fceead3a15dc 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/NormalUserScanQueryMatcher.java @@ -20,6 +20,7 @@ import java.io.IOException; import org.apache.hadoop.hbase.ExtendedCell; import org.apache.hadoop.hbase.KeepDeletedCells; +import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.PrivateCellUtil; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.regionserver.ScanInfo; @@ -31,6 +32,14 @@ @InterfaceAudience.Private public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { + /** + * Number of consecutive range delete markers (DeleteColumn/DeleteFamily) to skip before switching + * to seek. Seeking is more expensive than skipping for a single marker, but much faster when + * markers accumulate. This threshold avoids the seek overhead for the common case (one delete per + * row/column) while still kicking in when markers pile up. + */ + private static final int SEEK_ON_DELETE_MARKER_THRESHOLD = 3; + /** Keeps track of deletes */ private final DeleteTracker deletes; @@ -40,12 +49,20 @@ public abstract class NormalUserScanQueryMatcher extends UserScanQueryMatcher { /** whether time range queries can see rows "behind" a delete */ protected final boolean seePastDeleteMarkers; + /** Whether seek optimization for range delete markers is applicable */ + private final boolean canSeekOnDeleteMarker; + + /** Count of consecutive range delete markers seen */ + private int rangeDeleteCount; + protected NormalUserScanQueryMatcher(Scan scan, ScanInfo scanInfo, ColumnTracker columns, boolean hasNullColumn, DeleteTracker deletes, long oldestUnexpiredTS, long now) { super(scan, scanInfo, columns, hasNullColumn, oldestUnexpiredTS, now); this.deletes = deletes; this.get = scan.isGetScan(); this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() != KeepDeletedCells.FALSE; + this.canSeekOnDeleteMarker = + !seePastDeleteMarkers && deletes.getClass() == ScanDeleteTracker.class; } @Override @@ -70,9 +87,27 @@ public MatchCode match(ExtendedCell cell) throws IOException { seePastDeleteMarkers ? tr.withinTimeRange(timestamp) : tr.withinOrAfterTimeRange(timestamp); if (includeDeleteMarker) { this.deletes.add(cell); + // A DeleteColumn or DeleteFamily masks all remaining cells for this column/family. + // Seek past them instead of skipping one cell at a time, but only after seeing + // enough consecutive markers to justify the seek overhead. + // Only safe with plain ScanDeleteTracker. Not safe with newVersionBehavior (sequence + // IDs determine visibility), visibility labels (delete/put label mismatch), or + // seePastDeleteMarkers (KEEP_DELETED_CELLS). + if ( + canSeekOnDeleteMarker && (typeByte == KeyValue.Type.DeleteFamily.getCode() + || (typeByte == KeyValue.Type.DeleteColumn.getCode() && cell.getQualifierLength() > 0)) + ) { + if (++rangeDeleteCount >= SEEK_ON_DELETE_MARKER_THRESHOLD) { + rangeDeleteCount = 0; + return columns.getNextRowOrNextColumn(cell); + } + } else { + rangeDeleteCount = 0; + } } return MatchCode.SKIP; } + rangeDeleteCount = 0; returnCode = checkDeleted(deletes, cell); if (returnCode != null) { return returnCode; @@ -83,6 +118,7 @@ public MatchCode match(ExtendedCell cell) throws IOException { @Override protected void reset() { deletes.reset(); + rangeDeleteCount = 0; } @Override diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java index 4f6168b1e926..4478ccf4a747 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/querymatcher/ScanQueryMatcher.java @@ -292,12 +292,11 @@ public void setToNewRow(ExtendedCell currentRow) { public abstract boolean moreRowsMayExistAfter(ExtendedCell cell); public ExtendedCell getKeyForNextColumn(ExtendedCell cell) { - // We aren't sure whether any DeleteFamily cells exist, so we can't skip to next column. - // TODO: Current way disable us to seek to next column quickly. Is there any better solution? - // see HBASE-18471 for more details - // see TestFromClientSide3#testScanAfterDeletingSpecifiedRow - // see TestFromClientSide3#testScanAfterDeletingSpecifiedRowV2 - if (cell.getQualifierLength() == 0) { + // For cells with empty qualifier, we generally can't skip to the next column because + // DeleteFamily cells might exist that we haven't seen yet (see HBASE-18471). + // However, if the cell itself IS a DeleteFamily marker, we know we've already processed it, + // so we can safely seek to the next real column. + if (cell.getQualifierLength() == 0 && !PrivateCellUtil.isDeleteFamily(cell)) { ExtendedCell nextKey = PrivateCellUtil.createNextOnRowCol(cell); if (nextKey != cell) { return nextKey; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java index efbb4c77d475..bdeb7c52a738 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/querymatcher/TestUserScanQueryMatcher.java @@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.KeepDeletedCells; import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.KeyValue.Type; import org.apache.hadoop.hbase.PrivateCellUtil; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.filter.FilterBase; @@ -396,4 +397,75 @@ scanWithFilter, new ScanInfo(this.conf, fam2, 0, 5, ttl, KeepDeletedCells.FALSE, Cell nextCell = qm.getKeyForNextColumn(lastCell); assertArrayEquals(nextCell.getQualifierArray(), col4); } + + /** + * After enough consecutive range delete markers, the matcher should switch from SKIP to + * SEEK_NEXT_COL. Point deletes and KEEP_DELETED_CELLS always SKIP. + */ + @Test + public void testSeekOnRangeDelete() throws IOException { + // DeleteColumn: first two SKIP, third triggers SEEK_NEXT_COL + assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteColumn, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); + + // DeleteFamily: same threshold behavior + assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.DeleteFamily, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SEEK_NEXT_COL); + + // Delete (version): always SKIP (point delete, not range) + assertDeleteMatchCodes(KeepDeletedCells.FALSE, Type.Delete, MatchCode.SKIP, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP); + + // KEEP_DELETED_CELLS=TRUE: always SKIP + assertDeleteMatchCodes(KeepDeletedCells.TRUE, Type.DeleteColumn, MatchCode.SKIP, MatchCode.SKIP, + MatchCode.SKIP, MatchCode.SKIP, MatchCode.SKIP); + } + + /** + * DeleteColumn with empty qualifier must not cause seeking past a subsequent DeleteFamily. + * DeleteFamily masks all columns, so it must be tracked by the delete tracker. + */ + @Test + public void testDeleteColumnEmptyQualifierDoesNotSkipDeleteFamily() throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + byte[] e = HConstants.EMPTY_BYTE_ARRAY; + UserScanQueryMatcher qm = UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, + ttl, KeepDeletedCells.FALSE, HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, + now - ttl, now, null); + + // Feed enough DCs with empty qualifier to hit the threshold, then a DF. + // The DF must NOT be seeked past -- it must be SKIP'd so the tracker picks it up. + KeyValue dc1 = new KeyValue(row1, fam1, e, now, Type.DeleteColumn); + KeyValue dc2 = new KeyValue(row1, fam1, e, now - 1, Type.DeleteColumn); + KeyValue dc3 = new KeyValue(row1, fam1, e, now - 2, Type.DeleteColumn); + KeyValue df = new KeyValue(row1, fam1, e, now - 3, Type.DeleteFamily); + KeyValue put = new KeyValue(row1, fam1, col1, now - 3, Type.Put, data); + + qm.setToNewRow(dc1); + assertEquals(MatchCode.SKIP, qm.match(dc1)); + assertEquals(MatchCode.SKIP, qm.match(dc2)); + // Third DC hits threshold -- but since it's empty qualifier, it should NOT seek + // because a DeleteFamily might follow. + assertEquals(MatchCode.SKIP, qm.match(dc3)); + // DF must be processed (SKIP), not seeked past + assertEquals(MatchCode.SKIP, qm.match(df)); + // Put in col1 at t=now-3 should be masked by DF@t=now-3 + MatchCode putCode = qm.match(put); + assertEquals(MatchCode.SEEK_NEXT_COL, putCode); + } + + private void assertDeleteMatchCodes(KeepDeletedCells keepDeletedCells, Type type, + MatchCode... expected) throws IOException { + long now = EnvironmentEdgeManager.currentTime(); + UserScanQueryMatcher qm = + UserScanQueryMatcher.create(scan, new ScanInfo(this.conf, fam1, 0, 1, ttl, keepDeletedCells, + HConstants.DEFAULT_BLOCKSIZE, 0, rowComparator, false), null, now - ttl, now, null); + boolean familyLevel = type == Type.DeleteFamily || type == Type.DeleteFamilyVersion; + byte[] qual = familyLevel ? HConstants.EMPTY_BYTE_ARRAY : col1; + qm.setToNewRow(new KeyValue(row1, fam1, qual, now, type)); + for (int i = 0; i < expected.length; i++) { + KeyValue kv = new KeyValue(row1, fam1, qual, now - i, type); + assertEquals("Mismatch at index " + i, expected[i], qm.match(kv)); + } + } }