diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java index c234108613..88d60bb28a 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java @@ -18,6 +18,7 @@ */ package org.apache.parquet.column.values.deltastrings; +import java.util.Arrays; import org.apache.parquet.bytes.ByteBufferAllocator; import org.apache.parquet.bytes.BytesInput; import org.apache.parquet.column.Encoding; @@ -88,14 +89,19 @@ public String memUsageString(String prefix) { @Override public void writeBytes(Binary v) { - int i = 0; - byte[] vb = v.getBytes(); - int length = previous.length < vb.length ? previous.length : vb.length; - // find the number of matching prefix bytes between this value and the previous one - for (i = 0; (i < length) && (previous[i] == vb[i]); i++) - ; + byte[] vb = v.getBytesUnsafe(); + int length = Math.min(previous.length, vb.length); + // Find the number of matching prefix bytes between this value and the previous one. + // Arrays.mismatch is intrinsified by the JVM to use SIMD instructions. + int i = Arrays.mismatch(previous, 0, length, vb, 0, length); + if (i < 0) { + i = length; // all bytes in the common range matched + } prefixLengthWriter.writeInteger(i); suffixWriter.writeBytes(v.slice(i, vb.length - i)); - previous = vb; + // Retain an owned copy for prefix comparison with the next value. + // getBytesUnsafe() may return the backing array directly, so we must copy + // if the Binary's backing bytes may be reused by the caller. + previous = v.isBackingBytesReused() ? v.getBytes() : vb; } } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java b/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java index 5ce6adbdf4..b73e5562dc 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.values.deltastrings; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.apache.parquet.bytes.ByteBufferInputStream; import org.apache.parquet.bytes.DirectByteBufferAllocator; import org.apache.parquet.column.values.Utils; @@ -128,4 +129,24 @@ public void testWriterReset() throws Exception { assertReadWrite(writer, new DeltaByteArrayReader(), values); } + + @Test + public void testReusedBackingArrayRegression() throws Exception { + DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator()); + DeltaByteArrayReader reader = new DeltaByteArrayReader(); + + byte[] buffer = "parquet-000".getBytes(StandardCharsets.UTF_8); + writer.writeBytes(Binary.fromReusedByteArray(buffer)); + + System.arraycopy("parquet-111".getBytes(StandardCharsets.UTF_8), 0, buffer, 0, buffer.length); + writer.writeBytes(Binary.fromReusedByteArray(buffer)); + + System.arraycopy("parquet-222".getBytes(StandardCharsets.UTF_8), 0, buffer, 0, buffer.length); + writer.writeBytes(Binary.fromReusedByteArray(buffer)); + + Binary[] decoded = Utils.readData(reader, writer.getBytes().toInputStream(), 3); + Assert.assertEquals(Binary.fromString("parquet-000"), decoded[0]); + Assert.assertEquals(Binary.fromString("parquet-111"), decoded[1]); + Assert.assertEquals(Binary.fromString("parquet-222"), decoded[2]); + } }