From 57d9661785968d7d0a9cce2a7024b5314f66d203 Mon Sep 17 00:00:00 2001 From: "kewei.11" Date: Sat, 29 Mar 2025 17:08:44 +0800 Subject: [PATCH] Cache preset dict in LZ4WithPresetDictDecompressor --- lucene/CHANGES.txt | 2 ++ .../codecs/compressing/Decompressor.java | 2 ++ .../LZ4WithPresetDictCompressionMode.java | 32 ++++++++++++------- ...Lucene90CompressingStoredFieldsReader.java | 4 +++ .../Lucene90CompressingTermVectorsReader.java | 1 + 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0cdad826b818..51eaece46086 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -253,6 +253,8 @@ Optimizations * GITHUB#14304: Add SIMD optimizations for scalar quantized queries and indexing. (Simon Cooper) +* GITHUB#14397: Cache preset dict for LZ4WithPresetDictDecompressor. (kkewwei) + Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java index fff2108a42be..f60661ebd90c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java @@ -44,4 +44,6 @@ public abstract void decompress( @Override public abstract Decompressor clone(); + + public void reset() {} } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java index 05e8e2d69f51..fe85bb52e0f7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java @@ -64,6 +64,7 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor { private int[] compressedLengths; private byte[] buffer; + private boolean reused = false; LZ4WithPresetDictDecompressor() { compressedLengths = new int[0]; @@ -72,16 +73,15 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor { private int readCompressedLengths( DataInput in, int originalLength, int dictLength, int blockLength) throws IOException { - in.readVInt(); // compressed length of the dictionary, unused - int totalLength = dictLength; + compressedLengths = ArrayUtil.growNoCopy(compressedLengths, originalLength / blockLength + 2); int i = 0; - compressedLengths = ArrayUtil.growNoCopy(compressedLengths, originalLength / blockLength + 1); + compressedLengths[i++] = in.readVInt(); // compressed length of the dictionary + int totalLength = dictLength; while (totalLength < originalLength) { - compressedLengths[i++] = in.readVInt(); totalLength += blockLength; } - return i; + return i - 1; } @Override @@ -98,12 +98,17 @@ public void decompress(DataInput in, int originalLength, int offset, int length, final int blockLength = in.readVInt(); final int numBlocks = readCompressedLengths(in, originalLength, dictLength, blockLength); - - buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength); bytes.length = 0; - // Read the dictionary - if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) { - throw new CorruptIndexException("Illegal dict length", in); + if (reused) { + assert buffer.length >= dictLength + blockLength; + in.skipBytes(compressedLengths[0]); + } else { + // Read the dictionary + buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength); + if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) { + throw new CorruptIndexException("Illegal dict length", in); + } + reused = true; } int offsetInBlock = dictLength; @@ -114,7 +119,7 @@ public void decompress(DataInput in, int originalLength, int offset, int length, // Skip unneeded blocks int numBytesToSkip = 0; for (int i = 0; i < numBlocks && offsetInBlock + blockLength < offset; ++i) { - int compressedBlockLength = compressedLengths[i]; + int compressedBlockLength = compressedLengths[i + 1]; numBytesToSkip += compressedBlockLength; offsetInBlock += blockLength; offsetInBytesRef -= blockLength; @@ -148,6 +153,11 @@ public void decompress(DataInput in, int originalLength, int offset, int length, public Decompressor clone() { return new LZ4WithPresetDictDecompressor(); } + + @Override + public void reset() { + reused = false; + } } private static class LZ4WithPresetDictCompressor extends Compressor { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java index 315adff1473d..de2be67bed5f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java @@ -512,6 +512,7 @@ private void doReset(int docID) throws IOException { bytes.offset = bytes.length = 0; for (int decompressed = 0; decompressed < totalLength; ) { final int toDecompress = Math.min(totalLength - decompressed, chunkSize); + decompressor.reset(); decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare); bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length); System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length); @@ -560,6 +561,7 @@ SerializedDocument document(int docID) throws IOException { documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset + offset, length); } else if (sliced) { fieldsStream.seek(startPointer); + decompressor.reset(); decompressor.decompress( fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes); documentInput = @@ -573,6 +575,7 @@ void fillBuffer() throws IOException { throw new EOFException(); } final int toDecompress = Math.min(length - decompressed, chunkSize); + decompressor.reset(); decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes); decompressed += toDecompress; } @@ -644,6 +647,7 @@ SerializedDocument serializedDocument(int docID) throws IOException { if (state.contains(docID) == false) { fieldsStream.seek(indexReader.getStartPointer(docID)); state.reset(docID); + decompressor.reset(); } assert state.contains(docID); return state.document(docID); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java index 06af857c58c5..3e382e18afa0 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java @@ -368,6 +368,7 @@ public Fields get(int doc) throws IOException { startPointer = blockState.startPointer; // avoid searching the start pointer } else { startPointer = indexReader.getStartPointer(doc); + decompressor.reset(); } vectorsStream.seek(startPointer);