diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8f398c644657..725434112263 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -439,6 +439,8 @@ Optimizations * GITHUB#14304: Add SIMD optimizations for scalar quantized queries and indexing. (Simon Cooper) +* GITHUB#14397: Cache preset dict for LZ4WithPresetDictDecompressor. (kkewwei) + Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java index fff2108a42be..f60661ebd90c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java @@ -44,4 +44,6 @@ public abstract void decompress( @Override public abstract Decompressor clone(); + + public void reset() {} } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java index 05e8e2d69f51..fe85bb52e0f7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java @@ -64,6 +64,7 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor { private int[] compressedLengths; private byte[] buffer; + private boolean reused = false; LZ4WithPresetDictDecompressor() { compressedLengths = new int[0]; @@ -72,16 +73,15 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor { private int readCompressedLengths( DataInput in, int originalLength, int dictLength, int blockLength) throws IOException { - in.readVInt(); // compressed length of the dictionary, unused - int totalLength = dictLength; + compressedLengths = ArrayUtil.growNoCopy(compressedLengths, originalLength / blockLength + 2); int i = 0; - compressedLengths = ArrayUtil.growNoCopy(compressedLengths, originalLength / blockLength + 1); + compressedLengths[i++] = in.readVInt(); // compressed length of the dictionary + int totalLength = dictLength; while (totalLength < originalLength) { - compressedLengths[i++] = in.readVInt(); totalLength += blockLength; } - return i; + return i - 1; } @Override @@ -98,12 +98,17 @@ public void decompress(DataInput in, int originalLength, int offset, int length, final int blockLength = in.readVInt(); final int numBlocks = readCompressedLengths(in, originalLength, dictLength, blockLength); - - buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength); bytes.length = 0; - // Read the dictionary - if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) { - throw new CorruptIndexException("Illegal dict length", in); + if (reused) { + assert buffer.length >= dictLength + blockLength; + in.skipBytes(compressedLengths[0]); + } else { + // Read the dictionary + buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength); + if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) { + throw new CorruptIndexException("Illegal dict length", in); + } + reused = true; } int offsetInBlock = dictLength; @@ -114,7 +119,7 @@ public void decompress(DataInput in, int originalLength, int offset, int length, // Skip unneeded blocks int numBytesToSkip = 0; for (int i = 0; i < numBlocks && offsetInBlock + blockLength < offset; ++i) { - int compressedBlockLength = compressedLengths[i]; + int compressedBlockLength = compressedLengths[i + 1]; numBytesToSkip += compressedBlockLength; offsetInBlock += blockLength; offsetInBytesRef -= blockLength; @@ -148,6 +153,11 @@ public void decompress(DataInput in, int originalLength, int offset, int length, public Decompressor clone() { return new LZ4WithPresetDictDecompressor(); } + + @Override + public void reset() { + reused = false; + } } private static class LZ4WithPresetDictCompressor extends Compressor { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java index a5d9e276044d..14caff543b9c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java @@ -511,6 +511,7 @@ private void doReset(int docID) throws IOException { bytes.offset = bytes.length = 0; for (int decompressed = 0; decompressed < totalLength; ) { final int toDecompress = Math.min(totalLength - decompressed, chunkSize); + decompressor.reset(); decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare); bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length); System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length); @@ -559,6 +560,7 @@ SerializedDocument document(int docID) throws IOException { documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset + offset, length); } else if (sliced) { fieldsStream.seek(startPointer); + decompressor.reset(); decompressor.decompress( fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes); documentInput = @@ -572,6 +574,7 @@ void fillBuffer() throws IOException { throw new EOFException(); } final int toDecompress = Math.min(length - decompressed, chunkSize); + decompressor.reset(); decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes); decompressed += toDecompress; } @@ -643,6 +646,7 @@ SerializedDocument serializedDocument(int docID) throws IOException { if (state.contains(docID) == false) { fieldsStream.seek(indexReader.getStartPointer(docID)); state.reset(docID); + decompressor.reset(); } assert state.contains(docID); return state.document(docID); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java index ec96e787edee..8cd937d88bd5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java @@ -367,6 +367,7 @@ public Fields get(int doc) throws IOException { startPointer = blockState.startPointer; // avoid searching the start pointer } else { startPointer = indexReader.getStartPointer(doc); + decompressor.reset(); } vectorsStream.seek(startPointer);