cache preset dict for LZ4WithPresetDictDecompressor #14397

Open · wants to merge 2 commits into main
lucene/CHANGES.txt (2 additions, 0 deletions)

@@ -439,6 +439,8 @@ Optimizations
 
 * GITHUB#14304: Add SIMD optimizations for scalar quantized queries and indexing. (Simon Cooper)
 
+* GITHUB#14397: Cache preset dict for LZ4WithPresetDictDecompressor. (kkewwei)
+
 Bug Fixes
 ---------------------
lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java

@@ -44,4 +44,6 @@ public abstract void decompress(
 
   @Override
   public abstract Decompressor clone();
+
+  public void reset() {}
 }
lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java

@@ -64,6 +64,7 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor {
 
     private int[] compressedLengths;
     private byte[] buffer;
+    private boolean reused = false;
 
     LZ4WithPresetDictDecompressor() {
       compressedLengths = new int[0];
@@ -72,16 +73,15 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor {
 
     private int readCompressedLengths(
         DataInput in, int originalLength, int dictLength, int blockLength) throws IOException {
-      in.readVInt(); // compressed length of the dictionary, unused
-      int totalLength = dictLength;
+      compressedLengths = ArrayUtil.growNoCopy(compressedLengths, originalLength / blockLength + 2);
       int i = 0;
-      compressedLengths = ArrayUtil.growNoCopy(compressedLengths, originalLength / blockLength + 1);
+      compressedLengths[i++] = in.readVInt(); // compressed length of the dictionary
+      int totalLength = dictLength;
       while (totalLength < originalLength) {
         compressedLengths[i++] = in.readVInt();
         totalLength += blockLength;
       }
-      return i;
+      return i - 1;
     }
 
     @Override
@@ -98,12 +98,17 @@ public void decompress(DataInput in, int originalLength, int offset, int length,
       final int blockLength = in.readVInt();
 
       final int numBlocks = readCompressedLengths(in, originalLength, dictLength, blockLength);
 
-      buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
       bytes.length = 0;
-      // Read the dictionary
-      if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) {
-        throw new CorruptIndexException("Illegal dict length", in);
+      if (reused) {
+        assert buffer.length >= dictLength + blockLength;
+        in.skipBytes(compressedLengths[0]);
+      } else {
+        // Read the dictionary
+        buffer = ArrayUtil.growNoCopy(buffer, dictLength + blockLength);
+        if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) {
+          throw new CorruptIndexException("Illegal dict length", in);
+        }
+        reused = true;
jainankitk (Contributor) commented on lines +102 to +111 · Mar 31, 2025:

I am wondering if we should consider exposing a metric (a simple counter, maybe) for how many times we could reuse the cached dictionary and how many times we had to read it from disk. That would provide some useful insight into the usefulness of this change.
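
A minimal sketch of what such a counter might look like (plain Java; the class and method names are hypothetical, not part of this PR):

import java.util.concurrent.atomic.LongAdder;

// Hypothetical instrumentation, not part of this PR: counts how often the
// cached preset dict is reused versus decompressed again from the stream.
final class PresetDictCacheStats {
  private final LongAdder reused = new LongAdder();
  private final LongAdder loaded = new LongAdder();

  void onReuse() {
    reused.increment(); // the dict was served from the cached buffer
  }

  void onLoad() {
    loaded.increment(); // the dict had to be decompressed from disk
  }

  long reuseCount() {
    return reused.sum();
  }

  long loadCount() {
    return loaded.sum();
  }
}

The decompressor would bump onReuse() in the if (reused) branch above and onLoad() in the else branch.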

       }
 
       int offsetInBlock = dictLength;
@@ -114,7 +119,7 @@ public void decompress(DataInput in, int originalLength, int offset, int length,
       // Skip unneeded blocks
       int numBytesToSkip = 0;
       for (int i = 0; i < numBlocks && offsetInBlock + blockLength < offset; ++i) {
-        int compressedBlockLength = compressedLengths[i];
+        int compressedBlockLength = compressedLengths[i + 1];
         numBytesToSkip += compressedBlockLength;
         offsetInBlock += blockLength;
         offsetInBytesRef -= blockLength;
@@ -148,6 +153,11 @@ public void decompress(DataInput in, int originalLength, int offset, int length,
     public Decompressor clone() {
       return new LZ4WithPresetDictDecompressor();
     }
+
+    @Override
+    public void reset() {
+      reused = false;
+    }
   }
 
   private static class LZ4WithPresetDictCompressor extends Compressor {
lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java

@@ -511,6 +511,7 @@ private void doReset(int docID) throws IOException {
       bytes.offset = bytes.length = 0;
       for (int decompressed = 0; decompressed < totalLength; ) {
         final int toDecompress = Math.min(totalLength - decompressed, chunkSize);
+        decompressor.reset();
         decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare);
A Contributor commented on lines +514 to 515:

I am wondering if reset should be the default behavior. We could instead pass another flag to indicate whether reuse is possible.

kkewwei (Contributor, Author) replied:

It seems that reset is essential. When the block changes, we must discard the cache promptly, and this change can only be detected from outside the decompressor.

The Contributor replied:

> When the block changes, we must discard the cache promptly, and this change can only be detected from outside the decompressor.

I am not questioning that. My point is not to have a reset method in the Decompressor interface, but to add another decompress method that takes reuseIfPossible as one of its parameters. That ensures functional correctness even if we never make the reset call somewhere in the code, and it still allows explicit optimization wherever we deem appropriate. The risk of not explicitly making the reset call is much greater than the cost of using the original decompress without reuse:

public abstract class Decompressor implements Cloneable {

  protected Decompressor() {}

  public void decompress(
      DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException {
    decompress(in, originalLength, offset, length, bytes, false);
  }

  public abstract void decompress(
      DataInput in, int originalLength, int offset, int length, BytesRef bytes, boolean reuseIfPossible) throws IOException;

  @Override
  public abstract Decompressor clone();
}

kkewwei (Contributor, Author) replied · Apr 17, 2025:
I tried relying only on the outer reuseIfPossible flag to decide whether to cache the preset dict, but it failed: in the following case, the caller must still invoke reset to clear the cache.

We have two chunks:

  1. chunk0 [doc0(length>0)]
  2. chunk1 [doc0(length=0), doc1(length=1)]

The steps are as follows:

  1. Read chunk0/doc0 with reuseIfPossible=false.
  2. Read chunk1/doc0 with reuseIfPossible=false. As the length is 0, Lucene will not read the preset dict, so the preset dict is not cached.
  3. Read chunk1/doc1. doc1 is in the current chunk1, so reuseIfPossible=true, but the preset dict has not been cached yet, and Lucene will throw an exception.

In this case, we would still have to call reset at step 1; the toy model below walks through the same sequence.
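
A self-contained toy model of that failure (plain Java; PresetDictCacheToy and its members are made-up names, not Lucene classes), assuming the proposed decompress(..., reuseIfPossible) semantics with no reset call:

// Toy model, not Lucene code: the cache is implicitly "whatever dict was
// cached last", so a zero-length read leaves a stale entry behind.
final class PresetDictCacheToy {
  private Integer cachedDictChunk = null; // which chunk's dict is cached, if any

  void decompress(int chunk, int length, boolean reuseIfPossible) {
    if (length == 0) {
      return; // nothing to decompress, so this chunk's dict is never read or cached
    }
    if (reuseIfPossible && cachedDictChunk != null) {
      if (cachedDictChunk != chunk) {
        throw new IllegalStateException("stale preset dict from chunk " + cachedDictChunk);
      }
      // decode against the cached dict
    } else {
      cachedDictChunk = chunk; // read this chunk's dict and cache it
    }
  }

  public static void main(String[] args) {
    PresetDictCacheToy toy = new PresetDictCacheToy();
    toy.decompress(0, 5, false); // step 1: chunk0/doc0, chunk0's dict gets cached
    toy.decompress(1, 0, false); // step 2: chunk1/doc0, length 0, chunk1's dict is never cached
    toy.decompress(1, 1, true);  // step 3: chunk1/doc1 asks for reuse and hits the stale chunk0 dict
  }
}

Without an explicit reset after step 1, step 3 can only fail or decode against the wrong dictionary.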

       bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length);
       System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length);

@@ -559,6 +560,7 @@ SerializedDocument document(int docID) throws IOException {
       documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset + offset, length);
     } else if (sliced) {
       fieldsStream.seek(startPointer);
+      decompressor.reset();
       decompressor.decompress(
           fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
       documentInput =
@@ -572,6 +574,7 @@ void fillBuffer() throws IOException {
         throw new EOFException();
       }
       final int toDecompress = Math.min(length - decompressed, chunkSize);
+      decompressor.reset();
       decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
       decompressed += toDecompress;
     }
@@ -643,6 +646,7 @@ SerializedDocument serializedDocument(int docID) throws IOException {
     if (state.contains(docID) == false) {
       fieldsStream.seek(indexReader.getStartPointer(docID));
       state.reset(docID);
+      decompressor.reset();
     }
     assert state.contains(docID);
     return state.document(docID);
lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingTermVectorsReader.java

@@ -367,6 +367,7 @@ public Fields get(int doc) throws IOException {
       startPointer = blockState.startPointer; // avoid searching the start pointer
     } else {
       startPointer = indexReader.getStartPointer(doc);
+      decompressor.reset();
     }
     vectorsStream.seek(startPointer);