
Commit e4c9978

HUSTERGS and gesong.samuel authored
Optimize CombinedFieldQuery by nextDocsAndScores (#14834)
Co-authored-by: gesong.samuel <[email protected]>
1 parent 647c05b commit e4c9978

File tree: 4 files changed, +151 −0 lines changed


lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
@@ -183,6 +183,8 @@ Optimizations
 
 * GITHUB#14851: Fewer virtual calls to enhance performance of point queries. (Ke Wei)
 
+* GITHUB#14834: Optimize CombinedFieldQuery by nextDocsAndScores. (Ge Song)
+
 Bug Fixes
 ---------------------
 * GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when

lucene/core/src/java/org/apache/lucene/search/CombinedFieldQuery.java

Lines changed: 19 additions & 0 deletions
@@ -37,6 +37,7 @@
 import org.apache.lucene.search.similarities.DFRSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOSupplier;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -445,5 +446,23 @@ public DocIdSetIterator iterator() {
     public float getMaxScore(int upTo) throws IOException {
       return maxScore;
     }
+
+    @Override
+    public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndFloatFeatureBuffer buffer)
+        throws IOException {
+      int batchSize = 64; // arbitrary
+      buffer.growNoCopy(batchSize);
+      int size = 0;
+      DocIdSetIterator iterator = iterator();
+      for (int doc = docID(); doc < upTo && size < batchSize; doc = iterator.nextDoc()) {
+        if (liveDocs == null || liveDocs.get(doc)) {
+          buffer.docs[size] = doc;
+          buffer.features[size] = freq();
+          ++size;
+        }
+      }
+      buffer.size = size;
+      simScorer.scoreRange(buffer);
+    }
   }
 }
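
The override gathers up to 64 live documents starting from the scorer's current position, records each doc ID and its combined term frequency in the shared DocAndFloatFeatureBuffer, and then scores the whole batch at once via MultiNormsLeafSimScorer.scoreRange. For orientation only, here is a minimal sketch of how a caller might drain such a scorer window by window. It uses only the API surface visible in this diff, but the helper name sumScores, the 1024-doc window, and the score summation are illustrative assumptions, not part of the commit.

// Minimal sketch (not part of this commit): drain a Scorer through
// nextDocsAndScores() in fixed-size windows and sum the returned scores.
import java.io.IOException;
import org.apache.lucene.search.DocAndFloatFeatureBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.Bits;

final class BulkScoreExample {
  // Sum the scores of all live matches of a scorer using the batched API.
  static double sumScores(Scorer scorer, Bits liveDocs) throws IOException {
    DocAndFloatFeatureBuffer buffer = new DocAndFloatFeatureBuffer();
    DocIdSetIterator it = scorer.iterator();
    double total = 0;
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.docID()) {
      // Request matches in [doc, doc + 1024); the window size is arbitrary here.
      int upTo = (int) Math.min((long) doc + 1024, DocIdSetIterator.NO_MORE_DOCS);
      scorer.nextDocsAndScores(upTo, liveDocs, buffer);
      for (int i = 0; i < buffer.size; ++i) {
        total += buffer.features[i]; // features[i] holds the score of docs[i]
      }
    }
    return total;
  }
}

Because the new implementation caps each batch at 64 documents, a single call may stop short of upTo; the loop above therefore keys off the iterator's position rather than assuming the requested window was exhausted.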

lucene/core/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java

Lines changed: 21 additions & 0 deletions
@@ -27,6 +27,8 @@
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.search.CombinedFieldQuery.FieldAndWeight;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.LongsRef;
 import org.apache.lucene.util.SmallFloat;
 
 /**
@@ -48,6 +50,7 @@ final class MultiNormsLeafSimScorer {
 
   private final SimScorer scorer;
   private final NumericDocValues norms;
+  private long[] normValues = LongsRef.EMPTY_LONGS;
 
   /** Sole constructor: Score documents of {@code reader} with {@code scorer}. */
   MultiNormsLeafSimScorer(
@@ -113,6 +116,24 @@ public float score(int doc, float freq) throws IOException {
     return scorer.score(freq, getNormValue(doc));
   }
 
+  /**
+   * score the provided documents contained in buffer. This method assumes the float feature store
+   * is {@code freq}
+   *
+   * @see SimScorer#score(float, long)
+   */
+  public void scoreRange(DocAndFloatFeatureBuffer buffer) throws IOException {
+    if (normValues.length < buffer.size) {
+      normValues = ArrayUtil.growNoCopy(normValues, buffer.size);
+    }
+    for (int i = 0; i < buffer.size; i++) {
+      normValues[i] = getNormValue(buffer.docs[i]);
+    }
+    for (int i = 0; i < buffer.size; i++) {
+      buffer.features[i] = scorer.score(buffer.features[i], normValues[i]);
+    }
+  }
+
   /**
    * Explain the score for the provided document assuming the given term document frequency. This
    * method must be called on non-decreasing sequences of doc ids.
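
scoreRange splits bulk scoring into two passes: one loop that advances the norms doc values in doc ID order and stashes each norm into a reusable long[], and a second loop of pure arithmetic over primitive arrays. The sketch below (not from the patch) shows the same pattern in isolation; the class name TwoPassScoringSketch, the plain arrays standing in for DocAndFloatFeatureBuffer, and the default norm of 1L for documents without a stored norm are assumptions made for the example.

// Illustrative sketch of the two-pass idea used by scoreRange(); not the committed code.
import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

final class TwoPassScoringSketch {
  // docs/freqs play the role of DocAndFloatFeatureBuffer.docs/features; freqs is
  // overwritten with scores, mirroring what scoreRange() does to buffer.features.
  static void scoreAll(SimScorer scorer, NumericDocValues norms, int[] docs, float[] freqs, int size)
      throws IOException {
    long[] normValues = new long[size];
    // Pass 1: walk the norms doc values once, in doc ID order, and remember each norm.
    for (int i = 0; i < size; i++) {
      normValues[i] = norms.advanceExact(docs[i]) ? norms.longValue() : 1L;
    }
    // Pass 2: pure arithmetic over primitive arrays, with no doc-values calls in the loop.
    for (int i = 0; i < size; i++) {
      freqs[i] = scorer.score(freqs[i], normValues[i]);
    }
  }
}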

lucene/core/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java

Lines changed: 109 additions & 0 deletions
@@ -29,6 +29,7 @@
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.BM25Similarity;
@@ -42,6 +43,8 @@
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.search.CheckHits;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.Bits;
 
 public class TestCombinedFieldQuery extends LuceneTestCase {
   public void testInvalid() {
@@ -560,4 +563,110 @@ public CollectionStatistics collectionStatistics(String field) throws IOException
     w.close();
     dir.close();
   }
+
+  public void testNextDocsAndScores() throws IOException {
+    int numMatchDoc = randomIntBetween(100, 500);
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    // adding potentially matching doc
+    for (int i = 0; i < numMatchDoc; i++) {
+      Document doc = new Document();
+
+      int freqA = random().nextInt(20) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+
+      freqA = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqA; j++) {
+          doc.add(new TextField("a", "foo" + j, Store.NO));
+        }
+      }
+
+      freqA = random().nextInt(20) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "zoo", Store.NO));
+      }
+
+      int freqB = random().nextInt(20) + 1;
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "zoo", Store.NO));
+      }
+
+      freqB = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqB; j++) {
+          doc.add(new TextField("b", "zoo" + j, Store.NO));
+        }
+      }
+
+      int freqC = random().nextInt(20) + 1;
+      for (int j = 0; j < freqC; j++) {
+        doc.add(new TextField("c", "bla" + j, Store.NO));
+      }
+      w.addDocument(doc);
+    }
+
+    w.forceMerge(1);
+
+    IndexReader reader = getOnlyLeafReader(w.getReader());
+    IndexSearcher searcher = newSearcher(reader, false);
+    searcher.setSimilarity(similarity);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder("foo")
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .build();
+
+    Weight weight = searcher.createWeight(query, ScoreMode.TOP_SCORES, 1f);
+    LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
+    Bits liveDocs = context.reader().getLiveDocs();
+
+    Scorer scorer1 = weight.scorer(context);
+    Scorer scorer2 = weight.scorer(context);
+    scorer1.iterator().nextDoc();
+    scorer2.iterator().nextDoc();
+    DocAndFloatFeatureBuffer buffer = new DocAndFloatFeatureBuffer();
+    while (true) {
+      int curDoc = scorer2.iterator().docID();
+      int upTo =
+          TestUtil.nextInt(random(), curDoc, (int) Math.min(Integer.MAX_VALUE, curDoc + 512L));
+      scorer1.nextDocsAndScores(upTo, liveDocs, buffer);
+      assertEquals(buffer.size == 0, curDoc >= upTo);
+
+      for (int i = 0; i < buffer.size; ++i) {
+        while (liveDocs != null && liveDocs.get(scorer2.iterator().docID()) == false) {
+          scorer2.iterator().nextDoc();
+        }
+        assertEquals(scorer2.iterator().docID(), buffer.docs[i]);
+        assertEquals(scorer2.score(), buffer.features[i], 0f);
+        scorer2.iterator().nextDoc();
+      }
+
+      assertEquals(scorer2.iterator().docID(), scorer1.iterator().docID());
+      if (scorer1.iterator().docID() == DocIdSetIterator.NO_MORE_DOCS) {
+        break;
+      }
+    }
+
+    Scorer scorer3 = weight.scorer(context);
+    scorer3.iterator().nextDoc();
+    scorer3.nextDocsAndScores(
+        DocIdSetIterator.NO_MORE_DOCS, new Bits.MatchNoBits(context.reader().maxDoc()), buffer);
+    assertEquals(0, buffer.size);
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
 }
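
The new test cross-checks the batched path against per-document scoring: scorer1 is drained through nextDocsAndScores over randomly sized windows while scorer2 walks the same matches one document at a time, and the doc IDs and scores collected in the buffer must match scorer2's docID() and score() exactly; a final call with Bits.MatchNoBits confirms that a window containing no live documents leaves the buffer empty. Assuming the standard Lucene Gradle workflow, the test can presumably be run with a filter such as:

./gradlew -p lucene/core test --tests TestCombinedFieldQuery.testNextDocsAndScores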
