
Commit e4c9978

HUSTERGS and gesong.samuel authored
Optimize CombinedFieldQuery by nextDocsAndScores (#14834)
Co-authored-by: gesong.samuel <[email protected]>
1 parent 647c05b commit e4c9978

File tree: 4 files changed, +151 −0 lines changed


lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
@@ -183,6 +183,8 @@ Optimizations
 
 * GITHUB#14851: Fewer virtual calls to enhance performance of point queries. (Ke Wei)
 
+* GITHUB#14834: Optimize CombinedFieldQuery by nextDocsAndScores. (Ge Song)
+
 Bug Fixes
 ---------------------
 * GITHUB#14654: ValueSource.fromDoubleValuesSource(dvs).getSortField() would throw errors when

lucene/core/src/java/org/apache/lucene/search/CombinedFieldQuery.java

Lines changed: 19 additions & 0 deletions
@@ -37,6 +37,7 @@
 import org.apache.lucene.search.similarities.DFRSimilarity;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOSupplier;
 import org.apache.lucene.util.RamUsageEstimator;
@@ -445,5 +446,23 @@ public DocIdSetIterator iterator() {
     public float getMaxScore(int upTo) throws IOException {
       return maxScore;
     }
+
+    @Override
+    public void nextDocsAndScores(int upTo, Bits liveDocs, DocAndFloatFeatureBuffer buffer)
+        throws IOException {
+      int batchSize = 64; // arbitrary
+      buffer.growNoCopy(batchSize);
+      int size = 0;
+      DocIdSetIterator iterator = iterator();
+      for (int doc = docID(); doc < upTo && size < batchSize; doc = iterator.nextDoc()) {
+        if (liveDocs == null || liveDocs.get(doc)) {
+          buffer.docs[size] = doc;
+          buffer.features[size] = freq();
+          ++size;
+        }
+      }
+      buffer.size = size;
+      simScorer.scoreRange(buffer);
+    }
   }
 }
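
The override gathers up to 64 live documents starting from the scorer's current position, records each doc ID and its combined term frequency in the shared DocAndFloatFeatureBuffer, and then scores the whole batch at once via MultiNormsLeafSimScorer.scoreRange. For orientation only, here is a minimal sketch of how a caller might drain such a scorer window by window. It uses only the API surface visible in this diff, but the helper name sumScores, the 1024-doc window, and the score summation are illustrative assumptions, not part of the commit.

// Minimal sketch (not part of this commit): drain a Scorer through
// nextDocsAndScores() in fixed-size windows and sum the returned scores.
import java.io.IOException;
import org.apache.lucene.search.DocAndFloatFeatureBuffer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.Bits;

final class BulkScoreExample {
  // Sum the scores of all live matches of a scorer using the batched API.
  static double sumScores(Scorer scorer, Bits liveDocs) throws IOException {
    DocAndFloatFeatureBuffer buffer = new DocAndFloatFeatureBuffer();
    DocIdSetIterator it = scorer.iterator();
    double total = 0;
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.docID()) {
      // Request matches in [doc, doc + 1024); the window size is arbitrary here.
      int upTo = (int) Math.min((long) doc + 1024, DocIdSetIterator.NO_MORE_DOCS);
      scorer.nextDocsAndScores(upTo, liveDocs, buffer);
      for (int i = 0; i < buffer.size; ++i) {
        total += buffer.features[i]; // features[i] holds the score of docs[i]
      }
    }
    return total;
  }
}

Because the new implementation caps each batch at 64 documents, a single call may stop short of upTo; the loop above therefore keys off the iterator's position rather than assuming the requested window was exhausted.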

lucene/core/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java

Lines changed: 21 additions & 0 deletions
@@ -27,6 +27,8 @@
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.search.CombinedFieldQuery.FieldAndWeight;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.LongsRef;
 import org.apache.lucene.util.SmallFloat;
 
 /**
@@ -48,6 +50,7 @@ final class MultiNormsLeafSimScorer {
 
   private final SimScorer scorer;
   private final NumericDocValues norms;
+  private long[] normValues = LongsRef.EMPTY_LONGS;
 
   /** Sole constructor: Score documents of {@code reader} with {@code scorer}. */
   MultiNormsLeafSimScorer(
@@ -113,6 +116,24 @@ public float score(int doc, float freq) throws IOException {
     return scorer.score(freq, getNormValue(doc));
   }
 
+  /**
+   * score the provided documents contained in buffer. This method assumes the float feature store
+   * is {@code freq}
+   *
+   * @see SimScorer#score(float, long)
+   */
+  public void scoreRange(DocAndFloatFeatureBuffer buffer) throws IOException {
+    if (normValues.length < buffer.size) {
+      normValues = ArrayUtil.growNoCopy(normValues, buffer.size);
+    }
+    for (int i = 0; i < buffer.size; i++) {
+      normValues[i] = getNormValue(buffer.docs[i]);
+    }
+    for (int i = 0; i < buffer.size; i++) {
+      buffer.features[i] = scorer.score(buffer.features[i], normValues[i]);
+    }
+  }
+
   /**
    * Explain the score for the provided document assuming the given term document frequency. This
    * method must be called on non-decreasing sequences of doc ids.
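
scoreRange splits bulk scoring into two passes: one loop that advances the norms doc values in doc ID order and stashes each norm into a reusable long[], and a second loop of pure arithmetic over primitive arrays. The sketch below (not from the patch) shows the same pattern in isolation; the class name TwoPassScoringSketch, the plain arrays standing in for DocAndFloatFeatureBuffer, and the default norm of 1L for documents without a stored norm are assumptions made for the example.

// Illustrative sketch of the two-pass idea used by scoreRange(); not the committed code.
import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.similarities.Similarity.SimScorer;

final class TwoPassScoringSketch {
  // docs/freqs play the role of DocAndFloatFeatureBuffer.docs/features; freqs is
  // overwritten with scores, mirroring what scoreRange() does to buffer.features.
  static void scoreAll(SimScorer scorer, NumericDocValues norms, int[] docs, float[] freqs, int size)
      throws IOException {
    long[] normValues = new long[size];
    // Pass 1: walk the norms doc values once, in doc ID order, and remember each norm.
    for (int i = 0; i < size; i++) {
      normValues[i] = norms.advanceExact(docs[i]) ? norms.longValue() : 1L;
    }
    // Pass 2: pure arithmetic over primitive arrays, with no doc-values calls in the loop.
    for (int i = 0; i < size; i++) {
      freqs[i] = scorer.score(freqs[i], normValues[i]);
    }
  }
}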

lucene/core/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java

Lines changed: 109 additions & 0 deletions
@@ -29,6 +29,7 @@
 import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.similarities.BM25Similarity;
@@ -42,6 +43,8 @@
 import org.apache.lucene.tests.index.RandomIndexWriter;
 import org.apache.lucene.tests.search.CheckHits;
 import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.Bits;
 
 public class TestCombinedFieldQuery extends LuceneTestCase {
   public void testInvalid() {
@@ -560,4 +563,110 @@ public CollectionStatistics collectionStatistics(String field) throws IOException
     w.close();
     dir.close();
   }
+
+  public void testNextDocsAndScores() throws IOException {
+    int numMatchDoc = randomIntBetween(100, 500);
+    int boost1 = Math.max(1, random().nextInt(5));
+    int boost2 = Math.max(1, random().nextInt(5));
+
+    Directory dir = newDirectory();
+    Similarity similarity = randomCompatibleSimilarity();
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    iwc.setSimilarity(similarity);
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
+
+    // adding potentially matching doc
+    for (int i = 0; i < numMatchDoc; i++) {
+      Document doc = new Document();
+
+      int freqA = random().nextInt(20) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "foo", Store.NO));
+      }
+
+      freqA = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqA; j++) {
+          doc.add(new TextField("a", "foo" + j, Store.NO));
+        }
+      }
+
+      freqA = random().nextInt(20) + 1;
+      for (int j = 0; j < freqA; j++) {
+        doc.add(new TextField("a", "zoo", Store.NO));
+      }
+
+      int freqB = random().nextInt(20) + 1;
+      for (int j = 0; j < freqB; j++) {
+        doc.add(new TextField("b", "zoo", Store.NO));
+      }
+
+      freqB = random().nextInt(20) + 1;
+      if (randomBoolean()) {
+        for (int j = 0; j < freqB; j++) {
+          doc.add(new TextField("b", "zoo" + j, Store.NO));
+        }
+      }
+
+      int freqC = random().nextInt(20) + 1;
+      for (int j = 0; j < freqC; j++) {
+        doc.add(new TextField("c", "bla" + j, Store.NO));
+      }
+      w.addDocument(doc);
+    }
+
+    w.forceMerge(1);
+
+    IndexReader reader = getOnlyLeafReader(w.getReader());
+    IndexSearcher searcher = newSearcher(reader, false);
+    searcher.setSimilarity(similarity);
+
+    CombinedFieldQuery query =
+        new CombinedFieldQuery.Builder("foo")
+            .addField("a", (float) boost1)
+            .addField("b", (float) boost2)
+            .build();
+
+    Weight weight = searcher.createWeight(query, ScoreMode.TOP_SCORES, 1f);
+    LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
+    Bits liveDocs = context.reader().getLiveDocs();
+
+    Scorer scorer1 = weight.scorer(context);
+    Scorer scorer2 = weight.scorer(context);
+    scorer1.iterator().nextDoc();
+    scorer2.iterator().nextDoc();
+    DocAndFloatFeatureBuffer buffer = new DocAndFloatFeatureBuffer();
+    while (true) {
+      int curDoc = scorer2.iterator().docID();
+      int upTo =
+          TestUtil.nextInt(random(), curDoc, (int) Math.min(Integer.MAX_VALUE, curDoc + 512L));
+      scorer1.nextDocsAndScores(upTo, liveDocs, buffer);
+      assertEquals(buffer.size == 0, curDoc >= upTo);
+
+      for (int i = 0; i < buffer.size; ++i) {
+        while (liveDocs != null && liveDocs.get(scorer2.iterator().docID()) == false) {
+          scorer2.iterator().nextDoc();
+        }
+        assertEquals(scorer2.iterator().docID(), buffer.docs[i]);
+        assertEquals(scorer2.score(), buffer.features[i], 0f);
+        scorer2.iterator().nextDoc();
+      }
+
+      assertEquals(scorer2.iterator().docID(), scorer1.iterator().docID());
+      if (scorer1.iterator().docID() == DocIdSetIterator.NO_MORE_DOCS) {
+        break;
+      }
+    }
+
+    Scorer scorer3 = weight.scorer(context);
+    scorer3.iterator().nextDoc();
+    scorer3.nextDocsAndScores(
+        DocIdSetIterator.NO_MORE_DOCS, new Bits.MatchNoBits(context.reader().maxDoc()), buffer);
+    assertEquals(0, buffer.size);
+
+    reader.close();
+    w.close();
+    dir.close();
+  }
 }
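
The new test cross-checks the batched path against per-document scoring: scorer1 is drained through nextDocsAndScores over randomly sized windows while scorer2 walks the same matches one document at a time, and the doc IDs and scores collected in the buffer must match scorer2's docID() and score() exactly; a final call with Bits.MatchNoBits confirms that a window containing no live documents leaves the buffer empty. Assuming the standard Lucene Gradle workflow, the test can presumably be run with a filter such as:

./gradlew -p lucene/core test --tests TestCombinedFieldQuery.testNextDocsAndScores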
