Skip to content

Commit a34ac2a

Browse files
Fixed indexing error: Inconsistency of field data structures across documents for field [organism]
1 parent f2dfee4 commit a34ac2a

File tree

4 files changed

+59
-19
lines changed

4 files changed

+59
-19
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
<paxtools.version>6.0.0-SNAPSHOT</paxtools.version>
2828
<validator.version>6.0.0-SNAPSHOT</validator.version>
2929
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
30-
<lucene.version>9.7.0</lucene.version>
30+
<lucene.version>9.10.0</lucene.version>
3131
<jvm.options>-Xmx3g -Dfile.encoding=UTF-8 -ea --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED</jvm.options>
3232
<agent>${settings.localRepository}/org/springframework/spring-instrument/${spring-framework.version}/spring-instrument-${spring-framework.version}.jar</agent>
3333
<!-- this copy is created by maven-dependency-plugin -->

src/main/java/cpath/service/IndexImpl.java

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -524,13 +524,8 @@ public long count(String queryString) {
524524
private void addDatasources(Set<Provenance> set, Document doc) {
525525
for (Provenance p : set) {
526526
//store and also index (tokenize) the URI, using a single stored TextField
527-
doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));
528-
529-
//index the last/local (collection prefix) part of the Provenance uri
530-
String u = p.getUri();
531-
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
532-
u = u.replaceAll(".*[/#:]", "");
533-
doc.add(new TextField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
527+
// doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));
528+
doc.add(new TextField(FIELD_DATASOURCE, p.getUri(), Field.Store.YES));
534529

535530
//index names (including the datasource identifier from metadata json config; see premerge/merge)
536531
//different data sources can have the same name e.g. 'intact'; tokenized - to search by partial name
@@ -542,8 +537,8 @@ private void addDatasources(Set<Provenance> set, Document doc) {
542537

543538
private void addOrganisms(Set<BioSource> set, Document doc) {
544539
for(BioSource bs : set) {
545-
// store but do not index URI (see transform method above, where the organism URIs are added to search hits)
546-
doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));
540+
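//was: doc.add(new StoredField(FIELD_ORGANISM, bs.getUri())); - replaced by the stored TextField below to fix the "Inconsistency of field data structures across documents" indexing error for this field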
541+
doc.add(new TextField(FIELD_ORGANISM, bs.getUri(), Field.Store.YES));
547542

548543
// add organism names
549544
for(String s : bs.getName()) {
@@ -558,8 +553,9 @@ private void addOrganisms(Set<BioSource> set, Document doc) {
558553
}
559554
// include tissue type terms
560555
if (bs.getTissue() != null) {
561-
for (String s : bs.getTissue().getTerm())
556+
for (String s : bs.getTissue().getTerm()) {
562557
doc.add(new TextField(FIELD_ORGANISM, s.toLowerCase(), Field.Store.NO));
558+
}
563559
}
564560
// include cell type terms
565561
if (bs.getCellType() != null) {

src/test/java/cpath/service/IndexIT.java

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,19 +95,12 @@ public final void search() throws IOException {
9595
assertEquals(2, response.getSearchHit().size());
9696
response = index.search("*", 0, Provenance.class, new String[] {"kegg"}, null);
9797
assertEquals(1, response.getSearchHit().size());
98-
//datasource filter using Provenance absolute URI - not needed anymore - still stored but not indexed anymore
99-
assertTrue(index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/reactome/"}, null).isEmpty());
100-
assertTrue(index.search("*", 0, Pathway.class, new String[] {"test:kegg_test"}, null).isEmpty());
101-
//using the local/last part of the URI (standard bio collection prefix/name)
102-
response = index.search("*", 0, Pathway.class, new String[] {"kegg_test"}, null);
103-
assertFalse(response.isEmpty());
104-
assertEquals(1, response.getSearchHit().size());
105-
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("test:kegg_test")));
10698

10799
//find by partial name of a datasource - "pathway" of "KEGG Pathway"...
108100
response = index.search("*", 0, Pathway.class, new String[] {"pathway"}, null);
109101
assertFalse(response.isEmpty());
110102
assertEquals(1, response.getSearchHit().size());
103+
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("test:kegg_test")));
111104

112105
response = index.search("pathway:glycolysis", 0, SmallMoleculeReference.class, null, null);
113106
assertEquals(5, response.getSearchHit().size());

src/test/resources/test-index-it.owl

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<rdf:RDF
3+
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
4+
xmlns:owl="http://www.w3.org/2002/07/owl#"
5+
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
6+
xmlns:bp="http://www.biopax.org/release/biopax-level3.owl#"
7+
xml:base="pc14:">
8+
<owl:Ontology rdf:about="">
9+
<owl:imports rdf:resource="http://www.biopax.org/release/biopax-level3.owl#" />
10+
</owl:Ontology>
11+
12+
<bp:Provenance rdf:ID="bind">
13+
<bp:standardName rdf:datatype = "xsd:string">BIND</bp:standardName>
14+
<bp:displayName rdf:datatype = "xsd:string">BIND</bp:displayName>
15+
<bp:name rdf:datatype = "xsd:string">Biomolecular Interaction Network Database</bp:name>
16+
<bp:name rdf:datatype = "xsd:string">bind</bp:name>
17+
<bp:comment rdf:datatype = "xsd:string">Source http://download.baderlab.org/BINDTranslation/release1_0/PSIMI25_XML/taxid9606_PSIMI25.xml type: PSI_MI, BIND (human), 15-Dec-2010</bp:comment>
18+
</bp:Provenance>
19+
20+
<bp:Protein rdf:about="bind:Protein_rcsb_pdb_1ZDT_see-also_1713944180538">
21+
<bp:displayName rdf:datatype = "xsd:string">1ZDT_B</bp:displayName>
22+
<bp:entityReference rdf:resource="bind:ProteinReference_rcsb_pdb_1ZDT_see-also" />
23+
<bp:comment rdf:datatype = "xsd:string">Protein Chain B, NR5A1[221-461]. This residue range is taken from the PDB file and may not match the GI given.</bp:comment>
24+
<bp:comment rdf:datatype = "xsd:string">experimental form entity</bp:comment>
25+
<bp:dataSource rdf:resource="#bind" />
26+
</bp:Protein>
27+
28+
<bp:ProteinReference rdf:about="bind:ProteinReference_rcsb_pdb_1ZDT_see-also">
29+
<bp:xref rdf:resource="#RX_genbank_indentifier_67463979_see-also" />
30+
<bp:xref rdf:resource="#RX_pdb_1ZDT_see-also" />
31+
<bp:xref rdf:resource="#RX_hgnc_symbol_NCOA2_see-also" />
32+
<bp:xref rdf:resource="#RX_uniprot_Q15596_see-also" />
33+
<bp:xref rdf:resource="#RX_hgnc_symbol_NR5A1_see-also" />
34+
<bp:xref rdf:resource="#RX_uniprot_Q13285_see-also" />
35+
<bp:organism rdf:resource="bind:BIO_ncbitaxon_0" />
36+
<bp:displayName rdf:datatype = "xsd:string">Nuclear Receptor Coactivator 2</bp:displayName>
37+
<bp:name rdf:datatype = "xsd:string">Chain P; The Crystal Structure Of Human Steroidogenic Factor-1</bp:name>
38+
</bp:ProteinReference>
39+
40+
<bp:BioSource rdf:about="bind:BIO_ncbitaxon_0">
41+
</bp:BioSource>
42+
43+
<bp:RelationshipXref rdf:ID="RX_genbank_indentifier_67463979_see-also">
44+
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0361" />
45+
<bp:id rdf:datatype = "xsd:string">67463979</bp:id>
46+
<bp:db rdf:datatype = "xsd:string">genbank indentifier</bp:db><!-- kept the original (mis)spelling (BIND) -->
47+
</bp:RelationshipXref>
48+
49+
50+
51+
</rdf:RDF>

0 commit comments

Comments
 (0)