Skip to content

Commit 74bf3a6

Browse files
Some improvements for the 'organism' and 'datasource' index/search fields.
1 parent 3e8ff28 commit 74bf3a6

File tree

5 files changed

+45
-42
lines changed

5 files changed

+45
-42
lines changed

src/main/java/cpath/service/ConsoleApplication.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ public void run(String... args) throws Exception {
9090
.hasArg().argName("filename").build();
9191
options.addOption(o);
9292
o = Option.builder("F").longOpt("F")
93-
.desc("filters for the export option, e.g., -Furis=<uri,..> -Fdatasources=<nameOrUri,..> -Ftypes=<interface,..> " +
93+
.desc("filters for the export option, e.g., -Furis=<uri,..> -Fdatasources=<name,..> -Ftypes=<interface,..> " +
9494
"(when 'uris' is defined, other options are ignored)")
9595
.argName("property=value").hasArgs().valueSeparator().numberOfArgs(2).build();
9696
options.addOption(o);
@@ -219,7 +219,7 @@ private void merge() {
219219
*
220220
* @param output - output BioPAX file name (path)
221221
* @param uris - optional, the list of valid (existing) URIs to extract a sub-model
222-
* @param datasources filter by datasource if 'uris' is not empty
222+
* @param datasources filter by datasource (name or identifier); applied only if 'uris' is empty
223223
* @param types filter by BioPAX type; applied only if 'uris' is empty
224224
* @throws IOException, IllegalStateException (in maintenance mode)
225225
*/

src/main/java/cpath/service/IndexImpl.java

Lines changed: 31 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ private SearchResponse transform(Query query, IndexSearcher searcher, TopDocs to
248248
}
249249
}
250250

251-
// extract organisms (URI only)
251+
// extract organisms (URIs only)
252252
if(doc.get(FIELD_ORGANISM) != null) {
253253
Set<String> uniqueVals = new TreeSet<>();
254254
for(String o : doc.getValues(FIELD_ORGANISM)) {
@@ -360,19 +360,19 @@ public void save(BioPAXElement bpe) {
360360

361361
// create a new document
362362
final Document doc = new Document();
363-
// using StringField and KeywordAnalyser for this field
363+
// using StringField and KeywordAnalyzer (when searching) for the 'uri' field
364364
final String uri = bpe.getUri();
365-
// save URI: indexed, not analyzed, stored
365+
// save URI: indexed, not analyzed, stored
366366
doc.add(new StringField(FIELD_URI, uri, Field.Store.YES));
367-
//extract and index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...')
368-
if(uri.startsWith("http://")) {
369-
String id = (uri.endsWith("/")) ? uri.substring(0, uri.length()-1) : uri;
370-
id = id.replaceAll(".*[/#]", "").trim();
371-
doc.add(new StringField(FIELD_URI, id, Field.Store.NO));
372-
}
373-
374-
// index and store but not analyze/tokenize the biopax class name:
367+
368+
//index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...'); todo: why?..
369+
String luri = (uri.endsWith("/")) ? uri.substring(0, uri.length()-1) : uri;
370+
luri = luri.replaceAll(".*[/#]", "").trim();
371+
doc.add(new StringField(FIELD_URI, luri, Field.Store.NO));
372+
373+
// index and store but not analyze/tokenize the biopax class name (lower-cased, as we use StandardAnalyzer for searching/filtering in this field):
375374
doc.add(new StringField(FIELD_TYPE, bpe.getModelInterface().getSimpleName().toLowerCase(), Field.Store.YES));
375+
376376
// extra index fields
377377
addPathways(ModelUtils.getParentPathways(bpe), doc);
378378
addOrganisms(ModelUtils.getOrganisms(bpe), doc);
@@ -394,11 +394,10 @@ public void save(BioPAXElement bpe) {
394394
}
395395

396396
// Add more xref IDs to the index using id-mapping
397-
Set<String> ids = CPathUtils.getXrefIds(bpe);
397+
final Set<String> ids = CPathUtils.getXrefIds(bpe);
398398
Pattern isoformIdPattern = Pattern.compile(Resolver.getNamespace("uniprot.isoform", true).getPattern());
399-
Pattern uniprotIdPattern = Pattern.compile(Resolver.getNamespace("uniprot", true).getPattern()); //"uniprot protein" is the preferred name
400-
// in addition, collect ChEBI and UniProt IDs and then
401-
// use id-mapping to associate the bpe with more IDs:
399+
Pattern uniprotIdPattern = Pattern.compile(Resolver.getNamespace("uniprot", true).getPattern());
400+
// also collect ChEBI and UniProt IDs and then use id-mapping to associate the bpe with more IDs:
402401
final List<String> uniprotIds = new ArrayList<>();
403402
final List<String> chebiIds = new ArrayList<>();
404403
for(String id : ids) {
@@ -407,16 +406,17 @@ public void save(BioPAXElement bpe) {
407406
chebiIds.add(id);
408407
} else if(isoformIdPattern.matcher(id).find()) {
409408
//cut the isoform num. suffix
410-
id = id.replaceFirst("-\\d+$", "");
411-
uniprotIds.add(id);
409+
uniprotIds.add(id.replaceFirst("-\\d+$", ""));
412410
} else if(uniprotIdPattern.matcher(id).find()) {
413411
uniprotIds.add(id);
414412
}
415413
}
414+
//id-mapping to find some other ids that map to the chebi/uniprot ones that we collected from the bpe.
416415
addSupportedIdsThatMapToChebi(chebiIds, ids);
417416
addSupportedIdsThatMapToUniprotId(uniprotIds, ids);
418-
for (String id : ids) {//index as: not analyzed, not tokenized
419-
// doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO)); // TODO: why did we do this? IDs are case-sensitive.
417+
for (String id : ids) {
418+
//index as: not analyzed, not tokenized; we use KeywordAnalyzer when searching this field...
419+
//doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO));//todo: why did we have it? (ID is normally case-sensitive)
420420
doc.add(new StringField(FIELD_XREFID, id, Field.Store.NO));
421421
//also store a lower-case prefix (banana, e.g. 'chebi:1234' version of the id)
422422
if(StringUtils.contains(id,":")) {
@@ -452,7 +452,7 @@ public void save(BioPAXElement bpe) {
452452
// save/update the lucene document
453453
try {
454454
indexWriter.updateDocument(new Term(FIELD_URI, uri), doc);
455-
} catch (IOException e) {
455+
} catch (Exception e) {
456456
throw new RuntimeException("Failed to index: " + bpe.getUri(), e);
457457
}
458458
}
@@ -523,30 +523,26 @@ public long count(String queryString) {
523523

524524
private void addDatasources(Set<Provenance> set, Document doc) {
525525
for (Provenance p : set) {
526-
// Index (!) and store URI (untokenized) -
527-
// required to accurately calculate no. entities or to filter by data source
528-
// (different data sources might share same names)
526+
//store but do not index/tokenize the URI
527+
doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));
528+
529+
//index the last/local (collection prefix) part of the normalized Provenance uri
529530
String u = p.getUri();
530-
doc.add(new StringField(FIELD_DATASOURCE, u, Field.Store.YES));
531-
532-
//index the identifier part of uri as well
533-
if(u.startsWith("http://")) {
534-
if (u.endsWith("/"))
535-
u = u.substring(0, u.length() - 1);
536-
u = u.replaceAll(".*/", "");
537-
doc.add(new StringField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
538-
}
531+
if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
532+
u = u.replaceAll(".*[/#]", "");
533+
doc.add(new TextField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
539534

540-
// index names
535+
//index names (including the datasource identifier from metadata json config; see premerge/merge)
536+
//different data sources can have the same name e.g. 'intact'; tokenized - to search by partial name
541537
for (String s : p.getName()) {
542-
doc.add(new StringField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
538+
doc.add(new TextField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
543539
}
544540
}
545541
}
546542

547543
private void addOrganisms(Set<BioSource> set, Document doc) {
548544
for(BioSource bs : set) {
549-
// store URI as is (not indexed, not tokinized)
545+
// store but do not index URI (see transform method above, where the organism URIs are added to search hits)
550546
doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));
551547

552548
// add organism names

src/main/java/cpath/service/Merger.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ private Model merge(Datasource datasource) {
150150
ModelUtils.normalizeGenerics(providerModel);
151151

152152
//for (already normalized) BioSource, also add the name from
153-
//application.properties (it helps full-text search)
153+
//application.properties (it helps full-text search in case the orig. BioSource had no names but taxon ref...)
154154
Map<String,String> orgMap = service.settings().getOrganismsAsTaxonomyToNameMap();
155155
for(BioSource org : providerModel.getObjects(BioSource.class)) {
156156
for(UnificationXref x : new ClassFilterSet<>(org.getXref(), UnificationXref.class)) {
@@ -510,7 +510,7 @@ private void chemXrefByMapping(final Model m, Named bpe, final int maxNumXrefsTo
510510
* This step won't much improve full-text index/search and graph queries
511511
* (where id-mapping is used again anyway), but may help improve export to SIF and GSEA formats.
512512
* This method is called only for original PEs or their ERs that were not mapped/merged
513-
* with a warehouse canonical ERs for various known reasons (no match for a ID or no ID, ambiguous ID, etc.)
513+
* with a warehouse canonical ERs for various known reasons (no match for an ID or no ID, ambiguous ID, etc.)
514514
*
515515
* This method won't add additional xrefs if a UniProt/HGNC one is already present despite it'd map
516516
* to many canonical ERs/IDs (in fact, it'd even map to hundreds (Trembl) IDs, e.g., in cases like 'ND5',

src/main/java/cpath/service/metadata/Datasource.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ public void setProvenanceFor(Model model, String xmlBase) {
112112
String displayName = getName().iterator().next();
113113
pro.setDisplayName(displayName);
114114
pro.setStandardName(standardName());
115+
pro.addName(identifier);
115116

116117
if (getName().size() > 2)
117118
for (int i = 2; i < getName().size(); i++)

src/test/java/cpath/service/IndexIT.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,12 +96,18 @@ public final void search() throws IOException {
9696
response = index.search("*", 0, Provenance.class, new String[] {"kegg"}, null);
9797
assertEquals(1, response.getSearchHit().size());
9898

99-
//datasource filter using a URI (required for -update-counts console command and datasources.html page to work)
99+
//datasource filter using Provenance absolute URI - not needed anymore - still stored but not indexed anymore
100100
response = index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/kegg.pathway/"}, null);
101+
assertTrue(response.isEmpty());
102+
103+
//using the local/last part of the URI (standard bio collection prefix/name)
104+
response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
101105
assertFalse(response.isEmpty());
102106
assertEquals(1, response.getSearchHit().size());
103-
//using metadata identifier
104-
response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
107+
assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("http://identifiers.org/kegg.pathway/")));
108+
109+
//find by partial name of a datasource - "pathway" of "KEGG Pathway"...
110+
response = index.search("*", 0, Pathway.class, new String[] {"pathway"}, null);
105111
assertFalse(response.isEmpty());
106112
assertEquals(1, response.getSearchHit().size());
107113

0 commit comments

Comments
 (0)