Some improvements for the 'organism' and 'datasource' index/search fields.

IgorRodchenkov · IgorRodchenkov · commit 74bf3a613a39 · 2024-04-24T00:29:36.000-04:00
diff --git a/src/main/java/cpath/service/ConsoleApplication.java b/src/main/java/cpath/service/ConsoleApplication.java
@@ -90,7 +90,7 @@ public void run(String... args) throws Exception {
       .hasArg().argName("filename").build();
     options.addOption(o);
     o = Option.builder("F").longOpt("F")
-      .desc("filters for the export option, e.g., -Furis=<uri,..> -Fdatasources=<nameOrUri,..> -Ftypes=<interface,..> " +
+      .desc("filters for the export option, e.g., -Furis=<uri,..> -Fdatasources=<name,..> -Ftypes=<interface,..> " +
         "(when 'uris' is defined, other options are ignored)")
       .argName("property=value").hasArgs().valueSeparator().numberOfArgs(2).build();
     options.addOption(o);
@@ -219,7 +219,7 @@ private void merge() {
    *
    * @param output      - output BioPAX file name (path)
    * @param uris        - optional, the list of valid (existing) URIs to extract a sub-model
-   * @param datasources filter by datasource if 'uris' is not empty
+   * @param datasources filter by datasource (name or identifier); applied only if 'uris' is empty
    * @param types       filter by BioPAX type if 'uris' is not empty
    * @throws IOException, IllegalStateException (in maintenance mode)
    */
diff --git a/src/main/java/cpath/service/IndexImpl.java b/src/main/java/cpath/service/IndexImpl.java
@@ -248,7 +248,7 @@ private SearchResponse transform(Query query, IndexSearcher searcher, TopDocs to
 				}	
 			}
 						
-			// extract organisms (URI only) 
+			// extract organisms (URIs only)
 			if(doc.get(FIELD_ORGANISM) != null) {
 				Set<String> uniqueVals = new TreeSet<>();
 				for(String o : doc.getValues(FIELD_ORGANISM)) {
@@ -360,19 +360,19 @@ public void save(BioPAXElement bpe) {
 
 		// create a new document
 		final Document doc = new Document();
-		// using StringField and KeywordAnalyser for this field
+		// using StringField (and KeywordAnalyzer when searching) for the 'uri' field
 		final String uri = bpe.getUri();
-        // save URI: indexed, not analyzed, stored
+    // save URI: indexed, not analyzed, stored
 		doc.add(new StringField(FIELD_URI, uri, Field.Store.YES));
-        //extract and index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...')
-        if(uri.startsWith("http://")) {
-        	String id = (uri.endsWith("/")) ? uri.substring(0, uri.length()-1) : uri;
-            id = id.replaceAll(".*[/#]", "").trim();
-            doc.add(new StringField(FIELD_URI, id, Field.Store.NO));
-        }
-
-		// index and store but not analyze/tokenize the biopax class name:
+
+		//index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...'); todo: why?..
+		String luri = (uri.endsWith("/")) ? uri.substring(0, uri.length()-1) : uri;
+		luri = luri.replaceAll(".*[/#]", "").trim();
+		doc.add(new StringField(FIELD_URI, luri, Field.Store.NO));
+
+		// index and store, but do not analyze/tokenize, the biopax class name (lowercase, as we use StandardAnalyzer for searching/filtering in this field):
 		doc.add(new StringField(FIELD_TYPE, bpe.getModelInterface().getSimpleName().toLowerCase(), Field.Store.YES));
+
 		// extra index fields
 		addPathways(ModelUtils.getParentPathways(bpe), doc);
 		addOrganisms(ModelUtils.getOrganisms(bpe), doc);
@@ -394,11 +394,10 @@ public void save(BioPAXElement bpe) {
 		}
 
 		// Add more xref IDs to the index using id-mapping
-		Set<String> ids = CPathUtils.getXrefIds(bpe);
+		final Set<String> ids = CPathUtils.getXrefIds(bpe);
 		Pattern isoformIdPattern = Pattern.compile(Resolver.getNamespace("uniprot.isoform", true).getPattern());
-		Pattern uniprotIdPattern = Pattern.compile(Resolver.getNamespace("uniprot", true).getPattern()); //"uniprot protein" is the preferred name
-		// in addition, collect ChEBI and UniProt IDs and then
-		// use id-mapping to associate the bpe with more IDs:
+		Pattern uniprotIdPattern = Pattern.compile(Resolver.getNamespace("uniprot", true).getPattern());
+		// also collect ChEBI and UniProt IDs and then use id-mapping to associate the bpe with more IDs:
 		final List<String> uniprotIds = new ArrayList<>();
 		final List<String> chebiIds = new ArrayList<>();
 		for(String id : ids) {
@@ -407,16 +406,17 @@ public void save(BioPAXElement bpe) {
 				chebiIds.add(id);
 			} else if(isoformIdPattern.matcher(id).find()) {
 				//cut the isoform num. suffix
-				id = id.replaceFirst("-\\d+$", "");
-				uniprotIds.add(id);
+				uniprotIds.add(id.replaceFirst("-\\d+$", ""));
 			} else if(uniprotIdPattern.matcher(id).find()) {
 				uniprotIds.add(id);
 			}
 		}
+		//id-mapping to find some other ids that map to the chebi/uniprot ones that we collected from the bpe.
 		addSupportedIdsThatMapToChebi(chebiIds, ids);
 		addSupportedIdsThatMapToUniprotId(uniprotIds, ids);
-		for (String id : ids) {//index as: not analyzed, not tokenized
-//			doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO)); // TODO: why did we do this? IDs are case-sensitive.
+		for (String id : ids) {
+			//index as: not analyzed, not tokenized; we use KeywordAnalyzer when searching this field...
+			//doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO));//todo: why did we have it? (ID is normally case-sensitive)
 			doc.add(new StringField(FIELD_XREFID, id, Field.Store.NO));
 			//also store a lower-case prefix (banana, e.g. 'chebi:1234' version of the id)
 			if(StringUtils.contains(id,":")) {
@@ -452,7 +452,7 @@ public void save(BioPAXElement bpe) {
 		// save/update the lucene document
 		try {
 			indexWriter.updateDocument(new Term(FIELD_URI, uri), doc);
-		} catch (IOException e) {
+		} catch (Exception e) {
 			throw new RuntimeException("Failed to index: " + bpe.getUri(), e);
 		}
 	}
@@ -523,30 +523,26 @@ public long count(String queryString) {
 
 	private void addDatasources(Set<Provenance> set, Document doc) {
 		for (Provenance p : set) {
-			// Index (!) and store URI (untokenized) -
-			// required to accurately calculate no. entities or to filter by data source
-			// (different data sources might share same names)
+			//store but do not index/tokenize the URI
+			doc.add(new StoredField(FIELD_DATASOURCE, p.getUri()));
+
+			//index the last/local (collection prefix) part of the normalized Provenance uri
 			String u = p.getUri();
-			doc.add(new StringField(FIELD_DATASOURCE, u, Field.Store.YES));
-
-            //index the identifier part of uri as well
-			if(u.startsWith("http://")) {
-				if (u.endsWith("/"))
-					u = u.substring(0, u.length() - 1);
-				u = u.replaceAll(".*/", "");
-				doc.add(new StringField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
-			}
+			if (u.endsWith("/")) u = u.substring(0, u.length() - 1);
+			u = u.replaceAll(".*[/#]", "");
+			doc.add(new TextField(FIELD_DATASOURCE, u.toLowerCase(), Field.Store.NO));
 
-			// index names
+			//index names (including the datasource identifier from metadata json config; see premerge/merge)
+			//different data sources can have the same name e.g. 'intact'; tokenized - to search by partial name
 			for (String s : p.getName()) {
-				doc.add(new StringField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
+				doc.add(new TextField(FIELD_DATASOURCE, s.toLowerCase(), Field.Store.NO));
 			}
 		}
 	}
 
 	private void addOrganisms(Set<BioSource> set, Document doc) {	
 		for(BioSource bs : set) {
-			// store URI as is (not indexed, not tokinized)
+			// store but do not index URI (see transform method above, where the organism URIs are added to search hits)
 			doc.add(new StoredField(FIELD_ORGANISM, bs.getUri()));
 				
 			// add organism names
diff --git a/src/main/java/cpath/service/Merger.java b/src/main/java/cpath/service/Merger.java
@@ -150,7 +150,7 @@ private Model merge(Datasource datasource) {
 			ModelUtils.normalizeGenerics(providerModel);
 
 			//for (already normalized) BioSource, also add the name from
-			//application.properties (it helps full-text search)
+			//application.properties (it helps full-text search when the original BioSource had no names, only a taxonomy xref)
 			Map<String,String> orgMap = service.settings().getOrganismsAsTaxonomyToNameMap();
 			for(BioSource org : providerModel.getObjects(BioSource.class)) {
 				for(UnificationXref x : new ClassFilterSet<>(org.getXref(), UnificationXref.class)) {
@@ -510,7 +510,7 @@ private void chemXrefByMapping(final Model m, Named bpe, final int maxNumXrefsTo
      * This step won't much improve full-text index/search and graph queries
      * (where id-mapping is used again anyway), but may help improve export to SIF and GSEA formats.
      * This method is called only for original PEs or their ERs that were not mapped/merged
-     * with a warehouse canonical ERs for various known reasons (no match for a ID or no ID, ambiguous ID, etc.)
+     * with a warehouse canonical ERs for various known reasons (no match for an ID or no ID, ambiguous ID, etc.)
      *
      * This method won't add additional xrefs if a UniProt/HGNC one is already present despite it'd map
      * to many canonical ERs/IDs (in fact, it'd even map to hundreds (Trembl) IDs, e.g., in cases like 'ND5',
diff --git a/src/main/java/cpath/service/metadata/Datasource.java b/src/main/java/cpath/service/metadata/Datasource.java
@@ -112,6 +112,7 @@ public void setProvenanceFor(Model model, String xmlBase) {
     String displayName = getName().iterator().next();
     pro.setDisplayName(displayName);
     pro.setStandardName(standardName());
+    pro.addName(identifier);
 
     if (getName().size() > 2)
       for (int i = 2; i < getName().size(); i++)
diff --git a/src/test/java/cpath/service/IndexIT.java b/src/test/java/cpath/service/IndexIT.java
@@ -96,12 +96,18 @@ public final void search() throws IOException {
     response = index.search("*", 0, Provenance.class, new String[] {"kegg"}, null);
     assertEquals(1, response.getSearchHit().size());
 
-    //datasource filter using a URI (required for -update-counts console command and datasources.html page to work)
+    //filtering by the absolute Provenance URI no longer works: the URI is still stored but no longer indexed
     response = index.search("*", 0, Pathway.class, new String[] {"http://identifiers.org/kegg.pathway/"}, null);
+    assertTrue(response.isEmpty());
+
+    //using the local/last part of the URI (standard bio collection prefix/name)
+    response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
     assertFalse(response.isEmpty());
     assertEquals(1, response.getSearchHit().size());
-    //using metadata identifier
-    response = index.search("*", 0, Pathway.class, new String[] {"kegg.pathway"}, null);
+    assertTrue(response.getSearchHit().stream().anyMatch(h -> h.getDataSource().contains("http://identifiers.org/kegg.pathway/")));
+
+    //find by partial name of a datasource - "pathway" of "KEGG Pathway"...
+    response = index.search("*", 0, Pathway.class, new String[] {"pathway"}, null);
     assertFalse(response.isEmpty());
     assertEquals(1, response.getSearchHit().size());