@@ -248,7 +248,7 @@ private SearchResponse transform(Query query, IndexSearcher searcher, TopDocs to
248
248
}
249
249
}
250
250
251
- // extract organisms (URI only)
251
+ // extract organisms (URIs only)
252
252
if (doc .get (FIELD_ORGANISM ) != null ) {
253
253
Set <String > uniqueVals = new TreeSet <>();
254
254
for (String o : doc .getValues (FIELD_ORGANISM )) {
@@ -360,19 +360,19 @@ public void save(BioPAXElement bpe) {
360
360
361
361
// create a new document
362
362
final Document doc = new Document ();
363
- // using StringField and KeywordAnalyser for this field
363
+ // using StringField and KeywordAnalyzer (when searching) for the 'uri' field
364
364
final String uri = bpe .getUri ();
365
- // save URI: indexed, not analyzed, stored
365
+ // save URI: indexed, not analyzed, stored
366
366
doc .add (new StringField (FIELD_URI , uri , Field .Store .YES ));
367
- //extract and index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...')
368
- if (uri .startsWith ("http://" )) {
369
- String id = (uri .endsWith ("/" )) ? uri .substring (0 , uri .length ()-1 ) : uri ;
370
- id = id .replaceAll (".*[/#]" , "" ).trim ();
371
- doc .add (new StringField (FIELD_URI , id , Field .Store .NO ));
372
- }
373
-
374
- // index and store but not analyze/tokenize the biopax class name:
367
+
368
+ //index the last part of the uri (e.g., 'hsa00010' or like 'ProteinReference_ca123bd44...'); TODO: document why this is needed
369
+ String luri = (uri .endsWith ("/" )) ? uri .substring (0 , uri .length ()-1 ) : uri ;
370
+ luri = luri .replaceAll (".*[/#]" , "" ).trim ();
371
+ doc .add (new StringField (FIELD_URI , luri , Field .Store .NO ));
372
+
373
+ // index and store but not analyze/tokenize biopax class name (lowercase as we use StandardAnalyzer for searching/filtering in this field):
375
374
doc .add (new StringField (FIELD_TYPE , bpe .getModelInterface ().getSimpleName ().toLowerCase (), Field .Store .YES ));
375
+
376
376
// extra index fields
377
377
addPathways (ModelUtils .getParentPathways (bpe ), doc );
378
378
addOrganisms (ModelUtils .getOrganisms (bpe ), doc );
@@ -394,11 +394,10 @@ public void save(BioPAXElement bpe) {
394
394
}
395
395
396
396
// Add more xref IDs to the index using id-mapping
397
- Set <String > ids = CPathUtils .getXrefIds (bpe );
397
+ final Set <String > ids = CPathUtils .getXrefIds (bpe );
398
398
Pattern isoformIdPattern = Pattern .compile (Resolver .getNamespace ("uniprot.isoform" , true ).getPattern ());
399
- Pattern uniprotIdPattern = Pattern .compile (Resolver .getNamespace ("uniprot" , true ).getPattern ()); //"uniprot protein" is the preferred name
400
- // in addition, collect ChEBI and UniProt IDs and then
401
- // use id-mapping to associate the bpe with more IDs:
399
+ Pattern uniprotIdPattern = Pattern .compile (Resolver .getNamespace ("uniprot" , true ).getPattern ());
400
+ // also collect ChEBI and UniProt IDs and then use id-mapping to associate the bpe with more IDs:
402
401
final List <String > uniprotIds = new ArrayList <>();
403
402
final List <String > chebiIds = new ArrayList <>();
404
403
for (String id : ids ) {
@@ -407,16 +406,17 @@ public void save(BioPAXElement bpe) {
407
406
chebiIds .add (id );
408
407
} else if (isoformIdPattern .matcher (id ).find ()) {
409
408
//cut the isoform num. suffix
410
- id = id .replaceFirst ("-\\ d+$" , "" );
411
- uniprotIds .add (id );
409
+ uniprotIds .add (id .replaceFirst ("-\\ d+$" , "" ));
412
410
} else if (uniprotIdPattern .matcher (id ).find ()) {
413
411
uniprotIds .add (id );
414
412
}
415
413
}
414
+ //id-mapping to find some other ids that map to the chebi/uniprot ones that we collected from the bpe.
416
415
addSupportedIdsThatMapToChebi (chebiIds , ids );
417
416
addSupportedIdsThatMapToUniprotId (uniprotIds , ids );
418
- for (String id : ids ) {//index as: not analyzed, not tokenized
419
- // doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO)); // TODO: why did we do this? IDs are case-sensitive.
417
+ for (String id : ids ) {
418
+ //index as: not analyzed, not tokenized; we use KeywordAnalyzer when searching this field...
419
+ //doc.add(new StringField(FIELD_XREFID, id.toLowerCase(), Field.Store.NO));//todo: why did we have it? (ID is normally case-sensitive)
420
420
doc .add (new StringField (FIELD_XREFID , id , Field .Store .NO ));
421
421
//also store a lower-case prefix (banana, e.g. 'chebi:1234' version of the id)
422
422
if (StringUtils .contains (id ,":" )) {
@@ -452,7 +452,7 @@ public void save(BioPAXElement bpe) {
452
452
// save/update the lucene document
453
453
try {
454
454
indexWriter .updateDocument (new Term (FIELD_URI , uri ), doc );
455
- } catch (IOException e ) {
455
+ } catch (Exception e ) {
456
456
throw new RuntimeException ("Failed to index: " + bpe .getUri (), e );
457
457
}
458
458
}
@@ -523,30 +523,26 @@ public long count(String queryString) {
523
523
524
524
private void addDatasources (Set <Provenance > set , Document doc ) {
525
525
for (Provenance p : set ) {
526
- // Index (!) and store URI (untokenized) -
527
- // required to accurately calculate no. entities or to filter by data source
528
- // (different data sources might share same names)
526
+ //store but do not index/tokenize the URI
527
+ doc .add (new StoredField (FIELD_DATASOURCE , p .getUri ()));
528
+
529
+ //index the last/local (collection prefix) part of the normalized Provenance uri
529
530
String u = p .getUri ();
530
- doc .add (new StringField (FIELD_DATASOURCE , u , Field .Store .YES ));
531
-
532
- //index the identifier part of uri as well
533
- if (u .startsWith ("http://" )) {
534
- if (u .endsWith ("/" ))
535
- u = u .substring (0 , u .length () - 1 );
536
- u = u .replaceAll (".*/" , "" );
537
- doc .add (new StringField (FIELD_DATASOURCE , u .toLowerCase (), Field .Store .NO ));
538
- }
531
+ if (u .endsWith ("/" )) u = u .substring (0 , u .length () - 1 );
532
+ u = u .replaceAll (".*[/#]" , "" );
533
+ doc .add (new TextField (FIELD_DATASOURCE , u .toLowerCase (), Field .Store .NO ));
539
534
540
- // index names
535
+ //index names (including the datasource identifier from metadata json config; see premerge/merge)
536
+ //different data sources can have the same name e.g. 'intact'; tokenized - to search by partial name
541
537
for (String s : p .getName ()) {
542
- doc .add (new StringField (FIELD_DATASOURCE , s .toLowerCase (), Field .Store .NO ));
538
+ doc .add (new TextField (FIELD_DATASOURCE , s .toLowerCase (), Field .Store .NO ));
543
539
}
544
540
}
545
541
}
546
542
547
543
private void addOrganisms (Set <BioSource > set , Document doc ) {
548
544
for (BioSource bs : set ) {
549
- // store URI as is (not indexed, not tokinized )
545
+ // store but do not index URI (see transform method above, where the organism URIs are added to search hits )
550
546
doc .add (new StoredField (FIELD_ORGANISM , bs .getUri ()));
551
547
552
548
// add organism names
0 commit comments