Skip to content

Commit abdebe4

Browse files
Bumped Paxtools and Validator versions, added Analysis class and console app command "-m" to fix bad/invalid URIs issue #319 (e.g. netpath "S 312" and all intact_complex URIs)
1 parent 6d8311c commit abdebe4

18 files changed

+20179
-5253
lines changed

pom.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
<start-class>cpath.Application</start-class>
2525
<license.licenseName>MIT</license.licenseName>
2626
<github.global.server>github</github.global.server>
27-
<paxtools.version>6.0.0</paxtools.version>
28-
<validator.version>6.0.0</validator.version>
27+
<paxtools.version>6.0.1-SNAPSHOT</paxtools.version>
28+
<validator.version>6.0.1-SNAPSHOT</validator.version>
2929
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
3030
<lucene.version>9.10.0</lucene.version>
3131
<jvm.options>-Xmx3g -Dfile.encoding=UTF-8 -ea -Dpaxtools.CollectionProvider=org.biopax.paxtools.trove.TProvider --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED</jvm.options>
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
package cpath.analysis;
2+
3+
import cpath.service.api.Analysis;
4+
import org.apache.commons.lang3.RegExUtils;
5+
import org.apache.commons.lang3.StringUtils;
6+
import org.biopax.paxtools.controller.ModelUtils;
7+
import org.biopax.paxtools.model.BioPAXElement;
8+
import org.biopax.paxtools.model.Model;
9+
import org.biopax.paxtools.model.level3.SimplePhysicalEntity;
10+
import org.biopax.paxtools.model.level3.UtilityClass;
11+
12+
import java.util.HashSet;
13+
14+
/**
15+
* This is to fix issue #319 in the PC v14 data (in June 2024 it was beta)
16+
* and similar potential LD issues due to invalid biopax URIs...
17+
* Also remove all dangling SimplePhysicalEntity (i.e. not Complex) individuals, if any
18+
* (these ain't useful for anything and are likely there due to mistakes or duplicate original data, e.g. in NetPath...)
19+
*/
20+
public class Fix319 implements Analysis<Model> {
21+
public void execute(Model model) {
22+
//remove dangling SPEs (such non-participant/components molecules are not useful for pathway analyses...)
23+
ModelUtils.removeObjectsIfDangling(model, SimplePhysicalEntity.class);
24+
25+
//now, remove dangling xrefs, CV et al. utility type individuals
26+
ModelUtils.removeObjectsIfDangling(model, UtilityClass.class);
27+
28+
//replace bad URI part "intact_complex" with "intact.complex" (also replaces "pc14:intact_complex")
29+
for (BioPAXElement e : new HashSet<>(model.getObjects())) {
30+
if (StringUtils.contains(e.getUri(), "intact_complex")) {
31+
String r = RegExUtils.replaceFirst(e.getUri(), "intact_complex", "intact.complex");
32+
ModelUtils.updateUri(model, e, r);
33+
}
34+
}
35+
36+
//fix bad invalid URIs (there were some URIs with a space,
37+
// e.g. "netpath:S 312" causing trouble when converting to JSONLD, etc.)
38+
ModelUtils.fixInvalidUris(model);
39+
}
40+
}

src/main/java/cpath/analysis/TraverseAnalysis.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ public void execute(Model model) {
2828
final String propertyPath = callback.getPropertyPath();
2929
callback.getTraverseEntry().clear();
3030

31-
PathAccessor pathAccessor = null;
31+
PathAccessor pathAccessor;
3232
try {
3333
pathAccessor = new PathAccessor(propertyPath, model.getLevel());
3434
} catch (Exception e) {
@@ -43,7 +43,6 @@ public void execute(Model model) {
4343
TraverseEntry entry = new TraverseEntry();
4444
entry.setUri(uri);
4545
if(!pathAccessor.isUnknown(v)) {
46-
// entry.getValue().addAll(v);
4746
for(Object o : v) {
4847
if(o instanceof BioPAXElement)
4948
entry.getValue().add(((BioPAXElement) o).getUri());

src/main/java/cpath/cleaner/NetPathCleaner.java

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,7 @@
99
import org.biopax.paxtools.io.SimpleIOHandler;
1010
import org.biopax.paxtools.model.BioPAXLevel;
1111
import org.biopax.paxtools.model.Model;
12-
import org.biopax.paxtools.model.level3.ControlledVocabulary;
13-
import org.biopax.paxtools.model.level3.RelationshipXref;
14-
import org.biopax.paxtools.model.level3.SequenceModificationVocabulary;
15-
import org.biopax.paxtools.model.level3.UnificationXref;
16-
import org.biopax.paxtools.model.level3.UtilityClass;
17-
import org.biopax.paxtools.model.level3.XReferrable;
12+
import org.biopax.paxtools.model.level3.*;
1813
import org.biopax.paxtools.util.ClassFilterSet;
1914
import org.slf4j.Logger;
2015
import org.slf4j.LoggerFactory;
@@ -25,29 +20,29 @@
2520
* Implementation of Cleaner interface for NetPath data.
2621
*/
2722
final class NetPathCleaner implements Cleaner {
28-
29-
// logger
30-
private static Logger log = LoggerFactory.getLogger(NetPathCleaner.class);
23+
private static Logger log = LoggerFactory.getLogger(NetPathCleaner.class);
3124

32-
public void clean(InputStream data, OutputStream cleanedData)
33-
{
25+
public void clean(InputStream data, OutputStream cleanedData) {
3426
// create bp model from dataFile
3527
SimpleIOHandler simpleReader = new SimpleIOHandler(BioPAXLevel.L3);
3628
Model model = simpleReader.convertFromOWL(data);
3729
log.info("Cleaning NetPath data, please be patient...");
3830

31+
//fix bad/invalid URIs (there are several with spaces in them...)
32+
ModelUtils.fixInvalidUris(model);
33+
3934
// Fix some CV xrefs
4035
// a CV must have one unification xref;
4136
// if there are also relationship and publication xrefs, it's a biopax error, but we'll keep as is (not critical);
4237
// So, if there is no unification xref but rel. xrefs (in fact, one or none in NetPath), we convert rel. to unif. xref.
4338
Set<ControlledVocabulary> cvs = new HashSet<>(model.getObjects(ControlledVocabulary.class));
44-
for(ControlledVocabulary cv : cvs) {
45-
log.info("Processing " + cv.toString() + "; xrefs: " + cv.getXref());
46-
39+
for (ControlledVocabulary cv : cvs) {
40+
log.info("Processing " + cv.toString() + "; xrefs: " + cv.getXref());
41+
4742
//insert "L-" after "phospho-" in MFV terms (if it does not contain "phospho-L-" already)
48-
if(cv instanceof SequenceModificationVocabulary) {
49-
for(String t: new HashSet<>(cv.getTerm())) {
50-
if(t.contains("phospho-") && !t.contains("phospho-L-")) {
43+
if (cv instanceof SequenceModificationVocabulary) {
44+
for (String t : new HashSet<>(cv.getTerm())) {
45+
if (t.contains("phospho-") && !t.contains("phospho-L-")) {
5146
//insert "L-", replace term
5247
cv.removeTerm(t);
5348
t = t.replace("phospho-", "phospho-L-");
@@ -56,54 +51,54 @@ public void clean(InputStream data, OutputStream cleanedData)
5651
}
5752
}
5853
}
59-
54+
6055
Set<UnificationXref> urefs = new ClassFilterSet<>(new HashSet<>(cv.getXref()), UnificationXref.class);
6156
//skip if there is a unification xref
62-
if(!urefs.isEmpty()) {
63-
log.info("(skip) there are unif.xref: " + urefs);
57+
if (!urefs.isEmpty()) {
6458
continue; //perhaps, will never happen (I manually checked a couple of orig. files)
6559
}
66-
60+
6761
Set<RelationshipXref> rxrefs = new ClassFilterSet<>(new HashSet<>(cv.getXref()), RelationshipXref.class);
68-
for(RelationshipXref x : rxrefs) {
62+
for (RelationshipXref x : rxrefs) {
6963
//remove and skip for bad xref (just in case there are any)
70-
if(x.getDb()==null || x.getId()==null) {
64+
if (x.getDb() == null || x.getId() == null) {
7165
cv.removeXref(x);
7266
model.remove(x);
7367
continue;
7468
}
75-
7669
String id = x.getId();
77-
String uri = "UX_" + BaseCleaner.encode(x.getDb() + "_"+ id);
70+
String uri = "UX_" + BaseCleaner.encode(x.getDb() + "_" + id);
7871
UnificationXref ux = (UnificationXref) model.getByID(uri);
79-
if(ux == null) {
72+
if (ux == null) {
8073
ux = model.addNew(UnificationXref.class, uri);
8174
ux.setDb(x.getDb());
8275
ux.setId(id);
83-
}
76+
}
8477
cv.removeXref(x);
8578
cv.addXref(ux);
8679
}
8780
}
88-
81+
8982
//convert shared UnificationXrefs into RelationshipXrefs (in fact, some of those are just invalid db/id)
90-
Set<UnificationXref> uxrefs = new HashSet<>(model.getObjects(UnificationXref.class));
91-
for(UnificationXref x : uxrefs) {
92-
if(x.getXrefOf().size() > 1) {
83+
for (UnificationXref x : new HashSet<>(model.getObjects(UnificationXref.class))) {
84+
if (x.getXrefOf().size() > 1) {
9385
//convert to RX, re-associate
9486
RelationshipXref rx = BaseCleaner.getOrCreateRx(x, model);
95-
for(XReferrable owner : new HashSet<>(x.getXrefOf())) {
96-
if(owner instanceof ControlledVocabulary)
87+
for (XReferrable owner : new HashSet<>(x.getXrefOf())) {
88+
if (owner instanceof ControlledVocabulary) {
9789
continue; //CVs can use same UX, but that means they are to merge...
90+
}
9891
owner.removeXref(x);
9992
owner.addXref(rx);
100-
}
101-
log.info("replaced UX {} with RX {}", x, rx);
93+
}
10294
}
10395
}
104-
96+
97+
//SPEs that are not component/participant are not needed
98+
ModelUtils.removeObjectsIfDangling(model, SimplePhysicalEntity.class);
99+
//xrefs, CVs et al. utility class individuals are not interesting for pathway analysis
105100
ModelUtils.removeObjectsIfDangling(model, UtilityClass.class);
106-
101+
107102
// convert model back to OutputStream for return
108103
try {
109104
simpleReader.convertToOWL(model, cleanedData);

src/main/java/cpath/service/CPathUtils.java

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import java.io.InputStream;
55
import java.io.OutputStream;
66
import java.lang.reflect.Constructor;
7-
import java.lang.reflect.Method;
87
import java.nio.file.*;
98
import java.util.*;
109
import java.util.stream.Stream;
@@ -21,7 +20,6 @@
2120
import org.biopax.paxtools.controller.Fetcher;
2221
import org.biopax.paxtools.controller.ModelUtils;
2322
import org.biopax.paxtools.controller.SimpleEditorMap;
24-
import org.biopax.paxtools.impl.BioPAXElementImpl;
2523
import org.biopax.paxtools.io.SimpleIOHandler;
2624
import org.biopax.paxtools.model.BioPAXElement;
2725
import org.biopax.paxtools.model.BioPAXLevel;
@@ -89,27 +87,17 @@ static void saveMetadata(Metadata metadata, String path) {
8987
}
9088

9189
/**
92-
* Replaces the URI of a BioPAX object
93-
* using java reflection. Normally, one should avoid this;
94-
* please use when absolutely necessary and with great care.
90+
* Replaces the URI of a BioPAX object in the model (delegates to ModelUtils.updateUri).
91+
* Please use when absolutely necessary and with great care.
9592
*
9693
* @param model model
9794
* @param el biopax object from the model
9895
* @param newUri new URI
9996
*/
10097
public static void replaceUri(Model model, BioPAXElement el, String newUri) {
101-
if (el.getUri().equals(newUri)) {
102-
return; // no action required
98+
if (!el.getUri().equals(newUri)) {
99+
ModelUtils.updateUri(model, el, newUri);
103100
}
104-
model.remove(el);
105-
try {
106-
Method m = BioPAXElementImpl.class.getDeclaredMethod("setUri", String.class);
107-
m.setAccessible(true);
108-
m.invoke(el, newUri);
109-
} catch (Exception e) {
110-
throw new RuntimeException(e);
111-
}
112-
model.add(el);
113101
}
114102

115103
/**
@@ -154,7 +142,7 @@ static String rebaseUri(String absoluteUri, String fromBase, String toBase) {
154142
* Replaces xml:base for the normalized model and updates the URis of all non-normalized objects
155143
* (mostly Entity, Evidence, etc.)
156144
* The model is already normalized, which means the URIs of many xrefs, CVs, entity reference start with
157-
* bioregistry.io/ or are CURIEs like e.g. chebi:1234, pubmed:1234556.
145+
* http://bioregistry.io/ or are CURIEs like e.g. chebi:1234, pubmed:1234556.
158146
*/
159147
public static void rebaseUris(Model model, String fromBase, String toBase) {
160148
Assert.hasText(toBase, "Blank/null value is not allowed for xmlBase");

src/main/java/cpath/service/ConsoleApplication.java

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,15 @@ public void run(String... args) throws Exception {
7878
.hasArg().argName("from-stage").optionalArg(true).type(Stage.class).build();
7979
options.addOption(o);
8080
o = Option.builder("a").longOpt("analyze")
81-
.desc("use a class that implements cpath.service.api.Analysis<Model> interface to analyse the integrated " +
82-
"BioPAX model (the class and its dependencies are expected to be found on the classpath)")
81+
.desc("use a class that implements cpath.service.api.Analysis<Model> interface to analyse the " +
82+
"BioPAX model (the class and its dependencies are expected to be on the classpath)")
8383
.hasArg().argName("class").build();
8484
options.addOption(o);
85+
o = Option.builder("m").longOpt("modify")
86+
.desc("use a class that implements cpath.service.api.Analysis<Model> interface to modify the " +
87+
"BioPAX model and re-index (the class and its dependencies are expected to be on the classpath)")
88+
.hasArg().argName("class").build();
89+
options.addOption(o);
8590
o = Option.builder("e").longOpt("export")
8691
.desc("export the main BioPAX model or sub-model defined by additional filters (see: -F)")
8792
.hasArg().argName("filename").build();
@@ -134,43 +139,59 @@ else if (cmd.hasOption("export")) {
134139
exportData(cmd.getOptionValue("export"), uris, datasources, types);
135140
}
136141
else if (cmd.hasOption("analyze")) {
137-
executeAnalysis(cmd.getOptionValue("analyze"), true);
142+
analyzeModel(cmd.getOptionValue("analyze"));
143+
}
144+
else if (cmd.hasOption("modify")) {
145+
modifyModel(cmd.getOptionValue("modify"));
138146
}
139147
else {
140148
new HelpFormatter().printHelp("cPath2", options);
141149
}
142150
}
143151

144-
/**
145-
* Runs a class that analyses or modifies the main BioPAX model.
152+
/*
153+
* Runs a class that analyses the main BioPAX model.
146154
*
147155
* @param analysisClass a class that implements {@link Analysis}
148-
* @param readOnly whether this is to modify and replace the BioPAX Model or not
149156
*/
150-
private void executeAnalysis(String analysisClass, boolean readOnly) {
157+
private void analyzeModel(String analysisClass) {
151158
Analysis<Model> analysis;
152159
try {
153160
Class c = Class.forName(analysisClass);
154161
analysis = (Analysis<Model>) c.getDeclaredConstructor().newInstance();
155162
} catch (Exception e) {
156163
throw new RuntimeException(e);
157164
}
158-
159165
Model model = CPathUtils.importFromTheArchive(service.settings().mainModelFile());
160166
analysis.execute(model);
167+
}
161168

162-
if (!readOnly) { //replace the main BioPAX model archive
163-
try {
164-
new SimpleIOHandler(BioPAXLevel.L3).convertToOWL(model,
165-
new GZIPOutputStream(new FileOutputStream(service.settings().mainModelFile())));
166-
} catch (Exception e) {
167-
throw new RuntimeException("Failed updating the main BioPAX archive!", e);
168-
}
169-
170-
LOG.warn("The main BioPAX model was modified; "
171-
+ "do not forget to re-index, update counts, re-export other files, etc.");
169+
/*
170+
* Runs a class that analyses and modifies the main BioPAX model and index.
171+
*
172+
* @param analysisClass a class that implements {@link Analysis} and can edit the data.
173+
*/
174+
private void modifyModel(String analysisClass) throws IOException {
175+
Analysis<Model> analysis;
176+
try {
177+
Class c = Class.forName(analysisClass);
178+
analysis = (Analysis<Model>) c.getDeclaredConstructor().newInstance();
179+
} catch (Exception e) {
180+
throw new RuntimeException(e);
172181
}
173-
182+
//load current model from the file
183+
Model model = CPathUtils.importFromTheArchive(service.settings().mainModelFile());
184+
// and apply the changes
185+
LOG.info("Running class: {}...", analysisClass);
186+
analysis.execute(model);
187+
// export the modified model to the file
188+
LOG.info("Over-writing model: {}...", service.settings().mainModelFile());
189+
new SimpleIOHandler(BioPAXLevel.L3).convertToOWL(model,
190+
new GZIPOutputStream(new FileOutputStream(service.settings().mainModelFile())));
191+
//init the lucene index as read-write
192+
service.initIndex(model, service.settings().indexDir(), false);
193+
//re-index the model
194+
service.index().save(model);
174195
}
175196

176197
/*

src/main/java/cpath/service/Merger.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,20 @@ public void merge() {
7272
}
7373
Model providerModel = merge(datasource); //uses lucene index, via service.mapping() repo, for id-mapping
7474
log.info("Replacing xml:base of non-generated/normalized URIs in {}", datasource.getIdentifier());
75+
//todo: new URI must be valid (e.g. base/prefix cannot contain '_' or '-'; or start with a standard URI scheme, e.g. 'urn:' or 'http://')
7576
CPathUtils.rebaseUris(providerModel, null, datasource.getIdentifier()+":");
7677
log.info("Replacing conflicting URIs in {} before merging into Main...", datasource.getIdentifier());
7778
replaceConflictingUris(providerModel, m);
7879
save(providerModel, datasource);
7980
log.info("Merging '{}' model into the Main BioPAX model...", datasource.getIdentifier());
8081
simpleMerger.merge(m, providerModel);
8182
}
83+
84+
//remove dangling SPEs (such non-participant/components molecules are not useful for pathway analyses...)
85+
ModelUtils.removeObjectsIfDangling(m, SimplePhysicalEntity.class);
86+
//now, remove dangling xrefs, CV et al. utility type individuals
8287
ModelUtils.removeObjectsIfDangling(m, UtilityClass.class);
88+
8389
//m.repair(); //todo: check if we really need this call (unlikely)
8490
save(m); //save the main model as rdfxml file
8591
log.info("Merged, saved.");

src/main/java/cpath/web/PagesController.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ String robots() {
7070
// but allow - to web page resources (css, js, images)
7171
return "User-agent: *\n" +
7272
"Disallow: /v2\n" +
73-
"Disallow: /fetch\n" +
73+
"Disallow: /get\n" +
7474
"Disallow: /search\n" +
7575
"Disallow: /graph\n" +
7676
"Disallow: /top_pathways\n" +

src/main/resources/metadata.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
},
3333
{
3434
"dataUrl": "classpath:test_mapping.zip",
35-
"identifier": "TEST_MAPPING",
35+
"identifier": "TESTMAPPING",
3636
"homepageUrl": "https://www.ebi.ac.uk/unichem/",
3737
"name": [
3838
"UniChem"

0 commit comments

Comments
 (0)