
Commit 91db7f0 - Adds pegi3s/id-mapping

1 parent 7fd585e commit 91db7f0

File tree

11 files changed: +304 -0 lines changed


docs/index.html

Lines changed: 3 additions & 0 deletions
```diff
@@ -243,6 +243,9 @@ <h5>Programs:</h5>
   <li><a href="https://hub.docker.com/r/pegi3s/hyphy/" target="_blank"><b>hyphy</b></a>
     <a href="http://hyphy.org/tutorials/CL-prompt-tutorial/" target="_blank">[doc]</a> - Phylogenetics inferences
   </li>
+  <li><a href="https://hub.docker.com/r/pegi3s/id-mapping" target="_blank"><b>id-mapping</b></a>
+    <a href="https://hub.docker.com/r/pegi3s/id-mapping" target="_blank">[doc]</a> - ID mapping
+  </li>
   <li><a href="https://hub.docker.com/r/pegi3s/igv/" target="_blank"><b>igv</b></a>
     <a href="https://software.broadinstitute.org/software/igv/UserGuide" target="_blank">[doc]</a> - Genomics viewer
   </li>
```

id_mapping/.vscode/tasks.json

Lines changed: 31 additions & 0 deletions
```jsonc
{
    // See https://go.microsoft.com/fwlink/?LinkId=733558
    // for the documentation about the tasks.json format
    "version": "2.0.0",
    "tasks": [
        {
            "label": "build docker",
            "type": "shell",
            "command": "CURRENT_VERSION=$(cat current.version) && docker build ./ -t pegi3s/id-mapping:${CURRENT_VERSION} --build-arg version=${CURRENT_VERSION} && docker tag pegi3s/id-mapping:${CURRENT_VERSION} pegi3s/id-mapping:latest",
            "problemMatcher": []
        },
        {
            "label": "id-mapping 1 [without cache]",
            "type": "shell",
            "command": "rm -f test.tsv && docker run --rm -v $(pwd):/data -w /data pegi3s/id-mapping map-ids --from-db UniProtKB_AC-ID --to-db Gene_Name --input test_data/ids.txt --batch-size 2 --output test.tsv && bat test.tsv",
            "problemMatcher": []
        },
        {
            "label": "id-mapping 2 [with cache]",
            "type": "shell",
            "command": "rm -f test.tsv && docker run --rm -v $(pwd):/data -w /data pegi3s/id-mapping map-ids --from-db UniProtKB_AC-ID --to-db Gene_Name --input test_data/ids.txt --batch-size 2 --output test.tsv --cache-dir tmp_cache && bat test.tsv",
            "problemMatcher": []
        },
        {
            "label": "list-from-dbs",
            "type": "shell",
            "command": "docker run --rm pegi3s/id-mapping list-from-dbs",
            "problemMatcher": []
        }
    ]
}
```

id_mapping/BUILD.md

Lines changed: 11 additions & 0 deletions
# Building instructions

Run:

```bash
CURRENT_VERSION=$(cat current.version) && docker build ./ -t pegi3s/id-mapping:${CURRENT_VERSION} --build-arg version=${CURRENT_VERSION} && docker tag pegi3s/id-mapping:${CURRENT_VERSION} pegi3s/id-mapping:latest
```

# Build log

- 1.0.0 - 28/07/2023 - Hugo López Fernández

id_mapping/Dockerfile

Lines changed: 32 additions & 0 deletions
```dockerfile
#
# Copyright 2018-2023 Hugo López-Fernández, Pedro M. Ferreira, Miguel
# Reboiro-Jato, Cristina P. Vieira, and Jorge Vieira
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

FROM ubuntu:22.04

RUN apt-get update && \
    apt-get install -y python3-pip && \
    pip install -Iv unipressed==1.2.0

ARG version

ENV VERSION=${version}

ADD scripts /opt/scripts

RUN chmod u+x /opt/scripts/*

ENV PATH=/opt/scripts/:${PATH}
```

id_mapping/README.md

Lines changed: 65 additions & 0 deletions
# This image belongs to a larger project called Bioinformatics Docker Images Project (http://pegi3s.github.io/dockerfiles)

# ID mapping

The `pegi3s/id-mapping` Docker image allows mapping identifiers using the [UniProt server](https://www.uniprot.org/id-mapping/) through the [Unipressed](https://github.com/multimeric/Unipressed) API client.

The main script is `map-ids`, so you should adapt and run the following command:

```sh
docker run --rm -v /your/data/dir:/data -w /data pegi3s/id-mapping map-ids --from-db <FROM_DB> --to-db <TO_DB> --input input.txt --output output.tsv
```

In this command, you should replace:
- `/your/data/dir` with the path of the directory that contains the file you want to process.
- `input.txt` with the actual name of your input TXT file with the identifiers to map (one per line).
- `output.tsv` with the actual name of your output TSV file.
- `<FROM_DB>` with the actual name of the source database of the input identifiers.
- `<TO_DB>` with the actual name of the destination database.

The valid names for `<FROM_DB>` and `<TO_DB>` can be obtained with `docker run --rm pegi3s/id-mapping list-from-dbs` and `docker run --rm pegi3s/id-mapping list-to-dbs`, respectively.

The script help can be obtained with `docker run --rm pegi3s/id-mapping map-ids -h`.

Advanced script options are described in the next subsections.

## Cache

To avoid running the same mapping queries over and over, you can enable a cache mechanism with the `--cache-dir <cache_dir_name>` parameter. The script then maintains a cache of previous queries for each combination of `<FROM_DB>` and `<TO_DB>`.

## Batch size and delay

By default, the script uses a batch size of 10, meaning that it sends queries with at most ten identifiers to the server. The default delay between queries is 1 second, meaning that the script waits this long before sending a new batch query.

These values can be changed by specifying `--batch-size <BATCH_SIZE> --delay <DELAY_SECONDS>`.

# Test data

To test the `map-ids` script, start by creating a new file named `ids.txt` with the following identifiers:

```
A1L190
A0JP26
A0PK11
```

Then run the following command (change `/your/data/dir` to the actual path to the `ids.txt` file):

```sh
docker run --rm -v /your/data/dir:/data -w /data \
    pegi3s/id-mapping map-ids \
    --from-db UniProtKB_AC-ID \
    --to-db Gene_Name \
    --input ids.txt \
    --output mapping.tsv \
    --cache-dir id_mapping_cache
```

The result will be available in the new `mapping.tsv` file created at `/your/data/dir`.

# Changelog

The `latest` tag always contains the most recent version.

## [1.0.0] - 28/07/2023

- Initial `id-mapping` image containing the `map-ids`, `list-from-dbs` and `list-to-dbs` scripts.
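The batch-and-delay behaviour described in the README can be sketched in plain Python. This is a standalone illustration, not the image's actual script: `fetch_batch` is a hypothetical stand-in for the UniProt query, and the lowercase mapper below exists only so the sketch runs without network access.

```python
import time


def map_in_batches(ids, fetch_batch, batch_size=10, delay=1):
    """Split `ids` into chunks of at most `batch_size`, querying one
    chunk at a time and sleeping `delay` seconds between queries."""
    results = {}
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        results.update(fetch_batch(batch))  # one server query per batch
        time.sleep(delay)
    return results


# Fake mapper standing in for the UniProt server:
def fake(batch):
    return {i: i.lower() for i in batch}


# batch_size=2 sends two queries for three IDs, mirroring --batch-size.
mapped = map_in_batches(["A1L190", "A0JP26", "A0PK11"], fake, batch_size=2, delay=0)
```

With `--batch-size 2` the three test identifiers would be sent as one batch of two and one batch of one, with one delay between them.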

id_mapping/current.version

Lines changed: 1 addition & 0 deletions
```
1.0.0
```

id_mapping/scripts/id-mapping.py

Lines changed: 15 additions & 0 deletions
```python
import time

from unipressed import IdMappingClient
from unipressed.id_mapping.types import From, To

# Print the valid source and destination database types.
print(From)
print(To)

# Submit a mapping request and poll until it finishes.
request = IdMappingClient.submit(
    source="UniProtKB_AC-ID", dest="Gene_Name", ids={"A1L190", "A0JP26", "A0PK11"}
)

print(request.get_status())
while request.get_status() != "FINISHED":
    time.sleep(1)
print(list(request.each_result()))
```

id_mapping/scripts/list-from-dbs

Lines changed: 7 additions & 0 deletions
```python
#!/usr/bin/python3

from typing import get_args

from unipressed.id_mapping.types import From

for db in get_args(From):
    print(db)
```

id_mapping/scripts/list-to-dbs

Lines changed: 7 additions & 0 deletions
```python
#!/usr/bin/python3

from typing import get_args

from unipressed.id_mapping.types import To

for db in get_args(To):
    print(db)
```
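Both listing scripts rely on the fact that Unipressed exposes the valid database names as `typing.Literal` types, whose allowed values `typing.get_args` can enumerate. A minimal self-contained sketch of the same pattern, using a hypothetical `Literal` in place of Unipressed's own `From`/`To`:

```python
from typing import Literal, get_args

# Hypothetical stand-in for unipressed.id_mapping.types.From
ExampleDb = Literal["UniProtKB_AC-ID", "Gene_Name", "PDB"]

# get_args unpacks the Literal's allowed values as a tuple of strings,
# which is how list-from-dbs and list-to-dbs print the valid names.
for db in get_args(ExampleDb):
    print(db)
```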

id_mapping/scripts/map-ids

Lines changed: 129 additions & 0 deletions
```python
#!/usr/bin/python3

import argparse
import os
import time
from typing import get_args

from unipressed import IdMappingClient
from unipressed.id_mapping.types import From, To


def validate_db(db_str, valid_dbs, db_type):
    if db_str not in get_args(valid_dbs):
        print(
            f'Error: The specified {db_type} database is not valid. It must be one of: {get_args(valid_dbs)}')
        exit(1)


def map_ids_unipressed(from_db, to_db, ids):
    request = IdMappingClient.submit(source=from_db, dest=to_db, ids=ids)

    while request.get_status() != "FINISHED":
        time.sleep(1)

    return list(request.each_result())


def load_input(input_file):
    if input_file and os.path.isfile(input_file) and os.access(input_file, os.R_OK):
        with open(input_file, "r") as f:
            return [line.strip() for line in f]
    else:
        print("Error: The input file is missing or not readable.")
        exit(1)


def load_cache_and_subset_ids(cache_dir, from_db, to_db, source_ids):
    cached_data = {}
    source_ids_not_cached = source_ids

    if cache_dir and os.path.isdir(cache_dir) and os.access(cache_dir, os.R_OK):
        cache_file = os.path.join(cache_dir, f"cache_{from_db}_{to_db}.tsv")
        if os.path.isfile(cache_file) and os.access(cache_file, os.R_OK):
            with open(cache_file, "r") as f:
                for line in f:
                    key, value = line.strip().split("\t")
                    cached_data[key] = value
            print(f"Loaded data from cache. Size: {len(cached_data)}")

        source_ids_not_cached = [item for item in source_ids if item not in cached_data]

    return source_ids_not_cached, cached_data


def map_ids(ids, from_db, to_db, batch_size, delay):
    total_items = len(ids)
    num_batches = (total_items + batch_size - 1) // batch_size

    mapped_ids = []
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, total_items)
        batch_data = ids[start_idx:end_idx]

        print(f"Mapping batch {i+1}")
        mapped_ids.extend(map_ids_unipressed(from_db, to_db, batch_data))

        time.sleep(delay)

    mapped_ids_dict = {}
    for mapping in mapped_ids:
        mapped_ids_dict[mapping['from']] = mapping['to']

    return mapped_ids_dict


def write_mapped_ids(output_file, source_ids, mapped_ids_dict, cached_data):
    with open(output_file, "w") as output:
        for source_id in source_ids:
            if source_id in cached_data:
                output.write(f"{source_id}\t{cached_data[source_id]}\n")
            elif source_id in mapped_ids_dict:
                output.write(f"{source_id}\t{mapped_ids_dict[source_id]}\n")
            else:
                output.write(f"{source_id}\t-\n")


def save_cache(cache_dir, from_db, to_db, mapped_ids_dict):
    os.makedirs(cache_dir, exist_ok=True)
    if os.path.isdir(cache_dir) and os.access(cache_dir, os.W_OK):
        cache_file = os.path.join(cache_dir, f"cache_{from_db}_{to_db}.tsv")
        with open(cache_file, "a") as f:
            for key in mapped_ids_dict:
                f.write(f"{key}\t{mapped_ids_dict[key]}\n")


def main(from_db, to_db, input_file, output_file, batch_size=10, delay=1, cache_dir=""):
    validate_db(from_db, From, 'from')
    validate_db(to_db, To, 'to')

    print(f"Mapping IDs from '{from_db}' to '{to_db}' in batches of {batch_size} with a delay of {delay} second(s).")
    print(f"Cache directory: '{cache_dir}'")
    print(f"Input file: '{input_file}'")
    print(f"Output file: '{output_file}'\n")

    source_ids = load_input(input_file)
    source_ids_not_cached, cached_data = load_cache_and_subset_ids(cache_dir, from_db, to_db, source_ids)
    mapped_ids_dict = map_ids(source_ids_not_cached, from_db, to_db, batch_size, delay)
    write_mapped_ids(output_file, source_ids, mapped_ids_dict, cached_data)

    if cache_dir:
        save_cache(cache_dir, from_db, to_db, mapped_ids_dict)


if __name__ == "__main__":
    print('Script version:', os.getenv('VERSION', 'NA'))
    parser = argparse.ArgumentParser(description="Converts identifiers using the UniProt ID mapping server.")

    parser.add_argument("--from-db", type=str, help="Source database.", required=True)
    parser.add_argument("--to-db", type=str, help="Destination database.", required=True)
    parser.add_argument("--input", type=str, help="Path to the input data file with the source IDs to be converted (one per line).", required=True)
    parser.add_argument("--output", type=str, help="Path to the output file.", required=True)

    parser.add_argument("--batch-size", type=int, default=10, help="Batch size for querying IDs to the UniProt server.")
    parser.add_argument("--delay", type=int, default=1, help="Delay in seconds between batches.")
    parser.add_argument("--cache-dir", type=str, default="", help="Cache directory.")

    args = parser.parse_args()
    main(args.from_db, args.to_db, args.input, args.output, args.batch_size, args.delay, args.cache_dir)
```
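The cache used by `map-ids` is a plain two-column TSV file per database pair (`cache_<FROM_DB>_<TO_DB>.tsv`, one `key<TAB>value` line per mapping, opened in append mode so successive runs extend it). That round-trip can be sketched independently of the script; the helper names and the `GENE1`/`GENE2` values below are illustrative placeholders, not real UniProt results.

```python
import os
import tempfile


def append_cache(cache_file, mappings):
    # One "source_id<TAB>mapped_id" line per entry, appended so that
    # successive runs keep extending the same cache file.
    with open(cache_file, "a") as f:
        for key, value in mappings.items():
            f.write(f"{key}\t{value}\n")


def load_cache(cache_file):
    # Read the TSV back into a dict; a missing file yields an empty cache.
    cached = {}
    if os.path.isfile(cache_file):
        with open(cache_file) as f:
            for line in f:
                key, value = line.strip().split("\t")
                cached[key] = value
    return cached


# Round-trip in a temporary directory, with placeholder gene names:
tmp = tempfile.mkdtemp()
path = os.path.join(tmp, "cache_UniProtKB_AC-ID_Gene_Name.tsv")
append_cache(path, {"A1L190": "GENE1"})
append_cache(path, {"A0JP26": "GENE2"})
cached = load_cache(path)
```

Because entries are only ever appended, IDs that the server failed to map (written as `-` in the output file) are not cached and will be retried on the next run.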

id_mapping/test_data/ids.txt

Lines changed: 3 additions & 0 deletions
```
A1L190
A0JP26
A0PK11
```
