diff convert_lineage_defs.py @ 0:5e8505f27681 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
author iuc
date Tue, 08 Aug 2023 15:11:47 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_lineage_defs.py	Tue Aug 08 15:11:47 2023 +0000
@@ -0,0 +1,173 @@
+# Try to convert constellations files into the format expected by lineagespot.
+# Constellations files can define parent lineages, in which case the script
+# parses parent mutations recursively and adds them to the signature of the
+# child.
+
+# CURRENT AND GENERAL LIMITATIONS
+# Important to understand, please read carefully
+# 1. Constellations sometimes uses base instead of amino acid positions for
+# defining mutations. These can take two forms like in these examples:
+# "nuc:C8986T", i.e. a SNV given in base coordinates
+# "del:22029:6", i.e. a deletion of 6 bases given in base coordinates
+# The current version of the script makes no attempt to convert such lines to
+# amino acid coordinates, but simply drops them.
+# 2. In other cases, constellations lists deletions in amino acid poisitions like
+# this:
+# "s:HV69-"
+# While this notation could be parsed such lines are currently *also* dropped
+# because it's not entirely clear how lineagespot describes deletions.
+# 3. In some cases, constellation also provides mutations in mature peptide
+# coordinates, like "nsp15:K259R". Lines like this are currently dropped, too.
+# 4. The constellations data provided by
+# https://github.com/cov-lineages/constellations
+# lists mostly lineage-defining mutations that can be used to *distinguish*
+# between lineages, but makes no attempt to provide complete lists of mutations
+# (even through parent lineage definitions) for any lineage.
+
+import argparse
+import json
+import os
+import re
+import sys
+
+
+genes_names_translation = {
+    "orf1a": "ORF1a",
+    "orf1ab": "ORF1ab",
+    "1ab": "ORF1ab",
+    "orf1b": "ORF1b",
+    "s": "S",
+    "spike": "S",
+    "orf3a": "ORF3a",
+    "e": "E",
+    "m": "M",
+    "n": "N",
+    "orf6": "ORF6",
+    "orf7a": "ORF7a",
+    "orf7b": "ORF7b",
+    "orf8": "ORF8",
+    "8": "ORF8",
+    "n": "N",
+    # NOTE: in constellations, mutations are sometimes, but not always, given
+    # in nsp coordinates instead of ORF1a/b ones. Currently, we drop these,
+    # while we should convert instead!!!
+    "nsp2": "NSP2",
+    "nsp3": "NSP3",
+    "nsp4": "NSP4",
+    "nsp5": "NSP5",
+    "nsp6": "NSP6",
+    "nsp7": "NSP7",
+    "nsp8": "NSP8",
+    "nsp9": "NSP9",
+    "nsp10": "NSP10",
+    "nsp12": "NSP12",
+    "nsp13": "NSP13",
+    "nsp14": "NSP14",
+    "nsp15": "NSP15",
+    "nsp16": "NSP16",
+}
+
+
+lineagespot_template = dict.fromkeys(["ORF1a", "ORF1b", "S", "ORF3a", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "E", "N"])
+definitions = {}
+
+pat = re.compile(r'(?P<gene>.+):(?P<ref>[A-Z]+)(?P<pos>\d+)(?P<alt>[A-Z*]+)')
+
+
+def read_lineage_variants(x, lineage_name):
+    data = json.load(x)
+
+    sites = {}
+    for mut in data["sites"]:
+        match = pat.match(mut)
+        if match is None:
+            # Likely a del or nuc mutation given at the base level
+            continue
+        # try to get a canonical gene name
+        gene = genes_names_translation.get(
+            match.group('gene'),
+            match.group('gene')
+        )
+        pos = int(match.group('pos'))
+        if gene == 'ORF1ab':
+            # constellations isn't very consistent in representing ORF1ab
+            # mutations. They may be provided in ORF1a or ORF1b coordinates,
+            # but could also just be given as ORF1ab.
+            if pos <= 4401:
+                gene = 'ORF1a'
+            else:
+                gene = 'ORF1b'
+                # 4715 == 314 in constellations
+                pos = pos - 4401
+        if gene not in sites:
+            sites[gene] = {}
+        sites[gene][pos] = (match.group('ref'), match.group('alt'))
+
+    # recursively parse parent lineages and
+    # add their mutations to the global definitions
+    if "parent_lineage" in data["variant"]:
+        x_parent = data["variant"]["parent_lineage"]
+        if x_parent not in definitions:
+            parent_filename = f"c{x_parent}.json"
+            lineage_def_dir = os.path.dirname(x.name)
+            parent_file = os.path.join(lineage_def_dir, parent_filename)
+            if not os.path.isfile(parent_file):
+                raise FileNotFoundError(
+                    f"{x_parent} is defined as a parent of {lineage_name}, but "
+                    f"definitions file {parent_filename} not found in "
+                    f"{lineage_def_dir}."
+                )
+            with open(parent_file) as parent_in:
+                read_lineage_variants(parent_in, x_parent)
+
+        # update the sites dictionary to include also mutations defined for the parent
+        for gene, muts in definitions[x_parent].items():
+            if gene in sites:
+                for pos, ref_alt in muts.items():
+                    if pos in sites[gene]:
+                        # exotic case of a parent site being affected in the child
+                        # lineage again. Kepp the child site unaltered.
+                        continue
+                    sites[gene][pos] = ref_alt
+            else:
+                # only the parent has mutations in this gene listed
+                sites[gene] = muts
+    # done with this lineage and all of its parents
+    definitions[lineage_name] = sites
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-i", "--input", required=True,
+    help="Name of the input folder"
+)
+parser.add_argument(
+    "-o", "--output", required=True,
+    help="Name of the output folder"
+)
+if len(sys.argv) < 2:
+    sys.exit('Please run with -h / --help for help.')
+
+args = parser.parse_args()
+
+for definitions_file in os.listdir(args.input):
+    # In constellations, the only reliable way to get the lineage name is from
+    # the file name by stripping the .json suffix from it and dropping the
+    # leading 'c' (e.g. cBA.5.json holds the definition for lineage BA.5).
+    if definitions_file[0] != 'c' or definitions_file[-5:] != '.json':
+        continue
+    lineage_name_from_file = definitions_file[1:-5]
+    if lineage_name_from_file in definitions:
+        # seems we have parsed this lineage already as a parent of another lineage
+        continue
+    with open(os.path.join(args.input, definitions_file)) as data_in:
+        read_lineage_variants(data_in, lineage_name_from_file)
+
+for lineage, sites in definitions.items():
+    # if path isn't there, create one could be added
+    with open(os.path.join(args.output, lineage) + '.txt', "w") as data_out:
+        data_out.write('gene\tamino acid\n')
+        for gene, muts in sites.items():
+            if gene in lineagespot_template:
+                for pos, ref_alt in muts.items():
+                    data_out.write(f'{gene}\t{ref_alt[0]}{pos}{ref_alt[1]}\n')