Mercurial > repos > mvdbeek > tepid_merge_insertions

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,29 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="0.8.0">tepid</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@WRAPPER_VERSION@">0.8.0</token>
+    <token name="@REFERENCES@">
+<![CDATA[
+------
+This tool is part of the `TEPID`_ pipeline from the `Lister laboratory`_.
+.. _TEPID package: https://github.com/ListerLab/TEPID
+.. _Lister laboratory http://listerlab.org/
+]]>
+    </token>
+    <token name="@PROC@">-p \$GALAXY_SLOTS</token>
+    <token name="@NAME@">-n '$bowtie2_bam.element_identifier'</token>
+    <token name="@LINK_CONC@"><![CDATA[
+        ln -f -s '$bowtie2_bam' conc.bam &&
+        ln -f -s '$bowtie2_bam.metadata.bam_index' conc.bam.bai &&
+    ]]></token>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.7554/eLife.20777</citation>
+            <yield />
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_insertions.py	Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,112 @@
+from argparse import ArgumentParser
+import os
+import tempfile
+import pandas as pd
+import pybedtools
+
+COLUMNS = ['ins_chrom',
+           'ins_start',
+           'ins_end',
+           'ref_chrom',
+           'ref_start',
+           'ref_end',
+           'agi',
+           'accession',
+           'cluster']
+
+def overlap(start1, stop1, start2, stop2, d=50):
+    """returns True if sets of coordinates overlap. Assumes coordinates are on same chromosome"""
+    return start1 <= stop2+d and stop1 >= start2-d
+
+
+def merge(i, insertion, result):
+    if len(result) == 0:
+        result[i] = insertion
+    else:
+        if not can_merge(insertion, result):
+            result[i] = insertion
+
+
+def can_merge(insertion, result):
+    """
+    Merges insertions and returns True if all requirements are met
+    """
+    for j, master_insertion in result.items():
+        if insertion['agi'] & master_insertion['agi']:
+            if overlap(master_insertion['ins_start'], master_insertion['ins_end'], insertion['ins_start'],insertion['ins_end']):
+                # Adjusting the insertion start (doesn't really do anything?!)
+                if len(insertion['agi']) < len(master_insertion['agi']):
+                    ref_start = master_insertion['ref_start']
+                else:
+                    ref_start = insertion['ref_start']
+                if master_insertion['ins_chrom'] == insertion['ins_chrom'] and insertion['ref_chrom'] == master_insertion['ref_chrom'] and ref_start == master_insertion['ref_start']:
+                   result[j]['accession'] = result[j]['accession'] | (insertion['accession'])
+                   result[j]['agi'] = result[j]['agi'] | (insertion['agi'])
+                return True
+    return False
+
+
+def inner_merge(s):
+    result = {}
+    for i, insertion in s.items():
+        merge(i, insertion, result)
+    return result.values()
+
+
+def reduce_and_cluster(inputfiles):
+    """
+    Read in inputfiles using pandas, write additional column with sample identifier,
+    sort and cluster using pybedtools and return dataframe.
+    """
+    usecols = [0,1,2,3,4,5,6]  # skip col 7, which contains the read support id
+    tables = [pd.read_table(f, header=None) for f in inputfiles]
+    sample_ids = [os.path.basename(f).rsplit('.')[0] for f in inputfiles]
+    for sample_id, df in zip(sample_ids, tables):
+        df[7] = sample_id
+    merged_table = pd.concat(tables)
+    tfile = tempfile.NamedTemporaryFile()
+    merged_table.to_csv(tfile, sep="\t", header=None, index=False)
+    tfile.flush()
+    bedfile = pybedtools.BedTool(tfile.name).sort().cluster(d=50)
+    df = bedfile.to_dataframe()
+    df.columns = COLUMNS
+    # Split comma separated agi values and make set
+    df['agi'] = [set(v.split(',')) for v in df['agi'].values]
+    df['accession'] = [set(str(v).split(',')) for v in df['accession'].values]
+    return df
+
+
+def split_clusters(df):
+    """
+    clusters as defined by bedtools allow for 50 nt distance. This means that
+    clusters can be many kb large, so we check each individual insertion in
+    the cluster against the other insertions. We split the clusters based on
+    whether the overlap and TE identity criteria are fulfilled (so a
+    different TE would lead to a split in the clusters)
+    """
+    groups = df.groupby('cluster')
+    nested_list = [inner_merge(group.transpose().to_dict()) for _, group in groups]
+    return pd.DataFrame([i for n in nested_list for i in n])[COLUMNS]
+
+
+def write_output(df, output):
+    # Turn sets back to comma-separated values
+    df['agi'] = [",".join(agi) for agi in df['agi']]
+    df['accession'] = [",".join(acc) for acc in df['accession']]
+    # write out result without last column
+    df.to_csv(output, sep="\t",header=None, index=None, columns=COLUMNS[:-1])
+
+
+def main(inputfiles, output):
+    df = reduce_and_cluster(inputfiles)
+    df = split_clusters(df)
+    write_output(df, output)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description='Merge TE insertions calls')
+    parser.add_argument('-o', '--output', help='output file', required=True)
+    parser.add_argument('-i', '--input', help='Insertion files to merge', nargs="+", required=True)
+    options = parser.parse_args()
+
+    main(inputfiles=options.input, output=options.output)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tepid_merge_insertions.xml	Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,41 @@
+<tool id="tepid_merge_insertions" name="tepid-merge-insertions" version="@WRAPPER_VERSION@">
+    <description>discover TE variants</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <version_command>tepid-discover --version</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        #for $input in $insertions:
+            ln -s -f '$input' '$input.element_identifier' &&
+        #end for
+        python '$__tool_directory__'/merge_insertions.py -i
+        #for $input in $insertions:
+            '$input.element_identifier'
+        #end for
+        -o '$merged_out'
+    ]]></command>
+    <inputs>
+        <param name="insertions" label="TEPID insertions" argument="--input" type="data_collection" format="bed"/>
+    </inputs>
+    <outputs>
+        <data name="merged_out" format="bed" label="tepid_discover merged insertions on ${on_string}">
+        </data>
+    </outputs>
+    <tests>
+        <test>
+      <param name="insertions">
+          <collection type="list">
+              <element name="1" value="1.bed" />
+              <element name="2" value="2.bed" />
+          </collection>
+        </param>
+           <output name="merged_out" file="test.bed"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+        This step merges insertions found by tepid discover.
+        The output can be used as input in the tepid-refine step.
+    ]]></help>
+<expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.bed	Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,10 @@
+2L	4691	4799	2L	20117108	20119062	FBti0060250,FBti0060251	0
+2L	47518	47726	2L	4918232	4923234	FBti0019122	1
+2L	47850	48473	2L	4918232	4923234	FBti0019122	2
+2L	48486	48590	2L	4918232	4923234	FBti0019122	3
+2L	48723	49124	2L	4918232	4923234	FBti0019122	4
+2L	49186	49976	2L	4918232	4923234	FBti0019122	5
+2L	50072	50549	2L	4918232	4923234	FBti0019122	6
+2L	50612	50758	2L	4918232	4923234	FBti0019122	7
+2L	50770	52523	2L	4918232	4923234	FBti0019122	8
+2L	52532	52565	2L	2112167	2119774	FBti0019109	9
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/2.bed	Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,12 @@
+2L	47518	47717	2L	4918232	4923234	FBti0019122	0
+2L	48293	48410	2L	4918232	4923234	FBti0019122	1
+2L	48699	49247	2L	4918232	4923234	FBti0019122	2
+2L	49442	49809	2L	4918232	4923234	FBti0019122	3
+2L	49823	50022	2L	4918232	4923234	FBti0019122	4
+2L	50448	50572	2L	4918232	4923234	FBti0019122	5
+2L	50959	51202	2L	4918232	4923234	FBti0019122	6
+2L	51549	51773	2L	4918232	4923234	FBti0019122	7
+2L	51866	52171	2L	4918232	4923234	FBti0019122	8
+2L	52178	52523	2L	1618308	1618557	FBti0019105	9
+2L	52531	52564	2L	2112167	2119774	FBti0019109	10
+2L	64355	64575	3L	23219851	23224264	FBti0059667,FBti0020284	11
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.bed	Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,13 @@
+2L	4691	4799	2L	20117108	20119062	FBti0060251,FBti0060250	1
+2L	47518	47726	2L	4918232	4923234	FBti0019122	1,2
+2L	47850	48473	2L	4918232	4923234	FBti0019122	1,2
+2L	48699	49247	2L	4918232	4923234	FBti0019122	1,2
+2L	49442	49809	2L	4918232	4923234	FBti0019122	2
+2L	50072	50549	2L	4918232	4923234	FBti0019122	1,2
+2L	50612	50758	2L	4918232	4923234	FBti0019122	1
+2L	50959	51202	2L	4918232	4923234	FBti0019122	2
+2L	51549	51773	2L	4918232	4923234	FBti0019122	2
+2L	51866	52171	2L	4918232	4923234	FBti0019122	2
+2L	52178	52523	2L	1618308	1618557	FBti0019105	2
+2L	52531	52564	2L	2112167	2119774	FBti0019109	1,2
+2L	64355	64575	3L	23219851	23224264	FBti0020284,FBti0059667	2