Mercurial > repos > mvdbeek > tepid_merge_insertions
changeset 0:6e4b5319cb89 draft default tip
planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
author | mvdbeek |
---|---|
date | Mon, 23 Jan 2017 10:05:12 -0500 |
parents | |
children | |
files | macros.xml merge_insertions.py tepid_merge_insertions.xml test-data/1.bed test-data/2.bed test-data/test.bed |
diffstat | 6 files changed, 217 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Jan 23 10:05:12 2017 -0500 @@ -0,0 +1,29 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="0.8.0">tepid</requirement> + <yield/> + </requirements> + </xml> + <token name="@WRAPPER_VERSION@">0.8.0</token> + <token name="@REFERENCES@"> +<![CDATA[ +------ +This tool is part of the `TEPID`_ pipeline from the `Lister laboratory`_. +.. _TEPID package: https://github.com/ListerLab/TEPID +.. _Lister laboratory http://listerlab.org/ +]]> + </token> + <token name="@PROC@">-p \$GALAXY_SLOTS</token> + <token name="@NAME@">-n '$bowtie2_bam.element_identifier'</token> + <token name="@LINK_CONC@"><![CDATA[ + ln -f -s '$bowtie2_bam' conc.bam && + ln -f -s '$bowtie2_bam.metadata.bam_index' conc.bam.bai && + ]]></token> + <xml name="citations"> + <citations> + <citation type="doi">10.7554/eLife.20777</citation> + <yield /> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/merge_insertions.py Mon Jan 23 10:05:12 2017 -0500 @@ -0,0 +1,112 @@ +from argparse import ArgumentParser +import os +import tempfile +import pandas as pd +import pybedtools + +COLUMNS = ['ins_chrom', + 'ins_start', + 'ins_end', + 'ref_chrom', + 'ref_start', + 'ref_end', + 'agi', + 'accession', + 'cluster'] + +def overlap(start1, stop1, start2, stop2, d=50): + """returns True if sets of coordinates overlap. Assumes coordinates are on same chromosome""" + return start1 <= stop2+d and stop1 >= start2-d + + +def merge(i, insertion, result): + if len(result) == 0: + result[i] = insertion + else: + if not can_merge(insertion, result): + result[i] = insertion + + +def can_merge(insertion, result): + """ + Merges insertions and returns True if all requirements are met + """ + for j, master_insertion in result.items(): + if insertion['agi'] & master_insertion['agi']: + if overlap(master_insertion['ins_start'], master_insertion['ins_end'], insertion['ins_start'],insertion['ins_end']): + # Adjusting the insertion start (doesn't really do anything?!) + if len(insertion['agi']) < len(master_insertion['agi']): + ref_start = master_insertion['ref_start'] + else: + ref_start = insertion['ref_start'] + if master_insertion['ins_chrom'] == insertion['ins_chrom'] and insertion['ref_chrom'] == master_insertion['ref_chrom'] and ref_start == master_insertion['ref_start']: + result[j]['accession'] = result[j]['accession'] | (insertion['accession']) + result[j]['agi'] = result[j]['agi'] | (insertion['agi']) + return True + return False + + +def inner_merge(s): + result = {} + for i, insertion in s.items(): + merge(i, insertion, result) + return result.values() + + +def reduce_and_cluster(inputfiles): + """ + Read in inputfiles using pandas, write additional column with sample identifier, + sort and cluster using pybedtools and return dataframe. + """ + usecols = [0,1,2,3,4,5,6] # skip col 7, which contains the read support id + tables = [pd.read_table(f, header=None) for f in inputfiles] + sample_ids = [os.path.basename(f).rsplit('.')[0] for f in inputfiles] + for sample_id, df in zip(sample_ids, tables): + df[7] = sample_id + merged_table = pd.concat(tables) + tfile = tempfile.NamedTemporaryFile() + merged_table.to_csv(tfile, sep="\t", header=None, index=False) + tfile.flush() + bedfile = pybedtools.BedTool(tfile.name).sort().cluster(d=50) + df = bedfile.to_dataframe() + df.columns = COLUMNS + # Split comma separated agi values and make set + df['agi'] = [set(v.split(',')) for v in df['agi'].values] + df['accession'] = [set(str(v).split(',')) for v in df['accession'].values] + return df + + +def split_clusters(df): + """ + clusters as defined by bedtools allow for 50 nt distance. This means that + clusters can be many kb large, so we check each individual insertion in + the cluster against the other insertions. We split the clusters based on + whether the overlap and TE identity criteria are fulfilled (so a + different TE would lead to a split in the clusters) + """ + groups = df.groupby('cluster') + nested_list = [inner_merge(group.transpose().to_dict()) for _, group in groups] + return pd.DataFrame([i for n in nested_list for i in n])[COLUMNS] + + +def write_output(df, output): + # Turn sets back to comma-separated values + df['agi'] = [",".join(agi) for agi in df['agi']] + df['accession'] = [",".join(acc) for acc in df['accession']] + # write out result without last column + df.to_csv(output, sep="\t",header=None, index=None, columns=COLUMNS[:-1]) + + +def main(inputfiles, output): + df = reduce_and_cluster(inputfiles) + df = split_clusters(df) + write_output(df, output) + + +if __name__ == "__main__": + parser = ArgumentParser(description='Merge TE insertions calls') + parser.add_argument('-o', '--output', help='output file', required=True) + parser.add_argument('-i', '--input', help='Insertion files to merge', nargs="+", required=True) + options = parser.parse_args() + + main(inputfiles=options.input, output=options.output)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tepid_merge_insertions.xml Mon Jan 23 10:05:12 2017 -0500 @@ -0,0 +1,41 @@ +<tool id="tepid_merge_insertions" name="tepid-merge-insertions" version="@WRAPPER_VERSION@"> + <description>discover TE variants</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <version_command>tepid-discover --version</version_command> + <command detect_errors="exit_code"><![CDATA[ + #for $input in $insertions: + ln -s -f '$input' '$input.element_identifier' && + #end for + python '$__tool_directory__'/merge_insertions.py -i + #for $input in $insertions: + '$input.element_identifier' + #end for + -o '$merged_out' + ]]></command> + <inputs> + <param name="insertions" label="TEPID insertions" argument="--input" type="data_collection" format="bed"/> + </inputs> + <outputs> + <data name="merged_out" format="bed" label="tepid_discover merged insertions on ${on_string}"> + </data> + </outputs> + <tests> + <test> + <param name="insertions"> + <collection type="list"> + <element name="1" value="1.bed" /> + <element name="2" value="2.bed" /> + </collection> + </param> + <output name="merged_out" file="test.bed"/> + </test> + </tests> + <help><![CDATA[ + This step merges insertions found by tepid discover. + The output can be used as input in the tepid-refine step. + ]]></help> +<expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/1.bed Mon Jan 23 10:05:12 2017 -0500 @@ -0,0 +1,10 @@ +2L 4691 4799 2L 20117108 20119062 FBti0060250,FBti0060251 0 +2L 47518 47726 2L 4918232 4923234 FBti0019122 1 +2L 47850 48473 2L 4918232 4923234 FBti0019122 2 +2L 48486 48590 2L 4918232 4923234 FBti0019122 3 +2L 48723 49124 2L 4918232 4923234 FBti0019122 4 +2L 49186 49976 2L 4918232 4923234 FBti0019122 5 +2L 50072 50549 2L 4918232 4923234 FBti0019122 6 +2L 50612 50758 2L 4918232 4923234 FBti0019122 7 +2L 50770 52523 2L 4918232 4923234 FBti0019122 8 +2L 52532 52565 2L 2112167 2119774 FBti0019109 9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/2.bed Mon Jan 23 10:05:12 2017 -0500 @@ -0,0 +1,12 @@ +2L 47518 47717 2L 4918232 4923234 FBti0019122 0 +2L 48293 48410 2L 4918232 4923234 FBti0019122 1 +2L 48699 49247 2L 4918232 4923234 FBti0019122 2 +2L 49442 49809 2L 4918232 4923234 FBti0019122 3 +2L 49823 50022 2L 4918232 4923234 FBti0019122 4 +2L 50448 50572 2L 4918232 4923234 FBti0019122 5 +2L 50959 51202 2L 4918232 4923234 FBti0019122 6 +2L 51549 51773 2L 4918232 4923234 FBti0019122 7 +2L 51866 52171 2L 4918232 4923234 FBti0019122 8 +2L 52178 52523 2L 1618308 1618557 FBti0019105 9 +2L 52531 52564 2L 2112167 2119774 FBti0019109 10 +2L 64355 64575 3L 23219851 23224264 FBti0059667,FBti0020284 11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.bed Mon Jan 23 10:05:12 2017 -0500 @@ -0,0 +1,13 @@ +2L 4691 4799 2L 20117108 20119062 FBti0060251,FBti0060250 1 +2L 47518 47726 2L 4918232 4923234 FBti0019122 1,2 +2L 47850 48473 2L 4918232 4923234 FBti0019122 1,2 +2L 48699 49247 2L 4918232 4923234 FBti0019122 1,2 +2L 49442 49809 2L 4918232 4923234 FBti0019122 2 +2L 50072 50549 2L 4918232 4923234 FBti0019122 1,2 +2L 50612 50758 2L 4918232 4923234 FBti0019122 1 +2L 50959 51202 2L 4918232 4923234 FBti0019122 2 +2L 51549 51773 2L 4918232 4923234 FBti0019122 2 +2L 51866 52171 2L 4918232 4923234 FBti0019122 2 +2L 52178 52523 2L 1618308 1618557 FBti0019105 2 +2L 52531 52564 2L 2112167 2119774 FBti0019109 1,2 +2L 64355 64575 3L 23219851 23224264 FBti0020284,FBti0059667 2