# HG changeset patch
# User mvdbeek
# Date 1485183912 18000
# Node ID 6e4b5319cb897dc81ce954450a4076929e0f2acb
planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
diff -r 000000000000 -r 6e4b5319cb89 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,29 @@
+
+
+
+ tepid
+
+
+
+ 0.8.0
+
+
+
+ -p \$GALAXY_SLOTS
+ -n '$bowtie2_bam.element_identifier'
+
+
+
+ 10.7554/eLife.20777
+
+
+
+
diff -r 000000000000 -r 6e4b5319cb89 merge_insertions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/merge_insertions.py Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,112 @@
+from argparse import ArgumentParser
+import os
+import tempfile
+import pandas as pd
+import pybedtools
+
+COLUMNS = ['ins_chrom',
+           'ins_start',
+           'ins_end',
+           'ref_chrom',
+           'ref_start',
+           'ref_end',
+           'agi',
+           'accession',
+           'cluster']
+
+def overlap(start1, stop1, start2, stop2, d=50):
+    """Return True if the two coordinate ranges overlap within distance d. Assumes both ranges are on the same chromosome."""
+    return start1 <= stop2 + d and stop1 >= start2 - d
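+# Illustrative example (made-up coordinates): with the default 50 nt slack,
+# ranges up to 50 nt apart still count as overlapping:
+#   overlap(100, 200, 240, 300)  ->  True   (gap of 40 nt <= 50)
+#   overlap(100, 200, 260, 300)  ->  False  (gap of 60 nt >  50)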
+
+
+def merge(i, insertion, result):
+    """Add insertion to result as a new entry unless it can be merged into an existing one."""
+    if len(result) == 0:
+        result[i] = insertion
+    elif not can_merge(insertion, result):
+        result[i] = insertion
+
+
+def can_merge(insertion, result):
+    """
+    Try to merge insertion into an existing entry of result.
+    Returns True if the overlap and TE identity requirements are met and the
+    entry was updated, False otherwise.
+    """
+    for j, master_insertion in result.items():
+        if insertion['agi'] & master_insertion['agi']:
+            if overlap(master_insertion['ins_start'], master_insertion['ins_end'], insertion['ins_start'], insertion['ins_end']):
+                # Choose which ref_start to use in the comparison below (questionable whether this has any effect)
+                if len(insertion['agi']) < len(master_insertion['agi']):
+                    ref_start = master_insertion['ref_start']
+                else:
+                    ref_start = insertion['ref_start']
+                if (master_insertion['ins_chrom'] == insertion['ins_chrom']
+                        and insertion['ref_chrom'] == master_insertion['ref_chrom']
+                        and ref_start == master_insertion['ref_start']):
+                    result[j]['accession'] = result[j]['accession'] | insertion['accession']
+                    result[j]['agi'] = result[j]['agi'] | insertion['agi']
+                    return True
+    return False
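+# Illustrative example, taken from the bundled test data: 1.bed and 2.bed both
+# report an FBti0019122 insertion at 2L:47518 with reference coordinates
+# 2L:4918232-4923234. The two calls satisfy the overlap and TE identity checks
+# above, so they collapse into a single entry whose 'accession' set becomes
+# {'1', '2'} (written out as "1,2", as in test.bed).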
+
+
+def inner_merge(s):
+    """Merge the insertions of a single cluster (s is a dict of row dicts) and return the remaining entries."""
+    result = {}
+    for i, insertion in s.items():
+        merge(i, insertion, result)
+    return result.values()
+
+
+def reduce_and_cluster(inputfiles):
+    """
+    Read the input files with pandas, add a column holding the sample identifier,
+    then sort and cluster the combined calls with pybedtools and return a dataframe.
+    """
+    usecols = [0, 1, 2, 3, 4, 5, 6]  # skip col 7, which contains the read support id
+    tables = [pd.read_table(f, header=None, usecols=usecols) for f in inputfiles]
+    sample_ids = [os.path.basename(f).rsplit('.')[0] for f in inputfiles]
+    for sample_id, df in zip(sample_ids, tables):
+        df[7] = sample_id
+    merged_table = pd.concat(tables)
+    tfile = tempfile.NamedTemporaryFile(mode='w')
+    merged_table.to_csv(tfile, sep="\t", header=False, index=False)
+    tfile.flush()
+    bedfile = pybedtools.BedTool(tfile.name).sort().cluster(d=50)
+    df = bedfile.to_dataframe()
+    df.columns = COLUMNS
+    # Split comma-separated agi values and turn them into sets
+    df['agi'] = [set(v.split(',')) for v in df['agi'].values]
+    df['accession'] = [set(str(v).split(',')) for v in df['accession'].values]
+    return df
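+# For reference, each input line is expected to look like the bundled test data,
+# i.e. seven tab-separated coordinate/TE columns plus a trailing id column, e.g. (from 1.bed):
+#   2L  47518  47726  2L  4918232  4923234  FBti0019122  1
+# After clustering, the dataframe columns follow COLUMNS, with 'accession'
+# holding the sample identifiers and 'cluster' the bedtools cluster number.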
+
+
+def split_clusters(df):
+    """
+    Clusters as defined by bedtools allow a distance of up to 50 nt between
+    neighbouring calls, so a single cluster can grow to span many kilobases.
+    We therefore check each insertion in a cluster against the others and
+    split the cluster wherever the overlap and TE identity criteria are not
+    fulfilled (so a different TE leads to a split of the cluster).
+    """
+    groups = df.groupby('cluster')
+    nested_list = [inner_merge(group.transpose().to_dict()) for _, group in groups]
+    return pd.DataFrame([i for n in nested_list for i in n])[COLUMNS]
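+# Illustrative example from the bundled test data: in 2.bed the calls at
+# 2L:52178-52523 (FBti0019105) and 2L:52531-52564 (FBti0019109) are only 8 nt
+# apart, so bedtools places them in one cluster, but their TE ids do not
+# intersect and they stay separate rows in the merged output (see test.bed).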
+
+
+def write_output(df, output):
+    # Turn the sets back into comma-separated values
+    df['agi'] = [",".join(agi) for agi in df['agi']]
+    df['accession'] = [",".join(acc) for acc in df['accession']]
+    # Write out the result without the last ('cluster') column
+    df.to_csv(output, sep="\t", header=False, index=False, columns=COLUMNS[:-1])
+
+
+def main(inputfiles, output):
+    df = reduce_and_cluster(inputfiles)
+    df = split_clusters(df)
+    write_output(df, output)
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description='Merge TE insertion calls')
+    parser.add_argument('-o', '--output', help='output file', required=True)
+    parser.add_argument('-i', '--input', help='Insertion files to merge', nargs="+", required=True)
+    options = parser.parse_args()
+
+    main(inputfiles=options.input, output=options.output)
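+# Example invocation (illustrative), using the bundled test data:
+#   python merge_insertions.py -i 1.bed 2.bed -o merged.bed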
diff -r 000000000000 -r 6e4b5319cb89 tepid_merge_insertions.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tepid_merge_insertions.xml Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,41 @@
+
+ discover TE variants
+
+ macros.xml
+
+
+ tepid-discover --version
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r 6e4b5319cb89 test-data/1.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.bed Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,10 @@
+2L 4691 4799 2L 20117108 20119062 FBti0060250,FBti0060251 0
+2L 47518 47726 2L 4918232 4923234 FBti0019122 1
+2L 47850 48473 2L 4918232 4923234 FBti0019122 2
+2L 48486 48590 2L 4918232 4923234 FBti0019122 3
+2L 48723 49124 2L 4918232 4923234 FBti0019122 4
+2L 49186 49976 2L 4918232 4923234 FBti0019122 5
+2L 50072 50549 2L 4918232 4923234 FBti0019122 6
+2L 50612 50758 2L 4918232 4923234 FBti0019122 7
+2L 50770 52523 2L 4918232 4923234 FBti0019122 8
+2L 52532 52565 2L 2112167 2119774 FBti0019109 9
diff -r 000000000000 -r 6e4b5319cb89 test-data/2.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/2.bed Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,12 @@
+2L 47518 47717 2L 4918232 4923234 FBti0019122 0
+2L 48293 48410 2L 4918232 4923234 FBti0019122 1
+2L 48699 49247 2L 4918232 4923234 FBti0019122 2
+2L 49442 49809 2L 4918232 4923234 FBti0019122 3
+2L 49823 50022 2L 4918232 4923234 FBti0019122 4
+2L 50448 50572 2L 4918232 4923234 FBti0019122 5
+2L 50959 51202 2L 4918232 4923234 FBti0019122 6
+2L 51549 51773 2L 4918232 4923234 FBti0019122 7
+2L 51866 52171 2L 4918232 4923234 FBti0019122 8
+2L 52178 52523 2L 1618308 1618557 FBti0019105 9
+2L 52531 52564 2L 2112167 2119774 FBti0019109 10
+2L 64355 64575 3L 23219851 23224264 FBti0059667,FBti0020284 11
diff -r 000000000000 -r 6e4b5319cb89 test-data/test.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.bed Mon Jan 23 10:05:12 2017 -0500
@@ -0,0 +1,13 @@
+2L 4691 4799 2L 20117108 20119062 FBti0060251,FBti0060250 1
+2L 47518 47726 2L 4918232 4923234 FBti0019122 1,2
+2L 47850 48473 2L 4918232 4923234 FBti0019122 1,2
+2L 48699 49247 2L 4918232 4923234 FBti0019122 1,2
+2L 49442 49809 2L 4918232 4923234 FBti0019122 2
+2L 50072 50549 2L 4918232 4923234 FBti0019122 1,2
+2L 50612 50758 2L 4918232 4923234 FBti0019122 1
+2L 50959 51202 2L 4918232 4923234 FBti0019122 2
+2L 51549 51773 2L 4918232 4923234 FBti0019122 2
+2L 51866 52171 2L 4918232 4923234 FBti0019122 2
+2L 52178 52523 2L 1618308 1618557 FBti0019105 2
+2L 52531 52564 2L 2112167 2119774 FBti0019109 1,2
+2L 64355 64575 3L 23219851 23224264 FBti0020284,FBti0059667 2