comparison merge_insertions.py @ 0:6e4b5319cb89 draft default tip

planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
author mvdbeek
date Mon, 23 Jan 2017 10:05:12 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e4b5319cb89
1 from argparse import ArgumentParser
2 import os
3 import tempfile
4 import pandas as pd
5 import pybedtools
6
7 COLUMNS = ['ins_chrom',
8 'ins_start',
9 'ins_end',
10 'ref_chrom',
11 'ref_start',
12 'ref_end',
13 'agi',
14 'accession',
15 'cluster']
16
17 def overlap(start1, stop1, start2, stop2, d=50):
18 """returns True if sets of coordinates overlap. Assumes coordinates are on same chromosome"""
19 return start1 <= stop2+d and stop1 >= start2-d
20
21
22 def merge(i, insertion, result):
23 if len(result) == 0:
24 result[i] = insertion
25 else:
26 if not can_merge(insertion, result):
27 result[i] = insertion
28
29
30 def can_merge(insertion, result):
31 """
32 Merges insertions and returns True if all requirements are met
33 """
34 for j, master_insertion in result.items():
35 if insertion['agi'] & master_insertion['agi']:
36 if overlap(master_insertion['ins_start'], master_insertion['ins_end'], insertion['ins_start'],insertion['ins_end']):
37 # Adjusting the insertion start (doesn't really do anything?!)
38 if len(insertion['agi']) < len(master_insertion['agi']):
39 ref_start = master_insertion['ref_start']
40 else:
41 ref_start = insertion['ref_start']
42 if master_insertion['ins_chrom'] == insertion['ins_chrom'] and insertion['ref_chrom'] == master_insertion['ref_chrom'] and ref_start == master_insertion['ref_start']:
43 result[j]['accession'] = result[j]['accession'] | (insertion['accession'])
44 result[j]['agi'] = result[j]['agi'] | (insertion['agi'])
45 return True
46 return False
47
48
49 def inner_merge(s):
50 result = {}
51 for i, insertion in s.items():
52 merge(i, insertion, result)
53 return result.values()
54
55
56 def reduce_and_cluster(inputfiles):
57 """
58 Read in inputfiles using pandas, write additional column with sample identifier,
59 sort and cluster using pybedtools and return dataframe.
60 """
61 usecols = [0,1,2,3,4,5,6] # skip col 7, which contains the read support id
62 tables = [pd.read_table(f, header=None) for f in inputfiles]
63 sample_ids = [os.path.basename(f).rsplit('.')[0] for f in inputfiles]
64 for sample_id, df in zip(sample_ids, tables):
65 df[7] = sample_id
66 merged_table = pd.concat(tables)
67 tfile = tempfile.NamedTemporaryFile()
68 merged_table.to_csv(tfile, sep="\t", header=None, index=False)
69 tfile.flush()
70 bedfile = pybedtools.BedTool(tfile.name).sort().cluster(d=50)
71 df = bedfile.to_dataframe()
72 df.columns = COLUMNS
73 # Split comma separated agi values and make set
74 df['agi'] = [set(v.split(',')) for v in df['agi'].values]
75 df['accession'] = [set(str(v).split(',')) for v in df['accession'].values]
76 return df
77
78
79 def split_clusters(df):
80 """
81 clusters as defined by bedtools allow for 50 nt distance. This means that
82 clusters can be many kb large, so we check each individual insertion in
83 the cluster against the other insertions. We split the clusters based on
84 whether the overlap and TE identity criteria are fulfilled (so a
85 different TE would lead to a split in the clusters)
86 """
87 groups = df.groupby('cluster')
88 nested_list = [inner_merge(group.transpose().to_dict()) for _, group in groups]
89 return pd.DataFrame([i for n in nested_list for i in n])[COLUMNS]
90
91
92 def write_output(df, output):
93 # Turn sets back to comma-separated values
94 df['agi'] = [",".join(agi) for agi in df['agi']]
95 df['accession'] = [",".join(acc) for acc in df['accession']]
96 # write out result without last column
97 df.to_csv(output, sep="\t",header=None, index=None, columns=COLUMNS[:-1])
98
99
100 def main(inputfiles, output):
101 df = reduce_and_cluster(inputfiles)
102 df = split_clusters(df)
103 write_output(df, output)
104
105
106 if __name__ == "__main__":
107 parser = ArgumentParser(description='Merge TE insertions calls')
108 parser.add_argument('-o', '--output', help='output file', required=True)
109 parser.add_argument('-i', '--input', help='Insertion files to merge', nargs="+", required=True)
110 options = parser.parse_args()
111
112 main(inputfiles=options.input, output=options.output)