annotate merge_deletions.py @ 0:ff6683f8e9a1 draft

planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
author mvdbeek
date Mon, 23 Jan 2017 10:05:02 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
1 #! /usr/bin/env python
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
2
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
3 import os
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
4 from argparse import ArgumentParser
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
5
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
6 def create_master_dict(sample, fname):
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
7 with open(fname, 'r') as masterfile:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
8 x = 0
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
9 master_dict = {}
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
10 for line in masterfile:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
11 field = line.rsplit()
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
12 if not line[0] == 'ins_chr':
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
13 coords = '\t'.join(field[:5])
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
14 master_dict[x] = {'coords': coords, 'accessions': [sample]}
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
15 x += 1
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
16 return master_dict
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
17
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
18
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
19 def merge_deletions(master, fname, sample):
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
20 with open(fname, 'r') as infile:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
21 for line in infile:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
22 field = line.rsplit()
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
23 coords = '\t'.join(field[:5])
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
24 i = len(master)-1
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
25 x = 0
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
26 while x <= i:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
27 if master[x]['coords'] == coords:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
28 master[x]['accessions'].append(sample)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
29 break
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
30 elif x == i:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
31 master[x+1] = {'coords': coords, 'accessions': [sample]}
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
32 break
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
33 else:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
34 x += 1
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
35
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
36
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
37 def save_deletions(master, outf):
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
38 with open(outf, 'w+') as outfile:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
39 for key, value in master.iteritems():
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
40 accessions = set(value['accessions'])
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
41 outfile.write('{c}\t{a}\n'.format(c=value['coords'], a=','.join(accessions)))
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
42
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
43 def get_name_from_filename(filename):
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
44 return os.path.basename(filename).rsplit('.', 1)[0]
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
45
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
46 if __name__ == "__main__":
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
47
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
48 parser = ArgumentParser(description='Merge TE deletions calls')
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
49 parser.add_argument('-o', '--output', help="File to write merged deletions to.", required=True)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
50 parser.add_argument('-i', '--input', help='all files that should be merged', nargs="+", required=True)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
51 options = parser.parse_args()
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
52
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
53 first_file = options.input[0]
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
54 first_samplename = get_name_from_filename(first_file)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
55 master_dictionary = create_master_dict(first_samplename, first_file)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
56 for filename in options.input[1:]:
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
57 samplename = get_name_from_filename(filename)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
58 merge_deletions(master_dictionary, filename, samplename)
ff6683f8e9a1 planemo upload for repository https://github.com/ListerLab/TEPID commit 82fd0448ff5baa9822a388aee78753e4b1cd94d7
mvdbeek
parents:
diff changeset
59 save_deletions(master_dictionary, options.output)