Mercurial > repos > mvdbeek > dapars
comparison filter_utr.py @ 0:bb84ee2f2137 draft
planemo upload for repository https://github.com/mvdbeek/dapars commit 868f8f2f7ac5d70c39b7d725ff087833b0f24f52-dirty
author | mvdbeek |
---|---|
date | Tue, 27 Oct 2015 10:14:33 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bb84ee2f2137 |
---|---|
1 #!/usr/bin/env python | |
2 # Filter out UTRs and, if a gene has multiple UTRs, return a single UTR with minimum start and maximum end coordinate | |
3 # usage: python filter_utr.py input.gtf output.gtf | |
4 | |
5 import sys | |
6 from collections import OrderedDict | |
7 | |
8 def get_gtf_fields(): | |
9 return [ "chr", "source", "feature", "start", "end", "score", "strand", "frame", "group" ] | |
10 | |
11 def get_feature_dict(line, gtf_fields, utr_dict, feature="UTR"): | |
12 """ | |
13 Return a dictionary with lines of a GTF if the line describes a UTR. | |
14 Key is the first attribute of the group. | |
15 """ | |
16 if line.split("\t")[2] == feature: | |
17 fields = line.strip().split("\t") | |
18 fields[3] = int(fields[3]) | |
19 fields[4] = int(fields[4]) | |
20 gene_id = fields[-1].split("\"")[1] | |
21 if not gene_id in utr_dict: | |
22 utr_dict[gene_id] = [OrderedDict(zip(gtf_fields, fields))] | |
23 else: | |
24 utr_dict[gene_id].append(OrderedDict(zip(gtf_fields, fields))) | |
25 | |
26 def remove_five_prime_utrs(utr_dict, start_codon_dict): | |
27 for gene, features in start_codon_dict.iteritems(): | |
28 is_reverse = features[0]["strand"] == "-" | |
29 if is_reverse: | |
30 stop_codon_end = min([feature["start"] for feature in features]) | |
31 else: | |
32 stop_codon_end = max([feature["end"] for feature in features])# get last start codon | |
33 to_remove = [] | |
34 if gene in utr_dict: | |
35 for utr in utr_dict[gene]: | |
36 start = utr["start"] | |
37 end = utr["end"] | |
38 if is_reverse: | |
39 if end >= stop_codon_end: | |
40 to_remove.append(utr) | |
41 else: | |
42 if start <= stop_codon_end: | |
43 to_remove.append(utr) | |
44 [utr_dict[gene].remove(utr) for utr in to_remove ] | |
45 | |
46 | |
47 def get_longest_utr(utr_dict): | |
48 """ | |
49 Start of the composite utr is the most 5p start, end is the most 3p end. | |
50 """ | |
51 gtf = [] | |
52 for gene_id, values in utr_dict.iteritems(): | |
53 if not values: | |
54 continue | |
55 if len(values) == 1: | |
56 values[0]["start"] = str(values[0]["start"]) | |
57 values[0]["end"] = str(values[0]["end"]) | |
58 gtf.append( "\t".join( values[0].values() ) ) | |
59 else: | |
60 start = min( [fields["start"] for fields in values] ) | |
61 end = max( [fields["end"] for fields in values] ) | |
62 values[0]["start"] = str(start) | |
63 values[0]["end"] = str(end) | |
64 gtf.append( "\t".join( values[0].values() ) ) | |
65 return gtf | |
66 | |
67 def main(): | |
68 utr_dict = OrderedDict() | |
69 start_codon_dict = {} | |
70 header = [] | |
71 gtf_fields = get_gtf_fields() | |
72 with open(sys.argv[1]) as input: | |
73 for line in input: | |
74 if line.startswith("#"): | |
75 header.append( line.strip() ) | |
76 else: | |
77 get_feature_dict(line, gtf_fields, utr_dict, feature="UTR") | |
78 get_feature_dict(line, gtf_fields, start_codon_dict, feature="CDS") | |
79 remove_five_prime_utrs(utr_dict, start_codon_dict) | |
80 gtf = header + get_longest_utr(utr_dict) | |
81 if len(sys.argv) == 3: | |
82 with open(sys.argv[2], "w") as output: | |
83 [output.write(line + "\n") for line in gtf] | |
84 else: | |
85 for line in gtf: | |
86 print(line) | |
87 | |
88 if __name__ == "__main__": | |
89 main() | |
90 | |
91 | |
92 | |
93 | |
94 | |
95 | |
96 |