comparison filter_utr.py @ 0:bb84ee2f2137 draft

planemo upload for repository https://github.com/mvdbeek/dapars commit 868f8f2f7ac5d70c39b7d725ff087833b0f24f52-dirty
author mvdbeek
date Tue, 27 Oct 2015 10:14:33 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:bb84ee2f2137
1 #!/usr/bin/env python
2 # Filter out UTRs and, if a gene has multiple UTRs, return a single UTR with minimum start and maximum end coordinate
3 # usage: python filter_utr.py input.gtf output.gtf
4
5 import sys
6 from collections import OrderedDict
7
8 def get_gtf_fields():
9 return [ "chr", "source", "feature", "start", "end", "score", "strand", "frame", "group" ]
10
11 def get_feature_dict(line, gtf_fields, utr_dict, feature="UTR"):
12 """
13 Return a dictionary with lines of a GTF if the line describes a UTR.
14 Key is the first attribute of the group.
15 """
16 if line.split("\t")[2] == feature:
17 fields = line.strip().split("\t")
18 fields[3] = int(fields[3])
19 fields[4] = int(fields[4])
20 gene_id = fields[-1].split("\"")[1]
21 if not gene_id in utr_dict:
22 utr_dict[gene_id] = [OrderedDict(zip(gtf_fields, fields))]
23 else:
24 utr_dict[gene_id].append(OrderedDict(zip(gtf_fields, fields)))
25
26 def remove_five_prime_utrs(utr_dict, start_codon_dict):
27 for gene, features in start_codon_dict.iteritems():
28 is_reverse = features[0]["strand"] == "-"
29 if is_reverse:
30 stop_codon_end = min([feature["start"] for feature in features])
31 else:
32 stop_codon_end = max([feature["end"] for feature in features])# get last start codon
33 to_remove = []
34 if gene in utr_dict:
35 for utr in utr_dict[gene]:
36 start = utr["start"]
37 end = utr["end"]
38 if is_reverse:
39 if end >= stop_codon_end:
40 to_remove.append(utr)
41 else:
42 if start <= stop_codon_end:
43 to_remove.append(utr)
44 [utr_dict[gene].remove(utr) for utr in to_remove ]
45
46
47 def get_longest_utr(utr_dict):
48 """
49 Start of the composite utr is the most 5p start, end is the most 3p end.
50 """
51 gtf = []
52 for gene_id, values in utr_dict.iteritems():
53 if not values:
54 continue
55 if len(values) == 1:
56 values[0]["start"] = str(values[0]["start"])
57 values[0]["end"] = str(values[0]["end"])
58 gtf.append( "\t".join( values[0].values() ) )
59 else:
60 start = min( [fields["start"] for fields in values] )
61 end = max( [fields["end"] for fields in values] )
62 values[0]["start"] = str(start)
63 values[0]["end"] = str(end)
64 gtf.append( "\t".join( values[0].values() ) )
65 return gtf
66
67 def main():
68 utr_dict = OrderedDict()
69 start_codon_dict = {}
70 header = []
71 gtf_fields = get_gtf_fields()
72 with open(sys.argv[1]) as input:
73 for line in input:
74 if line.startswith("#"):
75 header.append( line.strip() )
76 else:
77 get_feature_dict(line, gtf_fields, utr_dict, feature="UTR")
78 get_feature_dict(line, gtf_fields, start_codon_dict, feature="CDS")
79 remove_five_prime_utrs(utr_dict, start_codon_dict)
80 gtf = header + get_longest_utr(utr_dict)
81 if len(sys.argv) == 3:
82 with open(sys.argv[2], "w") as output:
83 [output.write(line + "\n") for line in gtf]
84 else:
85 for line in gtf:
86 print(line)
87
88 if __name__ == "__main__":
89 main()
90
91
92
93
94
95
96