Mercurial > repos > mvdbeek > dapars
diff filter_utr.py @ 0:bb84ee2f2137 draft
planemo upload for repository https://github.com/mvdbeek/dapars commit 868f8f2f7ac5d70c39b7d725ff087833b0f24f52-dirty
author | mvdbeek |
---|---|
date | Tue, 27 Oct 2015 10:14:33 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_utr.py Tue Oct 27 10:14:33 2015 -0400 @@ -0,0 +1,96 @@ +#!/usr/bin/env python +# Filter out UTRs and, if a gene has multiple UTRs, return a single UTR with minimum start and maximum end coordinate +# usage: python filter_utr.py input.gtf output.gtf + +import sys +from collections import OrderedDict + +def get_gtf_fields(): + return [ "chr", "source", "feature", "start", "end", "score", "strand", "frame", "group" ] + +def get_feature_dict(line, gtf_fields, utr_dict, feature="UTR"): + """ + Return a dictionary with lines of a GTF if the line describes a UTR. + Key is the first attribute of the group. + """ + if line.split("\t")[2] == feature: + fields = line.strip().split("\t") + fields[3] = int(fields[3]) + fields[4] = int(fields[4]) + gene_id = fields[-1].split("\"")[1] + if not gene_id in utr_dict: + utr_dict[gene_id] = [OrderedDict(zip(gtf_fields, fields))] + else: + utr_dict[gene_id].append(OrderedDict(zip(gtf_fields, fields))) + +def remove_five_prime_utrs(utr_dict, start_codon_dict): + for gene, features in start_codon_dict.iteritems(): + is_reverse = features[0]["strand"] == "-" + if is_reverse: + stop_codon_end = min([feature["start"] for feature in features]) + else: + stop_codon_end = max([feature["end"] for feature in features])# get last start codon + to_remove = [] + if gene in utr_dict: + for utr in utr_dict[gene]: + start = utr["start"] + end = utr["end"] + if is_reverse: + if end >= stop_codon_end: + to_remove.append(utr) + else: + if start <= stop_codon_end: + to_remove.append(utr) + [utr_dict[gene].remove(utr) for utr in to_remove ] + + +def get_longest_utr(utr_dict): + """ + Start of the composite utr is the most 5p start, end is the most 3p end. + """ + gtf = [] + for gene_id, values in utr_dict.iteritems(): + if not values: + continue + if len(values) == 1: + values[0]["start"] = str(values[0]["start"]) + values[0]["end"] = str(values[0]["end"]) + gtf.append( "\t".join( values[0].values() ) ) + else: + start = min( [fields["start"] for fields in values] ) + end = max( [fields["end"] for fields in values] ) + values[0]["start"] = str(start) + values[0]["end"] = str(end) + gtf.append( "\t".join( values[0].values() ) ) + return gtf + +def main(): + utr_dict = OrderedDict() + start_codon_dict = {} + header = [] + gtf_fields = get_gtf_fields() + with open(sys.argv[1]) as input: + for line in input: + if line.startswith("#"): + header.append( line.strip() ) + else: + get_feature_dict(line, gtf_fields, utr_dict, feature="UTR") + get_feature_dict(line, gtf_fields, start_codon_dict, feature="CDS") + remove_five_prime_utrs(utr_dict, start_codon_dict) + gtf = header + get_longest_utr(utr_dict) + if len(sys.argv) == 3: + with open(sys.argv[2], "w") as output: + [output.write(line + "\n") for line in gtf] + else: + for line in gtf: + print(line) + +if __name__ == "__main__": + main() + + + + + + +