view filter_utr.py @ 22:4309798bfb61 draft default tip

planemo upload for repository https://github.com/mvdbeek/dapars commit b1b007c561ea6c9db145c88b6b128d66ecd05e24-dirty
author mvdbeek
date Fri, 30 Oct 2015 12:30:22 -0400
parents bb84ee2f2137
children
line wrap: on
line source

#!/usr/bin/env python
# Filter out UTRs and, if a gene has multiple UTRs, return a single UTR with minimum start and maximum end coordinate
# usage: python filter_utr.py input.gtf output.gtf

import sys
from collections import OrderedDict

def get_gtf_fields():
    return [ "chr", "source", "feature", "start", "end", "score", "strand", "frame", "group" ]

def get_feature_dict(line, gtf_fields, utr_dict, feature="UTR"):
    """
    Return a dictionary with lines of a GTF if the line describes a UTR.
    Key is the first attribute of the group.
    """
    if line.split("\t")[2] == feature:
        fields = line.strip().split("\t")
        fields[3] = int(fields[3])
        fields[4] = int(fields[4])
        gene_id = fields[-1].split("\"")[1]
        if not gene_id in utr_dict:
            utr_dict[gene_id] = [OrderedDict(zip(gtf_fields, fields))]
        else:
            utr_dict[gene_id].append(OrderedDict(zip(gtf_fields, fields)))

def remove_five_prime_utrs(utr_dict, start_codon_dict):
    for gene, features in start_codon_dict.iteritems():
        is_reverse = features[0]["strand"] == "-"
        if is_reverse:
            stop_codon_end = min([feature["start"] for feature in features])
        else:
            stop_codon_end = max([feature["end"] for feature in features])# get last start codon
        to_remove = []
        if gene in utr_dict:
            for utr in utr_dict[gene]:
                start = utr["start"]
                end = utr["end"]
                if is_reverse:
                    if end >= stop_codon_end:
                        to_remove.append(utr)
                else:
                    if start <= stop_codon_end:
                        to_remove.append(utr)
            [utr_dict[gene].remove(utr) for utr in to_remove ]


def get_longest_utr(utr_dict):
    """
    Start of the composite utr is the most 5p start, end is the most 3p end.
    """
    gtf = []
    for gene_id, values  in utr_dict.iteritems():
        if not values:
            continue
        if len(values) == 1:
            values[0]["start"] = str(values[0]["start"])
            values[0]["end"] = str(values[0]["end"])
            gtf.append( "\t".join( values[0].values() ) )
        else:
            start = min( [fields["start"] for fields in values] )
            end = max( [fields["end"] for fields in values] )
            values[0]["start"] = str(start)
            values[0]["end"] = str(end)
            gtf.append( "\t".join( values[0].values() ) )
    return gtf

def main():
    utr_dict = OrderedDict()
    start_codon_dict = {}
    header = []
    gtf_fields = get_gtf_fields()
    with open(sys.argv[1]) as input:
        for line in input:
            if line.startswith("#"):
                header.append( line.strip() )
            else:
                get_feature_dict(line, gtf_fields, utr_dict, feature="UTR")
                get_feature_dict(line, gtf_fields, start_codon_dict, feature="CDS")
    remove_five_prime_utrs(utr_dict, start_codon_dict)
    gtf = header + get_longest_utr(utr_dict)
    if len(sys.argv) == 3:
        with open(sys.argv[2], "w") as output:
            [output.write(line + "\n") for line in gtf]
    else:
        for line in gtf:
            print(line)

if __name__ == "__main__":
    main()