view reprof.py @ 7:fb0936cf5bef draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/reprof commit db55deda35d1623ff48fbea7b3d0690a2cd9cd9f-dirty
author iuc
date Tue, 08 Dec 2015 13:46:13 -0500
parents 141da185be70
children
line wrap: on
line source

#!/usr/bin/env python
import re
import os
import argparse
import subprocess
import tempfile
from Bio import SeqIO
from BCBio import GFF
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

def run_reprof(query_path, modeldir):
    """Run the ``reprof`` secondary-structure predictor and return its report.

    :param query_path: path to a single-sequence FASTA file.
    :param modeldir: directory holding the reprof model files.
    :returns: the text contents of the reprof output report.
    :raises subprocess.CalledProcessError: if reprof exits non-zero.
    """
    outtmp = tempfile.NamedTemporaryFile(delete=False)
    # Close our handle immediately: reprof writes the file itself, and
    # holding the handle open while another process writes the same path
    # is fragile (and fails outright on Windows).
    outtmp.close()
    try:
        cmd = [
            'reprof',
            '-i', query_path,
            '--modeldir=%s' % modeldir,
            '-o', outtmp.name
        ]
        subprocess.check_call(cmd)
        # Re-open in text mode so callers get str (not bytes) to split on '\n'.
        with open(outtmp.name) as handle:
            return handle.read()
    finally:
        # Always remove the temp file, even when reprof fails.
        os.unlink(outtmp.name)

def process_reprof_report(data):
    """Parse a reprof report into one list per output column.

    :param data: full text of a reprof report.
    :returns: dict mapping column name -> list of per-residue values;
        letter-code columns stay strings, the rest are converted to int.
    """
    KEYS = ['idx', 'AA', 'PHEL', 'RI_S', 'pH', 'pE', 'pL', 'PACC', 'PREL', 'P10', 'RI_A', 'Pbe', 'Pbie']
    # Columns holding letter codes rather than numbers.
    TEXT_COLUMNS = frozenset([1, 2, 11, 12])
    tracks = dict((key, []) for key in KEYS)

    for raw in data.split('\n'):
        stripped = raw.strip()
        # Skip comments, the "No ..." header row, and blank lines.
        if not stripped or raw.startswith('#') or raw.startswith('No'):
            continue

        fields = stripped.split('\t')
        for col, key in enumerate(KEYS):
            if col >= len(fields):
                break
            value = fields[col]
            if col not in TEXT_COLUMNS:
                value = int(value)
            tracks[key].append(value)
    return tracks

def storeWigData(idx, data, id, path):
    """Append one variableStep wiggle track for a sequence.

    :param idx: iterable of positions.
    :param data: iterable of values, paired positionally with ``idx``.
    :param id: sequence identifier used as the chrom name.
    :param path: wig file to append to.
    """
    lines = ['variableStep chrom=%s\n' % id]
    lines.extend('%s %s\n' % pair for pair in zip(idx, data))
    with open(path, 'a') as handle:
        handle.writelines(lines)

def storeGff3Data(path, id, positions, values, decodeMap):
    """Append GFF3 features for runs of identical per-residue codes.

    :param path: GFF3 file to append to.
    :param id: sequence identifier for the GFF3 record.
    :param positions: per-residue positions (currently unused; kept for
        interface compatibility with callers).
    :param values: per-residue single-letter codes (e.g. PHEL or Pbe column).
    :param decodeMap: maps an uppercased code letter to a dict with a 'type'
        key (GFF3 feature type) plus extra qualifier entries; codes absent
        from the map are skipped.
    """
    merged = ''.join(values)
    # Run-length encode the code string: http://stackoverflow.com/a/19683549
    regions = [(x[0][0].upper(), len(x[0])) for x in re.findall('((.)(\\2*))', merged)]

    location = 1

    # Placeholder sequence: only the record id matters for the GFF3 output.
    rec = SeqRecord(Seq("ACTG"), id=id)
    for (region_char, region_length) in regions:
        # If we don't wish to decode this region, skip it.
        if region_char not in decodeMap:
            location += region_length
            continue

        region_info = decodeMap[region_char]
        # Create a feature representing this region (0-based start, per GFF lib).
        region_feat = SeqFeature(
            FeatureLocation(location - 1, location - 1 + region_length),
            type=region_info['type'], strand=0,
            # .items() instead of the Python-2-only .iteritems(): identical
            # behavior on Python 2, and it also works on Python 3.
            qualifiers={k: v for (k, v) in region_info.items() if k != 'type'}
        )
        # Update our start location
        location += region_length
        rec.features.append(region_feat)

    with open(path, 'a') as handle:
        GFF.write([rec], handle)

def storeClassData(path, id, phel):
    """Append one structural-class summary row to a TSV report.

    Classifies a protein by its helix/strand content following the usual
    all-alpha / all-beta / alpha-beta thresholds.

    :param path: TSV report file to append to.
    :param id: sequence identifier for the row.
    :param phel: per-residue secondary-structure codes ('H'/'E'/'L');
        must be non-empty or a ZeroDivisionError is raised.
    """
    h = float(phel.count('H')) / float(len(phel))
    e = float(phel.count('E')) / float(len(phel))

    if h > .45 and e < .05:
        classification = 'all-alpha'
    elif h < .05 and e > .45:
        classification = 'all-beta'
    elif h > .3 and e > .2:
        classification = 'alpha-beta'
    else:
        classification = 'mixed'

    # Fix: previously the header was re-appended for every record and the
    # data row had no trailing newline, so multi-record input produced
    # lines like "mixed# id\t...". Write the header once, and terminate
    # each row.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a') as handle:
        if write_header:
            handle.write('# id\t% alpha\t% beta\tclass\n')
        handle.write('{0}\t{1:.2f}\t{2:.2f}\t{3}\n'.format(id, h, e, classification))

def main(fasta, modeldir):
    """Run reprof on each FASTA record and emit wig/GFF3/TSV outputs.

    :param fasta: open handle (or path) to a multi-record FASTA file.
    :param modeldir: directory holding the reprof model files.
    """
    for record in SeqIO.parse(fasta, 'fasta'):
        # reprof needs its query as a single-sequence file on disk.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        SeqIO.write([record], tmp, 'fasta')
        tmp.close()

        try:
            # Run reprof and parse its report into per-column tracks.
            data = process_reprof_report(run_reprof(tmp.name, modeldir))
        finally:
            # Fix: the per-record temp file was never removed (leaked one
            # file per FASTA record).
            os.unlink(tmp.name)

        # One wiggle track file per numeric column.
        for col in ('RI_S', 'P10', 'RI_A', 'PACC', 'PREL', 'pH', 'pE', 'pL'):
            storeWigData(data['idx'], data[col], record.id, col + '.wig')

        # UniProt-style evidence code attached to every generated feature.
        eco = ['{ECO:0000255|reprof_1.0.1}']
        storeGff3Data(
            'secondary_structure.gff3', record.id, data['idx'], data['PHEL'],
            {
                'H': {
                    'type': 'peptide_helix',
                    'description': ['Helix'],
                    'evidence': eco,
                },
                'E': {
                    'type': 'beta_strand',
                    'description': ['Extended/Sheet'],
                    'evidence': eco,
                },
                'L': {
                    'type': 'loop',
                    'description': ['Loop'],
                    'evidence': eco,
                }
            }
        )

        storeGff3Data(
            'solvent_accessibility.gff3', record.id, data['idx'], data['Pbe'],
            {
                'B': {
                    'type': 'experimental_result_region',
                    'description': ['Buried'],
                    'evidence': eco,
                },
                'E': {
                    'type': 'experimental_result_region',
                    'description': ['Exposed'],
                    'evidence': eco,
                },
            }
        )

        storeClassData('report.tsv', record.id, data['PHEL'])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Wrapper for reprof')
    # Fix: `type=file` relies on the Python-2-only `file` builtin;
    # argparse.FileType('r') is the supported equivalent on Python 2 and 3.
    parser.add_argument('fasta', type=argparse.FileType('r'), help='Fasta Input')
    parser.add_argument('modeldir', help='Directory containing reprof model files')
    args = parser.parse_args()

    main(**vars(args))