Mercurial > repos > iuc > virannot_otu
comparison rps2tsv.py @ 0:e924ab4ab00a draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit ab5e1189217b6ed5f1c5d7c5ff6b79b6a4c18cff
| author | iuc |
|---|---|
| date | Wed, 21 Aug 2024 13:13:10 +0000 |
| parents | |
| children | 7e160902125e |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:e924ab4ab00a |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 | |
| 4 # Name: rps2ecsv | |
| 5 # Author: Marie Lefebvre - INRAE | |
| 6 # Aims: Convert rpsblast xml output to csv and add taxonomy | |
| 7 | |
| 8 | |
| 9 import argparse | |
| 10 import json | |
| 11 import logging as log | |
| 12 from urllib import request | |
| 13 from urllib.error import HTTPError, URLError | |
| 14 | |
| 15 from Bio.Blast import NCBIXML | |
| 16 from ete3 import NCBITaxa | |
| 17 | |
| 18 ncbi = NCBITaxa() | |
| 19 | |
| 20 | |
| 21 def main(): | |
| 22 options = _set_options() | |
| 23 _set_log_level(options.verbosity) | |
| 24 hits = _read_xml(options) | |
| 25 _write_tsv(options, hits) | |
| 26 | |
| 27 | |
| 28 def _read_xml(options): | |
| 29 """ | |
| 30 Parse XML RPSblast results file | |
| 31 """ | |
| 32 log.info("Read XML file " + options.xml_file) | |
| 33 xml = open(options.xml_file, 'r') | |
| 34 records = NCBIXML.parse(xml) | |
| 35 xml_results = {} | |
| 36 for blast_record in records: | |
| 37 for aln in blast_record.alignments: | |
| 38 for hit in aln.hsps: | |
| 39 hsp = {} | |
| 40 hit_evalue = hit.expect | |
| 41 if hit_evalue > options.max_evalue: | |
| 42 continue | |
| 43 hit_frame = hit.frame[0] # frame | |
| 44 hit_evalue = hit.expect # evalue | |
| 45 hit_startQ = hit.query_start | |
| 46 hit_endQ = hit.query_end | |
| 47 hsp["frame"] = hit_frame | |
| 48 hsp["evalue"] = hit_evalue | |
| 49 hsp["startQ"] = hit_startQ | |
| 50 hsp["endQ"] = hit_endQ | |
| 51 hsp["query_id"] = blast_record.query_id | |
| 52 hsp["cdd_id"] = aln.hit_def.split(",")[0] | |
| 53 hsp["hit_id"] = aln.hit_id | |
| 54 hsp["query_length"] = blast_record.query_length # length of the query | |
| 55 hsp["description"] = aln.hit_def | |
| 56 hsp["accession"] = aln.accession | |
| 57 hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF") | |
| 58 log.info("Requeting Interpro for " + hsp["pfam_id"]) | |
| 59 url = "https://www.ebi.ac.uk/interpro/api/taxonomy/uniprot/entry/pfam/" + hsp["pfam_id"] | |
| 60 req = request.Request(url) | |
| 61 try: | |
| 62 response = request.urlopen(req) | |
| 63 except HTTPError as e: | |
| 64 log.debug('Http error for interpro: ', e.code) | |
| 65 except URLError as e: | |
| 66 log.debug('Url error for interpro: ', e.reason) | |
| 67 else: | |
| 68 encoded_response = response.read() | |
| 69 decoded_response = encoded_response.decode() | |
| 70 payload = json.loads(decoded_response) | |
| 71 kingdoms = [] | |
| 72 for item in payload["results"][:6]: | |
| 73 if item["metadata"]["parent"] is not None: | |
| 74 lineage_parent = item["metadata"]["parent"] | |
| 75 translation = ncbi.get_taxid_translator([int(lineage_parent)]) | |
| 76 names = list(translation.values()) | |
| 77 if len(names) > 0: | |
| 78 if names[0] == "root": | |
| 79 taxonomy = names[1:] # remove 'root' at the begining | |
| 80 else: | |
| 81 taxonomy = names | |
| 82 else: | |
| 83 taxonomy = names | |
| 84 if len(taxonomy) != 0: | |
| 85 kingdoms.append(taxonomy[0]) | |
| 86 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4} | |
| 87 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True)) | |
| 88 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items()) | |
| 89 hsp["taxonomy"] = concat_freq | |
| 90 xml_results[hsp["query_id"]] = hsp | |
| 91 return xml_results | |
| 92 | |
| 93 | |
| 94 def _write_tsv(options, hits): | |
| 95 """ | |
| 96 Write output | |
| 97 """ | |
| 98 log.info("Write output file " + options.output) | |
| 99 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n" | |
| 100 f = open(options.output, "w+") | |
| 101 f.write(headers) | |
| 102 for h in hits: | |
| 103 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t") | |
| 104 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t") | |
| 105 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t") | |
| 106 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"]) | |
| 107 f.write("\n") | |
| 108 f.close() | |
| 109 | |
| 110 | |
| 111 def _set_options(): | |
| 112 parser = argparse.ArgumentParser() | |
| 113 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') | |
| 114 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') | |
| 115 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output') | |
| 116 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | |
| 117 args = parser.parse_args() | |
| 118 return args | |
| 119 | |
| 120 | |
| 121 def _set_log_level(verbosity): | |
| 122 if verbosity == 1: | |
| 123 log_format = '%(asctime)s %(levelname)-8s %(message)s' | |
| 124 log.basicConfig(level=log.INFO, format=log_format) | |
| 125 elif verbosity == 3: | |
| 126 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | |
| 127 log.basicConfig(level=log.DEBUG, format=log_format) | |
| 128 | |
| 129 | |
| 130 if __name__ == "__main__": | |
| 131 main() |
