Mercurial > repos > devteam > megablast_xml_parser
comparison megablast_xml_parser.py @ 1:3ce5d56297ed draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/megablast_xml_parser commit 3ccddd4e2032535ead030efa401e690ffb80d145"
| author | devteam |
|---|---|
| date | Wed, 09 Sep 2020 10:27:20 +0000 |
| parents | 35ff246876fc |
| children | |
Comparison (legend: equal, deleted, inserted, replaced)
| 0:35ff246876fc | 1:3ce5d56297ed |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | |
| 3 import sys, os, re | |
| 4 | 2 |
| 5 if sys.version_info[:2] >= ( 2, 5 ): | 3 import re |
| 6 import xml.etree.cElementTree as ElementTree | 4 import sys |
| 7 else: | 5 import xml.etree.cElementTree as ElementTree |
| 8 from galaxy import eggs | |
| 9 import pkg_resources; pkg_resources.require( "elementtree" ) | |
| 10 from elementtree import ElementTree | |
| 11 | 6 |
| 12 def stop_err( msg ): | |
| 13 sys.stderr.write( "%s\n" % msg ) | |
| 14 sys.exit() | |
| 15 | 7 |
| 16 def __main__(): | 8 def __main__(): |
| 17 source = sys.argv[1] | 9 source = sys.argv[1] |
| 18 hspTags = [ | 10 hspTags = ["Hsp_bit-score", |
| 19 "Hsp_bit-score", | 11 "Hsp_evalue", |
| 20 "Hsp_evalue", | 12 "Hsp_query-from", |
| 21 "Hsp_query-from", | 13 "Hsp_query-to", |
| 22 "Hsp_query-to", | 14 "Hsp_hit-from", |
| 23 "Hsp_hit-from", | 15 "Hsp_hit-to", |
| 24 "Hsp_hit-to", | 16 "Hsp_query-frame", |
| 25 "Hsp_query-frame", | 17 "Hsp_hit-frame", |
| 26 "Hsp_hit-frame", | 18 "Hsp_identity", |
| 27 "Hsp_identity", | 19 "Hsp_align-len", |
| 28 "Hsp_align-len", | 20 "Hsp_qseq", |
| 29 "Hsp_qseq", | 21 "Hsp_hseq", |
| 30 "Hsp_hseq", | 22 "Hsp_midline"] |
| 31 "Hsp_midline" | |
| 32 ] | |
| 33 hspData = [] | |
| 34 | 23 |
| 35 # get an iterable | 24 # get an iterable |
| 36 try: | 25 try: |
| 37 context = ElementTree.iterparse( source, events=( "start", "end" ) ) | 26 context = ElementTree.iterparse(source, events=("start", "end")) |
| 38 except: | 27 except Exception: |
| 39 stop_err( "Invalid data format." ) | 28 sys.exit("Invalid data format.") |
| 40 # turn it into an iterator | 29 # turn it into an iterator |
| 41 context = iter( context ) | 30 context = iter(context) |
| 42 # get the root element | 31 # get the root element |
| 43 try: | 32 try: |
| 44 event, root = context.next() | 33 event, root = next(context) |
| 45 except: | 34 except Exception: |
| 46 stop_err( "Invalid data format." ) | 35 sys.exit("Invalid data format.") |
| 47 | 36 |
| 48 outfile = open( sys.argv[2], 'w' ) | 37 with open(sys.argv[2], 'w') as outfile: |
| 49 try: | 38 try: |
| 50 for event, elem in context: | 39 for event, elem in context: |
| 51 # for every <Iteration> tag | 40 # for every <Iteration> tag |
| 52 if event == "end" and elem.tag == "Iteration": | 41 if event == "end" and elem.tag == "Iteration": |
| 53 query = elem.findtext( "Iteration_query-def" ) | 42 query = elem.findtext("Iteration_query-def") |
| 54 qLen = elem.findtext( "Iteration_query-len" ) | 43 qLen = elem.findtext("Iteration_query-len") |
| 55 # for every <Hit> within <Iteration> | 44 # for every <Hit> within <Iteration> |
| 56 for hit in elem.findall( "Iteration_hits/Hit" ): | 45 for hit in elem.findall("Iteration_hits/Hit"): |
| 57 subject = hit.findtext( "Hit_id" ) | 46 subject = hit.findtext("Hit_id") |
| 58 if re.search( '^gi', subject ): | 47 if re.search('^gi', subject): |
| 59 subject = subject.split('|')[1] | 48 subject = subject.split('|')[1] |
| 60 sLen = hit.findtext( "Hit_len" ) | 49 sLen = hit.findtext("Hit_len") |
| 61 # for every <Hsp> within <Hit> | 50 # for every <Hsp> within <Hit> |
| 62 for hsp in hit.findall( "Hit_hsps/Hsp" ): | 51 for hsp in hit.findall("Hit_hsps/Hsp"): |
| 63 outfile.write( "%s\t%s\t%s\t%s" % ( query, qLen, subject, sLen ) ) | 52 outfile.write("%s\t%s\t%s\t%s" % (query, qLen, subject, sLen)) |
| 64 for tag in hspTags: | 53 for tag in hspTags: |
| 65 outfile.write("\t%s" %(hsp.findtext( tag ))) | 54 outfile.write("\t%s" % (hsp.findtext(tag))) |
| 66 #hspData.append( hsp.findtext( tag ) ) | 55 outfile.write('\n') |
| 67 #hspData = [] | 56 # prevents ElementTree from growing large datastructure |
| 68 outfile.write('\n') | 57 root.clear() |
| 69 # prevents ElementTree from growing large datastructure | 58 elem.clear() |
| 70 root.clear() | 59 except Exception: |
| 71 elem.clear() | 60 sys.exit("The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1]) |
| 72 except: | |
| 73 outfile.close() | |
| 74 stop_err( "The input data is malformed, or there is more than one dataset in the input file. Error: %s" % sys.exc_info()[1] ) | |
| 75 | 61 |
| 76 outfile.close() | |
| 77 | 62 |
| 78 if __name__ == "__main__": __main__() | 63 if __name__ == "__main__": |
| 64 __main__() |
