Mercurial > repos > greg > gregs_test_repo
comparison blast2go-7b53cc52e7ed/tools/ncbi_blast_plus/blast2go.py @ 1:5ab17fe9e056
Uploaded
author | greg |
---|---|
date | Tue, 19 Jul 2011 11:33:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:4f07f3a33605 | 1:5ab17fe9e056 |
---|---|
1 #!/usr/bin/env python | |
2 """Galaxy wrapper for Blast2GO for pipelines, b2g4pipe v2.3.5. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * Input BLAST XML filename | |
6 * Blast2GO properties filename (settings file) | |
7 * Output tabular filename | |
8 | |
9 Sadly b2g4pipe v2.3.5 cannot cope with current style large BLAST XML | |
10 files (e.g. from BLAST 2.2.25+), so we have to reformat these to | |
11 avoid it crashing with a Java heap space OutOfMemoryError. | |
12 | |
13 As part of this reformatting, we check for BLASTP or BLASTX output | |
14 (otherwise raise an error), and print the query count. | |
15 | |
16 It then calls the Java command line tool, and moves the output file to | |
17 the location Galaxy is expecting, and removes the tempory XML file. | |
18 """ | |
19 import sys | |
20 import os | |
21 import subprocess | |
22 | |
23 #You may need to edit this to match your local setup, | |
24 blast2go_jar = "/opt/b2g4pipe/blast2go.jar" | |
25 | |
26 | |
27 def stop_err(msg, error_level=1): | |
28 """Print error message to stdout and quit with given error level.""" | |
29 sys.stderr.write("%s\n" % msg) | |
30 sys.exit(error_level) | |
31 | |
32 if len(sys.argv) != 4: | |
33 stop_err("Require three arguments: XML filename, properties filename, output tabular filename") | |
34 | |
35 xml_file, prop_file, tabular_file = sys.argv[1:] | |
36 | |
37 #We should have write access here: | |
38 tmp_xml_file = tabular_file + ".tmp.xml" | |
39 | |
40 if not os.path.isfile(xml_file): | |
41 stop_err("Input BLAST XML file not found: %s" % xml_file) | |
42 | |
43 if not os.path.isfile(prop_file): | |
44 stop_err("Blast2GO configuration file not found: %s" % prop_file) | |
45 | |
46 def prepare_xml(original_xml, mangled_xml): | |
47 """Reformat BLAST XML to suit Blast2GO. | |
48 | |
49 Blast2GO can't cope with 1000s of <Iteration> tags within a | |
50 single <BlastResult> tag, so instead split this into one | |
51 full XML record per interation (i.e. per query). This gives | |
52 a concatenated XML file mimicing old versions of BLAST. | |
53 | |
54 This also checks for BLASTP or BLASTX output, and outputs | |
55 the number of queries. Galaxy will show this as "info". | |
56 """ | |
57 in_handle = open(original_xml) | |
58 footer = " </BlastOutput_iterations>\n</BlastOutput>\n" | |
59 header = "" | |
60 while True: | |
61 line = in_handle.readline() | |
62 if not line: | |
63 #No hits? | |
64 stop_err("Problem with XML file?") | |
65 if line.strip() == "<Iteration>": | |
66 break | |
67 header += line | |
68 | |
69 if "<BlastOutput_program>blastx</BlastOutput_program>" in header: | |
70 print "BLASTX output identified" | |
71 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header: | |
72 print "BLASTP output identified" | |
73 else: | |
74 in_handle.close() | |
75 stop_err("Expect BLASTP or BLASTX output") | |
76 | |
77 out_handle = open(mangled_xml, "w") | |
78 out_handle.write(header) | |
79 out_handle.write(line) | |
80 count = 1 | |
81 while True: | |
82 line = in_handle.readline() | |
83 if not line: | |
84 break | |
85 elif line.strip() == "<Iteration>": | |
86 #Insert footer/header | |
87 out_handle.write(footer) | |
88 out_handle.write(header) | |
89 count += 1 | |
90 out_handle.write(line) | |
91 | |
92 out_handle.close() | |
93 in_handle.close() | |
94 print "Input has %i queries" % count | |
95 | |
96 | |
97 def run(cmd): | |
98 #Avoid using shell=True when we call subprocess to ensure if the Python | |
99 #script is killed, so too is the child process. | |
100 try: | |
101 child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
102 except Exception, err: | |
103 stop_err("Error invoking command:\n%s\n\n%s\n" % (" ".join(cmd), err)) | |
104 #Use .communicate as can get deadlocks with .wait(), | |
105 stdout, stderr = child.communicate() | |
106 return_code = child.returncode | |
107 if return_code: | |
108 cmd_str = " ".join(cmd) | |
109 if stderr and stdout: | |
110 stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, cmd_str, stdout, stderr)) | |
111 else: | |
112 stop_err("Return code %i from command:\n%s\n%s" % (return_code, cmd_str, stderr)) | |
113 #For early diagnostics, | |
114 else: | |
115 print stdout | |
116 print stderr | |
117 | |
118 if not os.path.isfile(blast2go_jar): | |
119 stop_err("Blast2GO JAR file not found: %s" % blast2go_jar) | |
120 | |
121 prepare_xml(xml_file, tmp_xml_file) | |
122 #print "XML file prepared for Blast2GO" | |
123 | |
124 #We will have write access wherever the output should be, | |
125 #so we'll ask Blast2GO to use that as the stem for its output | |
126 #(it will append .annot to the filename) | |
127 cmd = ["java", "-jar", blast2go_jar, | |
128 "-in", tmp_xml_file, | |
129 "-prop", prop_file, | |
130 "-out", tabular_file, #Used as base name for output files | |
131 "-a", # Generate *.annot tabular file | |
132 #"-img", # Generate images, feature not in v2.3.5 | |
133 ] | |
134 #print " ".join(cmd) | |
135 run(cmd) | |
136 | |
137 #Remove the temp XML file | |
138 os.remove(tmp_xml_file) | |
139 | |
140 out_file = tabular_file + ".annot" | |
141 if not os.path.isfile(out_file): | |
142 stop_err("ERROR - No output annotation file from Blast2GO") | |
143 | |
144 #Move the output file where Galaxy expects it to be: | |
145 os.rename(out_file, tabular_file) | |
146 | |
147 print "Done" |