comparison rgFastQC.py @ 9:f126b49e93e7 draft

Uploaded
author bgruening
date Thu, 06 Jun 2013 02:38:59 -0400
parents
children
comparison
equal deleted inserted replaced
8:fbc64ecd5295 9:f126b49e93e7
1 """
2 # May 2013 ross added check for bogus gz extension - fastqc gets confused
3 # added sanitizer for user supplied name
4 # removed shell and make cl a sequence for Popen call
5 # ross lazarus August 10 2012 in response to anon insecurity report
6 wrapper for fastqc
7
8 called as
9 <command interpreter="python">
10 rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
11 </command>
12
13
14
15 Current release seems overly intolerant of sam/bam header strangeness
16 Author notified...
17
18
19 """
20 import re
21 import os
22 import sys
23 import subprocess
24 import optparse
25 import shutil
26 import tempfile
27 import zipfile
28 import gzip
29
30 def pathfind(program):
31 """ toolshed path munging isn't so try to work around june 5 2013
32 """
33 def is_exe(fpath):
34 return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
35
36 fpath, fname = os.path.split(program)
37 if fpath:
38 if is_exe(program):
39 return program
40 else:
41 for path in os.environ["PATH"].split(os.pathsep):
42 path = path.strip('"')
43 exe_file = os.path.join(path, program)
44 if is_exe(exe_file):
45 return exe_file
46
47 return None
48
49 class FastQC():
50 """wrapper
51 """
52
53
54 def __init__(self,opts=None):
55 assert opts <> None
56 self.opts = opts
57 fastqcexe = pathfind(opts.executable)
58 assert (fastqcexe != None),'##rgFastQC.py error - cannot find passed fastqc executable %s in path %s' % (opts.executable,os.environ['PATH'])
59 self.fastqcexe = fastqcexe
60
61 def getFileString(self, fpath, outpath):
62 """
63 format a nice file size string
64 """
65 size = ''
66 fp = os.path.join(outpath, fpath)
67 s = fpath
68 if os.path.isfile(fp):
69 n = float(os.path.getsize(fp))
70 if n > 2**20:
71 size = ' (%1.1f MB)' % (n/2**20)
72 elif n > 2**10:
73 size = ' (%1.1f KB)' % (n/2**10)
74 elif n > 0:
75 size = ' (%d B)' % (int(n))
76 s = '%s %s' % (fpath, size)
77 return s
78
79 def run_fastqc(self):
80 """
81 In batch mode fastqc behaves not very nicely - will write to a new folder in
82 the same place as the infile called [infilebasename]_fastqc
83 rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
84 duplication_levels.png fastqc_icon.png per_base_n_content.png per_sequence_gc_content.png summary.txt
85 error.png fastqc_report.html per_base_quality.png per_sequence_quality.png tick.png
86 fastqc_data.txt per_base_gc_content.png per_base_sequence_content.png sequence_length_distribution.png warning.png
87
88 """
89 serr = ''
90 dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
91 sout = open(tlog, 'w')
92 fastq = os.path.basename(self.opts.input)
93 cl = [self.fastqcexe,'--outdir=%s' % self.opts.outputdir]
94 if self.opts.informat in ['sam','bam']:
95 cl.append('--f=%s' % self.opts.informat)
96 if self.opts.contaminants <> None :
97 cl.append('--contaminants=%s' % self.opts.contaminants)
98 # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
99 # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
100 # note this exposes a bug in the EBI_SRA download tool which leaves bogus .gz extensions on uncompressed files
101 # which fastqc helpfully tries to uncompress again - hilarity ensues.
102 # patched may 29 2013 until this is fixed properly
103 infname = self.opts.inputfilename
104 linf = infname.lower()
105 trimext = False
106 if ( linf.endswith('.gz') or linf.endswith('.gzip') ):
107 f = gzip.open(self.opts.input)
108 try:
109 testrow = f.readline()
110 except:
111 trimext = True
112 f.close()
113 elif linf.endswith('bz2'):
114 f = bz2.open(self.opts.input,'rb')
115 try:
116 f.readline()
117 except:
118 trimext = True
119 f.close()
120 elif linf.endswith('.zip'):
121 if not zipfile.is_zipfile(self.opts.input):
122 trimext = True
123 if trimext:
124 infname = os.path.splitext(infname)[0]
125 fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
126 link_name = os.path.join(self.opts.outputdir, fastqinfilename)
127 os.symlink(self.opts.input, link_name)
128 cl.append(link_name)
129 sout.write('# FastQC cl = %s\n' % ' '.join(cl))
130 sout.flush()
131 p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
132 retval = p.wait()
133 sout.close()
134 runlog = open(tlog,'r').readlines()
135 os.unlink(link_name)
136 flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh
137 odpath = None
138 for f in flist:
139 d = os.path.join(self.opts.outputdir,f)
140 if os.path.isdir(d):
141 if d.endswith('_fastqc'):
142 odpath = d
143 hpath = None
144 if odpath <> None:
145 try:
146 hpath = os.path.join(odpath,'fastqc_report.html')
147 rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
148 except:
149 pass
150 if hpath == None:
151 serr = '\n'.join(runlog)
152 res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
153 res += runlog
154 res += ['</pre>\n',
155 'Please read the above for clues<br/>\n',
156 'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
157 'It is also possible that the log shows that fastqc is not installed?<br/>\n',
158 'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
159 'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
160 return res,1,serr
161 self.fix_fastqcimages(odpath)
162 flist = os.listdir(self.opts.outputdir) # these have now been fixed
163 excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
164 flist = [x for x in flist if not x in excludefiles]
165 for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in lastest fastqc code - ugh
166 rep[i] = rep[i].replace('Icons/','')
167 rep[i] = rep[i].replace('Images/','')
168 html = self.fix_fastqc(rep,flist,runlog)
169 return html,retval,serr
170
171
172
173 def fix_fastqc(self,rep=[],flist=[],runlog=[]):
174 """ add some of our stuff to the html
175 """
176 bodyindex = len(rep) -1 # hope they don't change this
177 footrow = bodyindex - 1
178 footer = rep[footrow]
179 rep = rep[:footrow] + rep[footrow+1:]
180 res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
181 flist.sort()
182 for i,f in enumerate(flist):
183 if not(os.path.isdir(f)):
184 fn = os.path.split(f)[-1]
185 res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,self.getFileString(fn, self.opts.outputdir)))
186 res.append('</table>\n')
187 res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
188 res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n</div>')
189 res.append(footer)
190 fixed = rep[:bodyindex] + res + rep[bodyindex:]
191 return fixed # with our additions
192
193
194 def fix_fastqcimages(self,odpath):
195 """ Galaxy wants everything in the same files_dir
196 """
197 icpath = os.path.join(odpath,'Icons')
198 impath = os.path.join(odpath,'Images')
199 for adir in [icpath,impath,odpath]:
200 if os.path.exists(adir):
201 flist = os.listdir(adir) # get all files created
202 for f in flist:
203 if not os.path.isdir(os.path.join(adir,f)):
204 sauce = os.path.join(adir,f)
205 dest = os.path.join(self.opts.outputdir,f)
206 shutil.move(sauce,dest)
207 os.rmdir(adir)
208
209
210 if __name__ == '__main__':
211 op = optparse.OptionParser()
212 op.add_option('-i', '--input', default=None)
213 op.add_option('-j', '--inputfilename', default=None)
214 op.add_option('-o', '--htmloutput', default=None)
215 op.add_option('-d', '--outputdir', default="/tmp/shortread")
216 op.add_option('-f', '--informat', default='fastq')
217 op.add_option('-n', '--namejob', default='rgFastQC')
218 op.add_option('-c', '--contaminants', default=None)
219 op.add_option('-e', '--executable', default='fastqc')
220 opts, args = op.parse_args()
221 assert opts.input <> None
222 if not os.path.exists(opts.outputdir):
223 os.makedirs(opts.outputdir)
224 f = FastQC(opts)
225 html,retval,serr = f.run_fastqc()
226 f = open(opts.htmloutput, 'w')
227 f.write(''.join(html))
228 f.close()
229 if retval <> 0:
230 print >> sys.stderr, serr # indicate failure
231
232
233