comparison FastQC/rgFastQC.py @ 0:42251cbdeeac draft

Initial commit of test for FastQC with installation of the java stuff
author fubar
date Mon, 03 Jun 2013 20:30:24 -0400
parents
children 91cb2603b56c
comparison
equal deleted inserted replaced
-1:000000000000 0:42251cbdeeac
1 """
2 # May 2013 ross added check for bogus gz extension - fastqc gets confused
3 # added sanitizer for user supplied name
4 # removed shell and make cl a sequence for Popen call
5 # ross lazarus August 10 2012 in response to anon insecurity report
6 wrapper for fastqc
7
8 called as
9 <command interpreter="python">
10 rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
11 </command>
12
13
14
15 Current release seems overly intolerant of sam/bam header strangeness
16 Author notified...
17
18
19 """
20 import re
21 import os
22 import sys
23 import subprocess
24 import optparse
25 import shutil
26 import tempfile
27 from rgutils import getFileString
28 import zipfile
29 import gzip
30
class FastQC(object):
    """Run the FastQC executable and repackage its report for a Galaxy output.

    The instance is driven entirely by ``opts``, an options object that must
    provide the attributes ``input``, ``inputfilename``, ``outputdir``,
    ``informat``, ``contaminants`` and ``executable``.
    """

    def __init__(self, opts=None):
        """Store the parsed options; ``opts`` is mandatory despite the default."""
        assert opts is not None
        self.opts = opts

    @staticmethod
    def _can_read_first_line(handle):
        """Return True if one line can be read from ``handle``; always close it."""
        try:
            handle.readline()
        except Exception:
            # Deliberate best-effort probe: any failure to decompress means
            # the content does not match the extension.
            return False
        finally:
            handle.close()
        return True

    def _has_bogus_compression_ext(self, path, lowered_name):
        """Tell whether ``lowered_name`` claims a compression ``path`` lacks.

        ``lowered_name`` is the lower-cased display name of the input and
        ``path`` is the file that is actually read.  Some upstream tools leave
        a compression extension on uncompressed data, which makes fastqc try
        to decompress it again.
        """
        if lowered_name.endswith(('.gz', '.gzip')):
            return not self._can_read_first_line(gzip.open(path))
        if lowered_name.endswith('bz2'):
            # Imported here because the module header does not import bz2.
            import bz2
            return not self._can_read_first_line(bz2.BZ2File(path, 'rb'))
        if lowered_name.endswith('.zip'):
            return not zipfile.is_zipfile(path)
        return False

    def run_fastqc(self):
        """Run fastqc on the input and build the html page for Galaxy.

        In batch mode fastqc writes its results to a new folder called
        [infilebasename]_fastqc inside the output directory, for example::

            duplication_levels.png  fastqc_icon.png  per_base_n_content.png
            error.png  fastqc_report.html  per_base_quality.png  tick.png
            fastqc_data.txt  summary.txt  warning.png  ...

        Returns a tuple ``(html_lines, retval, serr)``.  When no readable
        report is produced, ``html_lines`` is an explanatory page containing
        the run log, ``retval`` is 1 and ``serr`` is the run log text;
        otherwise ``retval`` is the fastqc exit status and ``serr`` is empty.
        """
        serr = ''
        outdir = self.opts.outputdir
        log_fd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=outdir)
        os.close(log_fd)  # mkstemp returns an open descriptor; the log is reopened by name
        cl = [self.opts.executable, '--outdir=%s' % outdir]
        if self.opts.informat in ['sam', 'bam']:
            cl.append('--f=%s' % self.opts.informat)
        if self.opts.contaminants is not None:
            cl.append('--contaminants=%s' % self.opts.contaminants)
        # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
        # use a symlink so that the FastQC report reflects the history input file name.
        # Fall back to the input path when no display name was supplied.
        infname = self.opts.inputfilename or self.opts.input
        if self._has_bogus_compression_ext(self.opts.input, infname.lower()):
            infname = os.path.splitext(infname)[0]
        # sanitise the user supplied name before using it as a file name
        fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
        link_name = os.path.join(outdir, fastqinfilename)
        os.symlink(self.opts.input, link_name)
        try:
            cl.append(link_name)
            sout = open(tlog, 'w')
            try:
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()  # keep our header ahead of the child's own output
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=outdir)
                retval = p.wait()
            finally:
                sout.close()
        finally:
            os.unlink(link_name)  # the link is only needed while fastqc runs
        with open(tlog, 'r') as logfile:
            runlog = logfile.readlines()
        # fastqc chooses its own output directory name; take the last
        # directory in the listing whose name ends with _fastqc
        odpath = None
        for entry in os.listdir(outdir):
            candidate = os.path.join(outdir, entry)
            if os.path.isdir(candidate) and candidate.endswith('_fastqc'):
                odpath = candidate
        hpath = None
        rep = None
        if odpath is not None:
            hpath = os.path.join(odpath, 'fastqc_report.html')
            try:
                with open(hpath, 'r') as report:
                    rep = report.readlines()
            except (IOError, OSError):
                rep = None  # a missing or unreadable report is reported below
        if rep is None:
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, hpath), ]
            res += runlog
            res += ['</pre>\n',
                    'Please read the above for clues<br/>\n',
                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n', ]
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        # list again now that everything has been moved into the output directory
        flist = [x for x in os.listdir(outdir) if x not in excludefiles]
        # the report links into Icons/ and Images/ subdirectories that no longer exist
        rep = [line.replace('Icons/', '').replace('Images/', '') for line in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """Add a table of the created files to the report lines.

        ``rep`` is the report as a list of lines, ``flist`` the names of the
        files to link to, and ``runlog`` is accepted but not used.  Neither
        argument list is modified.  Returns a new list of lines.

        The second-to-last line of ``rep`` is treated as the footer: it is
        removed, and the result is the remaining lines, then the additions,
        then that footer line.  A report shorter than two lines has no footer
        row, so the additions are simply appended.
        """
        rep = [] if rep is None else rep
        flist = [] if flist is None else flist
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in sorted(flist):
            # names come from a listing of the output directory, so they must
            # be resolved against it rather than the current directory
            if not os.path.isdir(os.path.join(self.opts.outputdir, f)):
                fn = os.path.split(f)[-1]
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n</div>')
        if len(rep) < 2:
            return list(rep) + res
        footer = rep[-2]  # hope they don't change this
        res.append(footer)
        return rep[:-2] + rep[-1:] + res  # with our additions

    def fix_fastqcimages(self, odpath):
        """Move every file fastqc created under ``odpath`` into the output directory.

        Galaxy wants everything in the same files_dir.  The ``Icons`` and
        ``Images`` subdirectories are emptied and removed first, then
        ``odpath`` itself; ``os.rmdir`` raises if a directory is not empty
        afterwards.
        """
        subdirs = [os.path.join(odpath, 'Icons'), os.path.join(odpath, 'Images'), odpath]
        for adir in subdirs:
            if not os.path.exists(adir):
                continue
            for f in os.listdir(adir):
                sauce = os.path.join(adir, f)
                if not os.path.isdir(sauce):
                    shutil.move(sauce, os.path.join(self.opts.outputdir, f))
            os.rmdir(adir)
170
171
172
if __name__ == '__main__':
    # Command-line entry point: parse the options, run FastQC through the
    # wrapper class and write the resulting html page to the -o path.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # Validate explicitly rather than with assert, which is stripped under -O.
    # op.error prints the message to stderr and exits with status 2.
    if opts.input is None:
        op.error('##rgFastQC.py error - an input file (-i) is required')
    if opts.htmloutput is None:
        op.error('##rgFastQC.py error - an html output path (-o) is required')
    if not os.path.isfile(opts.executable):
        op.error('##rgFastQC.py error - cannot find executable %s' % opts.executable)
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    with open(opts.htmloutput, 'w') as htmlfile:
        htmlfile.write(''.join(html))
    if retval != 0:
        sys.stderr.write('%s\n' % serr)  # indicate failure
195
196
197