| 0 | 1 """ | 
|  | 2 # May 2013 ross added check for bogus gz extension - fastqc gets confused | 
|  | 3 # added sanitizer for user supplied name | 
|  | 4 # removed shell and make cl a sequence for Popen call | 
|  | 5 # ross lazarus August 10 2012 in response to anon insecurity report | 
|  | 6 wrapper for fastqc | 
|  | 7 | 
|  | 8 called as | 
|  | 9   <command interpreter="python"> | 
|  | 10     rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix" | 
|  | 11   </command> | 
|  | 12 | 
|  | 13 | 
|  | 14 | 
|  | 15 Current release seems overly intolerant of sam/bam header strangeness | 
|  | 16 Author notified... | 
|  | 17 | 
|  | 18 | 
|  | 19 """ | 
|  | 20 import re | 
|  | 21 import os | 
|  | 22 import sys | 
|  | 23 import subprocess | 
|  | 24 import optparse | 
|  | 25 import shutil | 
|  | 26 import tempfile | 
|  | 27 import zipfile | 
|  | 28 import gzip | 
|  | 29 | 
class FastQC():
    """Run the FastQC executable on one input and build a single-folder report.

    The instance is driven entirely by an options object (as produced by
    ``optparse``) exposing ``input``, ``inputfilename``, ``outputdir``,
    ``informat``, ``contaminants`` and ``executable``.
    """

    def __init__(self, opts=None):
        """Store the parsed options; ``opts`` is mandatory despite the default."""
        assert opts is not None, 'FastQC requires an options object'
        self.opts = opts

    def getFileString(self, fpath, outpath):
        """Return ``fpath`` followed by a human readable size of the file.

        The size is looked up for ``outpath/fpath``. When that is not an
        existing, non-empty file the size part is empty, so the result is
        ``fpath`` plus a single trailing space.
        """
        size = ''
        fullpath = os.path.join(outpath, fpath)
        if os.path.isfile(fullpath):
            n = float(os.path.getsize(fullpath))
            if n > 2**20:
                size = ' (%1.1f MB)' % (n / 2**20)
            elif n > 2**10:
                size = ' (%1.1f KB)' % (n / 2**10)
            elif n > 0:
                size = ' (%d B)' % (int(n))
        return '%s %s' % (fpath, size)

    def _has_bogus_extension(self, path, lowered_name):
        """Return True when the name claims a compression the content lacks.

        ``lowered_name`` is the lower-cased display name; ``path`` is the
        real file that is probed. Names without a known compression
        extension are never reported as bogus.
        """
        if lowered_name.endswith(('.gz', '.gzip')):
            opener = gzip.open
        elif lowered_name.endswith('.bz2'):
            # local import: the module is not imported at the top of the file
            import bz2
            opener = bz2.BZ2File  # available on python 2 and 3, unlike bz2.open
        elif lowered_name.endswith('.zip'):
            return not zipfile.is_zipfile(path)
        else:
            return False
        handle = opener(path)
        try:
            handle.readline()
        except Exception:
            # any failure to decode the first line means the data is not
            # really compressed in the advertised format
            return True
        finally:
            handle.close()
        return False

    def run_fastqc(self):
        """Run FastQC and return ``(html_lines, return_code, stderr_text)``.

        In batch mode fastqc writes its results to a new folder named
        [infilebasename]_fastqc (Icons/ and Images/ subfolders, summary.txt,
        fastqc_data.txt, fastqc_report.html ...) below the output directory.
        Those files are flattened into the output directory and the report
        is amended with a list of the files created.

        When no report can be read, the returned html is an error page
        containing the run log, the return code is 1 and ``stderr_text``
        holds the log; otherwise ``stderr_text`` is empty and the return
        code is the exit status of the executable.
        """
        opts = self.opts
        serr = ''
        logfd, tlog = tempfile.mkstemp(prefix='rgFastQC', suffix=".log", dir=opts.outputdir)
        # wrap the descriptor handed out by mkstemp rather than reopening the
        # path, so the descriptor is not leaked
        sout = os.fdopen(logfd, 'w')
        try:
            cl = [opts.executable, '--outdir=%s' % opts.outputdir]
            if opts.informat in ['sam', 'bam']:
                cl.append('--f=%s' % opts.informat)
            if opts.contaminants is not None:
                cl.append('--contaminants=%s' % opts.contaminants)
            # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
            # use a symlink in the output directory so that the FastQC report
            # reflects the history input file name.
            # note this exposes a bug in the EBI_SRA download tool which leaves
            # bogus .gz extensions on uncompressed files which fastqc helpfully
            # tries to uncompress again - so a bogus extension is trimmed.
            # -j is optional on the command line: fall back to the input path
            infname = opts.inputfilename or opts.input
            if self._has_bogus_extension(opts.input, infname.lower()):
                infname = os.path.splitext(infname)[0]
            fastqinfilename = re.sub(r'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
            link_name = os.path.join(opts.outputdir, fastqinfilename)
            # absolute target: a relative one would be resolved against the
            # directory holding the link and dangle
            os.symlink(os.path.abspath(opts.input), link_name)
            try:
                cl.append(link_name)
                sout.write('# FastQC cl = %s\n' % ' '.join(cl))
                sout.flush()
                p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=opts.outputdir)
                retval = p.wait()
            finally:
                os.unlink(link_name)
        finally:
            sout.close()
        with open(tlog, 'r') as loghandle:
            runlog = loghandle.readlines()
        # fastqc plays games with its output directory name, so search for it;
        # as before, the last matching directory listed wins
        odpath = None
        for entry in os.listdir(opts.outputdir):
            candidate = os.path.join(opts.outputdir, entry)
            if os.path.isdir(candidate) and candidate.endswith('_fastqc'):
                odpath = candidate
        rep = None  # stays None unless the report was actually read
        if odpath is not None:
            try:
                with open(os.path.join(odpath, 'fastqc_report.html'), 'r') as rephandle:
                    rep = rephandle.readlines()
            except (IOError, OSError):
                rep = None
        if rep is None:
            serr = '\n'.join(runlog)
            res = ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath, opts.outputdir),]
            res += runlog
            res += ['</pre>\n',
                   'Please read the above for clues<br/>\n',
                   'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
                   'It is also possible that the log shows that fastqc is not installed?<br/>\n',
                   'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
                   'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
            return res, 1, serr
        self.fix_fastqcimages(odpath)
        excludefiles = ['tick.png', 'warning.png', 'fastqc_icon.png', 'error.png']
        # list again: the images have now been moved next to the report
        flist = [x for x in os.listdir(opts.outputdir) if x not in excludefiles]
        # links must no longer point into the Icons and Images subdirectories
        rep = [row.replace('Icons/', '').replace('Images/', '') for row in rep]
        html = self.fix_fastqc(rep, flist, runlog)
        return html, retval, serr

    def fix_fastqc(self, rep=None, flist=None, runlog=None):
        """Return the report lines with a table of created files added.

        ``rep`` holds the lines of the FastQC report; its last line is taken
        to be the closing line of the document and the one before it the
        FastQC footer (hope they don't change this). The table is inserted
        in front of the footer so that it ends up inside the document body.
        A report shorter than two lines simply gets the table appended.

        ``flist`` holds file names relative to the output directory;
        directories are skipped. Neither list is modified. ``runlog`` is
        accepted for compatibility and not used.
        """
        rep = list(rep) if rep else []
        names = sorted(flist) if flist else []
        res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
        for f in names:
            fn = os.path.split(f)[-1]
            # names are relative to the output directory, not to the cwd
            if not os.path.isdir(os.path.join(self.opts.outputdir, fn)):
                res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn, self.getFileString(fn, self.opts.outputdir)))
        res.append('</table>\n')
        res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
        res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n</div>')
        if len(rep) < 2:
            return rep + res
        # everything up to the footer, our additions, footer, closing line
        return rep[:-2] + res + rep[-2:]

    def fix_fastqcimages(self, odpath):
        """Move every file below ``odpath`` into the output directory.

        Galaxy wants everything in the same files_dir, so the files in the
        Icons and Images subdirectories and in ``odpath`` itself are moved
        to ``self.opts.outputdir`` and each emptied directory is removed.
        """
        icpath = os.path.join(odpath, 'Icons')
        impath = os.path.join(odpath, 'Images')
        # odpath goes last: it can only be removed once its children are gone
        for adir in (icpath, impath, odpath):
            if not os.path.exists(adir):
                continue
            for name in os.listdir(adir):
                sauce = os.path.join(adir, name)
                if not os.path.isdir(sauce):
                    shutil.move(sauce, os.path.join(self.opts.outputdir, name))
            os.rmdir(adir)
|  | 186 | 
|  | 187 | 
|  | 188 | 
if __name__ == '__main__':
    # Command line entry point: parse the options, run FastQC through the
    # wrapper class and write the resulting html report.
    op = optparse.OptionParser()
    op.add_option('-i', '--input', default=None)
    op.add_option('-j', '--inputfilename', default=None)
    op.add_option('-o', '--htmloutput', default=None)
    op.add_option('-d', '--outputdir', default="/tmp/shortread")
    op.add_option('-f', '--informat', default='fastq')
    op.add_option('-n', '--namejob', default='rgFastQC')
    op.add_option('-c', '--contaminants', default=None)
    op.add_option('-e', '--executable', default='fastqc')
    opts, args = op.parse_args()
    # explicit checks instead of assert, which is stripped when running with -O
    if opts.input is None:
        op.error('##rgFastQC.py error - an input file is required (-i)')
    if opts.htmloutput is None:
        op.error('##rgFastQC.py error - an html output path is required (-o)')
    if not os.path.isfile(opts.executable):
        # a bare command name such as the default 'fastqc' is looked up on PATH
        search_dirs = [d for d in os.environ.get('PATH', '').split(os.pathsep) if d]
        found = [os.path.join(d, opts.executable) for d in search_dirs
                 if os.path.isfile(os.path.join(d, opts.executable))]
        if not found:
            op.error('##rgFastQC.py error - cannot find executable %s' % opts.executable)
        opts.executable = found[0]
    if not os.path.exists(opts.outputdir):
        os.makedirs(opts.outputdir)
    runner = FastQC(opts)
    html, retval, serr = runner.run_fastqc()
    with open(opts.htmloutput, 'w') as report:
        report.write(''.join(html))
    if retval != 0:
        sys.stderr.write('%s\n' % serr)  # indicate failure
|  | 211 | 
|  | 212 | 
|  | 213 |