| 
0
 | 
     1 """
 | 
| 
 | 
     2 # May 2013 ross added check for bogus gz extension - fastqc gets confused
 | 
| 
 | 
     3 # added sanitizer for user supplied name
 | 
| 
 | 
     4 # removed shell and make cl a sequence for Popen call
 | 
| 
 | 
     5 # ross lazarus August 10 2012 in response to anon insecurity report
 | 
| 
 | 
     6 wrapper for fastqc
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 called as
 | 
| 
 | 
     9   <command interpreter="python">
 | 
| 
 | 
    10     rgFastqc.py -i $input_file -d $html_file.files_path -o $html_file -n "$out_prefix"
 | 
| 
 | 
    11   </command>
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 
 | 
| 
 | 
    15 Current release seems overly intolerant of sam/bam header strangeness
 | 
| 
 | 
    16 Author notified...
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 """
 | 
| 
 | 
    20 import re
 | 
| 
 | 
    21 import os
 | 
| 
 | 
    22 import sys
 | 
| 
 | 
    23 import subprocess
 | 
| 
 | 
    24 import optparse
 | 
| 
 | 
    25 import shutil
 | 
| 
 | 
    26 import tempfile
 | 
| 
 | 
    27 import zipfile
 | 
| 
 | 
    28 import gzip
 | 
| 
 | 
    29 
 | 
| 
3
 | 
    30 def pathfind(program):
 | 
| 
 | 
    31     """ toolshed path munging isn't so try to work around june 5 2013
 | 
| 
 | 
    32     """
 | 
| 
 | 
    33     def is_exe(fpath):
 | 
| 
 | 
    34         return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
 | 
| 
 | 
    35 
 | 
| 
 | 
    36     fpath, fname = os.path.split(program)
 | 
| 
 | 
    37     if fpath:
 | 
| 
 | 
    38         if is_exe(program):
 | 
| 
 | 
    39             return program
 | 
| 
 | 
    40     else:
 | 
| 
 | 
    41         for path in os.environ["PATH"].split(os.pathsep):
 | 
| 
 | 
    42             path = path.strip('"')
 | 
| 
 | 
    43             exe_file = os.path.join(path, program)
 | 
| 
 | 
    44             if is_exe(exe_file):
 | 
| 
 | 
    45                 return exe_file
 | 
| 
 | 
    46 
 | 
| 
 | 
    47     return None    
 | 
| 
 | 
    48 
 | 
| 
0
 | 
    49 class FastQC():
 | 
| 
 | 
    50     """wrapper
 | 
| 
 | 
    51     """
 | 
| 
 | 
    52     
 | 
| 
 | 
    53     
 | 
| 
 | 
    54     def __init__(self,opts=None):
 | 
| 
 | 
    55         assert opts <> None
 | 
| 
 | 
    56         self.opts = opts
 | 
| 
3
 | 
    57         fastqcexe = pathfind(opts.executable)
 | 
| 
 | 
    58         assert (fastqcexe != None),'##rgFastQC.py error - cannot find passed fastqc executable %s in path %s' % (opts.executable,os.environ['PATH'])
 | 
| 
 | 
    59         self.fastqcexe = fastqcexe
 | 
| 
0
 | 
    60         
 | 
| 
 | 
    61     def getFileString(self, fpath, outpath):
 | 
| 
 | 
    62         """
 | 
| 
 | 
    63         format a nice file size string
 | 
| 
 | 
    64         """
 | 
| 
 | 
    65         size = ''
 | 
| 
 | 
    66         fp = os.path.join(outpath, fpath)
 | 
| 
 | 
    67         s = fpath
 | 
| 
 | 
    68         if os.path.isfile(fp):
 | 
| 
 | 
    69             n = float(os.path.getsize(fp))
 | 
| 
 | 
    70             if n > 2**20:
 | 
| 
 | 
    71                 size = ' (%1.1f MB)' % (n/2**20)
 | 
| 
 | 
    72             elif n > 2**10:
 | 
| 
 | 
    73                 size = ' (%1.1f KB)' % (n/2**10)
 | 
| 
 | 
    74             elif n > 0:
 | 
| 
 | 
    75                 size = ' (%d B)' % (int(n))
 | 
| 
 | 
    76         s = '%s %s' % (fpath, size)
 | 
| 
 | 
    77         return s
 | 
| 
 | 
    78 
 | 
| 
 | 
    79     def run_fastqc(self):
 | 
| 
 | 
    80         """
 | 
| 
 | 
    81         In batch mode fastqc behaves not very nicely - will write to a new folder in
 | 
| 
 | 
    82         the same place as the infile called [infilebasename]_fastqc
 | 
| 
 | 
    83     rlazarus@omics:/data/galaxy/test$ ls FC041_1_sequence_fastqc
 | 
| 
 | 
    84     duplication_levels.png  fastqc_icon.png          per_base_n_content.png         per_sequence_gc_content.png       summary.txt
 | 
| 
 | 
    85     error.png               fastqc_report.html       per_base_quality.png           per_sequence_quality.png          tick.png
 | 
| 
 | 
    86     fastqc_data.txt         per_base_gc_content.png  per_base_sequence_content.png  sequence_length_distribution.png  warning.png
 | 
| 
 | 
    87 
 | 
| 
 | 
    88         """
 | 
| 
 | 
    89         serr = ''
 | 
| 
 | 
    90         dummy,tlog = tempfile.mkstemp(prefix='rgFastQC',suffix=".log",dir=self.opts.outputdir)
 | 
| 
 | 
    91         sout = open(tlog, 'w')
 | 
| 
 | 
    92         fastq = os.path.basename(self.opts.input)
 | 
| 
3
 | 
    93         cl = [self.fastqcexe,'--outdir=%s' % self.opts.outputdir]
 | 
| 
0
 | 
    94         if self.opts.informat in ['sam','bam']:
 | 
| 
 | 
    95             cl.append('--f=%s' % self.opts.informat)
 | 
| 
 | 
    96         if self.opts.contaminants <> None :
 | 
| 
 | 
    97             cl.append('--contaminants=%s' % self.opts.contaminants)
 | 
| 
 | 
    98         # patch suggested by bwlang https://bitbucket.org/galaxy/galaxy-central/pull-request/30
 | 
| 
 | 
    99         # use a symlink in a temporary directory so that the FastQC report reflects the history input file name
 | 
| 
 | 
   100         # note this exposes a bug in the EBI_SRA download tool which leaves bogus .gz extensions on uncompressed files
 | 
| 
 | 
   101         # which fastqc helpfully tries to uncompress again - hilarity ensues.
 | 
| 
 | 
   102         # patched may 29 2013 until this is fixed properly
 | 
| 
 | 
   103         infname = self.opts.inputfilename
 | 
| 
 | 
   104         linf = infname.lower()
 | 
| 
 | 
   105         trimext = False
 | 
| 
 | 
   106         if ( linf.endswith('.gz') or linf.endswith('.gzip') ): 
 | 
| 
 | 
   107             f = gzip.open(self.opts.input)
 | 
| 
 | 
   108             try:
 | 
| 
 | 
   109                testrow = f.readline()
 | 
| 
 | 
   110             except:
 | 
| 
 | 
   111                trimext = True
 | 
| 
 | 
   112             f.close()
 | 
| 
 | 
   113         elif linf.endswith('bz2'):
 | 
| 
 | 
   114            f = bz2.open(self.opts.input,'rb')
 | 
| 
 | 
   115            try:
 | 
| 
 | 
   116               f.readline()
 | 
| 
 | 
   117            except:
 | 
| 
 | 
   118               trimext = True
 | 
| 
 | 
   119            f.close()
 | 
| 
 | 
   120         elif linf.endswith('.zip'):
 | 
| 
 | 
   121            if not zipfile.is_zipfile(self.opts.input):
 | 
| 
 | 
   122               trimext = True
 | 
| 
 | 
   123         if trimext:
 | 
| 
 | 
   124            infname = os.path.splitext(infname)[0]
 | 
| 
 | 
   125         fastqinfilename = re.sub(ur'[^a-zA-Z0-9_\-\.]', '_', os.path.basename(infname))
 | 
| 
 | 
   126         link_name = os.path.join(self.opts.outputdir, fastqinfilename)
 | 
| 
 | 
   127         os.symlink(self.opts.input, link_name)
 | 
| 
 | 
   128         cl.append(link_name)        
 | 
| 
 | 
   129         sout.write('# FastQC cl = %s\n' % ' '.join(cl))
 | 
| 
 | 
   130         sout.flush()
 | 
| 
 | 
   131         p = subprocess.Popen(cl, shell=False, stderr=sout, stdout=sout, cwd=self.opts.outputdir)
 | 
| 
 | 
   132         retval = p.wait()
 | 
| 
 | 
   133         sout.close()
 | 
| 
 | 
   134         runlog = open(tlog,'r').readlines()
 | 
| 
 | 
   135         os.unlink(link_name)
 | 
| 
 | 
   136         flist = os.listdir(self.opts.outputdir) # fastqc plays games with its output directory name. eesh
 | 
| 
 | 
   137         odpath = None
 | 
| 
 | 
   138         for f in flist:
 | 
| 
 | 
   139             d = os.path.join(self.opts.outputdir,f)
 | 
| 
 | 
   140             if os.path.isdir(d):
 | 
| 
 | 
   141                 if d.endswith('_fastqc'):
 | 
| 
 | 
   142                     odpath = d 
 | 
| 
 | 
   143         hpath = None
 | 
| 
 | 
   144         if odpath <> None:
 | 
| 
 | 
   145             try: 
 | 
| 
 | 
   146                 hpath = os.path.join(odpath,'fastqc_report.html')
 | 
| 
 | 
   147                 rep = open(hpath,'r').readlines() # for our new html file but we need to insert our stuff after the <body> tag
 | 
| 
 | 
   148             except:
 | 
| 
 | 
   149                 pass
 | 
| 
 | 
   150         if hpath == None:
 | 
| 
 | 
   151             serr = '\n'.join(runlog)       
 | 
| 
 | 
   152             res =  ['## odpath=%s: No output found in %s. Output for the run was:<pre>\n' % (odpath,hpath),]
 | 
| 
 | 
   153             res += runlog
 | 
| 
 | 
   154             res += ['</pre>\n',
 | 
| 
 | 
   155                    'Please read the above for clues<br/>\n',
 | 
| 
 | 
   156                    'If you selected a sam/bam format file, it might not have headers or they may not start with @HD?<br/>\n',
 | 
| 
 | 
   157                    'It is also possible that the log shows that fastqc is not installed?<br/>\n',
 | 
| 
 | 
   158                    'If that is the case, please tell the relevant Galaxy administrator that it can be snarfed from<br/>\n',
 | 
| 
 | 
   159                    'http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/<br/>\n',]
 | 
| 
 | 
   160             return res,1,serr
 | 
| 
 | 
   161         self.fix_fastqcimages(odpath)
 | 
| 
 | 
   162         flist = os.listdir(self.opts.outputdir) # these have now been fixed
 | 
| 
 | 
   163         excludefiles = ['tick.png','warning.png','fastqc_icon.png','error.png']
 | 
| 
 | 
   164         flist = [x for x in flist if not x in excludefiles]
 | 
| 
 | 
   165         for i in range(len(rep)): # need to fix links to Icons and Image subdirectories in lastest fastqc code - ugh
 | 
| 
 | 
   166             rep[i] = rep[i].replace('Icons/','')
 | 
| 
 | 
   167             rep[i] = rep[i].replace('Images/','')
 | 
| 
 | 
   168         html = self.fix_fastqc(rep,flist,runlog)
 | 
| 
 | 
   169         return html,retval,serr
 | 
| 
 | 
   170         
 | 
| 
 | 
   171 
 | 
| 
 | 
   172         
 | 
| 
 | 
   173     def fix_fastqc(self,rep=[],flist=[],runlog=[]):
 | 
| 
 | 
   174         """ add some of our stuff to the html
 | 
| 
 | 
   175         """
 | 
| 
 | 
   176         bodyindex = len(rep) -1  # hope they don't change this
 | 
| 
 | 
   177         footrow = bodyindex - 1 
 | 
| 
 | 
   178         footer = rep[footrow]
 | 
| 
 | 
   179         rep = rep[:footrow] + rep[footrow+1:]
 | 
| 
 | 
   180         res = ['<div class="module"><h2>Files created by FastQC</h2><table cellspacing="2" cellpadding="2">\n']
 | 
| 
 | 
   181         flist.sort()
 | 
| 
 | 
   182         for i,f in enumerate(flist):
 | 
| 
 | 
   183              if not(os.path.isdir(f)):
 | 
| 
 | 
   184                  fn = os.path.split(f)[-1]
 | 
| 
 | 
   185                  res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,self.getFileString(fn, self.opts.outputdir)))
 | 
| 
 | 
   186         res.append('</table>\n') 
 | 
| 
 | 
   187         res.append('<a href="http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/">FastQC documentation and full attribution is here</a><br/><hr/>\n')
 | 
| 
 | 
   188         res.append('FastQC was run by Galaxy using the rgenetics rgFastQC wrapper - see http://rgenetics.org for details and licensing\n</div>')
 | 
| 
 | 
   189         res.append(footer)
 | 
| 
 | 
   190         fixed = rep[:bodyindex] + res + rep[bodyindex:]
 | 
| 
 | 
   191         return fixed # with our additions
 | 
| 
 | 
   192 
 | 
| 
 | 
   193 
 | 
| 
 | 
   194     def fix_fastqcimages(self,odpath):
 | 
| 
 | 
   195         """ Galaxy wants everything in the same files_dir
 | 
| 
 | 
   196         """
 | 
| 
 | 
   197         icpath = os.path.join(odpath,'Icons')
 | 
| 
 | 
   198         impath = os.path.join(odpath,'Images')
 | 
| 
 | 
   199         for adir in [icpath,impath,odpath]:
 | 
| 
 | 
   200             if os.path.exists(adir):
 | 
| 
 | 
   201                 flist = os.listdir(adir) # get all files created
 | 
| 
 | 
   202                 for f in flist:
 | 
| 
 | 
   203                    if not os.path.isdir(os.path.join(adir,f)):
 | 
| 
 | 
   204                        sauce = os.path.join(adir,f)
 | 
| 
 | 
   205                        dest = os.path.join(self.opts.outputdir,f)
 | 
| 
 | 
   206                        shutil.move(sauce,dest)
 | 
| 
 | 
   207                 os.rmdir(adir)
 | 
| 
 | 
   208 
 | 
| 
 | 
   209 
 | 
| 
 | 
   210 if __name__ == '__main__':
 | 
| 
 | 
   211     op = optparse.OptionParser()
 | 
| 
 | 
   212     op.add_option('-i', '--input', default=None)
 | 
| 
 | 
   213     op.add_option('-j', '--inputfilename', default=None)    
 | 
| 
 | 
   214     op.add_option('-o', '--htmloutput', default=None)
 | 
| 
 | 
   215     op.add_option('-d', '--outputdir', default="/tmp/shortread")
 | 
| 
 | 
   216     op.add_option('-f', '--informat', default='fastq')
 | 
| 
 | 
   217     op.add_option('-n', '--namejob', default='rgFastQC')
 | 
| 
 | 
   218     op.add_option('-c', '--contaminants', default=None)
 | 
| 
 | 
   219     op.add_option('-e', '--executable', default='fastqc')
 | 
| 
 | 
   220     opts, args = op.parse_args()
 | 
| 
 | 
   221     assert opts.input <> None
 | 
| 
 | 
   222     if not os.path.exists(opts.outputdir): 
 | 
| 
 | 
   223         os.makedirs(opts.outputdir)
 | 
| 
 | 
   224     f = FastQC(opts)
 | 
| 
 | 
   225     html,retval,serr = f.run_fastqc()
 | 
| 
 | 
   226     f = open(opts.htmloutput, 'w')
 | 
| 
 | 
   227     f.write(''.join(html))
 | 
| 
 | 
   228     f.close()
 | 
| 
 | 
   229     if retval <> 0:
 | 
| 
 | 
   230          print >> sys.stderr, serr # indicate failure
 | 
| 
 | 
   231          
 | 
| 
 | 
   232     
 | 
| 
 | 
   233 
 |