Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
comparison amplicon_analysis_pipeline.py @ 0:b433086738d6 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit ba3e5b591407db52a586361efb21927c8171ec0e
| author | pjbriggs |
|---|---|
| date | Wed, 08 Nov 2017 08:43:02 -0500 |
| parents | |
| children | a898ee628343 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:b433086738d6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # Wrapper script to run Amplicon_analysis_pipeline.sh | |
| 4 # from Galaxy tool | |
| 5 | |
| 6 import sys | |
| 7 import os | |
| 8 import argparse | |
| 9 import subprocess | |
| 10 import glob | |
| 11 | |
class PipelineCmd(object):
    """Helper for assembling a pipeline command line.

    Holds the command name plus its arguments as a list of strings
    (suitable for passing to subprocess); repr() renders the full
    command line for display.
    """
    def __init__(self,cmd):
        # The command name is always the first element
        self.cmd = [str(cmd)]
    def add_args(self,*args):
        # Append each supplied argument, coercing to string
        self.cmd.extend(str(a) for a in args)
    def __repr__(self):
        # Elements are already strings, so a plain join suffices
        return ' '.join(self.cmd)
| 20 | |
def ahref(target,name=None,type=None):
    """Return an HTML anchor element linking to 'target'.

    'name' is the link text (defaults to the basename of 'target');
    'type', if supplied, is emitted as the anchor's MIME type
    attribute.
    """
    link_text = os.path.basename(target) if name is None else name
    type_attr = (" type='%s'" % type) if type is not None else ""
    return "<a href='%s'%s>%s</a>" % (target,type_attr,link_text)
| 29 | |
def check_errors():
    """Scan the pipeline log files for known failure messages.

    Writes a boxed error report to stderr (via print_error) for each
    recognised failure mode found in 'Amplicon_analysis_pipeline.log'
    or 'pipeline.log'.
    """
    # Errors reported in Amplicon_analysis_pipeline.log
    with open('Amplicon_analysis_pipeline.log','r') as fp:
        contents = fp.read()
    if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in contents:
        print_error("""*** Sample IDs don't match dataset names ***

The sample IDs (first column of the Metatable file) don't match the
supplied sample names for the input Fastq pairs.
""")
    # Errors reported in the captured pipeline output
    with open('pipeline.log','r') as fp:
        contents = fp.read()
    if "Errors and/or warnings detected in mapping file" in contents:
        # Echo the Metatable log file to the tool log
        with open("Metatable_log/Metatable.log","r") as metatable_log:
            print_error("""*** Error in Metatable mapping file ***

%s""" % metatable_log.read())
    elif "No header line was found in mapping file" in contents:
        # Report error to the tool log
        print_error("""*** No header in Metatable mapping file ***

Check you've specified the correct file as the input Metatable""")
| 54 | |
def print_error(message):
    """Write 'message' to stderr framed in a box of asterisks."""
    lines = message.split('\n')
    # Box width: longest line plus the '* ' and ' *' borders
    width = max(len(line) for line in lines) + 4
    border = '*'*width
    sys.stderr.write("\n%s\n" % border)
    for line in lines:
        padding = ' '*(width-len(line)-4)
        sys.stderr.write("* %s%s *\n" % (line,padding))
    sys.stderr.write("%s\n\n" % border)
| 61 | |
def clean_up_name(sample):
    """Strip a trailing "_L[0-9]+_001" Illumina-style suffix from a
    Fastq pair sample name and return the cleaned name.
    """
    parts = sample.split('_')
    # Drop a trailing "001" read-group component
    if parts[-1] == "001":
        parts = parts[:-1]
    # Drop a trailing lane component ('L' followed by digits only)
    if parts[-1].startswith('L'):
        try:
            int(parts[-1][1:])
        except ValueError:
            # Not 'L' + digits: keep the component
            pass
        else:
            parts = parts[:-1]
    return '_'.join(parts)
| 75 | |
def list_outputs(filen=None):
    """List the contents of the RESULTS output directory.

    Walks the 'RESULTS' directory (relative to the current working
    directory) and writes a numbered listing of its subdirectories
    and files.

    Arguments:
      filen (str): optional; name of a file to write the listing
        to. If not supplied then the listing is written to stdout.
    """
    if filen is not None:
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    results_dir = os.path.abspath("RESULTS")
    fp.write("Listing contents of output dir %s:\n" % results_dir)
    ix = 0
    for d,dirs,files in os.walk(results_dir):
        ix += 1
        fp.write("-- %d: %s\n" % (ix,
                                  os.path.relpath(d,results_dir)))
        for f in files:
            ix += 1
            # NB must join 'f' onto its enclosing directory 'd'
            # before taking the relative path; previously relpath
            # was computed on the bare filename, which resolved
            # against the current working directory and produced
            # spurious '../'-style paths
            fp.write("---- %d: %s\n" % (ix,
                                        os.path.relpath(
                                            os.path.join(d,f),
                                            results_dir)))
    # Close output file (but never stdout)
    if filen is not None:
        fp.close()
| 98 | |
| 99 if __name__ == "__main__": | |
| 100 # Command line | |
| 101 print "Amplicon analysis: starting" | |
| 102 p = argparse.ArgumentParser() | |
| 103 p.add_argument("metatable", | |
| 104 metavar="METATABLE_FILE", | |
| 105 help="Metatable.txt file") | |
| 106 p.add_argument("fastq_pairs", | |
| 107 metavar="SAMPLE_NAME FQ_R1 FQ_R2", | |
| 108 nargs="+", | |
| 109 default=list(), | |
| 110 help="Triplets of SAMPLE_NAME followed by " | |
| 111 "a R1/R2 FASTQ file pair") | |
| 112 p.add_argument("-g",dest="forward_pcr_primer") | |
| 113 p.add_argument("-G",dest="reverse_pcr_primer") | |
| 114 p.add_argument("-q",dest="trimming_threshold") | |
| 115 p.add_argument("-O",dest="minimum_overlap") | |
| 116 p.add_argument("-L",dest="minimum_length") | |
| 117 p.add_argument("-l",dest="sliding_window_length") | |
| 118 p.add_argument("-P",dest="pipeline", | |
| 119 choices=["vsearch","uparse","qiime"], | |
| 120 type=str.lower, | |
| 121 default="vsearch") | |
| 122 p.add_argument("-S",dest="use_silva",action="store_true") | |
| 123 p.add_argument("-r",dest="reference_data_path") | |
| 124 p.add_argument("-c",dest="categories_file") | |
| 125 args = p.parse_args() | |
| 126 | |
| 127 # Build the environment for running the pipeline | |
| 128 print "Amplicon analysis: building the environment" | |
| 129 metatable_file = os.path.abspath(args.metatable) | |
| 130 os.symlink(metatable_file,"Metatable.txt") | |
| 131 print "-- made symlink to Metatable.txt" | |
| 132 | |
| 133 # Link to Categories.txt file (if provided) | |
| 134 if args.categories_file is not None: | |
| 135 categories_file = os.path.abspath(args.categories_file) | |
| 136 os.symlink(categories_file,"Categories.txt") | |
| 137 print "-- made symlink to Categories.txt" | |
| 138 | |
| 139 # Link to FASTQs and construct Final_name.txt file | |
| 140 sample_names = [] | |
| 141 with open("Final_name.txt",'w') as final_name: | |
| 142 fastqs = iter(args.fastq_pairs) | |
| 143 for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs): | |
| 144 sample_name = clean_up_name(sample_name) | |
| 145 r1 = "%s_R1_.fastq" % sample_name | |
| 146 r2 = "%s_R2_.fastq" % sample_name | |
| 147 os.symlink(fqr1,r1) | |
| 148 os.symlink(fqr2,r2) | |
| 149 final_name.write("%s\n" % '\t'.join((r1,sample_name))) | |
| 150 final_name.write("%s\n" % '\t'.join((r2,sample_name))) | |
| 151 sample_names.append(sample_name) | |
| 152 | |
| 153 # Construct the pipeline command | |
| 154 print "Amplicon analysis: constructing pipeline command" | |
| 155 pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh") | |
| 156 if args.forward_pcr_primer: | |
| 157 pipeline.add_args("-g",args.forward_pcr_primer) | |
| 158 if args.reverse_pcr_primer: | |
| 159 pipeline.add_args("-G",args.reverse_pcr_primer) | |
| 160 if args.trimming_threshold: | |
| 161 pipeline.add_args("-q",args.trimming_threshold) | |
| 162 if args.minimum_overlap: | |
| 163 pipeline.add_args("-O",args.minimum_overlap) | |
| 164 if args.minimum_length: | |
| 165 pipeline.add_args("-L",args.minimum_length) | |
| 166 if args.sliding_window_length: | |
| 167 pipeline.add_args("-l",args.sliding_window_length) | |
| 168 if args.reference_data_path: | |
| 169 pipeline.add_args("-r",args.reference_data_path) | |
| 170 pipeline.add_args("-P",args.pipeline) | |
| 171 if args.use_silva: | |
| 172 pipeline.add_args("-S") | |
| 173 | |
| 174 # Echo the pipeline command to stdout | |
| 175 print "Running %s" % pipeline | |
| 176 | |
| 177 # Run the pipeline | |
| 178 with open("pipeline.log","w") as pipeline_out: | |
| 179 try: | |
| 180 subprocess.check_call(pipeline.cmd, | |
| 181 stdout=pipeline_out, | |
| 182 stderr=subprocess.STDOUT) | |
| 183 exit_code = 0 | |
| 184 print "Pipeline completed ok" | |
| 185 except subprocess.CalledProcessError as ex: | |
| 186 # Non-zero exit status | |
| 187 sys.stderr.write("Pipeline failed: exit code %s\n" % | |
| 188 ex.returncode) | |
| 189 exit_code = ex.returncode | |
| 190 except Exception as ex: | |
| 191 # Some other problem | |
| 192 sys.stderr.write("Unexpected error: %s\n" % str(ex)) | |
| 193 | |
| 194 # Write out the list of outputs | |
| 195 outputs_file = "Pipeline_outputs.txt" | |
| 196 list_outputs(outputs_file) | |
| 197 | |
| 198 # Check for log file | |
| 199 log_file = "Amplicon_analysis_pipeline.log" | |
| 200 if os.path.exists(log_file): | |
| 201 print "Found log file: %s" % log_file | |
| 202 if exit_code == 0: | |
| 203 # Create an HTML file to link to log files etc | |
| 204 # NB the paths to the files should be correct once | |
| 205 # copied by Galaxy on job completion | |
| 206 with open("pipeline_outputs.html","w") as html_out: | |
| 207 html_out.write("""<html> | |
| 208 <head> | |
| 209 <title>Amplicon analysis pipeline: log files</title> | |
| 210 <head> | |
| 211 <body> | |
| 212 <h1>Amplicon analysis pipeline: log files</h1> | |
| 213 <ul> | |
| 214 """) | |
| 215 html_out.write( | |
| 216 "<li>%s</li>\n" % | |
| 217 ahref("Amplicon_analysis_pipeline.log", | |
| 218 type="text/plain")) | |
| 219 html_out.write( | |
| 220 "<li>%s</li>\n" % | |
| 221 ahref("pipeline.log",type="text/plain")) | |
| 222 html_out.write( | |
| 223 "<li>%s</li>\n" % | |
| 224 ahref("Pipeline_outputs.txt", | |
| 225 type="text/plain")) | |
| 226 html_out.write( | |
| 227 "<li>%s</li>\n" % | |
| 228 ahref("Metatable.html")) | |
| 229 html_out.write("""<ul> | |
| 230 </body> | |
| 231 </html> | |
| 232 """) | |
| 233 else: | |
| 234 # Check for known error messages | |
| 235 check_errors() | |
| 236 # Write pipeline stdout to tool stderr | |
| 237 sys.stderr.write("\nOutput from pipeline:\n") | |
| 238 with open("pipeline.log",'r') as log: | |
| 239 sys.stderr.write("%s" % log.read()) | |
| 240 # Write log file contents to tool log | |
| 241 print "\nAmplicon_analysis_pipeline.log:" | |
| 242 with open(log_file,'r') as log: | |
| 243 print "%s" % log.read() | |
| 244 else: | |
| 245 sys.stderr.write("ERROR missing log file \"%s\"\n" % | |
| 246 log_file) | |
| 247 | |
| 248 # Handle FastQC boxplots | |
| 249 print "Amplicon analysis: collating per base quality boxplots" | |
| 250 with open("fastqc_quality_boxplots.html","w") as quality_boxplots: | |
| 251 # PHRED value for trimming | |
| 252 phred_score = 20 | |
| 253 if args.trimming_threshold is not None: | |
| 254 phred_score = args.trimming_threshold | |
| 255 # Write header for HTML output file | |
| 256 quality_boxplots.write("""<html> | |
| 257 <head> | |
| 258 <title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title> | |
| 259 <head> | |
| 260 <body> | |
| 261 <h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1> | |
| 262 """) | |
| 263 # Look for raw and trimmed FastQC output for each sample | |
| 264 for sample_name in sample_names: | |
| 265 fastqc_dir = os.path.join(sample_name,"FastQC") | |
| 266 quality_boxplots.write("<h2>%s</h2>" % sample_name) | |
| 267 for d in ("Raw","cutdapt_sickle/Q%s" % phred_score): | |
| 268 quality_boxplots.write("<h3>%s</h3>" % d) | |
| 269 fastqc_html_files = glob.glob( | |
| 270 os.path.join(fastqc_dir,d,"*_fastqc.html")) | |
| 271 if not fastqc_html_files: | |
| 272 quality_boxplots.write("<p>No FastQC outputs found</p>") | |
| 273 continue | |
| 274 # Pull out the per-base quality boxplots | |
| 275 for f in fastqc_html_files: | |
| 276 boxplot = None | |
| 277 with open(f) as fp: | |
| 278 for line in fp.read().split(">"): | |
| 279 try: | |
| 280 line.index("alt=\"Per base quality graph\"") | |
| 281 boxplot = line + ">" | |
| 282 break | |
| 283 except ValueError: | |
| 284 pass | |
| 285 if boxplot is None: | |
| 286 boxplot = "Missing plot" | |
| 287 quality_boxplots.write("<h4>%s</h4><p>%s</p>" % | |
| 288 (os.path.basename(f), | |
| 289 boxplot)) | |
| 290 quality_boxplots.write("""</body> | |
| 291 </html> | |
| 292 """) | |
| 293 | |
| 294 # Handle additional output when categories file was supplied | |
| 295 if args.categories_file is not None: | |
| 296 # Alpha diversity boxplots | |
| 297 print "Amplicon analysis: indexing alpha diversity boxplots" | |
| 298 boxplots_dir = os.path.abspath( | |
| 299 os.path.join("RESULTS", | |
| 300 "%s_%s" % (args.pipeline.title(), | |
| 301 ("gg" if not args.use_silva | |
| 302 else "silva")), | |
| 303 "Alpha_diversity", | |
| 304 "Alpha_diversity_boxplot", | |
| 305 "Categories_shannon")) | |
| 306 print "Amplicon analysis: gathering PDFs from %s" % boxplots_dir | |
| 307 boxplot_pdfs = [os.path.basename(pdf) | |
| 308 for pdf in | |
| 309 sorted(glob.glob( | |
| 310 os.path.join(boxplots_dir,"*.pdf")))] | |
| 311 with open("alpha_diversity_boxplots.html","w") as boxplots_out: | |
| 312 boxplots_out.write("""<html> | |
| 313 <head> | |
| 314 <title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title> | |
| 315 <head> | |
| 316 <body> | |
| 317 <h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1> | |
| 318 """) | |
| 319 boxplots_out.write("<ul>\n") | |
| 320 for pdf in boxplot_pdfs: | |
| 321 boxplots_out.write("<li>%s</li>\n" % ahref(pdf)) | |
| 322 boxplots_out.write("<ul>\n") | |
| 323 boxplots_out.write("""</body> | |
| 324 </html> | |
| 325 """) | |
| 326 | |
| 327 # Finish | |
| 328 print "Amplicon analysis: finishing, exit code: %s" % exit_code | |
| 329 sys.exit(exit_code) |
