Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
comparison amplicon_analysis_pipeline.py @ 42:098ad1dd7760 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit 10be6f00106e853a6720e4052871d9d84e027137
| author | pjbriggs |
|---|---|
| date | Thu, 05 Dec 2019 11:48:01 +0000 |
| parents | |
| children | 1bc0077b2c91 |
comparison
equal
deleted
inserted
replaced
| 41:7b9786a43a16 | 42:098ad1dd7760 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # Wrapper script to run Amplicon_analysis_pipeline.sh | |
| 4 # from Galaxy tool | |
| 5 | |
| 6 import sys | |
| 7 import os | |
| 8 import argparse | |
| 9 import subprocess | |
| 10 import glob | |
| 11 | |
class PipelineCmd(object):
    """Helper for assembling a command line to hand to subprocess.

    The command is held as a list of strings in the 'cmd' attribute,
    suitable for passing directly to subprocess.check_call.
    """
    def __init__(self, cmd):
        # The executable name is the first element of the list
        self.cmd = [str(cmd)]
    def add_args(self, *args):
        """Append one or more arguments, converting each to a string."""
        self.cmd.extend(str(a) for a in args)
    def __repr__(self):
        # Space-separated rendering, handy for echoing to a log
        return ' '.join(str(a) for a in self.cmd)
| 20 | |
def ahref(target,name=None,type=None):
    """Return an HTML anchor element linking to 'target'.

    'name' is the link text (defaults to the basename of 'target');
    'type' optionally sets the anchor's MIME type attribute.
    """
    link_text = os.path.basename(target) if name is None else name
    anchor = "<a href='%s'" % target
    if type is not None:
        anchor += " type='%s'" % type
    anchor += ">%s</a>" % link_text
    return anchor
| 29 | |
def check_errors():
    """Scan the pipeline log files for known failure modes.

    Reads 'Amplicon_analysis_pipeline.log' and 'pipeline.log' from the
    current directory and reports a boxed message to stderr (via
    print_error) for each recognised error condition.
    """
    # Errors reported in Amplicon_analysis_pipeline.log
    with open('Amplicon_analysis_pipeline.log','r') as fh:
        main_log = fh.read()
    if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in main_log:
        print_error("""*** Sample IDs don't match dataset names ***

The sample IDs (first column of the Metatable file) don't match the
supplied sample names for the input Fastq pairs.
""")
    # Errors reported in the captured pipeline stdout/stderr
    with open('pipeline.log','r') as fh:
        captured_log = fh.read()
    if "Errors and/or warnings detected in mapping file" in captured_log:
        # Echo the Metatable log file to the tool log
        with open("Metatable_log/Metatable.log","r") as metatable_log:
            print_error("""*** Error in Metatable mapping file ***

%s""" % metatable_log.read())
    elif "No header line was found in mapping file" in captured_log:
        # Report error to the tool log
        print_error("""*** No header in Metatable mapping file ***

Check you've specified the correct file as the input Metatable""")
| 54 | |
def print_error(message):
    """Write 'message' to stderr framed in a box of asterisks."""
    lines = message.split('\n')
    # Box width: longest line plus "* " and " *" borders
    width = 4 + max(len(line) for line in lines)
    border = '*' * width
    sys.stderr.write("\n%s\n" % border)
    for line in lines:
        padding = ' ' * (width - len(line) - 4)
        sys.stderr.write("* %s%s *\n" % (line, padding))
    sys.stderr.write("%s\n\n" % border)
| 61 | |
def clean_up_name(sample):
    """Return a cleaned-up sample name from a Fastq file name.

    Strips file extensions (everything from the first '.') and any
    trailing "_L<number>_001"-style lane/read-group suffix.
    """
    # Keep only the text before the first dot (drops .fastq, .fq.gz etc)
    base = sample.split('.')[0]
    parts = base.split('_')
    # Drop a trailing "001" read-group component
    if parts[-1] == "001":
        parts = parts[:-1]
    # Drop a trailing lane identifier of the form L<number>
    if parts[-1].startswith('L'):
        try:
            int(parts[-1][1:])
            parts = parts[:-1]
        except ValueError:
            # Not "L" followed by digits: leave it alone
            pass
    return '_'.join(parts)
| 76 | |
def list_outputs(filen=None):
    """List the contents of the RESULTS output directory.

    Walks the 'RESULTS' directory under the current working directory
    and writes a numbered listing of its subdirectories and files.

    If 'filen' is supplied then it is the name of the file to write
    the listing to; otherwise the listing goes to stdout.
    """
    if filen is not None:
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    try:
        results_dir = os.path.abspath("RESULTS")
        fp.write("Listing contents of output dir %s:\n" % results_dir)
        ix = 0
        for d,dirs,files in os.walk(results_dir):
            ix += 1
            fp.write("-- %d: %s\n" % (ix,
                                      os.path.relpath(d,results_dir)))
            for f in files:
                ix += 1
                # Bug fix: 'f' from os.walk is a bare filename, so it
                # must be joined with its directory before taking the
                # path relative to RESULTS (previously relpath was
                # applied to the bare name, yielding '..'-filled paths)
                fp.write("---- %d: %s\n" %
                         (ix,
                          os.path.relpath(os.path.join(d,f),
                                          results_dir)))
    finally:
        # Close output file (never stdout)
        if filen is not None:
            fp.close()
| 99 | |
if __name__ == "__main__":
    # Main driver: wraps Amplicon_analysis_pipeline.sh for a Galaxy tool.
    #
    # Sets up the working directory (symlinks to the Metatable, the
    # optional Categories file and the input Fastq pairs), builds and
    # runs the pipeline command, then post-processes the outputs into
    # HTML index pages for Galaxy to display.
    #
    # NB 'print' is written as a single-argument function call so the
    # script behaves identically under Python 2 and Python 3 (the
    # original used Python-2-only print statements).

    # Command line
    print("Amplicon analysis: starting")
    p = argparse.ArgumentParser()
    p.add_argument("metatable",
                   metavar="METATABLE_FILE",
                   help="Metatable.txt file")
    p.add_argument("fastq_pairs",
                   metavar="SAMPLE_NAME FQ_R1 FQ_R2",
                   nargs="+",
                   default=list(),
                   help="Triplets of SAMPLE_NAME followed by "
                   "a R1/R2 FASTQ file pair")
    p.add_argument("-g",dest="forward_pcr_primer")
    p.add_argument("-G",dest="reverse_pcr_primer")
    p.add_argument("-q",dest="trimming_threshold")
    p.add_argument("-O",dest="minimum_overlap")
    p.add_argument("-L",dest="minimum_length")
    p.add_argument("-l",dest="sliding_window_length")
    p.add_argument("-P",dest="pipeline",
                   choices=["Vsearch","DADA2"],
                   type=str,
                   default="Vsearch")
    p.add_argument("-S",dest="use_silva",action="store_true")
    p.add_argument("-H",dest="use_homd",action="store_true")
    p.add_argument("-r",dest="reference_data_path")
    p.add_argument("-c",dest="categories_file")
    args = p.parse_args()

    # Build the environment for running the pipeline
    print("Amplicon analysis: building the environment")
    metatable_file = os.path.abspath(args.metatable)
    os.symlink(metatable_file,"Metatable.txt")
    print("-- made symlink to Metatable.txt")

    # Link to Categories.txt file (if provided)
    if args.categories_file is not None:
        categories_file = os.path.abspath(args.categories_file)
        os.symlink(categories_file,"Categories.txt")
        print("-- made symlink to Categories.txt")

    # Link to FASTQs and construct Final_name.txt file
    sample_names = []
    print("-- making Final_name.txt")
    with open("Final_name.txt",'w') as final_name:
        # fastq_pairs is a flat list of SAMPLE_NAME,R1,R2 triplets;
        # zipping the same iterator three times steps through it in threes
        fastqs = iter(args.fastq_pairs)
        for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs):
            sample_name = clean_up_name(sample_name)
            print(" %s" % sample_name)
            r1 = "%s_R1_.fastq" % sample_name
            r2 = "%s_R2_.fastq" % sample_name
            os.symlink(fqr1,r1)
            os.symlink(fqr2,r2)
            final_name.write("%s\n" % '\t'.join((r1,sample_name)))
            final_name.write("%s\n" % '\t'.join((r2,sample_name)))
            sample_names.append(sample_name)

    # Reference database: -S (Silva) takes precedence over -H (HOMD);
    # the default is GreenGenes ("gg")
    if args.use_silva:
        ref_database = "silva"
    elif args.use_homd:
        ref_database = "homd"
    else:
        ref_database = "gg"

    # Construct the pipeline command
    print("Amplicon analysis: constructing pipeline command")
    pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh")
    if args.forward_pcr_primer:
        pipeline.add_args("-g",args.forward_pcr_primer)
    if args.reverse_pcr_primer:
        pipeline.add_args("-G",args.reverse_pcr_primer)
    if args.trimming_threshold:
        pipeline.add_args("-q",args.trimming_threshold)
    if args.minimum_overlap:
        pipeline.add_args("-O",args.minimum_overlap)
    if args.minimum_length:
        pipeline.add_args("-L",args.minimum_length)
    if args.sliding_window_length:
        pipeline.add_args("-l",args.sliding_window_length)
    if args.reference_data_path:
        pipeline.add_args("-r",args.reference_data_path)
    pipeline.add_args("-P",args.pipeline)
    if ref_database == "silva":
        pipeline.add_args("-S")
    elif ref_database == "homd":
        pipeline.add_args("-H")

    # Echo the pipeline command to stdout
    print("Running %s" % pipeline)

    # Run the pipeline, capturing its stdout+stderr to pipeline.log
    with open("pipeline.log","w") as pipeline_out:
        try:
            subprocess.check_call(pipeline.cmd,
                                  stdout=pipeline_out,
                                  stderr=subprocess.STDOUT)
            exit_code = 0
            print("Pipeline completed ok")
        except subprocess.CalledProcessError as ex:
            # Non-zero exit status
            sys.stderr.write("Pipeline failed: exit code %s\n" %
                             ex.returncode)
            exit_code = ex.returncode
        except Exception as ex:
            # Some other problem (e.g. script not found)
            sys.stderr.write("Unexpected error: %s\n" % str(ex))
            exit_code = 1

    # Write out the list of outputs
    outputs_file = "Pipeline_outputs.txt"
    list_outputs(outputs_file)

    # Check for log file
    log_file = "Amplicon_analysis_pipeline.log"
    if os.path.exists(log_file):
        print("Found log file: %s" % log_file)
        if exit_code == 0:
            # Create an HTML file to link to log files etc
            # NB the paths to the files should be correct once
            # copied by Galaxy on job completion
            # (bug fix: '</head>' and '</ul>' closing tags were
            # previously written as '<head>' and '<ul>')
            with open("pipeline_outputs.html","w") as html_out:
                html_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: log files</title>
</head>
<body>
<h1>Amplicon analysis pipeline: log files</h1>
<ul>
""")
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Amplicon_analysis_pipeline.log",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("pipeline.log",type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Pipeline_outputs.txt",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Metatable.html"))
                html_out.write("""</ul>
</body>
</html>
""")
        else:
            # Check for known error messages
            check_errors()
            # Write pipeline stdout to tool stderr
            sys.stderr.write("\nOutput from pipeline:\n")
            with open("pipeline.log",'r') as log:
                sys.stderr.write("%s" % log.read())
            # Write log file contents to tool log
            print("\nAmplicon_analysis_pipeline.log:")
            with open(log_file,'r') as log:
                print("%s" % log.read())
    else:
        sys.stderr.write("ERROR missing log file \"%s\"\n" %
                         log_file)

    # Handle FastQC boxplots
    print("Amplicon analysis: collating per base quality boxplots")
    with open("fastqc_quality_boxplots.html","w") as quality_boxplots:
        # PHRED value for trimming (argparse supplies a string; default 20)
        phred_score = 20
        if args.trimming_threshold is not None:
            phred_score = args.trimming_threshold
        # Write header for HTML output file
        quality_boxplots.write("""<html>
<head>
<title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1>
""")
        # Look for raw and trimmed FastQC output for each sample
        for sample_name in sample_names:
            fastqc_dir = os.path.join(sample_name,"FastQC")
            quality_boxplots.write("<h2>%s</h2>" % sample_name)
            # NOTE(review): "cutdapt_sickle" looks like a typo for
            # "cutadapt_sickle", but it must match the directory name
            # the pipeline script actually creates -- confirm against
            # Amplicon_analysis_pipeline.sh before changing
            for d in ("Raw","cutdapt_sickle/Q%s" % phred_score):
                quality_boxplots.write("<h3>%s</h3>" % d)
                fastqc_html_files = glob.glob(
                    os.path.join(fastqc_dir,d,"*_fastqc.html"))
                if not fastqc_html_files:
                    quality_boxplots.write("<p>No FastQC outputs found</p>")
                    continue
                # Pull out the per-base quality boxplots
                for f in fastqc_html_files:
                    boxplot = None
                    with open(f) as fp:
                        # Split the FastQC report on '>' and look for the
                        # element carrying the per-base quality graph
                        for line in fp.read().split(">"):
                            try:
                                line.index("alt=\"Per base quality graph\"")
                                boxplot = line + ">"
                                break
                            except ValueError:
                                pass
                    if boxplot is None:
                        boxplot = "Missing plot"
                    quality_boxplots.write("<h4>%s</h4><p>%s</p>" %
                                           (os.path.basename(f),
                                            boxplot))
        quality_boxplots.write("""</body>
</html>
""")

    # Handle DADA2 error rate plot PDFs
    if args.pipeline == "DADA2":
        print("Amplicon analysis: collecting error rate plots")
        error_rate_plots_dir = os.path.abspath(
            os.path.join("DADA2_OTU_tables",
                         "Error_rate_plots"))
        error_rate_plot_pdfs = [os.path.basename(pdf)
                                for pdf in
                                sorted(glob.glob(
                                    os.path.join(error_rate_plots_dir,"*.pdf")))]
        with open("error_rate_plots.html","w") as error_rate_plots_out:
            error_rate_plots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: DADA2 Error Rate Plots</title>
</head>
<body>
<h1>Amplicon analysis pipeline: DADA2 Error Rate Plots</h1>
""")
            error_rate_plots_out.write("<ul>\n")
            for pdf in error_rate_plot_pdfs:
                error_rate_plots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Bug fix: close the list with '</ul>' (was '<ul>')
            error_rate_plots_out.write("</ul>\n")
            error_rate_plots_out.write("""</body>
</html>
""")

    # Handle additional output when categories file was supplied
    if args.categories_file is not None:
        # Alpha diversity boxplots
        print("Amplicon analysis: indexing alpha diversity boxplots")
        boxplots_dir = os.path.abspath(
            os.path.join("RESULTS",
                         "%s_%s" % (args.pipeline,
                                    ref_database),
                         "Alpha_diversity",
                         "Alpha_diversity_boxplot",
                         "Categories_shannon"))
        print("Amplicon analysis: gathering PDFs from %s" % boxplots_dir)
        boxplot_pdfs = [os.path.basename(pdf)
                        for pdf in
                        sorted(glob.glob(
                            os.path.join(boxplots_dir,"*.pdf")))]
        with open("alpha_diversity_boxplots.html","w") as boxplots_out:
            boxplots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1>
""")
            boxplots_out.write("<ul>\n")
            for pdf in boxplot_pdfs:
                boxplots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Bug fix: close the list with '</ul>' (was '<ul>')
            boxplots_out.write("</ul>\n")
            boxplots_out.write("""</body>
</html>
""")

    # Finish
    print("Amplicon analysis: finishing, exit code: %s" % exit_code)
    sys.exit(exit_code)
