Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
comparison amplicon_analysis_pipeline.py @ 42:098ad1dd7760 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit 10be6f00106e853a6720e4052871d9d84e027137
| author | pjbriggs |
|---|---|
| date | Thu, 05 Dec 2019 11:48:01 +0000 |
| parents | |
| children | 1bc0077b2c91 |
comparison
equal
deleted
inserted
replaced
| 41:7b9786a43a16 | 42:098ad1dd7760 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # | |
| 3 # Wrapper script to run Amplicon_analysis_pipeline.sh | |
| 4 # from Galaxy tool | |
| 5 | |
| 6 import sys | |
| 7 import os | |
| 8 import argparse | |
| 9 import subprocess | |
| 10 import glob | |
| 11 | |
class PipelineCmd(object):
    """Helper for assembling a command line to hand to subprocess.

    The command is held as a list of strings in the 'cmd' attribute,
    suitable for passing directly to subprocess.check_call.
    """
    def __init__(self, cmd):
        # The executable name is the first element of the list
        self.cmd = [str(cmd)]
    def add_args(self, *args):
        """Append one or more arguments, converting each to a string."""
        self.cmd.extend(str(a) for a in args)
    def __repr__(self):
        # Space-separated rendering, handy for echoing to a log
        return ' '.join(str(a) for a in self.cmd)
| 20 | |
def ahref(target,name=None,type=None):
    """Return an HTML anchor element linking to 'target'.

    'name' is the link text (defaults to the basename of 'target');
    'type' optionally sets the anchor's MIME type attribute.
    """
    link_text = os.path.basename(target) if name is None else name
    anchor = "<a href='%s'" % target
    if type is not None:
        anchor += " type='%s'" % type
    anchor += ">%s</a>" % link_text
    return anchor
| 29 | |
def check_errors():
    """Scan the pipeline log files for known failure modes.

    Reads 'Amplicon_analysis_pipeline.log' and 'pipeline.log' from the
    current directory and reports a boxed message to stderr (via
    print_error) for each recognised error condition.
    """
    # Errors reported in Amplicon_analysis_pipeline.log
    with open('Amplicon_analysis_pipeline.log','r') as fh:
        main_log = fh.read()
    if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in main_log:
        print_error("""*** Sample IDs don't match dataset names ***

The sample IDs (first column of the Metatable file) don't match the
supplied sample names for the input Fastq pairs.
""")
    # Errors reported in the captured pipeline stdout/stderr
    with open('pipeline.log','r') as fh:
        captured_log = fh.read()
    if "Errors and/or warnings detected in mapping file" in captured_log:
        # Echo the Metatable log file to the tool log
        with open("Metatable_log/Metatable.log","r") as metatable_log:
            print_error("""*** Error in Metatable mapping file ***

%s""" % metatable_log.read())
    elif "No header line was found in mapping file" in captured_log:
        # Report error to the tool log
        print_error("""*** No header in Metatable mapping file ***

Check you've specified the correct file as the input Metatable""")
| 54 | |
def print_error(message):
    """Write 'message' to stderr framed in a box of asterisks."""
    lines = message.split('\n')
    # Box width: longest line plus "* " and " *" borders
    width = 4 + max(len(line) for line in lines)
    border = '*' * width
    sys.stderr.write("\n%s\n" % border)
    for line in lines:
        padding = ' ' * (width - len(line) - 4)
        sys.stderr.write("* %s%s *\n" % (line, padding))
    sys.stderr.write("%s\n\n" % border)
| 61 | |
def clean_up_name(sample):
    """Return a cleaned-up sample name from a Fastq file name.

    Strips file extensions (everything from the first '.') and any
    trailing "_L<number>_001"-style lane/read-group suffix.
    """
    # Keep only the text before the first dot (drops .fastq, .fq.gz etc)
    base = sample.split('.')[0]
    parts = base.split('_')
    # Drop a trailing "001" read-group component
    if parts[-1] == "001":
        parts = parts[:-1]
    # Drop a trailing lane identifier of the form L<number>
    if parts[-1].startswith('L'):
        try:
            int(parts[-1][1:])
            parts = parts[:-1]
        except ValueError:
            # Not "L" followed by digits: leave it alone
            pass
    return '_'.join(parts)
| 76 | |
def list_outputs(filen=None):
    """List the contents of the RESULTS output directory.

    Walks the 'RESULTS' directory under the current working directory
    and writes a numbered listing of its subdirectories and files.

    If 'filen' is supplied then it is the name of the file to write
    the listing to; otherwise the listing goes to stdout.
    """
    if filen is not None:
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    try:
        results_dir = os.path.abspath("RESULTS")
        fp.write("Listing contents of output dir %s:\n" % results_dir)
        ix = 0
        for d,dirs,files in os.walk(results_dir):
            ix += 1
            fp.write("-- %d: %s\n" % (ix,
                                      os.path.relpath(d,results_dir)))
            for f in files:
                ix += 1
                # Bug fix: 'f' from os.walk is a bare filename, so it
                # must be joined with its directory before taking the
                # path relative to RESULTS (previously relpath was
                # applied to the bare name, yielding '..'-filled paths)
                fp.write("---- %d: %s\n" %
                         (ix,
                          os.path.relpath(os.path.join(d,f),
                                          results_dir)))
    finally:
        # Close output file (never stdout)
        if filen is not None:
            fp.close()
| 99 | |
if __name__ == "__main__":
    # Main driver: wraps Amplicon_analysis_pipeline.sh for a Galaxy tool.
    #
    # Sets up the working directory (symlinks to the Metatable, the
    # optional Categories file and the input Fastq pairs), builds and
    # runs the pipeline command, then post-processes the outputs into
    # HTML index pages for Galaxy to display.
    #
    # NB 'print' is written as a single-argument function call so the
    # script behaves identically under Python 2 and Python 3 (the
    # original used Python-2-only print statements).

    # Command line
    print("Amplicon analysis: starting")
    p = argparse.ArgumentParser()
    p.add_argument("metatable",
                   metavar="METATABLE_FILE",
                   help="Metatable.txt file")
    p.add_argument("fastq_pairs",
                   metavar="SAMPLE_NAME FQ_R1 FQ_R2",
                   nargs="+",
                   default=list(),
                   help="Triplets of SAMPLE_NAME followed by "
                   "a R1/R2 FASTQ file pair")
    p.add_argument("-g",dest="forward_pcr_primer")
    p.add_argument("-G",dest="reverse_pcr_primer")
    p.add_argument("-q",dest="trimming_threshold")
    p.add_argument("-O",dest="minimum_overlap")
    p.add_argument("-L",dest="minimum_length")
    p.add_argument("-l",dest="sliding_window_length")
    p.add_argument("-P",dest="pipeline",
                   choices=["Vsearch","DADA2"],
                   type=str,
                   default="Vsearch")
    p.add_argument("-S",dest="use_silva",action="store_true")
    p.add_argument("-H",dest="use_homd",action="store_true")
    p.add_argument("-r",dest="reference_data_path")
    p.add_argument("-c",dest="categories_file")
    args = p.parse_args()

    # Build the environment for running the pipeline
    print("Amplicon analysis: building the environment")
    metatable_file = os.path.abspath(args.metatable)
    os.symlink(metatable_file,"Metatable.txt")
    print("-- made symlink to Metatable.txt")

    # Link to Categories.txt file (if provided)
    if args.categories_file is not None:
        categories_file = os.path.abspath(args.categories_file)
        os.symlink(categories_file,"Categories.txt")
        print("-- made symlink to Categories.txt")

    # Link to FASTQs and construct Final_name.txt file
    sample_names = []
    print("-- making Final_name.txt")
    with open("Final_name.txt",'w') as final_name:
        # fastq_pairs is a flat list of SAMPLE_NAME,R1,R2 triplets;
        # zipping the same iterator three times steps through it in threes
        fastqs = iter(args.fastq_pairs)
        for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs):
            sample_name = clean_up_name(sample_name)
            print(" %s" % sample_name)
            r1 = "%s_R1_.fastq" % sample_name
            r2 = "%s_R2_.fastq" % sample_name
            os.symlink(fqr1,r1)
            os.symlink(fqr2,r2)
            final_name.write("%s\n" % '\t'.join((r1,sample_name)))
            final_name.write("%s\n" % '\t'.join((r2,sample_name)))
            sample_names.append(sample_name)

    # Reference database: -S (Silva) takes precedence over -H (HOMD);
    # the default is GreenGenes ("gg")
    if args.use_silva:
        ref_database = "silva"
    elif args.use_homd:
        ref_database = "homd"
    else:
        ref_database = "gg"

    # Construct the pipeline command
    print("Amplicon analysis: constructing pipeline command")
    pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh")
    if args.forward_pcr_primer:
        pipeline.add_args("-g",args.forward_pcr_primer)
    if args.reverse_pcr_primer:
        pipeline.add_args("-G",args.reverse_pcr_primer)
    if args.trimming_threshold:
        pipeline.add_args("-q",args.trimming_threshold)
    if args.minimum_overlap:
        pipeline.add_args("-O",args.minimum_overlap)
    if args.minimum_length:
        pipeline.add_args("-L",args.minimum_length)
    if args.sliding_window_length:
        pipeline.add_args("-l",args.sliding_window_length)
    if args.reference_data_path:
        pipeline.add_args("-r",args.reference_data_path)
    pipeline.add_args("-P",args.pipeline)
    if ref_database == "silva":
        pipeline.add_args("-S")
    elif ref_database == "homd":
        pipeline.add_args("-H")

    # Echo the pipeline command to stdout
    print("Running %s" % pipeline)

    # Run the pipeline, capturing its stdout+stderr to pipeline.log
    with open("pipeline.log","w") as pipeline_out:
        try:
            subprocess.check_call(pipeline.cmd,
                                  stdout=pipeline_out,
                                  stderr=subprocess.STDOUT)
            exit_code = 0
            print("Pipeline completed ok")
        except subprocess.CalledProcessError as ex:
            # Non-zero exit status
            sys.stderr.write("Pipeline failed: exit code %s\n" %
                             ex.returncode)
            exit_code = ex.returncode
        except Exception as ex:
            # Some other problem (e.g. script not found)
            sys.stderr.write("Unexpected error: %s\n" % str(ex))
            exit_code = 1

    # Write out the list of outputs
    outputs_file = "Pipeline_outputs.txt"
    list_outputs(outputs_file)

    # Check for log file
    log_file = "Amplicon_analysis_pipeline.log"
    if os.path.exists(log_file):
        print("Found log file: %s" % log_file)
        if exit_code == 0:
            # Create an HTML file to link to log files etc
            # NB the paths to the files should be correct once
            # copied by Galaxy on job completion
            # (bug fix: '</head>' and '</ul>' closing tags were
            # previously written as '<head>' and '<ul>')
            with open("pipeline_outputs.html","w") as html_out:
                html_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: log files</title>
</head>
<body>
<h1>Amplicon analysis pipeline: log files</h1>
<ul>
""")
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Amplicon_analysis_pipeline.log",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("pipeline.log",type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Pipeline_outputs.txt",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Metatable.html"))
                html_out.write("""</ul>
</body>
</html>
""")
        else:
            # Check for known error messages
            check_errors()
            # Write pipeline stdout to tool stderr
            sys.stderr.write("\nOutput from pipeline:\n")
            with open("pipeline.log",'r') as log:
                sys.stderr.write("%s" % log.read())
            # Write log file contents to tool log
            print("\nAmplicon_analysis_pipeline.log:")
            with open(log_file,'r') as log:
                print("%s" % log.read())
    else:
        sys.stderr.write("ERROR missing log file \"%s\"\n" %
                         log_file)

    # Handle FastQC boxplots
    print("Amplicon analysis: collating per base quality boxplots")
    with open("fastqc_quality_boxplots.html","w") as quality_boxplots:
        # PHRED value for trimming (argparse supplies a string; default 20)
        phred_score = 20
        if args.trimming_threshold is not None:
            phred_score = args.trimming_threshold
        # Write header for HTML output file
        quality_boxplots.write("""<html>
<head>
<title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1>
""")
        # Look for raw and trimmed FastQC output for each sample
        for sample_name in sample_names:
            fastqc_dir = os.path.join(sample_name,"FastQC")
            quality_boxplots.write("<h2>%s</h2>" % sample_name)
            # NOTE(review): "cutdapt_sickle" looks like a typo for
            # "cutadapt_sickle", but it must match the directory name
            # the pipeline script actually creates -- confirm against
            # Amplicon_analysis_pipeline.sh before changing
            for d in ("Raw","cutdapt_sickle/Q%s" % phred_score):
                quality_boxplots.write("<h3>%s</h3>" % d)
                fastqc_html_files = glob.glob(
                    os.path.join(fastqc_dir,d,"*_fastqc.html"))
                if not fastqc_html_files:
                    quality_boxplots.write("<p>No FastQC outputs found</p>")
                    continue
                # Pull out the per-base quality boxplots
                for f in fastqc_html_files:
                    boxplot = None
                    with open(f) as fp:
                        # Split the FastQC report on '>' and look for the
                        # element carrying the per-base quality graph
                        for line in fp.read().split(">"):
                            try:
                                line.index("alt=\"Per base quality graph\"")
                                boxplot = line + ">"
                                break
                            except ValueError:
                                pass
                    if boxplot is None:
                        boxplot = "Missing plot"
                    quality_boxplots.write("<h4>%s</h4><p>%s</p>" %
                                           (os.path.basename(f),
                                            boxplot))
        quality_boxplots.write("""</body>
</html>
""")

    # Handle DADA2 error rate plot PDFs
    if args.pipeline == "DADA2":
        print("Amplicon analysis: collecting error rate plots")
        error_rate_plots_dir = os.path.abspath(
            os.path.join("DADA2_OTU_tables",
                         "Error_rate_plots"))
        error_rate_plot_pdfs = [os.path.basename(pdf)
                                for pdf in
                                sorted(glob.glob(
                                    os.path.join(error_rate_plots_dir,"*.pdf")))]
        with open("error_rate_plots.html","w") as error_rate_plots_out:
            error_rate_plots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: DADA2 Error Rate Plots</title>
</head>
<body>
<h1>Amplicon analysis pipeline: DADA2 Error Rate Plots</h1>
""")
            error_rate_plots_out.write("<ul>\n")
            for pdf in error_rate_plot_pdfs:
                error_rate_plots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Bug fix: close the list with '</ul>' (was '<ul>')
            error_rate_plots_out.write("</ul>\n")
            error_rate_plots_out.write("""</body>
</html>
""")

    # Handle additional output when categories file was supplied
    if args.categories_file is not None:
        # Alpha diversity boxplots
        print("Amplicon analysis: indexing alpha diversity boxplots")
        boxplots_dir = os.path.abspath(
            os.path.join("RESULTS",
                         "%s_%s" % (args.pipeline,
                                    ref_database),
                         "Alpha_diversity",
                         "Alpha_diversity_boxplot",
                         "Categories_shannon"))
        print("Amplicon analysis: gathering PDFs from %s" % boxplots_dir)
        boxplot_pdfs = [os.path.basename(pdf)
                        for pdf in
                        sorted(glob.glob(
                            os.path.join(boxplots_dir,"*.pdf")))]
        with open("alpha_diversity_boxplots.html","w") as boxplots_out:
            boxplots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1>
""")
            boxplots_out.write("<ul>\n")
            for pdf in boxplot_pdfs:
                boxplots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Bug fix: close the list with '</ul>' (was '<ul>')
            boxplots_out.write("</ul>\n")
            boxplots_out.write("""</body>
</html>
""")

    # Finish
    print("Amplicon analysis: finishing, exit code: %s" % exit_code)
    sys.exit(exit_code)
