amplicon_analysis_pipeline: amplicon_analysis

comparison amplicon_analysis_pipeline.xml @ 42:098ad1dd7760 draft

planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit 10be6f00106e853a6720e4052871d9d84e027137

author	pjbriggs
date	Thu, 05 Dec 2019 11:48:01 +0000
parents
children	4bfa62618f7c

comparison

equal deleted inserted replaced

-:7b9786a43a16
+:098ad1dd7760
+<tool id="amplicon_analysis_pipeline" name="Amplicon Analysis Pipeline" version="1.3.5.0">
+<description>analyse 16S rRNA data from Illumina Miseq paired-end reads</description>
+<!-- Software dependency resolved by Galaxy before the job runs -->
+<requirements>
+  <requirement type="package" version="1.3.5">amplicon_analysis_pipeline</requirement>
+</requirements>
+<!-- Any non-zero exit status (1 and above) marks the job as failed -->
+<stdio>
+  <exit_code range="1:" level="fatal" />
+</stdio>
+<command><![CDATA[
+## ------------------------------------------------------------------
+## Cheetah template that builds the shell command for the job:
+##   1. derive the reference database label used in the RESULTS/ paths
+##   2. invoke amplicon_analysis_pipeline.py with the selected options
+##   3. copy the pipeline products onto the Galaxy output datasets
+## The individual steps are chained with '&&' so a failing step stops
+## the chain and its non-zero exit status is reported to Galaxy.
+## NOTE: only '##' starts a Cheetah comment. A single '#' followed by
+## free text is emitted into the shell command (and would comment out
+## whatever follows it if the command is joined onto one line), so
+## all annotations in this template use '##'.
+## ------------------------------------------------------------------
+## Convenience variable for pipeline name
+#set $pipeline_name = $pipeline.pipeline_name
+## Set the reference database name
+#if str( $pipeline_name ) == "DADA2"
+## DADA2 has no reference database selector; results are under "silva"
+#set reference_database_name = "silva"
+#else
+#set reference_database = $pipeline.reference_database
+## Compare via str() like every other parameter test in this template
+#if str( $reference_database ) == "-S"
+#set reference_database_name = "silva"
+#else if str( $reference_database ) == "-H"
+#set reference_database_name = "homd"
+#else
+## Empty option value means GreenGenes
+#set reference_database_name = "gg"
+#end if
+#end if
+## Run the amplicon analysis pipeline wrapper
+python $__tool_directory__/amplicon_analysis_pipeline.py
+## Set options
+#if str( $forward_pcr_primer ) != ""
+-g "$forward_pcr_primer"
+#end if
+#if str( $reverse_pcr_primer ) != ""
+-G "$reverse_pcr_primer"
+#end if
+#if str( $trimming_threshold ) != ""
+-q $trimming_threshold
+#end if
+#if str( $sliding_window_length ) != ""
+-l $sliding_window_length
+#end if
+#if str( $minimum_overlap ) != ""
+-O $minimum_overlap
+#end if
+#if str( $minimum_length ) != ""
+-L $minimum_length
+#end if
+-P $pipeline_name
+## Escaped '$' so the shell (not Cheetah) expands the variable, falling
+## back to "ReferenceData" when it is unset
+-r \${AMPLICON_ANALYSIS_REF_DATA_PATH-ReferenceData}
+#if str( $pipeline_name ) != "DADA2"
+## Database switch (-S, -H or empty for GreenGenes)
+${reference_database}
+#end if
+#if str($categories_file_in) != 'None'
+-c "${categories_file_in}"
+#end if
+## Input files
+"${metatable_file_in}"
+## FASTQ pairs
+#if str($input_type.pairs_or_collection) == "collection"
+## Elements of the list:paired collection provide forward/reverse
+#for $fq_pair in $input_type.fastq_collection
+"${fq_pair.name}" "${fq_pair.forward}" "${fq_pair.reverse}"
+#end for
+#else
+## Repeat entries provide the parameters declared in the inputs
+## section: name, fastq_r1 and fastq_r2
+#for $fq_pair in $input_type.fastq_pairs
+"${fq_pair.name}" "${fq_pair.fastq_r1}" "${fq_pair.fastq_r2}"
+#end for
+#end if
+&&
+## Collect outputs
+cp Metatable_log/Metatable_mod.txt "${metatable_mod}" &&
+#if str( $pipeline_name ) == "Vsearch"
+## Vsearch-specific
+cp ${pipeline_name}_OTU_tables/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
+cp Multiplexed_files/${pipeline_name}_pipeline/multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta "${dereplicated_nonchimera_otus_fasta}" &&
+cp QUALITY_CONTROL/Reads_count.txt "$read_counts_out" &&
+#else
+## DADA2-specific
+cp ${pipeline_name}_OTU_tables/DADA2_tax_OTU_table.biom "${tax_otu_table_biom_file}" &&
+cp ${pipeline_name}_OTU_tables/seqs.fa "${dereplicated_nonchimera_otus_fasta}" &&
+#end if
+cp ${pipeline_name}_OTU_tables/otus.tre "${otus_tre_file}" &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/OTUs_count.txt "${otus_count_file}" &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/table_summary.txt "${table_summary_file}" &&
+cp fastqc_quality_boxplots.html "${fastqc_quality_boxplots_html}" &&
+## OTU table heatmap
+cp RESULTS/${pipeline_name}_${reference_database_name}/Heatmap.pdf "${heatmap_otu_table_pdf}" &&
+## HTML outputs
+## Each HTML dataset gets an extra-files directory for its assets;
+## 'mkdir -p' keeps the chain going if the directory already exists
+## Phylum genus barcharts
+mkdir -p $phylum_genus_dist_barcharts_html.files_path &&
+cp -r RESULTS/${pipeline_name}_${reference_database_name}/phylum_genus_charts/charts $phylum_genus_dist_barcharts_html.files_path &&
+cp -r RESULTS/${pipeline_name}_${reference_database_name}/phylum_genus_charts/raw_data $phylum_genus_dist_barcharts_html.files_path &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/phylum_genus_charts/bar_charts.html "${phylum_genus_dist_barcharts_html}" &&
+## Beta diversity weighted 2d plots
+mkdir -p $beta_div_even_weighted_2d_plots.files_path &&
+cp -r RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/weighted_2d_plot/* $beta_div_even_weighted_2d_plots.files_path &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/weighted_2d_plot/weighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_weighted_2d_plots}" &&
+## Beta diversity unweighted 2d plots
+mkdir -p $beta_div_even_unweighted_2d_plots.files_path &&
+cp -r RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/unweighted_2d_plot/* $beta_div_even_unweighted_2d_plots.files_path &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/beta_div_even/unweighted_2d_plot/unweighted_unifrac_pc_2D_PCoA_plots.html "${beta_div_even_unweighted_2d_plots}" &&
+## Alpha diversity rarefaction plots
+mkdir -p $alpha_div_rarefaction_plots.files_path &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/Alpha_diversity/rarefaction_curves/rarefaction_plots.html "$alpha_div_rarefaction_plots" &&
+cp -r RESULTS/${pipeline_name}_${reference_database_name}/Alpha_diversity/rarefaction_curves/average_plots $alpha_div_rarefaction_plots.files_path &&
+## DADA2 error rate plots
+#if str($pipeline_name) == "DADA2"
+mkdir -p $dada2_error_rate_plots.files_path &&
+cp DADA2_OTU_tables/Error_rate_plots/error_rate_plots.html "$dada2_error_rate_plots" &&
+cp -r DADA2_OTU_tables/Error_rate_plots/*.pdf $dada2_error_rate_plots.files_path &&
+#end if
+## Categories data (only produced when a categories file was supplied)
+#if str($categories_file_in) != 'None'
+## Alpha diversity boxplots
+mkdir -p $alpha_div_boxplots.files_path &&
+cp alpha_diversity_boxplots.html "$alpha_div_boxplots" &&
+cp RESULTS/${pipeline_name}_${reference_database_name}/Alpha_diversity/Alpha_diversity_boxplot/Categories_shannon/*.pdf $alpha_div_boxplots.files_path &&
+#end if
+## Pipeline outputs (log files etc)
+mkdir -p $log_files.files_path &&
+cp Amplicon_analysis_pipeline.log $log_files.files_path &&
+cp pipeline.log $log_files.files_path &&
+cp Pipeline_outputs.txt $log_files.files_path &&
+cp Metatable_log/Metatable.html $log_files.files_path &&
+cp pipeline_outputs.html "$log_files"
+]]></command>
+<inputs>
+  <!-- Free-text tag inserted into the output dataset labels -->
+  <param name="title" type="text" value="test" size="25"
+         label="Title"
+         help="Optional text that will be added to the output dataset names" />
+  <!-- Sample metadata table and optional list of categories -->
+  <param name="metatable_file_in" type="data" format="tabular"
+         label="Input Metatable.txt file" />
+  <param name="categories_file_in" type="data" format="txt" optional="true"
+         label="Input Categories.txt file"
+         help="(optional)" />
+  <!-- FASTQ input: either a list:paired collection or explicit R1/R2 pairs -->
+  <conditional name="input_type">
+    <param name="pairs_or_collection" type="select"
+           label="Input FASTQ type">
+      <option value="pairs_of_files">Pairs of datasets</option>
+      <option value="collection" selected="true">Dataset pairs in a collection</option>
+    </param>
+    <when value="collection">
+      <param name="fastq_collection" type="data_collection"
+             format="fastqsanger,fastq" collection_type="list:paired"
+             label="Collection of FASTQ forward and reverse (R1/R2) pairs"
+             help="Each FASTQ pair will be treated as one sample; the name of each sample will be taken from the first column of the Metatable file " />
+    </when>
+    <when value="pairs_of_files">
+      <repeat name="fastq_pairs" title="Input fastq pairs" min="1">
+        <param name="name" type="text" value=""
+               label="Final name for FASTQ pair" />
+        <param name="fastq_r1" type="data" format="fastqsanger,fastq"
+               label="FASTQ with forward reads (R1)" />
+        <param name="fastq_r2" type="data" format="fastqsanger,fastq"
+               label="FASTQ with reverse reads (R2)" />
+      </repeat>
+    </when>
+  </conditional>
+  <!-- Primer sequences; the matching option is only passed when non-empty -->
+  <param name="forward_pcr_primer" type="text" value=""
+         label="Forward PCR primer sequence"
+         help="Optional; must not include barcode or adapter sequence (-g)" />
+  <param name="reverse_pcr_primer" type="text" value=""
+         label="Reverse PCR primer sequence"
+         help="Optional; must not include barcode or adapter sequence (-G)" />
+  <!-- Numeric thresholds; the option letter is given in each help text -->
+  <param name="trimming_threshold" type="integer" value="20"
+         label="Threshold quality below which read will be trimmed"
+         help="Phred score; default is 20 (-q)" />
+  <param name="minimum_overlap" type="integer" value="10"
+         label="Minimum overlap in bp between forward and reverse reads"
+         help="Default is 10 (-O)" />
+  <param name="minimum_length" type="integer" value="200"
+         label="Minimum length in bp to keep sequence after overlapping"
+         help="Default is 200 (-L)" />
+  <param name="sliding_window_length" type="integer" value="10"
+         label="Minimum length in bp to retain a read after trimming"
+         help="Supplied to Sickle; default is 10 (-l)" />
+  <!-- Pipeline choice; only Vsearch offers a reference database selector -->
+  <conditional name="pipeline">
+    <param name="pipeline_name" type="select"
+           label="Pipeline to use for analysis">
+      <option value="Vsearch" selected="true">Vsearch</option>
+      <option value="DADA2">DADA2</option>
+    </param>
+    <when value="Vsearch">
+      <param name="reference_database" type="select"
+             label="Reference database">
+        <option value="" selected="true">GreenGenes</option>
+        <option value="-S">Silva</option>
+        <option value="-H">Human Oral Microbiome Database (HOMD)</option>
+      </param>
+    </when>
+    <when value="DADA2">
+    </when>
+  </conditional>
+</inputs>
+<outputs>
+  <!-- Tabular, BIOM, FASTA and PDF products -->
+  <data name="metatable_mod" format="tabular"
+        label="${tool.name}:${title} Metatable_mod.txt" />
+  <!-- Created only when the Vsearch pipeline is selected -->
+  <data name="read_counts_out" format="tabular"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} read counts">
+    <filter>pipeline['pipeline_name'] == 'Vsearch'</filter>
+  </data>
+  <data name="tax_otu_table_biom_file" format="biom"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} tax OTU table (biom format)" />
+  <data name="otus_tre_file" format="tabular"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} otus.tre" />
+  <data name="phylum_genus_dist_barcharts_html" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} phylum genus dist barcharts HTML" />
+  <data name="otus_count_file" format="tabular"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} OTUs count file" />
+  <data name="table_summary_file" format="tabular"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} table summary file" />
+  <data name="dereplicated_nonchimera_otus_fasta" format="fasta"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} multiplexed linearized dereplicated mc2 repset nonchimeras OTUs FASTA" />
+  <data name="fastqc_quality_boxplots_html" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} FastQC per-base quality boxplots HTML" />
+  <data name="heatmap_otu_table_pdf" format="pdf"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} heatmap OTU table PDF" />
+  <!-- HTML reports -->
+  <data name="beta_div_even_weighted_2d_plots" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} beta diversity weighted 2D plots HTML" />
+  <data name="beta_div_even_unweighted_2d_plots" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} beta diversity unweighted 2D plots HTML" />
+  <data name="alpha_div_rarefaction_plots" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} alpha diversity rarefaction plots HTML" />
+  <!-- Created only when the DADA2 pipeline is selected -->
+  <data name="dada2_error_rate_plots" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} DADA2 error rate plots">
+    <filter>pipeline['pipeline_name'] == 'DADA2'</filter>
+  </data>
+  <!-- Created only when a categories file was supplied -->
+  <data name="alpha_div_boxplots" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} alpha diversity boxplots">
+    <filter>categories_file_in is not None</filter>
+  </data>
+  <data name="log_files" format="html"
+        label="${tool.name} (${pipeline.pipeline_name}):${title} log files" />
+</outputs>
+<tests>
+</tests>
+<help><![CDATA[
+What it does
+------------
+This pipeline has been designed for the analysis of 16S rRNA data from
+Illumina Miseq (Casava >= 1.8) paired-end reads.
+Usage
+-----
+1. Preparation of the mapping file and format of unique sample id
+*****************************************************************
+Before using the amplicon analysis pipeline it is necessary to follow
+the steps below to avoid analysis failures and ensure samples are
+labelled appropriately. Sample names for the labelling are derived
+from the names of the fastq files that are generated from the
+sequencing. The labels will include everything between the beginning
+of the name and the sample number (from C11 to S19 in Fig. 1)
+.. image:: Pipeline_description_Fig1.png
+:height: 46
+:width: 382
+**Figure 1**
+If analysing 16S data from multiple runs:
+The samples from different runs may have identical IDs. For example,
+when sequencing the same samples twice, by chance, these could be at
+the same position in both the runs. This would cause the fastq files
+to have exactly the same IDs (Fig. 2).
+.. image:: Pipeline_description_Fig2.png
+:height: 100
+:width: 463
+**Figure 2**
+In case of identical sample IDs the pipeline will fail to run and
+generate an error at the beginning of the analysis.
+To avoid having to change the file names, before uploading the files,
+ensure that the samples IDs are not repeated.
+2. To upload the file
+*********************
+Click on **Get Data/Upload File** from the Galaxy tool panel on the
+left hand side.
+From the pop-up window, choose how to upload the file. The
+**Choose local file** option can be used for files up to 4Gb. Fastq files
+from Illumina MiSeq will rarely be bigger than 4Gb and this option is
+recommended.
+After choosing the files click **Start** to begin the upload. The window can
+now be closed and the files will be uploaded onto the Galaxy server. You
+will see the progress on the ``HISTORY`` panel on the right
+side of the screen. The colour will change from grey (queuing), to yellow
+(uploading) and finally green (uploaded).
+Once all the files are uploaded, click on the operations on multiple
+datasets icon and select the fastq files that need to be analysed.
+Click on the tab **For all selected...** and on the option
+**Build List of Dataset pairs** (Fig. 3).
+.. image:: Pipeline_description_Fig3.png
+:height: 247
+:width: 586
+**Figure 3**
+Change the filter parameter ``_1`` and ``_2`` to be ``_R1`` and ``_R2``.
+The fastq files forward R1 and reverse R2 should now appear in the
+corresponding columns.
+Select **Autopair**. This creates a collection of paired fastq files for
+the forward and reverse reads for each sample. The name of the pairs will
+be the ones used by the pipeline. You are free to change the names at this
+point as long as they are the same used in the Metatable file
+(see section 3).
+Name the collection and click on **create list**. This reduces the time
+required to input the forward and reverse reads for each individual sample.
+3. Create the Metatable files
+*****************************
+Metatable.txt
+~~~~~~~~~~~~~
+Click on the list of pairs you just created to see the name of the single
+pairs. The name of the pairs will be the ones used by the pipeline,
+therefore, these are the names that need to be used in the Metatable file.
+The Metatable file has to be in QIIME format. You can find a description
+of it on the QIIME website http://qiime.org/documentation/file_formats.html
+EXAMPLE::
+#SampleID    BarcodeSequence    LinkerPrimerSequence    Disease    Gender    Description
+Mock-RUN1    TAAGGCGAGCGTAAGA                           PsA        Male      Control
+Mock-RUN2    CGTACTAGGCGTAAGA                           PsA        Male      Control
+Mock-RUN3    AGGCAGAAGCGTAAGA                           PsC        Female    Control
+Briefly: the column ``LinkerPrimerSequence`` is empty but it cannot be
+deleted. The header is very important. ``#SampleID``, ``BarcodeSequence``,
+``LinkerPrimerSequence`` and ``Description`` are mandatory. Between
+``LinkerPrimerSequence`` and ``Description`` you can add as many columns
+as you want. For every column a PCoA plot will be created (see
+**Results** section). You can create this file in Excel and it will have
+to be saved as ``Text(Tab delimited)``.
+During the analysis the Metatable.txt will be checked to ensure that the
+file has the correct format. If necessary, this will be modified and will
+be available as ``Metatable_mod.txt`` in the history panel. If you are
+going to use the metatable file for any other statistical analyses,
+remember to use the ``Metatable_mod.txt`` one, otherwise the sample
+names might not match!
+Categories.txt (optional)
+~~~~~~~~~~~~~~~~~~~~~~~~~
+This file is required if you want to get box plots for comparison of
+alpha diversity indices (see **Results** section). The file is a list
+(without header and IN ONE COLUMN) of categories present in the
+Metatable.txt file. THE NAMES YOU ARE USING HAVE TO BE THE SAME AS THE
+ONES USED IN THE METATABLE.TXT. You can create this file in Excel and
+it will have to be saved as ``Text(Tab delimited)``.
+EXAMPLE::
+Disease
+Gender
+Metatable and categories files can be uploaded using Get Data as done
+with the fastq files.
+4. Analysis
+***********
+Under **Amplicon_Analysis_Pipeline**
+* **Title** Name to distinguish between the runs. It will be shown at
+the beginning of each output file name.
+* **Input Metatable.txt file** Select the Metatable.txt file related to
+this analysis
+* **Input Categories.txt file (Optional)** Select the Categories.txt file
+related to this analysis
+* **Input FASTQ type** select *Dataset pairs in a collection* and, then,
+the collection of pairs you created earlier.
+* **Forward/Reverse PCR primer sequence** if the PCR primer sequences
+have not been removed from the MiSeq during the fastq creation, they
+have to be removed before the analysis. Insert the PCR primer sequence
+in the corresponding field. DO NOT include any barcode or adapter
+sequence. If the PCR primers have been already trimmed by the MiSeq,
+and you include the sequence in this field, this would lead to an error.
+Only include the sequences if still present in the fastq files.
+* **Threshold quality below which reads will be trimmed** Choose the
+Phred score used by Sickle to trim the reads at the 3’ end.
+* **Minimum length to retain a read after trimming** If the read length
+after trimming is shorter than a user defined length, the read, along
+with the corresponding read pair, will be discarded.
+* **Minimum overlap in bp between forward and reverse reads** Choose the
+minimum basepair overlap used by Pandaseq to assemble the reads.
+Default is 10.
+* **Minimum length in bp to keep a sequence after overlapping** Choose the
+minimum sequence length used by Pandaseq to keep a sequence after the
+overlapping. This depends on the expected amplicon length. The tool form
+defaults to 200; a value of 380 is used for V3-V4 16S sequencing
+(expected length ~440bp)
+* **Pipeline to use for analysis** Choose the pipeline to use for OTU
+clustering and chimera removal. The Galaxy tool supports the ``Vsearch``
+and ``DADA2`` pipelines.
+* **Reference database** Choose between ``GreenGenes``, ``Silva`` or
+``HOMD`` (Human Oral Microbiome Database) for taxa assignment.
+Click on **Execute** to start the analysis.
+5. Results
+**********
+Results are entirely generated using QIIME scripts. The results will
+appear in the History panel when the analysis is completed.
+The following outputs are captured:
+* **Vsearch_tax_OTU_table.biom|DADA2_tax_OTU_table.biom (biom format)**
+The OTU table in BIOM format (http://biom-format.org/)
+* **otus.tre** Phylogenetic tree constructed using ``make_phylogeny.py``
+(fasttree) QIIME script (http://qiime.org/scripts/make_phylogeny.html)
+* **Phylum_genus_dist_barcharts_HTML** HTML file with bar charts at
+Phylum, Genus and Species level
+(http://qiime.org/scripts/summarize_taxa.html and
+http://qiime.org/scripts/plot_taxa_summary.html)
+* **OTUs_count_file** Summary of OTU counts per sample
+(http://biom-format.org/documentation/summarizing_biom_tables.html)
+* **Table_summary_file** Summary of sequences counts per sample
+(http://biom-format.org/documentation/summarizing_biom_tables.html)
+* **multiplexed_linearized_dereplicated_mc2_repset_nonchimeras_OTUs.fasta|seqs.fa**
+Fasta file with OTU sequences (Vsearch|DADA2)
+* **Heatmap_PDF** OTU heatmap in PDF format
+(http://qiime.org/1.8.0/scripts/make_otu_heatmap_html.html )
+* **Vsearch_beta_diversity_weighted_2D_plots_HTML** PCoA plots in HTML
+format using weighted Unifrac distance measure. Samples are grouped
+by the column names present in the Metatable file. The samples are
+firstly rarefied to the minimum sequencing depth
+(http://qiime.org/scripts/beta_diversity_through_plots.html )
+* **Vsearch_beta_diversity_unweighted_2D_plots_HTML** PCoA plots in HTML
+format using Unweighted Unifrac distance measure. Samples are grouped
+by the column names present in the Metatable file. The samples are
+firstly rarefied to the minimum sequencing depth
+(http://qiime.org/scripts/beta_diversity_through_plots.html )
+Code availability
+-----------------
+**Code is available at** https://github.com/MTutino/Amplicon_analysis
+Credits
+-------
+Pipeline author: Mauro Tutino
+Galaxy tool: Peter Briggs
+	]]></help>
+<citations>
+<!-- BibTeX entry citing the Amplicon_analysis GitHub repository -->
+<citation type="bibtex">
+@misc{githubAmplicon_analysis,
+author = {Tutino, Mauro},
+year = {2017},
+title = {Amplicon Analysis Pipeline},
+publisher = {GitHub},
+journal = {GitHub repository},
+url = {https://github.com/MTutino/Amplicon_analysis},
+}</citation>
+</citations>
+</tool>

Mercurial > repos > pjbriggs > amplicon_analysis_pipeline

comparison amplicon_analysis_pipeline.xml @ 42:098ad1dd7760 draft