diff pal_finder_wrapper.xml @ 15:a3af1ff4cad1 draft

pal_finder 0.02.04.7 for testing.
author pjbriggs
date Mon, 14 May 2018 11:10:19 -0400
parents 3f8bf1a0403b
children f7d63032217b
line wrap: on
line diff
--- a/pal_finder_wrapper.xml	Thu Mar 22 07:21:26 2018 -0400
+++ b/pal_finder_wrapper.xml	Mon May 14 11:10:19 2018 -0400
@@ -9,7 +9,7 @@
     <requirement type="package" version="1.65">biopython</requirement>
     <requirement type="package" version="2.8.1">pandaseq</requirement>
   </requirements>
-  <command><![CDATA[
+  <command detect_errors="exit_code"><![CDATA[
   @CONDA_PAL_FINDER_SCRIPT_DIR@ &&
   @CONDA_PAL_FINDER_DATA_DIR@ &&
   bash $__tool_directory__/pal_finder_wrapper.sh
@@ -64,6 +64,10 @@
     #if str( $platform.assembly ) == '-assembly'
       $platform.assembly "$output_assembly"
     #end if
+    #set $use_all_reads = $platform.subset_conditional.use_all_reads
+    #if str( $use_all_reads ) != "yes"
+      --subset "$platform.subset_conditional.subset"
+    #end if
   #end if
   ]]></command>
   <inputs>
@@ -91,6 +95,13 @@
 		   label="Select FASTQ dataset collection with R1/R2 pair" />
 	  </when>
 	</conditional>
+	<conditional name="subset_conditional">
+	  <param name="use_all_reads" type="boolean" label="Use all reads for microsatellite detection?" checked="True" truevalue="yes" falsevalue="no" />
+	  <when value="no">
+	    <param name="subset" type="text" value="0.5" label="Number or fraction of reads to use" help="Either an integer number of reads or a decimal fraction (e.g. 0.5 to select 50% of reads)" />
+	  </when>
+	  <when value="yes" />
+	</conditional>
 	<param name="filters" type="select" display="checkboxes"
 	       multiple="True" label="Filters to apply to the pal_finder results"
 	       help="Apply none, one or more filters to refine results">
@@ -106,7 +117,7 @@
 	<param name="input_fasta" type="data" format="fasta" label="454 fasta file with raw reads" />
       </when>
     </conditional>
-    <param name="min_2mer_repeats" type="integer" value="6" label="Minimum number of 2-mer repeat units to detect" help="Set to zero to ignore repeats of this n-mer unit" />
+    <param name="min_2mer_repeats" type="integer" value="6" label="Minimum number of 2-mer repeat units to detect" min="1" help="Must detect at least one repeat of this n-mer unit" />
     <param name="min_3mer_repeats" type="integer" value="0" label="Minimum number of 3-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
     <param name="min_4mer_repeats" type="integer" value="0" label="Minimum number of 4-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
     <param name="min_5mer_repeats" type="integer" value="0" label="Minimum number of 5-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" />
@@ -158,8 +169,9 @@
 	       label="Maximum acceptable difference between melting temperatures of left and right primers (PRIMER_PAIR_MAX_DIFF_TM)"
 	       help="Temperature should be in degrees Celsius" />
       </when>
+      <when value="default" />
     </conditional>
-    <param name="report_bad_primer_ranges" type="boolean" truevalue="True" falsevalue="False" label="Output IDs for input reads which generate bad primer ranges" help="Can be used to screen input Fastqs" />
+    <param name="report_bad_primer_ranges" type="boolean" truevalue="True" falsevalue="False" label="Output IDs for input reads which generate bad primer product size ranges" help="Can be used to screen reads in input Fastqs " />
     <param name="keep_config_file" type="boolean" truevalue="True" falsevalue="False"
 	   label="Output the config file to the history"
 	   help="Can be used to run pal_finder outside of Galaxy" />
@@ -254,18 +266,76 @@
       <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
       <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats_rankmotifs.out.re_match" />
     </test>
-    <!-- Test with Illumina input generating bad primer ranges
-    -->
+    <!-- Test with Illumina input using subset of reads -->
+    <test>
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="use_all_reads" value="no" />
+      <param name="subset" value="0.5" />
+      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
+      <expand macro="output_illumina_microsat_subset_summary" />
+      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_subset.out.re_match" />
+    </test>
+    <!-- Test with Illumina input where pal_finder doesn't find any
+	 microsatellites -->
+    <test expect_failure="true">
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" />
+      <assert_stderr>
+	<has_text text="pal_finder failed to locate any microsatellites" />
+      </assert_stderr>
+    </test>
+    <!-- Test with Illumina input generating bad ranges -->
     <test>
       <param name="platform_type" value="illumina" />
       <param name="filters" value="" />
       <param name="assembly" value="false" />
-      <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" />
-      <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" />
-      <param name="output_bad_primer_read_ids" value="true" />
-      <expand macro="output_illumina_microsat_summary" />
-      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" />
-      <output name="output_bad_primer_read_ids" file="illuminaPE_bad_primer_ids.out" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="input_fastq_r1" value="illuminaPE_r1_bad_ranges.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2_bad_ranges.fq" ftype="fastqsanger" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="min_3mer_repeats" value="8" />
+      <param name="min_4mer_repeats" value="8" />
+      <param name="min_5mer_repeats" value="8" />
+      <param name="min_6mer_repeats" value="8" />
+      <param name="primer_options" value="custom" />
+      <param name="primer_opt_size" value="25" />
+      <param name="primer_min_size" value="21" />
+      <param name="primer_max_size" value="30" />
+      <param name="primer_min_gc" value="40.0" />
+      <param name="primer_max_gc" value="60.0" />
+      <param name="primer_gc_clamp" value="3" />
+      <param name="primer_max_end_gc" value="5" />
+      <param name="primer_min_tm" value="60.0" />
+      <param name="primer_max_tm" value="80.0" />
+      <param name="primer_opt_tm" value="68.0" />
+      <param name="primer_pair_max_diff_tm" value="3.0" />
+      <param name="report_bad_primer_ranges" value="true" />
+      <expand macro="output_illumina_microsat_summary_bad_ranges" />
+      <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_bad_ranges.out.re_match" />
+      <output name="output_bad_primer_read_ids" file="illuminaPE_bad_primer_read_ids.out" />
+    </test>
+    <!-- Test with bad n-mers specified -->
+    <test expect_failure="true">
+      <param name="platform_type" value="illumina" />
+      <param name="filters" value="" />
+      <param name="assembly" value="false" />
+      <param name="min_2mer_repeats" value="8" />
+      <param name="min_3mer_repeats" value="8" />
+      <param name="min_4mer_repeats" value="0" />
+      <param name="min_5mer_repeats" value="8" />
+      <param name="min_6mer_repeats" value="8" />
+      <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" />
+      <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" />
+      <assert_stderr>
+	<has_text text="Minimum number of 4-mers cannot be zero if number of 5-mers is non-zero" />
+      </assert_stderr>
     </test>
     <!-- Test with 454 input -->
     <test>
@@ -300,26 +370,49 @@
 
 -------------
 
+.. class:: infomark
+
+**Known issues**
+
 .. class:: warning
 
-**Known problems**
-
-.. class:: infomark
-
-**Bad primer product size ranges**
+**Low number of reads used for microsatellite detection/bad primer product size ranges**
 
 For some datasets pal_finder may generate 'bad' product size ranges (where the
 lower limit exceeds the upper limit) for one or more reads, for input into
-primer3_core.
+primer3_core. In these cases primer3_core will terminate prematurely, which can
+result in a substantially lower number of reads being used for microsatellite
+detection and potentially sub-optimal primer design.
+
+The number of reads generating bad size ranges is reported in the
+*Summary of microsat types* output dataset as 'readsWithBadRanges'. Ideally
+the reported value should be zero.
+
+The conditions which cause this issue within pal_finder are still unclear;
+however, we believe it to be associated with short or low quality reads. If
+this problem affects your data then:
+
+* Ensure that the input data are sufficiently trimmed and filtered (using
+  e.g. the Trimmomatic tool) before rerunning pal_finder.
 
-If this occurs then the tool will terminate with an error. A list of the reads
-for which the bad ranges were generated can be found in the error message
-which can be accessed via the 'bug' icon from a failed dataset.
+* A list of read IDs for which pal_finder generates bad product size ranges can
+  be output by turning on *Output IDs for input reads which generate bad primer
+  product size ranges*. This outputs an additional dataset with a list of read
+  IDs which can be used to remove read pairs from the input Fastq files (using
+  e.g. the *Filter sequences by ID* tool) before rerunning pal_finder.
+
+.. class:: warning
 
-The conditions which cause this error are unclear. However we believe it to be
-associated with short or low quality reads. It is recommended that the input
-data are sufficiently trimmed and filtered (using e.g. the Trimmomatic tool)
-before rerunning pal_finder.
+**pal_finder takes a long time to run for large input datasets**
+
+pal_finder was originally developed using MiSeq data, and is not optimised for
+working with the larger Fastqs that are output from other platforms such as
+HiSeq and NextSeq. As a consequence pal_finder may take a very long time to
+complete when operating on larger datasets.
+
+If this is a problem then the tool can be run using a subset of the input reads
+by unchecking the *Use all reads...* option and entering either an integer number
+of reads to use, or a decimal fraction (e.g. 0.5 will select 50% of the reads).
 
 -------------