Mercurial > repos > pjbriggs > pal_finder
diff pal_finder_wrapper.xml @ 15:a3af1ff4cad1 draft
pal_finder 0.02.04.7 for testing.
author | pjbriggs |
---|---|
date | Mon, 14 May 2018 11:10:19 -0400 |
parents | 3f8bf1a0403b |
children | f7d63032217b |
line wrap: on
line diff
--- a/pal_finder_wrapper.xml Thu Mar 22 07:21:26 2018 -0400 +++ b/pal_finder_wrapper.xml Mon May 14 11:10:19 2018 -0400 @@ -9,7 +9,7 @@ <requirement type="package" version="1.65">biopython</requirement> <requirement type="package" version="2.8.1">pandaseq</requirement> </requirements> - <command><![CDATA[ + <command detect_errors="exit_code"><![CDATA[ @CONDA_PAL_FINDER_SCRIPT_DIR@ && @CONDA_PAL_FINDER_DATA_DIR@ && bash $__tool_directory__/pal_finder_wrapper.sh @@ -64,6 +64,10 @@ #if str( $platform.assembly ) == '-assembly' $platform.assembly "$output_assembly" #end if + #set $use_all_reads = $platform.subset_conditional.use_all_reads + #if str( $use_all_reads ) != "yes" + --subset "$platform.subset_conditional.subset" + #end if #end if ]]></command> <inputs> @@ -91,6 +95,13 @@ label="Select FASTQ dataset collection with R1/R2 pair" /> </when> </conditional> + <conditional name="subset_conditional"> + <param name="use_all_reads" type="boolean" label="Use all reads for microsatellite detection?" checked="True" truevalue="yes" falsevalue="no" /> + <when value="no"> + <param name="subset" type="text" value="0.5" label="Number or fraction of reads to use" help="Either an integer number of reads or a decimal fraction (e.g. 
0.5 to select 50% of reads)" /> + </when> + <when value="yes" /> + </conditional> <param name="filters" type="select" display="checkboxes" multiple="True" label="Filters to apply to the pal_finder results" help="Apply none, one or more filters to refine results"> @@ -106,7 +117,7 @@ <param name="input_fasta" type="data" format="fasta" label="454 fasta file with raw reads" /> </when> </conditional> - <param name="min_2mer_repeats" type="integer" value="6" label="Minimum number of 2-mer repeat units to detect" help="Set to zero to ignore repeats of this n-mer unit" /> + <param name="min_2mer_repeats" type="integer" value="6" label="Minimum number of 2-mer repeat units to detect" min="1" help="Must detect at least one repeat of this n-mer unit" /> <param name="min_3mer_repeats" type="integer" value="0" label="Minimum number of 3-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" /> <param name="min_4mer_repeats" type="integer" value="0" label="Minimum number of 4-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" /> <param name="min_5mer_repeats" type="integer" value="0" label="Minimum number of 5-mer repeat units" help="Set to zero to ignore repeats of this n-mer unit" /> @@ -158,8 +169,9 @@ label="Maximum acceptable difference between melting temperatures of left and right primers (PRIMER_PAIR_MAX_DIFF_TM)" help="Temperature should be in degrees Celsius" /> </when> + <when value="default" /> </conditional> - <param name="report_bad_primer_ranges" type="boolean" truevalue="True" falsevalue="False" label="Output IDs for input reads which generate bad primer ranges" help="Can be used to screen input Fastqs" /> + <param name="report_bad_primer_ranges" type="boolean" truevalue="True" falsevalue="False" label="Output IDs for input reads which generate bad primer product size ranges" help="Can be used to screen reads in input Fastqs " /> <param name="keep_config_file" type="boolean" truevalue="True" falsevalue="False" 
label="Output the config file to the history" help="Can be used to run pal_finder outside of Galaxy" /> @@ -254,18 +266,76 @@ <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" /> <output name="output_filtered_microsats" compare="re_match" file="illuminaPE_filtered_microsats_rankmotifs.out.re_match" /> </test> - <!-- Test with Illumina input generating bad primer ranges - --> + <!-- Test with Illumina input using subset of reads --> + <test> + <param name="platform_type" value="illumina" /> + <param name="filters" value="" /> + <param name="assembly" value="false" /> + <param name="use_all_reads" value="no" /> + <param name="subset" value="0.5" /> + <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" /> + <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" /> + <expand macro="output_illumina_microsat_subset_summary" /> + <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_subset.out.re_match" /> + </test> + <!-- Test with Illumina input filter that doesn't find any + microsatellites --> + <test expect_failure="true"> + <param name="platform_type" value="illumina" /> + <param name="filters" value="" /> + <param name="assembly" value="false" /> + <param name="min_2mer_repeats" value="8" /> + <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" /> + <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" /> + <assert_stderr> + <has_text text="pal_finder failed to locate any microsatellites" /> + </assert_stderr> + </test> + <!-- Test with Illumina input generating bad ranges --> <test> <param name="platform_type" value="illumina" /> <param name="filters" value="" /> <param name="assembly" value="false" /> - <param name="input_fastq_r1" value="illuminaPE_r1.fq" ftype="fastqsanger" /> - <param name="input_fastq_r2" value="illuminaPE_r2.fq" ftype="fastqsanger" /> - <param 
name="output_bad_primer_read_ids" value="true" /> - <expand macro="output_illumina_microsat_summary" /> - <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats.out.re_match" /> - <output name="output_bad_primer_read_ids" file="illuminaPE_bad_primer_ids.out" /> + <param name="min_2mer_repeats" value="8" /> + <param name="input_fastq_r1" value="illuminaPE_r1_bad_ranges.fq" ftype="fastqsanger" /> + <param name="input_fastq_r2" value="illuminaPE_r2_bad_ranges.fq" ftype="fastqsanger" /> + <param name="min_2mer_repeats" value="8" /> + <param name="min_3mer_repeats" value="8" /> + <param name="min_4mer_repeats" value="8" /> + <param name="min_5mer_repeats" value="8" /> + <param name="min_6mer_repeats" value="8" /> + <param name="primer_options" value="custom" /> + <param name="primer_opt_size" value="25" /> + <param name="primer_min_size" value="21" /> + <param name="primer_max_size" value="30" /> + <param name="primer_min_gc" value="40.0" /> + <param name="primer_max_gc" value="60.0" /> + <param name="primer_gc_clamp" value="3" /> + <param name="primer_max_end_gc" value="5" /> + <param name="primer_min_tm" value="60.0" /> + <param name="primer_max_tm" value="80.0" /> + <param name="primer_opt_tm" value="68.0" /> + <param name="primer_pair_max_diff_tm" value="3.0" /> + <param name="report_bad_primer_ranges" value="true" /> + <expand macro="output_illumina_microsat_summary_bad_ranges" /> + <output name="output_pal_summary" compare="re_match" file="illuminaPE_microsats_bad_ranges.out.re_match" /> + <output name="output_bad_primer_read_ids" file="illuminaPE_bad_primer_read_ids.out" /> + </test> + <!-- Test with bad n-mers specified --> + <test expect_failure="true"> + <param name="platform_type" value="illumina" /> + <param name="filters" value="" /> + <param name="assembly" value="false" /> + <param name="min_2mer_repeats" value="8" /> + <param name="min_3mer_repeats" value="8" /> + <param name="min_4mer_repeats" value="0" /> + <param 
name="min_5mer_repeats" value="8" /> + <param name="min_6mer_repeats" value="8" /> + <param name="input_fastq_r1" value="illuminaPE_r1_no_microsats.fq" ftype="fastqsanger" /> + <param name="input_fastq_r2" value="illuminaPE_r2_no_microsats.fq" ftype="fastqsanger" /> + <assert_stderr> + <has_text text="Minimum number of 4-mers cannot be zero if number of 5-mers is non-zero" /> + </assert_stderr> </test> <!-- Test with 454 input --> <test> @@ -300,26 +370,49 @@ ------------- +.. class:: infomark + +**Known issues** + .. class:: warning -**Known problems** - -.. class:: infomark - -**Bad primer product size ranges** +**Low number of reads used for microsatellite detection/bad primer product size ranges** For some datasets pal_finder may generate 'bad' product size ranges (where the lower limit exceeds the upper limit) for one or more reads, for input into -primer3_core. +primer3_core. In these cases primer3_core will terminate prematurely, which can +result in a substantially lower number of reads being used for microsatellite +detection and potentially sub-optimal primer design. + +The number of reads generating the bad size ranges is reported in the +*Summary of microsat types* output dataset as 'readsWithBadRanges'. Ideally +the reported value should be zero. + +The conditions which cause this issue within pal_finder are still unclear, +however we believe it to be associated with short or low quality reads. If this +problem affects your data then: + +* Ensure that the input data are sufficiently trimmed and filtered (using + e.g. the Trimmomatic tool) before rerunning pal_finder. -If this occurs then the tool will terminate with an error. A list of the reads -for which the bad ranges were generated can be found in the error message -which can be accessed via the 'bug' icon from a failed dataset. +* A list of read IDs for which pal_finder generates bad product size ranges can + be output by turning on *Output IDs for input reads which generate bad primer + product size ranges*. 
This outputs an additional dataset with a list of read IDs which can + be used to remove read pairs from the input Fastq files (using e.g. the *Filter + sequences by ID* tool) before rerunning pal_finder. + +.. class:: warning -The conditions which cause this error are unclear. However we believe it to be -associated with short or low quality reads. It is recommended that the input -data are sufficiently trimmed and filtered (using e.g. the Trimmomatic tool) -before rerunning pal_finder. +**Pal_finder takes a long time to run for large input datasets** + +pal_finder was originally developed using MiSeq data, and is not optimised for +working with the larger Fastqs that are output from other platforms such as +HiSeq and NextSeq. As a consequence pal_finder may take a very long time to +complete when operating on larger datasets. + +If this is a problem then the tool can be run using a subset of the input reads +by unchecking the *Use all reads...* option and entering either an integer number +of reads to use, or a decimal fraction (e.g. 0.5 will select 50% of the reads). -------------