Mercurial > repos > dfornika > snippy_clean_full_aln

--- a/snippy_clean_full_aln.xml	Fri Jan 24 22:36:09 2020 +0000
+++ b/snippy_clean_full_aln.xml	Thu Jan 30 00:02:46 2020 +0000
@@ -10,21 +10,69 @@
     <command detect_errors="exit_code"><![CDATA[
         snippy-clean_full_aln
             '${full_aln}'
+            --to '${to_char}'
             > '${clean_full_aln}'
     ]]></command>

     <inputs>
         <param name="full_aln" type="data" format="fasta" label="Snippy core.full.aln file" help="" />
+        <!-- Character written in place of every character outside [AGTCN-]. Galaxy takes a text parameter's default from the `value` attribute. -->
+        <param name="to_char" type="text" value="N" label="Replacement character" help="Replace non-[AGTCN-] chars with this character" >
+            <sanitizer>
+                <!-- The single quote is dropped from the valid set and rewritten by the mapping below, so the value stays inside the single-quoted shell argument. -->
+                <valid initial="string.printable"><remove value="&apos;" /></valid>
+                <mapping initial="none">
+                    <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;" />
+                </mapping>
+            </sanitizer>
+        </param>
     </inputs>
-
     <outputs>
         <data name="clean_full_aln" format="fasta" label="${tool.name} on ${on_string} cleaned core alignment" />
     </outputs>
-
     <tests>
+        <test> <!-- to_char is left unset here, so this run uses the parameter's default value -->
+            <param name="full_aln" value="core.full.nonclean.aln" />
+            <output name="clean_full_aln">
+                <assert_contents>
+                    <has_line line="GCNNGC" /> <!-- the cleaned alignment must contain this exact line -->
+                </assert_contents>
+            </output>
+        </test>
+        <test> <!-- same input file, with the replacement character explicitly set to X -->
+            <param name="full_aln" value="core.full.nonclean.aln" />
+            <param name="to_char" value="X" />
+            <output name="clean_full_aln">
+                <assert_contents>
+                    <has_line line="GCXXGC" /> <!-- the cleaned alignment must contain this exact line -->
+                </assert_contents>
+            </output>
+        </test>
     </tests>
+    <help><![CDATA[
+    The core.full.aln file is a FASTA formatted multiple sequence alignment file.
+    It has one sequence for the reference, and one for each sample participating
+    in the core genome calculation. Each sequence has the same length as the reference
+    sequence.

-    <help><![CDATA[
+    | Character: Meaning
+    | ``ATGC``: Same as the reference
+    | ``atgc``: Different from the reference
+    | ``-``: Zero coverage in this sample or a deletion relative to the reference
+    | ``N``: Low coverage in this sample (based on --mincov)
+    | ``X``: Masked region of reference (from --mask)
+    | ``n``: Heterozygous or poor quality genotype (has GT=0/1 or QUAL < --minqual in snps.raw.vcf)
+
+    This tool removes all the "weird" characters and replaces them with N
+    (or with the replacement character you choose). This is useful when you need
+    to pass the alignment to a tree-building or recombination-removal tool:
+
+    ::
+
+        % snippy-clean_full_aln core.full.aln > clean.full.aln
+        % run_gubbins.py -p gubbins clean.full.aln
+        % snp-sites -c gubbins.filtered_polymorphic_sites.fasta > clean.core.aln
+        % FastTree -gtr -nt clean.core.aln > clean.core.tree
     ]]></help>
     <expand macro="citations" />
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/core.full.nonclean.aln	Thu Jan 30 00:02:46 2020 +0000
@@ -0,0 +1,4 @@
+>Reference
+CGATGC
+>S1
+GCRYGC