annotate repex_full_clustering.xml @ 4:0f9dfaada8ef draft

Uploaded
author petrn
date Fri, 20 Dec 2019 12:32:17 +0000
parents 2d43ed150abe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
2d43ed150abe Uploaded
petrn
parents: 0
diff changeset
1 <tool id="repeatexplorer2" name="RepeatExplorer2 clustering: " version="2.3.6" >
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
2 <stdio>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
3 <regex match="lastdb: can't open file: NEAR" source="stderr" level="fatal" description="Version of last is too old, use ver 956 or higher\n" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
4 <regex match="Traceback" source="stderr" level="fatal" description="Unknown error" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
5 <regex match="error" source="stderr" level="fatal" description="Unknown error" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
6 <regex match="Warning" source="stderr" level="warning" description="Unknown error" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
7 <exit_code range="1:" level="fatal" description="Error" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
8 </stdio>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
9 <description>Improved version of repeat discovery and characterization using graph-based sequence clustering</description>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
10 <requirements>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
11 <requirement type="package" version="3.7">python</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
12 <requirement type="package" version="0.9.1" >pyrserve</requirement>
1
2d43ed150abe Uploaded
petrn
parents: 0
diff changeset
13 <requirement type="package" version=">956" >last</requirement>
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
14 <requirement type="package">mafft</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
15 <requirement type="package">imagemagick</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
16 <requirement type="package">blast</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
17 <requirement type="package">diamond</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
18 <requirement type="package">blast-legacy</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
19 <requirement type="package">r-igraph</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
20 <requirement type="package">r-data.tree</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
21 <requirement type="package">r-stringr</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
22 <requirement type="package">r-r2html</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
23 <requirement type="package">r-hwriter</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
24 <requirement type="package">r-dt</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
25 <requirement type="package">r-scales</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
26 <requirement type="package">r-plotrix</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
27 <requirement type="package">r-png</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
28 <requirement type="package">r-plyr</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
29 <requirement type="package">r-dplyr</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
30 <requirement type="package">r-optparse</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
31 <requirement type="package">r-dbi</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
32 <requirement type="package">r-rsqlite</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
33 <requirement type="package">r-rserve</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
34 <requirement type="package">bioconductor-biostrings</requirement>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
35 </requirements>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
36 <command >
4
0f9dfaada8ef Uploaded
petrn
parents: 1
diff changeset
37 make -C ${__tool_directory__};
0
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
38 export PYTHONHASHSEED=0;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
39 ${__tool_directory__}/seqclust --sample ${sample} --output_dir=tarean_output --logfile=${log} --cleanup $paired --taxon $taxon
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
40
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
41 #if $advanced_options.advanced:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
42 --mincl $advanced_options.size_threshold $advanced_options.keep_names $advanced_options.automatic_filtering -D $advanced_options.blastx.options_blastx
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
43 --assembly_min $advanced_options.assembly_min_cluster_size
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
44
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
45 #if $advanced_options.comparative.options_comparative:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
46 --prefix_length $advanced_options.comparative.prefix_length
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
47 #end if
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
48
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
49 #if $advanced_options.custom_library.options_custom_library:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
50 -d $advanced_options.custom_library.library extra_database
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
51 #end if
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
52
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
53 #if $advanced_options.options.options:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
54 -opt $advanced_options.options.options
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
55 #end if
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
56 #end if
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
57 ${FastaFile} >stdout.log 2> stderr.log ;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
58 echo "STDOUT CONTENT:" >> ${log} ;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
59 cat stdout.log >> ${log} ;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
60 echo "STDERR CONTENT:" >> ${log};
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
61 cat stderr.log >> ${log} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
62 cd tarean_output &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
63 zip -r ${ReportArchive}.zip * &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
64 mv ${ReportArchive}.zip ${ReportArchive} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
65 cp index.html ${ReportFile} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
66 mkdir ${ReportFile.files_path} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
67 cp -r --parents libdir ${ReportFile.files_path} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
68 cp -r --parents seqclust/clustering/superclusters ${ReportFile.files_path} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
69 cp -r --parents seqclust/clustering/clusters ${ReportFile.files_path} &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
70 cp seqclust/clustering/hitsort.cls ${ReportFile.files_path}/seqclust/clustering/hitsort.cls &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
71 cp *.png ${ReportFile.files_path}/ &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
72 cp *.csv ${ReportFile.files_path}/ &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
73 cp *.html ${ReportFile.files_path}/ &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
74 cp *.css ${ReportFile.files_path}/ &amp;&amp;
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
75 cp *.fasta ${ReportFile.files_path}/ 2>>$log &amp;&amp; rm -r ../tarean_output || :
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
76
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
77 </command>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
78 <inputs>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
79 <param name="FastaFile" label="NGS reads" type="data" format="fasta"
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
80 help="Input file must contain fasta-formatted NGS reads. If paired end reads are used, reads must be interlaced and all pairs must be complete. Example of input data format is provided in the help below. "/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
81 <param name="paired" type="boolean" truevalue="--paired" falsevalue="" checked="True" label="Paired-end reads" help="Check if you are using paired reads and the input sequences contain both read mates, with left mates alternating with their right mates" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
82
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
83 <param name="sample" label="Sample size" type="integer" value="500000" min="10000"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
84 <param name="taxon" label="Select taxon and protein domain database version (REXdb)" type="select" help="Reference database of transposable element protein domains - REXdb - is used for annotation of repeats">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
85 <option value="VIRIDIPLANTAE3.0" selected="true">Viridiplantae version 3.0 </option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
86 <option value="VIRIDIPLANTAE2.2" selected="true">Viridiplantae version 2.2</option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
87 <option value="METAZOA3.0" >Metazoa version 3.0</option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
88 <option value="METAZOA2.0" >Metazoa version 2.0</option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
89 <!-- Modify setting in config.py accordingly -->
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
90 </param>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
91
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
92 <conditional name="advanced_options">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
93 <param name="advanced" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Advanced options" />
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
94 <when value="false">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
95 <!-- pass -->
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
96 </when>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
97 <when value="true">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
98 <conditional name="comparative">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
99 <param name="options_comparative" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Perform comparative analysis" help="Use this option when you want to compare sequences from multiple groups"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
100 <when value="false">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
101 <!-- do nothing here -->
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
102 </when>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
103 <when value="true">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
104 <param name="prefix_length" label="Group code length" type="integer" value="3" min="1" max="10" help="For comparative analysis, sequences from individual groups are distinguished by a sample code which must be used as a prefix of the sequence name. See example below."/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
105 </when>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
106 </conditional>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
107
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
108 <conditional name="blastx">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
109 <param name="options_blastx" type="select" label="Select parameters for protein domain search">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
110 <option value="BLASTX_W2" selected="false">blastx with word size 2 (the most sensitive, slowest)</option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
111 <option value="BLASTX_W3" selected="true">blastx with word size 3 (default)</option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
112 <option value="DIAMOND" selected="false">diamond program (the least sensitive, fastest)</option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
113 </param>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
114 </conditional>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
115
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
116 <conditional name="options">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
117 <param name="options" type="select" label="Similarity search options" help="Different similarity search parameters are used depending on the used input data to adjust search to differences in length and error rate">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
118 <option value="ILLUMINA" selected="true">Illumina reads, read length 100nt or more </option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
119 <option value="ILLUMINA_SHORT" selected="false">Illumina reads, shorter than 100nt (Do not use reads shorter than 50nt!) </option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
120 <option value="ILLUMINA_DUST_OFF" selected="false">Illumina reads, no masking of low complexity repeats </option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
121 <option value="OXFORD_NANOPORE" selected="false">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
122 Pseudo short reads simulated from Oxford Nanopore data (experimental feature)
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
123 </option>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
124 </param>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
125 </conditional>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
126
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
127 <conditional name="custom_library">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
128 <param name="options_custom_library" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Use custom repeat database"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
129 <when value="false">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
130 <!-- do nothing here -->
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
131 </when>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
132 <when value="true">
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
133 <param name="library" format="fasta" type="data" label="Custom library of repeats" help="Library of repeats as DNA sequences in fasta format. The required format for IDs in a custom library is: '>repeatname#class/subclass'"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
134 </when>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
135 </conditional>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
136 <param name="size_threshold" label="Cluster size threshold for detailed analysis" type="float" value="0.01" min="0.0001" max="100" help ="Minimal size (as percentage of input reads) of the smallest cluster which is analyzed; clusters with fewer than 20 reads are not considered at all."/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
137 <param name="automatic_filtering" label="Perform automatic filtering of abundant satellite repeats" help="Automatic filtering tries to identify the most abundant tandem repeats and partially remove such sequences from the analysis. Removal of abundant tandem repeats can enable analysis of a higher proportion of other, less abundant repeats." type="boolean" truevalue="--automatic_filtering" falsevalue="" checked="false"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
138 <param name="keep_names" label="Keep original sequences names" type="boolean" truevalue="--keep_names" falsevalue="" checked="false" help="By default, sequences are relabeled using integers. If you want to keep original names, use this option."/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
139 <param name="assembly_min_cluster_size" type="integer" label="min cluster size for assembly" value="5" min="2" max="100"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
140 </when>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
141 </conditional>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
142
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
143 </inputs>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
144 <outputs>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
145 <data name="log" format="txt" label="RepeatExplorer2 - log file"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
146 <data name="ReportArchive" format="zip" label="RepeatExplorer2 - Archive with HTML report from data ${FastaFile.hid}"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
147 <data name="ReportFile" format="html" label="RepeatExplorer2 - HTML report from data ${FastaFile.hid}"/>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
148 </outputs>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
149
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
150 <help>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
151 **HELP**
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
152
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
153 RepeatExplorer2 clustering is a computational pipeline for unsupervised
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
154 identification of repeats from unassembled sequence reads. The
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
155 pipeline uses low-pass whole genome sequence reads and performs graph-based
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
156 clustering. Resulting clusters, representing all types of repeats, are then
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
157 examined to identify repeats and classify them into repeat groups.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
158
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
159 **Input data**
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
160
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
161 The analysis requires either **single** or **paired-end reads** generated
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
162 by whole genome shotgun sequencing provided as a single fasta-formatted file.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
163 Generally, paired-end reads provide significantly better results than single
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
164 reads. Reads should be of uniform length (optimal size range is 100-200 nt) and
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
165 the number of analyzed reads should represent less than 1x genome equivalent
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
166 (genome coverage of 0.01 - 0.50 x is recommended). Reads should be
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
167 quality-filtered (recommended filtering : quality score >=10 over 95% of bases
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
168 and no Ns allowed) and only **complete read pairs** should be submitted for
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
169 analysis. When paired reads are used, input data must be in **interlaced** format
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
170 as fasta file:
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
171
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
172 example of interlaced input format::
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
173
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
174 >0001_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
175 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
176 >0001_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
177 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
178 >0002_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
179 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
180 >0002_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
181 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
182 >0003_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
183 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
184 >0003_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
185 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
186 ...
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
187
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
188
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
189 **Comparative analysis**
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
190
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
191 For comparative analysis sequence names must contain code (prefix) for each group.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
192 Prefix in sequences names must be of fixed length.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
193
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
194 Example of labeling two groups where **group code length** is 2 and is used to distinguish groups - AA and BB ::
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
195
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
196 >AA0001_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
197 CGTAATATACATACTTGCTAGCTAGTTGGATGCATCCAACTTGCAAGCTAGTTTGATG
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
198 >AA0001_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
199 GATTTGACGGACACACTAACTAGCTAGTTGCATCTAAGCGGGCACACTAACTAACTAT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
200 >AA0002_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
201 ACTCATTTGGACTTAACTTTGATAATAAAAACTTAAAAAGGTTTCTGCACATGAATCG
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
202 >AA0002_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
203 TATGTTGAAAAATTGAATTTCGGGACGAAACAGCGTCTATCGTCACGACATAGTGCTC
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
204 >BB0001_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
205 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
206 >BB0001_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
207 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
208 >BB0002_f
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
209 TGACATTTGTGAACGTTAATGTTCAACAAATCTTTCCAATGTCTTTTTATCTTATCAT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
210 >BB0002_r
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
211 TATTGAAATACTGGACACAAATTGGAAATGAAACCTTGTGAGTTATTCAATTTATGTT
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
212
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
213
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
214 To prepare quality filtered and interlaced input fasta file from fastq
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
215 files, use `Preprocessing of paired-reads`__ tool.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
216
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
217 .. __: tool_runner?tool_id=paired_fastq_filtering
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
218
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
219
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
220 **Additional parameters**
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
221
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
222 **Sample size** defines how many reads should be used in calculation.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
223 Default setting with 500,000 reads will enable detection of high copy
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
224 repeats within several hours of computation time. For higher
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
225 sensitivity the sample size can be set higher. Since sample size affects
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
226 the memory usage, this parameter may be automatically adjusted to a lower
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
227 value during the run. Maximum sample size which can be processed depends on
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
228 the repetitiveness of analyzed genome.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
229
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
230
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
231 **Select taxon and protein domain database version (REXdb)**. Classification
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
232 of transposable elements is based on the similarity to our reference database
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
233 of transposable element protein domains (**REXdb**). Standalone database for Viridiplantae species
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
234 can be obtained on `repeatexplorer.org`__. Classification
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
235 system used in REXdb is described in article `Systematic survey of plant
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
236 LTR-retrotransposons elucidates phylogenetic relationships of their
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
237 polyprotein domains and provides a reference for element classification`__.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
238 Database for Metazoa species is still under development so use it with caution.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
239
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
240 .. __: http://repeatexplorer.org
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
241 .. __: https://doi.org/10.1186/s13100-018-0144-1
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
242
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
243 **Select parameters for protein domain search** REXdb is compared with
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
244 sequence clusters either using blastx or diamond aligner. Diamond program
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
245 is about three times faster than blastx with word size 3.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
246
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
247 **Similarity search options** By default sequence reads are compared using
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
248 mgblast program. Default threshold is explicitly set to 90% sequence
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
249 similarity spanning at least 55% of the read length (in the case of reads
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
250 differing in length it applies to the longer one). Additionally, sequence
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
251 overlap must be at least 55 nt. If you select option for shorter reads
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
252 than 100 nt, minimum overlap 55 nt is not required.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
253
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
254 By default,
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
255 mgblast search uses the DUST program to filter out
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
256 low-complexity sequences. If you want
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
257 to increase sensitivity of detection of satellites with shorter monomer
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
258 use option with '*no masking of low complexity repeats*'. Note that omitting
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
259 DUST filtering will significantly increase running times.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
260
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
261
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
262 **Automatic filtering of abundant satellite repeats** performs clustering on a
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
263 smaller dataset of sequence reads to detect abundant high confidence
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
264 satellite repeats. If such satellites are detected, sequence reads derived
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
265 from these satellites are depleted from input dataset. This step enables more
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
266 sensitive detection of less abundant repeats as more reads can be used
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
267 in clustering step.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
268
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
269 **Use custom repeat database**. This option allows users to perform similarity
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
270 comparison of identified repeats to their custom databases. The repeat class must
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
271 be encoded in FASTA headers of database entries in order to allow correct
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
272 parsing of similarity hits. Required format for custom database sequence name is: ::
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
273
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
274 >repeatname#class/subclass
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
275
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
276
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
277 **Output**
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
278
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
279 List of clusters identified as putative satellite repeats, their genomic
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
280 abundance and various cluster characteristics.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
281
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
282 Output includes an **HTML summary** with table listing of all analyzed
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
283 clusters. More detailed information about clusters is provided in
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
284 additional files and directories. All results are also provided as
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
285 downloadable **zip archive**. Additionally a **log file** reporting
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
286 the progress of the computational pipeline is provided.
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
287
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
288 </help>
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
289
f6ebec6e235e Uploaded
petrn
parents:
diff changeset
290 </tool>