comparison create_reference_dataset.xml @ 0:63f23d5db27c draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 2c2fd38cb761ec57bac7a0bd376e6aa2b88265d0-dirty
author jjohnson
date Mon, 20 May 2019 15:25:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:63f23d5db27c
1 <tool id="create_defuse_reference" name="Create DeFuse Reference" version="@DEFUSE_VERSION@.1">
2 <description>create a defuse reference from Ensembl and UCSC sources</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <expand macro="defuse_requirement" />
8 </requirements>
9 <command detect_errors="aggressive"><![CDATA[
10 mkdir -p '$config_txt.dataset.extra_files_path' &&
11 ## config_sub.sh reads the templated config and writes the output config with executable paths filled in (paths are quoted so spaces or shell metacharacters cannot split arguments)
12 '$__tool_directory__/config_sub.sh' '$defuse_config' '$config_txt' &&
13 ## Build the reference dataset described by the finished config file
14 defuse_create_ref.pl -c '$config_txt'
15 ]]></command>
16 <configfiles>
17 <configfile name="defuse_config">
18 #
19 # Configuration file for defuse
20 #
21 # Variables that designate the PATH to an application, e.g. __SAMTOOLS_BIN__
22 # will be set by the runtime script using the ENV PATH
23 #
24
25 # Directory where the defuse code was unpacked
26 source_directory = __DEFUSE_PATH__
27
28 # Organism IDs
29 ensembl_organism = $genome.ensembl_organism
30 ensembl_prefix = $genome.ensembl_prefix
31 ensembl_version = $genome.ensembl_version
32 ensembl_genome_version = $genome.ensembl_genome_version
33 ucsc_genome_version = $genome.ucsc_genome_version
34 ncbi_organism = $genome.ncbi_organism
35 ncbi_prefix = $genome.ncbi_prefix
36
37 # Directory where you want your dataset
38 dataset_directory = $config_txt.dataset.extra_files_path
39
40 #raw
41 # Input genome and gene models
42 gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf
43 genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
44
45 # Repeat table from ucsc genome browser
46 repeats_filename = $(dataset_directory)/repeats.txt
47
48 # EST info downloaded from ucsc genome browser
49 est_fasta = $(dataset_directory)/est.fa
50 est_alignments = $(dataset_directory)/intronEst.txt
51
52 # Unigene clusters downloaded from ncbi
53 unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq
54 #end raw
55
56 # Paths to external tools
57 samtools_bin = __SAMTOOLS_BIN__
58 bowtie_bin = __BOWTIE_BIN__
59 bowtie_build_bin = __BOWTIE_BUILD_BIN__
60 blat_bin = __BLAT_BIN__
61 fatotwobit_bin = __FATOTWOBIT_BIN__
62 gmap_bin = __GMAP_BIN__
63 gmap_setup_bin = __GMAP_SETUP_BIN__
64 r_bin = __R_BIN__
65 rscript_bin = __RSCRIPT_BIN__
66
67 #raw
68 # Directory for the gmap index
69 gmap_index_directory = $(dataset_directory)/gmap
70 #end raw
71
72 #raw
73 # Dataset files
74 dataset_prefix = $(dataset_directory)/defuse
75 chromosome_prefix = $(dataset_prefix).dna.chromosomes
76 exons_fasta = $(dataset_prefix).exons.fa
77 cds_fasta = $(dataset_prefix).cds.fa
78 cdna_regions = $(dataset_prefix).cdna.regions
79 cdna_fasta = $(dataset_prefix).cdna.fa
80 reference_fasta = $(dataset_prefix).reference.fa
81 rrna_fasta = $(dataset_prefix).rrna.fa
82 ig_gene_list = $(dataset_prefix).ig.gene.list
83 repeats_regions = $(dataset_directory)/repeats.regions
84 est_split_fasta1 = $(dataset_directory)/est.1.fa
85 est_split_fasta2 = $(dataset_directory)/est.2.fa
86 est_split_fasta3 = $(dataset_directory)/est.3.fa
87 est_split_fasta4 = $(dataset_directory)/est.4.fa
88 est_split_fasta5 = $(dataset_directory)/est.5.fa
89 est_split_fasta6 = $(dataset_directory)/est.6.fa
90 est_split_fasta7 = $(dataset_directory)/est.7.fa
91 est_split_fasta8 = $(dataset_directory)/est.8.fa
92 est_split_fasta9 = $(dataset_directory)/est.9.fa
93
94 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
95 prefilter1 = $(unigene_fasta)
96
97 # deFuse scripts and tools
98 scripts_directory = $(source_directory)/scripts
99 tools_directory = $(source_directory)/tools
100 data_directory = $(source_directory)/data
101 #end raw
102
103 # Parameters for building the dataset
104 chromosomes = $genome.chromosomes
105 mt_chromosome = $genome.mt_chromosome
106 gene_sources = $genome.gene_sources
107 ig_gene_sources = $genome.ig_gene_sources
108 rrna_gene_sources = $genome.rrna_gene_sources
109 gene_biotypes = $genome.gene_sources
110 ig_gene_biotypes = $genome.ig_gene_sources
111 rrna_gene_biotypes = $genome.rrna_gene_sources
112
113 #raw
114 # Remove temp files
115 remove_job_files = yes
116 remove_job_temp_files = yes
117 #end raw
118 </configfile>
119 </configfiles>
120 <inputs>
121 <conditional name="genome">
122 <param name="choice" type="select" label="Select a Genome Build">
123 <option value="GRCh38">Homo_sapiens GRCh38 hg38</option>
124 <option value="GRCh37">Homo_sapiens GRCh37 hg19</option>
125 <option value="NCBI36">Homo_sapiens NCBI36 hg18</option>
126 <option value="GRCm38">Mus_musculus GRCm38 mm10</option>
127 <option value="NCBIM37">Mus_musculus NCBIM37 mm9</option>
128 <option value="Rnor_5.0">Rattus_norvegicus Rnor_5.0 rn5</option>
129 <option value="user_specified">User specified</option>
130 </param>
131 <when value="GRCh38">
132 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
133 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
134 <param name="ensembl_genome_version" type="hidden" value="GRCh38"/>
135 <param name="ensembl_version" type="hidden" value="80"/>
136 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
137 <param name="ncbi_prefix" type="hidden" value="Hs"/>
138 <param name="ucsc_genome_version" type="hidden" value="hg38"/>
139 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
140 <param name="mt_chromosome" type="hidden" value="MT"/>
141 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
142 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
143 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
144 </when>
145 <when value="GRCh37">
146 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
147 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
148 <param name="ensembl_genome_version" type="hidden" value="GRCh37"/>
149 <param name="ensembl_version" type="hidden" value="71"/>
150 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
151 <param name="ncbi_prefix" type="hidden" value="Hs"/>
152 <param name="ucsc_genome_version" type="hidden" value="hg19"/>
153 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
154 <param name="mt_chromosome" type="hidden" value="MT"/>
155 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
156 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
157 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
158 </when>
159 <when value="NCBI36">
160 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
161 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
162 <param name="ensembl_genome_version" type="hidden" value="NCBI36"/>
163 <param name="ensembl_version" type="hidden" value="54"/>
164 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
165 <param name="ncbi_prefix" type="hidden" value="Hs"/>
166 <param name="ucsc_genome_version" type="hidden" value="hg18"/>
167 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
168 <param name="mt_chromosome" type="hidden" value="MT"/>
169 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
170 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
171 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
172 </when>
173 <when value="GRCm38">
174 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
175 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
176 <param name="ensembl_genome_version" type="hidden" value="GRCm38"/>
177 <param name="ensembl_version" type="hidden" value="71"/>
178 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
179 <param name="ncbi_prefix" type="hidden" value="Mm"/>
180 <param name="ucsc_genome_version" type="hidden" value="mm10"/>
181 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
182 <param name="mt_chromosome" type="hidden" value="MT"/>
183 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
184 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
185 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
186 </when>
187 <when value="NCBIM37">
188 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
189 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
190 <param name="ensembl_genome_version" type="hidden" value="NCBIM37"/>
191 <param name="ensembl_version" type="hidden" value="67"/>
192 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
193 <param name="ncbi_prefix" type="hidden" value="Mm"/>
194 <param name="ucsc_genome_version" type="hidden" value="mm9"/>
195 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
196 <param name="mt_chromosome" type="hidden" value="MT"/>
197 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
198 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
199 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
200 </when>
201 <when value="Rnor_5.0">
202 <param name="ensembl_organism" type="hidden" value="rattus_norvegicus"/>
203 <param name="ensembl_prefix" type="hidden" value="Rattus_norvegicus"/>
204 <param name="ensembl_genome_version" type="hidden" value="Rnor_5.0"/>
205 <param name="ensembl_version" type="hidden" value="71"/>
206 <param name="ncbi_organism" type="hidden" value="Rattus_norvegicus"/>
207 <param name="ncbi_prefix" type="hidden" value="Rn"/>
208 <param name="ucsc_genome_version" type="hidden" value="rn5"/>
209 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT"/>
210 <param name="mt_chromosome" type="hidden" value="MT"/>
211 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
212 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
213 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
214 </when>
215 <when value="user_specified">
216 <param name="ensembl_organism" type="text" value="" label="Ensembl Organism Name">
217 <help>
218 Examples: homo_sapiens, mus_musculus, rattus_norvegicus
219 ftp://ftp.ensembl.org/pub/release-$ensembl_version/fasta/$ensembl_organism/dna/$ensembl_prefix.$ensembl_genome_version.$ensembl_version.dna.chromosome.$chromosome.fa.gz
220 </help>
221 </param>
222 <param name="ensembl_prefix" type="text" value="" label="Ensembl Organism prefix" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
223 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Examples: GRCh37, GRCm38, Rnor_5.0"/>
224 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
225 <param name="ncbi_organism" type="text" value="" label="NCBI Organism Name" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
226 <param name="ncbi_prefix" type="text" value="" label="NCBI Organism Unigene prefix" help="Examples: Hs, Mm, Rn"/>
227 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Examples: hg19, mm10, rn5"/>
228 <param name="chromosomes" type="text" value="" label="Chromosomes for Ensembl genome build" >
229 <help> Examples:
230 Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
231 Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT
232 Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT
233 ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ )
234 </help>
235 </param>
236 <param name="mt_chromosome" type="text" value="MT" label="Ensembl Mitochondrial Chromosome name" />
237 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
238 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
239 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
240 </when>
241 </conditional>
242 </inputs>
243
244 <outputs> <!-- config_txt: the deFuse config dataset; the command section creates its extra_files_path directory and the config's dataset_directory points at that directory -->
245 <data format="defuse.conf" name="config_txt" label="${tool.name} on ${genome.ensembl_genome_version} : config.txt"/>
246 </outputs>
247 <tests>
248 </tests>
249 <help>
250 **DeFuse**
251
252 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details.
253
254 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
255 - genome_fasta from Ensembl
256 - gene_models from Ensembl
257 - repeats_filename from UCSC RepeatMasker rmsk.txt
258 - est_fasta from UCSC
259 - est_alignments from UCSC intronEst.txt
260 - unigene_fasta from NCBI
261
262 The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours.
263
264
265 It will generate a config.txt file that can be input into the deFuse Galaxy tool.
266
267 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
268
269 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
270
271 .. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
272
273 ------
274
275 **Outputs**
276
277 The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths.
278
279 </help>
280 <expand macro="citations"/>
281 </tool>