comparison datamanager_create_reference.xml @ 0:63f23d5db27c draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 2c2fd38cb761ec57bac7a0bd376e6aa2b88265d0-dirty
author jjohnson
date Mon, 20 May 2019 15:25:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:63f23d5db27c
1 <tool id="data_manager_defuse_reference" name="DeFuse Reference DataManager" version="@DEFUSE_VERSION@.1" tool_type="manage_data">
2 <description>create a defuse reference from Ensembl and UCSC sources</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <expand macro="defuse_requirement" />
8 </requirements>
9 <command detect_errors="exit_code"><![CDATA[
10 python '$__tool_directory__/datamanager_create_reference.py'
11 --dbkey $genome.ensembl_genome_version
12 --description "$genome.ensembl_prefix $genome.ensembl_genome_version ($genome.ucsc_genome_version)"
13 --defuse_config $defuse_config
14 --defuse_script $defuse_script
15 $out_file
16 ]]></command>
17 <configfiles>
18 <configfile name="defuse_config">
19 #
20 # Configuration file for defuse
21 #
22 # Variables that designate the PATH to an application, e.g. __SAMTOOLS_BIN__
23 # will be set by the runtime script using the ENV PATH
24 #
25
26 # Directory where the defuse code was unpacked
27 source_directory = __DEFUSE_PATH__
28
29 # Organism IDs
30 ensembl_organism = $genome.ensembl_organism
31 ensembl_prefix = $genome.ensembl_prefix
32 ensembl_version = $genome.ensembl_version
33 ensembl_genome_version = $genome.ensembl_genome_version
34 ucsc_genome_version = $genome.ucsc_genome_version
35 ncbi_organism = $genome.ncbi_organism
36 ncbi_prefix = $genome.ncbi_prefix
37
38 # Directory where you want your dataset
39 dataset_directory = __DATASET_DIRECTORY__
40
41 #raw
42 # Input genome and gene models
43 gene_models = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).gtf
44 genome_fasta = $(dataset_directory)/$(ensembl_prefix).$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
45
46 # Repeat table from ucsc genome browser
47 repeats_filename = $(dataset_directory)/repeats.txt
48
49 # EST info downloaded from ucsc genome browser
50 est_fasta = $(dataset_directory)/est.fa
51 est_alignments = $(dataset_directory)/intronEst.txt
52
53 # Unigene clusters downloaded from ncbi
54 unigene_fasta = $(dataset_directory)/$(ncbi_prefix).seq.uniq
55 #end raw
56
57 # Paths to external tools
58 samtools_bin = __SAMTOOLS_BIN__
59 bowtie_bin = __BOWTIE_BIN__
60 bowtie_build_bin = __BOWTIE_BUILD_BIN__
61 blat_bin = __BLAT_BIN__
62 fatotwobit_bin = __FATOTWOBIT_BIN__
63 gmap_bin = __GMAP_BIN__
64 gmap_setup_bin = __GMAP_SETUP_BIN__
65 gmap_build_bin = __GMAP_BUILD_BIN__
66 r_bin = __R_BIN__
67 rscript_bin = __RSCRIPT_BIN__
68
69 #raw
70 # Directory where the gmap index is stored
71 gmap_index_directory = $(dataset_directory)/gmap
72 #end raw
73
74 #raw
75 # Dataset files
76 dataset_prefix = $(dataset_directory)/defuse
77 chromosome_prefix = $(dataset_prefix).dna.chromosomes
78 exons_fasta = $(dataset_prefix).exons.fa
79 cds_fasta = $(dataset_prefix).cds.fa
80 cdna_regions = $(dataset_prefix).cdna.regions
81 cdna_fasta = $(dataset_prefix).cdna.fa
82 reference_fasta = $(dataset_prefix).reference.fa
83 rrna_fasta = $(dataset_prefix).rrna.fa
84 ig_gene_list = $(dataset_prefix).ig.gene.list
85 repeats_regions = $(dataset_directory)/repeats.regions
86 est_split_fasta1 = $(dataset_directory)/est.1.fa
87 est_split_fasta2 = $(dataset_directory)/est.2.fa
88 est_split_fasta3 = $(dataset_directory)/est.3.fa
89 est_split_fasta4 = $(dataset_directory)/est.4.fa
90 est_split_fasta5 = $(dataset_directory)/est.5.fa
91 est_split_fasta6 = $(dataset_directory)/est.6.fa
92 est_split_fasta7 = $(dataset_directory)/est.7.fa
93 est_split_fasta8 = $(dataset_directory)/est.8.fa
94 est_split_fasta9 = $(dataset_directory)/est.9.fa
95
96 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
97 prefilter1 = $(unigene_fasta)
98
99 # deFuse scripts and tools
100 scripts_directory = $(source_directory)/scripts
101 tools_directory = $(source_directory)/tools
102 data_directory = $(source_directory)/data
103 #end raw
104
105 # Parameters for building the dataset
106 chromosomes = $genome.chromosomes
107 mt_chromosome = $genome.mt_chromosome
108 gene_sources = $genome.gene_sources
109 ig_gene_sources = $genome.ig_gene_sources
110 rrna_gene_sources = $genome.rrna_gene_sources
111 gene_biotypes = $genome.gene_sources
112 ig_gene_biotypes = $genome.ig_gene_sources
113 rrna_gene_biotypes = $genome.rrna_gene_sources
114
115 #raw
116 # Remove temp files
117 remove_job_files = yes
118 remove_job_temp_files = yes
119 #end raw
120 </configfile>
121 <configfile name="defuse_script">#slurp
122 #!/bin/bash
123 ## define some things for cheetah processing
124 #set $amp = chr(38)
125 #set $gt = chr(62)
126 ## substitute pathnames into config file
127 if `grep __DATASET_DIRECTORY__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DATASET_DIRECTORY__#\$1#" $defuse_config; fi
128 if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
129 if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
130 if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
131 if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
132 if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi
133 if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi
134 if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi
135 if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi
136 if `grep __GMAP_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BUILD_BIN=`which gmap_build`;then sed -i'.tmp' "s#__GMAP_BUILD_BIN__#\${GMAP_BUILD_BIN}#" $defuse_config; fi
137 if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
138 ## copy config to output
139 cp $defuse_config \$1/defuse_config.txt
140 ## Run the create_reference_dataset.pl
141 perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config
142 </configfile>
143 </configfiles>
144 <inputs>
145 <conditional name="genome">
146 <param name="choice" type="select" label="Select a Genome Build">
147 <option value="GRCh38">Homo_sapiens GRCh38 hg38</option>
148 <option value="GRCh37">Homo_sapiens GRCh37 hg19</option>
149 <option value="NCBI36">Homo_sapiens NCBI36 hg18</option>
150 <option value="GRCm38">Mus_musculus GRCm38 mm10</option>
151 <option value="NCBIM37">Mus_musculus NCBIM37 mm9</option>
152 <option value="Rnor_5.0">Rattus_norvegicus Rnor_5.0 rn5</option>
153 <option value="user_specified">User specified</option>
154 </param>
155 <when value="GRCh38">
156 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
157 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
158 <param name="ensembl_genome_version" type="hidden" value="GRCh38"/>
159 <param name="ensembl_version" type="hidden" value="80"/>
160 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
161 <param name="ncbi_prefix" type="hidden" value="Hs"/>
162 <param name="ucsc_genome_version" type="hidden" value="hg38"/>
163 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
164 <param name="mt_chromosome" type="hidden" value="MT"/>
165 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
166 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
167 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
168 </when>
169 <when value="GRCh37">
170 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
171 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
172 <param name="ensembl_genome_version" type="hidden" value="GRCh37"/>
173 <param name="ensembl_version" type="hidden" value="71"/>
174 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
175 <param name="ncbi_prefix" type="hidden" value="Hs"/>
176 <param name="ucsc_genome_version" type="hidden" value="hg19"/>
177 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
178 <param name="mt_chromosome" type="hidden" value="MT"/>
179 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
180 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
181 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
182 </when>
183 <when value="NCBI36">
184 <param name="ensembl_organism" type="hidden" value="homo_sapiens"/>
185 <param name="ensembl_prefix" type="hidden" value="Homo_sapiens"/>
186 <param name="ensembl_genome_version" type="hidden" value="NCBI36"/>
187 <param name="ensembl_version" type="hidden" value="54"/>
188 <param name="ncbi_organism" type="hidden" value="Homo_sapiens"/>
189 <param name="ncbi_prefix" type="hidden" value="Hs"/>
190 <param name="ucsc_genome_version" type="hidden" value="hg18"/>
191 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
192 <param name="mt_chromosome" type="hidden" value="MT"/>
193 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
194 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
195 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
196 </when>
197 <when value="GRCm38">
198 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
199 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
200 <param name="ensembl_genome_version" type="hidden" value="GRCm38"/>
201 <param name="ensembl_version" type="hidden" value="71"/>
202 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
203 <param name="ncbi_prefix" type="hidden" value="Mm"/>
204 <param name="ucsc_genome_version" type="hidden" value="mm10"/>
205 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
206 <param name="mt_chromosome" type="hidden" value="MT"/>
207 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
208 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
209 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
210 </when>
211 <when value="NCBIM37">
212 <param name="ensembl_organism" type="hidden" value="mus_musculus"/>
213 <param name="ensembl_prefix" type="hidden" value="Mus_musculus"/>
214 <param name="ensembl_genome_version" type="hidden" value="NCBIM37"/>
215 <param name="ensembl_version" type="hidden" value="67"/>
216 <param name="ncbi_organism" type="hidden" value="Mus_musculus"/>
217 <param name="ncbi_prefix" type="hidden" value="Mm"/>
218 <param name="ucsc_genome_version" type="hidden" value="mm9"/>
219 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT"/>
220 <param name="mt_chromosome" type="hidden" value="MT"/>
221 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
222 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
223 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
224 </when>
225 <when value="Rnor_5.0">
226 <param name="ensembl_organism" type="hidden" value="rattus_norvegicus"/>
227 <param name="ensembl_prefix" type="hidden" value="Rattus_norvegicus"/>
228 <param name="ensembl_genome_version" type="hidden" value="Rnor_5.0"/>
229 <param name="ensembl_version" type="hidden" value="71"/>
230 <param name="ncbi_organism" type="hidden" value="Rattus_norvegicus"/>
231 <param name="ncbi_prefix" type="hidden" value="Rn"/>
232 <param name="ucsc_genome_version" type="hidden" value="rn5"/>
233 <param name="chromosomes" type="hidden" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT"/>
234 <param name="mt_chromosome" type="hidden" value="MT"/>
235 <param name="gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding"/>
236 <param name="ig_gene_sources" type="hidden" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene"/>
237 <param name="rrna_gene_sources" type="hidden" value="Mt_rRNA,rRNA,rRNA_pseudogene"/>
238 </when>
239 <when value="user_specified">
240 <param name="ensembl_organism" type="text" value="" label="Ensembl Organism Name" help="Examples: homo_sapiens, mus_musculus, rattus_norvegicus"/>
241 <param name="ensembl_prefix" type="text" value="" label="Ensembl Organism prefix" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
242 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Examples: GRCh38, GRCh37, GRCm38, Rnor_5.0"/>
243 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 86"/>
244 <param name="ncbi_organism" type="text" value="" label="NCBI Organism Name" help="Examples: Homo_sapiens, Mus_musculus, Rattus_norvegicus"/>
245 <param name="ncbi_prefix" type="text" value="" label="NCBI Organism Unigene prefix" help="Examples: Hs, Mm, Rn"/>
246 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Examples: hg38, hg19, mm10, rn5"/>
247 <param name="chromosomes" type="text" value="" label="Chromosomes for Ensembl genome build" >
248 <help> Examples:
249 Homo_sapiens: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
250 Mus_musculus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,Y,MT
251 Rattus_norvegicus: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,X,MT
252 ( ftp://ftp.ensembl.org/pub/release-71/fasta/homo_sapiens/dna/ )
253 </help>
254 </param>
255 <param name="mt_chromosome" type="text" value="MT" label="Ensembl Mitochondrial Chromosome name" />
256 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
257 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
258 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
259 </when>
260 </conditional>
261 </inputs>
262 <outputs>
263 <data name="out_file" format="data_manager_json" label="${tool.name} : ${genome.ensembl_genome_version}"/>
264 </outputs>
265 <tests>
266 </tests>
267 <help>
268 **DeFuse**
269
270 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion. See the DeFuse_Version_0.6_ manual for details.
271
272 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.6_:
273 - genome_fasta from Ensembl
274 - gene_models from Ensembl
275 - repeats_filename from UCSC RepeatMasker rmsk.txt
276 - est_fasta from UCSC
277 - est_alignments from UCSC intronEst.txt
278 - unigene_fasta from NCBI
279
280 The create_defuse_reference Galaxy tool downloads the reference genome and other source files, and builds any derivative files including bowtie indices, gmap indices, and 2bit files. Expect this step to take at least 12 hours.
281
282
283 It will generate the reference data for the deFuse Galaxy tool.
284
285 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
286
287 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
288
289 .. _DeFuse_Version_0.6: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.6.1
290
291 ------
292
293 **Outputs**
294
295 The galaxy history will contain: the config.txt file that provides DeFuse with the reference data paths.
296
297 </help>
298 </tool>