comparison create_reference_dataset.xml @ 12:33e2235bf003

Add create_reference_dataset.xml
author Jim Johnson <jj@umn.edu>
date Sun, 09 Jun 2013 20:30:21 -0500
parents
children 85693cb5339f
comparison
equal deleted inserted replaced
11:19c48803a377 12:33e2235bf003
1 <tool id="create_defusei_reference" name="Create DeFuse Reference" version="1.6.1">
2 <description>create a defuse reference from Ensembl and UCSC sources</description>
3 <requirements> <!-- Packages requested for the job; the shscript configfile locates their executables with "which". That script also looks up R and Rscript, for which no requirement is declared here. -->
4 <requirement type="package" version="0.6.1">defuse</requirement>
5 <requirement type="package" version="0.1.18">samtools</requirement>
6 <requirement type="package" version="1.0.0">bowtie</requirement>
7 <requirement type="package" version="2013-05-09">gmap</requirement>
8 <requirement type="package" version="latest">kent</requirement>
9 </requirements>
10 <command interpreter="command"> /bin/bash $shscript </command> <!-- Runs the bash script rendered from the "shscript" configfile defined below. -->
11 <inputs> <!-- Each value below is inserted into the generated deFuse config by the defuse_config configfile. -->
12 <param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/>
13 <param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
14 <param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/>
15 <param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
16 <param name="mt_chromosome" type="text" value="MT" label="Mitochondrial Chromosome" />
17 <param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
18 <param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
19 <param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
20 </inputs>
21 <outputs> <!-- Single output: the rendered deFuse config, copied here by shscript. The config's dataset_directory is set to this dataset's extra_files_path. -->
22 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
23 </outputs>
24 <configfiles>
25 <configfile name="defuse_config">
26 #import ast
27 #
28 # Configuration file for defuse
29 #
30 # At a minimum, change all values enclosed by []
31 #
32
33 # Directory where the defuse code was unpacked
34 ## Default location in the tool/defuse directory
35 # source_directory = ${__root_dir__}/tools/defuse
36 source_directory = __DEFUSE_PATH__
37
38 ensembl_version = $ensembl_version
39 ensembl_genome_version = $ensembl_genome_version
40 ucsc_genome_version = $ucsc_genome_version
41
42 # Directory where you want your dataset
43 dataset_directory = $config_txt.extra_files_path
44
45 #raw
46 # Input genome and gene models
47 gene_models = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf
48 genome_fasta = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
49
50 # Repeat table from ucsc genome browser
51 repeats_filename = $(dataset_directory)/repeats.txt
52
53 # EST info downloaded from ucsc genome browser
54 est_fasta = $(dataset_directory)/est.fa
55 est_alignments = $(dataset_directory)/intronEst.txt
56
57 # Unigene clusters downloaded from ncbi
58 unigene_fasta = $(dataset_directory)/Hs.seq.uniq
59 #end raw
60
61 # Paths to external tools
62 samtools_bin = __SAMTOOLS_BIN__
63 bowtie_bin = __BOWTIE_BIN__
64 bowtie_build_bin = __BOWTIE_BUILD_BIN__
65 blat_bin = __BLAT_BIN__
66 fatotwobit_bin = __FATOTWOBIT_BIN__
67 gmap_bin = __GMAP_BIN__
68 gmap_setup_bin = __GMAP_SETUP_BIN__
69 r_bin = __R_BIN__
70 rscript_bin = __RSCRIPT_BIN__
71
72 #raw
73 # Directory for the gmap index
74 gmap_index_directory = $(dataset_directory)/gmap
75 #end raw
76
77 #raw
78 # Dataset files
79 dataset_prefix = $(dataset_directory)/defuse
80 chromosome_prefix = $(dataset_prefix).dna.chromosomes
81 exons_fasta = $(dataset_prefix).exons.fa
82 cds_fasta = $(dataset_prefix).cds.fa
83 cdna_regions = $(dataset_prefix).cdna.regions
84 cdna_fasta = $(dataset_prefix).cdna.fa
85 reference_fasta = $(dataset_prefix).reference.fa
86 rrna_fasta = $(dataset_prefix).rrna.fa
87 ig_gene_list = $(dataset_prefix).ig.gene.list
88 repeats_regions = $(dataset_directory)/repeats.regions
89 est_split_fasta1 = $(dataset_directory)/est.1.fa
90 est_split_fasta2 = $(dataset_directory)/est.2.fa
91 est_split_fasta3 = $(dataset_directory)/est.3.fa
92 est_split_fasta4 = $(dataset_directory)/est.4.fa
93 est_split_fasta5 = $(dataset_directory)/est.5.fa
94 est_split_fasta6 = $(dataset_directory)/est.6.fa
95 est_split_fasta7 = $(dataset_directory)/est.7.fa
96 est_split_fasta8 = $(dataset_directory)/est.8.fa
97 est_split_fasta9 = $(dataset_directory)/est.9.fa
98
99 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
100 prefilter1 = $(unigene_fasta)
101
102 # deFuse scripts and tools
103 scripts_directory = $(source_directory)/scripts
104 tools_directory = $(source_directory)/tools
105 data_directory = $(source_directory)/data
106 #end raw
107
108 #raw
109 # Bowtie parameters
110 bowtie_threads = 1
111 bowtie_quals = --phred33-quals
112 max_insert_size = 500
113 #end raw
114
115 # Parameters for building the dataset
116 chromosomes = $chromosomes
117 mt_chromosome = $mt_chromosome
118 gene_sources = $gene_sources
119 ig_gene_sources = $ig_gene_sources
120 rrna_gene_sources = $rrna_gene_sources
121
122 #raw
123 # Blat sequences per job
124 num_blat_sequences = 10000
125
126 # Minimum gene fusion range
127 dna_concordant_length = 2000
128
129 # Trim length for discordant reads (split reads are not trimmed)
130 discord_read_trim = 50
131
132 # Calculate extra annotations, fusion splice index and interrupted index
133 calculate_extra_annotations = no
134
135 # Filtering parameters
136 clustering_precision = 0.95
137 span_count_threshold = 5
138 percent_identity_threshold = 0.90
139 split_min_anchor = 4
140 splice_bias = 10
141 positive_controls = $(data_directory)/controls.txt
142 probability_threshold = 0.50
143
144 # Position density when calculating covariance
145 covariance_sampling_density = 0.01
146
147 # Number of reads for each job in split
148 reads_per_job = 1000000
149
150 # If you have command line 'mail' and wish to be notified
151 mailto = andrew.mcpherson@gmail.com
152
153 # Remove temp files
154 remove_job_files = yes
155 remove_job_temp_files = yes
156 #end raw
157 </configfile>
158 <configfile name="shscript">
159 #!/bin/bash
160 ## This Cheetah template is rendered to the bash script that the tool command runs.
161 ## It fills the __NAME__ placeholders left in the rendered deFuse config, copies the
162 ## config to the output dataset, and then builds the reference dataset.
163 ## An ampersand cannot appear literally in the tool XML, so it is produced via chr().
164 #set $amp = chr(38)
165 ## A backslash before a dollar sign keeps Cheetah from expanding it, leaving the variable for bash.
166 ## DEFUSE_PATH is read from the job environment; this script does not set it.
167 ## Every other placeholder is replaced only when it is present and the lookup command succeeds.
168 ## substitute pathnames into config file
169 if grep -q __DEFUSE_PATH__ "$defuse_config"; then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" "$defuse_config"; fi
170 if grep -q __SAMTOOLS_BIN__ "$defuse_config" ${amp}${amp} SAMTOOLS_BIN=`which samtools`; then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" "$defuse_config"; fi
171 if grep -q __BOWTIE_BIN__ "$defuse_config" ${amp}${amp} BOWTIE_BIN=`which bowtie`; then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" "$defuse_config"; fi
172 if grep -q __BOWTIE_BUILD_BIN__ "$defuse_config" ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`; then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" "$defuse_config"; fi
173 if grep -q __BLAT_BIN__ "$defuse_config" ${amp}${amp} BLAT_BIN=`which blat`; then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" "$defuse_config"; fi
174 if grep -q __FATOTWOBIT_BIN__ "$defuse_config" ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`; then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" "$defuse_config"; fi
175 if grep -q __GMAP_BIN__ "$defuse_config" ${amp}${amp} GMAP_BIN=`which gmap`; then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" "$defuse_config"; fi
176 if grep -q __GMAP_SETUP_BIN__ "$defuse_config" ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`; then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" "$defuse_config"; fi
177 if grep -q __GMAP_INDEX_DIR__ "$defuse_config" ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap; then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" "$defuse_config"; fi
178 if grep -q __R_BIN__ "$defuse_config" ${amp}${amp} R_BIN=`which R`; then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" "$defuse_config"; fi
179 if grep -q __RSCRIPT_BIN__ "$defuse_config" ${amp}${amp} RSCRIPT_BIN=`which Rscript`; then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" "$defuse_config"; fi
180
181 ## copy config to output
182 cp "$defuse_config" "$config_txt"
183 ## create the directory that the config names as dataset_directory
184 mkdir -p "$config_txt.extra_files_path"
185 ## build the reference dataset described by the config
186 perl "\${DEFUSE_PATH}/scripts/create_reference_dataset.pl" -c "$defuse_config"
187 </configfile>
188 </configfiles>
189
190 <tests>
191 </tests>
192 <help>
193 **DeFuse**
194
195 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
196
197 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
198
199 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
200
201 ------
202
203 **Inputs**
204
205 DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
206
207 If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
208
209 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
210 - genome_fasta from Ensembl
211 - gene_models from Ensembl
212 - repeats_filename from UCSC RepeatMasker rmsk.txt
213 - est_fasta from UCSC
214 - est_alignments from UCSC intronEst.txt
215 - unigene_fasta from NCBI
216
217 .. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
218
219 ------
220
221 **Outputs**
222
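223 This tool itself adds a single dataset to the history: the config.txt file, whose extra files directory is the reference dataset directory named in that config. The remainder of this section describes a DeFuse run. After a DeFuse run the galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.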
224
225 DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt.
226
227 The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
228
229 - **Identification**
230 - cluster_id : random identifier assigned to each prediction
231 - library_name : library name given on the command line of defuse
232 - gene1 : ensembl id of gene 1
233 - gene2 : ensembl id of gene 2
234 - gene_name1 : name of gene 1
235 - gene_name2 : name of gene 2
236 - **Evidence**
237 - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
238 - concordant_ratio : proportion of spanning reads considered concordant by blat
239 - denovo_min_count : minimum kmer count across denovo assembled sequence
240 - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
241 - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
242 - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
243 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
244 - min_map_count : minimum of the number of genomic mappings for each spanning read
245 - max_map_count : maximum of the number of genomic mappings for each spanning read
246 - mean_map_count : average of the number of genomic mappings for each spanning read
247 - num_multi_map : number of spanning reads that map to more than one genomic location
248 - span_count : number of spanning reads supporting the fusion
249 - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
250 - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
251 - span_coverage_min : minimum of span_coverage1 and span_coverage2
252 - span_coverage_max : maximum of span_coverage1 and span_coverage2
253 - splitr_count : number of split reads supporting the prediction
254 - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
255 - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
256 - splitr_sequence : fusion sequence predicted by split reads
257 - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
258 - **Annotation**
259 - adjacent : fusion between adjacent genes
260 - altsplice : fusion likely the product of alternative splicing between adjacent genes
261 - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
262 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
263 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
264 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
265 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
266 - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
267 - deletion : fusion produced by a genomic deletion
268 - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
269 - eversion : fusion produced by a genomic eversion
270 - exonboundaries : fusion splice at exon boundaries
271 - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
272 - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
273 - gene_chromosome1 : chromosome of gene 1
274 - gene_chromosome2 : chromosome of gene 2
275 - gene_end1 : end position for gene 1
276 - gene_end2 : end position for gene 2
277 - gene_location1 : location of breakpoint in gene 1
278 - gene_location2 : location of breakpoint in gene 2
279 - gene_start1 : start of gene 1
280 - gene_start2 : start of gene 2
281 - gene_strand1 : strand of gene 1
282 - gene_strand2 : strand of gene 2
283 - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
284 - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
285 - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
286 - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
287 - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
288 - interchromosomal : fusion produced by an interchromosomal translocation
289 - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
290 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
291 - inversion : fusion produced by genomic inversion
292 - orf : fusion combines genes in a way that preserves a reading frame
293 - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.txt)
294 - read_through : fusion involving adjacent genes potentially resulting from co-transcription rather than genome rearrangement
295 - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
296 - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
297 - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
298 - splice_score : number of nucleotides similar to GTAG at fusion splice
299 - num_splice_variants : number of potential splice variants for this gene pair
300 - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
301 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
302
303
304 **Example**
305
306 results.tsv::
307
308 cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2
309 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 -
310 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - -
311
312 </help>
313 </tool>