comparison defuse.xml @ 0:63f23d5db27c draft

planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/defuse commit 2c2fd38cb761ec57bac7a0bd376e6aa2b88265d0-dirty
author jjohnson
date Mon, 20 May 2019 15:25:03 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:63f23d5db27c
1 <tool id="defuse" name="DeFuse" version="@DEFUSE_VERSION@.1">
2 <description>identify fusion transcripts</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <expand macro="defuse_requirement" />
8 </requirements>
9 <command detect_errors="default"><![CDATA[
10 #if $defuse_out.__str__ != 'None':
11 ## ln to output_dir in from_work_dir
12 mkdir -p $defuse_out.files_path &&
13 ln -s $defuse_out.files_path output_dir &&
14 #else
15 mkdir -p output_dir &&
16 #end if
17 ## Put executable paths in config file
18 $__tool_directory__/config_sub.sh $defuse_config output_dir/defuse.cfg &&
19 ## copy config to output
20 cp output_dir/defuse.cfg $config_txt &&
21 ## make a data_dir and ln -s the input fastq
22 mkdir -p data_dir &&
23 ln -s "$left_pairendreads" data_dir/reads_1.fastq &&
24 ln -s "$right_pairendreads" data_dir/reads_2.fastq &&
25 ## run
26 DATASET_DIRECTORY=`grep '^dataset_directory' output_dir/defuse.cfg | awk '{print \$NF}'` &&
27 defuse_run.pl --name "$library_name" --config output_dir/defuse.cfg --dataset \$DATASET_DIRECTORY -1 data_dir/reads_1.fastq -2 data_dir/reads_2.fastq -o output_dir -p \$GALAXY_SLOTS &&
28 grep -v cluster_id output_dir/results.filtered.tsv | awk '{print $1}' > cluster_id_list &&
29 get_fusion_fastq.pl --list cluster_id_list --output output_dir --fastq1 results.fusions_1.fq --fastq2 results.fusions_2.fq &&
30 cp output_dir/results.* . &&
31 cp `find -L output_dir -name defuse.log` $defuse_log
32 #if $defuse_out.__str__ != 'None':
33 && $__tool_directory__/make_html.sh $defuse_out $defuse_out.files_path
34 #end if
35 ]]></command>
36 <configfiles>
37 <configfile name="defuse_config">
38 #import re
39 #if $refGenomeSource.genomeSource == "history":
40 #set config_file = $refGenomeSource.config.__str__
41 #else
42 #set config_file = $refGenomeSource.index.value
43 #end if
44 #set pat = '^\s*([^#=][^=]*?)\s*=\s*(.*?)\s*$'
45 #set fh = open($config_file)
46 #set keys = ['dataset_directory','ensembl_organism','ensembl_prefix','ensembl_version','ensembl_genome_version','ucsc_genome_version','ncbi_organism','ncbi_prefix','chromosomes','mt_chromosome','gene_sources','ig_gene_sources','rrna_gene_sources']
47 #set kv = []
48 #for $line in $fh:
49 #set m = $re.match($pat,$line)
50 #if $m and len($m.groups()) == 2:
51 ## #echo $line
52 #if $m.groups()[0] in keys:
53 #set k = $m.groups()[0]
54 #if k == 'dataset_directory' and $refGenomeSource.genomeSource == "indexed":
55 ## The DataManager is configured to place the config file in the same directory as the defuse_data: dataset_directory
56 #set v = $os.path.dirname($config_file)
57 #else:
58 #set v = $m.groups()[1]
59 #end if
60 #set kv = $kv + [[$k, $v]]
61 #end if
62 #end if
63 #end for
64 ## #echo $kv
65 #set ref_dict = dict($kv)
66 ## #echo $ref_dict
67 ## include raw $refGenomeSource.config.__str__
68 #
69 # Configuration file for defuse
70 #
71 # At a minimum, change all values enclosed by []
72 #
73
74 # Directory where the defuse code was unpacked
75 ## Default location in the tool/defuse directory
76 # source_directory = ${__root_dir__}/tools/defuse
77 source_directory = __DEFUSE_PATH__
78
79 # Directory where you want your dataset
80 dataset_directory = #slurp
81 #try
82 $ref_dict['dataset_directory']
83 #except
84 /project/db/genomes/Hsapiens/hg19/defuse
85 #end try
86
87 # Organism IDs
88 ensembl_organism = #slurp
89 #try
90 $ref_dict['ensembl_organism']
91 #except
92 homo_sapiens
93 #end try
94
95 ensembl_prefix = #slurp
96 #try
97 $ref_dict['ensembl_prefix']
98 #except
99 Homo_sapiens
100 #end try
101
102 ensembl_version = #slurp
103 #try
104 $ref_dict['ensembl_version']
105 #except
106 71
107 #end try
108
109 ensembl_genome_version = #slurp
110 #try
111 $ref_dict['ensembl_genome_version']
112 #except
113 GRCh37
114 #end try
115
116 ucsc_genome_version = #slurp
117 #try
118 $ref_dict['ucsc_genome_version']
119 #except
120 hg19
121 #end try
122
123 ncbi_organism = #slurp
124 #try
125 $ref_dict['ncbi_organism']
126 #except
127 Homo_sapiens
128 #end try
129
130 ncbi_prefix = #slurp
131 #try
132 $ref_dict['ncbi_prefix']
133 #except
134 Hs
135 #end try
136
137 # Input genome and gene models
138 gene_models = #slurp
139 #try
140 $ref_dict['gene_models']
141 #except
142 \$(dataset_directory)/\$(ensembl_prefix).\$(ensembl_genome_version).\$(ensembl_version).gtf
143 #end try
144 genome_fasta = #slurp
145 #try
146 $ref_dict['genome_fasta']
147 #except
148 \$(dataset_directory)/\$(ensembl_prefix).\$(ensembl_genome_version).\$(ensembl_version).dna.chromosomes.fa
149 #end try
150
151 # Repeat table from ucsc genome browser
152 repeats_filename = #slurp
153 #try
154 $ref_dict['repeats_filename']
155 #except
156 \$(dataset_directory)/rmsk.txt
157 #end try
158
159 # EST info downloaded from ucsc genome browser
160 est_fasta = #slurp
161 #try
162 $ref_dict['est_fasta']
163 #except
164 \$(dataset_directory)/est.fa
165 #end try
166 est_alignments = #slurp
167 #try
168 $ref_dict['est_alignments']
169 #except
170 \$(dataset_directory)/intronEst.txt
171 #end try
172
173 # Unigene clusters downloaded from ncbi
174 unigene_fasta = #slurp
175 #try
176 $ref_dict['unigene_fasta']
177 #except
178 \$(dataset_directory)/\$(ncbi_prefix).seq.uniq
179 #end try
180
181 # Paths to external tools
182 bowtie_bin = __BOWTIE_BIN__
183 bowtie_build_bin = __BOWTIE_BUILD_BIN__
184 blat_bin = __BLAT_BIN__
185 fatotwobit_bin = __FATOTWOBIT_BIN__
186 gmap_bin = __GMAP_BIN__
187 gmap_bin = __GMAP_BIN__
188 gmap_setup_bin = __GMAP_SETUP_BIN__
189 r_bin = __R_BIN__
190 rscript_bin = __RSCRIPT_BIN__
191
192 # Directory where you want your dataset
193 gmap_index_directory = #slurp
194 #try
195 $ref_dict['gmap_index_directory']
196 #except
197 #raw
198 $(dataset_directory)/gmap
199 #end raw
200 #end try
201
202 #raw
203 # Dataset files
204 dataset_prefix = $(dataset_directory)/defuse
205 chromosome_prefix = $(dataset_prefix).dna.chromosomes
206 exons_fasta = $(dataset_prefix).exons.fa
207 cds_fasta = $(dataset_prefix).cds.fa
208 cdna_regions = $(dataset_prefix).cdna.regions
209 cdna_fasta = $(dataset_prefix).cdna.fa
210 reference_fasta = $(dataset_prefix).reference.fa
211 rrna_fasta = $(dataset_prefix).rrna.fa
212 ig_gene_list = $(dataset_prefix).ig.gene.list
213 repeats_regions = $(dataset_directory)/repeats.regions
214 est_split_fasta1 = $(dataset_directory)/est.1.fa
215 est_split_fasta2 = $(dataset_directory)/est.2.fa
216 est_split_fasta3 = $(dataset_directory)/est.3.fa
217 est_split_fasta4 = $(dataset_directory)/est.4.fa
218 est_split_fasta5 = $(dataset_directory)/est.5.fa
219 est_split_fasta6 = $(dataset_directory)/est.6.fa
220 est_split_fasta7 = $(dataset_directory)/est.7.fa
221 est_split_fasta8 = $(dataset_directory)/est.8.fa
222 est_split_fasta9 = $(dataset_directory)/est.9.fa
223
224 # Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
225 prefilter1 = $(unigene_fasta)
226
227 # deFuse scripts and tools
228 scripts_directory = $(source_directory)/scripts
229 tools_directory = $(source_directory)/tools
230 data_directory = $(source_directory)/data
231 #end raw
232
233 # Path to samtools, 0.1.8 is compiled for you, use other versions at your own risk
234 samtools_bin = #slurp
235 #try
236 $ref_dict['samtools_bin']
237 #except
238 \$(source_directory)/external/samtools-0.1.8/samtools
239 #end try
240
241 # Bowtie parameters
242 bowtie_threads = #slurp
243 #try
244 $ref_dict['bowtie_threads']
245 #except
246 4
247 #end try
248 bowtie_quals = #slurp
249 #try
250 $ref_dict['bowtie_quals']
251 #except
252 --phred33-quals
253 #end try
254 bowtie_params = #slurp
255 #try
256 $ref_dict['bowtie_params']
257 #except
258 --chunkmbs 200
259 #end try
260 max_insert_size = #slurp
261 #if $defuse_param.settings == "full" and $defuse_param.max_insert_size.__str__ != "":
262 $defuse_param.max_insert_size
263 #else
264 #try
265 $ref_dict['max_insert_size']
266 #except
267 500
268 #end try
269 #end if
270
271 # Parameters for building the dataset
272 chromosomes = #slurp
273 #try
274 $ref_dict.chromosomes
275 #except
276 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT
277 #end try
278 mt_chromosome = #slurp
279 #try
280 $ref_dict['mt_chromosome']
281 #except
282 MT
283 #end try
284 gene_sources = #slurp
285 #try
286 $ref_dict['gene_sources']
287 #except
288 IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding
289 #end try
290 ig_gene_sources = #slurp
291 #try
292 $ref_dict['ig_gene_sources']
293 #except
294 IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene
295 #end try
296 rrna_gene_sources = #slurp
297 #try
298 $ref_dict['rrna_gene_sources']
299 #except
300 Mt_rRNA,rRNA,rRNA_pseudogene
301 #end try
302
303 # Blat sequences per job
304 num_blat_sequences = #slurp
305 #try
306 $ref_dict['num_blat_sequences']
307 #except
308 10000
309 #end try
310
311 # Minimum gene fusion range
312 dna_concordant_length = #slurp
313 #if $defuse_param.settings == "full" and $defuse_param.dna_concordant_length.__str__ != "":
314 $defuse_param.dna_concordant_length
315 #else
316 #try
317 $ref_dict['dna_concordant_length']
318 #except
319 2000
320 #end try
321 #end if
322
323 # Trim length for discordant reads (split reads are not trimmed)
324 discord_read_trim = #slurp
325 #if $defuse_param.settings == "full" and $defuse_param.discord_read_trim.__str__ != "":
326 $defuse_param.discord_read_trim
327 #else
328 #try
329 $ref_dict['discord_read_trim']
330 #except
331 50
332 #end try
333 #end if
334 # Calculate extra annotations, fusion splice index and interrupted index
335 calculate_extra_annotations = #slurp
336 #if $defuse_param.settings == "full" and $defuse_param.calculate_extra_annotations.__str__ != "":
337 $defuse_param.calculate_extra_annotations
338 #else
339 #try
340 $ref_dict['calculate_extra_annotations']
341 #except
342 no
343 #end try
344 #end if
345 # Filtering parameters
346 clustering_precision = #slurp
347 #if $defuse_param.settings == "full" and $defuse_param.clustering_precision.__str__ != ""
348 $defuse_param.clustering_precision
349 #else
350 #try
351 $ref_dict['clustering_precision']
352 #except
353 0.95
354 #end try
355 #end if
356 span_count_threshold = #slurp
357 #if $defuse_param.settings == "full" and $defuse_param.span_count_threshold.__str__ != ""
358 $defuse_param.span_count_threshold
359 #else
360 #try
361 $ref_dict['span_count_threshold']
362 #except
363 5
364 #end try
365 #end if
366 percent_identity_threshold = #slurp
367 #if $defuse_param.settings == "full" and $defuse_param.percent_identity_threshold.__str__ != ""
368 $defuse_param.percent_identity_threshold
369 #else
370 #try
371 $ref_dict['percent_identity_threshold']
372 #except
373 0.90
374 #end try
375 #end if
376 split_min_anchor = #slurp
377 #if $defuse_param.settings == "full" and $defuse_param.split_min_anchor.__str__ != ""
378 $defuse_param.split_min_anchor
379 #else
380 #try
381 $ref_dict['split_min_anchor']
382 #except
383 4
384 #end try
385 #end if
386 splice_bias = #slurp
387 #if $defuse_param.settings == "full" and $defuse_param.splice_bias.__str__ != ""
388 $defuse_param.splice_bias
389 #else
390 #try
391 $ref_dict['splice_bias']
392 #except
393 10
394 #end try
395 #end if
396 denovo_assembly = #slurp
397 #if $defuse_param.settings == "full" and $defuse_param.denovo_assembly.__str__ != ""
398 $defuse_param.denovo_assembly
399 #else
400 #try
401 $ref_dict['denovo_assembly']
402 #except
403 no
404 #end try
405 #end if
406 probability_threshold = #slurp
407 #if $defuse_param.settings == "full" and $defuse_param.probability_threshold.__str__ != ""
408 $defuse_param.probability_threshold
409 #else
410 #try
411 $ref_dict['probability_threshold']
412 #except
413 0.50
414 #end try
415 #end if
416 positive_controls = \$(data_directory)/controls.txt
417
418 # Use multiple exon transcripts for stats calculations (yes/no)
419 # should be enabled for very small libraries
420 multi_exon_transcripts_stats = #slurp
421 #if $defuse_param.settings == "full" and $defuse_param.multi_exon_transcripts_stats.__str__ != ""
422 $defuse_param.multi_exon_transcripts_stats
423 #else
424 #try
425 $ref_dict['multi_exon_transcripts_stats']
426 #except
427 no
428 #end try
429 #end if
430
431 # Position density when calculating covariance
432 covariance_sampling_density = #slurp
433 #if $defuse_param.settings == "full" and $defuse_param.covariance_sampling_density.__str__ != ""
434 $defuse_param.covariance_sampling_density
435 #else
436 #try
437 $ref_dict['covariance_sampling_density']
438 #except
439 0.01
440 #end try
441 #end if
442
443 # Maximum number of alignments for a read pair
444 # Pairs with more alignments are filtered
445 max_paired_alignments = #slurp
446 #if $defuse_param.settings == "full" and $defuse_param.max_paired_alignments.__str__ != ""
447 $defuse_param.max_paired_alignments
448 #else
449 #try
450 $ref_dict['max_paired_alignments']
451 #except
452 10
453 #end try
454 #end if
455
456 # Number of reads for each job in split
457 reads_per_job = #slurp
458 #if $defuse_param.settings == "full" and $defuse_param.reads_per_job.__str__ != ""
459 $defuse_param.reads_per_job
460 #else
461 #try
462 $ref_dict['reads_per_job']
463 #except
464 1000000
465 #end try
466 #end if
467
468 #raw
469 # If you have command line 'mail' and wish to be notified
470 # mailto = andrew.mcpherson@gmail.com
471
472 # Remove temp files
473 remove_job_files = yes
474 remove_job_temp_files = yes
475
476 qsub_params = ""
477
478 #end raw
479
480 </configfile>
481 </configfiles>
482 <inputs>
483 <param name="left_pairendreads" type="data" format="fastq" label="left part of read pairs" help="The left and right reads pairs must be in the same order, and not have any unpaired reads. (FASTQ interlacer will pair reads and remove the unpaired. FASTQ de-interlacer will separate the result into left and right reads.)"/>
484 <param name="right_pairendreads" type="data" format="fastq" label="right part of read pairs" help="In the same order as the left reads"/>
485 <param name="library_name" type="text" value="unknown" label="library name" help="Value to put in the results library_name column">
486 <validator type="length" min="1"/>
487 </param>
488 <conditional name="refGenomeSource">
489 <param name="genomeSource" type="select" label="Will you select a built-in DeFuse Reference Dataset, or supply a configuration from your history" help="">
490 <option value="indexed">Use a built-in DeFuse Reference Dataset</option>
491 <option value="history">Use a configuration from your history that specifies the DeFuse Reference Dataset</option>
492 </param>
493 <when value="indexed">
494 <param name="index" type="select" label="Select a Reference Dataset" help="if your genome of interest is not listed - contact Galaxy team">
495 <options from_file="defuse_reference.loc">
496 <column name="name" index="1"/>
497 <column name="value" index="3"/>
498 <filter type="sort_by" column="0" />
499 <validator type="no_options" message="No indexes are available" />
500 </options>
501 </param>
502 </when>
503 <when value="history">
504 <param name="config" type="data" format="defuse.conf" label="Defuse Config file" help=""/>
505 </when> <!-- history -->
506 </conditional> <!-- refGenomeSource -->
507 <conditional name="defuse_param">
508 <param name="settings" type="select" label="Defuse parameter settings" help="">
509 <option value="preSet">Default settings</option>
510 <option value="full">Full parameter list</option>
511 </param>
512 <when value="preSet" />
513 <when value="full">
514 <param name="max_insert_size" type="integer" value="500" optional="true" label="Bowtie max_insert_size" />
515 <param name="dna_concordant_length" type="integer" value="2000" optional="true" label="Minimum gene fusion range dna_concordant_length" />
516 <param name="discord_read_trim" type="integer" value="50" optional="true" label="Trim length for discordant reads discord_read_trim" help="(split reads are not trimmed)" />
517 <param name="calculate_extra_annotations" type="select" label="Calculate extra annotations, fusion splice index and interrupted index" help="">
518 <option value="">Use Default</option>
519 <option value="no">no</option>
520 <option value="yes">yes</option>
521 </param>
522 <param name="clustering_precision" type="float" value=".95" optional="true" label="Filter clustering_precision">
523 <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
524 </param>
525 <param name="span_count_threshold" type="integer" value="5" optional="true" label="Filter span_count_threshold" />
526 <param name="percent_identity_threshold" type="float" value=".90" optional="true" label="Filter percent_identity_threshold">
527 <validator type="in_range" message="Choose a value between .1 and 1.0" min=".1" max="1"/>
528 </param>
529 <param name="split_min_anchor" type="integer" value="4" optional="true" label="Filter split_min_anchor" />
530 <param name="splice_bias" type="integer" value="10" optional="true" label="Filter splice_bias" />
531 <param name="probability_threshold" type="float" value="0.50" optional="true" label="Filter probability_threshold">
532 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
533 </param>
534 <param name="multi_exon_transcripts_stats" type="select" label="Use multiple exon transcripts for stats calculations" help="should be enabled for very small libraries">
535 <option value="no" selected="true">no</option>
536 <option value="yes">yes</option>
537 </param>
538 <param name="covariance_sampling_density" type="float" value="0.01" optional="true" label="covariance_sampling_density">
539 <help>Position density when calculating covariance</help>
540 <validator type="in_range" message="Choose a value between 0.0 and 1.0" min="0" max="1"/>
541 </param>
542 <param name="max_paired_alignments" type="integer" value="10" optional="true" label="max_paired_alignments">
543 <help>Maximum number of alignments for a read pair, Pairs with more alignments are filtered, default is 10</help>
544 <validator type="in_range" message="Choose a value between 1 and 100" min="1" max="100"/> <!-- message must state the same bounds as min/max -->
545 </param>
546 <param name="denovo_assembly" type="select" label="denovo_assembly" help="">
547 <option value="">Use Default</option>
548 <option value="no">no</option>
549 <option value="yes">yes</option>
550 </param>
551 <!--
552 <param name="positive_controls" type="data" format="txt" optional=true label="Defuse positive_controls" help=""/>
553 -->
554 <param name="reads_per_job" type="integer" value="1000000" optional="true" label="Number of reads for each job in split" />
555 </when> <!-- full -->
556 </conditional> <!-- defuse_param -->
557 <param name="keep_output" type="boolean" checked="true" truevalue="yes" falsevalue="no" label="Save DeFuse working directory files"
558 help="The defuse output working directory can be helpful for determining errors that may have occurred during the run,
559 but they require considerable diskspace, and should be deleted and purged when no longer needed."/>
560 </inputs>
561 <outputs>
562 <data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
563 <data format="txt" name="defuse_log" label="${tool.name} on ${on_string}: defuse.log" />
564 <data format="html" name="defuse_out" label="${tool.name} on ${on_string}: defuse_output (purge when no longer needed)">
565 <filter>keep_output == True</filter>
566 </data>
567 <data format="defuse.results.tsv" name="results_classify_tsv" label="${tool.name} on ${on_string}: results.classify.tsv" from_work_dir="results.classify.tsv"/>
568 <data format="defuse.results.tsv" name="results_filtered_tsv" label="${tool.name} on ${on_string}: results.filtered.tsv" from_work_dir="results.filtered.tsv"/>
569 <data format="fastqsanger" name="results_fusions1_fq" label="${tool.name} on ${on_string}: fusions_1.fq" from_work_dir="results.fusions_1.fq" />
570 <data format="fastqsanger" name="results_fusions2_fq" label="${tool.name} on ${on_string}: fusions_2.fq" from_work_dir="results.fusions_2.fq" />
571 <!--
572 expression_plot
573 circos plot
574 -->
575 </outputs>
576
577 <tests>
578 </tests>
579 <help>
580 **DeFuse**
581
582 DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
583
584 Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
585
586 .. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
587
588 ------
589
590 **Inputs**
591
592 DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
593
594 If your fastq files have reads in different orders or include unpaired reads, you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
595
596 DeFuse uses a Reference Dataset to search for gene fusions. The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
597 - genome_fasta from Ensembl
598 - gene_models from Ensembl
599 - repeats_filename from UCSC RepeatMasker rmsk.txt
600 - est_fasta from UCSC
601 - est_alignments from UCSC intronEst.txt
602 - unigene_fasta from NCBI
603
604 .. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
605
606 ------
607
608 **Outputs**
609
610 The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters, the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.
611
612 DeFuse generates 3 results files: results.tsv, results.filtered.tsv, and results.classify.tsv. All three files have the same format, though results.classify.tsv has a probability column from the application of the classifier to results.tsv, and results.filtered.tsv has been filtered according to the threshold probability as set in config.txt.
613
614 The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
615
616 - **Identification**
617 - cluster_id : random identifier assigned to each prediction
618 - library_name : library name given on the command line of defuse
619 - gene1 : ensembl id of gene 1
620 - gene2 : ensembl id of gene 2
621 - gene_name1 : name of gene 1
622 - gene_name2 : name of gene 2
623 - **Evidence**
624 - break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
625 - concordant_ratio : proportion of spanning reads considered concordant by blat
626 - denovo_min_count : minimum kmer count across denovo assembled sequence
627 - denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
628 - denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
629 - gene_align_strand1 : alignment strand for spanning read alignments to gene 1
630 - gene_align_strand2 : alignment strand for spanning read alignments to gene 2
631 - min_map_count : minimum of the number of genomic mappings for each spanning read
632 - max_map_count : maximum of the number of genomic mappings for each spanning read
633 - mean_map_count : average of the number of genomic mappings for each spanning read
634 - num_multi_map : number of spanning reads that map to more than one genomic location
635 - span_count : number of spanning reads supporting the fusion
636 - span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
637 - span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
638 - span_coverage_min : minimum of span_coverage1 and span_coverage2
639 - span_coverage_max : maximum of span_coverage1 and span_coverage2
640 - splitr_count : number of split reads supporting the prediction
641 - splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
642 - splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
643 - splitr_sequence : fusion sequence predicted by split reads
644 - splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
645 - **Annotation**
646 - adjacent : fusion between adjacent genes
647 - altsplice : fusion likely the product of alternative splicing between adjacent genes
648 - break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
649 - break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
650 - break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
651 - breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
652 - breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
653 - cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
654 - deletion : fusion produced by a genomic deletion
655 - est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
656 - eversion : fusion produced by a genomic eversion
657 - exonboundaries : fusion splice at exon boundaries
658 - expression1 : expression of gene 1 as number of concordant pairs aligned to exons
659 - expression2 : expression of gene 2 as number of concordant pairs aligned to exons
660 - gene_chromosome1 : chromosome of gene 1
661 - gene_chromosome2 : chromosome of gene 2
662 - gene_end1 : end position for gene 1
663 - gene_end2 : end position for gene 2
664 - gene_location1 : location of breakpoint in gene 1
665 - gene_location2 : location of breakpoint in gene 2
666 - gene_start1 : start of gene 1
667 - gene_start2 : start of gene 2
668 - gene_strand1 : strand of gene 1
669 - gene_strand2 : strand of gene 2
670 - genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
671 - genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
672 - genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
673 - genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
674 - genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
675 - interchromosomal : fusion produced by an interchromosomal translocation
676 - interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
677 - interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
678 - inversion : fusion produced by genomic inversion
679 - orf : fusion combines genes in a way that preserves a reading frame
680 - probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.tsv)
681 - read_through : fusion involving adjacent genes potentially resulting from co-transcription rather than genome rearrangement
682 - repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
683 - repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
684 - max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
685 - splice_score : number of nucleotides similar to GTAG at fusion splice
686 - num_splice_variants : number of potential splice variants for this gene pair
687 - splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
688 - splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
689
690
691 **Example**
692
693 results.tsv::
694
695 cluster_id splitr_sequence splitr_count splitr_span_pvalue splitr_pos_pvalue splitr_min_pvalue adjacent altsplice break_adj_entropy1 break_adj_entropy2 break_adj_entropy_min break_predict breakpoint_homology breakseqs_estislands_percident cdna_breakseqs_percident concordant_ratio deletion est_breakseqs_percident eversion exonboundaries expression1 expression2 gene1 gene2 gene_align_strand1 gene_align_strand2 gene_chromosome1 gene_chromosome2 gene_end1 gene_end2 gene_location1 gene_location2 gene_name1 gene_name2 gene_start1 gene_start2 gene_strand1 gene_strand2 genome_breakseqs_percident genomic_break_pos1 genomic_break_pos2 genomic_strand1 genomic_strand2 interchromosomal interrupted_index1 interrupted_index2 inversion library_name max_map_count max_repeat_proportion mean_map_count min_map_count num_multi_map num_splice_variants orf read_through repeat_proportion1 repeat_proportion2 span_count span_coverage1 span_coverage2 span_coverage_max span_coverage_min splice_score splicing_index1 splicing_index2
696 1169 GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT 2 0.000436307890680442 0.110748295953850 0.0880671602973091 N Y 3.19872427442695 3.48337348351473 3.19872427442695 splitr 0 0 0 0 Y 0 N N 0 0 ENSG00000105549 ENSG00000213753 + - 19 19 376013 59111168 intron upstream THEG AC016629.2 361750 59084870 - + 0 375099 386594 + - N 8.34107429512245 - N output_dir 82 0.677852348993289 40.6666666666667 1 11 1 N N 0.361271676300578 0.677852348993289 12 0.758602776578432 0.569678713445872 0.758602776578432 0.569678713445872 2 0.416666666666667 -
697 3596 TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG 250 7.00711162298275e-72 0.00912124762512338 0.00684237452309549 N N 3.31745197152461 3.47233119514066 3.31745197152461 splitr 7 0.0157657657657656 0 0 N 0.0135135135135136 N N 0 0 ENSG00000156860 ENSG00000212932 - + 16 21 30682131 48111157 coding upstream FBRS RPL23AP4 30670289 48110676 + + 0.0157657657657656 30680678 9827473 - + Y - - N output_dir 2 1 1.11111111111111 1 1 1 N N 0 1 9 0.325530693397641 0.296465452915709 0.325530693397641 0.296465452915709 2 - -
698
699 </help>
700 <expand macro="citations"/>
701 </tool>