defuse: create_reference_dataset.xml comparison

comparison create_reference_dataset.xml @ 12:33e2235bf003

Add create_reference_dataset.xml

author	Jim Johnson <jj@umn.edu>
date	Sun, 09 Jun 2013 20:30:21 -0500
parents
children	85693cb5339f

comparison

equal deleted inserted replaced

-:19c48803a377
+:33e2235bf003
+<tool id="create_defusei_reference" name="Create DeFuse Reference" version="1.6.1">
+<description>create a defuse reference from Ensembl and UCSC sources</description>
+<!-- Packages this tool declares as dependencies. The shscript configfile resolves
+     samtools, bowtie, bowtie-build, blat, faToTwoBit, gmap, gmap_setup, R and Rscript
+     with `which`, and reads the DEFUSE_PATH environment variable.
+     NOTE(review): blat and faToTwoBit are presumably supplied by the kent package; confirm.
+     NOTE(review): R and Rscript are looked up by the script but no R package is listed here; confirm they are expected on the PATH. -->
+<requirements>
+<requirement type="package" version="0.6.1">defuse</requirement>
+<requirement type="package" version="0.1.18">samtools</requirement>
+<requirement type="package" version="1.0.0">bowtie</requirement>
+<requirement type="package" version="2013-05-09">gmap</requirement>
+<requirement type="package" version="latest">kent</requirement>
+</requirements>
+<!-- Runs the rendered shscript configfile with /bin/bash.
+     NOTE(review): the interpreter attribute is the literal word "command" rather than an interpreter name; confirm this is intentional. -->
+<command interpreter="command"> /bin/bash $shscript </command>
+<!-- Parameters describing the reference build. The defuse_config template writes each
+     value into the generated deFuse config under a key of the same name. -->
+<inputs>
+<param name="ensembl_genome_version" type="text" value="" label="Ensembl Genome Version" help="Example: GRCh37"/>
+<param name="ensembl_version" type="integer" value="" label="Ensembl Release Version" help="Example: 71"/>
+<param name="ucsc_genome_version" type="text" value="" label="UCSC Genome Version" help="Example: hg19"/>
+<param name="chromosomes" type="text" value="" label="Chromosomes" help="Example: 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y,MT"/>
+<param name="mt_chromosome" type="text" value="MT" label="Mitochondrial Chromosome" />
+<param name="gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,processed_transcript,protein_coding" label="Gene sources" />
+<param name="ig_gene_sources" type="text" value="IG_C_gene,IG_D_gene,IG_J_gene,IG_V_gene,IG_pseudogene" label="IG Gene sources" />
+<param name="rrna_gene_sources" type="text" value="Mt_rRNA,rRNA,rRNA_pseudogene" label="Ribosomal Gene sources" />
+</inputs>
+<!-- Single output dataset. The shscript configfile copies the finished deFuse config
+     into config_txt and creates its extra_files_path directory, which the config
+     names as dataset_directory. -->
+<outputs>
+<data format="txt" name="config_txt" label="${tool.name} on ${on_string}: config.txt"/>
+</outputs>
+<configfiles>
+<configfile name="defuse_config">
+#import ast
+#
+# Configuration file for defuse
+#
+# At a minimum, change all values enclosed by []
+#
+# Directory where the defuse code was unpacked
+## Default location in the tool/defuse directory
+# source_directory = ${__root_dir__}/tools/defuse
+source_directory = __DEFUSE_PATH__
+ensembl_version = $ensembl_version
+ensembl_genome_version = $ensembl_genome_version
+ucsc_genome_version = $ucsc_genome_version
+# Directory where you want your dataset
+dataset_directory = $config_txt.extra_files_path
+#raw
+# Input genome and gene models
+gene_models                                 = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).gtf
+genome_fasta                                = $(dataset_directory)/Homo_sapiens.$(ensembl_genome_version).$(ensembl_version).dna.chromosomes.fa
+# Repeat table from ucsc genome browser
+repeats_filename                            = $(dataset_directory)/repeats.txt
+# EST info downloaded from ucsc genome browser
+est_fasta                                   = $(dataset_directory)/est.fa
+est_alignments                              = $(dataset_directory)/intronEst.txt
+# Unigene clusters downloaded from ncbi
+unigene_fasta                               = $(dataset_directory)/Hs.seq.uniq
+#end raw
+# Paths to external tools
+samtools_bin =  __SAMTOOLS_BIN__
+bowtie_bin = __BOWTIE_BIN__
+bowtie_build_bin = __BOWTIE_BUILD_BIN__
+blat_bin = __BLAT_BIN__
+fatotwobit_bin = __FATOTWOBIT_BIN__
+gmap_bin = __GMAP_BIN__
+gmap_setup_bin = __GMAP_SETUP_BIN__
+r_bin = __R_BIN__
+rscript_bin = __RSCRIPT_BIN__
+#raw
+# Directory for the gmap index (inside the dataset directory)
+gmap_index_directory                        = $(dataset_directory)/gmap
+#end raw
+#raw
+# Dataset files
+dataset_prefix       = $(dataset_directory)/defuse
+chromosome_prefix    = $(dataset_prefix).dna.chromosomes
+exons_fasta          = $(dataset_prefix).exons.fa
+cds_fasta            = $(dataset_prefix).cds.fa
+cdna_regions         = $(dataset_prefix).cdna.regions
+cdna_fasta           = $(dataset_prefix).cdna.fa
+reference_fasta      = $(dataset_prefix).reference.fa
+rrna_fasta           = $(dataset_prefix).rrna.fa
+ig_gene_list         = $(dataset_prefix).ig.gene.list
+repeats_regions      = $(dataset_directory)/repeats.regions
+est_split_fasta1     = $(dataset_directory)/est.1.fa
+est_split_fasta2     = $(dataset_directory)/est.2.fa
+est_split_fasta3     = $(dataset_directory)/est.3.fa
+est_split_fasta4     = $(dataset_directory)/est.4.fa
+est_split_fasta5     = $(dataset_directory)/est.5.fa
+est_split_fasta6     = $(dataset_directory)/est.6.fa
+est_split_fasta7     = $(dataset_directory)/est.7.fa
+est_split_fasta8     = $(dataset_directory)/est.8.fa
+est_split_fasta9     = $(dataset_directory)/est.9.fa
+# Fasta files with bowtie indices for prefiltering reads for concordantly mapping pairs
+prefilter1           = $(unigene_fasta)
+# deFuse scripts and tools
+scripts_directory    = $(source_directory)/scripts
+tools_directory      = $(source_directory)/tools
+data_directory       = $(source_directory)/data
+#end raw
+#raw
+# Bowtie parameters
+bowtie_threads                              = 1
+bowtie_quals                                = --phred33-quals
+max_insert_size                             = 500
+#end raw
+# Parameters for building the dataset
+chromosomes = $chromosomes
+mt_chromosome = $mt_chromosome
+gene_sources = $gene_sources
+ig_gene_sources = $ig_gene_sources
+rrna_gene_sources = $rrna_gene_sources
+#raw
+# Blat sequences per job
+num_blat_sequences                          = 10000
+# Minimum gene fusion range
+dna_concordant_length                       = 2000
+# Trim length for discordant reads (split reads are not trimmed)
+discord_read_trim                           = 50
+# Calculate extra annotations, fusion splice index and interrupted index
+calculate_extra_annotations                 = no
+# Filtering parameters
+clustering_precision                        = 0.95
+span_count_threshold                        = 5
+percent_identity_threshold                  = 0.90
+split_min_anchor                            = 4
+splice_bias                                 = 10
+positive_controls                           = $(data_directory)/controls.txt
+probability_threshold                       = 0.50
+# Position density when calculating covariance
+covariance_sampling_density                 = 0.01
+# Number of reads for each job in split
+reads_per_job                               = 1000000
+# If you have command line 'mail' and wish to be notified
+mailto                                      = andrew.mcpherson@gmail.com
+# Remove temp files
+remove_job_files                            = yes
+remove_job_temp_files                       = yes
+#end raw
+</configfile>
+<configfile name="shscript">
+#!/bin/bash
+## Shell script, rendered through Cheetah, that fills in the tool-path placeholders
+## in the defuse_config file, copies that config to the tool output, and runs
+## create_reference_dataset.pl from the deFuse scripts directory against it.
+##
+## Cheetah variables holding single characters: chr(36) is the dollar sign,
+## chr(38) the ampersand, chr(62) the greater-than sign, chr(60) the less-than sign.
+## NOTE(review): presumably these keep raw ampersand and less-than characters out of the XML; confirm.
+## Only amp and gt are referenced below; ds, lt and echo_cmd are not used in this script.
+#set $ds = chr(36)
+#set $amp = chr(38)
+#set $gt = chr(62)
+#set $lt = chr(60)
+#set $echo_cmd = 'echo'
+## NOTE(review): Cheetah.FileUtils is imported but nothing in this script references it.
+#import Cheetah.FileUtils
+## Substitute pathnames into the config file. Each line replaces one __NAME__
+## placeholder, and only when grep finds that placeholder in the file and the
+## assignment that sets the matching shell variable succeeds. The first line takes
+## its value from the DEFUSE_PATH environment variable; the tool lines use `which`.
+## sed edits the file in place and keeps a backup with a .tmp suffix.
+if `grep __DEFUSE_PATH__ $defuse_config ${gt} /dev/null`;then sed -i'.tmp' "s#__DEFUSE_PATH__#\${DEFUSE_PATH}#" $defuse_config; fi
+if `grep __SAMTOOLS_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} SAMTOOLS_BIN=`which samtools`;then sed -i'.tmp' "s#__SAMTOOLS_BIN__#\${SAMTOOLS_BIN}#" $defuse_config; fi
+if `grep __BOWTIE_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BIN=`which bowtie`;then sed -i'.tmp' "s#__BOWTIE_BIN__#\${BOWTIE_BIN}#" $defuse_config; fi
+if `grep __BOWTIE_BUILD_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BOWTIE_BUILD_BIN=`which bowtie-build`;then sed -i'.tmp' "s#__BOWTIE_BUILD_BIN__#\${BOWTIE_BUILD_BIN}#" $defuse_config; fi
+if `grep __BLAT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} BLAT_BIN=`which blat`;then sed -i'.tmp' "s#__BLAT_BIN__#\${BLAT_BIN}#" $defuse_config; fi
+if `grep __FATOTWOBIT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} FATOTWOBIT_BIN=`which faToTwoBit`;then sed -i'.tmp' "s#__FATOTWOBIT_BIN__#\${FATOTWOBIT_BIN}#" $defuse_config; fi
+if `grep __GMAP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_BIN=`which gmap`;then sed -i'.tmp' "s#__GMAP_BIN__#\${GMAP_BIN}#" $defuse_config; fi
+if `grep __GMAP_SETUP_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_SETUP_BIN=`which gmap_setup`;then sed -i'.tmp' "s#__GMAP_SETUP_BIN__#\${GMAP_SETUP_BIN}#" $defuse_config; fi
+## NOTE(review): the defuse_config template in this file contains no __GMAP_INDEX_DIR__
+## placeholder (gmap_index_directory is derived from dataset_directory), so the next
+## line looks like a no-op; confirm before removing.
+if `grep __GMAP_INDEX_DIR__ $defuse_config ${gt} /dev/null` ${amp}${amp} GMAP_INDEX_DIR=`pwd`/gmap;then sed -i'.tmp' "s#__GMAP_INDEX_DIR__#\${GMAP_INDEX_DIR}#" $defuse_config; fi
+if `grep __R_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} R_BIN=`which R`;then sed -i'.tmp' "s#__R_BIN__#\${R_BIN}#" $defuse_config; fi
+if `grep __RSCRIPT_BIN__ $defuse_config ${gt} /dev/null` ${amp}${amp} RSCRIPT_BIN=`which Rscript`;then sed -i'.tmp' "s#__RSCRIPT_BIN__#\${RSCRIPT_BIN}#" $defuse_config; fi
+## copy the substituted config to the tool output dataset
+cp $defuse_config $config_txt
+## create the output's extra_files_path directory, which the config names as dataset_directory
+mkdir -p $config_txt.extra_files_path
+## run create_reference_dataset.pl with the substituted config
+perl \${DEFUSE_PATH}/scripts/create_reference_dataset.pl -c $defuse_config
+</configfile>
+</configfiles>
+<tests>
+</tests>
+<help>
+**DeFuse**
+DeFuse_ is a software package for gene fusion discovery using RNA-Seq data. The software uses clusters of discordant paired end alignments to inform a split read alignment analysis for finding fusion boundaries. The software also employs a number of heuristic filters in an attempt to reduce the number of false positives and produces a fully annotated output for each predicted fusion.
+Journal reference: http://www.ploscompbiol.org/article/info%3Adoi%2F10.1371%2Fjournal.pcbi.1001138
+.. _DeFuse: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=Main_Page
+------
+**Inputs**
+DeFuse requires 2 fastq files for paired reads, one with the left mate of the paired reads, and a second fastq with the right mate of the paired reads (**with reads in the same order as in the first fastq dataset**).
+If your fastq files have reads in different orders or include unpaired reads,  you can preprocess them with **FASTQ interlacer** to create a single interlaced fastq dataset with only the paired reads and input that to **FASTQ de-interlacer** to separate the reads into a left fastq and right fastq.
+DeFuse uses a Reference Dataset to search for gene fusions.  The Reference Dataset is generated from the following sources in DeFuse_Version_0.4_:
+- genome_fasta from Ensembl
+- gene_models from Ensembl
+- repeats_filename from UCSC RepeatMasker rmsk.txt
+- est_fasta from UCSC
+- est_alignments from UCSC intronEst.txt
+- unigene_fasta from NCBI
+.. _DeFuse_Version_0.4: http://sourceforge.net/apps/mediawiki/defuse/index.php?title=DeFuse_Version_0.4.2
+------
+**Outputs**
+The galaxy history will contain 5 outputs: the config.txt file that provides DeFuse with its parameters,  the defuse.log which details what DeFuse has done and can be useful in determining any errors, and the 3 results files that defuse generates.
+DeFuse generates 3 results files: results.txt, results.filtered.txt, and results.classify.txt. All three files have the same format, though results.classify.txt has a probability column from the application of the classifier to results.txt, and results.filtered.txt has been filtered according to the threshold probability as set in config.txt.
+The file format is tab delimited with one prediction per line, and the following fields per prediction (not necessarily in this order):
+- **Identification**
+- cluster_id : random identifier assigned to each prediction
+- library_name : library name given on the command line of defuse
+- gene1 : ensembl id of gene 1
+- gene2 : ensembl id of gene 2
+- gene_name1 : name of gene 1
+- gene_name2 : name of gene 2
+- **Evidence**
+- break_predict : breakpoint prediction method, denovo or splitr, that is considered most reliable
+- concordant_ratio : proportion of spanning reads considered concordant by blat
+- denovo_min_count : minimum kmer count across denovo assembled sequence
+- denovo_sequence : fusion sequence predicted by debruijn based denovo sequence assembly
+- denovo_span_pvalue : p-value, lower values are evidence the prediction is a false positive
+- gene_align_strand1 : alignment strand for spanning read alignments to gene 1
+- gene_align_strand2 : alignment strand for spanning read alignments to gene 2
+- min_map_count : minimum of the number of genomic mappings for each spanning read
+- max_map_count : maximum of the number of genomic mappings for each spanning read
+- mean_map_count : average of the number of genomic mappings for each spanning read
+- num_multi_map : number of spanning reads that map to more than one genomic location
+- span_count : number of spanning reads supporting the fusion
+- span_coverage1 : coverage of spanning reads aligned to gene 1 as a proportion of expected coverage
+- span_coverage2 : coverage of spanning reads aligned to gene 2 as a proportion of expected coverage
+- span_coverage_min : minimum of span_coverage1 and span_coverage2
+- span_coverage_max : maximum of span_coverage1 and span_coverage2
+- splitr_count : number of split reads supporting the prediction
+- splitr_min_pvalue : p-value, lower values are evidence the prediction is a false positive
+- splitr_pos_pvalue : p-value, lower values are evidence the prediction is a false positive
+- splitr_sequence : fusion sequence predicted by split reads
+- splitr_span_pvalue : p-value, lower values are evidence the prediction is a false positive
+- **Annotation**
+- adjacent : fusion between adjacent genes
+- altsplice : fusion likely the product of alternative splicing between adjacent genes
+- break_adj_entropy1 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 1
+- break_adj_entropy2 : di-nucleotide entropy of the 40 nucleotides adjacent to the fusion splice in gene 2
+- break_adj_entropy_min : minimum of break_adj_entropy1 and break_adj_entropy2
+- breakpoint_homology : number of nucleotides at the fusion splice that align equally well to gene 1 or gene 2
+- breakseqs_estislands_percident : maximum percent identity of fusion sequence alignments to est islands
+- cdna_breakseqs_percident : maximum percent identity of fusion sequence alignments to cdna
+- deletion : fusion produced by a genomic deletion
+- est_breakseqs_percident : maximum percent identity of fusion sequence alignments to est
+- eversion : fusion produced by a genomic eversion
+- exonboundaries : fusion splice at exon boundaries
+- expression1 : expression of gene 1 as number of concordant pairs aligned to exons
+- expression2 : expression of gene 2 as number of concordant pairs aligned to exons
+- gene_chromosome1 : chromosome of gene 1
+- gene_chromosome2 : chromosome of gene 2
+- gene_end1 : end position for gene 1
+- gene_end2 : end position for gene 2
+- gene_location1 : location of breakpoint in gene 1
+- gene_location2 : location of breakpoint in gene 2
+- gene_start1 : start of gene 1
+- gene_start2 : start of gene 2
+- gene_strand1 : strand of gene 1
+- gene_strand2 : strand of gene 2
+- genome_breakseqs_percident : maximum percent identity of fusion sequence alignments to genome
+- genomic_break_pos1 : genomic position in gene 1 of fusion splice / breakpoint
+- genomic_break_pos2 : genomic position in gene 2 of fusion splice / breakpoint
+- genomic_strand1 : genomic strand in gene 1 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
+- genomic_strand2 : genomic strand in gene 2 of fusion splice / breakpoint, retained sequence upstream on this strand, breakpoint is downstream
+- interchromosomal : fusion produced by an interchromosomal translocation
+- interrupted_index1 : ratio of coverage before and after the fusion splice / breakpoint in gene 1
+- interrupted_index2 : ratio of coverage before and after the fusion splice / breakpoint in gene 2
+- inversion : fusion produced by genomic inversion
+- orf : fusion combines genes in a way that preserves a reading frame
+- probability : probability produced by classification using adaboost and example positives/negatives (only given in results.classify.txt)
+- read_through : fusion involving adjacent genes, potentially resulting from co-transcription rather than genome rearrangement
+- repeat_proportion1 : proportion of the spanning reads in gene 1 that span a repeat region
+- repeat_proportion2 : proportion of the spanning reads in gene 2 that span a repeat region
+- max_repeat_proportion : max of repeat_proportion1 and repeat_proportion2
+- splice_score : number of nucleotides similar to GTAG at fusion splice
+- num_splice_variants : number of potential splice variants for this gene pair
+- splicing_index1 : number of concordant pairs in gene 1 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 2
+- splicing_index2 : number of concordant pairs in gene 2 spanning the fusion splice / breakpoint, divided by number of spanning reads supporting the fusion with gene 1
+**Example**
+results.tsv::
+cluster_id	splitr_sequence	splitr_count	splitr_span_pvalue	splitr_pos_pvalue	splitr_min_pvalue	adjacent	altsplice	break_adj_entropy1	break_adj_entropy2	break_adj_entropy_min	break_predict	breakpoint_homology	breakseqs_estislands_percident	cdna_breakseqs_percident	concordant_ratio	deletion	est_breakseqs_percident	eversion	exonboundaries	expression1	expression2	gene1	gene2	gene_align_strand1	gene_align_strand2	gene_chromosome1	gene_chromosome2	gene_end1	gene_end2	gene_location1	gene_location2	gene_name1	gene_name2	gene_start1	gene_start2	gene_strand1	gene_strand2	genome_breakseqs_percident	genomic_break_pos1	genomic_break_pos2	genomic_strand1	genomic_strand2	interchromosomal	interrupted_index1	interrupted_index2	inversion	library_name	max_map_count	max_repeat_proportion	mean_map_count	min_map_count	num_multi_map	num_splice_variants	orf	read_through	repeat_proportion1	repeat_proportion2	span_count	span_coverage1	span_coverage2	span_coverage_max	span_coverage_min	splice_score	splicing_index1	splicing_index2
+1169	GCTTACTGTATGCCAGGCCCCAGAGGGGCAACCACCCTCTAAAGAGAGCGGCTCCTGCCTCCCAGAAAGCTCACAGACTGTGGGAGGGAAACAGGCAGCAGGTGAAGATGCCAAATGCCAGGATATCTGCCCTGTCCTTGCTTGATGCAGCTGCTGGCTCCCACGTTCTCCCCAGAATCCCCTCACACTCCTGCTGTTTTCTCTGCAGGTTGGCAGAGCCCCATGAGGGCAGGGCAGCCACTTTGTTCTTGGGCGGCAAACCTCCCTGGGCGGCACGGAAACCACGGTGAGAAGGGGGCAGGTCGGGCACGTGCAGGGACCACGCTGCAGG|TGTACCCAACAGCTCCGAAGAGACAGCGACCATCGAGAACGGGCCATGATGACGATGGCGGTTTTGTCGAAAAGAAAAGGGGGAAATGTGGGGAAAAGCAAGAGAGATCAGATTGTTACTGTGTCTGTGTAGAAAGAAGTAGACATGGGAGACTCCATTTTGTTCTGTACTAAGAAAAATTCTTCTGCCTTGAGATTCGGTGACCCCACCCCCAACCCCGTGCTCTCTGAAACATGTGCTGTGTCCACTCAGGGTTGAATGGATTAAGGGCGGTGCGAGACGTGCTTT	2	0.000436307890680442	0.110748295953850	0.0880671602973091	N	Y	3.19872427442695	3.48337348351473	3.19872427442695	splitr	0	0	0	0	Y	0	N	N	0	0	ENSG00000105549	ENSG00000213753	+	-	19	19	376013	59111168	intron	upstream	THEG	AC016629.2	361750	59084870	-	+	0	375099	386594	+	-	N	8.34107429512245	-	N	output_dir	82	0.677852348993289	40.6666666666667	1	11	1	N	N	0.361271676300578	0.677852348993289	12	0.758602776578432	0.569678713445872	0.758602776578432	0.569678713445872	2	0.416666666666667	-
+3596	TGGGGGTTGAGGCTTCTGTTCCCAGGTTCCATGACCTCAGAGGTGGCTGGTGAGGTTATGACCTTTGCCCTCCAGCCCTGGCTTAAAACCTCAGCCCTAGGACCTGGTTAAAGGAAGGGGAGATGGAGCTTTGCCCCGACCCCCCCCCGTTCCCCTCACCTGTCAGCCCGAGCTGGGCCAGGGCCCCTAGGTGGGGAACTGGGCCGGGGGGCGGGCACAAGCGGAGGTGGTGCCCCCAAAAGGGCTCCCGGTGGGGTCTTGCTGAGAAGGTGAGGGGTTCCCGGGGCCGCAGCAGGTGGTGGTGGAGGAGCCAAGCGGCTGTAGAGCAAGGGGTGAGCAGGTTCCAGACCGTAGAGGCGGGCAGCGGCCACGGCCCCGGGTCCAGTTAGCTCCTCACCCGCCTCATAGAAGCGGGGTGGCCTTGCCAGGCGTGGGGGTGCTGCC|TTCCTTGGATGTGGTAGCCGTTTCTCAGGCTCCCTCTCCGGAATCGAACCCTGATTCCCCGTCACCCGTGGTCACCATGGTAGGCACGGCGACTACCATCGAAAGTTGATAGGGCAGACGTTCGAATGGGTCGTCGCCGCCACGGGGGGCGTGCGATCAGCCCGAGGTTATCTAGAGTCACCAAAGCCGCCGGCGCCCGCCCCCCGGCCGGGGCCGGAGAGGGGCTGACCGGGTTGGTTTTGATCTGATAAATGCACGCATCCCCCCCGCGAAGGGGGTCAGCGCCCGTCGGCATGTATTAGCTCTAGAATTACCACAGTTATCCAAGTAGGAGAGGAGCGAGCGACCAAAGGAACCATAACTGATTTAATGAGCCATTCGCAGTTTCACTGTACCGGCCGTGCGTACTTAGACATGCATGGCTTAATCTTTGAGACAAGCATATGCTACTGGCAGG	250	7.00711162298275e-72	0.00912124762512338	0.00684237452309549	N	N	3.31745197152461	3.47233119514066	3.31745197152461	splitr	7	0.0157657657657656	0	0	N	0.0135135135135136	N	N	0	0	ENSG00000156860	ENSG00000212932	-	+	16	21	30682131	48111157	coding	upstream	FBRS	RPL23AP4	30670289	48110676	+	+	0.0157657657657656	30680678	9827473	-	+	Y	-	-	N	output_dir	2	1	1.11111111111111	1	1	1	N	N	0	1	9	0.325530693397641	0.296465452915709	0.325530693397641	0.296465452915709	2	-	-
+</help>
+</tool>

Mercurial > repos > jjohnson > defuse

comparison create_reference_dataset.xml @ 12:33e2235bf003