Mercurial > repos > devteam > tophat
comparison tophat_wrapper.py @ 0:51c6602b46b9 draft
Imported from capsule None
| author | devteam |
|---|---|
| date | Thu, 23 Jan 2014 12:30:49 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:51c6602b46b9 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import optparse, os, shutil, subprocess, sys, tempfile, fileinput | |
| 4 | |
| 5 def stop_err( msg ): | |
| 6 sys.stderr.write( "%s\n" % msg ) | |
| 7 sys.exit() | |
| 8 | |
| 9 def __main__(): | |
| 10 #Parse Command Line | |
| 11 parser = optparse.OptionParser() | |
| 12 parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) | |
| 13 parser.add_option( '-C', '--color-space', dest='color_space', action='store_true', help='This indicates color-space data' ) | |
| 14 parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' ) | |
| 15 parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is BAM.' ) | |
| 16 parser.add_option( '', '--own-file', dest='own_file', help='' ) | |
| 17 parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) | |
| 18 parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \ | |
| 19 For, example, for paired end runs with fragments selected at 300bp, \ | |
| 20 where each end is 50bp, you should set -r to be 200. There is no default, \ | |
| 21 and this parameter is required for paired end runs.') | |
| 22 parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' ) | |
| 23 parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', | |
| 24 help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' ) | |
| 25 parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' ) | |
| 26 parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', | |
| 27 help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' ) | |
| 28 parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', | |
| 29 help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' ) | |
| 30 parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' ) | |
| 31 parser.add_option( '', '--initial-read-mismatches', dest='initial_read_mismatches', help='Number of mismatches allowed in the initial read mapping' ) | |
| 32 parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' ) | |
| 33 parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' ) | |
| 34 parser.add_option( '', '--library-type', dest='library_type', help='TopHat will treat the reads as strand specific. Every read alignment will have an XS attribute tag. Consider supplying library type options below to select the correct RNA-seq protocol.' ) | |
| 35 parser.add_option( '', '--allow-indels', action="store_true", help='Allow indel search. Indel search is disabled by default.(Not used since version 1.3.0)' ) | |
| 36 parser.add_option( '', '--max-insertion-length', dest='max_insertion_length', help='The maximum insertion length. The default is 3.' ) | |
| 37 parser.add_option( '', '--max-deletion-length', dest='max_deletion_length', help='The maximum deletion length. The default is 3.' ) | |
| 38 | |
| 39 # Options for supplying own junctions | |
| 40 parser.add_option( '-G', '--GTF', dest='gene_model_annotations', help='Supply TopHat with a list of gene model annotations. \ | |
| 41 TopHat will use the exon records in this file to build \ | |
| 42 a set of known splice junctions for each gene, and will \ | |
| 43 attempt to align reads to these junctions even if they \ | |
| 44 would not normally be covered by the initial mapping.') | |
| 45 parser.add_option( '-j', '--raw-juncs', dest='raw_juncs', help='Supply TopHat with a list of raw junctions. Junctions are \ | |
| 46 specified one per line, in a tab-delimited format. Records \ | |
| 47 look like: <chrom> <left> <right> <+/-> left and right are \ | |
| 48 zero-based coordinates, and specify the last character of the \ | |
| 49 left sequenced to be spliced to the first character of the right \ | |
| 50 sequence, inclusive.') | |
| 51 parser.add_option( '', '--no-novel-juncs', action="store_true", dest='no_novel_juncs', help="Only look for junctions indicated in the \ | |
| 52 supplied GFF file. (ignored without -G)") | |
| 53 parser.add_option( '', '--no-novel-indels', action="store_true", dest='no_novel_indels', help="Skip indel search. Indel search is enabled by default.") | |
| 54 # Types of search. | |
| 55 parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.') | |
| 56 parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)') | |
| 57 parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' ) | |
| 58 parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.') | |
| 59 parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' ) | |
| 60 parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' ) | |
| 61 parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' ) | |
| 62 parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' ) | |
| 63 parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' ) | |
| 64 parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' ) | |
| 65 parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' ) | |
| 66 parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' ) | |
| 67 | |
| 68 # Wrapper options. | |
| 69 parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) | |
| 70 parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) | |
| 71 parser.add_option( '', '--single-paired', dest='single_paired', help='' ) | |
| 72 parser.add_option( '', '--settings', dest='settings', help='' ) | |
| 73 | |
| 74 (options, args) = parser.parse_args() | |
| 75 | |
| 76 # output version # of tool | |
| 77 try: | |
| 78 tmp = tempfile.NamedTemporaryFile().name | |
| 79 tmp_stdout = open( tmp, 'wb' ) | |
| 80 proc = subprocess.Popen( args='tophat -v', shell=True, stdout=tmp_stdout ) | |
| 81 tmp_stdout.close() | |
| 82 returncode = proc.wait() | |
| 83 stdout = open( tmp_stdout.name, 'rb' ).readline().strip() | |
| 84 if stdout: | |
| 85 sys.stdout.write( '%s\n' % stdout ) | |
| 86 else: | |
| 87 raise Exception | |
| 88 except: | |
| 89 sys.stdout.write( 'Could not determine Tophat version\n' ) | |
| 90 | |
| 91 # Color or base space | |
| 92 space = '' | |
| 93 if options.color_space: | |
| 94 space = '-C' | |
| 95 | |
| 96 # Creat bowtie index if necessary. | |
| 97 tmp_index_dir = tempfile.mkdtemp() | |
| 98 if options.own_file: | |
| 99 index_path = os.path.join( tmp_index_dir, '.'.join( os.path.split( options.own_file )[1].split( '.' )[:-1] ) ) | |
| 100 try: | |
| 101 os.link( options.own_file, index_path + '.fa' ) | |
| 102 except: | |
| 103 # Tophat prefers (but doesn't require) fasta file to be in same directory, with .fa extension | |
| 104 pass | |
| 105 cmd_index = 'bowtie-build %s -f %s %s' % ( space, options.own_file, index_path ) | |
| 106 try: | |
| 107 tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name | |
| 108 tmp_stderr = open( tmp, 'wb' ) | |
| 109 proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) | |
| 110 returncode = proc.wait() | |
| 111 tmp_stderr.close() | |
| 112 # get stderr, allowing for case where it's very large | |
| 113 tmp_stderr = open( tmp, 'rb' ) | |
| 114 stderr = '' | |
| 115 buffsize = 1048576 | |
| 116 try: | |
| 117 while True: | |
| 118 stderr += tmp_stderr.read( buffsize ) | |
| 119 if not stderr or len( stderr ) % buffsize != 0: | |
| 120 break | |
| 121 except OverflowError: | |
| 122 pass | |
| 123 tmp_stderr.close() | |
| 124 if returncode != 0: | |
| 125 raise Exception, stderr | |
| 126 except Exception, e: | |
| 127 if os.path.exists( tmp_index_dir ): | |
| 128 shutil.rmtree( tmp_index_dir ) | |
| 129 stop_err( 'Error indexing reference sequence\n' + str( e ) ) | |
| 130 else: | |
| 131 index_path = options.index_path | |
| 132 | |
| 133 # Build tophat command. | |
| 134 cmd = 'tophat %s %s %s' | |
| 135 reads = options.input1 | |
| 136 if options.input2: | |
| 137 reads += ' ' + options.input2 | |
| 138 opts = '-p %s %s' % ( options.num_threads, space ) | |
| 139 if options.single_paired == 'paired': | |
| 140 opts += ' -r %s' % options.mate_inner_dist | |
| 141 if options.settings == 'preSet': | |
| 142 cmd = cmd % ( opts, index_path, reads ) | |
| 143 else: | |
| 144 try: | |
| 145 if int( options.min_anchor_length ) >= 3: | |
| 146 opts += ' -a %s' % options.min_anchor_length | |
| 147 else: | |
| 148 raise Exception, 'Minimum anchor length must be 3 or greater' | |
| 149 opts += ' -m %s' % options.splice_mismatches | |
| 150 opts += ' -i %s' % options.min_intron_length | |
| 151 opts += ' -I %s' % options.max_intron_length | |
| 152 opts += ' -g %s' % options.max_multihits | |
| 153 # Custom junctions options. | |
| 154 if options.gene_model_annotations: | |
| 155 opts += ' -G %s' % options.gene_model_annotations | |
| 156 if options.raw_juncs: | |
| 157 opts += ' -j %s' % options.raw_juncs | |
| 158 if options.no_novel_juncs: | |
| 159 opts += ' --no-novel-juncs' | |
| 160 if options.library_type: | |
| 161 opts += ' --library-type %s' % options.library_type | |
| 162 if options.no_novel_indels: | |
| 163 opts += ' --no-novel-indels' | |
| 164 else: | |
| 165 if options.max_insertion_length: | |
| 166 opts += ' --max-insertion-length %i' % int( options.max_insertion_length ) | |
| 167 if options.max_deletion_length: | |
| 168 opts += ' --max-deletion-length %i' % int( options.max_deletion_length ) | |
| 169 # Max options do not work for Tophat v1.2.0, despite documentation to the contrary. (Fixed in version 1.3.1) | |
| 170 # need to warn user of this fact | |
| 171 #sys.stdout.write( "Max insertion length and max deletion length options don't work in Tophat v1.2.0\n" ) | |
| 172 | |
| 173 # Search type options. | |
| 174 if options.coverage_search: | |
| 175 opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron ) | |
| 176 else: | |
| 177 opts += ' --no-coverage-search' | |
| 178 if options.closure_search: | |
| 179 opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ) | |
| 180 else: | |
| 181 opts += ' --no-closure-search' | |
| 182 if options.microexon_search: | |
| 183 opts += ' --microexon-search' | |
| 184 if options.single_paired == 'paired': | |
| 185 opts += ' --mate-std-dev %s' % options.mate_std_dev | |
| 186 if options.initial_read_mismatches: | |
| 187 opts += ' --initial-read-mismatches %d' % int( options.initial_read_mismatches ) | |
| 188 if options.seg_mismatches: | |
| 189 opts += ' --segment-mismatches %d' % int( options.seg_mismatches ) | |
| 190 if options.seg_length: | |
| 191 opts += ' --segment-length %d' % int( options.seg_length ) | |
| 192 if options.min_segment_intron: | |
| 193 opts += ' --min-segment-intron %d' % int( options.min_segment_intron ) | |
| 194 if options.max_segment_intron: | |
| 195 opts += ' --max-segment-intron %d' % int( options.max_segment_intron ) | |
| 196 cmd = cmd % ( opts, index_path, reads ) | |
| 197 except Exception, e: | |
| 198 # Clean up temp dirs | |
| 199 if os.path.exists( tmp_index_dir ): | |
| 200 shutil.rmtree( tmp_index_dir ) | |
| 201 stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) | |
| 202 #print cmd | |
| 203 | |
| 204 # Run | |
| 205 try: | |
| 206 tmp_out = tempfile.NamedTemporaryFile().name | |
| 207 tmp_stdout = open( tmp_out, 'wb' ) | |
| 208 tmp_err = tempfile.NamedTemporaryFile().name | |
| 209 tmp_stderr = open( tmp_err, 'wb' ) | |
| 210 print cmd | |
| 211 proc = subprocess.Popen( args=cmd, shell=True, cwd=".", stdout=tmp_stdout, stderr=tmp_stderr ) | |
| 212 returncode = proc.wait() | |
| 213 tmp_stderr.close() | |
| 214 # get stderr, allowing for case where it's very large | |
| 215 tmp_stderr = open( tmp_err, 'rb' ) | |
| 216 stderr = '' | |
| 217 buffsize = 1048576 | |
| 218 try: | |
| 219 while True: | |
| 220 stderr += tmp_stderr.read( buffsize ) | |
| 221 if not stderr or len( stderr ) % buffsize != 0: | |
| 222 break | |
| 223 except OverflowError: | |
| 224 pass | |
| 225 tmp_stdout.close() | |
| 226 tmp_stderr.close() | |
| 227 if returncode != 0: | |
| 228 raise Exception, stderr | |
| 229 | |
| 230 # TODO: look for errors in program output. | |
| 231 except Exception, e: | |
| 232 stop_err( 'Error in tophat:\n' + str( e ) ) | |
| 233 | |
| 234 # Clean up temp dirs | |
| 235 if os.path.exists( tmp_index_dir ): | |
| 236 shutil.rmtree( tmp_index_dir ) | |
| 237 | |
| 238 if __name__=="__main__": __main__() |
