annotate cuffcompare_wrapper.py @ 0:d0d26169cc2a draft

Uploaded
author devteam
date Wed, 26 Nov 2014 13:54:44 -0500
parents
children a5674ddf2ad7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
1 #!/usr/bin/env python
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
2
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
3 # Supports Cuffcompare versions v1.3.0 and newer.
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
4
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
5 import optparse, os, shutil, subprocess, sys, tempfile
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
6
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
def stop_err(msg):
    """Report a fatal error and terminate the process.

    Writes ``msg`` followed by a newline to stderr, then raises
    ``SystemExit`` with exit status 1 so that the calling process sees a
    failure both on stderr and in the exit code.

    :param msg: text of the error message to report.
    :raises SystemExit: always.
    """
    sys.stderr.write('%s\n' % msg)
    # A bare sys.exit() would exit with status 0, i.e. report success.
    sys.exit(1)
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
10
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
def __main__():
    """Run Cuffcompare on the GTF files given as positional arguments.

    Steps performed:

    1. Parse the wrapper's command line.
    2. Write the Cuffcompare version banner (or a fallback notice) to stdout.
    3. If ``-s`` was given, pick the sequence data: a symlink ``ref.fa`` to
       ``--ref_file`` when provided, otherwise the path given by ``--index``.
    4. Symlink every input into the current directory as ``./inputN`` so
       that output files are written to the working directory.
    5. Run ``cuffcompare -o cc_output ...`` and copy
       ``cc_output.combined.gtf`` to the ``--combined-transcripts`` path.

    Failures while running the tool (non-zero exit status, missing output,
    empty ``cc_output.stats``) are reported through ``stop_err``.
    """
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-r', dest='ref_annotation', help='An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.')
    parser.add_option('-R', action="store_true", dest='ignore_nonoverlap_reference', help='If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts accuracy file')
    parser.add_option('-Q', action="store_true", dest='ignore_nonoverlap_transfrag', help='If -r was specified, this option causes cuffcompare to consider only the input transcripts that overlap any of the reference transcripts (Sp correction); Warning: this will discard all "novel" loci!)')

    parser.add_option('-s', dest='use_seq_data', action="store_true", help='Causes cuffcompare to look into for fasta files with the underlying genomic sequences (one file per contig) against which your reads were aligned for some optional classification functions. For example, Cufflinks transcripts consisting mostly of lower-case bases are classified as repeats. Note that <seq_dir> must contain one fasta file per reference chromosome, and each file must be named after the chromosome, and have a .fa or .fasta extension.')

    parser.add_option('-M', action="store_true", dest='discard_single_exon_all', help='discard (ignore) single-exon transfrags and reference transcript')
    parser.add_option('-N', action="store_true", dest='discard_single_exon_ref', help='discard (ignore) single-exon reference transcripts')
    # The two help texts below previously contained a stray '" help="'
    # fragment in the middle of the sentence.
    parser.add_option('-e', dest='max_dist_exon', help='Max. distance (range) allowed from free ends of terminal exons of reference transcripts when assessing exon accuracy. Default: 100')
    parser.add_option('-d', dest='max_dist_group', help='Max. distance (range) for grouping transcript start sites. Default: 100')
    parser.add_option('-F', action="store_true", dest='discard_redundant_intron_transfrags', help='Discard intron-redundant transfrags if they share the 5-prime end (if they differ only at the 3-prime end)')

    # Wrapper / Galaxy options.
    parser.add_option('--index', dest='index', help='The path of the reference genome')
    parser.add_option('--ref_file', dest='ref_file', help='The reference dataset from the history')

    # Outputs.
    parser.add_option('--combined-transcripts', dest='combined_transcripts')

    (options, args) = parser.parse_args()

    # Output version # of tool.
    _report_version()

    # Set/link to sequence file.
    seq_path = None
    if options.use_seq_data:
        if options.ref_file:
            # Sequence data from history.
            # Create symbolic link to ref_file so that index will be created
            # in working directory.
            seq_path = "ref.fa"
            os.symlink(options.ref_file, seq_path)
        else:
            # Guard against a missing --index as well as a missing file;
            # os.path.exists() must not be handed None.
            if not options.index or not os.path.exists(options.index):
                stop_err('Reference genome %s not present, request it by reporting this error.' % options.index)
            seq_path = options.index

    # Build command.
    cmd = _build_command(options, seq_path)

    # Add input files.
    # Need to symlink inputs so that output files are written to temp directory.
    for i, arg in enumerate(args):
        input_file_name = "./input%i" % (i + 1)
        os.symlink(arg, input_file_name)
        cmd.append(input_file_name)

    # Debugging.
    print(' '.join(cmd))

    # Run command.
    try:
        # Keep the temporary file object open for the whole run instead of
        # taking only its name: the file then exists for as long as it is
        # needed and is removed automatically afterwards.
        with tempfile.NamedTemporaryFile(dir=".") as tmp_stderr:
            proc = subprocess.Popen(args=cmd, stderr=tmp_stderr)
            returncode = proc.wait()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr.seek(0)
            stderr = _read_stream(tmp_stderr)

        # Error checking.
        if returncode != 0:
            if not isinstance(stderr, str):
                # Bytes on Python 3; make the message readable.
                stderr = stderr.decode('utf-8', 'replace')
            raise Exception(stderr)

        # Copy outputs.
        shutil.copyfile("cc_output.combined.gtf", options.combined_transcripts)

        # check that there are results in the output file
        cc_output_fname = "cc_output.stats"
        with open(cc_output_fname, 'rb') as stats:
            if not stats.read().strip():
                raise Exception('The main output file is empty, there may be an error with your input file or settings.')
    except Exception as e:
        stop_err('Error running cuffcompare. ' + str(e))


def _report_version():
    """Write the 'cuffcompare v...' banner line to stdout, or a fallback notice."""
    version = None
    try:
        # Run without arguments and merge stderr into stdout, then scan the
        # combined output for the version banner.
        proc = subprocess.Popen(['cuffcompare'], stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
        output = proc.communicate()[0]
        for line in output.splitlines():
            if 'cuffcompare v' in line.lower():
                version = line.strip()
                break
    except (OSError, ValueError):
        # Executable missing / not runnable, or undecodable output.
        version = None

    if version:
        sys.stdout.write('%s\n' % version)
    else:
        sys.stdout.write('Could not determine Cuffcompare version\n')


def _build_command(options, seq_path):
    """Return the cuffcompare argument list (without inputs) for ``options``."""
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    cmd = ["cuffcompare", "-o", "cc_output"]
    if options.ref_annotation:
        cmd += ["-r", options.ref_annotation]
    if options.ignore_nonoverlap_reference:
        cmd.append("-R")
    if options.ignore_nonoverlap_transfrag:
        cmd.append("-Q")
    if options.use_seq_data:
        cmd += ["-s", seq_path]
    if options.discard_single_exon_all:
        cmd.append("-M")
    if options.discard_single_exon_ref:
        cmd.append("-N")
    if options.max_dist_exon:
        cmd += ["-e", "%i" % int(options.max_dist_exon)]
    if options.max_dist_group:
        cmd += ["-d", "%i" % int(options.max_dist_group)]
    if options.discard_redundant_intron_transfrags:
        cmd.append("-F")
    return cmd


def _read_stream(handle, buffsize=1048576):
    """Read ``handle`` to EOF in ``buffsize`` chunks and return the data."""
    data = b''
    try:
        while True:
            chunk = handle.read(buffsize)
            # Stop on the empty read that marks EOF. Testing the accumulated
            # length instead never terminates when the total size is an
            # exact multiple of the buffer size.
            if not chunk:
                break
            data += chunk
    except OverflowError:
        # Best effort: keep whatever was read so far.
        pass
    return data
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
135
d0d26169cc2a Uploaded
devteam
parents:
diff changeset
# Run the wrapper only when executed as a script, not when imported.
if __name__ == "__main__":
    __main__()