Mercurial > repos > bgruening > upload_testing
comparison short_reads_figure_score.py @ 90:b061185bcb83 draft
Uploaded
| author | bernhardlutz | 
|---|---|
| date | Thu, 23 Jan 2014 14:53:46 -0500 | 
| parents | c4a3a8999945 | 
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 89:71e953a4191d | 90:b061185bcb83 | 
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 boxplot: | |
| 4 - box: first quartile and third quartile | |
| 5 - line inside the box: median | |
| 6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile | |
| 7 IQR = third quartile - first quartile | |
| 8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. | |
| 9 """ | |
| 10 | |
| 11 import os, sys, math, tempfile, re | |
| 12 #from rpy import * | |
| 13 import rpy2.robjects as robjects | |
| 14 import rpy2.rlike.container as rlc | |
| 15 import rpy2.rinterface as ri | |
| 16 r = robjects.r | |
| 17 | |
| 18 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 19 | |
| 20 def stop_err( msg ): | |
| 21 sys.stderr.write( "%s\n" % msg ) | |
| 22 sys.exit() | |
| 23 | |
| 24 def merge_to_20_datapoints( score ): | |
| 25 number_of_points = 20 | |
| 26 read_length = len( score ) | |
| 27 step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) | |
| 28 scores = [] | |
| 29 point = 1 | |
| 30 point_sum = 0 | |
| 31 step_average = 0 | |
| 32 score_points = 0 | |
| 33 | |
| 34 for i in xrange( 1, read_length ): | |
| 35 if i < ( point * step ): | |
| 36 point_sum += int( score[i] ) | |
| 37 step_average += 1 | |
| 38 else: | |
| 39 point_avg = point_sum * 1.0 / step_average | |
| 40 scores.append( point_avg ) | |
| 41 point += 1 | |
| 42 point_sum = 0 | |
| 43 step_average = 0 | |
| 44 if step_average > 0: | |
| 45 point_avg = point_sum * 1.0 / step_average | |
| 46 scores.append( point_avg ) | |
| 47 if len( scores ) > number_of_points: | |
| 48 last_avg = 0 | |
| 49 for j in xrange( number_of_points - 1, len( scores ) ): | |
| 50 last_avg += scores[j] | |
| 51 last_avg = last_avg / ( len(scores) - number_of_points + 1 ) | |
| 52 else: | |
| 53 last_avg = scores[-1] | |
| 54 score_points = [] | |
| 55 for k in range( number_of_points - 1 ): | |
| 56 score_points.append( scores[k] ) | |
| 57 score_points.append( last_avg ) | |
| 58 return score_points | |
| 59 | |
| 60 def __main__(): | |
| 61 | |
| 62 invalid_lines = 0 | |
| 63 | |
| 64 infile_score_name = sys.argv[1].strip() | |
| 65 outfile_R_name = sys.argv[2].strip() | |
| 66 | |
| 67 infile_name = infile_score_name | |
| 68 | |
| 69 # Determine tabular or fasta format within the first 100 lines | |
| 70 seq_method = None | |
| 71 data_type = None | |
| 72 for i, line in enumerate( file( infile_name ) ): | |
| 73 line = line.rstrip( '\r\n' ) | |
| 74 if not line or line.startswith( '#' ): | |
| 75 continue | |
| 76 if data_type == None: | |
| 77 if line.startswith( '>' ): | |
| 78 data_type = 'fasta' | |
| 79 continue | |
| 80 elif len( line.split( '\t' ) ) > 0: | |
| 81 fields = line.split() | |
| 82 for score in fields: | |
| 83 try: | |
| 84 int( score ) | |
| 85 data_type = 'tabular' | |
| 86 seq_method = 'solexa' | |
| 87 break | |
| 88 except: | |
| 89 break | |
| 90 elif data_type == 'fasta': | |
| 91 fields = line.split() | |
| 92 for score in fields: | |
| 93 try: | |
| 94 int( score ) | |
| 95 seq_method = '454' | |
| 96 break | |
| 97 except: | |
| 98 break | |
| 99 if i == 100: | |
| 100 break | |
| 101 | |
| 102 if data_type is None: | |
| 103 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
| 104 if seq_method is None: | |
| 105 stop_err( 'Invalid data for fasta format.') | |
| 106 | |
| 107 # Determine fixed length or variable length within the first 100 lines | |
| 108 read_length = 0 | |
| 109 variable_length = False | |
| 110 if seq_method == 'solexa': | |
| 111 for i, line in enumerate( file( infile_name ) ): | |
| 112 line = line.rstrip( '\r\n' ) | |
| 113 if not line or line.startswith( '#' ): | |
| 114 continue | |
| 115 scores = line.split('\t') | |
| 116 if read_length == 0: | |
| 117 read_length = len( scores ) | |
| 118 if read_length != len( scores ): | |
| 119 variable_length = True | |
| 120 break | |
| 121 if i == 100: | |
| 122 break | |
| 123 elif seq_method == '454': | |
| 124 score = '' | |
| 125 for i, line in enumerate( file( infile_name ) ): | |
| 126 line = line.rstrip( '\r\n' ) | |
| 127 if not line or line.startswith( '#' ): | |
| 128 continue | |
| 129 if line.startswith( '>' ): | |
| 130 if len( score ) > 0: | |
| 131 score = score.split() | |
| 132 if read_length == 0: | |
| 133 read_length = len( score ) | |
| 134 if read_length != len( score ): | |
| 135 variable_length = True | |
| 136 break | |
| 137 score = '' | |
| 138 else: | |
| 139 score = score + ' ' + line | |
| 140 if i == 100: | |
| 141 break | |
| 142 | |
| 143 if variable_length: | |
| 144 number_of_points = 20 | |
| 145 else: | |
| 146 number_of_points = read_length | |
| 147 read_length_threshold = 100 # minimal read length for 454 file | |
| 148 score_points = [] | |
| 149 score_matrix = [] | |
| 150 invalid_scores = 0 | |
| 151 | |
| 152 if seq_method == 'solexa': | |
| 153 for i, line in enumerate( open( infile_name ) ): | |
| 154 line = line.rstrip( '\r\n' ) | |
| 155 if not line or line.startswith( '#' ): | |
| 156 continue | |
| 157 tmp_array = [] | |
| 158 scores = line.split( '\t' ) | |
| 159 for bases in scores: | |
| 160 nuc_errors = bases.split() | |
| 161 try: | |
| 162 nuc_errors[0] = int( nuc_errors[0] ) | |
| 163 nuc_errors[1] = int( nuc_errors[1] ) | |
| 164 nuc_errors[2] = int( nuc_errors[2] ) | |
| 165 nuc_errors[3] = int( nuc_errors[3] ) | |
| 166 big = max( nuc_errors ) | |
| 167 except: | |
| 168 #print 'Invalid numbers in the file. Skipped.' | |
| 169 invalid_scores += 1 | |
| 170 big = 0 | |
| 171 tmp_array.append( big ) | |
| 172 score_points.append( tmp_array ) | |
| 173 elif seq_method == '454': | |
| 174 # skip the last fasta sequence | |
| 175 score = '' | |
| 176 for i, line in enumerate( open( infile_name ) ): | |
| 177 line = line.rstrip( '\r\n' ) | |
| 178 if not line or line.startswith( '#' ): | |
| 179 continue | |
| 180 if line.startswith( '>' ): | |
| 181 if len( score ) > 0: | |
| 182 score = ['0'] + score.split() | |
| 183 read_length = len( score ) | |
| 184 tmp_array = [] | |
| 185 if not variable_length: | |
| 186 score.pop(0) | |
| 187 score_points.append( score ) | |
| 188 tmp_array = score | |
| 189 elif read_length > read_length_threshold: | |
| 190 score_points_tmp = merge_to_20_datapoints( score ) | |
| 191 score_points.append( score_points_tmp ) | |
| 192 tmp_array = score_points_tmp | |
| 193 score = '' | |
| 194 else: | |
| 195 score = "%s %s" % ( score, line ) | |
| 196 if len( score ) > 0: | |
| 197 score = ['0'] + score.split() | |
| 198 read_length = len( score ) | |
| 199 if not variable_length: | |
| 200 score.pop(0) | |
| 201 score_points.append( score ) | |
| 202 elif read_length > read_length_threshold: | |
| 203 score_points_tmp = merge_to_20_datapoints( score ) | |
| 204 score_points.append( score_points_tmp ) | |
| 205 tmp_array = score_points_tmp | |
| 206 | |
| 207 # reverse the matrix, for R | |
| 208 for i in range( number_of_points - 1 ): | |
| 209 tmp_array = [] | |
| 210 for j in range( len( score_points ) ): | |
| 211 try: | |
| 212 tmp_array.append( int( score_points[j][i] ) ) | |
| 213 except: | |
| 214 invalid_lines += 1 | |
| 215 score_matrix.append( tmp_array ) | |
| 216 | |
| 217 # generate pdf figures | |
| 218 #outfile_R_pdf = outfile_R_name | |
| 219 #r.pdf( outfile_R_pdf ) | |
| 220 outfile_R_png = outfile_R_name | |
| 221 print 'Writing bitmap' | |
| 222 r.bitmap( outfile_R_png ) | |
| 223 | |
| 224 title = "boxplot of quality scores" | |
| 225 empty_score_matrix_columns = 0 | |
| 226 for i, subset in enumerate( score_matrix ): | |
| 227 if not subset: | |
| 228 empty_score_matrix_columns += 1 | |
| 229 score_matrix[i] = [0] | |
| 230 | |
| 231 if not variable_length: | |
| 232 print 'Creating fixed boxplot ' | |
| 233 r.boxplot( score_matrix, xlab="location in read length", main=title ) | |
| 234 else: | |
| 235 print 'Creating variable boxplot' | |
| 236 r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) | |
| 237 x_old_range = [] | |
| 238 x_new_range = [] | |
| 239 step = read_length_threshold / number_of_points | |
| 240 for i in xrange( 0, read_length_threshold, step ): | |
| 241 x_old_range.append( ( i / step ) ) | |
| 242 x_new_range.append( i ) | |
| 243 print 'Writing axis' | |
| 244 r.axis( 1, x_old_range, x_new_range ) | |
| 245 | |
| 246 print 'calling dev.off()' | |
| 247 r('dev.off()') | |
| 248 | |
| 249 if invalid_scores > 0: | |
| 250 print 'Skipped %d invalid scores. ' % invalid_scores | |
| 251 if invalid_lines > 0: | |
| 252 print 'Skipped %d invalid lines. ' % invalid_lines | |
| 253 if empty_score_matrix_columns > 0: | |
| 254 print '%d missing scores in score_matrix. ' % empty_score_matrix_columns | |
| 255 | |
| 256 #r.quit(save = "no") | |
| 257 | |
| 258 if __name__=="__main__":__main__() | 
