| 80 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 boxplot: | 
|  | 4 - box: first quartile and third quartile | 
|  | 5 - line inside the box: median | 
|  | 6 - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile | 
|  | 7            IQR = third quartile - first quartile | 
|  | 8 - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. | 
|  | 9 """ | 
|  | 10 | 
|  | 11 import os, sys, math, tempfile, re | 
|  | 12 #from rpy import * | 
|  | 13 import rpy2.robjects as robjects | 
|  | 14 import rpy2.rlike.container as rlc | 
|  | 15 import rpy2.rinterface as ri | 
|  | 16 r = robjects.r | 
|  | 17 | 
|  | 18 assert sys.version_info[:2] >= ( 2, 4 ) | 
|  | 19 | 
|  | 20 def stop_err( msg ): | 
|  | 21     sys.stderr.write( "%s\n" % msg ) | 
|  | 22     sys.exit() | 
|  | 23 | 
|  | 24 def merge_to_20_datapoints( score ): | 
|  | 25     number_of_points = 20 | 
|  | 26     read_length = len( score ) | 
|  | 27     step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) | 
|  | 28     scores = [] | 
|  | 29     point = 1 | 
|  | 30     point_sum = 0 | 
|  | 31     step_average = 0 | 
|  | 32     score_points = 0 | 
|  | 33 | 
|  | 34     for i in xrange( 1, read_length ): | 
|  | 35         if i < ( point * step ): | 
|  | 36             point_sum += int( score[i] ) | 
|  | 37             step_average += 1 | 
|  | 38         else: | 
|  | 39             point_avg = point_sum * 1.0 / step_average | 
|  | 40             scores.append( point_avg ) | 
|  | 41             point += 1 | 
|  | 42             point_sum = 0 | 
|  | 43             step_average = 0 | 
|  | 44     if step_average > 0: | 
|  | 45         point_avg = point_sum * 1.0 / step_average | 
|  | 46         scores.append( point_avg ) | 
|  | 47     if len( scores ) > number_of_points: | 
|  | 48         last_avg = 0 | 
|  | 49         for j in xrange( number_of_points - 1, len( scores ) ): | 
|  | 50             last_avg += scores[j] | 
|  | 51         last_avg = last_avg / ( len(scores) - number_of_points + 1 ) | 
|  | 52     else: | 
|  | 53         last_avg = scores[-1] | 
|  | 54     score_points = [] | 
|  | 55     for k in range( number_of_points - 1 ): | 
|  | 56         score_points.append( scores[k] ) | 
|  | 57     score_points.append( last_avg ) | 
|  | 58     return score_points | 
|  | 59 | 
|  | 60 def __main__(): | 
|  | 61 | 
|  | 62     invalid_lines = 0 | 
|  | 63 | 
|  | 64     infile_score_name = sys.argv[1].strip() | 
|  | 65     outfile_R_name = sys.argv[2].strip() | 
|  | 66 | 
|  | 67     infile_name = infile_score_name | 
|  | 68 | 
|  | 69     # Determine tabular or fasta format within the first 100 lines | 
|  | 70     seq_method = None | 
|  | 71     data_type = None | 
|  | 72     for i, line in enumerate( file( infile_name ) ): | 
|  | 73         line = line.rstrip( '\r\n' ) | 
|  | 74         if not line or line.startswith( '#' ): | 
|  | 75             continue | 
|  | 76         if data_type == None: | 
|  | 77             if line.startswith( '>' ): | 
|  | 78                 data_type = 'fasta' | 
|  | 79                 continue | 
|  | 80             elif len( line.split( '\t' ) ) > 0: | 
|  | 81                 fields = line.split() | 
|  | 82                 for score in fields: | 
|  | 83                     try: | 
|  | 84                         int( score ) | 
|  | 85                         data_type = 'tabular' | 
|  | 86                         seq_method = 'solexa' | 
|  | 87                         break | 
|  | 88                     except: | 
|  | 89                         break | 
|  | 90         elif data_type == 'fasta': | 
|  | 91             fields = line.split() | 
|  | 92             for score in fields: | 
|  | 93                 try: | 
|  | 94                     int( score ) | 
|  | 95                     seq_method = '454' | 
|  | 96                     break | 
|  | 97                 except: | 
|  | 98                     break | 
|  | 99         if i == 100: | 
|  | 100             break | 
|  | 101 | 
|  | 102     if data_type is None: | 
|  | 103         stop_err( 'This tool can only use fasta data or tabular data.' ) | 
|  | 104     if seq_method is None: | 
|  | 105         stop_err( 'Invalid data for fasta format.') | 
|  | 106 | 
|  | 107     # Determine fixed length or variable length within the first 100 lines | 
|  | 108     read_length = 0 | 
|  | 109     variable_length = False | 
|  | 110     if seq_method == 'solexa': | 
|  | 111         for i, line in enumerate( file( infile_name ) ): | 
|  | 112             line = line.rstrip( '\r\n' ) | 
|  | 113             if not line or line.startswith( '#' ): | 
|  | 114                 continue | 
|  | 115             scores = line.split('\t') | 
|  | 116             if read_length == 0: | 
|  | 117                 read_length = len( scores ) | 
|  | 118             if read_length != len( scores ): | 
|  | 119                 variable_length = True | 
|  | 120                 break | 
|  | 121             if i == 100: | 
|  | 122                 break | 
|  | 123     elif seq_method == '454': | 
|  | 124         score = '' | 
|  | 125         for i, line in enumerate( file( infile_name ) ): | 
|  | 126             line = line.rstrip( '\r\n' ) | 
|  | 127             if not line or line.startswith( '#' ): | 
|  | 128                 continue | 
|  | 129             if line.startswith( '>' ): | 
|  | 130                 if len( score ) > 0: | 
|  | 131                     score = score.split() | 
|  | 132                     if read_length == 0: | 
|  | 133                         read_length = len( score ) | 
|  | 134                     if read_length != len( score ): | 
|  | 135                         variable_length = True | 
|  | 136                         break | 
|  | 137                 score = '' | 
|  | 138             else: | 
|  | 139                 score = score + ' ' + line | 
|  | 140             if i == 100: | 
|  | 141                 break | 
|  | 142 | 
|  | 143     if variable_length: | 
|  | 144         number_of_points = 20 | 
|  | 145     else: | 
|  | 146         number_of_points = read_length | 
|  | 147     read_length_threshold = 100 # minimal read length for 454 file | 
|  | 148     score_points = [] | 
|  | 149     score_matrix = [] | 
|  | 150     invalid_scores = 0 | 
|  | 151 | 
|  | 152     if seq_method == 'solexa': | 
|  | 153         for i, line in enumerate( open( infile_name ) ): | 
|  | 154             line = line.rstrip( '\r\n' ) | 
|  | 155             if not line or line.startswith( '#' ): | 
|  | 156                 continue | 
|  | 157             tmp_array = [] | 
|  | 158             scores = line.split( '\t' ) | 
|  | 159             for bases in scores: | 
|  | 160                 nuc_errors = bases.split() | 
|  | 161                 try: | 
|  | 162                     nuc_errors[0] = int( nuc_errors[0] ) | 
|  | 163                     nuc_errors[1] = int( nuc_errors[1] ) | 
|  | 164                     nuc_errors[2] = int( nuc_errors[2] ) | 
|  | 165                     nuc_errors[3] = int( nuc_errors[3] ) | 
|  | 166                     big = max( nuc_errors ) | 
|  | 167                 except: | 
|  | 168                     #print 'Invalid numbers in the file. Skipped.' | 
|  | 169                     invalid_scores += 1 | 
|  | 170                     big = 0 | 
|  | 171                 tmp_array.append( big ) | 
|  | 172             score_points.append( tmp_array ) | 
|  | 173     elif seq_method == '454': | 
|  | 174         # skip the last fasta sequence | 
|  | 175         score = '' | 
|  | 176         for i, line in enumerate( open( infile_name ) ): | 
|  | 177             line = line.rstrip( '\r\n' ) | 
|  | 178             if not line or line.startswith( '#' ): | 
|  | 179                 continue | 
|  | 180             if line.startswith( '>' ): | 
|  | 181                 if len( score ) > 0: | 
|  | 182                     score = ['0'] + score.split() | 
|  | 183                     read_length = len( score ) | 
|  | 184                     tmp_array = [] | 
|  | 185                     if not variable_length: | 
|  | 186                         score.pop(0) | 
|  | 187                         score_points.append( score ) | 
|  | 188                         tmp_array = score | 
|  | 189                     elif read_length > read_length_threshold: | 
|  | 190                         score_points_tmp = merge_to_20_datapoints( score ) | 
|  | 191                         score_points.append( score_points_tmp ) | 
|  | 192                         tmp_array = score_points_tmp | 
|  | 193                 score = '' | 
|  | 194             else: | 
|  | 195                 score = "%s %s" % ( score, line ) | 
|  | 196         if len( score ) > 0: | 
|  | 197             score = ['0'] + score.split() | 
|  | 198             read_length = len( score ) | 
|  | 199             if not variable_length: | 
|  | 200                 score.pop(0) | 
|  | 201                 score_points.append( score ) | 
|  | 202             elif read_length > read_length_threshold: | 
|  | 203                 score_points_tmp = merge_to_20_datapoints( score ) | 
|  | 204                 score_points.append( score_points_tmp ) | 
|  | 205                 tmp_array = score_points_tmp | 
|  | 206 | 
|  | 207     # reverse the matrix, for R | 
|  | 208     for i in range( number_of_points - 1 ): | 
|  | 209         tmp_array = [] | 
|  | 210         for j in range( len( score_points ) ): | 
|  | 211             try: | 
|  | 212                 tmp_array.append( int( score_points[j][i] ) ) | 
|  | 213             except: | 
|  | 214                 invalid_lines += 1 | 
|  | 215         score_matrix.append( tmp_array ) | 
|  | 216 | 
|  | 217     # generate pdf figures | 
|  | 218     #outfile_R_pdf = outfile_R_name | 
|  | 219     #r.pdf( outfile_R_pdf ) | 
|  | 220     outfile_R_png = outfile_R_name | 
|  | 221     print 'Writing bitmap' | 
|  | 222     r.bitmap( outfile_R_png ) | 
|  | 223 | 
|  | 224     title = "boxplot of quality scores" | 
|  | 225     empty_score_matrix_columns = 0 | 
|  | 226     for i, subset in enumerate( score_matrix ): | 
|  | 227         if not subset: | 
|  | 228             empty_score_matrix_columns += 1 | 
|  | 229             score_matrix[i] = [0] | 
|  | 230 | 
|  | 231     if not variable_length: | 
|  | 232         print 'Creating fixed boxplot ' | 
|  | 233         r.boxplot( score_matrix, xlab="location in read length", main=title ) | 
|  | 234     else: | 
|  | 235         print 'Creating variable boxplot' | 
|  | 236         r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) | 
|  | 237         x_old_range = [] | 
|  | 238         x_new_range = [] | 
|  | 239         step = read_length_threshold / number_of_points | 
|  | 240         for i in xrange( 0, read_length_threshold, step ): | 
|  | 241             x_old_range.append( ( i / step ) ) | 
|  | 242             x_new_range.append( i ) | 
|  | 243         print 'Writing axis' | 
|  | 244         r.axis( 1, x_old_range, x_new_range ) | 
|  | 245 | 
|  | 246     print 'calling dev.off()' | 
|  | 247     r('dev.off()') | 
|  | 248 | 
|  | 249     if invalid_scores > 0: | 
|  | 250         print 'Skipped %d invalid scores. ' % invalid_scores | 
|  | 251     if invalid_lines > 0: | 
|  | 252         print 'Skipped %d invalid lines. ' % invalid_lines | 
|  | 253     if empty_score_matrix_columns > 0: | 
|  | 254         print '%d missing scores in score_matrix. ' % empty_score_matrix_columns | 
|  | 255 | 
|  | 256     #r.quit(save = "no") | 
|  | 257 | 
|  | 258 if __name__=="__main__":__main__() |