Mercurial > repos > devteam > short_reads_trim_seq
comparison short_reads_trim_seq.py @ 0:8c0b907e6e5b draft
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 19 May 2014 10:59:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8c0b907e6e5b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 trim reads based on the quality scores | |
| 4 input: read file and quality score file | |
| 5 output: trimmed read file | |
| 6 """ | |
| 7 | |
| 8 import os, sys, math, tempfile, re | |
| 9 | |
| 10 assert sys.version_info[:2] >= ( 2, 4 ) | |
| 11 | |
| 12 def stop_err( msg ): | |
| 13 sys.stderr.write( "%s\n" % msg ) | |
| 14 sys.exit() | |
| 15 | |
| 16 def append_to_outfile( outfile_name, seq_title, segments ): | |
| 17 segments = segments.split( ',' ) | |
| 18 if len( segments ) > 1: | |
| 19 outfile = open( outfile_name, 'a' ) | |
| 20 for i in range( len( segments ) ): | |
| 21 outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) ) | |
| 22 outfile.close() | |
| 23 elif segments[0]: | |
| 24 outfile = open( outfile_name, 'a' ) | |
| 25 outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) ) | |
| 26 outfile.close() | |
| 27 | |
| 28 def trim_seq( seq, score, arg, trim_score, threshold ): | |
| 29 seq_method = '454' | |
| 30 trim_pos = 0 | |
| 31 # trim after a certain position | |
| 32 if arg.isdigit(): | |
| 33 keep_homopolymers = False | |
| 34 trim_pos = int( arg ) | |
| 35 if trim_pos > 0 and trim_pos < len( seq ): | |
| 36 seq = seq[0:trim_pos] | |
| 37 else: | |
| 38 keep_homopolymers = arg=='yes' | |
| 39 | |
| 40 new_trim_seq = '' | |
| 41 max_segment = 0 | |
| 42 | |
| 43 for i in range( len( seq ) ): | |
| 44 if i >= len( score ): | |
| 45 score.append(-1) | |
| 46 if int( score[i] ) >= trim_score: | |
| 47 pass_nuc = seq[ i:( i + 1 ) ] | |
| 48 else: | |
| 49 if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ): | |
| 50 pass_nuc = seq[ i:( i + 1 ) ] | |
| 51 else: | |
| 52 pass_nuc = ' ' | |
| 53 new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc ) | |
| 54 # find the max substrings | |
| 55 segments = new_trim_seq.split() | |
| 56 max_segment = '' | |
| 57 len_max_segment = 0 | |
| 58 if threshold == 0: | |
| 59 for seg in segments: | |
| 60 if len_max_segment < len( seg ): | |
| 61 max_segment = '%s,' % seg | |
| 62 len_max_segment = len( seg ) | |
| 63 elif len_max_segment == len( seg ): | |
| 64 max_segment = '%s%s,' % ( max_segment, seg ) | |
| 65 else: | |
| 66 for seg in segments: | |
| 67 if len( seg ) >= threshold: | |
| 68 max_segment = '%s%s,' % ( max_segment, seg ) | |
| 69 return max_segment[ 0:-1 ] | |
| 70 | |
| 71 def __main__(): | |
| 72 | |
| 73 try: | |
| 74 threshold_trim = int( sys.argv[1].strip() ) | |
| 75 except: | |
| 76 stop_err( "Minimal quality score must be numeric." ) | |
| 77 try: | |
| 78 threshold_report = int( sys.argv[2].strip() ) | |
| 79 except: | |
| 80 stop_err( "Minimal length of trimmed reads must be numeric." ) | |
| 81 outfile_seq_name = sys.argv[3].strip() | |
| 82 infile_seq_name = sys.argv[4].strip() | |
| 83 infile_score_name = sys.argv[5].strip() | |
| 84 arg = sys.argv[6].strip() | |
| 85 | |
| 86 seq_infile_name = infile_seq_name | |
| 87 score_infile_name = infile_score_name | |
| 88 | |
| 89 | |
| 90 # Determine quailty score format: tabular or fasta format within the first 100 lines | |
| 91 seq_method = None | |
| 92 data_type = None | |
| 93 for i, line in enumerate( file( score_infile_name ) ): | |
| 94 line = line.rstrip( '\r\n' ) | |
| 95 if not line or line.startswith( '#' ): | |
| 96 continue | |
| 97 if data_type == None: | |
| 98 if line.startswith( '>' ): | |
| 99 data_type = 'fasta' | |
| 100 continue | |
| 101 elif len( line.split( '\t' ) ) > 0: | |
| 102 fields = line.split() | |
| 103 for score in fields: | |
| 104 try: | |
| 105 int( score ) | |
| 106 data_type = 'tabular' | |
| 107 seq_method = 'solexa' | |
| 108 break | |
| 109 except: | |
| 110 break | |
| 111 elif data_type == 'fasta': | |
| 112 fields = line.split() | |
| 113 for score in fields: | |
| 114 try: | |
| 115 int( score ) | |
| 116 seq_method = '454' | |
| 117 break | |
| 118 except: | |
| 119 break | |
| 120 if i == 100: | |
| 121 break | |
| 122 | |
| 123 if data_type is None: | |
| 124 stop_err( 'This tool can only use fasta data or tabular data.' ) | |
| 125 if seq_method is None: | |
| 126 stop_err( 'Invalid data for fasta format.') | |
| 127 | |
| 128 if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ): | |
| 129 seq = None | |
| 130 score = None | |
| 131 score_found = False | |
| 132 | |
| 133 score_file = open( score_infile_name, 'r' ) | |
| 134 | |
| 135 for i, line in enumerate( open( seq_infile_name ) ): | |
| 136 line = line.rstrip( '\r\n' ) | |
| 137 if not line or line.startswith( '#' ): | |
| 138 continue | |
| 139 if line.startswith( '>' ): | |
| 140 if seq: | |
| 141 scores = [] | |
| 142 if data_type == 'fasta': | |
| 143 score = None | |
| 144 score_found = False | |
| 145 score_line = 'start' | |
| 146 while not score_found and score_line: | |
| 147 score_line = score_file.readline().rstrip( '\r\n' ) | |
| 148 if not score_line or score_line.startswith( '#' ): | |
| 149 continue | |
| 150 if score_line.startswith( '>' ): | |
| 151 if score: | |
| 152 scores = score.split() | |
| 153 score_found = True | |
| 154 score = None | |
| 155 else: | |
| 156 for val in score_line.split(): | |
| 157 try: | |
| 158 int( val ) | |
| 159 except: | |
| 160 score_file.close() | |
| 161 stop_err( "Non-numerical value '%s' in score file." % val ) | |
| 162 if not score: | |
| 163 score = score_line | |
| 164 else: | |
| 165 score = '%s %s' % ( score, score_line ) | |
| 166 elif data_type == 'tabular': | |
| 167 score = score_file.readline().rstrip('\r\n') | |
| 168 loc = score.split( '\t' ) | |
| 169 for base in loc: | |
| 170 nuc_error = base.split() | |
| 171 try: | |
| 172 nuc_error[0] = int( nuc_error[0] ) | |
| 173 nuc_error[1] = int( nuc_error[1] ) | |
| 174 nuc_error[2] = int( nuc_error[2] ) | |
| 175 nuc_error[3] = int( nuc_error[3] ) | |
| 176 big = max( nuc_error ) | |
| 177 except: | |
| 178 score_file.close() | |
| 179 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) | |
| 180 scores.append( big ) | |
| 181 if scores: | |
| 182 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) | |
| 183 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) | |
| 184 | |
| 185 seq_title = line | |
| 186 seq = None | |
| 187 else: | |
| 188 if not seq: | |
| 189 seq = line | |
| 190 else: | |
| 191 seq = "%s%s" % ( seq, line ) | |
| 192 if seq: | |
| 193 scores = [] | |
| 194 if data_type == 'fasta': | |
| 195 score = None | |
| 196 while score_line: | |
| 197 score_line = score_file.readline().rstrip( '\r\n' ) | |
| 198 if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ): | |
| 199 continue | |
| 200 for val in score_line.split(): | |
| 201 try: | |
| 202 int( val ) | |
| 203 except: | |
| 204 score_file.close() | |
| 205 stop_err( "Non-numerical value '%s' in score file." % val ) | |
| 206 if not score: | |
| 207 score = score_line | |
| 208 else: | |
| 209 score = "%s %s" % ( score, score_line ) | |
| 210 if score: | |
| 211 scores = score.split() | |
| 212 elif data_type == 'tabular': | |
| 213 score = score_file.readline().rstrip('\r\n') | |
| 214 loc = score.split( '\t' ) | |
| 215 for base in loc: | |
| 216 nuc_error = base.split() | |
| 217 try: | |
| 218 nuc_error[0] = int( nuc_error[0] ) | |
| 219 nuc_error[1] = int( nuc_error[1] ) | |
| 220 nuc_error[2] = int( nuc_error[2] ) | |
| 221 nuc_error[3] = int( nuc_error[3] ) | |
| 222 big = max( nuc_error ) | |
| 223 except: | |
| 224 score_file.close() | |
| 225 stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) | |
| 226 scores.append( big ) | |
| 227 if scores: | |
| 228 new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) | |
| 229 append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) | |
| 230 score_file.close() | |
| 231 else: | |
| 232 stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) ) | |
| 233 | |
| 234 if __name__ == "__main__": __main__() |
