Mercurial > repos > matthias > fasta_regex_finder
comparison fastaregexfinder.py @ 0:0c5613c6a863 draft default tip
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools.git commit 98943baecfa613d91dbef112fce8c6189f0431db
| author | matthias |
|---|---|
| date | Mon, 18 Dec 2017 05:16:59 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0c5613c6a863 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import re | |
| 4 import sys | |
| 5 import string | |
| 6 import argparse | |
| 7 import operator | |
| 8 | |
| 9 VERSION='0.1.1' | |
| 10 | |
| 11 parser = argparse.ArgumentParser(description=""" | |
| 12 | |
| 13 DESCRIPTION | |
| 14 | |
| 15 Search a fasta file for matches to a regex and return a bed file with the | |
| 16 coordinates of the match and the matched sequence itself. | |
| 17 | |
| 18 Output bed file has columns: | |
| 19 1. Name of fasta sequence (e.g. chromosome) | |
| 20 2. Start of the match | |
| 21 3. End of the match | |
| 22 4. ID of the match | |
| 23 5. Length of the match | |
| 24 6. Strand | |
| 25 7. Matched sequence as it appears on the forward strand | |
| 26 | |
| 27 For matches on the reverse strand it is reported the start and end position on the | |
| 28 forward strand and the matched string on the forward strand (so the G4 'GGGAGGGT' | |
| 29 present on the reverse strand is reported as ACCCTCCC). | |
| 30 | |
| 31 Note: Fasta sequences (chroms) are read in memory one at a time along with the | |
| 32 matches for that chromosome. | |
| 33 The order of the output is: chroms as they are found in the inut fasta, matches | |
| 34 sorted within chroms by positions. | |
| 35 | |
| 36 EXAMPLE: | |
| 37 ## Test data: | |
| 38 echo '>mychr' > /tmp/mychr.fa | |
| 39 echo 'ACTGnACTGnACTGnTGAC' >> /tmp/mychr.fa | |
| 40 | |
| 41 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' | |
| 42 mychr 0 4 mychr_0_4_for 4 + ACTG | |
| 43 mychr 5 9 mychr_5_9_for 4 + ACTG | |
| 44 mychr 10 14 mychr_10_14_for 4 + ACTG | |
| 45 | |
| 46 fastaRegexFinder.py -f /tmp/mychr.fa -r 'ACTG' --maxstr 3 | |
| 47 mychr 0 4 mychr_0_4_for 4 + ACT[3,4] | |
| 48 mychr 5 9 mychr_5_9_for 4 + ACT[3,4] | |
| 49 mychr 10 14 mychr_10_14_for 4 + ACT[3,4] | |
| 50 | |
| 51 less /tmp/mychr.fa | fastaRegexFinder.py -f - -r 'A\w\wGn' | |
| 52 mychr 0 5 mychr_0_5_for 5 + ACTGn | |
| 53 mychr 5 10 mychr_5_10_for 5 + ACTGn | |
| 54 mychr 10 15 mychr_10_15_for 5 + ACTGn | |
| 55 | |
| 56 DOWNLOAD | |
| 57 fastaRegexFinder.py is hosted at https://github.com/dariober/bioinformatics-cafe/tree/master/fastaRegexFinder | |
| 58 | |
| 59 """, formatter_class= argparse.RawTextHelpFormatter) | |
| 60 | |
| 61 parser.add_argument('--fasta', '-f', | |
| 62 type= str, | |
| 63 help='''Input fasta file to search. Use '-' to read the file from stdin. | |
| 64 | |
| 65 ''', | |
| 66 required= True) | |
| 67 | |
| 68 parser.add_argument('--regex', '-r', | |
| 69 type= str, | |
| 70 help='''Regex to be searched in the fasta input. | |
| 71 Matches to the reverse complement will have - strand. | |
| 72 The default regex is '([gG]{3,}\w{1,7}){3,}[gG]{3,}' which searches | |
| 73 for G-quadruplexes. | |
| 74 ''', | |
| 75 default= '([gG]{3,}\w{1,7}){3,}[gG]{3,}') | |
| 76 | |
| 77 parser.add_argument('--matchcase', '-m', | |
| 78 action= 'store_true', | |
| 79 help='''Match case while searching for matches. Default is | |
| 80 to ignore case (I.e. 'ACTG' will match 'actg'). | |
| 81 ''') | |
| 82 | |
| 83 parser.add_argument('--noreverse', | |
| 84 action= 'store_true', | |
| 85 help='''Do not search the reverse complement of the input fasta. | |
| 86 Use this flag to search protein sequences. | |
| 87 ''') | |
| 88 | |
| 89 parser.add_argument('--maxstr', | |
| 90 type= int, | |
| 91 required= False, | |
| 92 default= 10000, | |
| 93 help='''Maximum length of the match to report in the 7th column of the output. | |
| 94 Default is to report up to 10000nt. | |
| 95 Truncated matches are reported as <ACTG...ACTG>[<maxstr>,<tot length>] | |
| 96 ''') | |
| 97 | |
| 98 parser.add_argument('--seqnames', '-s', | |
| 99 type= str, | |
| 100 nargs= '+', | |
| 101 default= [None], | |
| 102 required= False, | |
| 103 help='''List of fasta sequences in --fasta to | |
| 104 search. E.g. use --seqnames chr1 chr2 chrM to search only these crhomosomes. | |
| 105 Default is to search all the sequences in input. | |
| 106 ''') | |
| 107 parser.add_argument('--quiet', '-q', | |
| 108 action= 'store_true', | |
| 109 help='''Do not print progress report (i.e. sequence names as they are scanned). | |
| 110 ''') | |
| 111 | |
| 112 | |
| 113 | |
| 114 parser.add_argument('--version', '-v', action='version', version='%(prog)s ' + VERSION) | |
| 115 | |
| 116 | |
| 117 args = parser.parse_args() | |
| 118 | |
| 119 " --------------------------[ Check and parse arguments ]---------------------- " | |
| 120 | |
| 121 if args.matchcase: | |
| 122 flag= 0 | |
| 123 else: | |
| 124 flag= re.IGNORECASE | |
| 125 | |
| 126 " ------------------------------[ Functions ]--------------------------------- " | |
| 127 | |
| 128 def sort_table(table, cols): | |
| 129 """ Code to sort list of lists | |
| 130 see http://www.saltycrane.com/blog/2007/12/how-to-sort-table-by-columns-in-python/ | |
| 131 | |
| 132 sort a table by multiple columns | |
| 133 table: a list of lists (or tuple of tuples) where each inner list | |
| 134 represents a row | |
| 135 cols: a list (or tuple) specifying the column numbers to sort by | |
| 136 e.g. (1,0) would sort by column 1, then by column 0 | |
| 137 """ | |
| 138 for col in reversed(cols): | |
| 139 table = sorted(table, key=operator.itemgetter(col)) | |
| 140 return(table) | |
| 141 | |
| 142 def trimMatch(x, n): | |
| 143 """ Trim the string x to be at most length n. Trimmed matches will be reported | |
| 144 with the syntax ACTG[a,b] where Ns are the beginning of x, a is the length of | |
| 145 the trimmed strng (e.g 4 here) and b is the full length of the match | |
| 146 EXAMPLE: | |
| 147 trimMatch('ACTGNNNN', 4) | |
| 148 >>>'ACTG[4,8]' | |
| 149 trimMatch('ACTGNNNN', 8) | |
| 150 >>>'ACTGNNNN' | |
| 151 """ | |
| 152 if len(x) > n and n is not None: | |
| 153 m= x[0:n] + '[' + str(n) + ',' + str(len(x)) + ']' | |
| 154 else: | |
| 155 m= x | |
| 156 return(m) | |
| 157 | |
| 158 def revcomp(x): | |
| 159 """Reverse complement string x. Ambiguity codes are handled and case conserved. | |
| 160 | |
| 161 Test | |
| 162 x= 'ACGTRYSWKMBDHVNacgtryswkmbdhvn' | |
| 163 revcomp(x) | |
| 164 """ | |
| 165 compdict= {'A':'T', | |
| 166 'C':'G', | |
| 167 'G':'C', | |
| 168 'T':'A', | |
| 169 'R':'Y', | |
| 170 'Y':'R', | |
| 171 'S':'W', | |
| 172 'W':'S', | |
| 173 'K':'M', | |
| 174 'M':'K', | |
| 175 'B':'V', | |
| 176 'D':'H', | |
| 177 'H':'D', | |
| 178 'V':'B', | |
| 179 'N':'N', | |
| 180 'a':'t', | |
| 181 'c':'g', | |
| 182 'g':'c', | |
| 183 't':'a', | |
| 184 'r':'y', | |
| 185 'y':'r', | |
| 186 's':'w', | |
| 187 'w':'s', | |
| 188 'k':'m', | |
| 189 'm':'k', | |
| 190 'b':'v', | |
| 191 'd':'h', | |
| 192 'h':'d', | |
| 193 'v':'b', | |
| 194 'n':'n'} | |
| 195 xrc= [] | |
| 196 for n in x: | |
| 197 xrc.append(compdict[n]) | |
| 198 xrc= ''.join(xrc)[::-1] | |
| 199 return(xrc) | |
| 200 # ----------------------------------------------------------------------------- | |
| 201 | |
| 202 psq_re_f= re.compile(args.regex, flags= flag) | |
| 203 ## psq_re_r= re.compile(regexrev) | |
| 204 | |
| 205 if args.fasta != '-': | |
| 206 ref_seq_fh= open(args.fasta) | |
| 207 else: | |
| 208 ref_seq_fh= sys.stdin | |
| 209 | |
| 210 ref_seq=[] | |
| 211 line= (ref_seq_fh.readline()).strip() | |
| 212 chr= re.sub('^>', '', line) | |
| 213 line= (ref_seq_fh.readline()).strip() | |
| 214 gquad_list= [] | |
| 215 while True: | |
| 216 if not args.quiet: | |
| 217 sys.stderr.write('Processing %s\n' %(chr)) | |
| 218 while line.startswith('>') is False: | |
| 219 ref_seq.append(line) | |
| 220 line= (ref_seq_fh.readline()).strip() | |
| 221 if line == '': | |
| 222 break | |
| 223 ref_seq= ''.join(ref_seq) | |
| 224 if args.seqnames == [None] or chr in args.seqnames: | |
| 225 for m in re.finditer(psq_re_f, ref_seq): | |
| 226 matchstr= trimMatch(m.group(0), args.maxstr) | |
| 227 quad_id= str(chr) + '_' + str(m.start()) + '_' + str(m.end()) + '_for' | |
| 228 gquad_list.append([chr, m.start(), m.end(), quad_id, len(m.group(0)), '+', matchstr]) | |
| 229 if args.noreverse is False: | |
| 230 ref_seq= revcomp(ref_seq) | |
| 231 seqlen= len(ref_seq) | |
| 232 for m in re.finditer(psq_re_f, ref_seq): | |
| 233 matchstr= trimMatch(revcomp(m.group(0)), args.maxstr) | |
| 234 mstart= seqlen - m.end() | |
| 235 mend= seqlen - m.start() | |
| 236 quad_id= str(chr) + '_' + str(mstart) + '_' + str(mend) + '_rev' | |
| 237 gquad_list.append([chr, mstart, mend, quad_id, len(m.group(0)), '-', matchstr]) | |
| 238 gquad_sorted= sort_table(gquad_list, (1,2,3)) | |
| 239 gquad_list= [] | |
| 240 for xline in gquad_sorted: | |
| 241 xline= '\t'.join([str(x) for x in xline]) | |
| 242 print(xline) | |
| 243 chr= re.sub('^>', '', line) | |
| 244 ref_seq= [] | |
| 245 line= (ref_seq_fh.readline()).strip() | |
| 246 if line == '': | |
| 247 break | |
| 248 | |
| 249 #gquad_sorted= sort_table(gquad_list, (0,1,2,3)) | |
| 250 # | |
| 251 #for line in gquad_sorted: | |
| 252 # line= '\t'.join([str(x) for x in line]) | |
| 253 # print(line) | |
| 254 sys.exit() |
