Mercurial > repos > galaxyp > hirieftools
comparison pi_database_splitter.py @ 0:4e84bf65f99a draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/pi_db_tools commit a58e2a324724f344a07d4499c860a5b2da06927d
| author | galaxyp |
|---|---|
| date | Mon, 22 May 2017 05:08:04 -0400 |
| parents | |
| children | 70757404c4f6 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4e84bf65f99a |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 import sys | |
| 3 import argparse | |
| 4 from numpy import median | |
| 5 from contextlib import ExitStack | |
| 6 | |
| 7 | |
| 8 def main(): | |
| 9 if sys.argv[1:] == []: | |
| 10 sys.argv.append('-h') | |
| 11 args = parse_commandline() | |
| 12 locfun = {False: locatefraction, | |
| 13 True: reverse_locatefraction}[args.reverse] | |
| 14 # Column nrs should start from 0 | |
| 15 # If negative, -1 is last item in list, etc | |
| 16 if args.fdrcol > 0: | |
| 17 args.fdrcol -= 1 | |
| 18 if args.deltapicol > 0: | |
| 19 args.deltapicol -= 1 | |
| 20 pishift = get_pishift(args.train_peptable, args.fdrcol, args.deltapicol, | |
| 21 args.fdrcutoff, args.picutoff) | |
| 22 binarray = get_bin_array(args.fr_amount, args.fr_width, args.intercept, | |
| 23 args.tolerance, pishift) | |
| 24 write_fractions(args.pipeps, args.fr_amount, args.prefix, | |
| 25 binarray, locfun, args.minlen, args.maxlen) | |
| 26 | |
| 27 | |
| 28 def locatefraction(pep_pi, bins): | |
| 29 index = [] | |
| 30 for pibin in bins: | |
| 31 if pep_pi > pibin[2]: | |
| 32 continue | |
| 33 elif pep_pi >= pibin[1]: | |
| 34 index.append(pibin[0]) | |
| 35 else: | |
| 36 return index | |
| 37 return index | |
| 38 | |
| 39 | |
| 40 def reverse_locatefraction(pep_pi, bins): | |
| 41 index = [] | |
| 42 for pibin in bins: | |
| 43 if pep_pi < pibin[1]: | |
| 44 continue | |
| 45 elif pep_pi < pibin[2]: | |
| 46 index.append(pibin[0]) | |
| 47 else: | |
| 48 return index | |
| 49 return index | |
| 50 | |
| 51 | |
| 52 def parse_commandline(): | |
| 53 parser = argparse.ArgumentParser( | |
| 54 formatter_class=argparse.RawTextHelpFormatter) | |
| 55 parser.add_argument('-p', dest='train_peptable', help='Peptide table with ' | |
| 56 'peptides, FDR, and fraction numbers. Used to ' | |
| 57 'calculate pI shift. Leave emtpy for no shift. ' | |
| 58 'Tab separated file.') | |
| 59 parser.add_argument('--deltacol', dest='deltapicol', help='Delta pI column' | |
| 60 ' number in peptide table. First column is nr. 1. ' | |
| 61 'Negative number for counting from last col ' | |
| 62 '(-1 is last).', default=False, type=int) | |
| 63 parser.add_argument('--picutoff', dest='picutoff', | |
| 64 help='delta pI value to filter experimental peptides' | |
| 65 ' when calculating pi shift.', default=0.2, type=float) | |
| 66 parser.add_argument('--fdrcol', dest='fdrcol', help='FDR column number in ' | |
| 67 'peptide table. First column is nr. 1. Empty includes ' | |
| 68 'all peptides', default=False, type=int) | |
| 69 parser.add_argument('--fdrcutoff', dest='fdrcutoff', | |
| 70 help='FDR cutoff value to filter experimental peptides' | |
| 71 ' when calculating pi shift.', default=0, type=float) | |
| 72 parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file ' | |
| 73 'with accession, peptide seq, pI value') | |
| 74 parser.add_argument('--prefix', dest='prefix', default='pisep', | |
| 75 help='Prefix for target/decoy output files') | |
| 76 parser.add_argument('--tolerance', dest='tolerance', | |
| 77 help='Strip fraction tolerance pi tolerance represents' | |
| 78 ' 2.5/97.5 percentile', type=float) | |
| 79 parser.add_argument('--amount', dest='fr_amount', | |
| 80 help='Strip fraction amount', type=int) | |
| 81 parser.add_argument('--reverse', dest='reverse', help='Strip is reversed', | |
| 82 action='store_const', const=True, default=False) | |
| 83 parser.add_argument('--intercept', dest='intercept', | |
| 84 help='pI Intercept of strip', type=float) | |
| 85 parser.add_argument('--width', dest='fr_width', | |
| 86 help='Strip fraction width in pI', type=float) | |
| 87 parser.add_argument('--minlen', dest='minlen', help='Minimal peptide length', | |
| 88 type=int) | |
| 89 parser.add_argument('--maxlen', dest='maxlen', help='Maximal peptide length', | |
| 90 type=int, default=False) | |
| 91 return parser.parse_args(sys.argv[1:]) | |
| 92 | |
| 93 | |
| 94 def get_pishift(peptable, fdrcol, deltapicol, fdrcutoff, delta_pi_cutoff): | |
| 95 delta_pis = [] | |
| 96 with open(peptable) as fp: | |
| 97 next(fp) # skip header | |
| 98 for line in fp: | |
| 99 line = line.strip('\n').split('\t') | |
| 100 if fdrcol: | |
| 101 try: | |
| 102 fdr = float(line[fdrcol]) | |
| 103 except ValueError: | |
| 104 continue | |
| 105 if fdr > fdrcutoff: | |
| 106 continue | |
| 107 try: | |
| 108 delta_pi = float(line[deltapicol]) | |
| 109 except ValueError: | |
| 110 continue | |
| 111 if delta_pi < delta_pi_cutoff: | |
| 112 delta_pis.append(delta_pi) | |
| 113 shift = median(delta_pis) | |
| 114 print('pI shift (median of delta pIs): {}'.format(shift)) | |
| 115 return shift | |
| 116 | |
| 117 | |
| 118 def get_bin_array(amount_fractions, fr_width, intercept, tolerance, pi_shift): | |
| 119 frnr = 1 | |
| 120 bin_array = [] | |
| 121 while frnr <= amount_fractions: | |
| 122 pi_center = fr_width * frnr + intercept | |
| 123 bin_left = pi_center - fr_width / 2 - tolerance - pi_shift | |
| 124 bin_right = pi_center + fr_width / 2 + tolerance - pi_shift | |
| 125 print('Bins in fraction', frnr, bin_left, bin_right) | |
| 126 bin_array.append((frnr, bin_left, bin_right)) | |
| 127 frnr += 1 | |
| 128 return bin_array | |
| 129 | |
| 130 | |
| 131 def write_fractions(pi_peptides_fn, amount_fractions, out_prefix, | |
| 132 bin_array, locate_function, minlen, maxlen): | |
| 133 amountpad = len(str(amount_fractions)) | |
| 134 with ExitStack() as stack: | |
| 135 target_out_fp = {frnr: ([], stack.enter_context( | |
| 136 open('{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr, | |
| 137 pad=amountpad), 'w'))) | |
| 138 for frnr in range(1, amount_fractions + 1)} | |
| 139 decoy_out_fp = {frnr: ([], stack.enter_context( | |
| 140 open('decoy_{p}_fr{i:0{pad}}.fasta'.format(p=out_prefix, i=frnr, | |
| 141 pad=amountpad), 'w'))) | |
| 142 for frnr in range(1, amount_fractions + 1)} | |
| 143 input_fp = stack.enter_context(open(pi_peptides_fn)) | |
| 144 pepcount = 0 | |
| 145 for line in input_fp: | |
| 146 accs, pep, pi = line.strip().split("\t") | |
| 147 pi = float(pi) | |
| 148 if maxlen and len(pep) > maxlen: | |
| 149 continue | |
| 150 elif len(pep) >= minlen: | |
| 151 pepcount += 1 | |
| 152 if pep[-1] in {'K', 'R'}: | |
| 153 rev_pep = pep[::-1][1:] + pep[-1] | |
| 154 else: | |
| 155 rev_pep = pep[::-1] | |
| 156 for i in locate_function(pi, bin_array): | |
| 157 target_out_fp[i][0].append('>{}\n{}\n'.format(accs, pep)) | |
| 158 # write pseudoReversed decoy peptide at the same time | |
| 159 decoy_out_fp[i][0].append('>decoy_{}\n{}\n'.format( | |
| 160 accs, rev_pep)) | |
| 161 if pepcount > 1000000: | |
| 162 # write in chunks to make it go faster | |
| 163 pepcount = 0 | |
| 164 [fp.write(''.join(peps)) for peps, fp in | |
| 165 target_out_fp.values()] | |
| 166 [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()] | |
| 167 target_out_fp = {fr: ([], pep_fp[1]) | |
| 168 for fr, pep_fp in target_out_fp.items()} | |
| 169 decoy_out_fp = {fr: ([], pep_fp[1]) | |
| 170 for fr, pep_fp in decoy_out_fp.items()} | |
| 171 [fp.write(''.join(peps)) for peps, fp in target_out_fp.values()] | |
| 172 [fp.write(''.join(peps)) for peps, fp in decoy_out_fp.values()] | |
| 173 | |
| 174 | |
| 175 if __name__ == '__main__': | |
| 176 main() |
