comparison mzsqlite_psm_align.py @ 0:f2dc9805107a draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mzsqlite_psm_align commit 464e05be1084ed9a65b542c8eabb18147d425666
author galaxyp
date Mon, 16 Apr 2018 18:00:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f2dc9805107a
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2017, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 from __future__ import print_function
16
17 import argparse
18 import re
19 import sys
20 import sqlite3 as sqlite
21 from time import time
22
23
24 from Bio.Seq import reverse_complement, translate
25
26
27 import pysam
28 from twobitreader import TwoBitFile
29
30 from profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry
31
32
33 def regex_match(expr, item):
34 return re.match(expr, item) is not None
35
36
37 def regex_search(expr, item):
38 return re.search(expr, item) is not None
39
40
41 def regex_sub(expr, replace, item):
42 return re.sub(expr, replace, item)
43
44
45 def get_connection(sqlitedb_path, addfunctions=True):
46 conn = sqlite.connect(sqlitedb_path)
47 if addfunctions:
48 conn.create_function("re_match", 2, regex_match)
49 conn.create_function("re_search", 2, regex_search)
50 conn.create_function("re_sub", 3, regex_sub)
51 return conn
52
53
54 ## Peptide Spectral Match (PSM)s
55 PSM_QUERY = """\
56 SELECT
57 pe.dBSequence_ref,
58 pe.start,
59 pe.end,
60 pe.pre,
61 pe.post,
62 pep.sequence,
63 sr.id,
64 sr.spectrumTitle,
65 si.rank,
66 si.chargeState,
67 si.calculatedMassToCharge,
68 si.experimentalMassToCharge,
69 si.peptide_ref
70 FROM spectrum_identification_results sr
71 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
72 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
73 JOIN peptides pep ON pe.peptide_ref = pep.id
74 WHERE pe.isDecoy = 'false'
75 ORDER BY sr.spectrumTitle,si.rank
76 """
77
78
79 ## Peptide Post Translational Modifications
80 PEP_MODS_QUERY = """\
81 SELECT location, residue, name, modType, '' as "unimod"
82 FROM peptide_modifications
83 WHERE peptide_ref = :peptide_ref
84 ORDER BY location, modType, name
85 """
86
87
88 ## number of peptides to which the spectrum maps
89 ## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence
90 SPECTRUM_PEPTIDES_QUERY = """\
91 SELECT count(distinct pep.sequence)
92 FROM spectrum_identification_results sr
93 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id
94 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref
95 JOIN peptides pep ON pe.peptide_ref = pep.id
96 WHERE pe.isDecoy = 'false'
97 AND sr.id = :sr_id
98 GROUP BY sr.id
99 """
100
101
102 ## number of genomic locations to which the peptide sequence maps
103 ## uniqueness of the peptide mapping
104 ## peptides => peptide_evidence -> db_sequence -> location
105 ## proteins_by_peptide
106 PEPTIDE_ACC_QUERY = """\
107 SELECT
108 pe.dBSequence_ref,
109 pe.start,
110 pe.end
111 FROM peptide_evidence pe
112 JOIN peptides pep ON pe.peptide_ref = pep.id
113 WHERE pe.isDecoy = 'false'
114 AND pep.sequence = :sequence
115 """
116
117
118 MAP_QUERY = """\
119 SELECT distinct *
120 FROM feature_cds_map
121 WHERE name = :acc
122 AND :p_start < cds_end
123 AND :p_end >= cds_start
124 ORDER BY name,cds_start,cds_end
125 """
126
127
128 GENOMIC_POS_QUERY = """\
129 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos"
130 FROM feature_cds_map
131 WHERE name = :acc
132 AND :cds_offset >= cds_start
133 AND :cds_offset < cds_end
134 """
135
136
137 FEATURE_QUERY = """\
138 SELECT distinct seqid,start,end,featuretype,strand,
139 CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame",
140 CASE WHEN :strand = '+' THEN start - :start ELSE end - :end END as "start_pos",
141 CASE WHEN :strand = '+' THEN end - :end ELSE start - :start END as "end_pos",
142 parent as "name"
143 FROM features JOIN relations ON features.id = relations.child
144 WHERE seqid = :seqid
145 AND parent in (
146 SELECT id
147 FROM features
148 WHERE seqid = :seqid
149 AND featuretype = 'transcript'
150 AND start <= :tstart AND :tend <= end
151 )
152 AND :end >= start AND :start <= end
153 AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript')
154 ORDER BY strand = :strand DESC, featuretype
155 """
156
157 ## Use order by to get best matches
158 ## one_exon strand, featuretype, contained, inframe,
159 ## first_exon strand, featuretype, contained, end_pos,
160 ## middle_exon strand, featuretype, contained, start_pos, end_pos,
161 ## last_exon strand, featuretype, contained, start_pos,
162
163 ONLY_EXON_QUERY = FEATURE_QUERY + """,\
164 start <= :start AND end >= :end DESC,
165 in_frame DESC, end - start DESC, start DESC, end
166 """
167
168 FIRST_EXON_QUERY = FEATURE_QUERY + """,\
169 start <= :start AND end >= :end DESC,
170 in_frame DESC, abs(end_pos), start DESC, end
171 """
172
173 MIDDLE_EXON_QUERY = FEATURE_QUERY + """,\
174 start <= :start AND end >= :end DESC,
175 in_frame DESC, abs(start_pos), abs(end_pos), start DESC, end
176 """
177
178 LAST_EXON_QUERY = FEATURE_QUERY + """,\
179 start <= :start AND end >= :end DESC,
180 in_frame DESC, abs(start_pos), start DESC, end
181 """
182
183
184 def __main__():
185 parser = argparse.ArgumentParser(
186 description='Generate proBED and proBAM from mz.sqlite')
187 parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML")
188 parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table")
189 parser.add_argument(
190 '-R', '--genomeReference', default='Unknown',
191 help='Genome reference sequence in 2bit format')
192 parser.add_argument(
193 '-t', '--twobit', default=None,
194 help='Genome reference sequence in 2bit format')
195 parser.add_argument(
196 '-r', '--reads_bam', default=None,
197 help='reads alignment bam path')
198 parser.add_argument(
199 '-g', '--gffutils_sqlite', default=None,
200 help='gffutils GTF sqlite DB')
201 parser.add_argument(
202 '-B', '--probed', default=None,
203 help='proBed path')
204 parser.add_argument(
205 '-s', '--prosam', default=None,
206 help='proSAM path')
207 parser.add_argument(
208 '-b', '--probam', default=None,
209 help='proBAM path')
210 parser.add_argument(
211 '-l', '--limit', type=int, default=None,
212 help='limit numbers of PSMs for testing')
213 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose')
214 parser.add_argument('-d', '--debug', action='store_true', help='Debug')
215 args = parser.parse_args()
216
217 def get_sequence(chrom, start, end):
218 if twobit:
219 if chrom in twobit and 0 <= start < end < len(twobit[chrom]):
220 return twobit[chrom][start:end]
221 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom
222 if contig in twobit and 0 <= start < end < len(twobit[contig]):
223 return twobit[contig][start:end]
224 return ''
225 return None
226
227 twobit = TwoBitFile(args.twobit) if args.twobit else None
228 samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None
229 seqlens = twobit.sequence_sizes()
230
231 probed = open(args.probed,'w') if args.probed else sys.stdout
232
233 gff_cursor = get_connection(args.gffutils_sqlite).cursor() if args.gffutils_sqlite else None
234 map_cursor = get_connection(args.genomic_mapping_sqlite).cursor()
235 mz_cursor = get_connection(args.mzsqlite).cursor()
236
237 unmapped_accs = set()
238 timings = dict()
239 def add_time(name,elapsed):
240 if name in timings:
241 timings[name] += elapsed
242 else:
243 timings[name] = elapsed
244
245 XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*']
246 FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript']
247 def get_exon_features(exons):
248 efeatures = None
249 transcript_features = dict()
250 t_start = min(exons[0][2],exons[-1][2])
251 t_end = max(exons[0][3],exons[-1][3])
252 if gff_cursor:
253 ts = time()
254 (acc,gc,gs,ge,st,cs,ce) = exons[0]
255 ft_params = {"seqid" : str(gc).replace('chr',''), 'strand' : st, 'tstart': t_start, 'tend': t_end}
256 efeatures = [None] * len(exons)
257 for i,exon in enumerate(exons):
258 (acc,gc,gs,ge,st,cs,ce) = exon
259 fr = cs % 3
260 if args.debug:
261 print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr)
262 ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs + 1, "end" : ge, 'strand' : st, 'frame' : fr, 'tstart': t_start, 'tend': t_end}
263 if len(exons) == 1:
264 q = ONLY_EXON_QUERY
265 elif i == 0:
266 q = FIRST_EXON_QUERY
267 elif len(exons) - i == 1:
268 q = LAST_EXON_QUERY
269 else:
270 q = MIDDLE_EXON_QUERY
271 features = [f for f in gff_cursor.execute(q,ft_params)]
272 transcripts = []
273 efeatures[i] = []
274 for j,f in enumerate(features):
275 transcript = f[-1]
276 ftype = f[3]
277 if ftype == 'transcript':
278 if i > 0 or efeatures[i]:
279 continue
280 if i == 0:
281 if transcript not in transcript_features:
282 transcript_features[transcript] = [[] for _ in range(len(exons))]
283 transcript_features[transcript][i].append(f)
284 efeatures[i].append(f)
285 else:
286 if transcript in transcript_features:
287 transcript_features[transcript][i].append(f)
288 efeatures[i].append(f)
289 else:
290 del transcript_features[transcript]
291 if not efeatures[i]:
292
293 efeatures[i] = transcripts
294 for f in efeatures[i]:
295 (seqid,start,end,featuretype,strand,frame,in_frame,start_offset,end_offset,parent) = f
296 if args.debug:
297 print('feat%d:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % \
298 (i,seqid,start,end,featuretype,strand,frame,str(in_frame) == '1',start_offset,end_offset,parent),file=sys.stderr)
299 if args.debug:
300 print('fmap:\t%s' % transcript_features,file=sys.stderr)
301 return (efeatures,transcript_features)
302
303 def is_structural_variant(exons):
304 if len(exons) > 1:
305 (acc,gc,gs,ge,st,cs,ce) = exons[0]
306 for i in range(1,len(exons)):
307 if gc != exons[i][1] or st != exons[i][4]:
308 return True
309 return False
310
311 XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*']
312 FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript']
313 def classify_transcript(exons,transcript_features,ref_prot,peptide):
314 etypes = ['*'] * len(exons)
315 features = transcript_features[0]
316 (acc,gc,gs,ge,st,cs,ce) = exons[0]
317 if len(features) == 1:
318 (seqid,start,end,featuretype,strand,frame,in_frame,start_offset,end_offset,parent) = features[0]
319 if strand == st:
320 if featuretype == 'CDS':
321 if start <= gs and ge <= end:
322 if in_frame:
323 if ref_prot == peptide:
324 if len(exons) > 1:
325 pass
326 return 'N'
327 elif len(ref_prot) != len(peptide):
328 return 'W'
329 else:
330 return 'V'
331 else:
332 return 'O'
333 elif strand == '+' and start <= gs or ge > end:
334 return 'C'
335 elif strand == '-' and start <= gs and ge <= end:
336 return 'C'
337 elif featuretype == 'five_prime_utr':
338 return 'E'
339 elif featuretype == 'three_prime_utr':
340 return 'B'
341 elif featuretype == 'transcript':
342 return 'I'
343 else:
344 return 'R'
345 elif len(features) > 1:
346 ftypes = [f[3] for f in features]
347 if 'five_prime_utr' in ftypes:
348 return 'E'
349 elif 'three_prime_utr' in ftypes:
350 return 'B'
351 else:
352 return 'C'
353 return '*'
354
355 def classify_peptide(exons,ref_prot,peptide,pep_cds,probam_dict):
356 if ref_prot != peptide and samfile:
357 try:
358 if args.debug:
359 print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr)
360 ts = time()
361 for exon in exons:
362 (acc,chrom,start,end,strand,c_start,c_end) = exon
363 a_start = c_start / 3 * 3
364 a_end = c_end / 3 * 3
365 if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
366 pileup = get_exon_pileup(chrom,start,end)
367 for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]):
368 if ao == 0 or i == 0:
369 if ref_prot[ai] != peptide[ai]:
370 codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao)
371 if args.debug:
372 print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr)
373 if codon:
374 pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:]
375 te = time()
376 add_time('var_cds',te - ts)
377 except Exception as e:
378 print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr)
379 probam_dict['XG'] = '*'
380 if is_structural_variant(exons):
381 probam_dict['XG'] = 'G'
382 else:
383 (efeatures,transcript_features) = get_exon_features(exons)
384 n = len(exons)
385 if efeatures and efeatures[0]:
386 for f in efeatures[0]:
387 transcript = f[9]
388 features = transcript_features[transcript]
389 xg = classify_transcript(exons,features,ref_prot,peptide)
390 if xg != '*':
391 probam_dict['XG'] = xg
392 break
393 return pep_cds
394
395 def get_variant_cds(exons,ref_prot,peptide,pep_cds):
396 if ref_prot != peptide and samfile:
397 try:
398 if args.debug:
399 print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr)
400 ts = time()
401 for exon in exons:
402 (acc,chrom,start,end,strand,c_start,c_end) = exon
403 a_start = c_start / 3 * 3
404 a_end = c_end / 3 * 3
405 if ref_prot[a_start:a_end] != peptide[a_start:a_end]:
406 pileup = get_exon_pileup(chrom,start,end)
407 for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]):
408 if ao == 0 or i == 0:
409 if ref_prot[ai] != peptide[ai]:
410 codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao)
411 if args.debug:
412 print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr)
413 if codon:
414 pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:]
415 te = time()
416 add_time('var_cds',te - ts)
417 except Exception as e:
418 print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr)
419 return pep_cds
420
421 def get_mapping(acc,pep_start,pep_end):
422 ts = time()
423 p_start = (pep_start - 1) * 3
424 p_end = pep_end * 3
425 map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end}
426 if args.debug:
427 print('%s' % map_params, file=sys.stderr)
428 locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)]
429 exons = []
430 ## ========= pep
431 ## --- continue
432 ## --- trim
433 ## --- copy
434 ## --- trim
435 ## --- break
436 c_end = 0
437 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs):
438 if args.debug:
439 print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
440 c_start = c_end
441 if cds_end < p_start:
442 continue
443 if cds_start >= p_end:
444 break
445 if strand == '+':
446 if cds_start < p_start:
447 start += p_start - cds_start
448 if cds_end > p_end:
449 end -= cds_end - p_end
450 else:
451 if cds_start < p_start:
452 end -= p_start - cds_start
453 if cds_end > p_end:
454 start += cds_end - p_end
455 c_end = c_start + abs(end - start)
456 if args.debug:
457 print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr)
458 exons.append([acc,chrom,start,end,strand,c_start,c_end])
459 te = time()
460 add_time('get_mapping',te - ts)
461 return exons
462
463 def get_cds(exons):
464 ts = time()
465 seqs = []
466 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons):
467 seq = get_sequence(chrom, min(start,end), max(start,end))
468 if strand == '-':
469 seq = reverse_complement(seq)
470 seqs.append(seq)
471 te = time()
472 add_time('get_cds',te - ts)
473 if args.debug:
474 print('CDS: %s' % str(seqs),file=sys.stderr)
475 return ''.join(seqs) if seqs else ''
476
477 def genomic_mapping_count(peptide):
478 ts = time()
479 params = {"sequence" : peptide}
480 acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)]
481 te = time()
482 add_time('PEPTIDE_ACC_QUERY',te - ts)
483 if acc_locs:
484 if len(acc_locs) == 1:
485 return 1
486 locations = set()
487 for i,acc_loc in enumerate(acc_locs):
488 (acc,pep_start,pep_end) = acc_loc
489 if acc in unmapped_accs:
490 continue
491 try:
492 add_time('GENOMIC_POS_QUERY_COUNT',1)
493 ts = time()
494 p_start = pep_start * 3
495 p_end = pep_end * 3
496 params = {"acc" : acc, "cds_offset" : p_start}
497 (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
498 params = {"acc" : acc, "cds_offset" : p_end}
499 (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone()
500 locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos))
501 te = time()
502 add_time('GENOMIC_POS_QUERY',te - ts)
503 except:
504 unmapped_accs.add(acc)
505 if args.debug:
506 print('Unmapped: %s' % acc, file=sys.stderr)
507 return len(locations)
508 return -1
509
510 def spectrum_peptide_count(spectrum_id):
511 ts = time()
512 params = {"sr_id" : spectrum_id}
513 pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0]
514 te = time()
515 add_time('SPECTRUM_PEPTIDES_QUERY',te - ts)
516 return pep_count
517
518 def get_exon_pileup(chrom,chromStart,chromEnd):
519 cols = []
520 for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd):
521 if chromStart <= pileupcolumn.reference_pos <= chromEnd:
522 bases = dict()
523 col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases}
524 for pileupread in pileupcolumn.pileups:
525 if not pileupread.is_del and not pileupread.is_refskip:
526 col['depth'] += 1
527 base = pileupread.alignment.query_sequence[pileupread.query_position]
528 if base not in bases:
529 bases[base] = 1
530 else:
531 bases[base] += 1
532 cols.append(col)
533 return cols
534
535 codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L",
536 "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S",
537 "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*",
538 "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W",
539 "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
540 "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
541 "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
542 "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R",
543 "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M",
544 "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
545 "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K",
546 "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R",
547 "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
548 "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A",
549 "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E",
550 "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",}
551
552 aa_codon_map = dict()
553 for c,a in codon_map.items():
554 aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c]
555
556 aa_na_map = dict() # m[aa]{bo : {b1 : [b3]
557 for c,a in codon_map.items():
558 if a not in aa_na_map:
559 aa_na_map[a] = dict()
560 d = aa_na_map[a]
561 for i in range(3):
562 b = c[i]
563 if i < 2:
564 if b not in d:
565 d[b] = dict() if i < 1 else set()
566 d = d[b]
567 else:
568 d.add(b)
569
570 def get_pep_codon(pileup, idx, aa, ao):
571 try:
572 ts = time()
573 bases = []
574 for i in range(3):
575 if i < ao:
576 bases.append(list(set([c[i] for c in aa_codon_map[aa]])))
577 else:
578 bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k,v): (v,k)))])
579 if args.debug:
580 print('%s' % bases,file=sys.stderr)
581 for b0 in bases[0]:
582 if b0 not in aa_na_map[aa]:
583 continue
584 for b1 in bases[1]:
585 if b1 not in aa_na_map[aa][b0]:
586 continue
587 for b2 in bases[2]:
588 if b2 in aa_na_map[aa][b0][b1]:
589 return '%s%s%s' % (b0,b1,b2)
590 te = time()
591 add_time('pep_codon',te - ts)
592 except Exception as e:
593 print("get_pep_codon: %s %s %s %s"
594 % (aa, ao, idx, pileup), file=sys.stderr)
595 raise e
596 return None
597
598 def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts,
599 spectrum,protacc,peptide,uniqueness,genomeReference,score=1000,
600 psmScore='.', fdr='.', mods='.', charge='.',
601 expMassToCharge='.', calcMassToCharge='.',
602 psmRank='.', datasetID='.', uri='.'):
603 probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \
604 (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount,
605 ','.join([str(v) for v in blockSizes]),
606 ','.join([str(v) for v in blockStarts]),
607 protacc,peptide,uniqueness, genomeReference,
608 psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri))
609
610 def get_genomic_location(exons):
611 chrom = exons[0][1]
612 strand = exons[0][4]
613 pos = [exon[2] for exon in exons] + [exon[3] for exon in exons]
614 chromStart = min(pos)
615 chromEnd = max(pos)
616 blockCount = len(exons)
617 blockSizes = [abs(exon[3] - exon[2]) for exon in exons]
618 blockStarts = [min(exon[2],exon[3]) - chromStart for exon in exons]
619 return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts)
620
621 def get_psm_modifications(peptide_ref):
622 mods = []
623 ts = time()
624 params = {"peptide_ref" : peptide_ref}
625 pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)]
626 if pepmods:
627 for (location, residue, name, modType, unimod) in pepmods:
628 mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue)))
629 te = time()
630 add_time('PEP_MODS_QUERY',te - ts)
631 return ';'.join(mods)
632
633 ## iterate through PSMs
634 psm_cursor = get_connection(args.mzsqlite).cursor()
635 ts = time()
636 psms = psm_cursor.execute(PSM_QUERY)
637 te = time()
638 add_time('PSM_QUERY',te - ts)
639 proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[]) if args.prosam or args.probam else None
640 proBED = ProBED(species=None,assembly=args.genomeReference,comments=[]) if args.probed else None
641 for i, psm in enumerate(psms):
642 probam_dict = PROBAM_DEFAULTS.copy()
643 (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm
644 scan_name = spectrum_title if spectrum_title else spectrum_id
645 if args.debug:
646 print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr)
647 exons = get_mapping(acc,pep_start,pep_end)
648 if args.debug:
649 print('%s' % exons, file=sys.stderr)
650 if not exons:
651 continue
652 mods = get_psm_modifications(pepref)
653 (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons)
654 ref_cds = get_cds(exons)
655 if args.debug:
656 print('%s' % ref_cds, file=sys.stderr)
657 ref_prot = translate(ref_cds)
658 if args.debug:
659 print('%s' % ref_prot, file=sys.stderr)
660 print('%s' % peptide, file=sys.stderr)
661 spectrum_peptides = spectrum_peptide_count(spectrum_id)
662 peptide_locations = genomic_mapping_count(peptide)
663 if args.debug:
664 print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr)
665 uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]'
666 if proBED is not None:
667 ts = time()
668 proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd,
669 '%s_%s' % (acc,scan_name),
670 1000,strand,
671 blockCount,blockSizes,blockStarts,
672 acc,peptide,uniqueness,args.genomeReference,
673 charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass,
674 mods=mods if mods else '.', psmRank=rank)
675 proBED.add_entry(proBEDEntry)
676 te = time()
677 add_time('add_probed',te - ts)
678 if proBAM is not None:
679 if len(ref_prot) != len(peptide):
680 pass
681 # continue
682 ts = time()
683 probam_dict['NH'] = peptide_locations
684 probam_dict['XO'] = uniqueness
685 probam_dict['XL'] = peptide_locations
686 probam_dict['XP'] = peptide
687 probam_dict['YP'] = acc
688 probam_dict['XC'] = charge
689 probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass)
690 probam_dict['XR'] = ref_prot # ? dbSequence
691 probam_dict['YA'] = aa_post
692 probam_dict['YB'] = aa_pre
693 probam_dict['XM'] = mods if mods else '*'
694 flag = 16 if strand == '-' else 0
695 if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="":
696 flag += 256
697 probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons])
698 ## what if structural variant, need to split into multiple entries
699 ## probam_dict['XG'] = peptide_type
700 pep_cds = classify_peptide(exons,ref_prot,peptide,ref_cds,probam_dict)
701 ## probam_dict['MD'] = peptide
702
703 ## FIX SAM sequence is forward strand
704 seq = pep_cds if strand == '+' else reverse_complement(pep_cds)
705 ## cigar based on plus strand
706 cigar = ''
707 if strand == '+':
708 blkStarts = blockStarts
709 blkSizes = blockSizes
710 else:
711 blkStarts = [x for x in reversed(blockStarts)]
712 blkSizes = [x for x in reversed(blockSizes)]
713 for j in range(blockCount):
714 if j > 0:
715 intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1])
716 if intron > 0:
717 cigar += '%dN' % intron
718 cigar += '%dM' % blkSizes[j]
719 proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1,
720 cigar=cigar,seq=seq,optional=probam_dict)
721 proBAM.add_entry(proBAMEntry)
722 te = time()
723 add_time('add_probam',te - ts)
724 if args.debug:
725 print('%s' % probam_dict, file=sys.stderr)
726 if args.limit and i >= args.limit:
727 break
728 if args.probed:
729 ts = time()
730 with open(args.probed,'w') as fh:
731 proBED.write(fh)
732 te = time()
733 add_time('write_probed',te - ts)
734 if args.prosam or args.probam:
735 samfile = args.prosam if args.prosam else 'temp.sam'
736 ts = time()
737 with open(samfile,'w') as fh:
738 proBAM.write(fh)
739 te = time()
740 add_time('write_prosam',te - ts)
741 if args.probam:
742 ts = time()
743 bamfile = args.prosam.replace('.sam','.bam')
744 pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False)
745 te = time()
746 add_time('write_probam',te - ts)
747 pysam.index(args.probam)
748
749 if args.verbose:
750 print('\n%s\n' % str(timings), file=sys.stderr)
751
752 if __name__ == "__main__":
753 __main__()