Mercurial > repos > jjohnson > mzsqlite_psm_align
comparison mzsqlite_psm_align.py @ 0:492f98d89e26 draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/mzsqlite_psm_align commit 88e2fb9c31fbd687a0956924a870137d1fb9bee3-dirty
| author | jjohnson |
|---|---|
| date | Tue, 10 Apr 2018 09:57:49 -0400 |
| parents | |
| children | af5f22779a8e |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:492f98d89e26 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 # | |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2017, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | |
| 9 # | |
| 10 # James E Johnson | |
| 11 # | |
| 12 #------------------------------------------------------------------------------ | |
| 13 """ | |
| 14 | |
| 15 from __future__ import print_function | |
| 16 | |
| 17 import argparse | |
| 18 import re | |
| 19 import sys | |
| 20 import sqlite3 as sqlite | |
| 21 from time import time | |
| 22 | |
| 23 | |
| 24 from Bio.Seq import reverse_complement, translate | |
| 25 | |
| 26 ## from bedutil import bed_from_line | |
| 27 | |
| 28 ## import digest | |
| 29 | |
| 30 ## from ensembl_rest import get_cdna | |
| 31 | |
| 32 import pysam | |
| 33 from twobitreader import TwoBitFile | |
| 34 | |
| 35 from profmt import PROBAM_DEFAULTS,ProBAM,ProBAMEntry,ProBED,ProBEDEntry | |
| 36 | |
| 37 """ | |
| 38 inputs | |
| 39 proBed | |
| 40 mzIdentML | |
| 41 twobit | |
| 42 bam | |
| 43 | |
| 44 inputs | |
| 45 mz.sqlite | |
| 46 genomic.mapping | |
| 47 bam | |
| 48 | |
| 49 CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT); | |
| 50 CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER); | |
| 51 CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT); | |
| 52 CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER); | |
| 53 | |
| 54 SELECT | |
| 55 FROM spectrum_identification_result_items siri | |
| 56 JOIN peptide_evidence pe ON siri.peptide_ref = pe.peptide_ref | |
| 57 JOIN db_sequence dbs ON pe.dBSequence_ref = | |
| 58 WHERE pe.isDecoy = 'false' | |
| 59 | |
| 60 SELECT | |
| 61 psm.spectrumID, | |
| 62 psm.spectrumTitle as "QNAME", | |
| 63 psm.id, | |
| 64 psm.sequence, | |
| 65 psm.passThreshold, | |
| 66 psm."PeptideShaker PSM confidence", | |
| 67 psm."PeptideShaker PSM score", | |
| 68 pe.start, | |
| 69 pe.end, | |
| 70 pe.pre, | |
| 71 pe.post, | |
| 72 pe.dBSequence_ref | |
| 73 FROM psm_entries psm | |
| 74 JOIN peptide_evidence pe ON psm.id = pe.peptide_ref | |
| 75 JOIN db_sequence dbs ON pe.dBSequence_ref = dbs.accession | |
| 76 WHERE pe.isDecoy = 'false' | |
| 77 AND pe.peptide_ref = 'SFYPEEVSSMVITK_15.99491461956-ATAA-10' | |
| 78 ORDER BY psm.spectrumID | |
| 79 | |
| 80 | |
| 81 proBed to SQLite | |
| 82 or index proBed | |
| 83 | |
| 84 for psm in psms: | |
| 85 beds = get_bed(protein_acc) | |
| 86 cds = '' | |
| 87 for bed in beds: | |
| 88 bed.seq = twobit[bed.chrom][bed.start,bed.end] | |
| 89 cds += bed.get_cds() | |
| 90 refprot = translate(cds) | |
| 91 | |
| 92 def read_bed(path): | |
| 93 pdict = dict() | |
| 94 prog = re.compile('^([^\t]+\t[^\t]+\t[^\t]+\t([^\t]+)\t.*)$') | |
| 95 with open(path,'r') as bed: | |
| 96 for i,line in enumerate(bed): | |
| 97 m = prog.match(line) | |
| 98 prot = m.groups()[1] | |
| 99 pdict[prot] = m.groups()[0] | |
| 100 return pdict | |
| 101 | |
| 102 from pyteomics import mzid | |
| 103 with mzid.reader(args.mzid) as mzidrdr: | |
| 104 for psm in mzidrdr: | |
| 105 SpectrumIdentificationItems = psm['SpectrumIdentificationItem'] | |
| 106 for SpectrumIdentificationItem in SpectrumIdentificationItems: | |
| 107 PeptideEvidenceRef = SpectrumIdentificationItem['PeptideEvidenceRef'] | |
| 108 PepEvs = [r['peptideEvidence_ref'] for r in PeptideEvidenceRef] | |
| 109 for PepEv in PepEvs: | |
| 110 PepRef = mzidrdr[PepEv] | |
| 111 dBSequence_ref = PepRef['dBSequence_ref'] | |
| 112 | |
| 113 spectrum_peptides = count(distinct sequence) FROM psm_entries WHERE | |
| 114 | |
| 115 1 QNAME String Query template NAME Spectrum name * psm.spectrumTitle | |
| 116 2 FLAG Int Bitwise FLAG Bitwise FLAG map.strand | |
| 117 3 RNAME String Reference sequence NAME Reference sequence NAME * map.chrom | |
| 118 4 POS Int 1-based leftmost mapping POSition 1-based leftmost mapping POSition map.start | |
| 119 5 -MAPQ Int MAPping Quality (Phred-scaled) - 255 | |
| 120 6 CIGAR String Extended CIGAR string (operations: MIDN) CIGAR string * map.cigar | |
| 121 7 -RNEXT String Mate Reference NAME ('=' if same as RNAME) - * | |
| 122 8 -PNEXT Int 1-Based leftmost Mate POSition - 0 | |
| 123 9 TLEN Int observed Template LENgth - 0 | |
| 124 10 SEQ String segment SEQuence Coding sequence * genomic.seq | |
| 125 11 -QUAL String Query QUALity (ASCII-33=Phred base quality) - * | |
| 126 | |
| 127 1 QNAME psm.spectrumTitle | |
| 128 2 FLAG map.strand | |
| 129 3 RNAME map.chrom | |
| 130 4 POS map.start | |
| 131 5 -MAPQ | |
| 132 6 CIGAR map.cigar | |
| 133 7 -RNEXT | |
| 134 8 -PNEXT | |
| 135 9 -TLEN | |
| 136 10 SEQ genomic.seq | |
| 137 11 -QUAL | |
| 138 | |
| 139 'NH' : 'i' genomic_locations | |
| 140 'XO' : 'Z' | |
| 141 'XL' : 'i' spectrum_peptides | |
| 142 'XP' : 'Z' psm.sequence | |
| 143 'YP' : 'Z' peptide_evidence.dBSequence_ref | |
| 144 'XF' : 'Z' reading_frame | |
| 145 'XI' : 'f' | |
| 146 'XB' : 'Z' | |
| 147 'XR' : 'Z' | |
| 148 'YB' : 'Z' | |
| 149 'YA' : 'Z' | |
| 150 'XS' : 'f' | |
| 151 'XQ' : 'f' | |
| 152 'XC' : 'i' | |
| 153 'XA' : 'i' | |
| 154 'XM' : 'Z' | |
| 155 'XN' : 'i' | |
| 156 'XT' : 'i' | |
| 157 'XE' : 'i' | |
| 158 'XG' : 'A' | |
| 159 'XU' : 'Z' | |
| 160 | |
| 161 'NH' : 'i', #number of genomic locations to which the peptide sequence maps | |
| 162 'XO' : 'Z', #uniqueness of the peptide mapping | |
| 163 'XL' : 'i', #number of peptides to which the spectrum maps | |
| 164 'XP' : 'Z', #peptide sequence | |
| 165 'YP' : 'Z', #Protein accession ID from the original search result | |
| 166 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2) | |
| 167 'XI' : 'f', #Peptide intensity | |
| 168 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;). | |
| 169 'XR' : 'Z', #reference peptide sequence | |
| 170 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before). | |
| 171 'YA' : 'Z', #Following amino acids (2 AA, A stands for after). | |
| 172 'XS' : 'f', #PSM score | |
| 173 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP). | |
| 174 'XC' : 'i', #peptide charge | |
| 175 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown; | |
| 176 'XM' : 'Z', #Modifications | |
| 177 'XN' : 'i', #Number of missed cleavages in the peptide (XP) | |
| 178 'XT' : 'i', #Enzyme specificity | |
| 179 'XE' : 'i', #Enzyme used in the experiment | |
| 180 'XG' : 'A', #Peptide type | |
| 181 'XU' : 'Z', #URI | |
| 182 | |
| 183 | |
| 184 Datatype Field name Description Origin | |
| 185 RNAME string chrom map.chrom Reference sequence chromosome | |
| 186 POS uint chromStart map Start position of the first DNA base | |
| 187 uint chromEnd map End position of the last DNA base | |
| 188 QNAME string name spectrum.title Unique name | |
| 189 uint score Score | |
| 190 char[1] strand + or - for strand | |
| 191 uint thickStart Coding region start | |
| 192 uint thickEnd Coding region end | |
| 193 uint reserved Always 0 | |
| 194 int blockCount Number of blocks | |
| 195 int[blockCount] blockSizes Block sizes | |
| 196 int[blockCount] chromStarts Block starts | |
| 197 YP string proteinAccession Protein accession number | |
| 198 XP string peptideSequence Peptide sequence | |
| 199 XO string uniqueness Peptide uniqueness | |
| 200 string genomeReferenceVersion Genome reference version number | |
| 201 XS double psmScore PSM score | |
| 202 XQ double fdr Estimated global false discovery rate | |
| 203 XM string modifications Post-translational modifications | |
| 204 XC int charge Charge value | |
| 205 XB double expMassToCharge Experimental mass to charge value | |
| 206 XB double calcMassToCharge Calculated mass to charge value | |
| 207 int psmRank Peptide-Spectrum Match rank. | |
| 208 string datasetID Dataset Identifier | |
| 209 string uri Uniform Resource Identifier | |
| 210 | |
| 211 XG | |
| 212 N Normal peptide. The peptide sequence is contained in the reference protein sequence. | |
| 213 V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. | |
| 214 W Indel peptide. An insertion or deletion is present as compared to the reference. | |
| 215 J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. | |
| 216 A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. | |
| 217 M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. | |
| 218 C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). | |
| 219 E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. | |
| 220 B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. | |
| 221 O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. | |
| 222 T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. | |
| 223 R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. | |
| 224 I Intron peptide. A peptide that is located in an intronic region of the reference isoform. | |
| 225 G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. | |
| 226 D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. | |
| 227 U Unmapped peptide. A peptide that could not be mapped to a reference sequence. | |
| 228 X Unknown. | |
| 229 | |
| 230 | |
| 231 | |
| 232 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + cds_offset - cds_start ELSE end - cds_offset - cds_start END as "pos" | |
| 233 FROM feature_cds_map | |
| 234 WHERE name = acc_name AND cds_offset >= cds_start AND cds_offset < cds_end | |
| 235 | |
| 236 sqlite> select * from feature_cds_map WHERE name = 'pre_STRG.28813.4_j_5350_5470'; | |
| 237 pre_STRG.28813.4_j_5350_5470|chr7|5074750|5074857|+|0|107 | |
| 238 pre_STRG.28813.4_j_5350_5470|chr7|5075140|5075153|+|107|120 | |
| 239 | |
| 240 | |
| 241 | |
| 242 SELECT | |
| 243 pe.isDecoy, | |
| 244 pe.dBSequence_ref, | |
| 245 pe.start, | |
| 246 pe.end, | |
| 247 sr.spectrumTitle, | |
| 248 si.rank, | |
| 249 si.chargeState, | |
| 250 si.calculatedMassToCharge, | |
| 251 si.experimentalMassToCharge | |
| 252 FROM spectrum_identification_results sr | |
| 253 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
| 254 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
| 255 WHERE si.id = 'SII_7389_1' | |
| 256 ORDER BY si.rank; | |
| 257 | |
| 258 SELECT pe.isDecoy, pe.dBSequence_ref, pe.start, pe.end, sr.spectrumTitle, si.rank, si.chargeState, si.calculatedMassToCharge, si.experimentalMassToCharge | |
| 259 FROM spectrum_identification_results sr | |
| 260 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
| 261 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
| 262 WHERE si.id = 'SII_7389_1' | |
| 263 ORDER BY si.rank; | |
| 264 | |
| 265 | |
| 266 | |
| 267 CREATE TABLE spectrum_identification_results (id TEXT PRIMARY KEY, spectraData_ref TEXT, spectrumID TEXT, spectrumTitle TEXT); | |
| 268 CREATE TABLE spectrum_identification_result_items (id TEXT PRIMARY KEY, spectrum_identification_result_ref TEXT, passThreshold TEXT, rank INTEGER, peptide_ref TEXT, calculatedMassToCharge FLOAT, experimentalMassToCharge FLOAT, chargeState INTEGER); | |
| 269 CREATE TABLE peptide_evidence (id TEXT PRIMARY KEY, dBSequence_ref TEXT, isDecoy TEXT, pre TEXT, post TEXT, start INTEGER, end INTEGER, peptide_ref TEXT); | |
| 270 CREATE TABLE db_sequence (id TEXT PRIMARY KEY , accession TEXT, searchDatabase_ref TEXT, description TEXT, sequence TEXT, length INTEGER); | |
| 271 | |
| 272 {'write_probed': 0.08575654029846191, 'PSM_QUERY': 4.704349040985107, 'get_cds': 0.21015286445617676, 'SPECTRUM_PEPTIDES_QUERY': 32.92655086517334, 'PEPTIDE_ACC_QUERY': 425.11919951438904, 'get_mapping': 1.5911591053009033, 'GENOMIC_POS_QUERY': 10.909647226333618} | |
| 273 """ | |
| 274 | |
| 275 | |
| 276 | |
| 277 def regex_match(expr, item): | |
| 278 return re.match(expr, item) is not None | |
| 279 | |
| 280 | |
| 281 def regex_search(expr, item): | |
| 282 return re.search(expr, item) is not None | |
| 283 | |
| 284 | |
| 285 def regex_sub(expr, replace, item): | |
| 286 return re.sub(expr, replace, item) | |
| 287 | |
| 288 | |
| 289 def get_connection(sqlitedb_path, addfunctions=True): | |
| 290 conn = sqlite.connect(sqlitedb_path) | |
| 291 if addfunctions: | |
| 292 conn.create_function("re_match", 2, regex_match) | |
| 293 conn.create_function("re_search", 2, regex_search) | |
| 294 conn.create_function("re_sub", 3, regex_sub) | |
| 295 return conn | |
| 296 | |
| 297 PSM_QUERY = """\ | |
| 298 SELECT | |
| 299 pe.dBSequence_ref, | |
| 300 pe.start, | |
| 301 pe.end, | |
| 302 pe.pre, | |
| 303 pe.post, | |
| 304 pep.sequence, | |
| 305 sr.id, | |
| 306 sr.spectrumTitle, | |
| 307 si.rank, | |
| 308 si.chargeState, | |
| 309 si.calculatedMassToCharge, | |
| 310 si.experimentalMassToCharge, | |
| 311 si.peptide_ref | |
| 312 FROM spectrum_identification_results sr | |
| 313 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
| 314 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
| 315 JOIN peptides pep ON pe.peptide_ref = pep.id | |
| 316 WHERE pe.isDecoy = 'false' | |
| 317 ORDER BY sr.spectrumTitle,si.rank | |
| 318 """ | |
| 319 | |
| 320 PEP_MODS_QUERY = """\ | |
| 321 SELECT location, residue, name, modType, '' as "unimod" | |
| 322 FROM peptide_modifications | |
| 323 WHERE peptide_ref = :peptide_ref | |
| 324 ORDER BY location, modType, name | |
| 325 """ | |
| 326 | |
| 327 #number of peptides to which the spectrum maps | |
| 328 ## spectrum_identification_results => spectrum_identification_result_items -> peptide_evidence | |
| 329 SPECTRUM_PEPTIDES_QUERY = """\ | |
| 330 SELECT count(distinct pep.sequence) | |
| 331 FROM spectrum_identification_results sr | |
| 332 JOIN spectrum_identification_result_items si ON si.spectrum_identification_result_ref = sr.id | |
| 333 JOIN peptide_evidence pe ON si.peptide_ref = pe.peptide_ref | |
| 334 JOIN peptides pep ON pe.peptide_ref = pep.id | |
| 335 WHERE pe.isDecoy = 'false' | |
| 336 AND sr.id = :sr_id | |
| 337 GROUP BY sr.id | |
| 338 """ | |
| 339 #number of genomic locations to which the peptide sequence maps | |
| 340 #uniqueness of the peptide mapping | |
| 341 ## peptides => peptide_evidence -> db_sequence -> location | |
| 342 ## proteins_by_peptide | |
| 343 PEPTIDE_ACC_QUERY = """\ | |
| 344 SELECT | |
| 345 pe.dBSequence_ref, | |
| 346 pe.start, | |
| 347 pe.end | |
| 348 FROM peptide_evidence pe | |
| 349 JOIN peptides pep ON pe.peptide_ref = pep.id | |
| 350 WHERE pe.isDecoy = 'false' | |
| 351 AND pep.sequence = :sequence | |
| 352 """ | |
| 353 | |
| 354 MAP_QUERY = """\ | |
| 355 SELECT distinct * | |
| 356 FROM feature_cds_map | |
| 357 WHERE name = :acc | |
| 358 AND :p_start < cds_end | |
| 359 AND :p_end >= cds_start | |
| 360 ORDER BY name,cds_start,cds_end | |
| 361 """ | |
| 362 | |
| 363 GENOMIC_POS_QUERY = """\ | |
| 364 SELECT distinct chrom, CASE WHEN strand = '+' THEN start + :cds_offset - cds_start ELSE end - :cds_offset - cds_start END as "pos" | |
| 365 FROM feature_cds_map | |
| 366 WHERE name = :acc | |
| 367 AND :cds_offset >= cds_start | |
| 368 AND :cds_offset < cds_end | |
| 369 """ | |
| 370 | |
| 371 FEATURE_CONTAIN_QUERY = """\ | |
| 372 SELECT id,seqid,start,end,featuretype,strand,frame | |
| 373 FROM features | |
| 374 WHERE seqid = :seqid AND start <= :start AND end >= :end | |
| 375 AND strand = :strand AND featuretype = :ftype | |
| 376 """ | |
| 377 | |
| 378 FEATURE_OVERLAP_QUERY = """\ | |
| 379 SELECT id,seqid,start,end,featuretype,strand,frame | |
| 380 FROM features | |
| 381 WHERE seqid = :seqid | |
| 382 AND :end >= start AND :start <= end | |
| 383 AND strand = :strand AND featuretype = :ftype | |
| 384 """ | |
| 385 | |
| 386 FEATURE_ANY_QUERY = """\ | |
| 387 SELECT id,seqid,start,end,featuretype,strand,CAST(frame AS INTEGER) as "frame", CAST(frame AS INTEGER)==:frame as "in_frame" | |
| 388 FROM features | |
| 389 WHERE seqid = :seqid | |
| 390 AND :end >= start AND :start <= end | |
| 391 AND featuretype in ('CDS','five_prime_utr','three_prime_utr','transcript') | |
| 392 ORDER BY strand == :strand DESC, featuretype, | |
| 393 start <= :start AND end >= :end DESC, | |
| 394 in_frame DESC, end - start, start DESC, end | |
| 395 """ | |
| 396 | |
| 397 def __main__(): | |
| 398 parser = argparse.ArgumentParser( | |
| 399 description='Generate proBED and proBAM from mz.sqlite') | |
| 400 parser.add_argument('mzsqlite', help="mz.sqlite converted from mzIdentML") | |
| 401 parser.add_argument('genomic_mapping_sqlite', help="genomic_mapping.sqlite with feature_cds_map table") | |
| 402 parser.add_argument( | |
| 403 '-R', '--genomeReference', default='Unknown', | |
| 404 help='Genome reference sequence in 2bit format') | |
| 405 parser.add_argument( | |
| 406 '-t', '--twobit', default=None, | |
| 407 help='Genome reference sequence in 2bit format') | |
| 408 parser.add_argument( | |
| 409 '-r', '--reads_bam', default=None, | |
| 410 help='reads alignment bam path') | |
| 411 parser.add_argument( | |
| 412 '-g', '--gffutils_file', default=None, | |
| 413 help='gffutils GTF sqlite DB') | |
| 414 parser.add_argument( | |
| 415 '-B', '--probed', default=None, | |
| 416 help='proBed path') | |
| 417 parser.add_argument( | |
| 418 '-s', '--prosam', default=None, | |
| 419 help='proSAM path') | |
| 420 parser.add_argument( | |
| 421 '-b', '--probam', default=None, | |
| 422 help='proBAM path') | |
| 423 parser.add_argument( | |
| 424 '-l', '--limit', type=int, default=None, | |
| 425 help='limit numbers of PSMs for testing') | |
| 426 parser.add_argument('-v', '--verbose', action='store_true', help='Verbose') | |
| 427 parser.add_argument('-d', '--debug', action='store_true', help='Debug') | |
| 428 args = parser.parse_args() | |
| 429 | |
| 430 def get_sequence(chrom, start, end): | |
| 431 if twobit: | |
| 432 if chrom in twobit and 0 <= start < end < len(twobit[chrom]): | |
| 433 return twobit[chrom][start:end] | |
| 434 contig = chrom[3:] if chrom.startswith('chr') else 'chr%s' % chrom | |
| 435 if contig in twobit and 0 <= start < end < len(twobit[contig]): | |
| 436 return twobit[contig][start:end] | |
| 437 return '' | |
| 438 return None | |
| 439 | |
| 440 twobit = TwoBitFile(args.twobit) if args.twobit else None | |
| 441 samfile = pysam.AlignmentFile(args.reads_bam, "rb" ) if args.reads_bam else None | |
| 442 seqlens = twobit.sequence_sizes() | |
| 443 | |
| 444 probed = open(args.probed,'w') if args.probed else sys.stdout | |
| 445 | |
| 446 gff_cursor = get_connection(args.gffutils_file).cursor() if args.gffutils_file else None | |
| 447 map_cursor = get_connection(args.genomic_mapping_sqlite).cursor() | |
| 448 mz_cursor = get_connection(args.mzsqlite_file).cursor() | |
| 449 | |
| 450 unmapped_accs = set() | |
| 451 timings = dict() | |
| 452 def add_time(name,elapsed): | |
| 453 if name in timings: | |
| 454 timings[name] += elapsed | |
| 455 else: | |
| 456 timings[name] = elapsed | |
| 457 | |
| 458 XG_TYPES = ['N','V','W','J','A','M','C','E','B','O','T','R','I','G','D','U','X','*'] | |
| 459 FT_TYPES = ['CDS','five_prime_utr','three_prime_utr','transcript'] | |
| 460 def get_peptide_type(exons): | |
| 461 ## XG classify peptide | |
| 462 ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. | |
| 463 ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. | |
| 464 ## W Indel peptide. An insertion or deletion is present as compared to the reference. | |
| 465 ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. | |
| 466 ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. | |
| 467 ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. | |
| 468 ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). | |
| 469 ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. | |
| 470 ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. | |
| 471 ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. | |
| 472 ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. | |
| 473 ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. | |
| 474 ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. | |
| 475 ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. | |
| 476 ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. | |
| 477 ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. | |
| 478 ## X Unknown. | |
| 479 | |
| 480 peptide_type = '*' | |
| 481 if gff_cursor: | |
| 482 ts = time() | |
| 483 etypes = ['*'] * len(exons) | |
| 484 efeatures = [None] * len(exons) | |
| 485 if args.debug: | |
| 486 print('exons:%d\t%s'% (len(exons),etypes),file=sys.stderr) | |
| 487 for i,exon in enumerate(exons): | |
| 488 (acc,gc,gs,ge,st,cs,ce) = exon | |
| 489 fr = cs % 3 | |
| 490 if args.debug: | |
| 491 print('exon:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (acc,gc,gs,ge,st,cs,ce,fr),file=sys.stderr) | |
| 492 ft_params = {"seqid" : str(gc).replace('chr',''), "start" : gs, "end" : ge, 'strand' : st, 'frame' : fr, 'ftype' : 'CDS'} | |
| 493 features = [f for f in gff_cursor.execute(FEATURE_ANY_QUERY,ft_params)] | |
| 494 efeatures[i] = features | |
| 495 for i,exon in enumerate(exons): | |
| 496 (acc,gc,gs,ge,st,cs,ce) = exon | |
| 497 for f in efeatures[i]: | |
| 498 (id,seqid,start,end,featuretype,strand,frame,in_frame) = f | |
| 499 if args.debug: | |
| 500 print('feat:\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (id,seqid,start,end,featuretype,strand,frame,in_frame),file=sys.stderr) | |
| 501 if strand == st: | |
| 502 if start <= gs and ge <= end: | |
| 503 if in_frame: | |
| 504 etypes[i] = 'N' | |
| 505 break | |
| 506 elif XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): | |
| 507 etypes[i] = 'O' | |
| 508 break | |
| 509 else: | |
| 510 if XG_TYPES.index('O') < XG_TYPES.index(etypes[i]): | |
| 511 etypes[i] = 'O' | |
| 512 peptide_type = etypes[i] | |
| 513 te = time() | |
| 514 add_time('pep_type',te - ts) | |
| 515 return peptide_type | |
| 516 def classify_exon(exon,exons,features): | |
| 517 ## N Normal peptide. The peptide sequence is contained in the reference protein sequence. | |
| 518 # 1 exon, contained, in_frame | |
| 519 # 2+ exons, contained, in_frame, on_exon_boundary | |
| 520 ## V Variant peptide. A single amino acid variation (SAV) is present as compared to the reference. | |
| 521 # 1 exon, contained, in_frame, AA_mismatch | |
| 522 # 2+ exons, contained, in_frame, on_exon_boundary, AA_mismatch | |
| 523 ## W Indel peptide. An insertion or deletion is present as compared to the reference. | |
| 524 # 1 exon, contained, in_frame, AA_mismatch | |
| 525 # 2+ exons, contained, in_frame, on_exon_boundary or off by 3, AA_mismatch | |
| 526 ## J Novel junction peptide. A peptide that spans a novel exon-intron boundary as compared to the reference. | |
| 527 # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons | |
| 528 ## A Alternative junction peptide. A peptide that spans a non-canonical exon-intron boundary as compared to the reference. | |
| 529 # 2+ exons, contained, on_exon_boundary, same transcript, non adjacent exons | |
| 530 ## M Novel exon peptide. A peptide that resides in a novel exon that is not present in the reference. | |
| 531 ## C Cross junction peptide. A peptide that spans through a splice site (partly exonic - partly intronic). | |
| 532 # 1 exon overlaps but not contained | |
| 533 ## E Extension peptide. A peptide that points to a non-canonical N-terminal protein extension. | |
| 534 ## B 3' UTR peptide. A peptide that maps to the 3' UTR region from the reference. | |
| 535 # exon overlaps a three_prime_utr | |
| 536 ## O Out-of-frame peptide. A peptide that is translated from an alternative frame as compared to the reference. | |
| 537 # exon contained but not in_frame | |
| 538 ## T Truncation peptide. A peptide that points to a non-canonical N-terminal protein truncation. | |
| 539 ## R Reverse strand peptide. A peptide that is derived from translation of the reverse strand of the reference. | |
| 540 ## I Intron peptide. A peptide that is located in an intronic region of the reference isoform. | |
| 541 # exon contained in transcript, not not overlapping any exon | |
| 542 ## G Gene fusion peptide. An (onco-) peptide that spans two exons of different genes, through gene-fusion. | |
| 543 # exonis from different seqs, strand, or transcripts | |
| 544 ## D Decoy peptide. A peptide that maps to a decoy sequence from the MS-based search strategy. | |
| 545 ## U Unmapped peptide. A peptide that could not be mapped to a reference sequence. | |
| 546 ## X Unknown. | |
| 547 return '*' | |
| 548 | |
| 549 def get_variant_cds(exons,ref_prot,peptide,pep_cds): | |
| 550 if ref_prot != peptide and samfile: | |
| 551 try: | |
| 552 if args.debug: | |
| 553 print('name: %s \nref: %s\npep: %s\n' % (scan_name,ref_prot,peptide), file=sys.stderr) | |
| 554 ts = time() | |
| 555 for exon in exons: | |
| 556 (acc,chrom,start,end,strand,c_start,c_end) = exon | |
| 557 a_start = c_start / 3 * 3 | |
| 558 a_end = c_end / 3 * 3 | |
| 559 if ref_prot[a_start:a_end] != peptide[a_start:a_end]: | |
| 560 pileup = get_exon_pileup(chrom,start,end) | |
| 561 for i, (bi,ai,ao) in enumerate([(i,i / 3, i % 3) for i in range(c_start, c_end)]): | |
| 562 if ao == 0 or i == 0: | |
| 563 if ref_prot[ai] != peptide[ai]: | |
| 564 codon = get_pep_codon(pileup, bi - c_start, peptide[ai], ao) | |
| 565 if args.debug: | |
| 566 print('%d %d %d %s : %s %s %s' % (bi,ai,ao, peptide[ai], str(pep_cds[:bi]), str(codon), str(pep_cds[bi+3:])), file=sys.stderr) | |
| 567 if codon: | |
| 568 pep_cds = pep_cds[:bi] + codon + pep_cds[bi+3:] | |
| 569 te = time() | |
| 570 add_time('var_cds',te - ts) | |
| 571 except Exception as e: | |
| 572 print('name: %s \nref: %s\npep: %s\n%s\n' % (scan_name,ref_prot,peptide,e), file=sys.stderr) | |
| 573 return pep_cds | |
| 574 | |
| 575 def get_mapping(acc,pep_start,pep_end): | |
| 576 ts = time() | |
| 577 p_start = (pep_start - 1) * 3 | |
| 578 p_end = pep_end * 3 | |
| 579 map_params = {"acc" : acc, "p_start" : p_start, "p_end" : p_end} | |
| 580 if args.debug: | |
| 581 print('%s' % map_params, file=sys.stderr) | |
| 582 locs = [l for l in map_cursor.execute(MAP_QUERY,map_params)] | |
| 583 exons = [] | |
| 584 ## ========= pep | |
| 585 ## --- continue | |
| 586 ## --- trim | |
| 587 ## --- copy | |
| 588 ## --- trim | |
| 589 ## --- break | |
| 590 c_end = 0 | |
| 591 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(locs): | |
| 592 if args.debug: | |
| 593 print('Prot: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr) | |
| 594 c_start = c_end | |
| 595 if cds_end < p_start: | |
| 596 continue | |
| 597 if cds_start >= p_end: | |
| 598 break | |
| 599 if strand == '+': | |
| 600 if cds_start < p_start: | |
| 601 start += p_start - cds_start | |
| 602 if cds_end > p_end: | |
| 603 end -= cds_end - p_end | |
| 604 else: | |
| 605 if cds_start < p_start: | |
| 606 end -= p_start - cds_start | |
| 607 if cds_end > p_end: | |
| 608 start += cds_end - p_end | |
| 609 c_end = c_start + abs(end - start) | |
| 610 if args.debug: | |
| 611 print('Pep: %s\t%s:%d-%d\t%s\t%d\t%d' % (acc,chrom,start,end,strand,cds_start,cds_end),file=sys.stderr) | |
| 612 exons.append([acc,chrom,start,end,strand,c_start,c_end]) | |
| 613 te = time() | |
| 614 add_time('get_mapping',te - ts) | |
| 615 return exons | |
| 616 | |
| 617 def get_cds(exons): | |
| 618 ts = time() | |
| 619 seqs = [] | |
| 620 for i, (acc,chrom,start,end,strand,cds_start,cds_end) in enumerate(exons): | |
| 621 seq = get_sequence(chrom, min(start,end), max(start,end)) | |
| 622 if strand == '-': | |
| 623 seq = reverse_complement(seq) | |
| 624 seqs.append(seq) | |
| 625 te = time() | |
| 626 add_time('get_cds',te - ts) | |
| 627 if args.debug: | |
| 628 print('CDS: %s' % str(seqs),file=sys.stderr) | |
| 629 return ''.join(seqs) if seqs else '' | |
| 630 | |
| 631 def genomic_mapping_count(peptide): | |
| 632 ts = time() | |
| 633 params = {"sequence" : peptide} | |
| 634 acc_locs = [l for l in mz_cursor.execute(PEPTIDE_ACC_QUERY,params)] | |
| 635 te = time() | |
| 636 add_time('PEPTIDE_ACC_QUERY',te - ts) | |
| 637 if acc_locs: | |
| 638 if len(acc_locs) == 1: | |
| 639 return 1 | |
| 640 locations = set() | |
| 641 for i,acc_loc in enumerate(acc_locs): | |
| 642 (acc,pep_start,pep_end) = acc_loc | |
| 643 if acc in unmapped_accs: | |
| 644 continue | |
| 645 try: | |
| 646 add_time('GENOMIC_POS_QUERY_COUNT',1) | |
| 647 ts = time() | |
| 648 p_start = pep_start * 3 | |
| 649 p_end = pep_end * 3 | |
| 650 params = {"acc" : acc, "cds_offset" : p_start} | |
| 651 (start_chrom,start_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() | |
| 652 params = {"acc" : acc, "cds_offset" : p_end} | |
| 653 (end_chrom,end_pos) = map_cursor.execute(GENOMIC_POS_QUERY, params).fetchone() | |
| 654 locations.add('%s:%s-%s:%s' % (start_chrom,start_pos,end_chrom,end_pos)) | |
| 655 te = time() | |
| 656 add_time('GENOMIC_POS_QUERY',te - ts) | |
| 657 except: | |
| 658 unmapped_accs.add(acc) | |
| 659 print('Unmapped: %s' % acc, file=sys.stderr) | |
| 660 return len(locations) | |
| 661 return -1 | |
| 662 | |
| 663 def spectrum_peptide_count(spectrum_id): | |
| 664 ts = time() | |
| 665 params = {"sr_id" : spectrum_id} | |
| 666 pep_count = mz_cursor.execute(SPECTRUM_PEPTIDES_QUERY, params).fetchone()[0] | |
| 667 te = time() | |
| 668 add_time('SPECTRUM_PEPTIDES_QUERY',te - ts) | |
| 669 return pep_count | |
| 670 | |
| 671 def get_exon_pileup(chrom,chromStart,chromEnd): | |
| 672 cols = [] | |
| 673 for pileupcolumn in samfile.pileup(chrom, chromStart, chromEnd): | |
| 674 if chromStart <= pileupcolumn.reference_pos <= chromEnd: | |
| 675 bases = dict() | |
| 676 col = {'depth' : 0, 'cov' : pileupcolumn.nsegments, 'pos': pileupcolumn.reference_pos, 'bases' : bases} | |
| 677 for pileupread in pileupcolumn.pileups: | |
| 678 if not pileupread.is_del and not pileupread.is_refskip: | |
| 679 col['depth'] += 1 | |
| 680 base = pileupread.alignment.query_sequence[pileupread.query_position] | |
| 681 if base not in bases: | |
| 682 bases[base] = 1 | |
| 683 else: | |
| 684 bases[base] += 1 | |
| 685 cols.append(col) | |
| 686 return cols | |
| 687 | |
| 688 codon_map = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L", | |
| 689 "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S", | |
| 690 "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*", | |
| 691 "TGT":"C", "TGC":"C", "TGA":"*", "TGG":"W", | |
| 692 "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L", | |
| 693 "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P", | |
| 694 "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q", | |
| 695 "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R", | |
| 696 "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M", | |
| 697 "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T", | |
| 698 "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K", | |
| 699 "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R", | |
| 700 "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V", | |
| 701 "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A", | |
| 702 "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E", | |
| 703 "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G",} | |
| 704 | |
| 705 aa_codon_map = dict() | |
| 706 for c,a in codon_map.items(): | |
| 707 aa_codon_map[a] = [c] if a not in aa_codon_map else aa_codon_map[a] + [c] | |
| 708 | |
| 709 aa_na_map = dict() # m[aa]{bo : {b1 : [b3] | |
| 710 for c,a in codon_map.items(): | |
| 711 if a not in aa_na_map: | |
| 712 aa_na_map[a] = dict() | |
| 713 d = aa_na_map[a] | |
| 714 for i in range(3): | |
| 715 b = c[i] | |
| 716 if i < 2: | |
| 717 if b not in d: | |
| 718 d[b] = dict() if i < 1 else set() | |
| 719 d = d[b] | |
| 720 else: | |
| 721 d.add(b) | |
| 722 | |
| 723 def get_pep_codon(pileup, idx, aa, ao): | |
| 724 try: | |
| 725 ts = time() | |
| 726 bases = [] | |
| 727 for i in range(3): | |
| 728 if i < ao: | |
| 729 bases.append(list(set([c[i] for c in aa_codon_map[aa]]))) | |
| 730 else: | |
| 731 bases.append([b for b, cnt in reversed(sorted(pileup[idx + i]['bases'].iteritems(), key=lambda (k,v): (v,k)))]) | |
| 732 print('%s' % bases) | |
| 733 for b0 in bases[0]: | |
| 734 if b0 not in aa_na_map[aa]: | |
| 735 continue | |
| 736 for b1 in bases[1]: | |
| 737 if b1 not in aa_na_map[aa][b0]: | |
| 738 continue | |
| 739 for b2 in bases[2]: | |
| 740 if b2 in aa_na_map[aa][b0][b1]: | |
| 741 return '%s%s%s' % (b0,b1,b2) | |
| 742 te = time() | |
| 743 add_time('pep_codon',te - ts) | |
| 744 except Exception as e: | |
| 745 print("get_pep_codon: %s %s %s %s" | |
| 746 % (aa, ao, idx, pileup), file=sys.stderr) | |
| 747 raise e | |
| 748 return None | |
| 749 | |
| 750 def write_probed(chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts, | |
| 751 spectrum,protacc,peptide,uniqueness,genomeReference,score=1000, | |
| 752 psmScore='.', fdr='.', mods='.', charge='.', | |
| 753 expMassToCharge='.', calcMassToCharge='.', | |
| 754 psmRank='.', datasetID='.', uri='.'): | |
| 755 probed.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % \ | |
| 756 (chrom,chromStart,chromEnd,spectrum,score,strand,chromStart,chromEnd,'0',blockCount, | |
| 757 ','.join([str(v) for v in blockSizes]), | |
| 758 ','.join([str(v) for v in blockStarts]), | |
| 759 protacc,peptide,uniqueness, genomeReference, | |
| 760 psmScore, fdr, mods, charge, expMassToCharge, calcMassToCharge, psmRank, datasetID, uri)) | |
| 761 | |
| 762 def get_genomic_location(exons): | |
| 763 chrom = exons[0][1] | |
| 764 strand = exons[0][4] | |
| 765 pos = [exon[2] for exon in exons] + [exon[3] for exon in exons] | |
| 766 chromStart = min(pos) | |
| 767 chromEnd = max(pos) | |
| 768 blockCount = len(exons) | |
| 769 blockSizes = [abs(exon[3] - exon[2]) for exon in exons] | |
| 770 blockStarts = [min(exon[2],exon[3]) - chromStart for exon in exons] | |
| 771 return (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) | |
| 772 | |
| 773 def get_psm_modifications(peptide_ref): | |
| 774 mods = [] | |
| 775 ts = time() | |
| 776 params = {"peptide_ref" : peptide_ref} | |
| 777 pepmods = [m for m in mz_cursor.execute(PEP_MODS_QUERY, params)] | |
| 778 if pepmods: | |
| 779 for (location, residue, name, modType, unimod) in pepmods: | |
| 780 mods.append('%s-%s' % (location, unimod if unimod else '%s%s' % (name,residue))) | |
| 781 te = time() | |
| 782 add_time('PEP_MODS_QUERY',te - ts) | |
| 783 return ';'.join(mods) | |
| 784 | |
| 785 | |
| 786 """ | |
| 787 QNAME | |
| 788 FLAG | |
| 789 RNAME | |
| 790 POS | |
| 791 CIGAR | |
| 792 SEQ | |
| 793 'NH' : 'i', #number of genomic locations to which the peptide sequence maps | |
| 794 'XO' : 'Z', #uniqueness of the peptide mapping | |
| 795 'XL' : 'i', #number of peptides to which the spectrum maps | |
| 796 'XP' : 'Z', #peptide sequence | |
| 797 'YP' : 'Z', #Protein accession ID from the original search result | |
| 798 'XF' : 'Z', #Reading frame of the peptide (0, 1, 2) | |
| 799 'XI' : 'f', #Peptide intensity | |
| 800 'XB' : 'Z', #massdiff; experimental mass; calculated mass massdiff can be calculated by experimental mass - calculated mass. If any number is unavailable, the value should be left blank (such as 0.01;;). | |
| 801 'XR' : 'Z', #reference peptide sequence | |
| 802 'YB' : 'Z', #Preceding amino acids (2 AA, B stands for before). | |
| 803 'YA' : 'Z', #Following amino acids (2 AA, A stands for after). | |
| 804 'XS' : 'f', #PSM score | |
| 805 'XQ' : 'f', #PSM FDR (i.e. q-value or 1-PEP). | |
| 806 'XC' : 'i', #peptide charge | |
| 807 'XA' : 'i', #Whether the peptide is annotated 0:yes; 1:parially unknown; 2:totally unknown; | |
| 808 'XM' : 'Z', #Modifications | |
| 809 'XN' : 'i', #Number of missed cleavages in the peptide (XP) | |
| 810 'XT' : 'i', #Enzyme specificity | |
| 811 'XE' : 'i', #Enzyme used in the experiment | |
| 812 'XG' : 'A', #Peptide type | |
| 813 'XU' : 'Z', #URI | |
| 814 """ | |
| 815 psm_cursor = get_connection(args.mzsqlite_file).cursor() | |
| 816 ts = time() | |
| 817 psms = psm_cursor.execute(PSM_QUERY) | |
| 818 te = time() | |
| 819 add_time('PSM_QUERY',te - ts) | |
| 820 proBAM = ProBAM(species=None,assembly=args.genomeReference,seqlens=seqlens,comments=[]) | |
| 821 proBED = ProBED(species=None,assembly=args.genomeReference,comments=[]) | |
| 822 for i, psm in enumerate(psms): | |
| 823 probam_dict = PROBAM_DEFAULTS.copy() | |
| 824 (acc,pep_start,pep_end,aa_pre,aa_post,peptide,spectrum_id,spectrum_title,rank,charge,calcmass,exprmass,pepref) = psm | |
| 825 scan_name = spectrum_title if spectrum_title else spectrum_id | |
| 826 if args.debug: | |
| 827 print('\nPSM: %d\t%s' % (i, '\t'.join([str(v) for v in (acc,pep_start,pep_end,peptide,spectrum_id,scan_name,rank,charge,calcmass,exprmass)])), file=sys.stderr) | |
| 828 exons = get_mapping(acc,pep_start,pep_end) | |
| 829 if args.debug: | |
| 830 print('%s' % exons, file=sys.stderr) | |
| 831 if not exons: | |
| 832 continue | |
| 833 mods = get_psm_modifications(pepref) | |
| 834 (chrom,chromStart,chromEnd,strand,blockCount,blockSizes,blockStarts) = get_genomic_location(exons) | |
| 835 ref_cds = get_cds(exons) | |
| 836 if args.debug: | |
| 837 print('%s' % ref_cds, file=sys.stderr) | |
| 838 ref_prot = translate(ref_cds) | |
| 839 if args.debug: | |
| 840 print('%s' % ref_prot, file=sys.stderr) | |
| 841 print('%s' % peptide, file=sys.stderr) | |
| 842 spectrum_peptides = spectrum_peptide_count(spectrum_id) | |
| 843 peptide_locations = genomic_mapping_count(peptide) | |
| 844 if args.debug: | |
| 845 print('spectrum_peptide_count: %d\tpeptide_location_count: %d' % (spectrum_peptides,peptide_locations), file=sys.stderr) | |
| 846 uniqueness = 'unique' if peptide_locations == 1 else 'not-unique[unknown]' | |
| 847 ts = time() | |
| 848 proBEDEntry = ProBEDEntry(chrom,chromStart,chromEnd, | |
| 849 '%s_%s' % (acc,scan_name), | |
| 850 1000,strand, | |
| 851 blockCount,blockSizes,blockStarts, | |
| 852 acc,peptide,uniqueness,args.genomeReference, | |
| 853 charge=charge,expMassToCharge=exprmass,calcMassToCharge=calcmass, | |
| 854 mods=mods if mods else '.', psmRank=rank) | |
| 855 proBED.add_entry(proBEDEntry) | |
| 856 te = time() | |
| 857 add_time('add_probed',te - ts) | |
| 858 if len(ref_prot) != len(peptide): | |
| 859 continue | |
| 860 ts = time() | |
| 861 probam_dict['NH'] = peptide_locations | |
| 862 probam_dict['XO'] = uniqueness | |
| 863 probam_dict['XL'] = peptide_locations | |
| 864 probam_dict['XP'] = peptide | |
| 865 probam_dict['YP'] = acc | |
| 866 probam_dict['XC'] = charge | |
| 867 probam_dict['XB'] = '%f;%f;%f' % (exprmass - calcmass, exprmass, calcmass) | |
| 868 probam_dict['XR'] = ref_prot # ? dbSequence | |
| 869 probam_dict['YA'] = aa_post | |
| 870 probam_dict['YB'] = aa_pre | |
| 871 probam_dict['XM'] = mods if mods else '*' | |
| 872 flag = 16 if strand == '-' else 0 | |
| 873 if str(rank)!=str(1) and rank!='*' and rank!=[] and rank!="": | |
| 874 flag += 256 | |
| 875 probam_dict['XF'] = ','.join([str(e[2] % 3) for e in exons]) | |
| 876 ## check for variation from ref_cds | |
| 877 pep_cds = get_variant_cds(exons,ref_prot,peptide,ref_cds) | |
| 878 peptide_type = '*' | |
| 879 ## XG classify peptide | |
| 880 probam_dict['XG'] = get_peptide_type(exons) | |
| 881 ## probam_dict['MD'] = peptide | |
| 882 | |
| 883 ## FIX SAM sequence is forward strand | |
| 884 seq = pep_cds if strand == '+' else reverse_complement(pep_cds) | |
| 885 ## cigar based on plus strand | |
| 886 cigar = '' | |
| 887 if strand == '+': | |
| 888 blkStarts = blockStarts | |
| 889 blkSizes = blockSizes | |
| 890 else: | |
| 891 blkStarts = [x for x in reversed(blockStarts)] | |
| 892 blkSizes = [x for x in reversed(blockSizes)] | |
| 893 for j in range(blockCount): | |
| 894 if j > 0: | |
| 895 intron = blkStarts[j] - (blkStarts[j-1] + blkSizes[j-1]) | |
| 896 if intron > 0: | |
| 897 cigar += '%dN' % intron | |
| 898 cigar += '%dM' % blkSizes[j] | |
| 899 ## Mods TODO | |
| 900 proBAMEntry = ProBAMEntry(qname=scan_name, flag=flag, rname=chrom, pos=chromStart+1, | |
| 901 cigar=cigar,seq=seq,optional=probam_dict) | |
| 902 proBAM.add_entry(proBAMEntry) | |
| 903 te = time() | |
| 904 add_time('add_probam',te - ts) | |
| 905 | |
| 906 if args.debug: | |
| 907 print('%s' % probam_dict, file=sys.stderr) | |
| 908 | |
| 909 if args.limit and i >= args.limit: | |
| 910 break | |
| 911 if args.probed: | |
| 912 ts = time() | |
| 913 with open(args.probed,'w') as fh: | |
| 914 proBED.write(fh) | |
| 915 te = time() | |
| 916 add_time('write_probed',te - ts) | |
| 917 if args.prosam or args.probam: | |
| 918 samfile = args.prosam if args.prosam else 'temp.sam' | |
| 919 ts = time() | |
| 920 with open(samfile,'w') as fh: | |
| 921 proBAM.write(fh) | |
| 922 te = time() | |
| 923 add_time('write_prosam',te - ts) | |
| 924 if args.probam: | |
| 925 ts = time() | |
| 926 bamfile = args.prosam.replace('.sam','.bam') | |
| 927 pysam.view(samfile, '-b', '-o', args.probam, catch_stdout=False) | |
| 928 te = time() | |
| 929 add_time('write_probam',te - ts) | |
| 930 pysam.index(args.probam) | |
| 931 | |
| 932 print('\n%s\n' % str(timings), file=sys.stderr) | |
| 933 | |
| 934 if __name__ == "__main__": | |
| 935 __main__() |
