comparison mqppep_mrgfltr.py @ 0:c1403d18c189 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author eschen42
date Mon, 07 Mar 2022 19:05:01 +0000
parents
children d4d531006735
comparison
equal deleted inserted replaced
-1:000000000000 0:c1403d18c189
1 #!/usr/bin/env python
2
3 # Import the packages needed
4 import argparse
5 import os.path
6 import sys
7
8 import pandas
9 import re
10 import time
11 import sqlite3 as sql
12 from codecs import getreader as cx_getreader
13 import sys
14 import numpy as np
15
16 # for sorting list of lists using operator.itemgetter
17 import operator
18
19 # for formatting stack-trace
20 import traceback
21
22 # for Aho-Corasick search for fixed set of substrings
23 import ahocorasick
24 import operator
25 import hashlib
26
27 # for shutil.copyfile(src, dest)
28 import shutil
29
30 # global constants
31 N_A = 'N/A'
32
33 # ref: https://stackoverflow.com/a/8915613/15509512
34 # answers: "How to handle exceptions in a list comprehensions"
35 # usage:
36 # from math import log
37 # eggs = [1,3,0,3,2]
38 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
39 # producing:
40 # for <built-in function log>
41 # with args (0,)
42 # exception: math domain error
43 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
44 def catch(func, *args, handle=lambda e : e, **kwargs):
45 try:
46 return func(*args, **kwargs)
47 except Exception as e:
48 print("For %s" % str(func))
49 print(" with args %s" % str(args))
50 print(" caught exception: %s" % str(e))
51 (ty, va, tb) = sys.exc_info()
52 print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
53 exit(-1)
54 return None # was handle(e)
55
56 def ppep_join(x):
57 x = [i for i in x if N_A != i]
58 result = "%s" % ' | '.join(x)
59 if result != "":
60 return result
61 else:
62 return N_A
63
64 def melt_join(x):
65 tmp = {key.lower(): key for key in x}
66 result = "%s" % ' | '.join([tmp[key] for key in tmp])
67 return result
68
69 def __main__():
70 # Parse Command Line
71 parser = argparse.ArgumentParser(
72 description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.'
73 )
74
75 # inputs:
76 # Phosphopeptide data for experimental results, including the intensities
77 # and the mapping to kinase domains, in tabular format.
78 parser.add_argument(
79 '--phosphopeptides', '-p',
80 nargs=1,
81 required=True,
82 dest='phosphopeptides',
83 help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
84 )
85 # UniProtKB/SwissProt DB input, SQLite
86 parser.add_argument(
87 '--ppep_mapping_db', '-d',
88 nargs=1,
89 required=True,
90 dest='ppep_mapping_db',
91 help='UniProtKB/SwissProt SQLite Database'
92 )
93 #ACE # PhosPhositesPlus DB input, csv
94 #ACE parser.add_argument(
95 #ACE '--psp_regulatory_sites', '-s',
96 #ACE nargs=1,
97 #ACE required=True,
98 #ACE dest='psp_regulatory_sites_csv',
99 #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
100 #ACE )
101 # species to limit records chosed from PhosPhositesPlus
102 parser.add_argument(
103 '--species', '-x',
104 nargs=1,
105 required=False,
106 default=[],
107 dest='species',
108 help='limit PhosphoSitePlus records to indicated species (field may be empty)'
109 )
110
111 # outputs:
112 # tabular output
113 parser.add_argument(
114 '--mrgfltr_tab', '-o',
115 nargs=1,
116 required=True,
117 dest='mrgfltr_tab',
118 help='Tabular output file for results'
119 )
120 # CSV output
121 parser.add_argument(
122 '--mrgfltr_csv', '-c',
123 nargs=1,
124 required=True,
125 dest='mrgfltr_csv',
126 help='CSV output file for results'
127 )
128 # SQLite output
129 parser.add_argument(
130 '--mrgfltr_sqlite', '-S',
131 nargs=1,
132 required=True,
133 dest='mrgfltr_sqlite',
134 help='SQLite output file for results'
135 )
136
137 # "Make it so!" (parse the arguments)
138 options = parser.parse_args()
139 print("options: " + str(options))
140
141 # determine phosphopeptide ("upstream map") input tabular file access
142 if options.phosphopeptides is None:
143 exit('Argument "phosphopeptides" is required but not supplied')
144 try:
145 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
146 input_file = open(upstream_map_filename_tab, 'r')
147 input_file.close()
148 except Exception as e:
149 exit('Error parsing phosphopeptides argument: %s' % str(e))
150
151 # determine input SQLite access
152 if options.ppep_mapping_db is None:
153 exit('Argument "ppep_mapping_db" is required but not supplied')
154 try:
155 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
156 input_file = open(uniprot_sqlite, 'rb')
157 input_file.close()
158 except Exception as e:
159 exit('Error parsing ppep_mapping_db argument: %s' % str(e))
160
161 # copy input SQLite dataset to output SQLite dataset
162 if options.mrgfltr_sqlite is None:
163 exit('Argument "mrgfltr_sqlite" is required but not supplied')
164 try:
165 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
166 shutil.copyfile(uniprot_sqlite, output_sqlite)
167 except Exception as e:
168 exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e))
169
170 #ACE # determine psp_regulatory_sites CSV access
171 #ACE if options.psp_regulatory_sites_csv is None:
172 #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
173 #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
174 #ACE try:
175 #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
176 #ACE input_file = open(phosphosite_filename_csv, 'r')
177 #ACE input_file.close()
178 #ACE except Exception as e:
179 #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
180 #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)
181
182 # determine species to limit records from PSP_Regulatory_Sites
183 if options.species is None:
184 exit('Argument "species" is required (and may be empty) but not supplied')
185 try:
186 if len(options.species) > 0:
187 species = options.species[0]
188 else:
189 species = ''
190 except Exception as e:
191 exit('Error parsing species argument: %s' % str(e))
192
193 # determine tabular output destination
194 if options.mrgfltr_tab is None:
195 exit('Argument "mrgfltr_tab" is required but not supplied')
196 try:
197 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
198 output_file = open(output_filename_tab, 'w')
199 output_file.close()
200 except Exception as e:
201 exit('Error parsing mrgfltr_tab argument: %s' % str(e))
202
203 # determine CSV output destination
204 if options.mrgfltr_csv is None:
205 exit('Argument "mrgfltr_csv" is required but not supplied')
206 try:
207 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
208 output_file = open(output_filename_csv, 'w')
209 output_file.close()
210 except Exception as e:
211 exit('Error parsing mrgfltr_csv argument: %s' % str(e))
212
213
214 def mqpep_getswissprot():
215
216 ###############################################
217 # copied from Excel Output Script.ipynb BEGIN #
218 ###############################################
219
220 ########### String Constants #################
221 DEPHOSPHOPEP = 'DephosphoPep'
222 DESCRIPTION = 'Description'
223 FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
224 GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB
225 ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites
226 ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites
227 ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
228 ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites
229 ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites
230 PHOSPHOPEPTIDE = 'Phosphopeptide'
231 PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
232 PHOSPHORESIDUE = 'Phosphoresidue'
233 PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
234 SEQUENCE = 'Sequence'
235 SEQUENCE10 = 'Sequence10'
236 SEQUENCE7 = 'Sequence7'
237 SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
238 SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
239 UNIPROT_ID = 'UniProt_ID'
240 UNIPROT_SEQ_AND_META_SQL = '''
241 select Uniprot_ID, Description, Gene_Name, Sequence,
242 Organism_Name, Organism_ID, PE, SV
243 from UniProtKB
244 order by Sequence, UniProt_ID
245 '''
246 UNIPROT_UNIQUE_SEQ_SQL = '''
247 select distinct Sequence
248 from UniProtKB
249 group by Sequence
250 '''
251 PPEP_PEP_UNIPROTSEQ_SQL = '''
252 select distinct phosphopeptide, peptide, sequence
253 from uniprotkb_pep_ppep_view
254 order by sequence
255 '''
256 PPEP_MELT_SQL = '''
257 SELECT DISTINCT
258 phospho_peptide AS 'p_peptide',
259 kinase_map AS 'characterization',
260 'X' AS 'X'
261 FROM ppep_gene_site_view
262 '''
263 # CREATE TABLE PSP_Regulatory_site (
264 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
265 # domain TEXT,
266 # ON_FUNCTION TEXT,
267 # ON_PROCESS TEXT,
268 # ON_PROT_INTERACT TEXT,
269 # ON_OTHER_INTERACT TEXT,
270 # notes TEXT,
271 # organism TEXT
272 # );
273 PSP_REGSITE_SQL = '''
274 SELECT DISTINCT
275 SITE_PLUSMINUS_7AA ,
276 DOMAIN ,
277 ON_FUNCTION ,
278 ON_PROCESS ,
279 ON_PROT_INTERACT ,
280 ON_OTHER_INTERACT ,
281 NOTES ,
282 ORGANISM
283 FROM PSP_Regulatory_site
284 '''
285 PPEP_ID_SQL ='''
286 SELECT
287 id AS 'ppep_id',
288 seq AS 'ppep_seq'
289 FROM ppep
290 '''
291 MRGFLTR_DDL ='''
292 DROP VIEW IF EXISTS mrgfltr_metadata_view;
293 DROP TABLE IF EXISTS mrgfltr_metadata;
294 CREATE TABLE mrgfltr_metadata
295 ( ppep_id INTEGER REFERENCES ppep(id)
296 , Sequence10 TEXT
297 , Sequence7 TEXT
298 , GeneName TEXT
299 , Phosphoresidue TEXT
300 , UniProtID TEXT
301 , Description TEXT
302 , FunctionPhosphoresidue TEXT
303 , PutativeUpstreamDomains TEXT
304 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
305 )
306 ;
307 CREATE VIEW mrgfltr_metadata_view AS
308 SELECT DISTINCT
309 ppep.seq AS phospho_peptide
310 , Sequence10
311 , Sequence7
312 , GeneName
313 , Phosphoresidue
314 , UniProtID
315 , Description
316 , FunctionPhosphoresidue
317 , PutativeUpstreamDomains
318 FROM
319 ppep, mrgfltr_metadata
320 WHERE
321 mrgfltr_metadata.ppep_id = ppep.id
322 ORDER BY
323 ppep.seq
324 ;
325 '''
326
327 CITATION_INSERT_STMT = '''
328 INSERT INTO Citation (
329 ObjectName,
330 CitationData
331 ) VALUES (?,?)
332 '''
333 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
334 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
335
336 MRGFLTR_METADATA_COLUMNS = [
337 'ppep_id',
338 'Sequence10',
339 'Sequence7',
340 'GeneName',
341 'Phosphoresidue',
342 'UniProtID',
343 'Description',
344 'FunctionPhosphoresidue',
345 'PutativeUpstreamDomains'
346 ]
347
348 ########### String Constants (end) ############
349
350 class Error(Exception):
351 """Base class for exceptions in this module."""
352 pass
353
354 class PreconditionError(Error):
355 """Exception raised for errors in the input.
356
357 Attributes:
358 expression -- input expression in which the error occurred
359 message -- explanation of the error
360 """
361
362 def __init__(self, expression, message):
363 self.expression = expression
364 self.message = message
365
366 #start_time = time.clock() #timer
367 start_time = time.process_time() #timer
368
369 #get keys from upstream tabular file using readline()
370 # ref: https://stackoverflow.com/a/16713581/15509512
371 # answer to "Use codecs to read file with correct encoding"
372 file1_encoded = open(upstream_map_filename_tab, 'rb')
373 file1 = cx_getreader("latin-1")(file1_encoded)
374
375 count = 0
376 upstream_map_p_peptide_list = []
377 re_tab = re.compile('^[^\t]*')
378 while True:
379 count += 1
380 # Get next line from file
381 line = file1.readline()
382 # if line is empty
383 # end of file is reached
384 if not line:
385 break
386 if count > 1:
387 m = re_tab.match(line)
388 upstream_map_p_peptide_list.append(m[0])
389 file1.close()
390 file1_encoded.close()
391
392 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
393 re_phos = re.compile('p')
394 dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ]
395
396 end_time = time.process_time() #timer
397 print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)
398
399 ## ----------- Get SwissProt data from SQLite database (start) -----------
400 # build UniProt sequence LUT and list of unique SwissProt sequences
401
402 # Open SwissProt SQLite database
403 conn = sql.connect(uniprot_sqlite)
404 cur = conn.cursor()
405
406 # Set up structures to hold SwissProt data
407
408 uniprot_Sequence_List = []
409 UniProtSeqLUT = {}
410
411 # Execute query for unique seqs without fetching the results yet
412 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
413
414 while batch := uniprot_unique_seq_cur.fetchmany(size=50):
415 if None == batch:
416 # handle case where no records are returned
417 break
418 for row in batch:
419 Sequence = row[0]
420 UniProtSeqLUT[(Sequence,DESCRIPTION)] = []
421 UniProtSeqLUT[(Sequence,GENE_NAME) ] = []
422 UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = []
423 UniProtSeqLUT[ Sequence ] = []
424
425 # Execute query for seqs and metadata without fetching the results yet
426 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
427
428 while batch := uniprot_seq_and_meta.fetchmany(size=50):
429 if None == batch:
430 # handle case where no records are returned
431 break
432 for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
433 uniprot_Sequence_List.append(Sequence)
434 UniProtSeqLUT[Sequence] = Sequence
435 UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID)
436 UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name)
437 if OS != N_A:
438 Description += ' OS=' + OS
439 if OX != N_A:
440 Description += ' OX=' + str(int(OX))
441 if Gene_Name != N_A:
442 Description += ' GN=' + Gene_Name
443 if PE != N_A:
444 Description += ' PE=' + PE
445 if SV != N_A:
446 Description += ' SV=' + SV
447 UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description)
448
449 # Close SwissProt SQLite database; clean up local variables
450 conn.close()
451 Sequence = ''
452 UniProt_ID = ''
453 Description = ''
454 Gene_Name = ''
455
456 ## ----------- Get SwissProt data from SQLite database (finish) -----------
457
458 end_time = time.process_time() #timer
459 print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)
460
461 ## ----------- Get SwissProt data from SQLite database (start) -----------
462 # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
463 #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )
464
465 # Open SwissProt SQLite database
466 conn = sql.connect(uniprot_sqlite)
467 cur = conn.cursor()
468
469 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
470 DephosphoPep_UniProtSeq_LUT = {}
471
472 # Set up dictionary to accumulate results
473 PhosphoPep_UniProtSeq_LUT = {}
474
475 # Execute query for tuples without fetching the results yet
476 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
477
478 while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
479 if None == batch:
480 # handle case where no records are returned
481 break
482 for (phospho_pep, dephospho_pep, sequence) in batch:
483 #do interesting stuff here...
484 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
485 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
486 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
487 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
488 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = []
489 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = []
490 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = []
491 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = []
492 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
493
494 #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
495 if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]:
496 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence)
497 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
498 if phospho_pep != phospho_pep:
499 print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
500 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
501 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
502 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
503 r = list(zip(
504 [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]],
505 [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]],
506 [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]]
507 ))
508 # Sort by `UniProt_ID`
509 # ref: https://stackoverflow.com/a/4174955/15509512
510 r = sorted(r, key=operator.itemgetter(0))
511 # Get one tuple for each `phospho_pep`
512 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
513 for (upid, gn, desc) in r:
514 # Append pseudo-tuple per UniProt_ID but only when it is not present
515 if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]:
516 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid)
517 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc)
518 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn)
519
520 # Close SwissProt SQLite database; clean up local variables
521 conn.close()
522 # wipe local variables
523 phospho_pep = dephospho_pep = sequence = 0
524 upid = gn = desc = r = ''
525
526 ## ----------- Get SwissProt data from SQLite database (finish) -----------
527
528 end_time = time.process_time() #timer
529 print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr)
530
531 print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))
532
533 #Read in Upstream tabular file
534 # We are discarding the intensity data; so read it as text
535 upstream_data = pandas.read_table(
536 upstream_map_filename_tab,
537 dtype='str',
538 index_col = 0
539 )
540
541 end_time = time.process_time() #timer
542 print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer
543
544 upstream_data.index = upstream_map_p_peptide_list
545
546
547 end_time = time.process_time() #timer
548 print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer
549
550
551 #trim upstream_data to include only the upstream map columns
552 old_cols = upstream_data.columns.tolist()
553 i = 0
554 first_intensity = -1
555 last_intensity = -1
556 intensity_re = re.compile('Intensity.*')
557 for col_name in old_cols:
558 m = intensity_re.match(col_name)
559 if m:
560 last_intensity = i
561 if first_intensity == -1:
562 first_intensity = i
563 i += 1
564 #print('last intensity = %d' % last_intensity)
565 col_PKCalpha = last_intensity + 2
566 col_firstIntensity = first_intensity
567
568 data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1]
569
570 if upstream_data.empty:
571 print("upstream_data is empty")
572 exit(0)
573
574 data_in = upstream_data.copy(deep=True)[data_in_cols]
575
576 # Convert floating-point integers to int64 integers
577 # ref: https://stackoverflow.com/a/68497603/15509512
578 data_in[list(data_in.columns[1:])] = data_in[
579 list(data_in.columns[1:])].astype('float64').apply(np.int64)
580
581 #create another phosphopeptide column that will be used to join later;
582 # MAY need to change depending on Phosphopeptide column position
583 #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
584 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
585
586
587
588
589 end_time = time.process_time() #timer
590 print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer
591
592 # Produce a dictionary of metadata for a single phosphopeptide.
593 # This is a replacement of `UniProtInfo_subdict` in the original code.
594 def pseq_to_subdict(phospho_pep):
595 #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
596 # Strip "p" from phosphopeptide sequence
597 dephospho_pep = re_phos.sub('',phospho_pep)
598
599 # Determine number of phosphoresidues in phosphopeptide
600 numps = len(phospho_pep) - len(dephospho_pep)
601
602 # Determine location(s) of phosphoresidue(s) in phosphopeptide
603 # (used later for Phosphoresidue, Sequence7, and Sequence10)
604 ploc = [] #list of p locations
605 i = 0
606 p = phospho_pep
607 while i < numps:
608 ploc.append(p.find("p"))
609 p = p[:p.find("p")] + p[p.find("p")+1:]
610 i +=1
611
612
613 # Establish nested dictionary
614 result = {}
615 result[SEQUENCE] = []
616 result[UNIPROT_ID] = []
617 result[DESCRIPTION] = []
618 result[GENE_NAME] = []
619 result[PHOSPHORESIDUE] = []
620 result[SEQUENCE7] = []
621 result[SEQUENCE10] = []
622
623 # Add stripped sequence to dictionary
624 result[SEQUENCE].append(dephospho_pep)
625
626 # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
627 dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]
628
629 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
630 ### Caller may elect to:
631 ## try:
632 ## ...
633 ## except PreconditionError as pe:
634 ## print("'{expression}': {message}".format(
635 ## expression = pe.expression,
636 ## message = pe.message))
637 ## )
638 ## )
639 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
640 raise PreconditionError( dephospho_pep,
641 'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
642 )
643 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
644 raise PreconditionError( dephospho_pep,
645 'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
646 )
647 if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
648 raise PreconditionError( dephospho_pep,
649 "dephosphorylated phosphopeptide does not match " +
650 "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
651 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
652 )
653 result[SEQUENCE] = [dephospho_pep]
654 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
655 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
656 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
657 if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
658 raise PreconditionError( dephospho_pep,
659 'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
660 )
661 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
662 if len (UniProtSeqList) < 1:
663 print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
664 # raise PreconditionError(
665 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
666 # 'value has zero length'
667 # )
668 for UniProtSeq in UniProtSeqList:
669 i = 0
670 phosphoresidues = []
671 seq7s_set = set()
672 seq7s = []
673 seq10s_set = set()
674 seq10s = []
675 while i < len(ploc):
676 start = UniProtSeq.find(dephospho_pep)
677 psite = start+ploc[i] #location of phosphoresidue on protein sequence
678
679 #add Phosphoresidue
680 phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1)
681 phosphoresidues.append(phosphosite)
682
683 #Add Sequence7
684 if psite < 7: #phospho_pep at N terminus
685 seq7 = str(UniProtSeq)[:psite+8]
686 if seq7[psite] == "S": #if phosphosresidue is serine
687 pres = "s"
688 elif seq7[psite] == "T": #if phosphosresidue is threonine
689 pres = "t"
690 elif seq7[psite] == "Y": #if phosphoresidue is tyrosine
691 pres = "y"
692 else: # if not pSTY
693 pres = "?"
694 seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8]
695 while len(seq7) < 15: #add appropriate number of "_" to the front
696 seq7 = "_" + seq7
697 elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus
698 seq7 = str(UniProtSeq)[psite-7:]
699 if seq7[7] == "S":
700 pres = "s"
701 elif seq7[7] == "T":
702 pres = "t"
703 elif seq7[7] == "Y":
704 pres = "y"
705 else:
706 pres = "?"
707 seq7 = seq7[:7] + pres + seq7[8:]
708 while len(seq7) < 15: #add appropriate number of "_" to the back
709 seq7 = seq7 + "_"
710 else:
711 seq7 = str(UniProtSeq)[psite-7:psite+8]
712 pres = "" #phosphoresidue
713 if seq7[7] == "S": #if phosphosresidue is serine
714 pres = "s"
715 elif seq7[7] == "T": #if phosphosresidue is threonine
716 pres = "t"
717 elif seq7[7] == "Y": #if phosphoresidue is tyrosine
718 pres = "y"
719 else: # if not pSTY
720 pres = "?"
721 seq7 = seq7[:7] + pres + seq7[8:]
722 if seq7 not in seq7s_set:
723 seq7s.append(seq7)
724 seq7s_set.add(seq7)
725
726 #add Sequence10
727 if psite < 10: #phospho_pep at N terminus
728 seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11]
729 elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus
730 seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:]
731 else:
732 seq10 = str(UniProtSeq)[psite-10:psite+11]
733 seq10 = seq10[:10] + "p" + seq10[10:]
734 if seq10 not in seq10s_set:
735 seq10s.append(seq10)
736 seq10s_set.add(seq10)
737
738 i+=1
739
740 result[PHOSPHORESIDUE].append(phosphoresidues)
741 result[SEQUENCE7].append(seq7s)
742 # result[SEQUENCE10] is a list of lists of strings
743 result[SEQUENCE10].append(seq10s)
744
745
746
747
748 r = list(zip(
749 result[UNIPROT_ID],
750 result[GENE_NAME],
751 result[DESCRIPTION],
752 result[PHOSPHORESIDUE]
753 ))
754 # Sort by `UniProt_ID`
755 # ref: https://stackoverflow.com//4174955/15509512
756 s = sorted(r, key=operator.itemgetter(0))
757
758 result[UNIPROT_ID] = []
759 result[GENE_NAME] = []
760 result[DESCRIPTION] = []
761 result[PHOSPHORESIDUE] = []
762
763 for r in s:
764 result[UNIPROT_ID].append(r[0])
765 result[GENE_NAME].append(r[1])
766 result[DESCRIPTION].append(r[2])
767 result[PHOSPHORESIDUE].append(r[3])
768
769
770 #convert lists to strings in the dictionary
771 for key,value in result.items():
772 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
773 result[key] = '; '.join(map(str, value))
774 elif key in [SEQUENCE10]:
775 # result[SEQUENCE10] is a list of lists of strings
776 joined_value = ''
777 joined_set = set()
778 sep = ''
779 for valL in value:
780 # valL is a list of strings
781 for val in valL:
782 # val is a string
783 if val not in joined_set:
784 joined_set.add(val)
785 #joined_value += sep + '; '.join(map(str, val))
786 joined_value += sep + val
787 sep = '; '
788 # joined_value is a string
789 result[key] = joined_value
790
791
792 newstring = '; '.join(
793 [', '.join(l) for l in result[PHOSPHORESIDUE]]
794 )
795 ### #separate the isoforms in PHOSPHORESIDUE column with ";"
796 ### oldstring = result[PHOSPHORESIDUE]
797 ### oldlist = list(oldstring)
798 ### newstring = ""
799 ### i = 0
800 ### for e in oldlist:
801 ### if e == ";":
802 ### if numps > 1:
803 ### if i%numps:
804 ### newstring = newstring + ";"
805 ### else:
806 ### newstring = newstring + ","
807 ### else:
808 ### newstring = newstring + ";"
809 ### i +=1
810 ### else:
811 ### newstring = newstring + e
812 result[PHOSPHORESIDUE] = newstring
813
814
815 #separate sequence7's by |
816 oldstring = result[SEQUENCE7]
817 oldlist = oldstring
818 newstring = ""
819 for l in oldlist:
820 for e in l:
821 if e == ";":
822 newstring = newstring + " |"
823 elif len(newstring) > 0 and 1 > newstring.count(e):
824 newstring = newstring + " | " + e
825 elif 1 > newstring.count(e):
826 newstring = newstring + e
827 result[SEQUENCE7] = newstring
828
829
830 return [phospho_pep, result]
831
832 # Construct list of [string, dictionary] lists
833 # where the dictionary provides the SwissProt metadata for a phosphopeptide
834 result_list = [
835 catch(pseq_to_subdict,psequence)
836 for psequence
837 in data_in[PHOSPHOPEPTIDE_MATCH]
838 ]
839
840
841 end_time = time.process_time() #timer
842 print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer
843
844 # Construct dictionary from list of lists
845 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
846 UniProt_Info = {
847 result[0]:result[1]
848 for result
849 in result_list
850 if result is not None
851 }
852
853
854 end_time = time.process_time() #timer
855 print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer
856
857 #cosmetic: add N_A to phosphopeptide rows with no hits
858 p_peptide_list = []
859 for key in UniProt_Info:
860 p_peptide_list.append(key)
861 for nestedKey in UniProt_Info[key]:
862 if UniProt_Info[key][nestedKey] == "":
863 UniProt_Info[key][nestedKey] = N_A
864
865 end_time = time.process_time() #timer
866 print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer
867
868 #convert UniProt_Info dictionary to dataframe
869 uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))
870
871 #reorder columns to match expected output file
872 uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too
873
874
875 cols = uniprot_df.columns.tolist()
876 #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
877 #uniprot_df = uniprot_df[cols]
878 uniprot_df = uniprot_df[[
879 PHOSPHOPEPTIDE,
880 SEQUENCE10,
881 SEQUENCE7,
882 GENE_NAME,
883 PHOSPHORESIDUE,
884 UNIPROT_ID,
885 DESCRIPTION
886 ]]
887
888
889 end_time = time.process_time() #timer
890 print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer
891
892 #concat to split then groupby to collapse
893 seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
894 for _, row in uniprot_df.iterrows()]).reset_index()
895 seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]
896
897 # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
898 #read in PhosphoSitePlus Regulatory Sites dataset
899 #ACE if (True):
900 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
901 conn = sql.connect(uniprot_sqlite)
902 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
903 # Close SwissProt SQLite database
904 conn.close()
905 #ACE # Array indexes are zero-based
906 #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
907 #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
908 #ACE , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
909 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
910 #ACE print(regsites_df)
911 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
912 #ACE else:
913 #ACE regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
914 #ACE SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
915 #ACE #ACE # Array indexes are zero-based
916 #ACE #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
917 #ACE #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID'
918 #ACE #ACE , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA'
919 #ACE #ACE , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
920 #ACE #ACE , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES'
921 #ACE #ACE ]
922 #ACE #ACE REGSITE_COL_SITE7AA = 9
923 #ACE #ACE REGSITE_COL_PROTEIN = 1
924 #ACE #ACE REGSITE_COL_DOMAIN = 10
925 #ACE #ACE REGSITE_COL_PMIDs = 15
926
927 # ... -------------- end read PSP_Regulatory_sites ------------------------------------
928
929
930 #keep only the human entries in dataframe
931 if len(species) > 0:
932 print('Limit PhosphoSitesPlus records to species "' + species + '"')
933 regsites_df = regsites_df[regsites_df.ORGANISM == species]
934
935 #merge the seq7 df with the regsites df based off of the sequence7
936 merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
937 #ACE print(merge_df.columns.tolist()) #ACE
938
939 #after merging df, select only the columns of interest - note that PROTEIN is absent here
940 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
941 #ACE print(merge_df.columns.tolist()) #ACE
942 #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
943 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
944 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
945 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
946 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
947
948 #remove the columns that were combined
949 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]]
950
951 #ACE print(merge_df) #ACE
952 #ACE print(merge_df.columns.tolist()) #ACE
953
954 end_time = time.process_time() #timer
955 print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer
956
957 #cosmetic changes to Function Phosphoresidue column
958 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
959
960 end_time = time.process_time() #timer
961 print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer
962
963 i = 0
964 while i < len(fp_series):
965 #remove the extra ";" so that it looks more professional
966 if fp_series[i] == "; ; ; ; ": #remove ; from empty hits
967 fp_series[i] = ""
968 while fp_series[i].endswith("; "): #remove ; from the ends
969 fp_series[i] = fp_series[i][:-2]
970 while fp_series[i].startswith("; "): #remove ; from the beginning
971 fp_series[i] = fp_series[i][2:]
972 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
973 fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
974 fp_series[i] = fp_series[i].replace("; ; ", "; ")
975
976 #turn blanks into N_A to signify the info was searched for but cannot be found
977 if fp_series[i] == "":
978 fp_series[i] = N_A
979
980 i += 1
981 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
982
983 end_time = time.process_time() #timer
984 print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer
985
986 #merge uniprot df with merge df
987 uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left")
988
989 #collapse the merged df
990 uniprot_regsites_collapsed_df = pandas.DataFrame(
991 uniprot_regsites_merged_df
992 .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
993 .apply(lambda x: ppep_join(x)))
994 #.apply(lambda x: "%s" % ' | '.join(x)))
995
996
997 end_time = time.process_time() #timer
998 print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer
999
1000 uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column
1001
1002
1003 #rename columns
1004 uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']
1005
1006
1007
1008 #select columns to be merged to uniprot_df
1009 #ACE cols = regsites_df.columns.tolist()
1010 #ACE print(cols) #ACE
1011 #ACE if len(cols) > 8:
1012 #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
1013 #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
1014 #ACE print(cols) #ACE
1015 #ACE regsite_merge_df = regsites_df[cols]
1016
1017 end_time = time.process_time() #timer
1018 print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer
1019
1020 #add columns based on Sequence7 matching site_+/-7_AA
1021 uniprot_regsite_df = pandas.merge(
1022 left=uniprot_df,
1023 right=uniprot_regsites_collapsed_df,
1024 how='left',
1025 left_on=PHOSPHOPEPTIDE,
1026 right_on='ppp')
1027
1028 end_time = time.process_time() #timer
1029 print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer
1030
1031 data_in.rename(
1032 {'Protein description': PHOSPHOPEPTIDE},
1033 axis='columns',
1034 inplace=True
1035 )
1036
1037
1038
1039 sort_start_time = time.process_time() #timer
1040
1041 #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
1042 res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
1043 data_in = data_in.loc[res2]
1044
1045 end_time = time.process_time() #timer
1046 print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer
1047
1048
1049
1050 cols = [old_cols[0]] + old_cols[col_PKCalpha-1:]
1051 upstream_data = upstream_data[cols]
1052
1053 end_time = time.process_time() #timer
1054 print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer
1055
1056
1057 #### #rename upstream columns in new list
1058 #### new_cols = []
1059 #### for name in cols:
1060 #### if "_NetworKIN" in name:
1061 #### name = name.split("_")[0]
1062 #### if " motif" in name:
1063 #### name = name.split(" motif")[0]
1064 #### if " sequence " in name:
1065 #### name = name.split(" sequence")[0]
1066 #### if "_Phosida" in name:
1067 #### name = name.split("_")[0]
1068 #### if "_PhosphoSite" in name:
1069 #### name = name.split("_")[0]
1070 #### new_cols.append(name)
1071
1072 #rename upstream columns in new list
1073 def col_rename(name):
1074 if "_NetworKIN" in name:
1075 name = name.split("_")[0]
1076 if " motif" in name:
1077 name = name.split(" motif")[0]
1078 if " sequence " in name:
1079 name = name.split(" sequence")[0]
1080 if "_Phosida" in name:
1081 name = name.split("_")[0]
1082 if "_PhosphoSite" in name:
1083 name = name.split("_")[0]
1084 return name
1085
1086 new_cols = [col_rename(col) for col in cols]
1087 upstream_data.columns = new_cols
1088
1089
1090
1091 end_time = time.process_time() #timer
1092 print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer
1093
1094
1095 # Create upstream_data_cast as a copy of upstream_data
1096 # but with first column substituted by the phosphopeptide sequence
1097 upstream_data_cast = upstream_data.copy()
1098 new_cols_cast = new_cols
1099 new_cols_cast[0] = 'p_peptide'
1100 upstream_data_cast.columns = new_cols_cast
1101 upstream_data_cast['p_peptide'] = upstream_data.index
1102 new_cols_cast0 = new_cols_cast[0]
1103
1104 # --- -------------- begin read upstream_data_melt ------------------------------------
1105 ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
1106 conn = sql.connect(uniprot_sqlite)
1107 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
1108 # Close SwissProt SQLite database
1109 conn.close()
1110 upstream_data_melt = upstream_data_melt_df.copy()
1111 upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
1112 upstream_data_melt['characterization'] = [
1113 col_rename(s)
1114 for s in upstream_data_melt['characterization']
1115 ]
1116
1117 print('%0.6f upstream_data_melt_df initially has %d rows' %
1118 (end_time - start_time, len(upstream_data_melt.axes[0]))
1119 , file=sys.stderr)
1120 # ref: https://stackoverflow.com/a/27360130/15509512
1121 # e.g. df.drop(df[df.score < 50].index, inplace=True)
1122 upstream_data_melt.drop(
1123 upstream_data_melt[upstream_data_melt.X != 'X'].index,
1124 inplace = True
1125 )
1126 print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
1127 (end_time - start_time, len(upstream_data_melt.axes[0]))
1128 , file=sys.stderr)
1129 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
1130 #ACE print(upstream_data_melt)
1131 ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
1132 # ... -------------- end read upstream_data_melt --------------------------------------
1133
1134 end_time = time.process_time() #timer
1135 print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer
1136 # ... end read upstream_data_melt
1137
1138 upstream_data_melt_index = upstream_data_melt.index
1139 upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']
1140
1141 end_time = time.process_time() #timer
1142 print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer
1143
1144 upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
1145
1146 melt_dict = {}
1147 for key in upstream_map_p_peptide_list:
1148 melt_dict[key] = []
1149
1150 for el in upstream_delta_melt_LoL:
1151 (p_peptide, characterization, X) = tuple(el)
1152 if p_peptide in melt_dict:
1153 melt_dict[p_peptide].append(characterization)
1154 else:
1155 exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))
1156
1157
1158 end_time = time.process_time() #timer
1159 print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer
1160
1161
1162 # for key in upstream_map_p_peptide_list:
1163 # melt_dict[key] = ' | '.join(melt_dict[key])
1164
1165 for key in upstream_map_p_peptide_list:
1166 melt_dict[key] = melt_join(melt_dict[key])
1167
1168 end_time = time.process_time() #timer
1169 print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer
1170
1171 # map_dict is a dictionary of dictionaries
1172 map_dict = {}
1173 for key in upstream_map_p_peptide_list:
1174 map_dict[key] = {}
1175 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
1176
1177
1178 end_time = time.process_time() #timer
1179 print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer
1180
1181 #convert map_dict to dataframe
1182 map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
1183 map_df["p-peptide"] = map_df.index #make index a column too
1184 cols_map_df = map_df.columns.tolist()
1185 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
1186 map_df = map_df[cols_map_df]
1187
1188 #join map_df to uniprot_regsite_df
1189 output_df = uniprot_regsite_df.merge(
1190 map_df,
1191 how="left",
1192 left_on=PHOSPHOPEPTIDE,
1193 right_on="p-peptide")
1194
1195 output_df = output_df[
1196 [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
1197 UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
1198 PUTATIVE_UPSTREAM_DOMAINS
1199 ]
1200 ]
1201
1202
1203 # cols_output_prelim = output_df.columns.tolist()
1204 #
1205 # print("cols_output_prelim")
1206 # print(cols_output_prelim)
1207 #
1208 # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
1209 #
1210 # print("cols_output with p-peptide")
1211 # print(cols_output)
1212 #
1213 # cols_output = [col for col in cols_output if not col == "p-peptide"]
1214 #
1215 # print("cols_output")
1216 # print(cols_output)
1217 #
1218 # output_df = output_df[cols_output]
1219
1220 #join output_df back to quantitative columns in data_in df
1221 quant_cols = data_in.columns.tolist()
1222 quant_cols = quant_cols[1:]
1223 quant_data = data_in[quant_cols]
1224
1225 ## ----------- Write merge/filter metadata to SQLite database (start) -----------
1226 # Open SwissProt SQLite database
1227 conn = sql.connect(output_sqlite)
1228 cur = conn.cursor()
1229
1230 cur.executescript(MRGFLTR_DDL)
1231
1232 cur.execute(
1233 CITATION_INSERT_STMT,
1234 ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
1235 )
1236 cur.execute(
1237 CITATION_INSERT_STMT,
1238 ('mrgfltr_metadata', CITATION_INSERT_PSP)
1239 )
1240 cur.execute(
1241 CITATION_INSERT_STMT,
1242 ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
1243 )
1244 cur.execute(
1245 CITATION_INSERT_STMT,
1246 ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
1247 )
1248
1249 # Read ppep-to-sequence LUT
1250 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
1251 #ACE ppep_lut_df.info(verbose=True)
1252 # write only metadata for merged/filtered records to SQLite
1253 mrgfltr_metadata_df = output_df.copy()
1254 # replace phosphopeptide seq with ppep.id
1255 mrgfltr_metadata_df = ppep_lut_df.merge(
1256 mrgfltr_metadata_df,
1257 left_on='ppep_seq',
1258 right_on=PHOSPHOPEPTIDE,
1259 how='inner'
1260 )
1261 mrgfltr_metadata_df.drop(
1262 columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
1263 inplace=True
1264 )
1265 #rename columns
1266 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
1267 #ACE mrgfltr_metadata_df.info(verbose=True)
1268 mrgfltr_metadata_df.to_sql(
1269 'mrgfltr_metadata',
1270 con=conn,
1271 if_exists='append',
1272 index=False,
1273 method='multi'
1274 )
1275
1276 # Close SwissProt SQLite database
1277 conn.close()
1278 ## ----------- Write merge/filter metadata to SQLite database (finish) -----------
1279
1280 output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
1281 output_cols = output_df.columns.tolist()
1282 output_cols = output_cols[:-1]
1283 output_df = output_df[output_cols]
1284
1285 #cosmetic changes to Upstream column
1286 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
1287 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
1288 i = 0
1289 while i < len(us_series):
1290 #turn blanks into N_A to signify the info was searched for but cannot be found
1291 if us_series[i] == "":
1292 us_series[i] = N_A
1293 i += 1
1294 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
1295
1296 end_time = time.process_time() #timer
1297 print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer
1298
1299 (output_rows, output_cols) = output_df.shape
1300
1301 #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
1302 output_df = output_df.convert_dtypes(convert_integer=True)
1303
1304
1305 #Output onto Final CSV file
1306 output_df.to_csv(output_filename_csv, index=False)
1307 output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)
1308
1309 end_time = time.process_time() #timer
1310 print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer
1311
1312 print('{:>10} phosphopeptides written to output'.format(str(output_rows)))
1313
1314 end_time = time.process_time() #timer
1315 print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer
1316
1317
1318 #Rev. 7/1/2016
1319 #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
1320 #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
1321 #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
1322 # read from SwissProt SQLite database
1323 #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
1324
1325 #############################################
1326 # copied from Excel Output Script.ipynb END #
1327 #############################################
1328
1329 try:
1330 catch(mqpep_getswissprot,)
1331 exit(0)
1332 except Exception as e:
1333 exit('Internal error running mqpep_getswissprot(): %s' % (e))
1334
1335 if __name__ == "__main__":
1336 __main__()
1337