mqppep_mrgfltr.py @ 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author:   eschen42
date:     Mon, 07 Mar 2022 19:05:01 +0000
children: d4d531006735
#!/usr/bin/env python

# Import the packages needed
import argparse
import os.path
import sys

import pandas
import re
import time
import sqlite3 as sql
from codecs import getreader as cx_getreader
import numpy as np

# for sorting list of lists using operator.itemgetter
import operator

# for formatting stack-trace
import traceback

# for Aho-Corasick search for fixed set of substrings
import ahocorasick
import hashlib

# for shutil.copyfile(src, dest)
import shutil

# global constants
N_A = 'N/A'

# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
#   usage:
#     from math import log
#     eggs = [1,3,0,3,2]
#     print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
#   producing:
#     for <built-in function log>
#       with args (0,)
#       exception: math domain error
#     [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print("  with args %s" % str(args))
        print("  caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
        exit(-1)
        return None  # was handle(e)

def ppep_join(x):
    x = [i for i in x if N_A != i]
    result = "%s" % ' | '.join(x)
    if result != "":
        return result
    else:
        return N_A


def melt_join(x):
    tmp = {key.lower(): key for key in x}
    result = "%s" % ' | '.join([tmp[key] for key in tmp])
    return result

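# Illustrative behavior of the two join helpers above (hypothetical inputs,
# not taken from the original script):
#   ppep_join(['a', 'N/A', 'b'])  ->  'a | b'   (N/A entries are dropped)
#   ppep_join(['N/A'])            ->  'N/A'     (an empty join falls back to N_A)
#   melt_join(['Abc', 'ABC'])     ->  'ABC'     (case-insensitive de-duplication,
#                                                keeping the spelling seen last)
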
def __main__():
    # Parse Command Line
    parser = argparse.ArgumentParser(
        description='Phosphoproteomic Enrichment Pipeline Merge and Filter.'
    )

    # inputs:
    #   Phosphopeptide data for experimental results, including the intensities
    #   and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        '--phosphopeptides', '-p',
        nargs=1,
        required=True,
        dest='phosphopeptides',
        help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
    )
    # UniProtKB/SwissProt DB input, SQLite
    parser.add_argument(
        '--ppep_mapping_db', '-d',
        nargs=1,
        required=True,
        dest='ppep_mapping_db',
        help='UniProtKB/SwissProt SQLite Database'
    )
    #ACE # PhosphoSitePlus DB input, csv
    #ACE parser.add_argument(
    #ACE     '--psp_regulatory_sites', '-s',
    #ACE     nargs=1,
    #ACE     required=True,
    #ACE     dest='psp_regulatory_sites_csv',
    #ACE     help='PhosphoSitePlus Regulatory Sites, in CSV format including three-line header'
    #ACE )
    # species to limit records chosen from PhosphoSitePlus
    parser.add_argument(
        '--species', '-x',
        nargs=1,
        required=False,
        default=[],
        dest='species',
        help='limit PhosphoSitePlus records to indicated species (field may be empty)'
    )

    # outputs:
    # tabular output
    parser.add_argument(
        '--mrgfltr_tab', '-o',
        nargs=1,
        required=True,
        dest='mrgfltr_tab',
        help='Tabular output file for results'
    )
    # CSV output
    parser.add_argument(
        '--mrgfltr_csv', '-c',
        nargs=1,
        required=True,
        dest='mrgfltr_csv',
        help='CSV output file for results'
    )
    # SQLite output
    parser.add_argument(
        '--mrgfltr_sqlite', '-S',
        nargs=1,
        required=True,
        dest='mrgfltr_sqlite',
        help='SQLite output file for results'
    )

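    # Example invocation (hypothetical file names, shown for orientation only;
    # in practice the Galaxy tool wrapper supplies these arguments):
    #   python mqppep_mrgfltr.py \
    #     --phosphopeptides ppep_intensities.tsv \
    #     --ppep_mapping_db ppep_mapping_sqlite.sqlite \
    #     --species human \
    #     --mrgfltr_tab mrgfltr.tsv \
    #     --mrgfltr_csv mrgfltr.csv \
    #     --mrgfltr_sqlite mrgfltr.sqlite
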
137 # "Make it so!" (parse the arguments) | |
138 options = parser.parse_args() | |
139 print("options: " + str(options)) | |
140 | |
141 # determine phosphopeptide ("upstream map") input tabular file access | |
142 if options.phosphopeptides is None: | |
143 exit('Argument "phosphopeptides" is required but not supplied') | |
144 try: | |
145 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) | |
146 input_file = open(upstream_map_filename_tab, 'r') | |
147 input_file.close() | |
148 except Exception as e: | |
149 exit('Error parsing phosphopeptides argument: %s' % str(e)) | |
150 | |
151 # determine input SQLite access | |
152 if options.ppep_mapping_db is None: | |
153 exit('Argument "ppep_mapping_db" is required but not supplied') | |
154 try: | |
155 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) | |
156 input_file = open(uniprot_sqlite, 'rb') | |
157 input_file.close() | |
158 except Exception as e: | |
159 exit('Error parsing ppep_mapping_db argument: %s' % str(e)) | |
160 | |
161 # copy input SQLite dataset to output SQLite dataset | |
162 if options.mrgfltr_sqlite is None: | |
163 exit('Argument "mrgfltr_sqlite" is required but not supplied') | |
164 try: | |
165 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) | |
166 shutil.copyfile(uniprot_sqlite, output_sqlite) | |
167 except Exception as e: | |
168 exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) | |
169 | |
170 #ACE # determine psp_regulatory_sites CSV access | |
171 #ACE if options.psp_regulatory_sites_csv is None: | |
172 #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied') | |
173 #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv) | |
174 #ACE try: | |
175 #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0]) | |
176 #ACE input_file = open(phosphosite_filename_csv, 'r') | |
177 #ACE input_file.close() | |
178 #ACE except Exception as e: | |
179 #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e)) | |
180 #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv) | |
181 | |
182 # determine species to limit records from PSP_Regulatory_Sites | |
183 if options.species is None: | |
184 exit('Argument "species" is required (and may be empty) but not supplied') | |
185 try: | |
186 if len(options.species) > 0: | |
187 species = options.species[0] | |
188 else: | |
189 species = '' | |
190 except Exception as e: | |
191 exit('Error parsing species argument: %s' % str(e)) | |
192 | |
193 # determine tabular output destination | |
194 if options.mrgfltr_tab is None: | |
195 exit('Argument "mrgfltr_tab" is required but not supplied') | |
196 try: | |
197 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) | |
198 output_file = open(output_filename_tab, 'w') | |
199 output_file.close() | |
200 except Exception as e: | |
201 exit('Error parsing mrgfltr_tab argument: %s' % str(e)) | |
202 | |
203 # determine CSV output destination | |
204 if options.mrgfltr_csv is None: | |
205 exit('Argument "mrgfltr_csv" is required but not supplied') | |
206 try: | |
207 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) | |
208 output_file = open(output_filename_csv, 'w') | |
209 output_file.close() | |
210 except Exception as e: | |
211 exit('Error parsing mrgfltr_csv argument: %s' % str(e)) | |
212 | |
213 | |
    def mqpep_getswissprot():

        ###############################################
        # copied from Excel Output Script.ipynb BEGIN #
        ###############################################

        ########### String Constants #################
        DEPHOSPHOPEP = 'DephosphoPep'
        DESCRIPTION = 'Description'
        FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
        GENE_NAME = 'Gene_Name'  # Gene Name from UniProtKB
        ON_FUNCTION = 'ON_FUNCTION'  # ON_FUNCTION column from PSP_Regulatory_Sites
        ON_NOTES = 'NOTES'  # NOTES column from PSP_Regulatory_Sites
        ON_OTHER_INTERACT = 'ON_OTHER_INTERACT'  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
        ON_PROCESS = 'ON_PROCESS'  # ON_PROCESS column from PSP_Regulatory_Sites
        ON_PROT_INTERACT = 'ON_PROT_INTERACT'  # ON_PROT_INTERACT column from PSP_Regulatory_Sites
        PHOSPHOPEPTIDE = 'Phosphopeptide'
        PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
        PHOSPHORESIDUE = 'Phosphoresidue'
        PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
        SEQUENCE = 'Sequence'
        SEQUENCE10 = 'Sequence10'
        SEQUENCE7 = 'Sequence7'
        SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
        SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
        UNIPROT_ID = 'UniProt_ID'
        UNIPROT_SEQ_AND_META_SQL = '''
            select Uniprot_ID, Description, Gene_Name, Sequence,
                   Organism_Name, Organism_ID, PE, SV
              from UniProtKB
              order by Sequence, UniProt_ID
        '''
        UNIPROT_UNIQUE_SEQ_SQL = '''
            select distinct Sequence
              from UniProtKB
              group by Sequence
        '''
        PPEP_PEP_UNIPROTSEQ_SQL = '''
            select distinct phosphopeptide, peptide, sequence
              from uniprotkb_pep_ppep_view
              order by sequence
        '''
        PPEP_MELT_SQL = '''
            SELECT DISTINCT
                phospho_peptide AS 'p_peptide',
                kinase_map AS 'characterization',
                'X' AS 'X'
            FROM ppep_gene_site_view
        '''
        # CREATE TABLE PSP_Regulatory_site (
        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
        #   domain             TEXT,
        #   ON_FUNCTION        TEXT,
        #   ON_PROCESS         TEXT,
        #   ON_PROT_INTERACT   TEXT,
        #   ON_OTHER_INTERACT  TEXT,
        #   notes              TEXT,
        #   organism           TEXT
        # );
        PSP_REGSITE_SQL = '''
            SELECT DISTINCT
                SITE_PLUSMINUS_7AA,
                DOMAIN,
                ON_FUNCTION,
                ON_PROCESS,
                ON_PROT_INTERACT,
                ON_OTHER_INTERACT,
                NOTES,
                ORGANISM
            FROM PSP_Regulatory_site
        '''
        PPEP_ID_SQL = '''
            SELECT
                id AS 'ppep_id',
                seq AS 'ppep_seq'
            FROM ppep
        '''
        MRGFLTR_DDL = '''
            DROP VIEW IF EXISTS mrgfltr_metadata_view;
            DROP TABLE IF EXISTS mrgfltr_metadata;
            CREATE TABLE mrgfltr_metadata
              ( ppep_id                 INTEGER REFERENCES ppep(id)
              , Sequence10              TEXT
              , Sequence7               TEXT
              , GeneName                TEXT
              , Phosphoresidue          TEXT
              , UniProtID               TEXT
              , Description             TEXT
              , FunctionPhosphoresidue  TEXT
              , PutativeUpstreamDomains TEXT
              , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
              )
            ;
            CREATE VIEW mrgfltr_metadata_view AS
              SELECT DISTINCT
                  ppep.seq AS phospho_peptide
                , Sequence10
                , Sequence7
                , GeneName
                , Phosphoresidue
                , UniProtID
                , Description
                , FunctionPhosphoresidue
                , PutativeUpstreamDomains
              FROM
                  ppep, mrgfltr_metadata
              WHERE
                  mrgfltr_metadata.ppep_id = ppep.id
              ORDER BY
                  ppep.seq
            ;
        '''

        CITATION_INSERT_STMT = '''
            INSERT INTO Citation (
                ObjectName,
                CitationData
            ) VALUES (?,?)
        '''
        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'

        MRGFLTR_METADATA_COLUMNS = [
            'ppep_id',
            'Sequence10',
            'Sequence7',
            'GeneName',
            'Phosphoresidue',
            'UniProtID',
            'Description',
            'FunctionPhosphoresidue',
            'PutativeUpstreamDomains'
        ]
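        # These names correspond to the columns created by MRGFLTR_DDL above;
        # mrgfltr_metadata_df.to_sql(..., if_exists='append') later inserts
        # into the mrgfltr_metadata table by column name.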

        ########### String Constants (end) ############

        class Error(Exception):
            """Base class for exceptions in this module."""
            pass

        class PreconditionError(Error):
            """Exception raised for errors in the input.

            Attributes:
                expression -- input expression in which the error occurred
                message -- explanation of the error
            """

            def __init__(self, expression, message):
                self.expression = expression
                self.message = message

        # start_time = time.clock()  # timer
        start_time = time.process_time()  # timer

        # get keys from upstream tabular file using readline()
        #   ref: https://stackoverflow.com/a/16713581/15509512
        #        answer to "Use codecs to read file with correct encoding"
        file1_encoded = open(upstream_map_filename_tab, 'rb')
        file1 = cx_getreader("latin-1")(file1_encoded)

        count = 0
        upstream_map_p_peptide_list = []
        re_tab = re.compile('^[^\t]*')
        while True:
            count += 1
            # Get next line from file
            line = file1.readline()
            # if line is empty
            #   end of file is reached
            if not line:
                break
            if count > 1:
                m = re_tab.match(line)
                upstream_map_p_peptide_list.append(m[0])
        file1.close()
        file1_encoded.close()

        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
        re_phos = re.compile('p')
        dephospho_peptide_list = [re_phos.sub('', foo) for foo in upstream_map_p_peptide_list]

        end_time = time.process_time()  # timer
        print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)

        ## ----------- Get SwissProt data from SQLite database (start) -----------
        # build UniProt sequence LUT and list of unique SwissProt sequences

        # Open SwissProt SQLite database
        conn = sql.connect(uniprot_sqlite)
        cur = conn.cursor()

        # Set up structures to hold SwissProt data

        uniprot_Sequence_List = []
        UniProtSeqLUT = {}

        # Execute query for unique seqs without fetching the results yet
        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)

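        # Cursor.fetchmany() returns an empty list once all rows have been
        # consumed, which is what ends each "while batch := ..." loop below;
        # the "if None == batch" tests are purely defensive.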
        while batch := uniprot_unique_seq_cur.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for row in batch:
                Sequence = row[0]
                UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
                UniProtSeqLUT[(Sequence, GENE_NAME)] = []
                UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
                UniProtSeqLUT[Sequence] = []

        # Execute query for seqs and metadata without fetching the results yet
        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)

        while batch := uniprot_seq_and_meta.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
                uniprot_Sequence_List.append(Sequence)
                UniProtSeqLUT[Sequence] = Sequence
                UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
                UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
                if OS != N_A:
                    Description += ' OS=' + OS
                if OX != N_A:
                    Description += ' OX=' + str(int(OX))
                if Gene_Name != N_A:
                    Description += ' GN=' + Gene_Name
                if PE != N_A:
                    Description += ' PE=' + PE
                if SV != N_A:
                    Description += ' SV=' + SV
                UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)

        # Close SwissProt SQLite database; clean up local variables
        conn.close()
        Sequence = ''
        UniProt_ID = ''
        Description = ''
        Gene_Name = ''

        ## ----------- Get SwissProt data from SQLite database (finish) -----------

        end_time = time.process_time()  # timer
        print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)

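        # UniProtSeqLUT (built above) uses two kinds of keys: a bare protein
        # Sequence (mapping to itself) and (Sequence, field) tuples mapping to
        # lists of UniProt_ID, Gene_Name, or Description values for that sequence.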
        ## ----------- Get SwissProt data from SQLite database (start) -----------
        # build DephosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
        #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )

        # Open SwissProt SQLite database
        conn = sql.connect(uniprot_sqlite)
        cur = conn.cursor()

        # Set up dictionary to aggregate results for phosphopeptides corresponding to a dephosphopeptide
        DephosphoPep_UniProtSeq_LUT = {}

        # Set up dictionary to accumulate results
        PhosphoPep_UniProtSeq_LUT = {}

        # Execute query for tuples without fetching the results yet
        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)

        while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for (phospho_pep, dephospho_pep, sequence) in batch:
                # do interesting stuff here...
                PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
                PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] = dephospho_pep
                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)] = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)] = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)] = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)

                #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
                if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)]:
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)].append(sequence)
                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
                    if phospho_pep != phospho_pep:
                        print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                        PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
                        PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] = dephospho_pep
                    r = list(zip(
                        [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
                        [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
                        [s for s in UniProtSeqLUT[(sequence, DESCRIPTION)]]
                    ))
                    # Sort by `UniProt_ID`
                    #   ref: https://stackoverflow.com/a/4174955/15509512
                    r = sorted(r, key=operator.itemgetter(0))
                    # Get one tuple for each `phospho_pep`
                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
                    for (upid, gn, desc) in r:
                        # Append pseudo-tuple per UniProt_ID but only when it is not present
                        if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)]:
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)].append(upid)
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)].append(desc)
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)].append(gn)

        # Close SwissProt SQLite database; clean up local variables
        conn.close()
        # wipe local variables
        phospho_pep = dephospho_pep = sequence = 0
        upid = gn = desc = r = ''

        ## ----------- Get SwissProt data from SQLite database (finish) -----------

        end_time = time.process_time()  # timer
        print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time, upstream_map_filename_tab), file=sys.stderr)

        print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))

        # Read in Upstream tabular file
        #   We are discarding the intensity data; so read it as text
        upstream_data = pandas.read_table(
            upstream_map_filename_tab,
            dtype='str',
            index_col=0
        )

        end_time = time.process_time()  # timer
        print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr)  # timer

        upstream_data.index = upstream_map_p_peptide_list

        end_time = time.process_time()  # timer
        print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr)  # timer

        # trim upstream_data to include only the upstream map columns
        old_cols = upstream_data.columns.tolist()
        i = 0
        first_intensity = -1
        last_intensity = -1
        intensity_re = re.compile('Intensity.*')
        for col_name in old_cols:
            m = intensity_re.match(col_name)
            if m:
                last_intensity = i
                if first_intensity == -1:
                    first_intensity = i
            i += 1
        # print('last intensity = %d' % last_intensity)
        col_PKCalpha = last_intensity + 2
        col_firstIntensity = first_intensity

        data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity + 1]

        if upstream_data.empty:
            print("upstream_data is empty")
            exit(0)

        data_in = upstream_data.copy(deep=True)[data_in_cols]

        # Convert floating-point integers to int64 integers
        #   ref: https://stackoverflow.com/a/68497603/15509512
        data_in[list(data_in.columns[1:])] = data_in[
            list(data_in.columns[1:])].astype('float64').apply(np.int64)

        # create another phosphopeptide column that will be used to join later;
        #   MAY need to change depending on Phosphopeptide column position
        # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index

        end_time = time.process_time()  # timer
        print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr)  # timer

        # Produce a dictionary of metadata for a single phosphopeptide.
        #   This is a replacement of `UniProtInfo_subdict` in the original code.
        def pseq_to_subdict(phospho_pep):
            #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
            # Strip "p" from phosphopeptide sequence
            dephospho_pep = re_phos.sub('', phospho_pep)

            # Determine number of phosphoresidues in phosphopeptide
            numps = len(phospho_pep) - len(dephospho_pep)

            # Determine location(s) of phosphoresidue(s) in phosphopeptide
            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
            ploc = []  # list of p locations
            i = 0
            p = phospho_pep
            while i < numps:
                ploc.append(p.find("p"))
                p = p[:p.find("p")] + p[p.find("p") + 1:]
                i += 1

            # Establish nested dictionary
            result = {}
            result[SEQUENCE] = []
            result[UNIPROT_ID] = []
            result[DESCRIPTION] = []
            result[GENE_NAME] = []
            result[PHOSPHORESIDUE] = []
            result[SEQUENCE7] = []
            result[SEQUENCE10] = []
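            # Illustrative formats (for a hypothetical protein sequence): for each
            # protein containing the dephosphorylated peptide,
            #   - Phosphoresidue entries look like "pS123" (residue letter plus
            #     1-based position within the protein),
            #   - Sequence7 is a 15-residue window (7 residues on each side of the
            #     site, "_"-padded at protein termini, phosphoresidue in lower case),
            #   - Sequence10 is the site +/- 10 residues with a "p" inserted
            #     immediately before the phosphorylated residue.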

            # Add stripped sequence to dictionary
            result[SEQUENCE].append(dephospho_pep)

            # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
            dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]

            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
            ### Caller may elect to:
            ##   try:
            ##       ...
            ##   except PreconditionError as pe:
            ##       print("'{expression}': {message}".format(
            ##           expression = pe.expression,
            ##           message = pe.message))
            ##       )
            ##   )
            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
                )
            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
                )
            if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]:
                raise PreconditionError(
                    dephospho_pep,
                    "dephosphorylated phosphopeptide does not match " +
                    "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
                    PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
                )
            result[SEQUENCE] = [dephospho_pep]
            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)]
            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)]
            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)]
            if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
                )
            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)]
            if len(UniProtSeqList) < 1:
                print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
                # raise PreconditionError(
                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
                #     'value has zero length'
                # )
            for UniProtSeq in UniProtSeqList:
                i = 0
                phosphoresidues = []
                seq7s_set = set()
                seq7s = []
                seq10s_set = set()
                seq10s = []
                while i < len(ploc):
                    start = UniProtSeq.find(dephospho_pep)
                    psite = start + ploc[i]  # location of phosphoresidue on protein sequence

                    # add Phosphoresidue
                    phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
                    phosphoresidues.append(phosphosite)

                    # Add Sequence7
                    if psite < 7:  # phospho_pep at N terminus
                        seq7 = str(UniProtSeq)[:psite + 8]
                        if seq7[psite] == "S":  # if phosphoresidue is serine
                            pres = "s"
                        elif seq7[psite] == "T":  # if phosphoresidue is threonine
                            pres = "t"
                        elif seq7[psite] == "Y":  # if phosphoresidue is tyrosine
                            pres = "y"
                        else:  # if not pSTY
                            pres = "?"
                        seq7 = seq7[:psite] + pres + seq7[psite + 1:psite + 8]
                        while len(seq7) < 15:  # add appropriate number of "_" to the front
                            seq7 = "_" + seq7
                    elif len(UniProtSeq) - psite < 8:  # phospho_pep at C terminus
                        seq7 = str(UniProtSeq)[psite - 7:]
                        if seq7[7] == "S":
                            pres = "s"
                        elif seq7[7] == "T":
                            pres = "t"
                        elif seq7[7] == "Y":
                            pres = "y"
                        else:
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                        while len(seq7) < 15:  # add appropriate number of "_" to the back
                            seq7 = seq7 + "_"
                    else:
                        seq7 = str(UniProtSeq)[psite - 7:psite + 8]
                        pres = ""  # phosphoresidue
                        if seq7[7] == "S":  # if phosphoresidue is serine
                            pres = "s"
                        elif seq7[7] == "T":  # if phosphoresidue is threonine
                            pres = "t"
                        elif seq7[7] == "Y":  # if phosphoresidue is tyrosine
                            pres = "y"
                        else:  # if not pSTY
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                    if seq7 not in seq7s_set:
                        seq7s.append(seq7)
                        seq7s_set.add(seq7)

                    # add Sequence10
                    if psite < 10:  # phospho_pep at N terminus
                        seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite + 11]
                    elif len(UniProtSeq) - psite < 11:  # phospho_pep at C terminus
                        seq10 = str(UniProtSeq)[psite - 10:psite] + "p" + str(UniProtSeq)[psite:]
                    else:
                        seq10 = str(UniProtSeq)[psite - 10:psite + 11]
                        seq10 = seq10[:10] + "p" + seq10[10:]
                    if seq10 not in seq10s_set:
                        seq10s.append(seq10)
                        seq10s_set.add(seq10)

                    i += 1

                result[PHOSPHORESIDUE].append(phosphoresidues)
                result[SEQUENCE7].append(seq7s)
                # result[SEQUENCE10] is a list of lists of strings
                result[SEQUENCE10].append(seq10s)

            r = list(zip(
                result[UNIPROT_ID],
                result[GENE_NAME],
                result[DESCRIPTION],
                result[PHOSPHORESIDUE]
            ))
            # Sort by `UniProt_ID`
            #   ref: https://stackoverflow.com/a/4174955/15509512
            s = sorted(r, key=operator.itemgetter(0))

            result[UNIPROT_ID] = []
            result[GENE_NAME] = []
            result[DESCRIPTION] = []
            result[PHOSPHORESIDUE] = []

            for r in s:
                result[UNIPROT_ID].append(r[0])
                result[GENE_NAME].append(r[1])
                result[DESCRIPTION].append(r[2])
                result[PHOSPHORESIDUE].append(r[3])

            # convert lists to strings in the dictionary
            for key, value in result.items():
                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
                    result[key] = '; '.join(map(str, value))
                elif key in [SEQUENCE10]:
                    # result[SEQUENCE10] is a list of lists of strings
                    joined_value = ''
                    joined_set = set()
                    sep = ''
                    for valL in value:
                        # valL is a list of strings
                        for val in valL:
                            # val is a string
                            if val not in joined_set:
                                joined_set.add(val)
                                # joined_value += sep + '; '.join(map(str, val))
                                joined_value += sep + val
                                sep = '; '
                    # joined_value is a string
                    result[key] = joined_value

            newstring = '; '.join(
                [', '.join(l) for l in result[PHOSPHORESIDUE]]
            )
            ### # separate the isoforms in PHOSPHORESIDUE column with ";"
            ### oldstring = result[PHOSPHORESIDUE]
            ### oldlist = list(oldstring)
            ### newstring = ""
            ### i = 0
            ### for e in oldlist:
            ###     if e == ";":
            ###         if numps > 1:
            ###             if i%numps:
            ###                 newstring = newstring + ";"
            ###             else:
            ###                 newstring = newstring + ","
            ###         else:
            ###             newstring = newstring + ";"
            ###         i +=1
            ###     else:
            ###         newstring = newstring + e
            result[PHOSPHORESIDUE] = newstring

            # separate sequence7's by |
            oldstring = result[SEQUENCE7]
            oldlist = oldstring
            newstring = ""
            for l in oldlist:
                for e in l:
                    if e == ";":
                        newstring = newstring + " |"
                    elif len(newstring) > 0 and 1 > newstring.count(e):
                        newstring = newstring + " | " + e
                    elif 1 > newstring.count(e):
                        newstring = newstring + e
            result[SEQUENCE7] = newstring

            return [phospho_pep, result]

        # Construct list of [string, dictionary] lists
        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
        result_list = [
            catch(pseq_to_subdict, psequence)
            for psequence
            in data_in[PHOSPHOPEPTIDE_MATCH]
        ]

        end_time = time.process_time()  # timer
        print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr)  # timer

        # Construct dictionary from list of lists
        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
        UniProt_Info = {
            result[0]: result[1]
            for result
            in result_list
            if result is not None
        }

        end_time = time.process_time()  # timer
        print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr)  # timer

        # cosmetic: add N_A to phosphopeptide rows with no hits
        p_peptide_list = []
        for key in UniProt_Info:
            p_peptide_list.append(key)
            for nestedKey in UniProt_Info[key]:
                if UniProt_Info[key][nestedKey] == "":
                    UniProt_Info[key][nestedKey] = N_A

        end_time = time.process_time()  # timer
        print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr)  # timer

        # convert UniProt_Info dictionary to dataframe
        uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))

        # reorder columns to match expected output file
        uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index  # make index a column too

        cols = uniprot_df.columns.tolist()
        # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
        # uniprot_df = uniprot_df[cols]
        uniprot_df = uniprot_df[[
            PHOSPHOPEPTIDE,
            SEQUENCE10,
            SEQUENCE7,
            GENE_NAME,
            PHOSPHORESIDUE,
            UNIPROT_ID,
            DESCRIPTION
        ]]

        end_time = time.process_time()  # timer
        print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr)  # timer

        # concat to split then groupby to collapse
        seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
                                 for _, row in uniprot_df.iterrows()]).reset_index()
        seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
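        # The concat/split above "melts" the pipe-separated Sequence7 strings so
        # that seq7_df holds one row per (Sequence7, Phosphopeptide) pair, ready
        # for the left merge against the PSP_Regulatory_site records below.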

        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
        # read in PhosphoSitePlus Regulatory Sites dataset
        #ACE if (True):
        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
        conn = sql.connect(uniprot_sqlite)
        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
        # Close SwissProt SQLite database
        conn.close()
        #ACE # Array indexes are zero-based
        #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
        #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
        #ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        #ACE     print(regsites_df)
        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
        #ACE else:
        #ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3, skiprows=1-3)
        #ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
        #ACE     #ACE # Array indexes are zero-based
        #ACE     #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
        #ACE     #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID'
        #ACE     #ACE               , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA'
        #ACE     #ACE               , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
        #ACE     #ACE               , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES'
        #ACE     #ACE               ]
        #ACE     #ACE REGSITE_COL_SITE7AA = 9
        #ACE     #ACE REGSITE_COL_PROTEIN = 1
        #ACE     #ACE REGSITE_COL_DOMAIN = 10
        #ACE     #ACE REGSITE_COL_PMIDs = 15

        # ... -------------- end read PSP_Regulatory_sites ------------------------------------

        # keep only the entries for the selected species (when a species was supplied)
        if len(species) > 0:
            print('Limit PhosphoSitePlus records to species "' + species + '"')
            regsites_df = regsites_df[regsites_df.ORGANISM == species]

        # merge the seq7 df with the regsites df based off of the sequence7
        merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
        #ACE print(merge_df.columns.tolist())  #ACE

        # after merging df, select only the columns of interest - note that PROTEIN is absent here
        merge_df = merge_df[[PHOSPHOPEPTIDE, SEQUENCE7, ON_FUNCTION, ON_PROCESS, ON_PROT_INTERACT, ON_OTHER_INTERACT, ON_NOTES]]
        #ACE print(merge_df.columns.tolist())  #ACE
        # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")

        # remove the columns that were combined
        merge_df = merge_df[[PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]]

        #ACE print(merge_df)  #ACE
        #ACE print(merge_df.columns.tolist())  #ACE

        end_time = time.process_time()  # timer
        print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr)  # timer

        # cosmetic changes to Function Phosphoresidue column
        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])

        end_time = time.process_time()  # timer
        print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr)  # timer

        i = 0
        while i < len(fp_series):
            # remove the extra ";" so that it looks more professional
            if fp_series[i] == "; ; ; ; ":  # remove ; from empty hits
                fp_series[i] = ""
            while fp_series[i].endswith("; "):  # remove ; from the ends
                fp_series[i] = fp_series[i][:-2]
            while fp_series[i].startswith("; "):  # remove ; from the beginning
                fp_series[i] = fp_series[i][2:]
            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
            fp_series[i] = fp_series[i].replace("; ; ", "; ")

            # turn blanks into N_A to signify the info was searched for but cannot be found
            if fp_series[i] == "":
                fp_series[i] = N_A

            i += 1
        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series

        end_time = time.process_time()  # timer
        print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr)  # timer

        # merge uniprot df with merge df
        uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE, how="left")

        # collapse the merged df
        uniprot_regsites_collapsed_df = pandas.DataFrame(
            uniprot_regsites_merged_df
            .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
            .apply(lambda x: ppep_join(x)))
        # .apply(lambda x: "%s" % ' | '.join(x)))

        end_time = time.process_time()  # timer
        print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr)  # timer

        uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index  # add df index as its own column

        # rename columns
        uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']

        # select columns to be merged to uniprot_df
        #ACE cols = regsites_df.columns.tolist()
        #ACE print(cols)  #ACE
        #ACE if len(cols) > 8:
        #ACE     cols = [cols[9]]+[cols[1]]+cols[10:15]
        #ACE     #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
        #ACE print(cols)  #ACE
        #ACE regsite_merge_df = regsites_df[cols]

        end_time = time.process_time()  # timer
        print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr)  # timer

        # add columns based on Sequence7 matching site_+/-7_AA
        uniprot_regsite_df = pandas.merge(
            left=uniprot_df,
            right=uniprot_regsites_collapsed_df,
            how='left',
            left_on=PHOSPHOPEPTIDE,
            right_on='ppp')

        end_time = time.process_time()  # timer
        print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr)  # timer

        data_in.rename(
            {'Protein description': PHOSPHOPEPTIDE},
            axis='columns',
            inplace=True
        )

        sort_start_time = time.process_time()  # timer

        # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
        res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold())
        data_in = data_in.loc[res2]

        end_time = time.process_time()  # timer
        print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr)  # timer

        cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
        upstream_data = upstream_data[cols]

        end_time = time.process_time()  # timer
        print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr)  # timer

        #### # rename upstream columns in new list
        #### new_cols = []
        #### for name in cols:
        ####     if "_NetworKIN" in name:
        ####         name = name.split("_")[0]
        ####     if " motif" in name:
        ####         name = name.split(" motif")[0]
        ####     if " sequence " in name:
        ####         name = name.split(" sequence")[0]
        ####     if "_Phosida" in name:
        ####         name = name.split("_")[0]
        ####     if "_PhosphoSite" in name:
        ####         name = name.split("_")[0]
        ####     new_cols.append(name)

        # rename upstream columns in new list
        def col_rename(name):
            if "_NetworKIN" in name:
                name = name.split("_")[0]
            if " motif" in name:
                name = name.split(" motif")[0]
            if " sequence " in name:
                name = name.split(" sequence")[0]
            if "_Phosida" in name:
                name = name.split("_")[0]
            if "_PhosphoSite" in name:
                name = name.split("_")[0]
            return name
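        # col_rename trims mapping-source suffixes from kinase/domain column
        # names, e.g. a hypothetical column "PKCalpha_NetworKIN" becomes "PKCalpha".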

        new_cols = [col_rename(col) for col in cols]
        upstream_data.columns = new_cols

        end_time = time.process_time()  # timer
        print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr)  # timer

        # Create upstream_data_cast as a copy of upstream_data
        #   but with first column substituted by the phosphopeptide sequence
        upstream_data_cast = upstream_data.copy()
        new_cols_cast = new_cols
        new_cols_cast[0] = 'p_peptide'
        upstream_data_cast.columns = new_cols_cast
        upstream_data_cast['p_peptide'] = upstream_data.index
        new_cols_cast0 = new_cols_cast[0]

        # --- -------------- begin read upstream_data_melt ------------------------------------
        ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
        conn = sql.connect(uniprot_sqlite)
        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
        # Close SwissProt SQLite database
        conn.close()
        upstream_data_melt = upstream_data_melt_df.copy()
        upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
        upstream_data_melt['characterization'] = [
            col_rename(s)
            for s in upstream_data_melt['characterization']
        ]

        print('%0.6f upstream_data_melt_df initially has %d rows' %
              (end_time - start_time, len(upstream_data_melt.axes[0])),
              file=sys.stderr)
        # ref: https://stackoverflow.com/a/27360130/15509512
        #   e.g. df.drop(df[df.score < 50].index, inplace=True)
        upstream_data_melt.drop(
            upstream_data_melt[upstream_data_melt.X != 'X'].index,
            inplace=True
        )
        print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
              (end_time - start_time, len(upstream_data_melt.axes[0])),
              file=sys.stderr)
        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        #ACE     print(upstream_data_melt)
        ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
        # ... -------------- end read upstream_data_melt --------------------------------------

        end_time = time.process_time()  # timer
        print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr)  # timer
        # ... end read upstream_data_melt

        upstream_data_melt_index = upstream_data_melt.index
        upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']

        end_time = time.process_time()  # timer
        print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr)  # timer

        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()

        melt_dict = {}
        for key in upstream_map_p_peptide_list:
            melt_dict[key] = []

        for el in upstream_delta_melt_LoL:
            (p_peptide, characterization, X) = tuple(el)
            if p_peptide in melt_dict:
                melt_dict[p_peptide].append(characterization)
            else:
                exit('Phosphopeptide %s not found in ppep_mapping_db: "phosphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))

        end_time = time.process_time()  # timer
        print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr)  # timer

        # for key in upstream_map_p_peptide_list:
        #     melt_dict[key] = ' | '.join(melt_dict[key])

        for key in upstream_map_p_peptide_list:
            melt_dict[key] = melt_join(melt_dict[key])

        end_time = time.process_time()  # timer
        print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr)  # timer

        # map_dict is a dictionary of dictionaries
        map_dict = {}
        for key in upstream_map_p_peptide_list:
            map_dict[key] = {}
            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]

        end_time = time.process_time()  # timer
        print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr)  # timer

        # convert map_dict to dataframe
        map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
        map_df["p-peptide"] = map_df.index  # make index a column too
        cols_map_df = map_df.columns.tolist()
        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
        map_df = map_df[cols_map_df]

        # join map_df to uniprot_regsite_df
        output_df = uniprot_regsite_df.merge(
            map_df,
            how="left",
            left_on=PHOSPHOPEPTIDE,
            right_on="p-peptide")

        output_df = output_df[
            [PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
             UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
             PUTATIVE_UPSTREAM_DOMAINS
             ]
        ]

        # cols_output_prelim = output_df.columns.tolist()
        #
        # print("cols_output_prelim")
        # print(cols_output_prelim)
        #
        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
        #
        # print("cols_output with p-peptide")
        # print(cols_output)
        #
        # cols_output = [col for col in cols_output if not col == "p-peptide"]
        #
        # print("cols_output")
        # print(cols_output)
        #
        # output_df = output_df[cols_output]

        # join output_df back to quantitative columns in data_in df
        quant_cols = data_in.columns.tolist()
        quant_cols = quant_cols[1:]
        quant_data = data_in[quant_cols]

        ## ----------- Write merge/filter metadata to SQLite database (start) -----------
        # Open SwissProt SQLite database
        conn = sql.connect(output_sqlite)
        cur = conn.cursor()

        cur.executescript(MRGFLTR_DDL)

        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
        )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata', CITATION_INSERT_PSP)
        )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
        )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
        )

        # Read ppep-to-sequence LUT
        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
        #ACE ppep_lut_df.info(verbose=True)
        # write only metadata for merged/filtered records to SQLite
        mrgfltr_metadata_df = output_df.copy()
        # replace phosphopeptide seq with ppep.id
        mrgfltr_metadata_df = ppep_lut_df.merge(
            mrgfltr_metadata_df,
            left_on='ppep_seq',
            right_on=PHOSPHOPEPTIDE,
            how='inner'
        )
        mrgfltr_metadata_df.drop(
            columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
            inplace=True
        )
        # rename columns
        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
        #ACE mrgfltr_metadata_df.info(verbose=True)
        mrgfltr_metadata_df.to_sql(
            'mrgfltr_metadata',
            con=conn,
            if_exists='append',
            index=False,
            method='multi'
        )

        # Close SwissProt SQLite database
        conn.close()
        ## ----------- Write merge/filter metadata to SQLite database (finish) -----------

        output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
        output_cols = output_df.columns.tolist()
        output_cols = output_cols[:-1]
        output_df = output_df[output_cols]

        # cosmetic changes to Upstream column:
        #   fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("")
        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
        i = 0
        while i < len(us_series):
            # turn blanks into N_A to signify the info was searched for but cannot be found
            if us_series[i] == "":
                us_series[i] = N_A
            i += 1
        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series

        end_time = time.process_time()  # timer
        print("%0.6f established output [3]" % (end_time - start_time,), file=sys.stderr)  # timer

        (output_rows, output_cols) = output_df.shape

        # output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
        output_df = output_df.convert_dtypes(convert_integer=True)

        # Output onto Final CSV file
        output_df.to_csv(output_filename_csv, index=False)
        output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)

        end_time = time.process_time()  # timer
        print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr)  # timer

        print('{:>10} phosphopeptides written to output'.format(str(output_rows)))

        end_time = time.process_time()  # timer
        print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,), file=sys.stderr)  # timer

        # Rev. 7/1/2016
        # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
        # Rev. 7/3/2016 : renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching;
        #                 read from SwissProt SQLite database
        # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper

        #############################################
        # copied from Excel Output Script.ipynb END #
        #############################################

    try:
        catch(mqpep_getswissprot,)
        exit(0)
    except Exception as e:
        exit('Internal error running mqpep_getswissprot(): %s' % (e))


if __name__ == "__main__":
    __main__()