comparison mqppep_mrgfltr.py @ 5:d4d531006735 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author eschen42
date Thu, 10 Mar 2022 23:42:48 +0000
parents c1403d18c189
children 922d309640db
comparison
equal deleted inserted replaced
4:cfc65ae227f8 5:d4d531006735
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 # Import the packages needed 3 # Import the packages needed
4 import argparse 4 import argparse
5 import operator # for operator.itemgetter
5 import os.path 6 import os.path
6 import sys 7 import re
7 8 import shutil # for shutil.copyfile(src, dest)
9 import sqlite3 as sql
10 import sys # import the sys module for exc_info
11 import time
12 import traceback # for formatting stack-trace
13 from codecs import getreader as cx_getreader
14
15 import numpy as np
8 import pandas 16 import pandas
9 import re
10 import time
11 import sqlite3 as sql
12 from codecs import getreader as cx_getreader
13 import sys
14 import numpy as np
15
16 # for sorting list of lists using operator.itemgetter
17 import operator
18
19 # for formatting stack-trace
20 import traceback
21
22 # for Aho-Corasick search for fixed set of substrings
23 import ahocorasick
24 import operator
25 import hashlib
26
27 # for shutil.copyfile(src, dest)
28 import shutil
29 17
30 # global constants 18 # global constants
31 N_A = 'N/A' 19 N_A = "N/A"
20
32 21
33 # ref: https://stackoverflow.com/a/8915613/15509512 22 # ref: https://stackoverflow.com/a/8915613/15509512
34 # answers: "How to handle exceptions in a list comprehensions" 23 # answers: "How to handle exceptions in a list comprehensions"
35 # usage: 24 # usage:
36 # from math import log 25 # from math import log
39 # producing: 28 # producing:
40 # for <built-in function log> 29 # for <built-in function log>
41 # with args (0,) 30 # with args (0,)
42 # exception: math domain error 31 # exception: math domain error
43 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] 32 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
44 def catch(func, *args, handle=lambda e : e, **kwargs): 33 def catch(func, *args, handle=lambda e: e, **kwargs):
34
45 try: 35 try:
46 return func(*args, **kwargs) 36 return func(*args, **kwargs)
47 except Exception as e: 37 except Exception as e:
48 print("For %s" % str(func)) 38 print("For %s" % str(func))
49 print(" with args %s" % str(args)) 39 print(" with args %s" % str(args))
50 print(" caught exception: %s" % str(e)) 40 print(" caught exception: %s" % str(e))
51 (ty, va, tb) = sys.exc_info() 41 (ty, va, tb) = sys.exc_info()
52 print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) 42 print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
53 exit(-1) 43 exit(-1)
54 return None # was handle(e) 44 return None
45
55 46
56 def ppep_join(x): 47 def ppep_join(x):
57 x = [i for i in x if N_A != i] 48 x = [i for i in x if N_A != i]
58 result = "%s" % ' | '.join(x) 49 result = "%s" % " | ".join(x)
59 if result != "": 50 if result != "":
60 return result 51 return result
61 else: 52 else:
62 return N_A 53 return N_A
63 54
55
64 def melt_join(x): 56 def melt_join(x):
65 tmp = {key.lower(): key for key in x} 57 tmp = {key.lower(): key for key in x}
66 result = "%s" % ' | '.join([tmp[key] for key in tmp]) 58 result = "%s" % " | ".join([tmp[key] for key in tmp])
67 return result 59 return result
60
68 61
69 def __main__(): 62 def __main__():
70 # Parse Command Line 63 # Parse Command Line
71 parser = argparse.ArgumentParser( 64 parser = argparse.ArgumentParser(
72 description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.' 65 description="Phopsphoproteomic Enrichment Pipeline Merge and Filter."
73 ) 66 )
74 67
75 # inputs: 68 # inputs:
76 # Phosphopeptide data for experimental results, including the intensities 69 # Phosphopeptide data for experimental results, including the intensities
77 # and the mapping to kinase domains, in tabular format. 70 # and the mapping to kinase domains, in tabular format.
78 parser.add_argument( 71 parser.add_argument(
79 '--phosphopeptides', '-p', 72 "--phosphopeptides",
73 "-p",
80 nargs=1, 74 nargs=1,
81 required=True, 75 required=True,
82 dest='phosphopeptides', 76 dest="phosphopeptides",
83 help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format' 77 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
84 ) 78 )
85 # UniProtKB/SwissProt DB input, SQLite 79 # UniProtKB/SwissProt DB input, SQLite
86 parser.add_argument( 80 parser.add_argument(
87 '--ppep_mapping_db', '-d', 81 "--ppep_mapping_db",
82 "-d",
88 nargs=1, 83 nargs=1,
89 required=True, 84 required=True,
90 dest='ppep_mapping_db', 85 dest="ppep_mapping_db",
91 help='UniProtKB/SwissProt SQLite Database' 86 help="UniProtKB/SwissProt SQLite Database",
92 ) 87 )
93 #ACE # PhosPhositesPlus DB input, csv
94 #ACE parser.add_argument(
95 #ACE '--psp_regulatory_sites', '-s',
96 #ACE nargs=1,
97 #ACE required=True,
98 #ACE dest='psp_regulatory_sites_csv',
99 #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
100 #ACE )
101 # species to limit records chosed from PhosPhositesPlus 88 # species to limit records chosed from PhosPhositesPlus
102 parser.add_argument( 89 parser.add_argument(
103 '--species', '-x', 90 "--species",
91 "-x",
104 nargs=1, 92 nargs=1,
105 required=False, 93 required=False,
106 default=[], 94 default=[],
107 dest='species', 95 dest="species",
108 help='limit PhosphoSitePlus records to indicated species (field may be empty)' 96 help="limit PhosphoSitePlus records to indicated species (field may be empty)",
109 ) 97 )
110 98
111 # outputs: 99 # outputs:
112 # tabular output 100 # tabular output
113 parser.add_argument( 101 parser.add_argument(
114 '--mrgfltr_tab', '-o', 102 "--mrgfltr_tab",
103 "-o",
115 nargs=1, 104 nargs=1,
116 required=True, 105 required=True,
117 dest='mrgfltr_tab', 106 dest="mrgfltr_tab",
118 help='Tabular output file for results' 107 help="Tabular output file for results",
119 ) 108 )
120 # CSV output 109 # CSV output
121 parser.add_argument( 110 parser.add_argument(
122 '--mrgfltr_csv', '-c', 111 "--mrgfltr_csv",
112 "-c",
123 nargs=1, 113 nargs=1,
124 required=True, 114 required=True,
125 dest='mrgfltr_csv', 115 dest="mrgfltr_csv",
126 help='CSV output file for results' 116 help="CSV output file for results",
127 ) 117 )
128 # SQLite output 118 # SQLite output
129 parser.add_argument( 119 parser.add_argument(
130 '--mrgfltr_sqlite', '-S', 120 "--mrgfltr_sqlite",
121 "-S",
131 nargs=1, 122 nargs=1,
132 required=True, 123 required=True,
133 dest='mrgfltr_sqlite', 124 dest="mrgfltr_sqlite",
134 help='SQLite output file for results' 125 help="SQLite output file for results",
135 ) 126 )
136 127
137 # "Make it so!" (parse the arguments) 128 # "Make it so!" (parse the arguments)
138 options = parser.parse_args() 129 options = parser.parse_args()
139 print("options: " + str(options)) 130 print("options: " + str(options))
140 131
141 # determine phosphopeptide ("upstream map") input tabular file access 132 # determine phosphopeptide ("upstream map") input tabular file access
142 if options.phosphopeptides is None: 133 if options.phosphopeptides is None:
143 exit('Argument "phosphopeptides" is required but not supplied') 134 exit('Argument "phosphopeptides" is required but not supplied')
144 try: 135 try:
145 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) 136 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
146 input_file = open(upstream_map_filename_tab, 'r') 137 input_file = open(upstream_map_filename_tab, "r")
147 input_file.close() 138 input_file.close()
148 except Exception as e: 139 except Exception as e:
149 exit('Error parsing phosphopeptides argument: %s' % str(e)) 140 exit("Error parsing phosphopeptides argument: %s" % str(e))
150 141
151 # determine input SQLite access 142 # determine input SQLite access
152 if options.ppep_mapping_db is None: 143 if options.ppep_mapping_db is None:
153 exit('Argument "ppep_mapping_db" is required but not supplied') 144 exit('Argument "ppep_mapping_db" is required but not supplied')
154 try: 145 try:
155 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) 146 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
156 input_file = open(uniprot_sqlite, 'rb') 147 input_file = open(uniprot_sqlite, "rb")
157 input_file.close() 148 input_file.close()
158 except Exception as e: 149 except Exception as e:
159 exit('Error parsing ppep_mapping_db argument: %s' % str(e)) 150 exit("Error parsing ppep_mapping_db argument: %s" % str(e))
160 151
161 # copy input SQLite dataset to output SQLite dataset 152 # copy input SQLite dataset to output SQLite dataset
162 if options.mrgfltr_sqlite is None: 153 if options.mrgfltr_sqlite is None:
163 exit('Argument "mrgfltr_sqlite" is required but not supplied') 154 exit('Argument "mrgfltr_sqlite" is required but not supplied')
164 try: 155 try:
165 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) 156 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
166 shutil.copyfile(uniprot_sqlite, output_sqlite) 157 shutil.copyfile(uniprot_sqlite, output_sqlite)
167 except Exception as e: 158 except Exception as e:
168 exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) 159 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e))
169
170 #ACE # determine psp_regulatory_sites CSV access
171 #ACE if options.psp_regulatory_sites_csv is None:
172 #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
173 #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
174 #ACE try:
175 #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
176 #ACE input_file = open(phosphosite_filename_csv, 'r')
177 #ACE input_file.close()
178 #ACE except Exception as e:
179 #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
180 #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)
181 160
182 # determine species to limit records from PSP_Regulatory_Sites 161 # determine species to limit records from PSP_Regulatory_Sites
183 if options.species is None: 162 if options.species is None:
184 exit('Argument "species" is required (and may be empty) but not supplied') 163 exit(
164 'Argument "species" is required (and may be empty) but not supplied'
165 )
185 try: 166 try:
186 if len(options.species) > 0: 167 if len(options.species) > 0:
187 species = options.species[0] 168 species = options.species[0]
188 else: 169 else:
189 species = '' 170 species = ""
190 except Exception as e: 171 except Exception as e:
191 exit('Error parsing species argument: %s' % str(e)) 172 exit("Error parsing species argument: %s" % str(e))
192 173
193 # determine tabular output destination 174 # determine tabular output destination
194 if options.mrgfltr_tab is None: 175 if options.mrgfltr_tab is None:
195 exit('Argument "mrgfltr_tab" is required but not supplied') 176 exit('Argument "mrgfltr_tab" is required but not supplied')
196 try: 177 try:
197 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) 178 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
198 output_file = open(output_filename_tab, 'w') 179 output_file = open(output_filename_tab, "w")
199 output_file.close() 180 output_file.close()
200 except Exception as e: 181 except Exception as e:
201 exit('Error parsing mrgfltr_tab argument: %s' % str(e)) 182 exit("Error parsing mrgfltr_tab argument: %s" % str(e))
202 183
203 # determine CSV output destination 184 # determine CSV output destination
204 if options.mrgfltr_csv is None: 185 if options.mrgfltr_csv is None:
205 exit('Argument "mrgfltr_csv" is required but not supplied') 186 exit('Argument "mrgfltr_csv" is required but not supplied')
206 try: 187 try:
207 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) 188 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
208 output_file = open(output_filename_csv, 'w') 189 output_file = open(output_filename_csv, "w")
209 output_file.close() 190 output_file.close()
210 except Exception as e: 191 except Exception as e:
211 exit('Error parsing mrgfltr_csv argument: %s' % str(e)) 192 exit("Error parsing mrgfltr_csv argument: %s" % str(e))
212
213 193
214 def mqpep_getswissprot(): 194 def mqpep_getswissprot():
215 195
216 ############################################### 196 #
217 # copied from Excel Output Script.ipynb BEGIN # 197 # copied from Excel Output Script.ipynb BEGIN #
218 ############################################### 198 #
219 199
220 ########### String Constants ################# 200 # String Constants #################
221 DEPHOSPHOPEP = 'DephosphoPep' 201 DEPHOSPHOPEP = "DephosphoPep"
222 DESCRIPTION = 'Description' 202 DESCRIPTION = "Description"
223 FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)' 203 FUNCTION_PHOSPHORESIDUE = (
224 GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB 204 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
225 ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites 205 )
226 ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites 206 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB
227 ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites 207 ON_FUNCTION = (
228 ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites 208 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites
229 ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites 209 )
230 PHOSPHOPEPTIDE = 'Phosphopeptide' 210 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites
231 PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match' 211 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
232 PHOSPHORESIDUE = 'Phosphoresidue' 212 ON_PROCESS = (
233 PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains' 213 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites
234 SEQUENCE = 'Sequence' 214 )
235 SEQUENCE10 = 'Sequence10' 215 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites
236 SEQUENCE7 = 'Sequence7' 216 PHOSPHOPEPTIDE = "Phosphopeptide"
237 SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA' 217 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
238 SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA' 218 PHOSPHORESIDUE = "Phosphoresidue"
239 UNIPROT_ID = 'UniProt_ID' 219 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
240 UNIPROT_SEQ_AND_META_SQL = ''' 220 SEQUENCE = "Sequence"
221 SEQUENCE10 = "Sequence10"
222 SEQUENCE7 = "Sequence7"
223 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA"
224 UNIPROT_ID = "UniProt_ID"
225 UNIPROT_SEQ_AND_META_SQL = """
241 select Uniprot_ID, Description, Gene_Name, Sequence, 226 select Uniprot_ID, Description, Gene_Name, Sequence,
242 Organism_Name, Organism_ID, PE, SV 227 Organism_Name, Organism_ID, PE, SV
243 from UniProtKB 228 from UniProtKB
244 order by Sequence, UniProt_ID 229 order by Sequence, UniProt_ID
245 ''' 230 """
246 UNIPROT_UNIQUE_SEQ_SQL = ''' 231 UNIPROT_UNIQUE_SEQ_SQL = """
247 select distinct Sequence 232 select distinct Sequence
248 from UniProtKB 233 from UniProtKB
249 group by Sequence 234 group by Sequence
250 ''' 235 """
251 PPEP_PEP_UNIPROTSEQ_SQL = ''' 236 PPEP_PEP_UNIPROTSEQ_SQL = """
252 select distinct phosphopeptide, peptide, sequence 237 select distinct phosphopeptide, peptide, sequence
253 from uniprotkb_pep_ppep_view 238 from uniprotkb_pep_ppep_view
254 order by sequence 239 order by sequence
255 ''' 240 """
256 PPEP_MELT_SQL = ''' 241 PPEP_MELT_SQL = """
257 SELECT DISTINCT 242 SELECT DISTINCT
258 phospho_peptide AS 'p_peptide', 243 phospho_peptide AS 'p_peptide',
259 kinase_map AS 'characterization', 244 kinase_map AS 'characterization',
260 'X' AS 'X' 245 'X' AS 'X'
261 FROM ppep_gene_site_view 246 FROM ppep_gene_site_view
262 ''' 247 """
263 # CREATE TABLE PSP_Regulatory_site ( 248 # CREATE TABLE PSP_Regulatory_site (
264 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, 249 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
265 # domain TEXT, 250 # domain TEXT,
266 # ON_FUNCTION TEXT, 251 # ON_FUNCTION TEXT,
267 # ON_PROCESS TEXT, 252 # ON_PROCESS TEXT,
268 # ON_PROT_INTERACT TEXT, 253 # ON_PROT_INTERACT TEXT,
269 # ON_OTHER_INTERACT TEXT, 254 # ON_OTHER_INTERACT TEXT,
270 # notes TEXT, 255 # notes TEXT,
271 # organism TEXT 256 # organism TEXT
272 # ); 257 # );
273 PSP_REGSITE_SQL = ''' 258 PSP_REGSITE_SQL = """
274 SELECT DISTINCT 259 SELECT DISTINCT
275 SITE_PLUSMINUS_7AA , 260 SITE_PLUSMINUS_7AA ,
276 DOMAIN , 261 DOMAIN ,
277 ON_FUNCTION , 262 ON_FUNCTION ,
278 ON_PROCESS , 263 ON_PROCESS ,
279 ON_PROT_INTERACT , 264 ON_PROT_INTERACT ,
280 ON_OTHER_INTERACT , 265 ON_OTHER_INTERACT ,
281 NOTES , 266 NOTES ,
282 ORGANISM 267 ORGANISM
283 FROM PSP_Regulatory_site 268 FROM PSP_Regulatory_site
284 ''' 269 """
285 PPEP_ID_SQL =''' 270 PPEP_ID_SQL = """
286 SELECT 271 SELECT
287 id AS 'ppep_id', 272 id AS 'ppep_id',
288 seq AS 'ppep_seq' 273 seq AS 'ppep_seq'
289 FROM ppep 274 FROM ppep
290 ''' 275 """
291 MRGFLTR_DDL =''' 276 MRGFLTR_DDL = """
292 DROP VIEW IF EXISTS mrgfltr_metadata_view; 277 DROP VIEW IF EXISTS mrgfltr_metadata_view;
293 DROP TABLE IF EXISTS mrgfltr_metadata; 278 DROP TABLE IF EXISTS mrgfltr_metadata;
294 CREATE TABLE mrgfltr_metadata 279 CREATE TABLE mrgfltr_metadata
295 ( ppep_id INTEGER REFERENCES ppep(id) 280 ( ppep_id INTEGER REFERENCES ppep(id)
296 , Sequence10 TEXT 281 , Sequence10 TEXT
320 WHERE 305 WHERE
321 mrgfltr_metadata.ppep_id = ppep.id 306 mrgfltr_metadata.ppep_id = ppep.id
322 ORDER BY 307 ORDER BY
323 ppep.seq 308 ppep.seq
324 ; 309 ;
325 ''' 310 """
326 311
327 CITATION_INSERT_STMT = ''' 312 CITATION_INSERT_STMT = """
328 INSERT INTO Citation ( 313 INSERT INTO Citation (
329 ObjectName, 314 ObjectName,
330 CitationData 315 CitationData
331 ) VALUES (?,?) 316 ) VALUES (?,?)
332 ''' 317 """
333 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' 318 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
334 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' 319 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
335 320
336 MRGFLTR_METADATA_COLUMNS = [ 321 MRGFLTR_METADATA_COLUMNS = [
337 'ppep_id', 322 "ppep_id",
338 'Sequence10', 323 "Sequence10",
339 'Sequence7', 324 "Sequence7",
340 'GeneName', 325 "GeneName",
341 'Phosphoresidue', 326 "Phosphoresidue",
342 'UniProtID', 327 "UniProtID",
343 'Description', 328 "Description",
344 'FunctionPhosphoresidue', 329 "FunctionPhosphoresidue",
345 'PutativeUpstreamDomains' 330 "PutativeUpstreamDomains",
346 ] 331 ]
347 332
348 ########### String Constants (end) ############ 333 # String Constants (end) ############
349 334
350 class Error(Exception): 335 class Error(Exception):
351 """Base class for exceptions in this module.""" 336 """Base class for exceptions in this module."""
337
352 pass 338 pass
353 339
354 class PreconditionError(Error): 340 class PreconditionError(Error):
355 """Exception raised for errors in the input. 341 """Exception raised for errors in the input.
356 342
361 347
362 def __init__(self, expression, message): 348 def __init__(self, expression, message):
363 self.expression = expression 349 self.expression = expression
364 self.message = message 350 self.message = message
365 351
366 #start_time = time.clock() #timer 352 # start_time = time.clock() #timer
367 start_time = time.process_time() #timer 353 start_time = time.process_time() # timer
368 354
369 #get keys from upstream tabular file using readline() 355 # get keys from upstream tabular file using readline()
370 # ref: https://stackoverflow.com/a/16713581/15509512 356 # ref: https://stackoverflow.com/a/16713581/15509512
371 # answer to "Use codecs to read file with correct encoding" 357 # answer to "Use codecs to read file with correct encoding"
372 file1_encoded = open(upstream_map_filename_tab, 'rb') 358 file1_encoded = open(upstream_map_filename_tab, "rb")
373 file1 = cx_getreader("latin-1")(file1_encoded) 359 file1 = cx_getreader("latin-1")(file1_encoded)
374 360
375 count = 0 361 count = 0
376 upstream_map_p_peptide_list = [] 362 upstream_map_p_peptide_list = []
377 re_tab = re.compile('^[^\t]*') 363 re_tab = re.compile("^[^\t]*")
378 while True: 364 while True:
379 count += 1 365 count += 1
380 # Get next line from file 366 # Get next line from file
381 line = file1.readline() 367 line = file1.readline()
382 # if line is empty 368 # if line is empty
388 upstream_map_p_peptide_list.append(m[0]) 374 upstream_map_p_peptide_list.append(m[0])
389 file1.close() 375 file1.close()
390 file1_encoded.close() 376 file1_encoded.close()
391 377
392 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed 378 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
393 re_phos = re.compile('p') 379 re_phos = re.compile("p")
394 dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ] 380
395 381 end_time = time.process_time() # timer
396 end_time = time.process_time() #timer 382 print(
397 print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr) 383 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,),
398 384 file=sys.stderr,
399 ## ----------- Get SwissProt data from SQLite database (start) ----------- 385 )
386
387 # ----------- Get SwissProt data from SQLite database (start) -----------
400 # build UniProt sequence LUT and list of unique SwissProt sequences 388 # build UniProt sequence LUT and list of unique SwissProt sequences
401 389
402 # Open SwissProt SQLite database 390 # Open SwissProt SQLite database
403 conn = sql.connect(uniprot_sqlite) 391 conn = sql.connect(uniprot_sqlite)
404 cur = conn.cursor() 392 cur = conn.cursor()
405 393
406 # Set up structures to hold SwissProt data 394 # Set up structures to hold SwissProt data
407 395
408 uniprot_Sequence_List = [] 396 uniprot_Sequence_List = []
409 UniProtSeqLUT = {} 397 UniProtSeqLUT = {}
410 398
411 # Execute query for unique seqs without fetching the results yet 399 # Execute query for unique seqs without fetching the results yet
412 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) 400 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
413 401
414 while batch := uniprot_unique_seq_cur.fetchmany(size=50): 402 while 1:
415 if None == batch: 403 batch = uniprot_unique_seq_cur.fetchmany(size=50)
404 if not batch:
416 # handle case where no records are returned 405 # handle case where no records are returned
417 break 406 break
418 for row in batch: 407 for row in batch:
419 Sequence = row[0] 408 Sequence = row[0]
420 UniProtSeqLUT[(Sequence,DESCRIPTION)] = [] 409 UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
421 UniProtSeqLUT[(Sequence,GENE_NAME) ] = [] 410 UniProtSeqLUT[(Sequence, GENE_NAME)] = []
422 UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = [] 411 UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
423 UniProtSeqLUT[ Sequence ] = [] 412 UniProtSeqLUT[Sequence] = []
424 413
425 # Execute query for seqs and metadata without fetching the results yet 414 # Execute query for seqs and metadata without fetching the results yet
426 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) 415 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
427 416
428 while batch := uniprot_seq_and_meta.fetchmany(size=50): 417 while 1:
429 if None == batch: 418 batch = uniprot_seq_and_meta.fetchmany(size=50)
430 # handle case where no records are returned 419 if not batch:
420 # handle case where no records are returned
431 break 421 break
432 for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch: 422 for (
423 UniProt_ID,
424 Description,
425 Gene_Name,
426 Sequence,
427 OS,
428 OX,
429 PE,
430 SV,
431 ) in batch:
433 uniprot_Sequence_List.append(Sequence) 432 uniprot_Sequence_List.append(Sequence)
434 UniProtSeqLUT[Sequence] = Sequence 433 UniProtSeqLUT[Sequence] = Sequence
435 UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID) 434 UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
436 UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name) 435 UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
437 if OS != N_A: 436 if OS != N_A:
438 Description += ' OS=' + OS 437 Description += " OS=" + OS
439 if OX != N_A: 438 if OX != N_A:
440 Description += ' OX=' + str(int(OX)) 439 Description += " OX=" + str(int(OX))
441 if Gene_Name != N_A: 440 if Gene_Name != N_A:
442 Description += ' GN=' + Gene_Name 441 Description += " GN=" + Gene_Name
443 if PE != N_A: 442 if PE != N_A:
444 Description += ' PE=' + PE 443 Description += " PE=" + PE
445 if SV != N_A: 444 if SV != N_A:
446 Description += ' SV=' + SV 445 Description += " SV=" + SV
447 UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description) 446 UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)
448 447
449 # Close SwissProt SQLite database; clean up local variables 448 # Close SwissProt SQLite database; clean up local variables
450 conn.close() 449 conn.close()
451 Sequence = '' 450 Sequence = ""
452 UniProt_ID = '' 451 UniProt_ID = ""
453 Description = '' 452 Description = ""
454 Gene_Name = '' 453 Gene_Name = ""
455 454
456 ## ----------- Get SwissProt data from SQLite database (finish) ----------- 455 # ----------- Get SwissProt data from SQLite database (finish) -----------
457 456
458 end_time = time.process_time() #timer 457 end_time = time.process_time() # timer
459 print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr) 458 print(
460 459 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,),
461 ## ----------- Get SwissProt data from SQLite database (start) ----------- 460 file=sys.stderr,
462 # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT 461 )
463 #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) ) 462
464 463 # ----------- Get SwissProt data from SQLite database (start) -----------
465 # Open SwissProt SQLite database 464 # Open SwissProt SQLite database
466 conn = sql.connect(uniprot_sqlite) 465 conn = sql.connect(uniprot_sqlite)
467 cur = conn.cursor() 466 cur = conn.cursor()
468 467
469 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide 468 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
470 DephosphoPep_UniProtSeq_LUT = {} 469 DephosphoPep_UniProtSeq_LUT = {}
471 470
472 # Set up dictionary to accumulate results 471 # Set up dictionary to accumulate results
473 PhosphoPep_UniProtSeq_LUT = {} 472 PhosphoPep_UniProtSeq_LUT = {}
474 473
475 # Execute query for tuples without fetching the results yet 474 # Execute query for tuples without fetching the results yet
476 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) 475 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
477 476
478 while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50): 477 while 1:
479 if None == batch: 478 batch = ppep_pep_uniprotseq_cur.fetchmany(size=50)
479 if not batch:
480 # handle case where no records are returned 480 # handle case where no records are returned
481 break 481 break
482 for (phospho_pep, dephospho_pep, sequence) in batch: 482 for (phospho_pep, dephospho_pep, sequence) in batch:
483 #do interesting stuff here... 483 # do interesting stuff here...
484 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep 484 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
485 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep 485 PhosphoPep_UniProtSeq_LUT[
486 (phospho_pep, DEPHOSPHOPEP)
487 ] = dephospho_pep
486 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: 488 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
487 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() 489 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
488 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = [] 490 DephosphoPep_UniProtSeq_LUT[
489 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = [] 491 (dephospho_pep, DESCRIPTION)
490 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = [] 492 ] = []
491 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = [] 493 DephosphoPep_UniProtSeq_LUT[
494 (dephospho_pep, GENE_NAME)
495 ] = []
496 DephosphoPep_UniProtSeq_LUT[
497 (dephospho_pep, UNIPROT_ID)
498 ] = []
499 DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
492 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) 500 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
493 501
494 #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence)) 502 if (
495 if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]: 503 sequence
496 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence) 504 not in DephosphoPep_UniProtSeq_LUT[
505 (dephospho_pep, SEQUENCE)
506 ]
507 ):
508 DephosphoPep_UniProtSeq_LUT[
509 (dephospho_pep, SEQUENCE)
510 ].append(sequence)
497 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: 511 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
498 if phospho_pep != phospho_pep: 512 if phospho_pep != phospho_pep:
499 print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep)) 513 print(
514 "phospho_pep:'%s' phospho_pep:'%s'"
515 % (phospho_pep, phospho_pep)
516 )
500 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: 517 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
501 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep 518 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
502 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep 519 PhosphoPep_UniProtSeq_LUT[
503 r = list(zip( 520 (phospho_pep, DEPHOSPHOPEP)
504 [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]], 521 ] = dephospho_pep
505 [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]], 522 r = list(
506 [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]] 523 zip(
507 )) 524 [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
525 [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
526 [
527 s
528 for s in UniProtSeqLUT[(sequence, DESCRIPTION)]
529 ],
530 )
531 )
508 # Sort by `UniProt_ID` 532 # Sort by `UniProt_ID`
509 # ref: https://stackoverflow.com/a/4174955/15509512 533 # ref: https://stackoverflow.com/a/4174955/15509512
510 r = sorted(r, key=operator.itemgetter(0)) 534 r = sorted(r, key=operator.itemgetter(0))
511 # Get one tuple for each `phospho_pep` 535 # Get one tuple for each `phospho_pep`
512 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] 536 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
513 for (upid, gn, desc) in r: 537 for (upid, gn, desc) in r:
514 # Append pseudo-tuple per UniProt_ID but only when it is not present 538 # Append pseudo-tuple per UniProt_ID but only when it is not present
515 if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]: 539 if (
516 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid) 540 upid
517 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc) 541 not in DephosphoPep_UniProtSeq_LUT[
518 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn) 542 (dephospho_pep, UNIPROT_ID)
543 ]
544 ):
545 DephosphoPep_UniProtSeq_LUT[
546 (dephospho_pep, UNIPROT_ID)
547 ].append(upid)
548 DephosphoPep_UniProtSeq_LUT[
549 (dephospho_pep, DESCRIPTION)
550 ].append(desc)
551 DephosphoPep_UniProtSeq_LUT[
552 (dephospho_pep, GENE_NAME)
553 ].append(gn)
519 554
520 # Close SwissProt SQLite database; clean up local variables 555 # Close SwissProt SQLite database; clean up local variables
521 conn.close() 556 conn.close()
522 # wipe local variables 557 # wipe local variables
523 phospho_pep = dephospho_pep = sequence = 0 558 phospho_pep = dephospho_pep = sequence = 0
524 upid = gn = desc = r = '' 559 upid = gn = desc = r = ""
525 560
526 ## ----------- Get SwissProt data from SQLite database (finish) ----------- 561 # ----------- Get SwissProt data from SQLite database (finish) -----------
527 562
528 end_time = time.process_time() #timer 563 end_time = time.process_time() # timer
529 print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr) 564 print(
530 565 "%0.6f finished reading and decoding '%s' [0.4]"
531 print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list)))) 566 % (end_time - start_time, upstream_map_filename_tab),
532 567 file=sys.stderr,
533 #Read in Upstream tabular file 568 )
569
570 print(
571 "{:>10} unique upstream phosphopeptides tested".format(
572 str(len(upstream_map_p_peptide_list))
573 )
574 )
575
576 # Read in Upstream tabular file
534 # We are discarding the intensity data; so read it as text 577 # We are discarding the intensity data; so read it as text
535 upstream_data = pandas.read_table( 578 upstream_data = pandas.read_table(
536 upstream_map_filename_tab, 579 upstream_map_filename_tab, dtype="str", index_col=0
537 dtype='str', 580 )
538 index_col = 0 581
539 ) 582 end_time = time.process_time() # timer
540 583 print(
541 end_time = time.process_time() #timer 584 "%0.6f read Upstream Map from file [1g_1]"
542 print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer 585 % (end_time - start_time,),
586 file=sys.stderr,
587 ) # timer
543 588
544 upstream_data.index = upstream_map_p_peptide_list 589 upstream_data.index = upstream_map_p_peptide_list
545 590
546 591 end_time = time.process_time() # timer
547 end_time = time.process_time() #timer 592 print(
548 print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer 593 "%0.6f added index to Upstream Map [1g_2]"
549 594 % (end_time - start_time,),
550 595 file=sys.stderr,
551 #trim upstream_data to include only the upstream map columns 596 ) # timer
597
598 # trim upstream_data to include only the upstream map columns
552 old_cols = upstream_data.columns.tolist() 599 old_cols = upstream_data.columns.tolist()
553 i = 0 600 i = 0
554 first_intensity = -1 601 first_intensity = -1
555 last_intensity = -1 602 last_intensity = -1
556 intensity_re = re.compile('Intensity.*') 603 intensity_re = re.compile("Intensity.*")
557 for col_name in old_cols: 604 for col_name in old_cols:
558 m = intensity_re.match(col_name) 605 m = intensity_re.match(col_name)
559 if m: 606 if m:
560 last_intensity = i 607 last_intensity = i
561 if first_intensity == -1: 608 if first_intensity == -1:
562 first_intensity = i 609 first_intensity = i
563 i += 1 610 i += 1
564 #print('last intensity = %d' % last_intensity) 611 # print('last intensity = %d' % last_intensity)
565 col_PKCalpha = last_intensity + 2 612 col_PKCalpha = last_intensity + 2
566 col_firstIntensity = first_intensity 613
567 614 data_in_cols = [old_cols[0]] + old_cols[
568 data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1] 615 first_intensity: last_intensity + 1
616 ]
569 617
570 if upstream_data.empty: 618 if upstream_data.empty:
571 print("upstream_data is empty") 619 print("upstream_data is empty")
572 exit(0) 620 exit(0)
573 621
574 data_in = upstream_data.copy(deep=True)[data_in_cols] 622 data_in = upstream_data.copy(deep=True)[data_in_cols]
575 623
576 # Convert floating-point integers to int64 integers 624 # Convert floating-point integers to int64 integers
577 # ref: https://stackoverflow.com/a/68497603/15509512 625 # ref: https://stackoverflow.com/a/68497603/15509512
578 data_in[list(data_in.columns[1:])] = data_in[ 626 data_in[list(data_in.columns[1:])] = (
579 list(data_in.columns[1:])].astype('float64').apply(np.int64) 627 data_in[list(data_in.columns[1:])]
580 628 .astype("float64")
581 #create another phosphopeptide column that will be used to join later; 629 .apply(np.int64)
630 )
631
632 # create another phosphopeptide column that will be used to join later;
582 # MAY need to change depending on Phosphopeptide column position 633 # MAY need to change depending on Phosphopeptide column position
583 #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] 634 # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
584 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index 635 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
585 636
586 637 end_time = time.process_time() # timer
587 638 print(
588 639 "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]"
589 end_time = time.process_time() #timer 640 % (end_time - start_time,),
590 print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer 641 file=sys.stderr,
642 ) # timer
591 643
592 # Produce a dictionary of metadata for a single phosphopeptide. 644 # Produce a dictionary of metadata for a single phosphopeptide.
593 # This is a replacement of `UniProtInfo_subdict` in the original code. 645 # This is a replacement of `UniProtInfo_subdict` in the original code.
594 def pseq_to_subdict(phospho_pep): 646 def pseq_to_subdict(phospho_pep):
595 #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
596 # Strip "p" from phosphopeptide sequence 647 # Strip "p" from phosphopeptide sequence
597 dephospho_pep = re_phos.sub('',phospho_pep) 648 dephospho_pep = re_phos.sub("", phospho_pep)
598 649
599 # Determine number of phosphoresidues in phosphopeptide 650 # Determine number of phosphoresidues in phosphopeptide
600 numps = len(phospho_pep) - len(dephospho_pep) 651 numps = len(phospho_pep) - len(dephospho_pep)
601 652
602 # Determine location(s) of phosphoresidue(s) in phosphopeptide 653 # Determine location(s) of phosphoresidue(s) in phosphopeptide
603 # (used later for Phosphoresidue, Sequence7, and Sequence10) 654 # (used later for Phosphoresidue, Sequence7, and Sequence10)
604 ploc = [] #list of p locations 655 ploc = [] # list of p locations
605 i = 0 656 i = 0
606 p = phospho_pep 657 p = phospho_pep
607 while i < numps: 658 while i < numps:
608 ploc.append(p.find("p")) 659 ploc.append(p.find("p"))
609 p = p[:p.find("p")] + p[p.find("p")+1:] 660 p = p[: p.find("p")] + p[p.find("p") + 1:]
610 i +=1 661 i += 1
611
612 662
613 # Establish nested dictionary 663 # Establish nested dictionary
614 result = {} 664 result = {}
615 result[SEQUENCE] = [] 665 result[SEQUENCE] = []
616 result[UNIPROT_ID] = [] 666 result[UNIPROT_ID] = []
621 result[SEQUENCE10] = [] 671 result[SEQUENCE10] = []
622 672
623 # Add stripped sequence to dictionary 673 # Add stripped sequence to dictionary
624 result[SEQUENCE].append(dephospho_pep) 674 result[SEQUENCE].append(dephospho_pep)
625 675
626 # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
627 dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]
628
629 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT 676 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
630 ### Caller may elect to: 677 # Caller may elect to:
631 ## try: 678 # try:
632 ## ... 679 # ...
633 ## except PreconditionError as pe: 680 # except PreconditionError as pe:
634 ## print("'{expression}': {message}".format( 681 # print("'{expression}': {message}".format(
635 ## expression = pe.expression, 682 # expression = pe.expression,
636 ## message = pe.message)) 683 # message = pe.message))
637 ## ) 684 # )
638 ## ) 685 # )
639 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: 686 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
640 raise PreconditionError( dephospho_pep, 687 raise PreconditionError(
641 'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT' 688 dephospho_pep,
642 ) 689 "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
690 )
643 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: 691 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
644 raise PreconditionError( dephospho_pep, 692 raise PreconditionError(
645 'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT' 693 dephospho_pep,
646 ) 694 "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
647 if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]: 695 )
648 raise PreconditionError( dephospho_pep, 696 if (
649 "dephosphorylated phosphopeptide does not match " + 697 dephospho_pep
650 "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " + 698 != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
651 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] 699 ):
652 ) 700 raise PreconditionError(
701 dephospho_pep,
702 "dephosphorylated phosphopeptide does not match "
703 + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
704 + PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)],
705 )
653 result[SEQUENCE] = [dephospho_pep] 706 result[SEQUENCE] = [dephospho_pep]
654 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] 707 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[
655 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] 708 (dephospho_pep, UNIPROT_ID)
656 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] 709 ]
657 if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: 710 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[
658 raise PreconditionError( dephospho_pep, 711 (dephospho_pep, DESCRIPTION)
659 'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT' 712 ]
660 ) 713 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[
661 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] 714 (dephospho_pep, GENE_NAME)
662 if len (UniProtSeqList) < 1: 715 ]
663 print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep) 716 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
664 # raise PreconditionError( 717 raise PreconditionError(
665 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", 718 dephospho_pep,
666 # 'value has zero length' 719 "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
667 # ) 720 )
721 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
722 (dephospho_pep, SEQUENCE)
723 ]
724 if len(UniProtSeqList) < 1:
725 print(
726 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
727 % dephospho_pep
728 )
729 # raise PreconditionError(
730 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
731 # 'value has zero length'
732 # )
668 for UniProtSeq in UniProtSeqList: 733 for UniProtSeq in UniProtSeqList:
669 i = 0 734 i = 0
670 phosphoresidues = [] 735 phosphoresidues = []
671 seq7s_set = set() 736 seq7s_set = set()
672 seq7s = [] 737 seq7s = []
673 seq10s_set = set() 738 seq10s_set = set()
674 seq10s = [] 739 seq10s = []
675 while i < len(ploc): 740 while i < len(ploc):
676 start = UniProtSeq.find(dephospho_pep) 741 start = UniProtSeq.find(dephospho_pep)
677 psite = start+ploc[i] #location of phosphoresidue on protein sequence 742 psite = (
678 743 start + ploc[i]
679 #add Phosphoresidue 744 ) # location of phosphoresidue on protein sequence
680 phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1) 745
746 # add Phosphoresidue
747 phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
681 phosphoresidues.append(phosphosite) 748 phosphoresidues.append(phosphosite)
682 749
683 #Add Sequence7 750 # Add Sequence7
684 if psite < 7: #phospho_pep at N terminus 751 if psite < 7: # phospho_pep at N terminus
685 seq7 = str(UniProtSeq)[:psite+8] 752 seq7 = str(UniProtSeq)[: psite + 8]
686 if seq7[psite] == "S": #if phosphosresidue is serine 753 if seq7[psite] == "S": # if phosphosresidue is serine
687 pres = "s" 754 pres = "s"
688 elif seq7[psite] == "T": #if phosphosresidue is threonine 755 elif (
756 seq7[psite] == "T"
757 ): # if phosphosresidue is threonine
689 pres = "t" 758 pres = "t"
690 elif seq7[psite] == "Y": #if phosphoresidue is tyrosine 759 elif (
760 seq7[psite] == "Y"
761 ): # if phosphoresidue is tyrosine
691 pres = "y" 762 pres = "y"
692 else: # if not pSTY 763 else: # if not pSTY
693 pres = "?" 764 pres = "?"
694 seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8] 765 seq7 = (
695 while len(seq7) < 15: #add appropriate number of "_" to the front 766 seq7[:psite] + pres + seq7[psite + 1: psite + 8]
767 )
768 while (
769 len(seq7) < 15
770 ): # add appropriate number of "_" to the front
696 seq7 = "_" + seq7 771 seq7 = "_" + seq7
697 elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus 772 elif (
698 seq7 = str(UniProtSeq)[psite-7:] 773 len(UniProtSeq) - psite < 8
774 ): # phospho_pep at C terminus
775 seq7 = str(UniProtSeq)[psite - 7:]
699 if seq7[7] == "S": 776 if seq7[7] == "S":
700 pres = "s" 777 pres = "s"
701 elif seq7[7] == "T": 778 elif seq7[7] == "T":
702 pres = "t" 779 pres = "t"
703 elif seq7[7] == "Y": 780 elif seq7[7] == "Y":
704 pres = "y" 781 pres = "y"
705 else: 782 else:
706 pres = "?" 783 pres = "?"
707 seq7 = seq7[:7] + pres + seq7[8:] 784 seq7 = seq7[:7] + pres + seq7[8:]
708 while len(seq7) < 15: #add appropriate number of "_" to the back 785 while (
786 len(seq7) < 15
787 ): # add appropriate number of "_" to the back
709 seq7 = seq7 + "_" 788 seq7 = seq7 + "_"
710 else: 789 else:
711 seq7 = str(UniProtSeq)[psite-7:psite+8] 790 seq7 = str(UniProtSeq)[psite - 7: psite + 8]
712 pres = "" #phosphoresidue 791 pres = "" # phosphoresidue
713 if seq7[7] == "S": #if phosphosresidue is serine 792 if seq7[7] == "S": # if phosphosresidue is serine
714 pres = "s" 793 pres = "s"
715 elif seq7[7] == "T": #if phosphosresidue is threonine 794 elif seq7[7] == "T": # if phosphosresidue is threonine
716 pres = "t" 795 pres = "t"
717 elif seq7[7] == "Y": #if phosphoresidue is tyrosine 796 elif seq7[7] == "Y": # if phosphoresidue is tyrosine
718 pres = "y" 797 pres = "y"
719 else: # if not pSTY 798 else: # if not pSTY
720 pres = "?" 799 pres = "?"
721 seq7 = seq7[:7] + pres + seq7[8:] 800 seq7 = seq7[:7] + pres + seq7[8:]
722 if seq7 not in seq7s_set: 801 if seq7 not in seq7s_set:
723 seq7s.append(seq7) 802 seq7s.append(seq7)
724 seq7s_set.add(seq7) 803 seq7s_set.add(seq7)
725 804
726 #add Sequence10 805 # add Sequence10
727 if psite < 10: #phospho_pep at N terminus 806 if psite < 10: # phospho_pep at N terminus
728 seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11] 807 seq10 = (
729 elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus 808 str(UniProtSeq)[:psite]
730 seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:] 809 + "p"
810 + str(UniProtSeq)[psite: psite + 11]
811 )
812 elif (
813 len(UniProtSeq) - psite < 11
814 ): # phospho_pep at C terminus
815 seq10 = (
816 str(UniProtSeq)[psite - 10: psite]
817 + "p"
818 + str(UniProtSeq)[psite:]
819 )
731 else: 820 else:
732 seq10 = str(UniProtSeq)[psite-10:psite+11] 821 seq10 = str(UniProtSeq)[psite - 10: psite + 11]
733 seq10 = seq10[:10] + "p" + seq10[10:] 822 seq10 = seq10[:10] + "p" + seq10[10:]
734 if seq10 not in seq10s_set: 823 if seq10 not in seq10s_set:
735 seq10s.append(seq10) 824 seq10s.append(seq10)
736 seq10s_set.add(seq10) 825 seq10s_set.add(seq10)
737 826
738 i+=1 827 i += 1
739 828
740 result[PHOSPHORESIDUE].append(phosphoresidues) 829 result[PHOSPHORESIDUE].append(phosphoresidues)
741 result[SEQUENCE7].append(seq7s) 830 result[SEQUENCE7].append(seq7s)
742 # result[SEQUENCE10] is a list of lists of strings 831 # result[SEQUENCE10] is a list of lists of strings
743 result[SEQUENCE10].append(seq10s) 832 result[SEQUENCE10].append(seq10s)
744 833
745 834 r = list(
746 835 zip(
747 836 result[UNIPROT_ID],
748 r = list(zip( 837 result[GENE_NAME],
749 result[UNIPROT_ID], 838 result[DESCRIPTION],
750 result[GENE_NAME], 839 result[PHOSPHORESIDUE],
751 result[DESCRIPTION], 840 )
752 result[PHOSPHORESIDUE] 841 )
753 ))
754 # Sort by `UniProt_ID` 842 # Sort by `UniProt_ID`
755 # ref: https://stackoverflow.com//4174955/15509512 843 # ref: https://stackoverflow.com//4174955/15509512
756 s = sorted(r, key=operator.itemgetter(0)) 844 s = sorted(r, key=operator.itemgetter(0))
757 845
758 result[UNIPROT_ID] = [] 846 result[UNIPROT_ID] = []
764 result[UNIPROT_ID].append(r[0]) 852 result[UNIPROT_ID].append(r[0])
765 result[GENE_NAME].append(r[1]) 853 result[GENE_NAME].append(r[1])
766 result[DESCRIPTION].append(r[2]) 854 result[DESCRIPTION].append(r[2])
767 result[PHOSPHORESIDUE].append(r[3]) 855 result[PHOSPHORESIDUE].append(r[3])
768 856
769 857 # convert lists to strings in the dictionary
770 #convert lists to strings in the dictionary 858 for key, value in result.items():
771 for key,value in result.items():
772 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: 859 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
773 result[key] = '; '.join(map(str, value)) 860 result[key] = "; ".join(map(str, value))
774 elif key in [SEQUENCE10]: 861 elif key in [SEQUENCE10]:
775 # result[SEQUENCE10] is a list of lists of strings 862 # result[SEQUENCE10] is a list of lists of strings
776 joined_value = '' 863 joined_value = ""
777 joined_set = set() 864 joined_set = set()
778 sep = '' 865 sep = ""
779 for valL in value: 866 for valL in value:
780 # valL is a list of strings 867 # valL is a list of strings
781 for val in valL: 868 for val in valL:
782 # val is a string 869 # val is a string
783 if val not in joined_set: 870 if val not in joined_set:
784 joined_set.add(val) 871 joined_set.add(val)
785 #joined_value += sep + '; '.join(map(str, val)) 872 # joined_value += sep + '; '.join(map(str, val))
786 joined_value += sep + val 873 joined_value += sep + val
787 sep = '; ' 874 sep = "; "
788 # joined_value is a string 875 # joined_value is a string
789 result[key] = joined_value 876 result[key] = joined_value
790 877
791 878 newstring = "; ".join(
792 newstring = '; '.join( 879 [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
793 [', '.join(l) for l in result[PHOSPHORESIDUE]] 880 )
794 ) 881 # #separate the isoforms in PHOSPHORESIDUE column with ";"
795 ### #separate the isoforms in PHOSPHORESIDUE column with ";" 882 # oldstring = result[PHOSPHORESIDUE]
796 ### oldstring = result[PHOSPHORESIDUE] 883 # oldlist = list(oldstring)
797 ### oldlist = list(oldstring) 884 # newstring = ""
798 ### newstring = "" 885 # i = 0
799 ### i = 0 886 # for e in oldlist:
800 ### for e in oldlist: 887 # if e == ";":
801 ### if e == ";": 888 # if numps > 1:
802 ### if numps > 1: 889 # if i%numps:
803 ### if i%numps: 890 # newstring = newstring + ";"
804 ### newstring = newstring + ";" 891 # else:
805 ### else: 892 # newstring = newstring + ","
806 ### newstring = newstring + "," 893 # else:
807 ### else: 894 # newstring = newstring + ";"
808 ### newstring = newstring + ";" 895 # i +=1
809 ### i +=1 896 # else:
810 ### else: 897 # newstring = newstring + e
811 ### newstring = newstring + e
812 result[PHOSPHORESIDUE] = newstring 898 result[PHOSPHORESIDUE] = newstring
813 899
814 900 # separate sequence7's by |
815 #separate sequence7's by |
816 oldstring = result[SEQUENCE7] 901 oldstring = result[SEQUENCE7]
817 oldlist = oldstring 902 oldlist = oldstring
818 newstring = "" 903 newstring = ""
819 for l in oldlist: 904 for ol in oldlist:
820 for e in l: 905 for e in ol:
821 if e == ";": 906 if e == ";":
822 newstring = newstring + " |" 907 newstring = newstring + " |"
823 elif len(newstring) > 0 and 1 > newstring.count(e): 908 elif len(newstring) > 0 and 1 > newstring.count(e):
824 newstring = newstring + " | " + e 909 newstring = newstring + " | " + e
825 elif 1 > newstring.count(e): 910 elif 1 > newstring.count(e):
826 newstring = newstring + e 911 newstring = newstring + e
827 result[SEQUENCE7] = newstring 912 result[SEQUENCE7] = newstring
828
829 913
830 return [phospho_pep, result] 914 return [phospho_pep, result]
831 915
832 # Construct list of [string, dictionary] lists 916 # Construct list of [string, dictionary] lists
833 # where the dictionary provides the SwissProt metadata for a phosphopeptide 917 # where the dictionary provides the SwissProt metadata for a phosphopeptide
834 result_list = [ 918 result_list = [
835 catch(pseq_to_subdict,psequence) 919 catch(pseq_to_subdict, psequence)
836 for psequence 920 for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
837 in data_in[PHOSPHOPEPTIDE_MATCH] 921 ]
838 ] 922
839 923 end_time = time.process_time() # timer
840 924 print(
841 end_time = time.process_time() #timer 925 "%0.6f added SwissProt annotations to phosphopeptides [B]"
842 print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer 926 % (end_time - start_time,),
927 file=sys.stderr,
928 ) # timer
843 929
844 # Construct dictionary from list of lists 930 # Construct dictionary from list of lists
845 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ 931 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
846 UniProt_Info = { 932 UniProt_Info = {
847 result[0]:result[1] 933 result[0]: result[1]
848 for result 934 for result in result_list
849 in result_list
850 if result is not None 935 if result is not None
851 } 936 }
852 937
853 938 end_time = time.process_time() # timer
854 end_time = time.process_time() #timer 939 print(
855 print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer 940 "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
856 941 % (end_time - start_time,),
857 #cosmetic: add N_A to phosphopeptide rows with no hits 942 file=sys.stderr,
943 ) # timer
944
945 # cosmetic: add N_A to phosphopeptide rows with no hits
858 p_peptide_list = [] 946 p_peptide_list = []
859 for key in UniProt_Info: 947 for key in UniProt_Info:
860 p_peptide_list.append(key) 948 p_peptide_list.append(key)
861 for nestedKey in UniProt_Info[key]: 949 for nestedKey in UniProt_Info[key]:
862 if UniProt_Info[key][nestedKey] == "": 950 if UniProt_Info[key][nestedKey] == "":
863 UniProt_Info[key][nestedKey] = N_A 951 UniProt_Info[key][nestedKey] = N_A
864 952
865 end_time = time.process_time() #timer 953 end_time = time.process_time() # timer
866 print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer 954 print(
867 955 "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
868 #convert UniProt_Info dictionary to dataframe 956 file=sys.stderr,
869 uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info)) 957 ) # timer
870 958
871 #reorder columns to match expected output file 959 # convert UniProt_Info dictionary to dataframe
872 uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too 960 uniprot_df = pandas.DataFrame.transpose(
873 961 pandas.DataFrame.from_dict(UniProt_Info)
962 )
963
964 # reorder columns to match expected output file
965 uniprot_df[
966 PHOSPHOPEPTIDE
967 ] = uniprot_df.index # make index a column too
874 968
875 cols = uniprot_df.columns.tolist() 969 cols = uniprot_df.columns.tolist()
876 #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] 970 # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
877 #uniprot_df = uniprot_df[cols] 971 # uniprot_df = uniprot_df[cols]
878 uniprot_df = uniprot_df[[ 972 uniprot_df = uniprot_df[
879 PHOSPHOPEPTIDE, 973 [
880 SEQUENCE10, 974 PHOSPHOPEPTIDE,
881 SEQUENCE7, 975 SEQUENCE10,
882 GENE_NAME, 976 SEQUENCE7,
883 PHOSPHORESIDUE, 977 GENE_NAME,
884 UNIPROT_ID, 978 PHOSPHORESIDUE,
885 DESCRIPTION 979 UNIPROT_ID,
886 ]] 980 DESCRIPTION,
887 981 ]
888 982 ]
889 end_time = time.process_time() #timer 983
890 print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer 984 end_time = time.process_time() # timer
891 985 print(
892 #concat to split then groupby to collapse 986 "%0.6f reordered columns to match expected output file [1]"
893 seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | ')) 987 % (end_time - start_time,),
894 for _, row in uniprot_df.iterrows()]).reset_index() 988 file=sys.stderr,
895 seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE] 989 ) # timer
990
991 # concat to split then groupby to collapse
992 seq7_df = pandas.concat(
993 [
994 pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
995 for _, row in uniprot_df.iterrows()
996 ]
997 ).reset_index()
998 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
896 999
897 # --- -------------- begin read PSP_Regulatory_sites --------------------------------- 1000 # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
898 #read in PhosphoSitePlus Regulatory Sites dataset 1001 # read in PhosphoSitePlus Regulatory Sites dataset
899 #ACE if (True): 1002 # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
900 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
901 conn = sql.connect(uniprot_sqlite) 1003 conn = sql.connect(uniprot_sqlite)
902 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) 1004 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
903 # Close SwissProt SQLite database 1005 # Close SwissProt SQLite database
904 conn.close() 1006 conn.close()
905 #ACE # Array indexes are zero-based
906 #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
907 #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
908 #ACE , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
909 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also
910 #ACE print(regsites_df)
911 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
912 #ACE else:
913 #ACE regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
914 #ACE SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
915 #ACE #ACE # Array indexes are zero-based
916 #ACE #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language)
917 #ACE #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID'
918 #ACE #ACE , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA'
919 #ACE #ACE , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
920 #ACE #ACE , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES'
921 #ACE #ACE ]
922 #ACE #ACE REGSITE_COL_SITE7AA = 9
923 #ACE #ACE REGSITE_COL_PROTEIN = 1
924 #ACE #ACE REGSITE_COL_DOMAIN = 10
925 #ACE #ACE REGSITE_COL_PMIDs = 15
926
927 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ 1007 # ... -------------- end read PSP_Regulatory_sites ------------------------------------
928 1008
929 1009 # keep only the human entries in dataframe
930 #keep only the human entries in dataframe
931 if len(species) > 0: 1010 if len(species) > 0:
932 print('Limit PhosphoSitesPlus records to species "' + species + '"') 1011 print(
1012 'Limit PhosphoSitesPlus records to species "' + species + '"'
1013 )
933 regsites_df = regsites_df[regsites_df.ORGANISM == species] 1014 regsites_df = regsites_df[regsites_df.ORGANISM == species]
934 1015
935 #merge the seq7 df with the regsites df based off of the sequence7 1016 # merge the seq7 df with the regsites df based off of the sequence7
936 merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left') 1017 merge_df = seq7_df.merge(
937 #ACE print(merge_df.columns.tolist()) #ACE 1018 regsites_df,
938 1019 left_on=SEQUENCE7,
939 #after merging df, select only the columns of interest - note that PROTEIN is absent here 1020 right_on=SITE_PLUSMINUS_7AA_SQL,
940 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]] 1021 how="left",
941 #ACE print(merge_df.columns.tolist()) #ACE 1022 )
942 #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" 1023
943 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="") 1024 # after merging df, select only the columns of interest - note that PROTEIN is absent here
944 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") 1025 merge_df = merge_df[
945 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") 1026 [
946 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") 1027 PHOSPHOPEPTIDE,
947 1028 SEQUENCE7,
948 #remove the columns that were combined 1029 ON_FUNCTION,
949 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]] 1030 ON_PROCESS,
950 1031 ON_PROT_INTERACT,
951 #ACE print(merge_df) #ACE 1032 ON_OTHER_INTERACT,
952 #ACE print(merge_df.columns.tolist()) #ACE 1033 ON_NOTES,
953 1034 ]
954 end_time = time.process_time() #timer 1035 ]
955 print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer 1036 # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
956 1037 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
957 #cosmetic changes to Function Phosphoresidue column 1038 merge_df[ON_PROCESS], sep="; ", na_rep=""
1039 )
1040 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
1041 FUNCTION_PHOSPHORESIDUE
1042 ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
1043 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
1044 FUNCTION_PHOSPHORESIDUE
1045 ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
1046 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
1047 FUNCTION_PHOSPHORESIDUE
1048 ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
1049
1050 # remove the columns that were combined
1051 merge_df = merge_df[
1052 [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
1053 ]
1054
1055 end_time = time.process_time() # timer
1056 print(
1057 "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
1058 file=sys.stderr,
1059 ) # timer
1060
1061 # cosmetic changes to Function Phosphoresidue column
958 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) 1062 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
959 1063
960 end_time = time.process_time() #timer 1064 end_time = time.process_time() # timer
961 print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer 1065 print(
1066 "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
1067 file=sys.stderr,
1068 ) # timer
962 1069
963 i = 0 1070 i = 0
964 while i < len(fp_series): 1071 while i < len(fp_series):
965 #remove the extra ";" so that it looks more professional 1072 # remove the extra ";" so that it looks more professional
966 if fp_series[i] == "; ; ; ; ": #remove ; from empty hits 1073 if fp_series[i] == "; ; ; ; ": # remove ; from empty hits
967 fp_series[i] = "" 1074 fp_series[i] = ""
968 while fp_series[i].endswith("; "): #remove ; from the ends 1075 while fp_series[i].endswith("; "): # remove ; from the ends
969 fp_series[i] = fp_series[i][:-2] 1076 fp_series[i] = fp_series[i][:-2]
970 while fp_series[i].startswith("; "): #remove ; from the beginning 1077 while fp_series[i].startswith("; "): # remove ; from the beginning
971 fp_series[i] = fp_series[i][2:] 1078 fp_series[i] = fp_series[i][2:]
972 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") 1079 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
973 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") 1080 fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
974 fp_series[i] = fp_series[i].replace("; ; ", "; ") 1081 fp_series[i] = fp_series[i].replace("; ; ", "; ")
975 1082
976 #turn blanks into N_A to signify the info was searched for but cannot be found 1083 # turn blanks into N_A to signify the info was searched for but cannot be found
977 if fp_series[i] == "": 1084 if fp_series[i] == "":
978 fp_series[i] = N_A 1085 fp_series[i] = N_A
979 1086
980 i += 1 1087 i += 1
981 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series 1088 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
982 1089
983 end_time = time.process_time() #timer 1090 end_time = time.process_time() # timer
984 print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer 1091 print(
985 1092 "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
986 #merge uniprot df with merge df 1093 file=sys.stderr,
987 uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left") 1094 ) # timer
988 1095
989 #collapse the merged df 1096 # merge uniprot df with merge df
1097 uniprot_regsites_merged_df = uniprot_df.merge(
1098 merge_df,
1099 left_on=PHOSPHOPEPTIDE,
1100 right_on=PHOSPHOPEPTIDE,
1101 how="left",
1102 )
1103
1104 # collapse the merged df
990 uniprot_regsites_collapsed_df = pandas.DataFrame( 1105 uniprot_regsites_collapsed_df = pandas.DataFrame(
991 uniprot_regsites_merged_df 1106 uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
992 .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE] 1107 FUNCTION_PHOSPHORESIDUE
993 .apply(lambda x: ppep_join(x))) 1108 ].apply(lambda x: ppep_join(x))
994 #.apply(lambda x: "%s" % ' | '.join(x))) 1109 )
995 1110 # .apply(lambda x: "%s" % ' | '.join(x)))
996 1111
997 end_time = time.process_time() #timer 1112 end_time = time.process_time() # timer
998 print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer 1113 print(
999 1114 "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
1000 uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column 1115 file=sys.stderr,
1001 1116 ) # timer
1002 1117
1003 #rename columns 1118 uniprot_regsites_collapsed_df[
1004 uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp'] 1119 PHOSPHOPEPTIDE
1005 1120 ] = (
1006 1121 uniprot_regsites_collapsed_df.index
1007 1122 ) # add df index as its own column
1008 #select columns to be merged to uniprot_df 1123
1009 #ACE cols = regsites_df.columns.tolist() 1124 # rename columns
1010 #ACE print(cols) #ACE 1125 uniprot_regsites_collapsed_df.columns = [
1011 #ACE if len(cols) > 8: 1126 FUNCTION_PHOSPHORESIDUE,
1012 #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] 1127 "ppp",
1013 #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] 1128 ]
1014 #ACE print(cols) #ACE 1129
1015 #ACE regsite_merge_df = regsites_df[cols] 1130 end_time = time.process_time() # timer
1016 1131 print(
1017 end_time = time.process_time() #timer 1132 "%0.6f selected columns to be merged to uniprot_df [1e]"
1018 print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer 1133 % (end_time - start_time,),
1019 1134 file=sys.stderr,
1020 #add columns based on Sequence7 matching site_+/-7_AA 1135 ) # timer
1136
1137 # add columns based on Sequence7 matching site_+/-7_AA
1021 uniprot_regsite_df = pandas.merge( 1138 uniprot_regsite_df = pandas.merge(
1022 left=uniprot_df, 1139 left=uniprot_df,
1023 right=uniprot_regsites_collapsed_df, 1140 right=uniprot_regsites_collapsed_df,
1024 how='left', 1141 how="left",
1025 left_on=PHOSPHOPEPTIDE, 1142 left_on=PHOSPHOPEPTIDE,
1026 right_on='ppp') 1143 right_on="ppp",
1027 1144 )
1028 end_time = time.process_time() #timer 1145
1029 print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer 1146 end_time = time.process_time() # timer
1147 print(
1148 "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
1149 % (end_time - start_time,),
1150 file=sys.stderr,
1151 ) # timer
1030 1152
1031 data_in.rename( 1153 data_in.rename(
1032 {'Protein description': PHOSPHOPEPTIDE}, 1154 {"Protein description": PHOSPHOPEPTIDE},
1033 axis='columns', 1155 axis="columns",
1034 inplace=True 1156 inplace=True,
1035 ) 1157 )
1036 1158
1037 1159 # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
1038 1160 res2 = sorted(
1039 sort_start_time = time.process_time() #timer 1161 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
1040 1162 )
1041 #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
1042 res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
1043 data_in = data_in.loc[res2] 1163 data_in = data_in.loc[res2]
1044 1164
1045 end_time = time.process_time() #timer 1165 end_time = time.process_time() # timer
1046 print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer 1166 print(
1047 1167 "%0.6f sorting time [1f]" % (end_time - start_time,),
1048 1168 file=sys.stderr,
1049 1169 ) # timer
1050 cols = [old_cols[0]] + old_cols[col_PKCalpha-1:] 1170
1171 cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
1051 upstream_data = upstream_data[cols] 1172 upstream_data = upstream_data[cols]
1052 1173
1053 end_time = time.process_time() #timer 1174 end_time = time.process_time() # timer
1054 print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer 1175 print(
1055 1176 "%0.6f refactored columns for Upstream Map [1g]"
1056 1177 % (end_time - start_time,),
1057 #### #rename upstream columns in new list 1178 file=sys.stderr,
1058 #### new_cols = [] 1179 ) # timer
1059 #### for name in cols: 1180
1060 #### if "_NetworKIN" in name: 1181 # #rename upstream columns in new list
1061 #### name = name.split("_")[0] 1182 # new_cols = []
1062 #### if " motif" in name: 1183 # for name in cols:
1063 #### name = name.split(" motif")[0] 1184 # if "_NetworKIN" in name:
1064 #### if " sequence " in name: 1185 # name = name.split("_")[0]
1065 #### name = name.split(" sequence")[0] 1186 # if " motif" in name:
1066 #### if "_Phosida" in name: 1187 # name = name.split(" motif")[0]
1067 #### name = name.split("_")[0] 1188 # if " sequence " in name:
1068 #### if "_PhosphoSite" in name: 1189 # name = name.split(" sequence")[0]
1069 #### name = name.split("_")[0] 1190 # if "_Phosida" in name:
1070 #### new_cols.append(name) 1191 # name = name.split("_")[0]
1071 1192 # if "_PhosphoSite" in name:
1072 #rename upstream columns in new list 1193 # name = name.split("_")[0]
1194 # new_cols.append(name)
1195
1196 # rename upstream columns in new list
1073 def col_rename(name): 1197 def col_rename(name):
1074 if "_NetworKIN" in name: 1198 if "_NetworKIN" in name:
1075 name = name.split("_")[0] 1199 name = name.split("_")[0]
1076 if " motif" in name: 1200 if " motif" in name:
1077 name = name.split(" motif")[0] 1201 name = name.split(" motif")[0]
1084 return name 1208 return name
1085 1209
1086 new_cols = [col_rename(col) for col in cols] 1210 new_cols = [col_rename(col) for col in cols]
1087 upstream_data.columns = new_cols 1211 upstream_data.columns = new_cols
1088 1212
1089 1213 end_time = time.process_time() # timer
1090 1214 print(
1091 end_time = time.process_time() #timer 1215 "%0.6f renamed columns for Upstream Map [1h_1]"
1092 print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer 1216 % (end_time - start_time,),
1093 1217 file=sys.stderr,
1218 ) # timer
1094 1219
1095 # Create upstream_data_cast as a copy of upstream_data 1220 # Create upstream_data_cast as a copy of upstream_data
1096 # but with first column substituted by the phosphopeptide sequence 1221 # but with first column substituted by the phosphopeptide sequence
1097 upstream_data_cast = upstream_data.copy() 1222 upstream_data_cast = upstream_data.copy()
1098 new_cols_cast = new_cols 1223 new_cols_cast = new_cols
1099 new_cols_cast[0] = 'p_peptide' 1224 new_cols_cast[0] = "p_peptide"
1100 upstream_data_cast.columns = new_cols_cast 1225 upstream_data_cast.columns = new_cols_cast
1101 upstream_data_cast['p_peptide'] = upstream_data.index 1226 upstream_data_cast["p_peptide"] = upstream_data.index
1102 new_cols_cast0 = new_cols_cast[0]
1103 1227
1104 # --- -------------- begin read upstream_data_melt ------------------------------------ 1228 # --- -------------- begin read upstream_data_melt ------------------------------------
1105 ## ----------- Get melted kinase mapping data from SQLite database (start) ----------- 1229 # ----------- Get melted kinase mapping data from SQLite database (start) -----------
1106 conn = sql.connect(uniprot_sqlite) 1230 conn = sql.connect(uniprot_sqlite)
1107 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) 1231 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
1108 # Close SwissProt SQLite database 1232 # Close SwissProt SQLite database
1109 conn.close() 1233 conn.close()
1110 upstream_data_melt = upstream_data_melt_df.copy() 1234 upstream_data_melt = upstream_data_melt_df.copy()
1111 upstream_data_melt.columns = ['p_peptide', 'characterization', 'X'] 1235 upstream_data_melt.columns = ["p_peptide", "characterization", "X"]
1112 upstream_data_melt['characterization'] = [ 1236 upstream_data_melt["characterization"] = [
1113 col_rename(s) 1237 col_rename(s) for s in upstream_data_melt["characterization"]
1114 for s in upstream_data_melt['characterization'] 1238 ]
1115 ] 1239
1116 1240 print(
1117 print('%0.6f upstream_data_melt_df initially has %d rows' % 1241 "%0.6f upstream_data_melt_df initially has %d rows"
1118 (end_time - start_time, len(upstream_data_melt.axes[0])) 1242 % (end_time - start_time, len(upstream_data_melt.axes[0])),
1119 , file=sys.stderr) 1243 file=sys.stderr,
1244 )
1120 # ref: https://stackoverflow.com/a/27360130/15509512 1245 # ref: https://stackoverflow.com/a/27360130/15509512
1121 # e.g. df.drop(df[df.score < 50].index, inplace=True) 1246 # e.g. df.drop(df[df.score < 50].index, inplace=True)
1122 upstream_data_melt.drop( 1247 upstream_data_melt.drop(
1123 upstream_data_melt[upstream_data_melt.X != 'X'].index, 1248 upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True
1124 inplace = True 1249 )
1125 ) 1250 print(
1126 print('%0.6f upstream_data_melt_df pre-dedup has %d rows' % 1251 "%0.6f upstream_data_melt_df pre-dedup has %d rows"
1127 (end_time - start_time, len(upstream_data_melt.axes[0])) 1252 % (end_time - start_time, len(upstream_data_melt.axes[0])),
1128 , file=sys.stderr) 1253 file=sys.stderr,
1129 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also 1254 )
1130 #ACE print(upstream_data_melt) 1255 # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
1131 ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
1132 # ... -------------- end read upstream_data_melt -------------------------------------- 1256 # ... -------------- end read upstream_data_melt --------------------------------------
1133 1257
1134 end_time = time.process_time() #timer 1258 end_time = time.process_time() # timer
1135 print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer 1259 print(
1260 "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
1261 % (end_time - start_time,),
1262 file=sys.stderr,
1263 ) # timer
1136 # ... end read upstream_data_melt 1264 # ... end read upstream_data_melt
1137 1265
1138 upstream_data_melt_index = upstream_data_melt.index 1266 end_time = time.process_time() # timer
1139 upstream_data_melt_p_peptide = upstream_data_melt['p_peptide'] 1267 print(
1140 1268 "%0.6f indexed melted Upstream Map [1h_2a]"
1141 end_time = time.process_time() #timer 1269 % (end_time - start_time,),
1142 print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer 1270 file=sys.stderr,
1271 ) # timer
1143 1272
1144 upstream_delta_melt_LoL = upstream_data_melt.values.tolist() 1273 upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
1145 1274
1146 melt_dict = {} 1275 melt_dict = {}
1147 for key in upstream_map_p_peptide_list: 1276 for key in upstream_map_p_peptide_list:
1150 for el in upstream_delta_melt_LoL: 1279 for el in upstream_delta_melt_LoL:
1151 (p_peptide, characterization, X) = tuple(el) 1280 (p_peptide, characterization, X) = tuple(el)
1152 if p_peptide in melt_dict: 1281 if p_peptide in melt_dict:
1153 melt_dict[p_peptide].append(characterization) 1282 melt_dict[p_peptide].append(characterization)
1154 else: 1283 else:
1155 exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide)) 1284 exit(
1156 1285 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
1157 1286 % (p_peptide)
1158 end_time = time.process_time() #timer 1287 )
1159 print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer 1288
1160 1289 end_time = time.process_time() # timer
1290 print(
1291 "%0.6f appended peptide characterizations [1h_2b]"
1292 % (end_time - start_time,),
1293 file=sys.stderr,
1294 ) # timer
1161 1295
1162 # for key in upstream_map_p_peptide_list: 1296 # for key in upstream_map_p_peptide_list:
1163 # melt_dict[key] = ' | '.join(melt_dict[key]) 1297 # melt_dict[key] = ' | '.join(melt_dict[key])
1164 1298
1165 for key in upstream_map_p_peptide_list: 1299 for key in upstream_map_p_peptide_list:
1166 melt_dict[key] = melt_join(melt_dict[key]) 1300 melt_dict[key] = melt_join(melt_dict[key])
1167 1301
1168 end_time = time.process_time() #timer 1302 end_time = time.process_time() # timer
1169 print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer 1303 print(
1304 "%0.6f concatenated multiple characterizations [1h_2c]"
1305 % (end_time - start_time,),
1306 file=sys.stderr,
1307 ) # timer
1170 1308
1171 # map_dict is a dictionary of dictionaries 1309 # map_dict is a dictionary of dictionaries
1172 map_dict = {} 1310 map_dict = {}
1173 for key in upstream_map_p_peptide_list: 1311 for key in upstream_map_p_peptide_list:
1174 map_dict[key] = {} 1312 map_dict[key] = {}
1175 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] 1313 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
1176 1314
1177 1315 end_time = time.process_time() # timer
1178 end_time = time.process_time() #timer 1316 print(
1179 print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer 1317 "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
1180 1318 file=sys.stderr,
1181 #convert map_dict to dataframe 1319 ) # timer
1182 map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict)) 1320
1183 map_df["p-peptide"] = map_df.index #make index a column too 1321 # convert map_dict to dataframe
1322 map_df = pandas.DataFrame.transpose(
1323 pandas.DataFrame.from_dict(map_dict)
1324 )
1325 map_df["p-peptide"] = map_df.index # make index a column too
1184 cols_map_df = map_df.columns.tolist() 1326 cols_map_df = map_df.columns.tolist()
1185 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] 1327 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
1186 map_df = map_df[cols_map_df] 1328 map_df = map_df[cols_map_df]
1187 1329
1188 #join map_df to uniprot_regsite_df 1330 # join map_df to uniprot_regsite_df
1189 output_df = uniprot_regsite_df.merge( 1331 output_df = uniprot_regsite_df.merge(
1190 map_df, 1332 map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide"
1191 how="left", 1333 )
1192 left_on=PHOSPHOPEPTIDE,
1193 right_on="p-peptide")
1194 1334
1195 output_df = output_df[ 1335 output_df = output_df[
1196 [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, 1336 [
1197 UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE, 1337 PHOSPHOPEPTIDE,
1198 PUTATIVE_UPSTREAM_DOMAINS 1338 SEQUENCE10,
1199 ] 1339 SEQUENCE7,
1340 GENE_NAME,
1341 PHOSPHORESIDUE,
1342 UNIPROT_ID,
1343 DESCRIPTION,
1344 FUNCTION_PHOSPHORESIDUE,
1345 PUTATIVE_UPSTREAM_DOMAINS,
1200 ] 1346 ]
1201 1347 ]
1202 1348
1203 # cols_output_prelim = output_df.columns.tolist() 1349 # cols_output_prelim = output_df.columns.tolist()
1204 # 1350 #
1205 # print("cols_output_prelim") 1351 # print("cols_output_prelim")
1206 # print(cols_output_prelim) 1352 # print(cols_output_prelim)
1215 # print("cols_output") 1361 # print("cols_output")
1216 # print(cols_output) 1362 # print(cols_output)
1217 # 1363 #
1218 # output_df = output_df[cols_output] 1364 # output_df = output_df[cols_output]
1219 1365
1220 #join output_df back to quantitative columns in data_in df 1366 # join output_df back to quantitative columns in data_in df
1221 quant_cols = data_in.columns.tolist() 1367 quant_cols = data_in.columns.tolist()
1222 quant_cols = quant_cols[1:] 1368 quant_cols = quant_cols[1:]
1223 quant_data = data_in[quant_cols] 1369 quant_data = data_in[quant_cols]
1224 1370
1225 ## ----------- Write merge/filter metadata to SQLite database (start) ----------- 1371 # ----------- Write merge/filter metadata to SQLite database (start) -----------
1226 # Open SwissProt SQLite database 1372 # Open SwissProt SQLite database
1227 conn = sql.connect(output_sqlite) 1373 conn = sql.connect(output_sqlite)
1228 cur = conn.cursor() 1374 cur = conn.cursor()
1229 1375
1230 cur.executescript(MRGFLTR_DDL) 1376 cur.executescript(MRGFLTR_DDL)
1231 1377
1232 cur.execute( 1378 cur.execute(
1233 CITATION_INSERT_STMT, 1379 CITATION_INSERT_STMT,
1234 ('mrgfltr_metadata_view', CITATION_INSERT_PSP) 1380 ("mrgfltr_metadata_view", CITATION_INSERT_PSP),
1235 ) 1381 )
1382 cur.execute(
1383 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP)
1384 )
1236 cur.execute( 1385 cur.execute(
1237 CITATION_INSERT_STMT, 1386 CITATION_INSERT_STMT,
1238 ('mrgfltr_metadata', CITATION_INSERT_PSP) 1387 ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),
1239 ) 1388 )
1240 cur.execute( 1389 cur.execute(
1241 CITATION_INSERT_STMT, 1390 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)
1242 ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF) 1391 )
1243 )
1244 cur.execute(
1245 CITATION_INSERT_STMT,
1246 ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
1247 )
1248 1392
1249 # Read ppep-to-sequence LUT 1393 # Read ppep-to-sequence LUT
1250 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) 1394 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
1251 #ACE ppep_lut_df.info(verbose=True)
1252 # write only metadata for merged/filtered records to SQLite 1395 # write only metadata for merged/filtered records to SQLite
1253 mrgfltr_metadata_df = output_df.copy() 1396 mrgfltr_metadata_df = output_df.copy()
1254 # replace phosphopeptide seq with ppep.id 1397 # replace phosphopeptide seq with ppep.id
1255 mrgfltr_metadata_df = ppep_lut_df.merge( 1398 mrgfltr_metadata_df = ppep_lut_df.merge(
1256 mrgfltr_metadata_df, 1399 mrgfltr_metadata_df,
1257 left_on='ppep_seq', 1400 left_on="ppep_seq",
1258 right_on=PHOSPHOPEPTIDE, 1401 right_on=PHOSPHOPEPTIDE,
1259 how='inner' 1402 how="inner",
1260 ) 1403 )
1261 mrgfltr_metadata_df.drop( 1404 mrgfltr_metadata_df.drop(
1262 columns=[PHOSPHOPEPTIDE, 'ppep_seq'], 1405 columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True
1263 inplace=True 1406 )
1264 ) 1407 # rename columns
1265 #rename columns
1266 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS 1408 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
1267 #ACE mrgfltr_metadata_df.info(verbose=True)
1268 mrgfltr_metadata_df.to_sql( 1409 mrgfltr_metadata_df.to_sql(
1269 'mrgfltr_metadata', 1410 "mrgfltr_metadata",
1270 con=conn, 1411 con=conn,
1271 if_exists='append', 1412 if_exists="append",
1272 index=False, 1413 index=False,
1273 method='multi' 1414 method="multi",
1274 ) 1415 )
1275 1416
1276 # Close SwissProt SQLite database 1417 # Close SwissProt SQLite database
1277 conn.close() 1418 conn.close()
1278 ## ----------- Write merge/filter metadata to SQLite database (finish) ----------- 1419 # ----------- Write merge/filter metadata to SQLite database (finish) -----------
1279 1420
1280 output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH) 1421 output_df = output_df.merge(
1422 quant_data,
1423 how="right",
1424 left_on=PHOSPHOPEPTIDE,
1425 right_on=PHOSPHOPEPTIDE_MATCH,
1426 )
1281 output_cols = output_df.columns.tolist() 1427 output_cols = output_df.columns.tolist()
1282 output_cols = output_cols[:-1] 1428 output_cols = output_cols[:-1]
1283 output_df = output_df[output_cols] 1429 output_df = output_df[output_cols]
1284 1430
1285 #cosmetic changes to Upstream column 1431 # cosmetic changes to Upstream column
1286 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping 1432 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
1433 PUTATIVE_UPSTREAM_DOMAINS
1434 ].fillna(
1435 ""
1436 ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
1287 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) 1437 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
1288 i = 0 1438 i = 0
1289 while i < len(us_series): 1439 while i < len(us_series):
1290 #turn blanks into N_A to signify the info was searched for but cannot be found 1440 # turn blanks into N_A to signify the info was searched for but cannot be found
1291 if us_series[i] == "": 1441 if us_series[i] == "":
1292 us_series[i] = N_A 1442 us_series[i] = N_A
1293 i += 1 1443 i += 1
1294 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series 1444 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
1295 1445
1296 end_time = time.process_time() #timer 1446 end_time = time.process_time() # timer
1297 print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer 1447 print(
1448 "%0.6f establisheed output [3]" % (end_time - start_time,),
1449 file=sys.stderr,
1450 ) # timer
1298 1451
1299 (output_rows, output_cols) = output_df.shape 1452 (output_rows, output_cols) = output_df.shape
1300 1453
1301 #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
1302 output_df = output_df.convert_dtypes(convert_integer=True) 1454 output_df = output_df.convert_dtypes(convert_integer=True)
1303 1455
1304 1456 # Output onto Final CSV file
1305 #Output onto Final CSV file
1306 output_df.to_csv(output_filename_csv, index=False) 1457 output_df.to_csv(output_filename_csv, index=False)
1307 output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False) 1458 output_df.to_csv(
1308 1459 output_filename_tab, quoting=None, sep="\t", index=False
1309 end_time = time.process_time() #timer 1460 )
1310 print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer 1461
1311 1462 end_time = time.process_time() # timer
1312 print('{:>10} phosphopeptides written to output'.format(str(output_rows))) 1463 print(
1313 1464 "%0.6f wrote output [4]" % (end_time - start_time,),
1314 end_time = time.process_time() #timer 1465 file=sys.stderr,
1315 print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer 1466 ) # timer
1316 1467
1317 1468 print(
1318 #Rev. 7/1/2016 1469 "{:>10} phosphopeptides written to output".format(str(output_rows))
1319 #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's 1470 )
1320 #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS 1471
1321 #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ 1472 end_time = time.process_time() # timer
1473 print(
1474 "%0.6f seconds of non-system CPU time were consumed"
1475 % (end_time - start_time,),
1476 file=sys.stderr,
1477 ) # timer
1478
1479 # Rev. 7/1/2016
1480 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
1481 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
1482 # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
1322 # read from SwissProt SQLite database 1483 # read from SwissProt SQLite database
1323 #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper 1484 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
1324 1485
1325 ############################################# 1486 #
1326 # copied from Excel Output Script.ipynb END # 1487 # copied from Excel Output Script.ipynb END #
1327 ############################################# 1488 #
1328 1489
1329 try: 1490 try:
1330 catch(mqpep_getswissprot,) 1491 catch(
1492 mqpep_getswissprot,
1493 )
1331 exit(0) 1494 exit(0)
1332 except Exception as e: 1495 except Exception as e:
1333 exit('Internal error running mqpep_getswissprot(): %s' % (e)) 1496 exit("Internal error running mqpep_getswissprot(): %s" % (e))
1497
1334 1498
1335 if __name__ == "__main__": 1499 if __name__ == "__main__":
1336 __main__() 1500 __main__()
1337