Mercurial > repos > eschen42 > mqppep_anova
comparison mqppep_mrgfltr.py @ 5:d4d531006735 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author | eschen42 |
---|---|
date | Thu, 10 Mar 2022 23:42:48 +0000 |
parents | c1403d18c189 |
children | 922d309640db |
comparison
equal
deleted
inserted
replaced
4:cfc65ae227f8 | 5:d4d531006735 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 # Import the packages needed | 3 # Import the packages needed |
4 import argparse | 4 import argparse |
5 import operator # for operator.itemgetter | |
5 import os.path | 6 import os.path |
6 import sys | 7 import re |
7 | 8 import shutil # for shutil.copyfile(src, dest) |
9 import sqlite3 as sql | |
10 import sys # import the sys module for exc_info | |
11 import time | |
12 import traceback # for formatting stack-trace | |
13 from codecs import getreader as cx_getreader | |
14 | |
15 import numpy as np | |
8 import pandas | 16 import pandas |
9 import re | |
10 import time | |
11 import sqlite3 as sql | |
12 from codecs import getreader as cx_getreader | |
13 import sys | |
14 import numpy as np | |
15 | |
16 # for sorting list of lists using operator.itemgetter | |
17 import operator | |
18 | |
19 # for formatting stack-trace | |
20 import traceback | |
21 | |
22 # for Aho-Corasick search for fixed set of substrings | |
23 import ahocorasick | |
24 import operator | |
25 import hashlib | |
26 | |
27 # for shutil.copyfile(src, dest) | |
28 import shutil | |
29 | 17 |
30 # global constants | 18 # global constants |
31 N_A = 'N/A' | 19 N_A = "N/A" |
20 | |
32 | 21 |
33 # ref: https://stackoverflow.com/a/8915613/15509512 | 22 # ref: https://stackoverflow.com/a/8915613/15509512 |
34 # answers: "How to handle exceptions in a list comprehensions" | 23 # answers: "How to handle exceptions in a list comprehensions" |
35 # usage: | 24 # usage: |
36 # from math import log | 25 # from math import log |
39 # producing: | 28 # producing: |
40 # for <built-in function log> | 29 # for <built-in function log> |
41 # with args (0,) | 30 # with args (0,) |
42 # exception: math domain error | 31 # exception: math domain error |
43 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] | 32 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] |
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``, trapping any exception.

    On success, return the callable's result.  On any exception, print
    the offending callable, its positional arguments, the exception
    text, and a formatted stack trace, then terminate the process with
    exit status -1 (this pipeline treats any failure here as fatal).

    The ``handle`` parameter is retained only for interface
    compatibility with the recipe this was adapted from
    (https://stackoverflow.com/a/8915613/15509512), where the handler's
    result was returned instead of aborting; it is intentionally unused.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        # capture the live exception triple for traceback formatting
        (ty, va, tb) = sys.exc_info()
        print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
        exit(-1)
        # NOTE: no code is reachable after exit(-1); the original's
        # trailing "return None" (a remnant of the handle(e) path) was
        # dead code and has been removed.
45 | |
55 | 46 |
def ppep_join(x):
    """Join the non-N/A entries of *x* with ' | ' separators.

    Entries equal to the module constant N_A are dropped before
    joining.  If the joined result is the empty string (no entries
    survived, or the survivors were all empty strings and there was
    only one), N_A is returned instead.
    """
    joined = " | ".join(entry for entry in x if entry != N_A)
    # Note: the emptiness test is on the JOINED string, not the filtered
    # list — two surviving empty strings still yield the separator and
    # are returned as-is, matching the original behavior.
    return joined if joined != "" else N_A
63 | 54 |
55 | |
def melt_join(x):
    """Join *x* with ' | ', deduplicating case-insensitively.

    For entries that differ only in case, the LAST occurrence's
    original casing is kept (later assignments overwrite earlier ones
    in the lowercase-keyed dict).  Insertion order of first appearance
    is preserved in the output.
    """
    dedup = {}
    for entry in x:
        dedup[entry.lower()] = entry
    return " | ".join(dedup.values())
60 | |
68 | 61 |
69 def __main__(): | 62 def __main__(): |
70 # Parse Command Line | 63 # Parse Command Line |
71 parser = argparse.ArgumentParser( | 64 parser = argparse.ArgumentParser( |
72 description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.' | 65 description="Phopsphoproteomic Enrichment Pipeline Merge and Filter." |
73 ) | 66 ) |
74 | 67 |
75 # inputs: | 68 # inputs: |
76 # Phosphopeptide data for experimental results, including the intensities | 69 # Phosphopeptide data for experimental results, including the intensities |
77 # and the mapping to kinase domains, in tabular format. | 70 # and the mapping to kinase domains, in tabular format. |
78 parser.add_argument( | 71 parser.add_argument( |
79 '--phosphopeptides', '-p', | 72 "--phosphopeptides", |
73 "-p", | |
80 nargs=1, | 74 nargs=1, |
81 required=True, | 75 required=True, |
82 dest='phosphopeptides', | 76 dest="phosphopeptides", |
83 help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format' | 77 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", |
84 ) | 78 ) |
85 # UniProtKB/SwissProt DB input, SQLite | 79 # UniProtKB/SwissProt DB input, SQLite |
86 parser.add_argument( | 80 parser.add_argument( |
87 '--ppep_mapping_db', '-d', | 81 "--ppep_mapping_db", |
82 "-d", | |
88 nargs=1, | 83 nargs=1, |
89 required=True, | 84 required=True, |
90 dest='ppep_mapping_db', | 85 dest="ppep_mapping_db", |
91 help='UniProtKB/SwissProt SQLite Database' | 86 help="UniProtKB/SwissProt SQLite Database", |
92 ) | 87 ) |
93 #ACE # PhosPhositesPlus DB input, csv | |
94 #ACE parser.add_argument( | |
95 #ACE '--psp_regulatory_sites', '-s', | |
96 #ACE nargs=1, | |
97 #ACE required=True, | |
98 #ACE dest='psp_regulatory_sites_csv', | |
99 #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header' | |
100 #ACE ) | |
101 # species to limit records chosed from PhosPhositesPlus | 88 # species to limit records chosed from PhosPhositesPlus |
102 parser.add_argument( | 89 parser.add_argument( |
103 '--species', '-x', | 90 "--species", |
91 "-x", | |
104 nargs=1, | 92 nargs=1, |
105 required=False, | 93 required=False, |
106 default=[], | 94 default=[], |
107 dest='species', | 95 dest="species", |
108 help='limit PhosphoSitePlus records to indicated species (field may be empty)' | 96 help="limit PhosphoSitePlus records to indicated species (field may be empty)", |
109 ) | 97 ) |
110 | 98 |
111 # outputs: | 99 # outputs: |
112 # tabular output | 100 # tabular output |
113 parser.add_argument( | 101 parser.add_argument( |
114 '--mrgfltr_tab', '-o', | 102 "--mrgfltr_tab", |
103 "-o", | |
115 nargs=1, | 104 nargs=1, |
116 required=True, | 105 required=True, |
117 dest='mrgfltr_tab', | 106 dest="mrgfltr_tab", |
118 help='Tabular output file for results' | 107 help="Tabular output file for results", |
119 ) | 108 ) |
120 # CSV output | 109 # CSV output |
121 parser.add_argument( | 110 parser.add_argument( |
122 '--mrgfltr_csv', '-c', | 111 "--mrgfltr_csv", |
112 "-c", | |
123 nargs=1, | 113 nargs=1, |
124 required=True, | 114 required=True, |
125 dest='mrgfltr_csv', | 115 dest="mrgfltr_csv", |
126 help='CSV output file for results' | 116 help="CSV output file for results", |
127 ) | 117 ) |
128 # SQLite output | 118 # SQLite output |
129 parser.add_argument( | 119 parser.add_argument( |
130 '--mrgfltr_sqlite', '-S', | 120 "--mrgfltr_sqlite", |
121 "-S", | |
131 nargs=1, | 122 nargs=1, |
132 required=True, | 123 required=True, |
133 dest='mrgfltr_sqlite', | 124 dest="mrgfltr_sqlite", |
134 help='SQLite output file for results' | 125 help="SQLite output file for results", |
135 ) | 126 ) |
136 | 127 |
137 # "Make it so!" (parse the arguments) | 128 # "Make it so!" (parse the arguments) |
138 options = parser.parse_args() | 129 options = parser.parse_args() |
139 print("options: " + str(options)) | 130 print("options: " + str(options)) |
140 | 131 |
141 # determine phosphopeptide ("upstream map") input tabular file access | 132 # determine phosphopeptide ("upstream map") input tabular file access |
142 if options.phosphopeptides is None: | 133 if options.phosphopeptides is None: |
143 exit('Argument "phosphopeptides" is required but not supplied') | 134 exit('Argument "phosphopeptides" is required but not supplied') |
144 try: | 135 try: |
145 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) | 136 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) |
146 input_file = open(upstream_map_filename_tab, 'r') | 137 input_file = open(upstream_map_filename_tab, "r") |
147 input_file.close() | 138 input_file.close() |
148 except Exception as e: | 139 except Exception as e: |
149 exit('Error parsing phosphopeptides argument: %s' % str(e)) | 140 exit("Error parsing phosphopeptides argument: %s" % str(e)) |
150 | 141 |
151 # determine input SQLite access | 142 # determine input SQLite access |
152 if options.ppep_mapping_db is None: | 143 if options.ppep_mapping_db is None: |
153 exit('Argument "ppep_mapping_db" is required but not supplied') | 144 exit('Argument "ppep_mapping_db" is required but not supplied') |
154 try: | 145 try: |
155 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) | 146 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) |
156 input_file = open(uniprot_sqlite, 'rb') | 147 input_file = open(uniprot_sqlite, "rb") |
157 input_file.close() | 148 input_file.close() |
158 except Exception as e: | 149 except Exception as e: |
159 exit('Error parsing ppep_mapping_db argument: %s' % str(e)) | 150 exit("Error parsing ppep_mapping_db argument: %s" % str(e)) |
160 | 151 |
161 # copy input SQLite dataset to output SQLite dataset | 152 # copy input SQLite dataset to output SQLite dataset |
162 if options.mrgfltr_sqlite is None: | 153 if options.mrgfltr_sqlite is None: |
163 exit('Argument "mrgfltr_sqlite" is required but not supplied') | 154 exit('Argument "mrgfltr_sqlite" is required but not supplied') |
164 try: | 155 try: |
165 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) | 156 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) |
166 shutil.copyfile(uniprot_sqlite, output_sqlite) | 157 shutil.copyfile(uniprot_sqlite, output_sqlite) |
167 except Exception as e: | 158 except Exception as e: |
168 exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) | 159 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) |
169 | |
170 #ACE # determine psp_regulatory_sites CSV access | |
171 #ACE if options.psp_regulatory_sites_csv is None: | |
172 #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied') | |
173 #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv) | |
174 #ACE try: | |
175 #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0]) | |
176 #ACE input_file = open(phosphosite_filename_csv, 'r') | |
177 #ACE input_file.close() | |
178 #ACE except Exception as e: | |
179 #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e)) | |
180 #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv) | |
181 | 160 |
182 # determine species to limit records from PSP_Regulatory_Sites | 161 # determine species to limit records from PSP_Regulatory_Sites |
183 if options.species is None: | 162 if options.species is None: |
184 exit('Argument "species" is required (and may be empty) but not supplied') | 163 exit( |
164 'Argument "species" is required (and may be empty) but not supplied' | |
165 ) | |
185 try: | 166 try: |
186 if len(options.species) > 0: | 167 if len(options.species) > 0: |
187 species = options.species[0] | 168 species = options.species[0] |
188 else: | 169 else: |
189 species = '' | 170 species = "" |
190 except Exception as e: | 171 except Exception as e: |
191 exit('Error parsing species argument: %s' % str(e)) | 172 exit("Error parsing species argument: %s" % str(e)) |
192 | 173 |
193 # determine tabular output destination | 174 # determine tabular output destination |
194 if options.mrgfltr_tab is None: | 175 if options.mrgfltr_tab is None: |
195 exit('Argument "mrgfltr_tab" is required but not supplied') | 176 exit('Argument "mrgfltr_tab" is required but not supplied') |
196 try: | 177 try: |
197 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) | 178 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) |
198 output_file = open(output_filename_tab, 'w') | 179 output_file = open(output_filename_tab, "w") |
199 output_file.close() | 180 output_file.close() |
200 except Exception as e: | 181 except Exception as e: |
201 exit('Error parsing mrgfltr_tab argument: %s' % str(e)) | 182 exit("Error parsing mrgfltr_tab argument: %s" % str(e)) |
202 | 183 |
203 # determine CSV output destination | 184 # determine CSV output destination |
204 if options.mrgfltr_csv is None: | 185 if options.mrgfltr_csv is None: |
205 exit('Argument "mrgfltr_csv" is required but not supplied') | 186 exit('Argument "mrgfltr_csv" is required but not supplied') |
206 try: | 187 try: |
207 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) | 188 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) |
208 output_file = open(output_filename_csv, 'w') | 189 output_file = open(output_filename_csv, "w") |
209 output_file.close() | 190 output_file.close() |
210 except Exception as e: | 191 except Exception as e: |
211 exit('Error parsing mrgfltr_csv argument: %s' % str(e)) | 192 exit("Error parsing mrgfltr_csv argument: %s" % str(e)) |
212 | |
213 | 193 |
214 def mqpep_getswissprot(): | 194 def mqpep_getswissprot(): |
215 | 195 |
216 ############################################### | 196 # |
217 # copied from Excel Output Script.ipynb BEGIN # | 197 # copied from Excel Output Script.ipynb BEGIN # |
218 ############################################### | 198 # |
219 | 199 |
220 ########### String Constants ################# | 200 # String Constants ################# |
221 DEPHOSPHOPEP = 'DephosphoPep' | 201 DEPHOSPHOPEP = "DephosphoPep" |
222 DESCRIPTION = 'Description' | 202 DESCRIPTION = "Description" |
223 FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)' | 203 FUNCTION_PHOSPHORESIDUE = ( |
224 GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB | 204 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" |
225 ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites | 205 ) |
226 ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites | 206 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB |
227 ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites | 207 ON_FUNCTION = ( |
228 ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites | 208 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites |
229 ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites | 209 ) |
230 PHOSPHOPEPTIDE = 'Phosphopeptide' | 210 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites |
231 PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match' | 211 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites |
232 PHOSPHORESIDUE = 'Phosphoresidue' | 212 ON_PROCESS = ( |
233 PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains' | 213 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites |
234 SEQUENCE = 'Sequence' | 214 ) |
235 SEQUENCE10 = 'Sequence10' | 215 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites |
236 SEQUENCE7 = 'Sequence7' | 216 PHOSPHOPEPTIDE = "Phosphopeptide" |
237 SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA' | 217 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" |
238 SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA' | 218 PHOSPHORESIDUE = "Phosphoresidue" |
239 UNIPROT_ID = 'UniProt_ID' | 219 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" |
240 UNIPROT_SEQ_AND_META_SQL = ''' | 220 SEQUENCE = "Sequence" |
221 SEQUENCE10 = "Sequence10" | |
222 SEQUENCE7 = "Sequence7" | |
223 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" | |
224 UNIPROT_ID = "UniProt_ID" | |
225 UNIPROT_SEQ_AND_META_SQL = """ | |
241 select Uniprot_ID, Description, Gene_Name, Sequence, | 226 select Uniprot_ID, Description, Gene_Name, Sequence, |
242 Organism_Name, Organism_ID, PE, SV | 227 Organism_Name, Organism_ID, PE, SV |
243 from UniProtKB | 228 from UniProtKB |
244 order by Sequence, UniProt_ID | 229 order by Sequence, UniProt_ID |
245 ''' | 230 """ |
246 UNIPROT_UNIQUE_SEQ_SQL = ''' | 231 UNIPROT_UNIQUE_SEQ_SQL = """ |
247 select distinct Sequence | 232 select distinct Sequence |
248 from UniProtKB | 233 from UniProtKB |
249 group by Sequence | 234 group by Sequence |
250 ''' | 235 """ |
251 PPEP_PEP_UNIPROTSEQ_SQL = ''' | 236 PPEP_PEP_UNIPROTSEQ_SQL = """ |
252 select distinct phosphopeptide, peptide, sequence | 237 select distinct phosphopeptide, peptide, sequence |
253 from uniprotkb_pep_ppep_view | 238 from uniprotkb_pep_ppep_view |
254 order by sequence | 239 order by sequence |
255 ''' | 240 """ |
256 PPEP_MELT_SQL = ''' | 241 PPEP_MELT_SQL = """ |
257 SELECT DISTINCT | 242 SELECT DISTINCT |
258 phospho_peptide AS 'p_peptide', | 243 phospho_peptide AS 'p_peptide', |
259 kinase_map AS 'characterization', | 244 kinase_map AS 'characterization', |
260 'X' AS 'X' | 245 'X' AS 'X' |
261 FROM ppep_gene_site_view | 246 FROM ppep_gene_site_view |
262 ''' | 247 """ |
263 # CREATE TABLE PSP_Regulatory_site ( | 248 # CREATE TABLE PSP_Regulatory_site ( |
264 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, | 249 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, |
265 # domain TEXT, | 250 # domain TEXT, |
266 # ON_FUNCTION TEXT, | 251 # ON_FUNCTION TEXT, |
267 # ON_PROCESS TEXT, | 252 # ON_PROCESS TEXT, |
268 # ON_PROT_INTERACT TEXT, | 253 # ON_PROT_INTERACT TEXT, |
269 # ON_OTHER_INTERACT TEXT, | 254 # ON_OTHER_INTERACT TEXT, |
270 # notes TEXT, | 255 # notes TEXT, |
271 # organism TEXT | 256 # organism TEXT |
272 # ); | 257 # ); |
273 PSP_REGSITE_SQL = ''' | 258 PSP_REGSITE_SQL = """ |
274 SELECT DISTINCT | 259 SELECT DISTINCT |
275 SITE_PLUSMINUS_7AA , | 260 SITE_PLUSMINUS_7AA , |
276 DOMAIN , | 261 DOMAIN , |
277 ON_FUNCTION , | 262 ON_FUNCTION , |
278 ON_PROCESS , | 263 ON_PROCESS , |
279 ON_PROT_INTERACT , | 264 ON_PROT_INTERACT , |
280 ON_OTHER_INTERACT , | 265 ON_OTHER_INTERACT , |
281 NOTES , | 266 NOTES , |
282 ORGANISM | 267 ORGANISM |
283 FROM PSP_Regulatory_site | 268 FROM PSP_Regulatory_site |
284 ''' | 269 """ |
285 PPEP_ID_SQL =''' | 270 PPEP_ID_SQL = """ |
286 SELECT | 271 SELECT |
287 id AS 'ppep_id', | 272 id AS 'ppep_id', |
288 seq AS 'ppep_seq' | 273 seq AS 'ppep_seq' |
289 FROM ppep | 274 FROM ppep |
290 ''' | 275 """ |
291 MRGFLTR_DDL =''' | 276 MRGFLTR_DDL = """ |
292 DROP VIEW IF EXISTS mrgfltr_metadata_view; | 277 DROP VIEW IF EXISTS mrgfltr_metadata_view; |
293 DROP TABLE IF EXISTS mrgfltr_metadata; | 278 DROP TABLE IF EXISTS mrgfltr_metadata; |
294 CREATE TABLE mrgfltr_metadata | 279 CREATE TABLE mrgfltr_metadata |
295 ( ppep_id INTEGER REFERENCES ppep(id) | 280 ( ppep_id INTEGER REFERENCES ppep(id) |
296 , Sequence10 TEXT | 281 , Sequence10 TEXT |
320 WHERE | 305 WHERE |
321 mrgfltr_metadata.ppep_id = ppep.id | 306 mrgfltr_metadata.ppep_id = ppep.id |
322 ORDER BY | 307 ORDER BY |
323 ppep.seq | 308 ppep.seq |
324 ; | 309 ; |
325 ''' | 310 """ |
326 | 311 |
327 CITATION_INSERT_STMT = ''' | 312 CITATION_INSERT_STMT = """ |
328 INSERT INTO Citation ( | 313 INSERT INTO Citation ( |
329 ObjectName, | 314 ObjectName, |
330 CitationData | 315 CitationData |
331 ) VALUES (?,?) | 316 ) VALUES (?,?) |
332 ''' | 317 """ |
333 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' | 318 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' |
334 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' | 319 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' |
335 | 320 |
336 MRGFLTR_METADATA_COLUMNS = [ | 321 MRGFLTR_METADATA_COLUMNS = [ |
337 'ppep_id', | 322 "ppep_id", |
338 'Sequence10', | 323 "Sequence10", |
339 'Sequence7', | 324 "Sequence7", |
340 'GeneName', | 325 "GeneName", |
341 'Phosphoresidue', | 326 "Phosphoresidue", |
342 'UniProtID', | 327 "UniProtID", |
343 'Description', | 328 "Description", |
344 'FunctionPhosphoresidue', | 329 "FunctionPhosphoresidue", |
345 'PutativeUpstreamDomains' | 330 "PutativeUpstreamDomains", |
346 ] | 331 ] |
347 | 332 |
348 ########### String Constants (end) ############ | 333 # String Constants (end) ############ |
349 | 334 |
350 class Error(Exception): | 335 class Error(Exception): |
351 """Base class for exceptions in this module.""" | 336 """Base class for exceptions in this module.""" |
337 | |
352 pass | 338 pass |
353 | 339 |
354 class PreconditionError(Error): | 340 class PreconditionError(Error): |
355 """Exception raised for errors in the input. | 341 """Exception raised for errors in the input. |
356 | 342 |
361 | 347 |
362 def __init__(self, expression, message): | 348 def __init__(self, expression, message): |
363 self.expression = expression | 349 self.expression = expression |
364 self.message = message | 350 self.message = message |
365 | 351 |
366 #start_time = time.clock() #timer | 352 # start_time = time.clock() #timer |
367 start_time = time.process_time() #timer | 353 start_time = time.process_time() # timer |
368 | 354 |
369 #get keys from upstream tabular file using readline() | 355 # get keys from upstream tabular file using readline() |
370 # ref: https://stackoverflow.com/a/16713581/15509512 | 356 # ref: https://stackoverflow.com/a/16713581/15509512 |
371 # answer to "Use codecs to read file with correct encoding" | 357 # answer to "Use codecs to read file with correct encoding" |
372 file1_encoded = open(upstream_map_filename_tab, 'rb') | 358 file1_encoded = open(upstream_map_filename_tab, "rb") |
373 file1 = cx_getreader("latin-1")(file1_encoded) | 359 file1 = cx_getreader("latin-1")(file1_encoded) |
374 | 360 |
375 count = 0 | 361 count = 0 |
376 upstream_map_p_peptide_list = [] | 362 upstream_map_p_peptide_list = [] |
377 re_tab = re.compile('^[^\t]*') | 363 re_tab = re.compile("^[^\t]*") |
378 while True: | 364 while True: |
379 count += 1 | 365 count += 1 |
380 # Get next line from file | 366 # Get next line from file |
381 line = file1.readline() | 367 line = file1.readline() |
382 # if line is empty | 368 # if line is empty |
388 upstream_map_p_peptide_list.append(m[0]) | 374 upstream_map_p_peptide_list.append(m[0]) |
389 file1.close() | 375 file1.close() |
390 file1_encoded.close() | 376 file1_encoded.close() |
391 | 377 |
392 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed | 378 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed |
393 re_phos = re.compile('p') | 379 re_phos = re.compile("p") |
394 dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ] | 380 |
395 | 381 end_time = time.process_time() # timer |
396 end_time = time.process_time() #timer | 382 print( |
397 print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr) | 383 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), |
398 | 384 file=sys.stderr, |
399 ## ----------- Get SwissProt data from SQLite database (start) ----------- | 385 ) |
386 | |
387 # ----------- Get SwissProt data from SQLite database (start) ----------- | |
400 # build UniProt sequence LUT and list of unique SwissProt sequences | 388 # build UniProt sequence LUT and list of unique SwissProt sequences |
401 | 389 |
402 # Open SwissProt SQLite database | 390 # Open SwissProt SQLite database |
403 conn = sql.connect(uniprot_sqlite) | 391 conn = sql.connect(uniprot_sqlite) |
404 cur = conn.cursor() | 392 cur = conn.cursor() |
405 | 393 |
406 # Set up structures to hold SwissProt data | 394 # Set up structures to hold SwissProt data |
407 | 395 |
408 uniprot_Sequence_List = [] | 396 uniprot_Sequence_List = [] |
409 UniProtSeqLUT = {} | 397 UniProtSeqLUT = {} |
410 | 398 |
411 # Execute query for unique seqs without fetching the results yet | 399 # Execute query for unique seqs without fetching the results yet |
412 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) | 400 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) |
413 | 401 |
414 while batch := uniprot_unique_seq_cur.fetchmany(size=50): | 402 while 1: |
415 if None == batch: | 403 batch = uniprot_unique_seq_cur.fetchmany(size=50) |
404 if not batch: | |
416 # handle case where no records are returned | 405 # handle case where no records are returned |
417 break | 406 break |
418 for row in batch: | 407 for row in batch: |
419 Sequence = row[0] | 408 Sequence = row[0] |
420 UniProtSeqLUT[(Sequence,DESCRIPTION)] = [] | 409 UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] |
421 UniProtSeqLUT[(Sequence,GENE_NAME) ] = [] | 410 UniProtSeqLUT[(Sequence, GENE_NAME)] = [] |
422 UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = [] | 411 UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] |
423 UniProtSeqLUT[ Sequence ] = [] | 412 UniProtSeqLUT[Sequence] = [] |
424 | 413 |
425 # Execute query for seqs and metadata without fetching the results yet | 414 # Execute query for seqs and metadata without fetching the results yet |
426 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) | 415 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) |
427 | 416 |
428 while batch := uniprot_seq_and_meta.fetchmany(size=50): | 417 while 1: |
429 if None == batch: | 418 batch = uniprot_seq_and_meta.fetchmany(size=50) |
430 # handle case where no records are returned | 419 if not batch: |
420 # handle case where no records are returned | |
431 break | 421 break |
432 for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch: | 422 for ( |
423 UniProt_ID, | |
424 Description, | |
425 Gene_Name, | |
426 Sequence, | |
427 OS, | |
428 OX, | |
429 PE, | |
430 SV, | |
431 ) in batch: | |
433 uniprot_Sequence_List.append(Sequence) | 432 uniprot_Sequence_List.append(Sequence) |
434 UniProtSeqLUT[Sequence] = Sequence | 433 UniProtSeqLUT[Sequence] = Sequence |
435 UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID) | 434 UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) |
436 UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name) | 435 UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) |
437 if OS != N_A: | 436 if OS != N_A: |
438 Description += ' OS=' + OS | 437 Description += " OS=" + OS |
439 if OX != N_A: | 438 if OX != N_A: |
440 Description += ' OX=' + str(int(OX)) | 439 Description += " OX=" + str(int(OX)) |
441 if Gene_Name != N_A: | 440 if Gene_Name != N_A: |
442 Description += ' GN=' + Gene_Name | 441 Description += " GN=" + Gene_Name |
443 if PE != N_A: | 442 if PE != N_A: |
444 Description += ' PE=' + PE | 443 Description += " PE=" + PE |
445 if SV != N_A: | 444 if SV != N_A: |
446 Description += ' SV=' + SV | 445 Description += " SV=" + SV |
447 UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description) | 446 UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) |
448 | 447 |
449 # Close SwissProt SQLite database; clean up local variables | 448 # Close SwissProt SQLite database; clean up local variables |
450 conn.close() | 449 conn.close() |
451 Sequence = '' | 450 Sequence = "" |
452 UniProt_ID = '' | 451 UniProt_ID = "" |
453 Description = '' | 452 Description = "" |
454 Gene_Name = '' | 453 Gene_Name = "" |
455 | 454 |
456 ## ----------- Get SwissProt data from SQLite database (finish) ----------- | 455 # ----------- Get SwissProt data from SQLite database (finish) ----------- |
457 | 456 |
458 end_time = time.process_time() #timer | 457 end_time = time.process_time() # timer |
459 print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr) | 458 print( |
460 | 459 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), |
461 ## ----------- Get SwissProt data from SQLite database (start) ----------- | 460 file=sys.stderr, |
462 # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT | 461 ) |
463 #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) ) | 462 |
464 | 463 # ----------- Get SwissProt data from SQLite database (start) ----------- |
465 # Open SwissProt SQLite database | 464 # Open SwissProt SQLite database |
466 conn = sql.connect(uniprot_sqlite) | 465 conn = sql.connect(uniprot_sqlite) |
467 cur = conn.cursor() | 466 cur = conn.cursor() |
468 | 467 |
469 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide | 468 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide |
470 DephosphoPep_UniProtSeq_LUT = {} | 469 DephosphoPep_UniProtSeq_LUT = {} |
471 | 470 |
472 # Set up dictionary to accumulate results | 471 # Set up dictionary to accumulate results |
473 PhosphoPep_UniProtSeq_LUT = {} | 472 PhosphoPep_UniProtSeq_LUT = {} |
474 | 473 |
475 # Execute query for tuples without fetching the results yet | 474 # Execute query for tuples without fetching the results yet |
476 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) | 475 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) |
477 | 476 |
478 while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50): | 477 while 1: |
479 if None == batch: | 478 batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) |
479 if not batch: | |
480 # handle case where no records are returned | 480 # handle case where no records are returned |
481 break | 481 break |
482 for (phospho_pep, dephospho_pep, sequence) in batch: | 482 for (phospho_pep, dephospho_pep, sequence) in batch: |
483 #do interesting stuff here... | 483 # do interesting stuff here... |
484 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep | 484 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep |
485 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep | 485 PhosphoPep_UniProtSeq_LUT[ |
486 (phospho_pep, DEPHOSPHOPEP) | |
487 ] = dephospho_pep | |
486 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | 488 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: |
487 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() | 489 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() |
488 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = [] | 490 DephosphoPep_UniProtSeq_LUT[ |
489 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = [] | 491 (dephospho_pep, DESCRIPTION) |
490 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = [] | 492 ] = [] |
491 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = [] | 493 DephosphoPep_UniProtSeq_LUT[ |
494 (dephospho_pep, GENE_NAME) | |
495 ] = [] | |
496 DephosphoPep_UniProtSeq_LUT[ | |
497 (dephospho_pep, UNIPROT_ID) | |
498 ] = [] | |
499 DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] | |
492 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) | 500 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) |
493 | 501 |
494 #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence)) | 502 if ( |
495 if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]: | 503 sequence |
496 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence) | 504 not in DephosphoPep_UniProtSeq_LUT[ |
505 (dephospho_pep, SEQUENCE) | |
506 ] | |
507 ): | |
508 DephosphoPep_UniProtSeq_LUT[ | |
509 (dephospho_pep, SEQUENCE) | |
510 ].append(sequence) | |
497 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: | 511 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: |
498 if phospho_pep != phospho_pep: | 512 if phospho_pep != phospho_pep: |
499 print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep)) | 513 print( |
514 "phospho_pep:'%s' phospho_pep:'%s'" | |
515 % (phospho_pep, phospho_pep) | |
516 ) | |
500 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | 517 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: |
501 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep | 518 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep |
502 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep | 519 PhosphoPep_UniProtSeq_LUT[ |
503 r = list(zip( | 520 (phospho_pep, DEPHOSPHOPEP) |
504 [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]], | 521 ] = dephospho_pep |
505 [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]], | 522 r = list( |
506 [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]] | 523 zip( |
507 )) | 524 [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], |
525 [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], | |
526 [ | |
527 s | |
528 for s in UniProtSeqLUT[(sequence, DESCRIPTION)] | |
529 ], | |
530 ) | |
531 ) | |
508 # Sort by `UniProt_ID` | 532 # Sort by `UniProt_ID` |
509 # ref: https://stackoverflow.com/a/4174955/15509512 | 533 # ref: https://stackoverflow.com/a/4174955/15509512 |
510 r = sorted(r, key=operator.itemgetter(0)) | 534 r = sorted(r, key=operator.itemgetter(0)) |
511 # Get one tuple for each `phospho_pep` | 535 # Get one tuple for each `phospho_pep` |
512 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] | 536 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] |
513 for (upid, gn, desc) in r: | 537 for (upid, gn, desc) in r: |
514 # Append pseudo-tuple per UniProt_ID but only when it is not present | 538 # Append pseudo-tuple per UniProt_ID but only when it is not present |
515 if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]: | 539 if ( |
516 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid) | 540 upid |
517 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc) | 541 not in DephosphoPep_UniProtSeq_LUT[ |
518 DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn) | 542 (dephospho_pep, UNIPROT_ID) |
543 ] | |
544 ): | |
545 DephosphoPep_UniProtSeq_LUT[ | |
546 (dephospho_pep, UNIPROT_ID) | |
547 ].append(upid) | |
548 DephosphoPep_UniProtSeq_LUT[ | |
549 (dephospho_pep, DESCRIPTION) | |
550 ].append(desc) | |
551 DephosphoPep_UniProtSeq_LUT[ | |
552 (dephospho_pep, GENE_NAME) | |
553 ].append(gn) | |
519 | 554 |
520 # Close SwissProt SQLite database; clean up local variables | 555 # Close SwissProt SQLite database; clean up local variables |
521 conn.close() | 556 conn.close() |
522 # wipe local variables | 557 # wipe local variables |
523 phospho_pep = dephospho_pep = sequence = 0 | 558 phospho_pep = dephospho_pep = sequence = 0 |
524 upid = gn = desc = r = '' | 559 upid = gn = desc = r = "" |
525 | 560 |
526 ## ----------- Get SwissProt data from SQLite database (finish) ----------- | 561 # ----------- Get SwissProt data from SQLite database (finish) ----------- |
527 | 562 |
528 end_time = time.process_time() #timer | 563 end_time = time.process_time() # timer |
529 print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr) | 564 print( |
530 | 565 "%0.6f finished reading and decoding '%s' [0.4]" |
531 print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list)))) | 566 % (end_time - start_time, upstream_map_filename_tab), |
532 | 567 file=sys.stderr, |
533 #Read in Upstream tabular file | 568 ) |
569 | |
570 print( | |
571 "{:>10} unique upstream phosphopeptides tested".format( | |
572 str(len(upstream_map_p_peptide_list)) | |
573 ) | |
574 ) | |
575 | |
576 # Read in Upstream tabular file | |
534 # We are discarding the intensity data; so read it as text | 577 # We are discarding the intensity data; so read it as text |
535 upstream_data = pandas.read_table( | 578 upstream_data = pandas.read_table( |
536 upstream_map_filename_tab, | 579 upstream_map_filename_tab, dtype="str", index_col=0 |
537 dtype='str', | 580 ) |
538 index_col = 0 | 581 |
539 ) | 582 end_time = time.process_time() # timer |
540 | 583 print( |
541 end_time = time.process_time() #timer | 584 "%0.6f read Upstream Map from file [1g_1]" |
542 print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer | 585 % (end_time - start_time,), |
586 file=sys.stderr, | |
587 ) # timer | |
543 | 588 |
544 upstream_data.index = upstream_map_p_peptide_list | 589 upstream_data.index = upstream_map_p_peptide_list |
545 | 590 |
546 | 591 end_time = time.process_time() # timer |
547 end_time = time.process_time() #timer | 592 print( |
548 print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer | 593 "%0.6f added index to Upstream Map [1g_2]" |
549 | 594 % (end_time - start_time,), |
550 | 595 file=sys.stderr, |
551 #trim upstream_data to include only the upstream map columns | 596 ) # timer |
597 | |
598 # trim upstream_data to include only the upstream map columns | |
552 old_cols = upstream_data.columns.tolist() | 599 old_cols = upstream_data.columns.tolist() |
553 i = 0 | 600 i = 0 |
554 first_intensity = -1 | 601 first_intensity = -1 |
555 last_intensity = -1 | 602 last_intensity = -1 |
556 intensity_re = re.compile('Intensity.*') | 603 intensity_re = re.compile("Intensity.*") |
557 for col_name in old_cols: | 604 for col_name in old_cols: |
558 m = intensity_re.match(col_name) | 605 m = intensity_re.match(col_name) |
559 if m: | 606 if m: |
560 last_intensity = i | 607 last_intensity = i |
561 if first_intensity == -1: | 608 if first_intensity == -1: |
562 first_intensity = i | 609 first_intensity = i |
563 i += 1 | 610 i += 1 |
564 #print('last intensity = %d' % last_intensity) | 611 # print('last intensity = %d' % last_intensity) |
565 col_PKCalpha = last_intensity + 2 | 612 col_PKCalpha = last_intensity + 2 |
566 col_firstIntensity = first_intensity | 613 |
567 | 614 data_in_cols = [old_cols[0]] + old_cols[ |
568 data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1] | 615 first_intensity: last_intensity + 1 |
616 ] | |
569 | 617 |
570 if upstream_data.empty: | 618 if upstream_data.empty: |
571 print("upstream_data is empty") | 619 print("upstream_data is empty") |
572 exit(0) | 620 exit(0) |
573 | 621 |
574 data_in = upstream_data.copy(deep=True)[data_in_cols] | 622 data_in = upstream_data.copy(deep=True)[data_in_cols] |
575 | 623 |
576 # Convert floating-point integers to int64 integers | 624 # Convert floating-point integers to int64 integers |
577 # ref: https://stackoverflow.com/a/68497603/15509512 | 625 # ref: https://stackoverflow.com/a/68497603/15509512 |
578 data_in[list(data_in.columns[1:])] = data_in[ | 626 data_in[list(data_in.columns[1:])] = ( |
579 list(data_in.columns[1:])].astype('float64').apply(np.int64) | 627 data_in[list(data_in.columns[1:])] |
580 | 628 .astype("float64") |
581 #create another phosphopeptide column that will be used to join later; | 629 .apply(np.int64) |
630 ) | |
631 | |
632 # create another phosphopeptide column that will be used to join later; | |
582 # MAY need to change depending on Phosphopeptide column position | 633 # MAY need to change depending on Phosphopeptide column position |
583 #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] | 634 # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] |
584 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index | 635 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index |
585 | 636 |
586 | 637 end_time = time.process_time() # timer |
587 | 638 print( |
588 | 639 "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" |
589 end_time = time.process_time() #timer | 640 % (end_time - start_time,), |
590 print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer | 641 file=sys.stderr, |
642 ) # timer | |
591 | 643 |
592 # Produce a dictionary of metadata for a single phosphopeptide. | 644 # Produce a dictionary of metadata for a single phosphopeptide. |
593 # This is a replacement of `UniProtInfo_subdict` in the original code. | 645 # This is a replacement of `UniProtInfo_subdict` in the original code. |
594 def pseq_to_subdict(phospho_pep): | 646 def pseq_to_subdict(phospho_pep): |
595 #ACE print("calling pseq_to_subdict, %s" % phospho_pep); | |
596 # Strip "p" from phosphopeptide sequence | 647 # Strip "p" from phosphopeptide sequence |
597 dephospho_pep = re_phos.sub('',phospho_pep) | 648 dephospho_pep = re_phos.sub("", phospho_pep) |
598 | 649 |
599 # Determine number of phosphoresidues in phosphopeptide | 650 # Determine number of phosphoresidues in phosphopeptide |
600 numps = len(phospho_pep) - len(dephospho_pep) | 651 numps = len(phospho_pep) - len(dephospho_pep) |
601 | 652 |
602 # Determine location(s) of phosphoresidue(s) in phosphopeptide | 653 # Determine location(s) of phosphoresidue(s) in phosphopeptide |
603 # (used later for Phosphoresidue, Sequence7, and Sequence10) | 654 # (used later for Phosphoresidue, Sequence7, and Sequence10) |
604 ploc = [] #list of p locations | 655 ploc = [] # list of p locations |
605 i = 0 | 656 i = 0 |
606 p = phospho_pep | 657 p = phospho_pep |
607 while i < numps: | 658 while i < numps: |
608 ploc.append(p.find("p")) | 659 ploc.append(p.find("p")) |
609 p = p[:p.find("p")] + p[p.find("p")+1:] | 660 p = p[: p.find("p")] + p[p.find("p") + 1:] |
610 i +=1 | 661 i += 1 |
611 | |
612 | 662 |
613 # Establish nested dictionary | 663 # Establish nested dictionary |
614 result = {} | 664 result = {} |
615 result[SEQUENCE] = [] | 665 result[SEQUENCE] = [] |
616 result[UNIPROT_ID] = [] | 666 result[UNIPROT_ID] = [] |
621 result[SEQUENCE10] = [] | 671 result[SEQUENCE10] = [] |
622 | 672 |
623 # Add stripped sequence to dictionary | 673 # Add stripped sequence to dictionary |
624 result[SEQUENCE].append(dephospho_pep) | 674 result[SEQUENCE].append(dephospho_pep) |
625 | 675 |
626 # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT | |
627 dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep] | |
628 | |
629 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT | 676 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT |
630 ### Caller may elect to: | 677 # Caller may elect to: |
631 ## try: | 678 # try: |
632 ## ... | 679 # ... |
633 ## except PreconditionError as pe: | 680 # except PreconditionError as pe: |
634 ## print("'{expression}': {message}".format( | 681 # print("'{expression}': {message}".format( |
635 ## expression = pe.expression, | 682 # expression = pe.expression, |
636 ## message = pe.message)) | 683 # message = pe.message)) |
637 ## ) | 684 # ) |
638 ## ) | 685 # ) |
639 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | 686 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: |
640 raise PreconditionError( dephospho_pep, | 687 raise PreconditionError( |
641 'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT' | 688 dephospho_pep, |
642 ) | 689 "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", |
690 ) | |
643 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | 691 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: |
644 raise PreconditionError( dephospho_pep, | 692 raise PreconditionError( |
645 'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT' | 693 dephospho_pep, |
646 ) | 694 "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", |
647 if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]: | 695 ) |
648 raise PreconditionError( dephospho_pep, | 696 if ( |
649 "dephosphorylated phosphopeptide does not match " + | 697 dephospho_pep |
650 "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " + | 698 != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] |
651 PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] | 699 ): |
652 ) | 700 raise PreconditionError( |
701 dephospho_pep, | |
702 "dephosphorylated phosphopeptide does not match " | |
703 + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " | |
704 + PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)], | |
705 ) | |
653 result[SEQUENCE] = [dephospho_pep] | 706 result[SEQUENCE] = [dephospho_pep] |
654 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] | 707 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ |
655 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] | 708 (dephospho_pep, UNIPROT_ID) |
656 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] | 709 ] |
657 if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: | 710 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[ |
658 raise PreconditionError( dephospho_pep, | 711 (dephospho_pep, DESCRIPTION) |
659 'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT' | 712 ] |
660 ) | 713 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[ |
661 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] | 714 (dephospho_pep, GENE_NAME) |
662 if len (UniProtSeqList) < 1: | 715 ] |
663 print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep) | 716 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: |
664 # raise PreconditionError( | 717 raise PreconditionError( |
665 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", | 718 dephospho_pep, |
666 # 'value has zero length' | 719 "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", |
667 # ) | 720 ) |
721 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ | |
722 (dephospho_pep, SEQUENCE) | |
723 ] | |
724 if len(UniProtSeqList) < 1: | |
725 print( | |
726 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" | |
727 % dephospho_pep | |
728 ) | |
729 # raise PreconditionError( | |
730 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", | |
731 # 'value has zero length' | |
732 # ) | |
668 for UniProtSeq in UniProtSeqList: | 733 for UniProtSeq in UniProtSeqList: |
669 i = 0 | 734 i = 0 |
670 phosphoresidues = [] | 735 phosphoresidues = [] |
671 seq7s_set = set() | 736 seq7s_set = set() |
672 seq7s = [] | 737 seq7s = [] |
673 seq10s_set = set() | 738 seq10s_set = set() |
674 seq10s = [] | 739 seq10s = [] |
675 while i < len(ploc): | 740 while i < len(ploc): |
676 start = UniProtSeq.find(dephospho_pep) | 741 start = UniProtSeq.find(dephospho_pep) |
677 psite = start+ploc[i] #location of phosphoresidue on protein sequence | 742 psite = ( |
678 | 743 start + ploc[i] |
679 #add Phosphoresidue | 744 ) # location of phosphoresidue on protein sequence |
680 phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1) | 745 |
746 # add Phosphoresidue | |
747 phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1) | |
681 phosphoresidues.append(phosphosite) | 748 phosphoresidues.append(phosphosite) |
682 | 749 |
683 #Add Sequence7 | 750 # Add Sequence7 |
684 if psite < 7: #phospho_pep at N terminus | 751 if psite < 7: # phospho_pep at N terminus |
685 seq7 = str(UniProtSeq)[:psite+8] | 752 seq7 = str(UniProtSeq)[: psite + 8] |
686 if seq7[psite] == "S": #if phosphoresidue is serine | 753 if seq7[psite] == "S": # if phosphoresidue is serine |
687 pres = "s" | 754 pres = "s" |
688 elif seq7[psite] == "T": #if phosphosresidue is threonine | 755 elif ( |
756 seq7[psite] == "T" | |
757 ): # if phosphosresidue is threonine | |
689 pres = "t" | 758 pres = "t" |
690 elif seq7[psite] == "Y": #if phosphoresidue is tyrosine | 759 elif ( |
760 seq7[psite] == "Y" | |
761 ): # if phosphoresidue is tyrosine | |
691 pres = "y" | 762 pres = "y" |
692 else: # if not pSTY | 763 else: # if not pSTY |
693 pres = "?" | 764 pres = "?" |
694 seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8] | 765 seq7 = ( |
695 while len(seq7) < 15: #add appropriate number of "_" to the front | 766 seq7[:psite] + pres + seq7[psite + 1: psite + 8] |
767 ) | |
768 while ( | |
769 len(seq7) < 15 | |
770 ): # add appropriate number of "_" to the front | |
696 seq7 = "_" + seq7 | 771 seq7 = "_" + seq7 |
697 elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus | 772 elif ( |
698 seq7 = str(UniProtSeq)[psite-7:] | 773 len(UniProtSeq) - psite < 8 |
774 ): # phospho_pep at C terminus | |
775 seq7 = str(UniProtSeq)[psite - 7:] | |
699 if seq7[7] == "S": | 776 if seq7[7] == "S": |
700 pres = "s" | 777 pres = "s" |
701 elif seq7[7] == "T": | 778 elif seq7[7] == "T": |
702 pres = "t" | 779 pres = "t" |
703 elif seq7[7] == "Y": | 780 elif seq7[7] == "Y": |
704 pres = "y" | 781 pres = "y" |
705 else: | 782 else: |
706 pres = "?" | 783 pres = "?" |
707 seq7 = seq7[:7] + pres + seq7[8:] | 784 seq7 = seq7[:7] + pres + seq7[8:] |
708 while len(seq7) < 15: #add appropriate number of "_" to the back | 785 while ( |
786 len(seq7) < 15 | |
787 ): # add appropriate number of "_" to the back | |
709 seq7 = seq7 + "_" | 788 seq7 = seq7 + "_" |
710 else: | 789 else: |
711 seq7 = str(UniProtSeq)[psite-7:psite+8] | 790 seq7 = str(UniProtSeq)[psite - 7: psite + 8] |
712 pres = "" #phosphoresidue | 791 pres = "" # phosphoresidue |
713 if seq7[7] == "S": #if phosphoresidue is serine | 792 if seq7[7] == "S": # if phosphoresidue is serine |
714 pres = "s" | 793 pres = "s" |
715 elif seq7[7] == "T": #if phosphoresidue is threonine | 794 elif seq7[7] == "T": # if phosphoresidue is threonine |
716 pres = "t" | 795 pres = "t" |
717 elif seq7[7] == "Y": #if phosphoresidue is tyrosine | 796 elif seq7[7] == "Y": # if phosphoresidue is tyrosine |
718 pres = "y" | 797 pres = "y" |
719 else: # if not pSTY | 798 else: # if not pSTY |
720 pres = "?" | 799 pres = "?" |
721 seq7 = seq7[:7] + pres + seq7[8:] | 800 seq7 = seq7[:7] + pres + seq7[8:] |
722 if seq7 not in seq7s_set: | 801 if seq7 not in seq7s_set: |
723 seq7s.append(seq7) | 802 seq7s.append(seq7) |
724 seq7s_set.add(seq7) | 803 seq7s_set.add(seq7) |
725 | 804 |
726 #add Sequence10 | 805 # add Sequence10 |
727 if psite < 10: #phospho_pep at N terminus | 806 if psite < 10: # phospho_pep at N terminus |
728 seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11] | 807 seq10 = ( |
729 elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus | 808 str(UniProtSeq)[:psite] |
730 seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:] | 809 + "p" |
810 + str(UniProtSeq)[psite: psite + 11] | |
811 ) | |
812 elif ( | |
813 len(UniProtSeq) - psite < 11 | |
814 ): # phospho_pep at C terminus | |
815 seq10 = ( | |
816 str(UniProtSeq)[psite - 10: psite] | |
817 + "p" | |
818 + str(UniProtSeq)[psite:] | |
819 ) | |
731 else: | 820 else: |
732 seq10 = str(UniProtSeq)[psite-10:psite+11] | 821 seq10 = str(UniProtSeq)[psite - 10: psite + 11] |
733 seq10 = seq10[:10] + "p" + seq10[10:] | 822 seq10 = seq10[:10] + "p" + seq10[10:] |
734 if seq10 not in seq10s_set: | 823 if seq10 not in seq10s_set: |
735 seq10s.append(seq10) | 824 seq10s.append(seq10) |
736 seq10s_set.add(seq10) | 825 seq10s_set.add(seq10) |
737 | 826 |
738 i+=1 | 827 i += 1 |
739 | 828 |
740 result[PHOSPHORESIDUE].append(phosphoresidues) | 829 result[PHOSPHORESIDUE].append(phosphoresidues) |
741 result[SEQUENCE7].append(seq7s) | 830 result[SEQUENCE7].append(seq7s) |
742 # result[SEQUENCE10] is a list of lists of strings | 831 # result[SEQUENCE10] is a list of lists of strings |
743 result[SEQUENCE10].append(seq10s) | 832 result[SEQUENCE10].append(seq10s) |
744 | 833 |
745 | 834 r = list( |
746 | 835 zip( |
747 | 836 result[UNIPROT_ID], |
748 r = list(zip( | 837 result[GENE_NAME], |
749 result[UNIPROT_ID], | 838 result[DESCRIPTION], |
750 result[GENE_NAME], | 839 result[PHOSPHORESIDUE], |
751 result[DESCRIPTION], | 840 ) |
752 result[PHOSPHORESIDUE] | 841 ) |
753 )) | |
754 # Sort by `UniProt_ID` | 842 # Sort by `UniProt_ID` |
755 # ref: https://stackoverflow.com/a/4174955/15509512 | 843 # ref: https://stackoverflow.com/a/4174955/15509512 |
756 s = sorted(r, key=operator.itemgetter(0)) | 844 s = sorted(r, key=operator.itemgetter(0)) |
757 | 845 |
758 result[UNIPROT_ID] = [] | 846 result[UNIPROT_ID] = [] |
764 result[UNIPROT_ID].append(r[0]) | 852 result[UNIPROT_ID].append(r[0]) |
765 result[GENE_NAME].append(r[1]) | 853 result[GENE_NAME].append(r[1]) |
766 result[DESCRIPTION].append(r[2]) | 854 result[DESCRIPTION].append(r[2]) |
767 result[PHOSPHORESIDUE].append(r[3]) | 855 result[PHOSPHORESIDUE].append(r[3]) |
768 | 856 |
769 | 857 # convert lists to strings in the dictionary |
770 #convert lists to strings in the dictionary | 858 for key, value in result.items(): |
771 for key,value in result.items(): | |
772 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: | 859 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: |
773 result[key] = '; '.join(map(str, value)) | 860 result[key] = "; ".join(map(str, value)) |
774 elif key in [SEQUENCE10]: | 861 elif key in [SEQUENCE10]: |
775 # result[SEQUENCE10] is a list of lists of strings | 862 # result[SEQUENCE10] is a list of lists of strings |
776 joined_value = '' | 863 joined_value = "" |
777 joined_set = set() | 864 joined_set = set() |
778 sep = '' | 865 sep = "" |
779 for valL in value: | 866 for valL in value: |
780 # valL is a list of strings | 867 # valL is a list of strings |
781 for val in valL: | 868 for val in valL: |
782 # val is a string | 869 # val is a string |
783 if val not in joined_set: | 870 if val not in joined_set: |
784 joined_set.add(val) | 871 joined_set.add(val) |
785 #joined_value += sep + '; '.join(map(str, val)) | 872 # joined_value += sep + '; '.join(map(str, val)) |
786 joined_value += sep + val | 873 joined_value += sep + val |
787 sep = '; ' | 874 sep = "; " |
788 # joined_value is a string | 875 # joined_value is a string |
789 result[key] = joined_value | 876 result[key] = joined_value |
790 | 877 |
791 | 878 newstring = "; ".join( |
792 newstring = '; '.join( | 879 [", ".join(prez) for prez in result[PHOSPHORESIDUE]] |
793 [', '.join(l) for l in result[PHOSPHORESIDUE]] | 880 ) |
794 ) | 881 # #separate the isoforms in PHOSPHORESIDUE column with ";" |
795 ### #separate the isoforms in PHOSPHORESIDUE column with ";" | 882 # oldstring = result[PHOSPHORESIDUE] |
796 ### oldstring = result[PHOSPHORESIDUE] | 883 # oldlist = list(oldstring) |
797 ### oldlist = list(oldstring) | 884 # newstring = "" |
798 ### newstring = "" | 885 # i = 0 |
799 ### i = 0 | 886 # for e in oldlist: |
800 ### for e in oldlist: | 887 # if e == ";": |
801 ### if e == ";": | 888 # if numps > 1: |
802 ### if numps > 1: | 889 # if i%numps: |
803 ### if i%numps: | 890 # newstring = newstring + ";" |
804 ### newstring = newstring + ";" | 891 # else: |
805 ### else: | 892 # newstring = newstring + "," |
806 ### newstring = newstring + "," | 893 # else: |
807 ### else: | 894 # newstring = newstring + ";" |
808 ### newstring = newstring + ";" | 895 # i +=1 |
809 ### i +=1 | 896 # else: |
810 ### else: | 897 # newstring = newstring + e |
811 ### newstring = newstring + e | |
812 result[PHOSPHORESIDUE] = newstring | 898 result[PHOSPHORESIDUE] = newstring |
813 | 899 |
814 | 900 # separate sequence7's by | |
815 #separate sequence7's by | | |
816 oldstring = result[SEQUENCE7] | 901 oldstring = result[SEQUENCE7] |
817 oldlist = oldstring | 902 oldlist = oldstring |
818 newstring = "" | 903 newstring = "" |
819 for l in oldlist: | 904 for ol in oldlist: |
820 for e in l: | 905 for e in ol: |
821 if e == ";": | 906 if e == ";": |
822 newstring = newstring + " |" | 907 newstring = newstring + " |" |
823 elif len(newstring) > 0 and 1 > newstring.count(e): | 908 elif len(newstring) > 0 and 1 > newstring.count(e): |
824 newstring = newstring + " | " + e | 909 newstring = newstring + " | " + e |
825 elif 1 > newstring.count(e): | 910 elif 1 > newstring.count(e): |
826 newstring = newstring + e | 911 newstring = newstring + e |
827 result[SEQUENCE7] = newstring | 912 result[SEQUENCE7] = newstring |
828 | |
829 | 913 |
830 return [phospho_pep, result] | 914 return [phospho_pep, result] |
831 | 915 |
832 # Construct list of [string, dictionary] lists | 916 # Construct list of [string, dictionary] lists |
833 # where the dictionary provides the SwissProt metadata for a phosphopeptide | 917 # where the dictionary provides the SwissProt metadata for a phosphopeptide |
834 result_list = [ | 918 result_list = [ |
835 catch(pseq_to_subdict,psequence) | 919 catch(pseq_to_subdict, psequence) |
836 for psequence | 920 for psequence in data_in[PHOSPHOPEPTIDE_MATCH] |
837 in data_in[PHOSPHOPEPTIDE_MATCH] | 921 ] |
838 ] | 922 |
839 | 923 end_time = time.process_time() # timer |
840 | 924 print( |
841 end_time = time.process_time() #timer | 925 "%0.6f added SwissProt annotations to phosphopeptides [B]" |
842 print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer | 926 % (end_time - start_time,), |
927 file=sys.stderr, | |
928 ) # timer | |
843 | 929 |
844 # Construct dictionary from list of lists | 930 # Construct dictionary from list of lists |
845 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ | 931 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ |
846 UniProt_Info = { | 932 UniProt_Info = { |
847 result[0]:result[1] | 933 result[0]: result[1] |
848 for result | 934 for result in result_list |
849 in result_list | |
850 if result is not None | 935 if result is not None |
851 } | 936 } |
852 | 937 |
853 | 938 end_time = time.process_time() # timer |
854 end_time = time.process_time() #timer | 939 print( |
855 print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer | 940 "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" |
856 | 941 % (end_time - start_time,), |
857 #cosmetic: add N_A to phosphopeptide rows with no hits | 942 file=sys.stderr, |
943 ) # timer | |
944 | |
945 # cosmetic: add N_A to phosphopeptide rows with no hits | |
858 p_peptide_list = [] | 946 p_peptide_list = [] |
859 for key in UniProt_Info: | 947 for key in UniProt_Info: |
860 p_peptide_list.append(key) | 948 p_peptide_list.append(key) |
861 for nestedKey in UniProt_Info[key]: | 949 for nestedKey in UniProt_Info[key]: |
862 if UniProt_Info[key][nestedKey] == "": | 950 if UniProt_Info[key][nestedKey] == "": |
863 UniProt_Info[key][nestedKey] = N_A | 951 UniProt_Info[key][nestedKey] = N_A |
864 | 952 |
865 end_time = time.process_time() #timer | 953 end_time = time.process_time() # timer |
866 print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer | 954 print( |
867 | 955 "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), |
868 #convert UniProt_Info dictionary to dataframe | 956 file=sys.stderr, |
869 uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info)) | 957 ) # timer |
870 | 958 |
871 #reorder columns to match expected output file | 959 # convert UniProt_Info dictionary to dataframe |
872 uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too | 960 uniprot_df = pandas.DataFrame.transpose( |
873 | 961 pandas.DataFrame.from_dict(UniProt_Info) |
962 ) | |
963 | |
964 # reorder columns to match expected output file | |
965 uniprot_df[ | |
966 PHOSPHOPEPTIDE | |
967 ] = uniprot_df.index # make index a column too | |
874 | 968 |
875 cols = uniprot_df.columns.tolist() | 969 cols = uniprot_df.columns.tolist() |
876 #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] | 970 # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] |
877 #uniprot_df = uniprot_df[cols] | 971 # uniprot_df = uniprot_df[cols] |
878 uniprot_df = uniprot_df[[ | 972 uniprot_df = uniprot_df[ |
879 PHOSPHOPEPTIDE, | 973 [ |
880 SEQUENCE10, | 974 PHOSPHOPEPTIDE, |
881 SEQUENCE7, | 975 SEQUENCE10, |
882 GENE_NAME, | 976 SEQUENCE7, |
883 PHOSPHORESIDUE, | 977 GENE_NAME, |
884 UNIPROT_ID, | 978 PHOSPHORESIDUE, |
885 DESCRIPTION | 979 UNIPROT_ID, |
886 ]] | 980 DESCRIPTION, |
887 | 981 ] |
888 | 982 ] |
889 end_time = time.process_time() #timer | 983 |
890 print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer | 984 end_time = time.process_time() # timer |
891 | 985 print( |
892 #concat to split then groupby to collapse | 986 "%0.6f reordered columns to match expected output file [1]" |
893 seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | ')) | 987 % (end_time - start_time,), |
894 for _, row in uniprot_df.iterrows()]).reset_index() | 988 file=sys.stderr, |
895 seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE] | 989 ) # timer |
990 | |
991 # concat to split then groupby to collapse | |
992 seq7_df = pandas.concat( | |
993 [ | |
994 pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | ")) | |
995 for _, row in uniprot_df.iterrows() | |
996 ] | |
997 ).reset_index() | |
998 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] | |
896 | 999 |
897 # --- -------------- begin read PSP_Regulatory_sites --------------------------------- | 1000 # --- -------------- begin read PSP_Regulatory_sites --------------------------------- |
898 #read in PhosphoSitePlus Regulatory Sites dataset | 1001 # read in PhosphoSitePlus Regulatory Sites dataset |
899 #ACE if (True): | 1002 # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- |
900 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- | |
901 conn = sql.connect(uniprot_sqlite) | 1003 conn = sql.connect(uniprot_sqlite) |
902 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) | 1004 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) |
903 # Close SwissProt SQLite database | 1005 # Close SwissProt SQLite database |
904 conn.close() | 1006 conn.close() |
905 #ACE # Array indexes are zero-based | |
906 #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language) | |
907 #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT' | |
908 #ACE , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM'] | |
909 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also | |
910 #ACE print(regsites_df) | |
911 ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) ----------- | |
912 #ACE else: | |
913 #ACE regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3) | |
914 #ACE SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA | |
915 #ACE #ACE # Array indexes are zero-based | |
916 #ACE #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language) | |
917 #ACE #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID' | |
918 #ACE #ACE , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA' | |
919 #ACE #ACE , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT' | |
920 #ACE #ACE , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES' | |
921 #ACE #ACE ] | |
922 #ACE #ACE REGSITE_COL_SITE7AA = 9 | |
923 #ACE #ACE REGSITE_COL_PROTEIN = 1 | |
924 #ACE #ACE REGSITE_COL_DOMAIN = 10 | |
925 #ACE #ACE REGSITE_COL_PMIDs = 15 | |
926 | |
927 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ | 1007 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ |
928 | 1008 |
929 | 1009 # keep only the human entries in dataframe |
930 #keep only the human entries in dataframe | |
931 if len(species) > 0: | 1010 if len(species) > 0: |
932 print('Limit PhosphoSitesPlus records to species "' + species + '"') | 1011 print( |
1012 'Limit PhosphoSitesPlus records to species "' + species + '"' | |
1013 ) | |
933 regsites_df = regsites_df[regsites_df.ORGANISM == species] | 1014 regsites_df = regsites_df[regsites_df.ORGANISM == species] |
934 | 1015 |
935 #merge the seq7 df with the regsites df based off of the sequence7 | 1016 # merge the seq7 df with the regsites df based off of the sequence7 |
936 merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left') | 1017 merge_df = seq7_df.merge( |
937 #ACE print(merge_df.columns.tolist()) #ACE | 1018 regsites_df, |
938 | 1019 left_on=SEQUENCE7, |
939 #after merging df, select only the columns of interest - note that PROTEIN is absent here | 1020 right_on=SITE_PLUSMINUS_7AA_SQL, |
940 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]] | 1021 how="left", |
941 #ACE print(merge_df.columns.tolist()) #ACE | 1022 ) |
942 #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" | 1023 |
943 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="") | 1024 # after merging df, select only the columns of interest - note that PROTEIN is absent here |
944 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") | 1025 merge_df = merge_df[ |
945 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") | 1026 [ |
946 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") | 1027 PHOSPHOPEPTIDE, |
947 | 1028 SEQUENCE7, |
948 #remove the columns that were combined | 1029 ON_FUNCTION, |
949 merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]] | 1030 ON_PROCESS, |
950 | 1031 ON_PROT_INTERACT, |
951 #ACE print(merge_df) #ACE | 1032 ON_OTHER_INTERACT, |
952 #ACE print(merge_df.columns.tolist()) #ACE | 1033 ON_NOTES, |
953 | 1034 ] |
954 end_time = time.process_time() #timer | 1035 ] |
955 print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer | 1036 # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" |
956 | 1037 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat( |
957 #cosmetic changes to Function Phosphoresidue column | 1038 merge_df[ON_PROCESS], sep="; ", na_rep="" |
1039 ) | |
1040 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ | |
1041 FUNCTION_PHOSPHORESIDUE | |
1042 ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") | |
1043 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ | |
1044 FUNCTION_PHOSPHORESIDUE | |
1045 ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") | |
1046 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ | |
1047 FUNCTION_PHOSPHORESIDUE | |
1048 ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") | |
1049 | |
1050 # remove the columns that were combined | |
1051 merge_df = merge_df[ | |
1052 [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE] | |
1053 ] | |
1054 | |
1055 end_time = time.process_time() # timer | |
1056 print( | |
1057 "%0.6f merge regsite metadata [1a]" % (end_time - start_time,), | |
1058 file=sys.stderr, | |
1059 ) # timer | |
1060 | |
1061 # cosmetic changes to Function Phosphoresidue column | |
958 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) | 1062 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) |
959 | 1063 |
960 end_time = time.process_time() #timer | 1064 end_time = time.process_time() # timer |
961 print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer | 1065 print( |
1066 "%0.6f more cosmetic changes [1b]" % (end_time - start_time,), | |
1067 file=sys.stderr, | |
1068 ) # timer | |
962 | 1069 |
963 i = 0 | 1070 i = 0 |
964 while i < len(fp_series): | 1071 while i < len(fp_series): |
965 #remove the extra ";" so that it looks more professional | 1072 # remove the extra ";" so that it looks more professional |
966 if fp_series[i] == "; ; ; ; ": #remove ; from empty hits | 1073 if fp_series[i] == "; ; ; ; ": # remove ; from empty hits |
967 fp_series[i] = "" | 1074 fp_series[i] = "" |
968 while fp_series[i].endswith("; "): #remove ; from the ends | 1075 while fp_series[i].endswith("; "): # remove ; from the ends |
969 fp_series[i] = fp_series[i][:-2] | 1076 fp_series[i] = fp_series[i][:-2] |
970 while fp_series[i].startswith("; "): #remove ; from the beginning | 1077 while fp_series[i].startswith("; "): # remove ; from the beginning |
971 fp_series[i] = fp_series[i][2:] | 1078 fp_series[i] = fp_series[i][2:] |
972 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") | 1079 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") |
973 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") | 1080 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") |
974 fp_series[i] = fp_series[i].replace("; ; ", "; ") | 1081 fp_series[i] = fp_series[i].replace("; ; ", "; ") |
975 | 1082 |
976 #turn blanks into N_A to signify the info was searched for but cannot be found | 1083 # turn blanks into N_A to signify the info was searched for but cannot be found |
977 if fp_series[i] == "": | 1084 if fp_series[i] == "": |
978 fp_series[i] = N_A | 1085 fp_series[i] = N_A |
979 | 1086 |
980 i += 1 | 1087 i += 1 |
981 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series | 1088 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series |
982 | 1089 |
983 end_time = time.process_time() #timer | 1090 end_time = time.process_time() # timer |
984 print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer | 1091 print( |
985 | 1092 "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), |
986 #merge uniprot df with merge df | 1093 file=sys.stderr, |
987 uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left") | 1094 ) # timer |
988 | 1095 |
989 #collapse the merged df | 1096 # merge uniprot df with merge df |
1097 uniprot_regsites_merged_df = uniprot_df.merge( | |
1098 merge_df, | |
1099 left_on=PHOSPHOPEPTIDE, | |
1100 right_on=PHOSPHOPEPTIDE, | |
1101 how="left", | |
1102 ) | |
1103 | |
1104 # collapse the merged df | |
990 uniprot_regsites_collapsed_df = pandas.DataFrame( | 1105 uniprot_regsites_collapsed_df = pandas.DataFrame( |
991 uniprot_regsites_merged_df | 1106 uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[ |
992 .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE] | 1107 FUNCTION_PHOSPHORESIDUE |
993 .apply(lambda x: ppep_join(x))) | 1108 ].apply(lambda x: ppep_join(x)) |
994 #.apply(lambda x: "%s" % ' | '.join(x))) | 1109 ) |
995 | 1110 # .apply(lambda x: "%s" % ' | '.join(x))) |
996 | 1111 |
997 end_time = time.process_time() #timer | 1112 end_time = time.process_time() # timer |
998 print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer | 1113 print( |
999 | 1114 "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), |
1000 uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column | 1115 file=sys.stderr, |
1001 | 1116 ) # timer |
1002 | 1117 |
1003 #rename columns | 1118 uniprot_regsites_collapsed_df[ |
1004 uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp'] | 1119 PHOSPHOPEPTIDE |
1005 | 1120 ] = ( |
1006 | 1121 uniprot_regsites_collapsed_df.index |
1007 | 1122 ) # add df index as its own column |
1008 #select columns to be merged to uniprot_df | 1123 |
1009 #ACE cols = regsites_df.columns.tolist() | 1124 # rename columns |
1010 #ACE print(cols) #ACE | 1125 uniprot_regsites_collapsed_df.columns = [ |
1011 #ACE if len(cols) > 8: | 1126 FUNCTION_PHOSPHORESIDUE, |
1012 #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] | 1127 "ppp", |
1013 #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] | 1128 ] |
1014 #ACE print(cols) #ACE | 1129 |
1015 #ACE regsite_merge_df = regsites_df[cols] | 1130 end_time = time.process_time() # timer |
1016 | 1131 print( |
1017 end_time = time.process_time() #timer | 1132 "%0.6f selected columns to be merged to uniprot_df [1e]" |
1018 print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer | 1133 % (end_time - start_time,), |
1019 | 1134 file=sys.stderr, |
1020 #add columns based on Sequence7 matching site_+/-7_AA | 1135 ) # timer |
1136 | |
1137 # add columns based on Sequence7 matching site_+/-7_AA | |
1021 uniprot_regsite_df = pandas.merge( | 1138 uniprot_regsite_df = pandas.merge( |
1022 left=uniprot_df, | 1139 left=uniprot_df, |
1023 right=uniprot_regsites_collapsed_df, | 1140 right=uniprot_regsites_collapsed_df, |
1024 how='left', | 1141 how="left", |
1025 left_on=PHOSPHOPEPTIDE, | 1142 left_on=PHOSPHOPEPTIDE, |
1026 right_on='ppp') | 1143 right_on="ppp", |
1027 | 1144 ) |
1028 end_time = time.process_time() #timer | 1145 |
1029 print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer | 1146 end_time = time.process_time() # timer |
1147 print( | |
1148 "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" | |
1149 % (end_time - start_time,), | |
1150 file=sys.stderr, | |
1151 ) # timer | |
1030 | 1152 |
1031 data_in.rename( | 1153 data_in.rename( |
1032 {'Protein description': PHOSPHOPEPTIDE}, | 1154 {"Protein description": PHOSPHOPEPTIDE}, |
1033 axis='columns', | 1155 axis="columns", |
1034 inplace=True | 1156 inplace=True, |
1035 ) | 1157 ) |
1036 | 1158 |
1037 | 1159 # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') |
1038 | 1160 res2 = sorted( |
1039 sort_start_time = time.process_time() #timer | 1161 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() |
1040 | 1162 ) |
1041 #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') | |
1042 res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold()) | |
1043 data_in = data_in.loc[res2] | 1163 data_in = data_in.loc[res2] |
1044 | 1164 |
1045 end_time = time.process_time() #timer | 1165 end_time = time.process_time() # timer |
1046 print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer | 1166 print( |
1047 | 1167 "%0.6f sorting time [1f]" % (end_time - start_time,), |
1048 | 1168 file=sys.stderr, |
1049 | 1169 ) # timer |
1050 cols = [old_cols[0]] + old_cols[col_PKCalpha-1:] | 1170 |
1171 cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:] | |
1051 upstream_data = upstream_data[cols] | 1172 upstream_data = upstream_data[cols] |
1052 | 1173 |
1053 end_time = time.process_time() #timer | 1174 end_time = time.process_time() # timer |
1054 print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer | 1175 print( |
1055 | 1176 "%0.6f refactored columns for Upstream Map [1g]" |
1056 | 1177 % (end_time - start_time,), |
1057 #### #rename upstream columns in new list | 1178 file=sys.stderr, |
1058 #### new_cols = [] | 1179 ) # timer |
1059 #### for name in cols: | 1180 |
1060 #### if "_NetworKIN" in name: | 1181 # #rename upstream columns in new list |
1061 #### name = name.split("_")[0] | 1182 # new_cols = [] |
1062 #### if " motif" in name: | 1183 # for name in cols: |
1063 #### name = name.split(" motif")[0] | 1184 # if "_NetworKIN" in name: |
1064 #### if " sequence " in name: | 1185 # name = name.split("_")[0] |
1065 #### name = name.split(" sequence")[0] | 1186 # if " motif" in name: |
1066 #### if "_Phosida" in name: | 1187 # name = name.split(" motif")[0] |
1067 #### name = name.split("_")[0] | 1188 # if " sequence " in name: |
1068 #### if "_PhosphoSite" in name: | 1189 # name = name.split(" sequence")[0] |
1069 #### name = name.split("_")[0] | 1190 # if "_Phosida" in name: |
1070 #### new_cols.append(name) | 1191 # name = name.split("_")[0] |
1071 | 1192 # if "_PhosphoSite" in name: |
1072 #rename upstream columns in new list | 1193 # name = name.split("_")[0] |
1194 # new_cols.append(name) | |
1195 | |
1196 # rename upstream columns in new list | |
def col_rename(name):
    """Return *name* with its source-database decoration stripped.

    Upstream-map column headers carry suffixes identifying where the
    kinase annotation came from (e.g. "PKCalpha_NetworKIN",
    "AKT motif", "CK2 sequence ...", "ATM_Phosida", "CDK1_PhosphoSite");
    this reduces each to the bare kinase/motif identifier.
    """
    # (substring to look for, token to split on) — applied in this order,
    # each rule seeing the result of the previous one, exactly like the
    # original chain of independent `if` statements.
    suffix_rules = (
        ("_NetworKIN", "_"),
        (" motif", " motif"),
        (" sequence ", " sequence"),
        ("_Phosida", "_"),
        ("_PhosphoSite", "_"),
    )
    for needle, splitter in suffix_rules:
        if needle in name:
            name = name.split(splitter)[0]
    return name
1085 | 1209 |
1086 new_cols = [col_rename(col) for col in cols] | 1210 new_cols = [col_rename(col) for col in cols] |
1087 upstream_data.columns = new_cols | 1211 upstream_data.columns = new_cols |
1088 | 1212 |
1089 | 1213 end_time = time.process_time() # timer |
1090 | 1214 print( |
1091 end_time = time.process_time() #timer | 1215 "%0.6f renamed columns for Upstream Map [1h_1]" |
1092 print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer | 1216 % (end_time - start_time,), |
1093 | 1217 file=sys.stderr, |
1218 ) # timer | |
1094 | 1219 |
1095 # Create upstream_data_cast as a copy of upstream_data | 1220 # Create upstream_data_cast as a copy of upstream_data |
1096 # but with first column substituted by the phosphopeptide sequence | 1221 # but with first column substituted by the phosphopeptide sequence |
1097 upstream_data_cast = upstream_data.copy() | 1222 upstream_data_cast = upstream_data.copy() |
1098 new_cols_cast = new_cols | 1223 new_cols_cast = new_cols |
1099 new_cols_cast[0] = 'p_peptide' | 1224 new_cols_cast[0] = "p_peptide" |
1100 upstream_data_cast.columns = new_cols_cast | 1225 upstream_data_cast.columns = new_cols_cast |
1101 upstream_data_cast['p_peptide'] = upstream_data.index | 1226 upstream_data_cast["p_peptide"] = upstream_data.index |
1102 new_cols_cast0 = new_cols_cast[0] | |
1103 | 1227 |
1104 # --- -------------- begin read upstream_data_melt ------------------------------------ | 1228 # --- -------------- begin read upstream_data_melt ------------------------------------ |
1105 ## ----------- Get melted kinase mapping data from SQLite database (start) ----------- | 1229 # ----------- Get melted kinase mapping data from SQLite database (start) ----------- |
1106 conn = sql.connect(uniprot_sqlite) | 1230 conn = sql.connect(uniprot_sqlite) |
1107 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) | 1231 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) |
1108 # Close SwissProt SQLite database | 1232 # Close SwissProt SQLite database |
1109 conn.close() | 1233 conn.close() |
1110 upstream_data_melt = upstream_data_melt_df.copy() | 1234 upstream_data_melt = upstream_data_melt_df.copy() |
1111 upstream_data_melt.columns = ['p_peptide', 'characterization', 'X'] | 1235 upstream_data_melt.columns = ["p_peptide", "characterization", "X"] |
1112 upstream_data_melt['characterization'] = [ | 1236 upstream_data_melt["characterization"] = [ |
1113 col_rename(s) | 1237 col_rename(s) for s in upstream_data_melt["characterization"] |
1114 for s in upstream_data_melt['characterization'] | 1238 ] |
1115 ] | 1239 |
1116 | 1240 print( |
1117 print('%0.6f upstream_data_melt_df initially has %d rows' % | 1241 "%0.6f upstream_data_melt_df initially has %d rows" |
1118 (end_time - start_time, len(upstream_data_melt.axes[0])) | 1242 % (end_time - start_time, len(upstream_data_melt.axes[0])), |
1119 , file=sys.stderr) | 1243 file=sys.stderr, |
1244 ) | |
1120 # ref: https://stackoverflow.com/a/27360130/15509512 | 1245 # ref: https://stackoverflow.com/a/27360130/15509512 |
1121 # e.g. df.drop(df[df.score < 50].index, inplace=True) | 1246 # e.g. df.drop(df[df.score < 50].index, inplace=True) |
1122 upstream_data_melt.drop( | 1247 upstream_data_melt.drop( |
1123 upstream_data_melt[upstream_data_melt.X != 'X'].index, | 1248 upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True |
1124 inplace = True | 1249 ) |
1125 ) | 1250 print( |
1126 print('%0.6f upstream_data_melt_df pre-dedup has %d rows' % | 1251 "%0.6f upstream_data_melt_df pre-dedup has %d rows" |
1127 (end_time - start_time, len(upstream_data_melt.axes[0])) | 1252 % (end_time - start_time, len(upstream_data_melt.axes[0])), |
1128 , file=sys.stderr) | 1253 file=sys.stderr, |
1129 #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also | 1254 ) |
1130 #ACE print(upstream_data_melt) | 1255 # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- |
1131 ## ----------- Get melted kinase mapping data from SQLite database (finish) ----------- | |
1132 # ... -------------- end read upstream_data_melt -------------------------------------- | 1256 # ... -------------- end read upstream_data_melt -------------------------------------- |
1133 | 1257 |
1134 end_time = time.process_time() #timer | 1258 end_time = time.process_time() # timer |
1135 print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer | 1259 print( |
1260 "%0.6f melted and minimized Upstream Map dataframe [1h_2]" | |
1261 % (end_time - start_time,), | |
1262 file=sys.stderr, | |
1263 ) # timer | |
1136 # ... end read upstream_data_melt | 1264 # ... end read upstream_data_melt |
1137 | 1265 |
1138 upstream_data_melt_index = upstream_data_melt.index | 1266 end_time = time.process_time() # timer |
1139 upstream_data_melt_p_peptide = upstream_data_melt['p_peptide'] | 1267 print( |
1140 | 1268 "%0.6f indexed melted Upstream Map [1h_2a]" |
1141 end_time = time.process_time() #timer | 1269 % (end_time - start_time,), |
1142 print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer | 1270 file=sys.stderr, |
1271 ) # timer | |
1143 | 1272 |
1144 upstream_delta_melt_LoL = upstream_data_melt.values.tolist() | 1273 upstream_delta_melt_LoL = upstream_data_melt.values.tolist() |
1145 | 1274 |
1146 melt_dict = {} | 1275 melt_dict = {} |
1147 for key in upstream_map_p_peptide_list: | 1276 for key in upstream_map_p_peptide_list: |
1150 for el in upstream_delta_melt_LoL: | 1279 for el in upstream_delta_melt_LoL: |
1151 (p_peptide, characterization, X) = tuple(el) | 1280 (p_peptide, characterization, X) = tuple(el) |
1152 if p_peptide in melt_dict: | 1281 if p_peptide in melt_dict: |
1153 melt_dict[p_peptide].append(characterization) | 1282 melt_dict[p_peptide].append(characterization) |
1154 else: | 1283 else: |
1155 exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide)) | 1284 exit( |
1156 | 1285 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' |
1157 | 1286 % (p_peptide) |
1158 end_time = time.process_time() #timer | 1287 ) |
1159 print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer | 1288 |
1160 | 1289 end_time = time.process_time() # timer |
1290 print( | |
1291 "%0.6f appended peptide characterizations [1h_2b]" | |
1292 % (end_time - start_time,), | |
1293 file=sys.stderr, | |
1294 ) # timer | |
1161 | 1295 |
1162 # for key in upstream_map_p_peptide_list: | 1296 # for key in upstream_map_p_peptide_list: |
1163 # melt_dict[key] = ' | '.join(melt_dict[key]) | 1297 # melt_dict[key] = ' | '.join(melt_dict[key]) |
1164 | 1298 |
1165 for key in upstream_map_p_peptide_list: | 1299 for key in upstream_map_p_peptide_list: |
1166 melt_dict[key] = melt_join(melt_dict[key]) | 1300 melt_dict[key] = melt_join(melt_dict[key]) |
1167 | 1301 |
1168 end_time = time.process_time() #timer | 1302 end_time = time.process_time() # timer |
1169 print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer | 1303 print( |
1304 "%0.6f concatenated multiple characterizations [1h_2c]" | |
1305 % (end_time - start_time,), | |
1306 file=sys.stderr, | |
1307 ) # timer | |
1170 | 1308 |
1171 # map_dict is a dictionary of dictionaries | 1309 # map_dict is a dictionary of dictionaries |
1172 map_dict = {} | 1310 map_dict = {} |
1173 for key in upstream_map_p_peptide_list: | 1311 for key in upstream_map_p_peptide_list: |
1174 map_dict[key] = {} | 1312 map_dict[key] = {} |
1175 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] | 1313 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] |
1176 | 1314 |
1177 | 1315 end_time = time.process_time() # timer |
1178 end_time = time.process_time() #timer | 1316 print( |
1179 print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer | 1317 "%0.6f instantiated map dictionary [2]" % (end_time - start_time,), |
1180 | 1318 file=sys.stderr, |
1181 #convert map_dict to dataframe | 1319 ) # timer |
1182 map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict)) | 1320 |
1183 map_df["p-peptide"] = map_df.index #make index a column too | 1321 # convert map_dict to dataframe |
1322 map_df = pandas.DataFrame.transpose( | |
1323 pandas.DataFrame.from_dict(map_dict) | |
1324 ) | |
1325 map_df["p-peptide"] = map_df.index # make index a column too | |
1184 cols_map_df = map_df.columns.tolist() | 1326 cols_map_df = map_df.columns.tolist() |
1185 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] | 1327 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] |
1186 map_df = map_df[cols_map_df] | 1328 map_df = map_df[cols_map_df] |
1187 | 1329 |
1188 #join map_df to uniprot_regsite_df | 1330 # join map_df to uniprot_regsite_df |
1189 output_df = uniprot_regsite_df.merge( | 1331 output_df = uniprot_regsite_df.merge( |
1190 map_df, | 1332 map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide" |
1191 how="left", | 1333 ) |
1192 left_on=PHOSPHOPEPTIDE, | |
1193 right_on="p-peptide") | |
1194 | 1334 |
1195 output_df = output_df[ | 1335 output_df = output_df[ |
1196 [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, | 1336 [ |
1197 UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE, | 1337 PHOSPHOPEPTIDE, |
1198 PUTATIVE_UPSTREAM_DOMAINS | 1338 SEQUENCE10, |
1199 ] | 1339 SEQUENCE7, |
1340 GENE_NAME, | |
1341 PHOSPHORESIDUE, | |
1342 UNIPROT_ID, | |
1343 DESCRIPTION, | |
1344 FUNCTION_PHOSPHORESIDUE, | |
1345 PUTATIVE_UPSTREAM_DOMAINS, | |
1200 ] | 1346 ] |
1201 | 1347 ] |
1202 | 1348 |
1203 # cols_output_prelim = output_df.columns.tolist() | 1349 # cols_output_prelim = output_df.columns.tolist() |
1204 # | 1350 # |
1205 # print("cols_output_prelim") | 1351 # print("cols_output_prelim") |
1206 # print(cols_output_prelim) | 1352 # print(cols_output_prelim) |
1215 # print("cols_output") | 1361 # print("cols_output") |
1216 # print(cols_output) | 1362 # print(cols_output) |
1217 # | 1363 # |
1218 # output_df = output_df[cols_output] | 1364 # output_df = output_df[cols_output] |
1219 | 1365 |
1220 #join output_df back to quantitative columns in data_in df | 1366 # join output_df back to quantitative columns in data_in df |
1221 quant_cols = data_in.columns.tolist() | 1367 quant_cols = data_in.columns.tolist() |
1222 quant_cols = quant_cols[1:] | 1368 quant_cols = quant_cols[1:] |
1223 quant_data = data_in[quant_cols] | 1369 quant_data = data_in[quant_cols] |
1224 | 1370 |
1225 ## ----------- Write merge/filter metadata to SQLite database (start) ----------- | 1371 # ----------- Write merge/filter metadata to SQLite database (start) ----------- |
1226 # Open SwissProt SQLite database | 1372 # Open SwissProt SQLite database |
1227 conn = sql.connect(output_sqlite) | 1373 conn = sql.connect(output_sqlite) |
1228 cur = conn.cursor() | 1374 cur = conn.cursor() |
1229 | 1375 |
1230 cur.executescript(MRGFLTR_DDL) | 1376 cur.executescript(MRGFLTR_DDL) |
1231 | 1377 |
1232 cur.execute( | 1378 cur.execute( |
1233 CITATION_INSERT_STMT, | 1379 CITATION_INSERT_STMT, |
1234 ('mrgfltr_metadata_view', CITATION_INSERT_PSP) | 1380 ("mrgfltr_metadata_view", CITATION_INSERT_PSP), |
1235 ) | 1381 ) |
1382 cur.execute( | |
1383 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP) | |
1384 ) | |
1236 cur.execute( | 1385 cur.execute( |
1237 CITATION_INSERT_STMT, | 1386 CITATION_INSERT_STMT, |
1238 ('mrgfltr_metadata', CITATION_INSERT_PSP) | 1387 ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF), |
1239 ) | 1388 ) |
1240 cur.execute( | 1389 cur.execute( |
1241 CITATION_INSERT_STMT, | 1390 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF) |
1242 ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF) | 1391 ) |
1243 ) | |
1244 cur.execute( | |
1245 CITATION_INSERT_STMT, | |
1246 ('mrgfltr_metadata', CITATION_INSERT_PSP_REF) | |
1247 ) | |
1248 | 1392 |
1249 # Read ppep-to-sequence LUT | 1393 # Read ppep-to-sequence LUT |
1250 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) | 1394 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) |
1251 #ACE ppep_lut_df.info(verbose=True) | |
1252 # write only metadata for merged/filtered records to SQLite | 1395 # write only metadata for merged/filtered records to SQLite |
1253 mrgfltr_metadata_df = output_df.copy() | 1396 mrgfltr_metadata_df = output_df.copy() |
1254 # replace phosphopeptide seq with ppep.id | 1397 # replace phosphopeptide seq with ppep.id |
1255 mrgfltr_metadata_df = ppep_lut_df.merge( | 1398 mrgfltr_metadata_df = ppep_lut_df.merge( |
1256 mrgfltr_metadata_df, | 1399 mrgfltr_metadata_df, |
1257 left_on='ppep_seq', | 1400 left_on="ppep_seq", |
1258 right_on=PHOSPHOPEPTIDE, | 1401 right_on=PHOSPHOPEPTIDE, |
1259 how='inner' | 1402 how="inner", |
1260 ) | 1403 ) |
1261 mrgfltr_metadata_df.drop( | 1404 mrgfltr_metadata_df.drop( |
1262 columns=[PHOSPHOPEPTIDE, 'ppep_seq'], | 1405 columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True |
1263 inplace=True | 1406 ) |
1264 ) | 1407 # rename columns |
1265 #rename columns | |
1266 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS | 1408 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS |
1267 #ACE mrgfltr_metadata_df.info(verbose=True) | |
1268 mrgfltr_metadata_df.to_sql( | 1409 mrgfltr_metadata_df.to_sql( |
1269 'mrgfltr_metadata', | 1410 "mrgfltr_metadata", |
1270 con=conn, | 1411 con=conn, |
1271 if_exists='append', | 1412 if_exists="append", |
1272 index=False, | 1413 index=False, |
1273 method='multi' | 1414 method="multi", |
1274 ) | 1415 ) |
1275 | 1416 |
1276 # Close SwissProt SQLite database | 1417 # Close SwissProt SQLite database |
1277 conn.close() | 1418 conn.close() |
1278 ## ----------- Write merge/filter metadata to SQLite database (finish) ----------- | 1419 # ----------- Write merge/filter metadata to SQLite database (finish) ----------- |
1279 | 1420 |
1280 output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH) | 1421 output_df = output_df.merge( |
1422 quant_data, | |
1423 how="right", | |
1424 left_on=PHOSPHOPEPTIDE, | |
1425 right_on=PHOSPHOPEPTIDE_MATCH, | |
1426 ) | |
1281 output_cols = output_df.columns.tolist() | 1427 output_cols = output_df.columns.tolist() |
1282 output_cols = output_cols[:-1] | 1428 output_cols = output_cols[:-1] |
1283 output_df = output_df[output_cols] | 1429 output_df = output_df[output_cols] |
1284 | 1430 |
1285 #cosmetic changes to Upstream column | 1431 # cosmetic changes to Upstream column |
1286 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping | 1432 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ |
1433 PUTATIVE_UPSTREAM_DOMAINS | |
1434 ].fillna( | |
1435 "" | |
1436 ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping | |
1287 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) | 1437 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) |
1288 i = 0 | 1438 i = 0 |
1289 while i < len(us_series): | 1439 while i < len(us_series): |
1290 #turn blanks into N_A to signify the info was searched for but cannot be found | 1440 # turn blanks into N_A to signify the info was searched for but cannot be found |
1291 if us_series[i] == "": | 1441 if us_series[i] == "": |
1292 us_series[i] = N_A | 1442 us_series[i] = N_A |
1293 i += 1 | 1443 i += 1 |
1294 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series | 1444 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series |
1295 | 1445 |
1296 end_time = time.process_time() #timer | 1446 end_time = time.process_time() # timer |
1297 print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer | 1447 print( |
1448 "%0.6f establisheed output [3]" % (end_time - start_time,), | |
1449 file=sys.stderr, | |
1450 ) # timer | |
1298 | 1451 |
1299 (output_rows, output_cols) = output_df.shape | 1452 (output_rows, output_cols) = output_df.shape |
1300 | 1453 |
1301 #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True) | |
1302 output_df = output_df.convert_dtypes(convert_integer=True) | 1454 output_df = output_df.convert_dtypes(convert_integer=True) |
1303 | 1455 |
1304 | 1456 # Output onto Final CSV file |
1305 #Output onto Final CSV file | |
1306 output_df.to_csv(output_filename_csv, index=False) | 1457 output_df.to_csv(output_filename_csv, index=False) |
1307 output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False) | 1458 output_df.to_csv( |
1308 | 1459 output_filename_tab, quoting=None, sep="\t", index=False |
1309 end_time = time.process_time() #timer | 1460 ) |
1310 print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer | 1461 |
1311 | 1462 end_time = time.process_time() # timer |
1312 print('{:>10} phosphopeptides written to output'.format(str(output_rows))) | 1463 print( |
1313 | 1464 "%0.6f wrote output [4]" % (end_time - start_time,), |
1314 end_time = time.process_time() #timer | 1465 file=sys.stderr, |
1315 print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer | 1466 ) # timer |
1316 | 1467 |
1317 | 1468 print( |
1318 #Rev. 7/1/2016 | 1469 "{:>10} phosphopeptides written to output".format(str(output_rows)) |
1319 #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's | 1470 ) |
1320 #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS | 1471 |
1321 #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ | 1472 end_time = time.process_time() # timer |
1473 print( | |
1474 "%0.6f seconds of non-system CPU time were consumed" | |
1475 % (end_time - start_time,), | |
1476 file=sys.stderr, | |
1477 ) # timer | |
1478 | |
1479 # Rev. 7/1/2016 | |
1480 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's | |
1481 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS | |
1482 # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ | |
1322 # read from SwissProt SQLite database | 1483 # read from SwissProt SQLite database |
1323 #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper | 1484 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper |
1324 | 1485 |
1325 ############################################# | 1486 # |
1326 # copied from Excel Output Script.ipynb END # | 1487 # copied from Excel Output Script.ipynb END # |
1327 ############################################# | 1488 # |
1328 | 1489 |
# Script entry point: run the merge/filter pipeline through the `catch`
# helper (defined earlier in this file) so that any failure produces a
# formatted stack trace, then exit with an explicit status code.
try:
    catch(mqpep_getswissprot)
    exit(0)
except Exception as e:
    # Last-resort boundary: report the error and exit nonzero.
    exit("Internal error running mqpep_getswissprot(): %s" % (e))


if __name__ == "__main__":
    __main__()
1337 |