Mercurial > repos > eschen42 > mqppep_anova
comparison search_ppep.py @ 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author | eschen42 |
---|---|
date | Mon, 07 Mar 2022 19:05:01 +0000 |
parents | |
children | d4d531006735 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c1403d18c189 |
---|---|
1 #!/usr/bin/env python | |
2 # Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB | |
3 | |
4 import argparse | |
5 import os.path | |
6 import sqlite3 | |
7 import re | |
8 from codecs import getreader as cx_getreader | |
9 import time | |
10 | |
11 # For Aho-Corasick search for fixed set of substrings | |
12 # - add_word | |
13 # - make_automaton | |
14 # - iter | |
15 import ahocorasick | |
16 # Support map over auto.iter(...) | |
17 # - itemgetter | |
18 import operator | |
19 #import hashlib | |
20 | |
21 # ref: https://stackoverflow.com/a/8915613/15509512 | |
22 # answers: "How to handle exceptions in a list comprehensions" | |
23 # usage: | |
24 # from math import log | |
25 # eggs = [1,3,0,3,2] | |
26 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) | |
27 # producing: | |
28 # for <built-in function log> | |
29 # with args (0,) | |
30 # exception: math domain error | |
31 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] | |
32 def catch(func, *args, handle=lambda e : e, **kwargs): | |
33 try: | |
34 return func(*args, **kwargs) | |
35 except Exception as e: | |
36 print("For %s" % str(func)) | |
37 print(" with args %s" % str(args)) | |
38 print(" caught exception: %s" % str(e)) | |
39 (ty, va, tb) = sys.exc_info() | |
40 print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) | |
41 #exit(-1) | |
42 return None # was handle(e) | |
43 | |
44 def __main__(): | |
45 ITEM_GETTER = operator.itemgetter(1) | |
46 | |
47 DROP_TABLES_SQL = ''' | |
48 DROP VIEW IF EXISTS ppep_gene_site_view; | |
49 DROP VIEW IF EXISTS uniprot_view; | |
50 DROP VIEW IF EXISTS uniprotkb_pep_ppep_view; | |
51 DROP VIEW IF EXISTS ppep_intensity_view; | |
52 DROP VIEW IF EXISTS ppep_metadata_view; | |
53 | |
54 DROP TABLE IF EXISTS sample; | |
55 DROP TABLE IF EXISTS ppep; | |
56 DROP TABLE IF EXISTS site_type; | |
57 DROP TABLE IF EXISTS deppep_UniProtKB; | |
58 DROP TABLE IF EXISTS deppep; | |
59 DROP TABLE IF EXISTS ppep_gene_site; | |
60 DROP TABLE IF EXISTS ppep_metadata; | |
61 DROP TABLE IF EXISTS ppep_intensity; | |
62 ''' | |
63 | |
64 CREATE_TABLES_SQL = ''' | |
65 CREATE TABLE deppep | |
66 ( id INTEGER PRIMARY KEY | |
67 , seq TEXT UNIQUE ON CONFLICT IGNORE | |
68 ) | |
69 ; | |
70 CREATE TABLE deppep_UniProtKB | |
71 ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE | |
72 , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE | |
73 , pos_start INTEGER | |
74 , pos_end INTEGER | |
75 , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end) | |
76 ON CONFLICT IGNORE | |
77 ) | |
78 ; | |
79 CREATE TABLE ppep | |
80 ( id INTEGER PRIMARY KEY | |
81 , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE | |
82 , seq TEXT UNIQUE ON CONFLICT IGNORE | |
83 , scrubbed TEXT | |
84 ); | |
85 CREATE TABLE site_type | |
86 ( id INTEGER PRIMARY KEY | |
87 , type_name TEXT UNIQUE ON CONFLICT IGNORE | |
88 ); | |
89 CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed) | |
90 ; | |
91 CREATE TABLE sample | |
92 ( id INTEGER PRIMARY KEY | |
93 , name TEXT UNIQUE ON CONFLICT IGNORE | |
94 ) | |
95 ; | |
96 CREATE VIEW uniprot_view AS | |
97 SELECT DISTINCT | |
98 Uniprot_ID | |
99 , Description | |
100 , Organism_Name | |
101 , Organism_ID | |
102 , Gene_Name | |
103 , PE | |
104 , SV | |
105 , Sequence | |
106 , Description || ' OS=' || | |
107 Organism_Name || ' OX=' || Organism_ID || | |
108 CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END || | |
109 CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END || | |
110 CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END | |
111 AS long_description | |
112 , Database | |
113 FROM UniProtKB | |
114 ; | |
115 CREATE VIEW uniprotkb_pep_ppep_view AS | |
116 SELECT deppep_UniProtKB.UniprotKB_ID AS accession | |
117 , deppep_UniProtKB.pos_start AS pos_start | |
118 , deppep_UniProtKB.pos_end AS pos_end | |
119 , deppep.seq AS peptide | |
120 , ppep.seq AS phosphopeptide | |
121 , ppep.scrubbed AS scrubbed | |
122 , uniprot_view.Sequence AS sequence | |
123 , uniprot_view.Description AS description | |
124 , uniprot_view.long_description AS long_description | |
125 , ppep.id AS ppep_id | |
126 FROM ppep, deppep, deppep_UniProtKB, uniprot_view | |
127 WHERE deppep.id = ppep.deppep_id | |
128 AND deppep.id = deppep_UniProtKB.deppep_id | |
129 AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID | |
130 ORDER BY UniprotKB_ID, deppep.seq, ppep.seq | |
131 ; | |
132 CREATE TABLE ppep_gene_site | |
133 ( ppep_id INTEGER REFERENCES ppep(id) | |
134 , gene_names TEXT | |
135 , site_type_id INTEGER REFERENCES site_type(id) | |
136 , kinase_map TEXT | |
137 , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE | |
138 ) | |
139 ; | |
140 CREATE VIEW ppep_gene_site_view AS | |
141 SELECT DISTINCT | |
142 ppep.seq AS phospho_peptide | |
143 , ppep_id | |
144 , gene_names | |
145 , type_name | |
146 , kinase_map | |
147 FROM | |
148 ppep, ppep_gene_site, site_type | |
149 WHERE | |
150 ppep_gene_site.ppep_id = ppep.id | |
151 AND | |
152 ppep_gene_site.site_type_id = site_type.id | |
153 ORDER BY | |
154 ppep.seq | |
155 ; | |
156 CREATE TABLE ppep_metadata | |
157 ( ppep_id INTEGER REFERENCES ppep(id) | |
158 , protein_description TEXT | |
159 , gene_name TEXT | |
160 , FASTA_name TEXT | |
161 , phospho_sites TEXT | |
162 , motifs_unique TEXT | |
163 , accessions TEXT | |
164 , motifs_all_members TEXT | |
165 , domain TEXT | |
166 , ON_FUNCTION TEXT | |
167 , ON_PROCESS TEXT | |
168 , ON_PROT_INTERACT TEXT | |
169 , ON_OTHER_INTERACT TEXT | |
170 , notes TEXT | |
171 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE | |
172 ) | |
173 ; | |
174 CREATE VIEW ppep_metadata_view AS | |
175 SELECT DISTINCT | |
176 ppep.seq AS phospho_peptide | |
177 , protein_description | |
178 , gene_name | |
179 , FASTA_name | |
180 , phospho_sites | |
181 , motifs_unique | |
182 , accessions | |
183 , motifs_all_members | |
184 , domain | |
185 , ON_FUNCTION | |
186 , ON_PROCESS | |
187 , ON_PROT_INTERACT | |
188 , ON_OTHER_INTERACT | |
189 , notes | |
190 FROM | |
191 ppep, ppep_metadata | |
192 WHERE | |
193 ppep_metadata.ppep_id = ppep.id | |
194 ORDER BY | |
195 ppep.seq | |
196 ; | |
197 CREATE TABLE ppep_intensity | |
198 ( ppep_id INTEGER REFERENCES ppep(id) | |
199 , sample_id INTEGER | |
200 , intensity INTEGER | |
201 , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE | |
202 ) | |
203 ; | |
204 CREATE VIEW ppep_intensity_view AS | |
205 SELECT DISTINCT | |
206 ppep.seq AS phospho_peptide | |
207 , sample.name AS sample | |
208 , intensity | |
209 FROM | |
210 ppep, sample, ppep_intensity | |
211 WHERE | |
212 ppep_intensity.sample_id = sample.id | |
213 AND | |
214 ppep_intensity.ppep_id = ppep.id | |
215 ; | |
216 ''' | |
217 | |
218 UNIPROT_SEQ_AND_ID_SQL = ''' | |
219 select Sequence, Uniprot_ID | |
220 from UniProtKB | |
221 ''' | |
222 | |
223 # Parse Command Line | |
224 parser = argparse.ArgumentParser( | |
225 description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).' | |
226 ) | |
227 | |
228 # inputs: | |
229 # Phosphopeptide data for experimental results, including the intensities | |
230 # and the mapping to kinase domains, in tabular format. | |
231 parser.add_argument( | |
232 '--phosphopeptides', '-p', | |
233 nargs=1, | |
234 required=True, | |
235 dest='phosphopeptides', | |
236 help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool' | |
237 ) | |
238 parser.add_argument( | |
239 '--uniprotkb', '-u', | |
240 nargs=1, | |
241 required=True, | |
242 dest='uniprotkb', | |
243 help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool' | |
244 ) | |
245 parser.add_argument( | |
246 '--schema', | |
247 action='store_true', | |
248 dest='db_schema', | |
249 help='show updated database schema' | |
250 ) | |
251 parser.add_argument( | |
252 '--warn-duplicates', | |
253 action='store_true', | |
254 dest='warn_duplicates', | |
255 help='show warnings for duplicated sequences' | |
256 ) | |
257 parser.add_argument( | |
258 '--verbose', | |
259 action='store_true', | |
260 dest='verbose', | |
261 help='show somewhat verbose program tracing' | |
262 ) | |
263 # "Make it so!" (parse the arguments) | |
264 options = parser.parse_args() | |
265 if options.verbose: | |
266 print("options: " + str(options) + "\n") | |
267 | |
268 # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file | |
269 if options.phosphopeptides is None: | |
270 exit('Argument "phosphopeptides" is required but not supplied') | |
271 try: | |
272 f_name = os.path.abspath(options.phosphopeptides[0]) | |
273 except Exception as e: | |
274 exit('Error parsing phosphopeptides argument: %s' % (e)) | |
275 | |
276 # path to SQLite input/output tabular file | |
277 if options.uniprotkb is None: | |
278 exit('Argument "uniprotkb" is required but not supplied') | |
279 try: | |
280 db_name = os.path.abspath(options.uniprotkb[0]) | |
281 except Exception as e: | |
282 exit('Error parsing uniprotkb argument: %s' % (e)) | |
283 | |
284 # print("options.schema is %d" % options.db_schema) | |
285 | |
286 # db_name = "demo/test.sqlite" | |
287 # f_name = "demo/test_input.txt" | |
288 | |
289 con = sqlite3.connect(db_name) | |
290 cur = con.cursor() | |
291 ker = con.cursor() | |
292 | |
293 cur.executescript(DROP_TABLES_SQL) | |
294 | |
295 # if options.db_schema: | |
296 # print("\nAfter dropping tables/views that are to be created, schema is:") | |
297 # cur.execute("SELECT * FROM sqlite_schema") | |
298 # for row in cur.fetchall(): | |
299 # if row[4] is not None: | |
300 # print("%s;" % row[4]) | |
301 | |
302 cur.executescript(CREATE_TABLES_SQL) | |
303 | |
304 if options.db_schema: | |
305 print("\nAfter creating tables/views that are to be created, schema is:") | |
306 cur.execute("SELECT * FROM sqlite_schema") | |
307 for row in cur.fetchall(): | |
308 if row[4] is not None: | |
309 print("%s;" % row[4]) | |
310 | |
311 def generate_ppep(f): | |
312 #get keys from upstream tabular file using readline() | |
313 # ref: https://stackoverflow.com/a/16713581/15509512 | |
314 # answer to "Use codecs to read file with correct encoding" | |
315 file1_encoded = open(f, 'rb') | |
316 file1 = cx_getreader("latin-1")(file1_encoded) | |
317 | |
318 count = 0 | |
319 re_tab = re.compile('^[^\t]*') | |
320 re_quote = re.compile('"') | |
321 while True: | |
322 count += 1 | |
323 # Get next line from file | |
324 line = file1.readline() | |
325 # if line is empty | |
326 # end of file is reached | |
327 if not line: | |
328 break | |
329 if count > 1: | |
330 m = re_tab.match(line) | |
331 m = re_quote.sub('',m[0]) | |
332 yield m | |
333 file1.close() | |
334 file1_encoded.close() | |
335 | |
336 # Build an Aho-Corasick automaton from a trie | |
337 # - ref: | |
338 # - https://pypi.org/project/pyahocorasick/ | |
339 # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm | |
340 # - https://en.wikipedia.org/wiki/Trie | |
341 auto = ahocorasick.Automaton() | |
342 re_phos = re.compile('p') | |
343 # scrub out unsearchable characters per section | |
344 # "Match the p_peptides to the @sequences array:" | |
345 # of the original | |
346 # PhosphoPeptide Upstream Kinase Mapping.pl | |
347 # which originally read | |
348 # $tmp_p_peptide =~ s/#//g; | |
349 # $tmp_p_peptide =~ s/\d//g; | |
350 # $tmp_p_peptide =~ s/\_//g; | |
351 # $tmp_p_peptide =~ s/\.//g; | |
352 # | |
353 re_scrub = re.compile('0-9_.#') | |
354 ppep_count = 0 | |
355 for ppep in generate_ppep(f_name): | |
356 ppep_count += 1 | |
357 add_to_trie = False | |
358 #print(ppep) | |
359 scrubbed = re_scrub.sub('',ppep) | |
360 deppep = re_phos.sub('',scrubbed) | |
361 if options.verbose: | |
362 print("deppep: %s; scrubbed: %s" % (deppep,scrubbed)) | |
363 #print(deppep) | |
364 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) | |
365 if cur.fetchone() is None: | |
366 add_to_trie = True | |
367 cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,)) | |
368 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) | |
369 deppep_id = cur.fetchone()[0] | |
370 if add_to_trie: | |
371 #print((deppep_id, deppep)) | |
372 # Build the trie | |
373 auto.add_word(deppep, (deppep_id, deppep)) | |
374 cur.execute( | |
375 "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)", | |
376 (ppep, scrubbed, deppep_id) | |
377 ) | |
378 # def generate_deppep(): | |
379 # cur.execute("SELECT seq FROM deppep") | |
380 # for row in cur.fetchall(): | |
381 # yield row[0] | |
382 cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)") | |
383 for row in cur.fetchall(): | |
384 deppep_count = row[0] | |
385 | |
386 cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)") | |
387 for row in cur.fetchall(): | |
388 sequence_count = row[0] | |
389 | |
390 print( | |
391 "%d phosphopeptides were read from input" % ppep_count | |
392 ) | |
393 print( | |
394 "%d corresponding dephosphopeptides are represented in input" % deppep_count | |
395 ) | |
396 # Look for cases where both Gene_Name and Sequence are identical | |
397 cur.execute(''' | |
398 SELECT Uniprot_ID, Gene_Name, Sequence | |
399 FROM UniProtKB | |
400 WHERE Sequence IN ( | |
401 SELECT Sequence | |
402 FROM UniProtKB | |
403 GROUP BY Sequence, Gene_Name | |
404 HAVING count(*) > 1 | |
405 ) | |
406 ORDER BY Sequence | |
407 ''') | |
408 duplicate_count = 0 | |
409 old_seq = '' | |
410 for row in cur.fetchall(): | |
411 if duplicate_count == 0: | |
412 print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).") | |
413 if row[2] != old_seq: | |
414 old_seq = row[2] | |
415 duplicate_count += 1 | |
416 if options.warn_duplicates: | |
417 print("\n%s\t%s\t%s" % row) | |
418 else: | |
419 if options.warn_duplicates: | |
420 print("%s\t%s" % (row[0], row[1])) | |
421 if duplicate_count > 0: | |
422 print("\n%d sequences have duplicated accession IDs\n" % duplicate_count) | |
423 | |
424 print( | |
425 "%s accession sequences will be searched\n" % sequence_count | |
426 ) | |
427 | |
428 #print(auto.dump()) | |
429 | |
430 # Convert the trie to an automaton (a finite-state machine) | |
431 auto.make_automaton() | |
432 | |
433 # Execute query for seqs and metadata without fetching the results yet | |
434 uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL) | |
435 while batch := uniprot_seq_and_id.fetchmany(size=50): | |
436 if None == batch: | |
437 break | |
438 for Sequence, UniProtKB_id in batch: | |
439 if Sequence is not None: | |
440 for end_index, (insert_order, original_value) in auto.iter(Sequence): | |
441 ker.execute(''' | |
442 INSERT INTO deppep_UniProtKB | |
443 (deppep_id,UniProtKB_id,pos_start,pos_end) | |
444 VALUES (?,?,?,?) | |
445 ''', ( | |
446 insert_order, | |
447 UniProtKB_id, | |
448 1 + end_index - len(original_value), | |
449 end_index | |
450 ) | |
451 ) | |
452 else: | |
453 raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,)) | |
454 ker.execute(""" | |
455 SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' | |
456 FROM uniprotkb_pep_ppep_view | |
457 """ | |
458 ) | |
459 for row in ker.fetchall(): | |
460 print(row[0]) | |
461 | |
462 ker.execute(""" | |
463 SELECT count(*) || ' accession matches were found', count(*) AS accession_count | |
464 FROM ( | |
465 SELECT accession | |
466 FROM uniprotkb_pep_ppep_view | |
467 GROUP BY accession | |
468 ) | |
469 """ | |
470 ) | |
471 for row in ker.fetchall(): | |
472 print(row[0]) | |
473 accession_count = row[1] | |
474 | |
475 ker.execute(""" | |
476 SELECT count(*) || ' peptide matches were found' | |
477 FROM ( | |
478 SELECT peptide | |
479 FROM uniprotkb_pep_ppep_view | |
480 GROUP BY peptide | |
481 ) | |
482 """ | |
483 ) | |
484 for row in ker.fetchall(): | |
485 print(row[0]) | |
486 | |
487 ker.execute(""" | |
488 SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count | |
489 FROM ( | |
490 SELECT phosphopeptide | |
491 FROM uniprotkb_pep_ppep_view | |
492 GROUP BY phosphopeptide | |
493 ) | |
494 """ | |
495 ) | |
496 for row in ker.fetchall(): | |
497 print(row[0]) | |
498 phosphopeptide_count = row[1] | |
499 | |
500 con.commit() | |
501 ker.execute('vacuum') | |
502 con.close() | |
503 | |
504 if __name__ == "__main__": | |
505 wrap_start_time = time.perf_counter() | |
506 __main__() | |
507 wrap_stop_time = time.perf_counter() | |
508 # print(wrap_start_time) | |
509 # print(wrap_stop_time) | |
510 print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),) | |
511 | |
512 # vim: sw=4 ts=4 et ai : |