comparison search_ppep.py @ 0:c1403d18c189 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author eschen42
date Mon, 07 Mar 2022 19:05:01 +0000
parents
children d4d531006735
comparison
equal deleted inserted replaced
-1:000000000000 0:c1403d18c189
1 #!/usr/bin/env python
2 # Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
3
4 import argparse
5 import os.path
6 import sqlite3
7 import re
8 from codecs import getreader as cx_getreader
9 import time
10
11 # For Aho-Corasick search for fixed set of substrings
12 # - add_word
13 # - make_automaton
14 # - iter
15 import ahocorasick
16 # Support map over auto.iter(...)
17 # - itemgetter
18 import operator
19 #import hashlib
20
21 # ref: https://stackoverflow.com/a/8915613/15509512
22 # answers: "How to handle exceptions in a list comprehensions"
23 # usage:
24 # from math import log
25 # eggs = [1,3,0,3,2]
26 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
27 # producing:
28 # for <built-in function log>
29 # with args (0,)
30 # exception: math domain error
31 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
32 def catch(func, *args, handle=lambda e : e, **kwargs):
33 try:
34 return func(*args, **kwargs)
35 except Exception as e:
36 print("For %s" % str(func))
37 print(" with args %s" % str(args))
38 print(" caught exception: %s" % str(e))
39 (ty, va, tb) = sys.exc_info()
40 print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
41 #exit(-1)
42 return None # was handle(e)
43
44 def __main__():
45 ITEM_GETTER = operator.itemgetter(1)
46
47 DROP_TABLES_SQL = '''
48 DROP VIEW IF EXISTS ppep_gene_site_view;
49 DROP VIEW IF EXISTS uniprot_view;
50 DROP VIEW IF EXISTS uniprotkb_pep_ppep_view;
51 DROP VIEW IF EXISTS ppep_intensity_view;
52 DROP VIEW IF EXISTS ppep_metadata_view;
53
54 DROP TABLE IF EXISTS sample;
55 DROP TABLE IF EXISTS ppep;
56 DROP TABLE IF EXISTS site_type;
57 DROP TABLE IF EXISTS deppep_UniProtKB;
58 DROP TABLE IF EXISTS deppep;
59 DROP TABLE IF EXISTS ppep_gene_site;
60 DROP TABLE IF EXISTS ppep_metadata;
61 DROP TABLE IF EXISTS ppep_intensity;
62 '''
63
64 CREATE_TABLES_SQL = '''
65 CREATE TABLE deppep
66 ( id INTEGER PRIMARY KEY
67 , seq TEXT UNIQUE ON CONFLICT IGNORE
68 )
69 ;
70 CREATE TABLE deppep_UniProtKB
71 ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
72 , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
73 , pos_start INTEGER
74 , pos_end INTEGER
75 , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
76 ON CONFLICT IGNORE
77 )
78 ;
79 CREATE TABLE ppep
80 ( id INTEGER PRIMARY KEY
81 , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
82 , seq TEXT UNIQUE ON CONFLICT IGNORE
83 , scrubbed TEXT
84 );
85 CREATE TABLE site_type
86 ( id INTEGER PRIMARY KEY
87 , type_name TEXT UNIQUE ON CONFLICT IGNORE
88 );
89 CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
90 ;
91 CREATE TABLE sample
92 ( id INTEGER PRIMARY KEY
93 , name TEXT UNIQUE ON CONFLICT IGNORE
94 )
95 ;
96 CREATE VIEW uniprot_view AS
97 SELECT DISTINCT
98 Uniprot_ID
99 , Description
100 , Organism_Name
101 , Organism_ID
102 , Gene_Name
103 , PE
104 , SV
105 , Sequence
106 , Description || ' OS=' ||
107 Organism_Name || ' OX=' || Organism_ID ||
108 CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END ||
109 CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END ||
110 CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END
111 AS long_description
112 , Database
113 FROM UniProtKB
114 ;
115 CREATE VIEW uniprotkb_pep_ppep_view AS
116 SELECT deppep_UniProtKB.UniprotKB_ID AS accession
117 , deppep_UniProtKB.pos_start AS pos_start
118 , deppep_UniProtKB.pos_end AS pos_end
119 , deppep.seq AS peptide
120 , ppep.seq AS phosphopeptide
121 , ppep.scrubbed AS scrubbed
122 , uniprot_view.Sequence AS sequence
123 , uniprot_view.Description AS description
124 , uniprot_view.long_description AS long_description
125 , ppep.id AS ppep_id
126 FROM ppep, deppep, deppep_UniProtKB, uniprot_view
127 WHERE deppep.id = ppep.deppep_id
128 AND deppep.id = deppep_UniProtKB.deppep_id
129 AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
130 ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
131 ;
132 CREATE TABLE ppep_gene_site
133 ( ppep_id INTEGER REFERENCES ppep(id)
134 , gene_names TEXT
135 , site_type_id INTEGER REFERENCES site_type(id)
136 , kinase_map TEXT
137 , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE
138 )
139 ;
140 CREATE VIEW ppep_gene_site_view AS
141 SELECT DISTINCT
142 ppep.seq AS phospho_peptide
143 , ppep_id
144 , gene_names
145 , type_name
146 , kinase_map
147 FROM
148 ppep, ppep_gene_site, site_type
149 WHERE
150 ppep_gene_site.ppep_id = ppep.id
151 AND
152 ppep_gene_site.site_type_id = site_type.id
153 ORDER BY
154 ppep.seq
155 ;
156 CREATE TABLE ppep_metadata
157 ( ppep_id INTEGER REFERENCES ppep(id)
158 , protein_description TEXT
159 , gene_name TEXT
160 , FASTA_name TEXT
161 , phospho_sites TEXT
162 , motifs_unique TEXT
163 , accessions TEXT
164 , motifs_all_members TEXT
165 , domain TEXT
166 , ON_FUNCTION TEXT
167 , ON_PROCESS TEXT
168 , ON_PROT_INTERACT TEXT
169 , ON_OTHER_INTERACT TEXT
170 , notes TEXT
171 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
172 )
173 ;
174 CREATE VIEW ppep_metadata_view AS
175 SELECT DISTINCT
176 ppep.seq AS phospho_peptide
177 , protein_description
178 , gene_name
179 , FASTA_name
180 , phospho_sites
181 , motifs_unique
182 , accessions
183 , motifs_all_members
184 , domain
185 , ON_FUNCTION
186 , ON_PROCESS
187 , ON_PROT_INTERACT
188 , ON_OTHER_INTERACT
189 , notes
190 FROM
191 ppep, ppep_metadata
192 WHERE
193 ppep_metadata.ppep_id = ppep.id
194 ORDER BY
195 ppep.seq
196 ;
197 CREATE TABLE ppep_intensity
198 ( ppep_id INTEGER REFERENCES ppep(id)
199 , sample_id INTEGER
200 , intensity INTEGER
201 , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE
202 )
203 ;
204 CREATE VIEW ppep_intensity_view AS
205 SELECT DISTINCT
206 ppep.seq AS phospho_peptide
207 , sample.name AS sample
208 , intensity
209 FROM
210 ppep, sample, ppep_intensity
211 WHERE
212 ppep_intensity.sample_id = sample.id
213 AND
214 ppep_intensity.ppep_id = ppep.id
215 ;
216 '''
217
218 UNIPROT_SEQ_AND_ID_SQL = '''
219 select Sequence, Uniprot_ID
220 from UniProtKB
221 '''
222
223 # Parse Command Line
224 parser = argparse.ArgumentParser(
225 description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).'
226 )
227
228 # inputs:
229 # Phosphopeptide data for experimental results, including the intensities
230 # and the mapping to kinase domains, in tabular format.
231 parser.add_argument(
232 '--phosphopeptides', '-p',
233 nargs=1,
234 required=True,
235 dest='phosphopeptides',
236 help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool'
237 )
238 parser.add_argument(
239 '--uniprotkb', '-u',
240 nargs=1,
241 required=True,
242 dest='uniprotkb',
243 help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool'
244 )
245 parser.add_argument(
246 '--schema',
247 action='store_true',
248 dest='db_schema',
249 help='show updated database schema'
250 )
251 parser.add_argument(
252 '--warn-duplicates',
253 action='store_true',
254 dest='warn_duplicates',
255 help='show warnings for duplicated sequences'
256 )
257 parser.add_argument(
258 '--verbose',
259 action='store_true',
260 dest='verbose',
261 help='show somewhat verbose program tracing'
262 )
263 # "Make it so!" (parse the arguments)
264 options = parser.parse_args()
265 if options.verbose:
266 print("options: " + str(options) + "\n")
267
268 # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
269 if options.phosphopeptides is None:
270 exit('Argument "phosphopeptides" is required but not supplied')
271 try:
272 f_name = os.path.abspath(options.phosphopeptides[0])
273 except Exception as e:
274 exit('Error parsing phosphopeptides argument: %s' % (e))
275
276 # path to SQLite input/output tabular file
277 if options.uniprotkb is None:
278 exit('Argument "uniprotkb" is required but not supplied')
279 try:
280 db_name = os.path.abspath(options.uniprotkb[0])
281 except Exception as e:
282 exit('Error parsing uniprotkb argument: %s' % (e))
283
284 # print("options.schema is %d" % options.db_schema)
285
286 # db_name = "demo/test.sqlite"
287 # f_name = "demo/test_input.txt"
288
289 con = sqlite3.connect(db_name)
290 cur = con.cursor()
291 ker = con.cursor()
292
293 cur.executescript(DROP_TABLES_SQL)
294
295 # if options.db_schema:
296 # print("\nAfter dropping tables/views that are to be created, schema is:")
297 # cur.execute("SELECT * FROM sqlite_schema")
298 # for row in cur.fetchall():
299 # if row[4] is not None:
300 # print("%s;" % row[4])
301
302 cur.executescript(CREATE_TABLES_SQL)
303
304 if options.db_schema:
305 print("\nAfter creating tables/views that are to be created, schema is:")
306 cur.execute("SELECT * FROM sqlite_schema")
307 for row in cur.fetchall():
308 if row[4] is not None:
309 print("%s;" % row[4])
310
311 def generate_ppep(f):
312 #get keys from upstream tabular file using readline()
313 # ref: https://stackoverflow.com/a/16713581/15509512
314 # answer to "Use codecs to read file with correct encoding"
315 file1_encoded = open(f, 'rb')
316 file1 = cx_getreader("latin-1")(file1_encoded)
317
318 count = 0
319 re_tab = re.compile('^[^\t]*')
320 re_quote = re.compile('"')
321 while True:
322 count += 1
323 # Get next line from file
324 line = file1.readline()
325 # if line is empty
326 # end of file is reached
327 if not line:
328 break
329 if count > 1:
330 m = re_tab.match(line)
331 m = re_quote.sub('',m[0])
332 yield m
333 file1.close()
334 file1_encoded.close()
335
336 # Build an Aho-Corasick automaton from a trie
337 # - ref:
338 # - https://pypi.org/project/pyahocorasick/
339 # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
340 # - https://en.wikipedia.org/wiki/Trie
341 auto = ahocorasick.Automaton()
342 re_phos = re.compile('p')
343 # scrub out unsearchable characters per section
344 # "Match the p_peptides to the @sequences array:"
345 # of the original
346 # PhosphoPeptide Upstream Kinase Mapping.pl
347 # which originally read
348 # $tmp_p_peptide =~ s/#//g;
349 # $tmp_p_peptide =~ s/\d//g;
350 # $tmp_p_peptide =~ s/\_//g;
351 # $tmp_p_peptide =~ s/\.//g;
352 #
353 re_scrub = re.compile('0-9_.#')
354 ppep_count = 0
355 for ppep in generate_ppep(f_name):
356 ppep_count += 1
357 add_to_trie = False
358 #print(ppep)
359 scrubbed = re_scrub.sub('',ppep)
360 deppep = re_phos.sub('',scrubbed)
361 if options.verbose:
362 print("deppep: %s; scrubbed: %s" % (deppep,scrubbed))
363 #print(deppep)
364 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
365 if cur.fetchone() is None:
366 add_to_trie = True
367 cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
368 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
369 deppep_id = cur.fetchone()[0]
370 if add_to_trie:
371 #print((deppep_id, deppep))
372 # Build the trie
373 auto.add_word(deppep, (deppep_id, deppep))
374 cur.execute(
375 "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
376 (ppep, scrubbed, deppep_id)
377 )
378 # def generate_deppep():
379 # cur.execute("SELECT seq FROM deppep")
380 # for row in cur.fetchall():
381 # yield row[0]
382 cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
383 for row in cur.fetchall():
384 deppep_count = row[0]
385
386 cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)")
387 for row in cur.fetchall():
388 sequence_count = row[0]
389
390 print(
391 "%d phosphopeptides were read from input" % ppep_count
392 )
393 print(
394 "%d corresponding dephosphopeptides are represented in input" % deppep_count
395 )
396 # Look for cases where both Gene_Name and Sequence are identical
397 cur.execute('''
398 SELECT Uniprot_ID, Gene_Name, Sequence
399 FROM UniProtKB
400 WHERE Sequence IN (
401 SELECT Sequence
402 FROM UniProtKB
403 GROUP BY Sequence, Gene_Name
404 HAVING count(*) > 1
405 )
406 ORDER BY Sequence
407 ''')
408 duplicate_count = 0
409 old_seq = ''
410 for row in cur.fetchall():
411 if duplicate_count == 0:
412 print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")
413 if row[2] != old_seq:
414 old_seq = row[2]
415 duplicate_count += 1
416 if options.warn_duplicates:
417 print("\n%s\t%s\t%s" % row)
418 else:
419 if options.warn_duplicates:
420 print("%s\t%s" % (row[0], row[1]))
421 if duplicate_count > 0:
422 print("\n%d sequences have duplicated accession IDs\n" % duplicate_count)
423
424 print(
425 "%s accession sequences will be searched\n" % sequence_count
426 )
427
428 #print(auto.dump())
429
430 # Convert the trie to an automaton (a finite-state machine)
431 auto.make_automaton()
432
433 # Execute query for seqs and metadata without fetching the results yet
434 uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
435 while batch := uniprot_seq_and_id.fetchmany(size=50):
436 if None == batch:
437 break
438 for Sequence, UniProtKB_id in batch:
439 if Sequence is not None:
440 for end_index, (insert_order, original_value) in auto.iter(Sequence):
441 ker.execute('''
442 INSERT INTO deppep_UniProtKB
443 (deppep_id,UniProtKB_id,pos_start,pos_end)
444 VALUES (?,?,?,?)
445 ''', (
446 insert_order,
447 UniProtKB_id,
448 1 + end_index - len(original_value),
449 end_index
450 )
451 )
452 else:
453 raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))
454 ker.execute("""
455 SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found'
456 FROM uniprotkb_pep_ppep_view
457 """
458 )
459 for row in ker.fetchall():
460 print(row[0])
461
462 ker.execute("""
463 SELECT count(*) || ' accession matches were found', count(*) AS accession_count
464 FROM (
465 SELECT accession
466 FROM uniprotkb_pep_ppep_view
467 GROUP BY accession
468 )
469 """
470 )
471 for row in ker.fetchall():
472 print(row[0])
473 accession_count = row[1]
474
475 ker.execute("""
476 SELECT count(*) || ' peptide matches were found'
477 FROM (
478 SELECT peptide
479 FROM uniprotkb_pep_ppep_view
480 GROUP BY peptide
481 )
482 """
483 )
484 for row in ker.fetchall():
485 print(row[0])
486
487 ker.execute("""
488 SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
489 FROM (
490 SELECT phosphopeptide
491 FROM uniprotkb_pep_ppep_view
492 GROUP BY phosphopeptide
493 )
494 """
495 )
496 for row in ker.fetchall():
497 print(row[0])
498 phosphopeptide_count = row[1]
499
500 con.commit()
501 ker.execute('vacuum')
502 con.close()
503
504 if __name__ == "__main__":
505 wrap_start_time = time.perf_counter()
506 __main__()
507 wrap_stop_time = time.perf_counter()
508 # print(wrap_start_time)
509 # print(wrap_stop_time)
510 print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),)
511
512 # vim: sw=4 ts=4 et ai :