Mercurial > repos > eschen42 > mqppep_preproc
diff search_ppep.py @ 5:b91809a18dbe draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
| author | eschen42 | 
|---|---|
| date | Thu, 10 Mar 2022 23:43:38 +0000 | 
| parents | 2c7e1b167736 | 
| children | 42daf70d4ed4 | 
line wrap: on
 line diff
--- a/search_ppep.py Mon Mar 07 20:43:23 2022 +0000 +++ b/search_ppep.py Thu Mar 10 23:43:38 2022 +0000 @@ -3,20 +3,19 @@ import argparse import os.path +import re import sqlite3 -import re +import sys # import the sys module for exc_info +import time +import traceback # import the traceback module for format_exception from codecs import getreader as cx_getreader -import time # For Aho-Corasick search for fixed set of substrings # - add_word # - make_automaton # - iter import ahocorasick -# Support map over auto.iter(...) -# - itemgetter -import operator -#import hashlib + # ref: https://stackoverflow.com/a/8915613/15509512 # answers: "How to handle exceptions in a list comprehensions" @@ -29,7 +28,8 @@ # with args (0,) # exception: math domain error # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] -def catch(func, *args, handle=lambda e : e, **kwargs): +def catch(func, *args, handle=lambda e: e, **kwargs): + try: return func(*args, **kwargs) except Exception as e: @@ -38,13 +38,13 @@ print(" caught exception: %s" % str(e)) (ty, va, tb) = sys.exc_info() print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) - #exit(-1) - return None # was handle(e) + # exit(-1) + return None # was handle(e) + def __main__(): - ITEM_GETTER = operator.itemgetter(1) - DROP_TABLES_SQL = ''' + DROP_TABLES_SQL = """ DROP VIEW IF EXISTS ppep_gene_site_view; DROP VIEW IF EXISTS uniprot_view; DROP VIEW IF EXISTS uniprotkb_pep_ppep_view; @@ -59,9 +59,9 @@ DROP TABLE IF EXISTS ppep_gene_site; DROP TABLE IF EXISTS ppep_metadata; DROP TABLE IF EXISTS ppep_intensity; - ''' + """ - CREATE_TABLES_SQL = ''' + CREATE_TABLES_SQL = """ CREATE TABLE deppep ( id INTEGER PRIMARY KEY , seq TEXT UNIQUE ON CONFLICT IGNORE @@ -213,53 +213,55 @@ AND ppep_intensity.ppep_id = ppep.id ; - ''' + """ - UNIPROT_SEQ_AND_ID_SQL = ''' + UNIPROT_SEQ_AND_ID_SQL = """ select Sequence, Uniprot_ID from UniProtKB - ''' + """ # Parse Command Line parser = argparse.ArgumentParser( - description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).' - ) + description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)." + ) # inputs: # Phosphopeptide data for experimental results, including the intensities # and the mapping to kinase domains, in tabular format. parser.add_argument( - '--phosphopeptides', '-p', + "--phosphopeptides", + "-p", nargs=1, required=True, - dest='phosphopeptides', - help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool' - ) + dest="phosphopeptides", + help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool", + ) parser.add_argument( - '--uniprotkb', '-u', + "--uniprotkb", + "-u", nargs=1, required=True, - dest='uniprotkb', - help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool' - ) + dest="uniprotkb", + help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool", + ) parser.add_argument( - '--schema', - action='store_true', - dest='db_schema', - help='show updated database schema' - ) + "--schema", + action="store_true", + dest="db_schema", + help="show updated database schema", + ) parser.add_argument( - '--warn-duplicates', - action='store_true', - dest='warn_duplicates', - help='show warnings for duplicated sequences' - ) + "--warn-duplicates", + action="store_true", + dest="warn_duplicates", + help="show warnings for duplicated sequences", + ) parser.add_argument( - '--verbose', - action='store_true', - dest='verbose', - help='show somewhat verbose program tracing' - ) + "--verbose", + action="store_true", + dest="verbose", + help="show somewhat verbose program tracing", + ) # "Make it so!" (parse the arguments) options = parser.parse_args() if options.verbose: @@ -269,9 +271,9 @@ if options.phosphopeptides is None: exit('Argument "phosphopeptides" is required but not supplied') try: - f_name = os.path.abspath(options.phosphopeptides[0]) + f_name = os.path.abspath(options.phosphopeptides[0]) except Exception as e: - exit('Error parsing phosphopeptides argument: %s' % (e)) + exit("Error parsing phosphopeptides argument: %s" % (e)) # path to SQLite input/output tabular file if options.uniprotkb is None: @@ -279,7 +281,7 @@ try: db_name = os.path.abspath(options.uniprotkb[0]) except Exception as e: - exit('Error parsing uniprotkb argument: %s' % (e)) + exit("Error parsing uniprotkb argument: %s" % (e)) # print("options.schema is %d" % options.db_schema) @@ -302,21 +304,23 @@ cur.executescript(CREATE_TABLES_SQL) if options.db_schema: - print("\nAfter creating tables/views that are to be created, schema is:") + print( + "\nAfter creating tables/views that are to be created, schema is:" + ) cur.execute("SELECT * FROM sqlite_schema") for row in cur.fetchall(): if row[4] is not None: print("%s;" % row[4]) def generate_ppep(f): - #get keys from upstream tabular file using readline() + # get keys from upstream tabular file using readline() # ref: https://stackoverflow.com/a/16713581/15509512 # answer to "Use codecs to read file with correct encoding" - file1_encoded = open(f, 'rb') + file1_encoded = open(f, "rb") file1 = cx_getreader("latin-1")(file1_encoded) count = 0 - re_tab = re.compile('^[^\t]*') + re_tab = re.compile("^[^\t]*") re_quote = re.compile('"') while True: count += 1 @@ -328,7 +332,7 @@ break if count > 1: m = re_tab.match(line) - m = re_quote.sub('',m[0]) + m = re_quote.sub("", m[0]) yield m file1.close() file1_encoded.close() @@ -339,7 +343,7 @@ # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm # - https://en.wikipedia.org/wiki/Trie auto = ahocorasick.Automaton() - re_phos = re.compile('p') + re_phos = re.compile("p") # scrub out unsearchable characters per section # "Match the p_peptides to the @sequences array:" # of the original @@ -350,17 +354,17 @@ # $tmp_p_peptide =~ s/\_//g; # $tmp_p_peptide =~ s/\.//g; # - re_scrub = re.compile('0-9_.#') + re_scrub = re.compile("0-9_.#") ppep_count = 0 for ppep in generate_ppep(f_name): ppep_count += 1 add_to_trie = False - #print(ppep) - scrubbed = re_scrub.sub('',ppep) - deppep = re_phos.sub('',scrubbed) + # print(ppep) + scrubbed = re_scrub.sub("", ppep) + deppep = re_phos.sub("", scrubbed) if options.verbose: - print("deppep: %s; scrubbed: %s" % (deppep,scrubbed)) - #print(deppep) + print("deppep: %s; scrubbed: %s" % (deppep, scrubbed)) + # print(deppep) cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) if cur.fetchone() is None: add_to_trie = True @@ -368,13 +372,13 @@ cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) deppep_id = cur.fetchone()[0] if add_to_trie: - #print((deppep_id, deppep)) + # print((deppep_id, deppep)) # Build the trie auto.add_word(deppep, (deppep_id, deppep)) cur.execute( "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)", - (ppep, scrubbed, deppep_id) - ) + (ppep, scrubbed, deppep_id), + ) # def generate_deppep(): # cur.execute("SELECT seq FROM deppep") # for row in cur.fetchall(): @@ -383,18 +387,20 @@ for row in cur.fetchall(): deppep_count = row[0] - cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)") + cur.execute( + "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)" + ) for row in cur.fetchall(): sequence_count = row[0] - print( - "%d phosphopeptides were read from input" % ppep_count - ) + print("%d phosphopeptides were read from input" % ppep_count) print( - "%d corresponding dephosphopeptides are represented in input" % deppep_count - ) + "%d corresponding dephosphopeptides are represented in input" + % deppep_count + ) # Look for cases where both Gene_Name and Sequence are identical - cur.execute(''' + cur.execute( + """ SELECT Uniprot_ID, Gene_Name, Sequence FROM UniProtKB WHERE Sequence IN ( @@ -404,12 +410,15 @@ HAVING count(*) > 1 ) ORDER BY Sequence - ''') + """ + ) duplicate_count = 0 - old_seq = '' + old_seq = "" for row in cur.fetchall(): if duplicate_count == 0: - print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).") + print( + "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)." + ) if row[2] != old_seq: old_seq = row[2] duplicate_count += 1 @@ -419,47 +428,57 @@ if options.warn_duplicates: print("%s\t%s" % (row[0], row[1])) if duplicate_count > 0: - print("\n%d sequences have duplicated accession IDs\n" % duplicate_count) + print( + "\n%d sequences have duplicated accession IDs\n" % duplicate_count + ) - print( - "%s accession sequences will be searched\n" % sequence_count - ) + print("%s accession sequences will be searched\n" % sequence_count) - #print(auto.dump()) + # print(auto.dump()) # Convert the trie to an automaton (a finite-state machine) auto.make_automaton() # Execute query for seqs and metadata without fetching the results yet uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL) - while batch := uniprot_seq_and_id.fetchmany(size=50): - if None == batch: - break - for Sequence, UniProtKB_id in batch: - if Sequence is not None: - for end_index, (insert_order, original_value) in auto.iter(Sequence): - ker.execute(''' + while 1: + batch = uniprot_seq_and_id.fetchmany(size=50) + if not batch: + break + for Sequence, UniProtKB_id in batch: + if Sequence is not None: + for end_index, (insert_order, original_value) in auto.iter( + Sequence + ): + ker.execute( + """ INSERT INTO deppep_UniProtKB (deppep_id,UniProtKB_id,pos_start,pos_end) VALUES (?,?,?,?) - ''', ( - insert_order, - UniProtKB_id, - 1 + end_index - len(original_value), - end_index - ) - ) - else: - raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,)) - ker.execute(""" + """, + ( + insert_order, + UniProtKB_id, + 1 + end_index - len(original_value), + end_index, + ), + ) + else: + raise ValueError( + "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" + % (UniProtKB_id,) + ) + ker.execute( + """ SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' FROM uniprotkb_pep_ppep_view """ - ) + ) for row in ker.fetchall(): print(row[0]) - ker.execute(""" + ker.execute( + """ SELECT count(*) || ' accession matches were found', count(*) AS accession_count FROM ( SELECT accession @@ -467,12 +486,12 @@ GROUP BY accession ) """ - ) + ) for row in ker.fetchall(): - print(row[0]) - accession_count = row[1] + print(row[0]) - ker.execute(""" + ker.execute( + """ SELECT count(*) || ' peptide matches were found' FROM ( SELECT peptide @@ -480,11 +499,12 @@ GROUP BY peptide ) """ - ) + ) for row in ker.fetchall(): - print(row[0]) + print(row[0]) - ker.execute(""" + ker.execute( + """ SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count FROM ( SELECT phosphopeptide @@ -492,21 +512,24 @@ GROUP BY phosphopeptide ) """ - ) + ) for row in ker.fetchall(): - print(row[0]) - phosphopeptide_count = row[1] + print(row[0]) con.commit() - ker.execute('vacuum') + ker.execute("vacuum") con.close() + if __name__ == "__main__": wrap_start_time = time.perf_counter() __main__() wrap_stop_time = time.perf_counter() # print(wrap_start_time) # print(wrap_stop_time) - print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),) + print( + "\nThe matching process took %d milliseconds to run.\n" + % ((wrap_stop_time - wrap_start_time) * 1000), + ) - # vim: sw=4 ts=4 et ai : +# vim: sw=4 ts=4 et ai :
