mqppep_anova: search_ppep.py comparison

comparison search_ppep.py @ 5:d4d531006735 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"

author	eschen42
date	Thu, 10 Mar 2022 23:42:48 +0000
parents	c1403d18c189
children	922d309640db

comparison

equal deleted inserted replaced

-:cfc65ae227f8
+:d4d531006735
 #!/usr/bin/env python
 # Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
 import argparse
 import os.path
+import re
 import sqlite3
-import re
+import sys  # import the sys module for exc_info
+import time
+import traceback  # import the traceback module for format_exception
 from codecs import getreader as cx_getreader
-import time
 # For Aho-Corasick search for fixed set of substrings
 # - add_word
 # - make_automaton
 # - iter
 import ahocorasick
-# Support map over auto.iter(...)
-# - itemgetter
-import operator
-#import hashlib
 # ref: https://stackoverflow.com/a/8915613/15509512
 #   answers: "How to handle exceptions in a list comprehensions"
 #   usage:
 #       from math import log
 #   producing:
 #       for <built-in function log>
 #         with args (0,)
 #         exception: math domain error
 #       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
-def catch(func, *args, handle=lambda e : e, **kwargs):
+def catch(func, *args, handle=lambda e: e, **kwargs):
 try:
 return func(*args, **kwargs)
 except Exception as e:
 print("For %s" % str(func))
 print("  with args %s" % str(args))
 print("  caught exception: %s" % str(e))
 (ty, va, tb) = sys.exc_info()
 print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
-#exit(-1)
+# exit(-1)
-return None # was handle(e)
+return None  # was handle(e)
 def __main__():
-ITEM_GETTER = operator.itemgetter(1)
+DROP_TABLES_SQL = """
-DROP_TABLES_SQL = '''
 DROP VIEW  IF EXISTS ppep_gene_site_view;
 DROP VIEW  IF EXISTS uniprot_view;
 DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
 DROP VIEW  IF EXISTS ppep_intensity_view;
 DROP VIEW  IF EXISTS ppep_metadata_view;
 DROP TABLE IF EXISTS deppep_UniProtKB;
 DROP TABLE IF EXISTS deppep;
 DROP TABLE IF EXISTS ppep_gene_site;
 DROP TABLE IF EXISTS ppep_metadata;
 DROP TABLE IF EXISTS ppep_intensity;
-'''
+"""
-CREATE_TABLES_SQL = '''
+CREATE_TABLES_SQL = """
 CREATE TABLE deppep
 ( id INTEGER PRIMARY KEY
 , seq TEXT UNIQUE                            ON CONFLICT IGNORE
 )
 ;
 WHERE
 ppep_intensity.sample_id = sample.id
 AND
 ppep_intensity.ppep_id = ppep.id
 ;
-'''
+"""
-UNIPROT_SEQ_AND_ID_SQL = '''
+UNIPROT_SEQ_AND_ID_SQL = """
 select    Sequence, Uniprot_ID
 from UniProtKB
-'''
+"""
 # Parse Command Line
 parser = argparse.ArgumentParser(
-description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).'
+description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
 )
 # inputs:
 #   Phosphopeptide data for experimental results, including the intensities
 #   and the mapping to kinase domains, in tabular format.
 parser.add_argument(
-'--phosphopeptides', '-p',
+"--phosphopeptides",
+"-p",
 nargs=1,
 required=True,
-dest='phosphopeptides',
+dest="phosphopeptides",
-help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool'
+help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
 )
 parser.add_argument(
-'--uniprotkb', '-u',
+"--uniprotkb",
+"-u",
 nargs=1,
 required=True,
-dest='uniprotkb',
+dest="uniprotkb",
-help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool'
+help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
 )
 parser.add_argument(
-'--schema',
+"--schema",
-action='store_true',
+action="store_true",
-dest='db_schema',
+dest="db_schema",
-help='show updated database schema'
+help="show updated database schema",
 )
 parser.add_argument(
-'--warn-duplicates',
+"--warn-duplicates",
-action='store_true',
+action="store_true",
-dest='warn_duplicates',
+dest="warn_duplicates",
-help='show warnings for duplicated sequences'
+help="show warnings for duplicated sequences",
 )
 parser.add_argument(
-'--verbose',
+"--verbose",
-action='store_true',
+action="store_true",
-dest='verbose',
+dest="verbose",
-help='show somewhat verbose program tracing'
+help="show somewhat verbose program tracing",
 )
 # "Make it so!" (parse the arguments)
 options = parser.parse_args()
 if options.verbose:
 print("options: " + str(options) + "\n")
 # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
 if options.phosphopeptides is None:
 exit('Argument "phosphopeptides" is required but not supplied')
 try:
-f_name  = os.path.abspath(options.phosphopeptides[0])
+f_name = os.path.abspath(options.phosphopeptides[0])
 except Exception as e:
-exit('Error parsing phosphopeptides argument: %s' % (e))
+exit("Error parsing phosphopeptides argument: %s" % (e))
 # path to SQLite input/output tabular file
 if options.uniprotkb is None:
 exit('Argument "uniprotkb" is required but not supplied')
 try:
 db_name = os.path.abspath(options.uniprotkb[0])
 except Exception as e:
-exit('Error parsing uniprotkb argument: %s' % (e))
+exit("Error parsing uniprotkb argument: %s" % (e))
 # print("options.schema is %d" % options.db_schema)
 # db_name = "demo/test.sqlite"
 # f_name  = "demo/test_input.txt"
 #             print("%s;" % row[4])
 cur.executescript(CREATE_TABLES_SQL)
 if options.db_schema:
-print("\nAfter creating tables/views that are to be created, schema is:")
+print(
+"\nAfter creating tables/views that are to be created, schema is:"
+)
 cur.execute("SELECT * FROM sqlite_schema")
 for row in cur.fetchall():
 if row[4] is not None:
 print("%s;" % row[4])
 def generate_ppep(f):
-#get keys from upstream tabular file using readline()
+# get keys from upstream tabular file using readline()
 # ref: https://stackoverflow.com/a/16713581/15509512
 #      answer to "Use codecs to read file with correct encoding"
-file1_encoded = open(f, 'rb')
+file1_encoded = open(f, "rb")
 file1 = cx_getreader("latin-1")(file1_encoded)
 count = 0
-re_tab = re.compile('^[^\t]*')
+re_tab = re.compile("^[^\t]*")
 re_quote = re.compile('"')
 while True:
 count += 1
 # Get next line from file
 line = file1.readline()
 # end of file is reached
 if not line:
 break
 if count > 1:
 m = re_tab.match(line)
-m = re_quote.sub('',m[0])
+m = re_quote.sub("", m[0])
 yield m
 file1.close()
 file1_encoded.close()
 # Build an Aho-Corasick automaton from a trie
 # - ref:
 #   - https://pypi.org/project/pyahocorasick/
 #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
 #   - https://en.wikipedia.org/wiki/Trie
 auto = ahocorasick.Automaton()
-re_phos = re.compile('p')
+re_phos = re.compile("p")
 # scrub out unsearchable characters per section
 #   "Match the p_peptides to the @sequences array:"
 # of the original
 #   PhosphoPeptide Upstream Kinase Mapping.pl
 # which originally read
 #   $tmp_p_peptide =~ s/#//g;
 #   $tmp_p_peptide =~ s/\d//g;
 #   $tmp_p_peptide =~ s/\_//g;
 #   $tmp_p_peptide =~ s/\.//g;
 #
-re_scrub = re.compile('0-9_.#')
+re_scrub = re.compile("0-9_.#")
 ppep_count = 0
 for ppep in generate_ppep(f_name):
 ppep_count += 1
 add_to_trie = False
-#print(ppep)
+# print(ppep)
-scrubbed = re_scrub.sub('',ppep)
+scrubbed = re_scrub.sub("", ppep)
-deppep = re_phos.sub('',scrubbed)
+deppep = re_phos.sub("", scrubbed)
 if options.verbose:
-print("deppep: %s; scrubbed: %s" % (deppep,scrubbed))
+print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
-#print(deppep)
+# print(deppep)
 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
 if cur.fetchone() is None:
 add_to_trie = True
 cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
 deppep_id = cur.fetchone()[0]
 if add_to_trie:
-#print((deppep_id, deppep))
+# print((deppep_id, deppep))
 # Build the trie
 auto.add_word(deppep, (deppep_id, deppep))
 cur.execute(
 "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
-(ppep, scrubbed, deppep_id)
+(ppep, scrubbed, deppep_id),
 )
 # def generate_deppep():
 #     cur.execute("SELECT seq FROM deppep")
 #     for row in cur.fetchall():
 #         yield row[0]
 cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
 for row in cur.fetchall():
 deppep_count = row[0]
-cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)")
+cur.execute(
+"SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+)
 for row in cur.fetchall():
 sequence_count = row[0]
+print("%d phosphopeptides were read from input" % ppep_count)
 print(
-"%d phosphopeptides were read from input" % ppep_count
+"%d corresponding dephosphopeptides are represented in input"
-)
+% deppep_count
-print(
+)
-"%d corresponding dephosphopeptides are represented in input" % deppep_count
-)
 # Look for cases where both Gene_Name and Sequence are identical
-cur.execute('''
+cur.execute(
+"""
 SELECT Uniprot_ID, Gene_Name, Sequence
 FROM   UniProtKB
 WHERE  Sequence IN (
 SELECT   Sequence
 FROM     UniProtKB
 GROUP BY Sequence, Gene_Name
 HAVING   count(*) > 1
 )
 ORDER BY Sequence
-''')
+"""
+)
 duplicate_count = 0
-old_seq = ''
+old_seq = ""
 for row in cur.fetchall():
 if duplicate_count == 0:
-print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")
+print(
+"\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
+)
 if row[2] != old_seq:
 old_seq = row[2]
 duplicate_count += 1
 if options.warn_duplicates:
 print("\n%s\t%s\t%s" % row)
 else:
 if options.warn_duplicates:
 print("%s\t%s" % (row[0], row[1]))
 if duplicate_count > 0:
-print("\n%d sequences have duplicated accession IDs\n" % duplicate_count)
+print(
+"\n%d sequences have duplicated accession IDs\n" % duplicate_count
-print(
+)
-"%s accession sequences will be searched\n" % sequence_count
-)
+print("%s accession sequences will be searched\n" % sequence_count)
-#print(auto.dump())
+# print(auto.dump())
 # Convert the trie to an automaton (a finite-state machine)
 auto.make_automaton()
 # Execute query for seqs and metadata without fetching the results yet
 uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
-while batch := uniprot_seq_and_id.fetchmany(size=50):
+while 1:
-if None == batch:
+batch = uniprot_seq_and_id.fetchmany(size=50)
-break
+if not batch:
-for Sequence, UniProtKB_id in batch:
+break
-if Sequence is not None:
+for Sequence, UniProtKB_id in batch:
-for end_index, (insert_order, original_value) in auto.iter(Sequence):
+if Sequence is not None:
-ker.execute('''
+for end_index, (insert_order, original_value) in auto.iter(
+Sequence
+):
+ker.execute(
+"""
 INSERT INTO deppep_UniProtKB
 (deppep_id,UniProtKB_id,pos_start,pos_end)
 VALUES (?,?,?,?)
-''', (
+""",
-insert_order,
+(
-UniProtKB_id,
+insert_order,
-1 + end_index - len(original_value),
+UniProtKB_id,
-end_index
+1 + end_index - len(original_value),
-)
+end_index,
-)
+),
-else:
+)
-raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))
+else:
-ker.execute("""
+raise ValueError(
+"UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
+% (UniProtKB_id,)
+)
+ker.execute(
+"""
 SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
 FROM     uniprotkb_pep_ppep_view
 """
 )
 for row in ker.fetchall():
 print(row[0])
-ker.execute("""
+ker.execute(
+"""
 SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
 FROM     (
 SELECT   accession
 FROM     uniprotkb_pep_ppep_view
 GROUP BY accession
 )
 """
 )
 for row in ker.fetchall():
 print(row[0])
-accession_count = row[1]
+ker.execute(
-ker.execute("""
+"""
 SELECT   count(*) || ' peptide matches were found'
 FROM     (
 SELECT   peptide
 FROM     uniprotkb_pep_ppep_view
 GROUP BY peptide
 )
 """
 )
 for row in ker.fetchall():
 print(row[0])
-ker.execute("""
+ker.execute(
+"""
 SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
 FROM     (
 SELECT   phosphopeptide
 FROM     uniprotkb_pep_ppep_view
 GROUP BY phosphopeptide
 )
 """
 )
 for row in ker.fetchall():
 print(row[0])
-phosphopeptide_count = row[1]
 con.commit()
-ker.execute('vacuum')
+ker.execute("vacuum")
 con.close()
 if __name__ == "__main__":
 wrap_start_time = time.perf_counter()
 __main__()
 wrap_stop_time = time.perf_counter()
 # print(wrap_start_time)
 # print(wrap_stop_time)
-print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),)
+print(
+"\nThe matching process took %d milliseconds to run.\n"
-# vim: sw=4 ts=4 et ai :
+% ((wrap_stop_time - wrap_start_time) * 1000),
+)
+# vim: sw=4 ts=4 et ai :

Mercurial > repos > eschen42 > mqppep_anova

comparison search_ppep.py @ 5:d4d531006735 draft