diff search_ppep.py @ 5:d4d531006735 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author eschen42
date Thu, 10 Mar 2022 23:42:48 +0000
parents c1403d18c189
children 922d309640db
line wrap: on
line diff
--- a/search_ppep.py	Mon Mar 07 20:43:57 2022 +0000
+++ b/search_ppep.py	Thu Mar 10 23:42:48 2022 +0000
@@ -3,20 +3,19 @@
 
 import argparse
 import os.path
+import re
 import sqlite3
-import re
+import sys  # import the sys module for exc_info
+import time
+import traceback  # import the traceback module for format_exception
 from codecs import getreader as cx_getreader
-import time
 
 # For Aho-Corasick search for fixed set of substrings
 # - add_word
 # - make_automaton
 # - iter
 import ahocorasick
-# Support map over auto.iter(...)
-# - itemgetter
-import operator
-#import hashlib
+
 
 # ref: https://stackoverflow.com/a/8915613/15509512
 #   answers: "How to handle exceptions in a list comprehensions"
@@ -29,7 +28,8 @@
 #         with args (0,)
 #         exception: math domain error
 #       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
-def catch(func, *args, handle=lambda e : e, **kwargs):
+def catch(func, *args, handle=lambda e: e, **kwargs):
+
     try:
         return func(*args, **kwargs)
     except Exception as e:
@@ -38,13 +38,13 @@
         print("  caught exception: %s" % str(e))
         (ty, va, tb) = sys.exc_info()
         print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
-        #exit(-1)
-        return None # was handle(e)
+        # exit(-1)
+        return None  # was handle(e)
+
 
 def __main__():
-    ITEM_GETTER = operator.itemgetter(1)
 
-    DROP_TABLES_SQL = '''
+    DROP_TABLES_SQL = """
         DROP VIEW  IF EXISTS ppep_gene_site_view;
         DROP VIEW  IF EXISTS uniprot_view;
         DROP VIEW  IF EXISTS uniprotkb_pep_ppep_view;
@@ -59,9 +59,9 @@
         DROP TABLE IF EXISTS ppep_gene_site;
         DROP TABLE IF EXISTS ppep_metadata;
         DROP TABLE IF EXISTS ppep_intensity;
-    '''
+    """
 
-    CREATE_TABLES_SQL = '''
+    CREATE_TABLES_SQL = """
         CREATE TABLE deppep
           ( id INTEGER PRIMARY KEY
           , seq TEXT UNIQUE                            ON CONFLICT IGNORE
@@ -213,53 +213,55 @@
             AND
               ppep_intensity.ppep_id = ppep.id
           ;
-    '''
+    """
 
-    UNIPROT_SEQ_AND_ID_SQL = '''
+    UNIPROT_SEQ_AND_ID_SQL = """
         select    Sequence, Uniprot_ID
              from UniProtKB
-    '''
+    """
 
     # Parse Command Line
     parser = argparse.ArgumentParser(
-        description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).'
-        )
+        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+    )
 
     # inputs:
     #   Phosphopeptide data for experimental results, including the intensities
     #   and the mapping to kinase domains, in tabular format.
     parser.add_argument(
-        '--phosphopeptides', '-p',
+        "--phosphopeptides",
+        "-p",
         nargs=1,
         required=True,
-        dest='phosphopeptides',
-        help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool'
-        )
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+    )
     parser.add_argument(
-        '--uniprotkb', '-u',
+        "--uniprotkb",
+        "-u",
         nargs=1,
         required=True,
-        dest='uniprotkb',
-        help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool'
-        )
+        dest="uniprotkb",
+        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+    )
     parser.add_argument(
-        '--schema',
-        action='store_true',
-        dest='db_schema',
-        help='show updated database schema'
-        )
+        "--schema",
+        action="store_true",
+        dest="db_schema",
+        help="show updated database schema",
+    )
     parser.add_argument(
-        '--warn-duplicates',
-        action='store_true',
-        dest='warn_duplicates',
-        help='show warnings for duplicated sequences'
-        )
+        "--warn-duplicates",
+        action="store_true",
+        dest="warn_duplicates",
+        help="show warnings for duplicated sequences",
+    )
     parser.add_argument(
-        '--verbose',
-        action='store_true',
-        dest='verbose',
-        help='show somewhat verbose program tracing'
-        )
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        help="show somewhat verbose program tracing",
+    )
     # "Make it so!" (parse the arguments)
     options = parser.parse_args()
     if options.verbose:
@@ -269,9 +271,9 @@
     if options.phosphopeptides is None:
         exit('Argument "phosphopeptides" is required but not supplied')
     try:
-        f_name  = os.path.abspath(options.phosphopeptides[0])
+        f_name = os.path.abspath(options.phosphopeptides[0])
     except Exception as e:
-        exit('Error parsing phosphopeptides argument: %s' % (e))
+        exit("Error parsing phosphopeptides argument: %s" % (e))
 
     # path to SQLite input/output tabular file
     if options.uniprotkb is None:
@@ -279,7 +281,7 @@
     try:
         db_name = os.path.abspath(options.uniprotkb[0])
     except Exception as e:
-        exit('Error parsing uniprotkb argument: %s' % (e))
+        exit("Error parsing uniprotkb argument: %s" % (e))
 
     # print("options.schema is %d" % options.db_schema)
 
@@ -302,21 +304,23 @@
     cur.executescript(CREATE_TABLES_SQL)
 
     if options.db_schema:
-        print("\nAfter creating tables/views that are to be created, schema is:")
+        print(
+            "\nAfter creating tables/views that are to be created, schema is:"
+        )
         cur.execute("SELECT * FROM sqlite_schema")
         for row in cur.fetchall():
             if row[4] is not None:
                 print("%s;" % row[4])
 
     def generate_ppep(f):
-        #get keys from upstream tabular file using readline()
+        # get keys from upstream tabular file using readline()
         # ref: https://stackoverflow.com/a/16713581/15509512
         #      answer to "Use codecs to read file with correct encoding"
-        file1_encoded = open(f, 'rb')
+        file1_encoded = open(f, "rb")
         file1 = cx_getreader("latin-1")(file1_encoded)
 
         count = 0
-        re_tab = re.compile('^[^\t]*')
+        re_tab = re.compile("^[^\t]*")
         re_quote = re.compile('"')
         while True:
             count += 1
@@ -328,7 +332,7 @@
                 break
             if count > 1:
                 m = re_tab.match(line)
-                m = re_quote.sub('',m[0])
+                m = re_quote.sub("", m[0])
                 yield m
         file1.close()
         file1_encoded.close()
@@ -339,7 +343,7 @@
     #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
     #   - https://en.wikipedia.org/wiki/Trie
     auto = ahocorasick.Automaton()
-    re_phos = re.compile('p')
+    re_phos = re.compile("p")
     # scrub out unsearchable characters per section
     #   "Match the p_peptides to the @sequences array:"
     # of the original
@@ -350,17 +354,17 @@
     #   $tmp_p_peptide =~ s/\_//g;
     #   $tmp_p_peptide =~ s/\.//g;
     #
-    re_scrub = re.compile('0-9_.#')
+    re_scrub = re.compile("0-9_.#")
     ppep_count = 0
     for ppep in generate_ppep(f_name):
         ppep_count += 1
         add_to_trie = False
-        #print(ppep)
-        scrubbed = re_scrub.sub('',ppep)
-        deppep = re_phos.sub('',scrubbed)
+        # print(ppep)
+        scrubbed = re_scrub.sub("", ppep)
+        deppep = re_phos.sub("", scrubbed)
         if options.verbose:
-            print("deppep: %s; scrubbed: %s" % (deppep,scrubbed))
-        #print(deppep)
+            print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
+        # print(deppep)
         cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
         if cur.fetchone() is None:
             add_to_trie = True
@@ -368,13 +372,13 @@
         cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
         deppep_id = cur.fetchone()[0]
         if add_to_trie:
-            #print((deppep_id, deppep))
+            # print((deppep_id, deppep))
             # Build the trie
             auto.add_word(deppep, (deppep_id, deppep))
         cur.execute(
             "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
-            (ppep, scrubbed, deppep_id)
-            )
+            (ppep, scrubbed, deppep_id),
+        )
     # def generate_deppep():
     #     cur.execute("SELECT seq FROM deppep")
     #     for row in cur.fetchall():
@@ -383,18 +387,20 @@
     for row in cur.fetchall():
         deppep_count = row[0]
 
-    cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)")
+    cur.execute(
+        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+    )
     for row in cur.fetchall():
         sequence_count = row[0]
 
-    print(
-      "%d phosphopeptides were read from input" % ppep_count
-      )
+    print("%d phosphopeptides were read from input" % ppep_count)
     print(
-      "%d corresponding dephosphopeptides are represented in input" % deppep_count
-      )
+        "%d corresponding dephosphopeptides are represented in input"
+        % deppep_count
+    )
     # Look for cases where both Gene_Name and Sequence are identical
-    cur.execute('''
+    cur.execute(
+        """
       SELECT Uniprot_ID, Gene_Name, Sequence
       FROM   UniProtKB
       WHERE  Sequence IN (
@@ -404,12 +410,15 @@
         HAVING   count(*) > 1
         )
       ORDER BY Sequence
-      ''')
+      """
+    )
     duplicate_count = 0
-    old_seq = ''
+    old_seq = ""
     for row in cur.fetchall():
         if duplicate_count == 0:
-            print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).")
+            print(
+                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
+            )
         if row[2] != old_seq:
             old_seq = row[2]
             duplicate_count += 1
@@ -419,47 +428,57 @@
             if options.warn_duplicates:
                 print("%s\t%s" % (row[0], row[1]))
     if duplicate_count > 0:
-        print("\n%d sequences have duplicated accession IDs\n" % duplicate_count)
+        print(
+            "\n%d sequences have duplicated accession IDs\n" % duplicate_count
+        )
 
-    print(
-      "%s accession sequences will be searched\n" % sequence_count
-      )
+    print("%s accession sequences will be searched\n" % sequence_count)
 
-    #print(auto.dump())
+    # print(auto.dump())
 
     # Convert the trie to an automaton (a finite-state machine)
     auto.make_automaton()
 
     # Execute query for seqs and metadata without fetching the results yet
     uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
-    while batch := uniprot_seq_and_id.fetchmany(size=50):
-      if None == batch:
-          break
-      for Sequence, UniProtKB_id in batch:
-          if Sequence is not None:
-              for end_index, (insert_order, original_value) in auto.iter(Sequence):
-                  ker.execute('''
+    while 1:
+        batch = uniprot_seq_and_id.fetchmany(size=50)
+        if not batch:
+            break
+        for Sequence, UniProtKB_id in batch:
+            if Sequence is not None:
+                for end_index, (insert_order, original_value) in auto.iter(
+                    Sequence
+                ):
+                    ker.execute(
+                        """
                       INSERT INTO deppep_UniProtKB
                         (deppep_id,UniProtKB_id,pos_start,pos_end)
                       VALUES (?,?,?,?)
-                      ''', (
-                          insert_order,
-                          UniProtKB_id,
-                          1 + end_index - len(original_value),
-                          end_index
-                          )
-                      )
-          else:
-              raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,))
-    ker.execute("""
+                      """,
+                        (
+                            insert_order,
+                            UniProtKB_id,
+                            1 + end_index - len(original_value),
+                            end_index,
+                        ),
+                    )
+            else:
+                raise ValueError(
+                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
+                    % (UniProtKB_id,)
+                )
+    ker.execute(
+        """
         SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
         FROM     uniprotkb_pep_ppep_view
         """
-        )
+    )
     for row in ker.fetchall():
         print(row[0])
 
-    ker.execute("""
+    ker.execute(
+        """
       SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
       FROM     (
         SELECT   accession
@@ -467,12 +486,12 @@
         GROUP BY accession
         )
       """
-      )
+    )
     for row in ker.fetchall():
-      print(row[0])
-      accession_count = row[1]
+        print(row[0])
 
-    ker.execute("""
+    ker.execute(
+        """
       SELECT   count(*) || ' peptide matches were found'
       FROM     (
         SELECT   peptide
@@ -480,11 +499,12 @@
         GROUP BY peptide
         )
       """
-      )
+    )
     for row in ker.fetchall():
-      print(row[0])
+        print(row[0])
 
-    ker.execute("""
+    ker.execute(
+        """
       SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
       FROM     (
         SELECT   phosphopeptide
@@ -492,21 +512,24 @@
         GROUP BY phosphopeptide
         )
       """
-      )
+    )
     for row in ker.fetchall():
-      print(row[0])
-      phosphopeptide_count = row[1]
+        print(row[0])
 
     con.commit()
-    ker.execute('vacuum')
+    ker.execute("vacuum")
     con.close()
 
+
 if __name__ == "__main__":
     wrap_start_time = time.perf_counter()
     __main__()
     wrap_stop_time = time.perf_counter()
     # print(wrap_start_time)
     # print(wrap_stop_time)
-    print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),)
+    print(
+        "\nThe matching process took %d milliseconds to run.\n"
+        % ((wrap_stop_time - wrap_start_time) * 1000),
+    )
 
- # vim: sw=4 ts=4 et ai :
+# vim: sw=4 ts=4 et ai :