Mercurial > repos > eschen42 > mqppep_anova

diff mqppep_mrgfltr.py @ 5:d4d531006735 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author: eschen42
date: Thu, 10 Mar 2022 23:42:48 +0000
parents: c1403d18c189
children: 922d309640db
--- a/mqppep_mrgfltr.py	Mon Mar 07 20:43:57 2022 +0000
+++ b/mqppep_mrgfltr.py	Thu Mar 10 23:42:48 2022 +0000
@@ -1,1337 +1,1500 @@
-#!/usr/bin/env python
-
-# Import the packages needed
-import argparse
-import os.path
-import sys
-
-import pandas
-import re
-import time
-import sqlite3 as sql
-from codecs import getreader as cx_getreader
-import sys
-import numpy as np
-
-#   for sorting list of lists using operator.itemgetter
-import operator
-
-#   for formatting stack-trace
-import traceback
-
-#   for Aho-Corasick search for fixed set of substrings
-import ahocorasick
-import operator
-import hashlib
-
-# for shutil.copyfile(src, dest)
-import shutil
-
-# global constants
-N_A = 'N/A'
-
-# ref: https://stackoverflow.com/a/8915613/15509512
-#   answers: "How to handle exceptions in a list comprehensions"
-#   usage:
-#       from math import log
-#       eggs = [1,3,0,3,2]
-#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
-#   producing:
-#       for <built-in function log>
-#         with args (0,)
-#         exception: math domain error
-#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
-def catch(func, *args, handle=lambda e : e, **kwargs):
-    try:
-        return func(*args, **kwargs)
-    except Exception as e:
-        print("For %s" % str(func))
-        print("  with args %s" % str(args))
-        print("  caught exception: %s" % str(e))
-        (ty, va, tb) = sys.exc_info()
-        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
-        exit(-1)
-        return None # was handle(e)
-
-def ppep_join(x):
-    x = [i for i in x if N_A != i]
-    result = "%s" % ' | '.join(x)
-    if result != "":
-        return result
-    else:
-        return N_A
-
-def melt_join(x):
-    tmp = {key.lower(): key for key in x}
-    result = "%s" % ' | '.join([tmp[key] for key in tmp])
-    return result
-
-def __main__():
-    # Parse Command Line
-    parser = argparse.ArgumentParser(
-        description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.'
-        )
-
-    # inputs:
-    #   Phosphopeptide data for experimental results, including the intensities
-    #   and the mapping to kinase domains, in tabular format.
-    parser.add_argument(
-        '--phosphopeptides', '-p',
-        nargs=1,
-        required=True,
-        dest='phosphopeptides',
-        help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
-        )
-    #   UniProtKB/SwissProt DB input, SQLite
-    parser.add_argument(
-        '--ppep_mapping_db', '-d',
-        nargs=1,
-        required=True,
-        dest='ppep_mapping_db',
-        help='UniProtKB/SwissProt SQLite Database'
-        )
-    #ACE #   PhosPhositesPlus DB input, csv
-    #ACE parser.add_argument(
-    #ACE     '--psp_regulatory_sites', '-s',
-    #ACE     nargs=1,
-    #ACE     required=True,
-    #ACE     dest='psp_regulatory_sites_csv',
-    #ACE     help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header'
-    #ACE     )
-    #   species to limit records chosed from PhosPhositesPlus
-    parser.add_argument(
-        '--species', '-x',
-        nargs=1,
-        required=False,
-        default=[],
-        dest='species',
-        help='limit PhosphoSitePlus records to indicated species (field may be empty)'
-        )
-
-    # outputs:
-    #   tabular output
-    parser.add_argument(
-        '--mrgfltr_tab', '-o',
-        nargs=1,
-        required=True,
-        dest='mrgfltr_tab',
-        help='Tabular output file for results'
-        )
-    #   CSV output
-    parser.add_argument(
-        '--mrgfltr_csv', '-c',
-        nargs=1,
-        required=True,
-        dest='mrgfltr_csv',
-        help='CSV output file for results'
-        )
-    #   SQLite output
-    parser.add_argument(
-        '--mrgfltr_sqlite', '-S',
-        nargs=1,
-        required=True,
-        dest='mrgfltr_sqlite',
-        help='SQLite output file for results'
-        )
-
-    # "Make it so!" (parse the arguments)
-    options = parser.parse_args()
-    print("options: " + str(options))
-
-    # determine phosphopeptide ("upstream map") input tabular file access
-    if options.phosphopeptides is None:
-        exit('Argument "phosphopeptides" is required but not supplied')
-    try:
-        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
-        input_file = open(upstream_map_filename_tab, 'r')
-        input_file.close()
-    except Exception as e:
-        exit('Error parsing phosphopeptides argument: %s' % str(e))
-
-    # determine input SQLite access
-    if options.ppep_mapping_db is None:
-        exit('Argument "ppep_mapping_db" is required but not supplied')
-    try:
-        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
-        input_file = open(uniprot_sqlite, 'rb')
-        input_file.close()
-    except Exception as e:
-        exit('Error parsing ppep_mapping_db argument: %s' % str(e))
-
-    # copy input SQLite dataset to output SQLite dataset
-    if options.mrgfltr_sqlite is None:
-        exit('Argument "mrgfltr_sqlite" is required but not supplied')
-    try:
-        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
-        shutil.copyfile(uniprot_sqlite, output_sqlite)
-    except Exception as e:
-        exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e))
-
-    #ACE # determine psp_regulatory_sites CSV access
-    #ACE if options.psp_regulatory_sites_csv is None:
-    #ACE     exit('Argument "psp_regulatory_sites_csv" is required but not supplied')
-    #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv)
-    #ACE try:
-    #ACE     phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0])
-    #ACE     input_file = open(phosphosite_filename_csv, 'r')
-    #ACE     input_file.close()
-    #ACE except Exception as e:
-    #ACE     exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e))
-    #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv)
-
-    # determine species to limit records from PSP_Regulatory_Sites
-    if options.species is None:
-        exit('Argument "species" is required (and may be empty) but not supplied')
-    try:
-        if len(options.species) > 0:
-            species = options.species[0]
-        else:
-            species = ''
-    except Exception as e:
-        exit('Error parsing species argument: %s' % str(e))
-
-    # determine tabular output destination
-    if options.mrgfltr_tab is None:
-        exit('Argument "mrgfltr_tab" is required but not supplied')
-    try:
-        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
-        output_file = open(output_filename_tab, 'w')
-        output_file.close()
-    except Exception as e:
-        exit('Error parsing mrgfltr_tab argument: %s' % str(e))
-
-    # determine CSV output destination
-    if options.mrgfltr_csv is None:
-        exit('Argument "mrgfltr_csv" is required but not supplied')
-    try:
-        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
-        output_file = open(output_filename_csv, 'w')
-        output_file.close()
-    except Exception as e:
-        exit('Error parsing mrgfltr_csv argument: %s' % str(e))
-
-
-    def mqpep_getswissprot():
-
-        ###############################################
-        # copied from Excel Output Script.ipynb BEGIN #
-        ###############################################
-
-        ###########  String Constants  #################
-        DEPHOSPHOPEP = 'DephosphoPep'
-        DESCRIPTION = 'Description'
-        FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
-        GENE_NAME = 'Gene_Name'                     # Gene Name from UniProtKB
-        ON_FUNCTION = 'ON_FUNCTION'                 # ON_FUNCTION column from PSP_Regulatory_Sites
-        ON_NOTES = 'NOTES'                          # NOTES column from PSP_Regulatory_Sites
-        ON_OTHER_INTERACT = 'ON_OTHER_INTERACT'     # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
-        ON_PROCESS = 'ON_PROCESS'                   # ON_PROCESS column from PSP_Regulatory_Sites
-        ON_PROT_INTERACT = 'ON_PROT_INTERACT'       # ON_PROT_INTERACT column from PSP_Regulatory_Sites
-        PHOSPHOPEPTIDE = 'Phosphopeptide'
-        PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
-        PHOSPHORESIDUE = 'Phosphoresidue'
-        PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
-        SEQUENCE = 'Sequence'
-        SEQUENCE10 = 'Sequence10'
-        SEQUENCE7 = 'Sequence7'
-        SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
-        SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
-        UNIPROT_ID = 'UniProt_ID'
-        UNIPROT_SEQ_AND_META_SQL = '''
-            select    Uniprot_ID, Description, Gene_Name, Sequence,
-                      Organism_Name, Organism_ID, PE, SV
-                 from UniProtKB
-             order by Sequence, UniProt_ID
-        '''
-        UNIPROT_UNIQUE_SEQ_SQL = '''
-            select distinct Sequence
-                       from UniProtKB
-                   group by Sequence
-        '''
-        PPEP_PEP_UNIPROTSEQ_SQL = '''
-            select distinct phosphopeptide, peptide, sequence
-                       from uniprotkb_pep_ppep_view
-                   order by sequence
-        '''
-        PPEP_MELT_SQL = '''
-            SELECT DISTINCT
-                phospho_peptide AS 'p_peptide',
-                kinase_map AS 'characterization',
-                'X' AS 'X'
-            FROM ppep_gene_site_view
-        '''
-        # CREATE TABLE PSP_Regulatory_site (
-        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
-        #   domain             TEXT,
-        #   ON_FUNCTION        TEXT,
-        #   ON_PROCESS         TEXT,
-        #   ON_PROT_INTERACT   TEXT,
-        #   ON_OTHER_INTERACT  TEXT,
-        #   notes              TEXT,
-        #   organism           TEXT
-        # );
-        PSP_REGSITE_SQL = '''
-            SELECT DISTINCT
-              SITE_PLUSMINUS_7AA ,
-              DOMAIN             ,
-              ON_FUNCTION        ,
-              ON_PROCESS         ,
-              ON_PROT_INTERACT   ,
-              ON_OTHER_INTERACT  ,
-              NOTES              ,
-              ORGANISM
-            FROM PSP_Regulatory_site
-        '''
-        PPEP_ID_SQL ='''
-            SELECT
-                id AS 'ppep_id',
-                seq AS 'ppep_seq'
-            FROM ppep
-        '''
-        MRGFLTR_DDL ='''
-        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
-        DROP TABLE IF EXISTS mrgfltr_metadata;
-        CREATE TABLE mrgfltr_metadata
-          ( ppep_id                 INTEGER REFERENCES ppep(id)
-          , Sequence10              TEXT
-          , Sequence7               TEXT
-          , GeneName                TEXT
-          , Phosphoresidue          TEXT
-          , UniProtID               TEXT
-          , Description             TEXT
-          , FunctionPhosphoresidue  TEXT
-          , PutativeUpstreamDomains TEXT
-          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
-          )
-          ;
-        CREATE VIEW mrgfltr_metadata_view AS
-          SELECT DISTINCT
-              ppep.seq             AS phospho_peptide
-            , Sequence10
-            , Sequence7
-            , GeneName
-            , Phosphoresidue
-            , UniProtID
-            , Description
-            , FunctionPhosphoresidue
-            , PutativeUpstreamDomains
-          FROM
-            ppep, mrgfltr_metadata
-          WHERE
-              mrgfltr_metadata.ppep_id = ppep.id
-          ORDER BY
-            ppep.seq
-            ;
-        '''
-
-        CITATION_INSERT_STMT = '''
-          INSERT INTO Citation (
-            ObjectName,
-            CitationData
-          ) VALUES (?,?)
-          '''
-        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
-        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
-
-        MRGFLTR_METADATA_COLUMNS = [
-            'ppep_id',
-            'Sequence10',
-            'Sequence7',
-            'GeneName',
-            'Phosphoresidue',
-            'UniProtID',
-            'Description',
-            'FunctionPhosphoresidue',
-            'PutativeUpstreamDomains'
-            ]
-
-        ###########  String Constants (end) ############
-
-        class Error(Exception):
-            """Base class for exceptions in this module."""
-            pass
-
-        class PreconditionError(Error):
-            """Exception raised for errors in the input.
-
-            Attributes:
-                expression -- input expression in which the error occurred
-                message -- explanation of the error
-            """
-
-            def __init__(self, expression, message):
-                self.expression = expression
-                self.message = message
-
-        #start_time = time.clock() #timer
-        start_time = time.process_time() #timer
-
-        #get keys from upstream tabular file using readline()
-        # ref: https://stackoverflow.com/a/16713581/15509512
-        #      answer to "Use codecs to read file with correct encoding"
-        file1_encoded = open(upstream_map_filename_tab, 'rb')
-        file1 = cx_getreader("latin-1")(file1_encoded)
-
-        count = 0
-        upstream_map_p_peptide_list = []
-        re_tab = re.compile('^[^\t]*')
-        while True:
-            count += 1
-            # Get next line from file
-            line = file1.readline()
-            # if line is empty
-            # end of file is reached
-            if not line:
-                break
-            if count > 1:
-                m = re_tab.match(line)
-                upstream_map_p_peptide_list.append(m[0])
-        file1.close()
-        file1_encoded.close()
-
-        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
-        re_phos = re.compile('p')
-        dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ]
-
-        end_time = time.process_time() #timer
-        print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)
-
-        ## ----------- Get SwissProt data from SQLite database (start) -----------
-        # build UniProt sequence LUT and list of unique SwissProt sequences
-
-        # Open SwissProt SQLite database
-        conn = sql.connect(uniprot_sqlite)
-        cur  = conn.cursor()
-
-        # Set up structures to hold SwissProt data
-
-        uniprot_Sequence_List = []
-        UniProtSeqLUT = {}
-
-        # Execute query for unique seqs without fetching the results yet
-        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
-
-        while batch := uniprot_unique_seq_cur.fetchmany(size=50):
-            if None == batch:
-                # handle case where no records are returned
-                break
-            for row in batch:
-                Sequence = row[0]
-                UniProtSeqLUT[(Sequence,DESCRIPTION)] = []
-                UniProtSeqLUT[(Sequence,GENE_NAME)  ] = []
-                UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = []
-                UniProtSeqLUT[ Sequence               ] = []
-
-        # Execute query for seqs and metadata without fetching the results yet
-        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
-
-        while batch := uniprot_seq_and_meta.fetchmany(size=50):
-            if None == batch:
-                  # handle case where no records are returned
-                break
-            for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
-                uniprot_Sequence_List.append(Sequence)
-                UniProtSeqLUT[Sequence] = Sequence
-                UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID)
-                UniProtSeqLUT[(Sequence,GENE_NAME)  ].append(Gene_Name)
-                if OS != N_A:
-                    Description += ' OS=' + OS
-                if OX != N_A:
-                    Description += ' OX=' + str(int(OX))
-                if Gene_Name != N_A:
-                    Description += ' GN=' + Gene_Name
-                if PE != N_A:
-                    Description += ' PE=' + PE
-                if SV != N_A:
-                    Description += ' SV=' + SV
-                UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description)
-
-        # Close SwissProt SQLite database; clean up local variables
-        conn.close()
-        Sequence = ''
-        UniProt_ID = ''
-        Description = ''
-        Gene_Name = ''
-
-        ## ----------- Get SwissProt data from SQLite database (finish) -----------
-
-        end_time = time.process_time() #timer
-        print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)
-
-        ## ----------- Get SwissProt data from SQLite database (start) -----------
-        # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
-        #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )
-
-        # Open SwissProt SQLite database
-        conn = sql.connect(uniprot_sqlite)
-        cur  = conn.cursor()
-
-        # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
-        DephosphoPep_UniProtSeq_LUT = {}
-
-        # Set up dictionary to accumulate results
-        PhosphoPep_UniProtSeq_LUT = {}
-
-        # Execute query for tuples without fetching the results yet
-        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
-
-        while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
-            if None == batch:
-                # handle case where no records are returned
-                break
-            for (phospho_pep, dephospho_pep, sequence) in batch:
-                #do interesting stuff here...
-                PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
-                PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
-                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
-                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]  = []
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]    = []
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]   = []
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]     = []
-                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
-
-                #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
-                if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]:
-                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence)
-                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
-                    if phospho_pep != phospho_pep:
-                        print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
-                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
-                        PhosphoPep_UniProtSeq_LUT[phospho_pep]                  = phospho_pep
-                        PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep
-                    r = list(zip(
-                       [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]],
-                       [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]],
-                       [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]]
-                       ))
-                    # Sort by `UniProt_ID`
-                    #   ref: https://stackoverflow.com/a/4174955/15509512
-                    r = sorted(r, key=operator.itemgetter(0))
-                    # Get one tuple for each `phospho_pep`
-                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
-                    for (upid, gn, desc) in r:
-                        # Append pseudo-tuple per UniProt_ID but only when it is not present
-                        if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]:
-                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid)
-                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc)
-                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn)
-
-        # Close SwissProt SQLite database; clean up local variables
-        conn.close()
-        # wipe local variables
-        phospho_pep = dephospho_pep = sequence = 0
-        upid = gn = desc = r = ''
-
-        ## ----------- Get SwissProt data from SQLite database (finish) -----------
-
-        end_time = time.process_time() #timer
-        print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr)
-
-        print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))
-
-        #Read in Upstream tabular file
-        # We are discarding the intensity data; so read it as text
-        upstream_data = pandas.read_table(
-            upstream_map_filename_tab,
-            dtype='str',
-            index_col = 0
-            )
-
-        end_time = time.process_time() #timer
-        print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        upstream_data.index = upstream_map_p_peptide_list
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        #trim upstream_data to include only the upstream map columns
-        old_cols = upstream_data.columns.tolist()
-        i = 0
-        first_intensity = -1
-        last_intensity  = -1
-        intensity_re = re.compile('Intensity.*')
-        for col_name in old_cols:
-            m = intensity_re.match(col_name)
-            if m:
-                last_intensity = i
-                if first_intensity == -1:
-                    first_intensity = i
-            i += 1
-        #print('last intensity = %d' % last_intensity)
-        col_PKCalpha = last_intensity + 2
-        col_firstIntensity = first_intensity
-
-        data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1]
-
-        if upstream_data.empty:
-            print("upstream_data is empty")
-            exit(0)
-
-        data_in = upstream_data.copy(deep=True)[data_in_cols]
-
-        # Convert floating-point integers to int64 integers
-        #   ref: https://stackoverflow.com/a/68497603/15509512
-        data_in[list(data_in.columns[1:])] = data_in[
-            list(data_in.columns[1:])].astype('float64').apply(np.int64)
-
-        #create another phosphopeptide column that will be used to join later;
-        #  MAY need to change depending on Phosphopeptide column position
-        #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
-        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
-
-
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        # Produce a dictionary of metadata for a single phosphopeptide.
-        #   This is a replacement of `UniProtInfo_subdict` in the original code.
-        def pseq_to_subdict(phospho_pep):
-            #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
-            # Strip "p" from phosphopeptide sequence
-            dephospho_pep = re_phos.sub('',phospho_pep)
-
-            # Determine number of phosphoresidues in phosphopeptide
-            numps = len(phospho_pep) - len(dephospho_pep)
-
-            # Determine location(s) of phosphoresidue(s) in phosphopeptide
-            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
-            ploc = [] #list of p locations
-            i = 0
-            p = phospho_pep
-            while i < numps:
-                ploc.append(p.find("p"))
-                p = p[:p.find("p")] + p[p.find("p")+1:]
-                i +=1
-
-
-            # Establish nested dictionary
-            result = {}
-            result[SEQUENCE] = []
-            result[UNIPROT_ID] = []
-            result[DESCRIPTION] = []
-            result[GENE_NAME] = []
-            result[PHOSPHORESIDUE] = []
-            result[SEQUENCE7] = []
-            result[SEQUENCE10] = []
-
-            # Add stripped sequence to dictionary
-            result[SEQUENCE].append(dephospho_pep)
-
-            # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
-            dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]
-
-            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
-            ### Caller may elect to:
-            ## try:
-            ##     ...
-            ## except PreconditionError as pe:
-            ##     print("'{expression}': {message}".format(
-            ##             expression = pe.expression,
-            ##             message = pe.message))
-            ##             )
-            ##         )
-            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
-                raise PreconditionError( dephospho_pep,
-                    'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
-                    )
-            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
-                raise PreconditionError( dephospho_pep,
-                    'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
-                    )
-            if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]:
-                raise PreconditionError( dephospho_pep,
-                    "dephosphorylated phosphopeptide does not match " +
-                    "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
-                    PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]
-                    )
-            result[SEQUENCE] = [dephospho_pep]
-            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]
-            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)]
-            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)]
-            if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
-               raise PreconditionError( dephospho_pep,
-                    'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
-                    )
-            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]
-            if len (UniProtSeqList) < 1:
-               print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
-               # raise PreconditionError(
-               #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
-               #      'value has zero length'
-               #      )
-            for UniProtSeq in UniProtSeqList:
-                i = 0
-                phosphoresidues = []
-                seq7s_set = set()
-                seq7s = []
-                seq10s_set = set()
-                seq10s = []
-                while i < len(ploc):
-                    start = UniProtSeq.find(dephospho_pep)
-                    psite = start+ploc[i] #location of phosphoresidue on protein sequence
-
-                    #add Phosphoresidue
-                    phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1)
-                    phosphoresidues.append(phosphosite)
-
-                    #Add Sequence7
-                    if psite < 7: #phospho_pep at N terminus
-                        seq7 = str(UniProtSeq)[:psite+8]
-                        if seq7[psite] == "S": #if phosphosresidue is serine
-                            pres = "s"
-                        elif seq7[psite] == "T": #if phosphosresidue is threonine
-                            pres = "t"
-                        elif seq7[psite] == "Y": #if phosphoresidue is tyrosine
-                            pres = "y"
-                        else: # if not pSTY
-                            pres = "?"
-                        seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8]
-                        while len(seq7) < 15: #add appropriate number of "_" to the front
-                            seq7 = "_" + seq7
-                    elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus
-                        seq7 = str(UniProtSeq)[psite-7:]
-                        if seq7[7] == "S":
-                            pres = "s"
-                        elif seq7[7] == "T":
-                            pres = "t"
-                        elif seq7[7] == "Y":
-                            pres = "y"
-                        else:
-                            pres = "?"
-                        seq7 = seq7[:7] + pres + seq7[8:]
-                        while len(seq7) < 15: #add appropriate number of "_" to the back
-                            seq7 = seq7 + "_"
-                    else:
-                        seq7 = str(UniProtSeq)[psite-7:psite+8]
-                        pres = "" #phosphoresidue
-                        if seq7[7] == "S": #if phosphosresidue is serine
-                            pres = "s"
-                        elif seq7[7] == "T": #if phosphosresidue is threonine
-                            pres = "t"
-                        elif seq7[7] == "Y": #if phosphoresidue is tyrosine
-                            pres = "y"
-                        else: # if not pSTY
-                            pres = "?"
-                        seq7 = seq7[:7] + pres + seq7[8:]
-                    if seq7 not in seq7s_set:
-                        seq7s.append(seq7)
-                        seq7s_set.add(seq7)
-
-                    #add Sequence10
-                    if psite < 10: #phospho_pep at N terminus
-                        seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11]
-                    elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus
-                        seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:]
-                    else:
-                        seq10 = str(UniProtSeq)[psite-10:psite+11]
-                        seq10 = seq10[:10] + "p" + seq10[10:]
-                    if seq10 not in seq10s_set:
-                        seq10s.append(seq10)
-                        seq10s_set.add(seq10)
-
-                    i+=1
-
-                result[PHOSPHORESIDUE].append(phosphoresidues)
-                result[SEQUENCE7].append(seq7s)
-                # result[SEQUENCE10] is a list of lists of strings
-                result[SEQUENCE10].append(seq10s)
-
-
-
-
-            r = list(zip(
-               result[UNIPROT_ID],
-               result[GENE_NAME],
-               result[DESCRIPTION],
-               result[PHOSPHORESIDUE]
-               ))
-            # Sort by `UniProt_ID`
-            #   ref: https://stackoverflow.com//4174955/15509512
-            s = sorted(r, key=operator.itemgetter(0))
-
-            result[UNIPROT_ID] = []
-            result[GENE_NAME] = []
-            result[DESCRIPTION] = []
-            result[PHOSPHORESIDUE] = []
-
-            for r in s:
-                result[UNIPROT_ID].append(r[0])
-                result[GENE_NAME].append(r[1])
-                result[DESCRIPTION].append(r[2])
-                result[PHOSPHORESIDUE].append(r[3])
-
-
-            #convert lists to strings in the dictionary
-            for key,value in result.items():
-                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
-                    result[key] = '; '.join(map(str, value))
-                elif key in [SEQUENCE10]:
-                    # result[SEQUENCE10] is a list of lists of strings
-                    joined_value = ''
-                    joined_set = set()
-                    sep = ''
-                    for valL in value:
-                        # valL is a list of strings
-                        for val in valL:
-                            # val is a string
-                            if val not in joined_set:
-                                joined_set.add(val)
-                                #joined_value += sep + '; '.join(map(str, val))
-                                joined_value += sep + val
-                                sep = '; '
-                    # joined_value is a string
-                    result[key] = joined_value
-
-
-            newstring = '; '.join(
-                [', '.join(l) for l in result[PHOSPHORESIDUE]]
-                )
-            ### #separate the isoforms in PHOSPHORESIDUE column with ";"
-            ### oldstring = result[PHOSPHORESIDUE]
-            ### oldlist = list(oldstring)
-            ### newstring = ""
-            ### i = 0
-            ### for e in oldlist:
-            ###     if e == ";":
-            ###         if numps > 1:
-            ###             if i%numps:
-            ###                 newstring = newstring + ";"
-            ###             else:
-            ###                 newstring = newstring + ","
-            ###         else:
-            ###             newstring = newstring + ";"
-            ###         i +=1
-            ###     else:
-            ###         newstring = newstring + e
-            result[PHOSPHORESIDUE] = newstring
-
-
-            #separate sequence7's by |
-            oldstring = result[SEQUENCE7]
-            oldlist = oldstring
-            newstring = ""
-            for l in oldlist:
-              for e in l:
-                if e == ";":
-                    newstring = newstring + " |"
-                elif len(newstring) > 0 and 1 > newstring.count(e):
-                    newstring = newstring + " | " + e
-                elif 1 > newstring.count(e):
-                    newstring = newstring + e
-            result[SEQUENCE7] = newstring
-
-
-            return [phospho_pep, result]
-
-        # Construct list of [string, dictionary] lists
-        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
-        result_list = [
-            catch(pseq_to_subdict,psequence)
-            for psequence
-            in data_in[PHOSPHOPEPTIDE_MATCH]
-            ]
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        # Construct dictionary from list of lists
-        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
-        UniProt_Info = {
-            result[0]:result[1]
-            for result
-            in result_list
-            if result is not None
-            }
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #cosmetic: add N_A to phosphopeptide rows with no hits
-        p_peptide_list = []
-        for key in UniProt_Info:
-            p_peptide_list.append(key)
-            for nestedKey in UniProt_Info[key]:
-                if UniProt_Info[key][nestedKey] == "":
-                    UniProt_Info[key][nestedKey] = N_A
-
-        end_time = time.process_time() #timer
-        print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #convert UniProt_Info dictionary to dataframe
-        uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))
-
-        #reorder columns to match expected output file
-        uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too
-
-
-        cols = uniprot_df.columns.tolist()
-        #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
-        #uniprot_df = uniprot_df[cols]
-        uniprot_df = uniprot_df[[
-            PHOSPHOPEPTIDE,
-            SEQUENCE10,
-            SEQUENCE7,
-            GENE_NAME,
-            PHOSPHORESIDUE,
-            UNIPROT_ID,
-            DESCRIPTION
-            ]]
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #concat to split then groupby to collapse
-        seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
-                            for _, row in uniprot_df.iterrows()]).reset_index()
-        seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE]
-
-        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
-        #read in PhosphoSitePlus Regulatory Sites dataset
-        #ACE if (True):
-        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
-        conn = sql.connect(uniprot_sqlite)
-        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
-        # Close SwissProt SQLite database
-        conn.close()
-        #ACE # Array indexes are zero-based
-        #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
-        #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
-        #ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
-        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
-        #ACE     print(regsites_df)
-        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
-        #ACE else:
-        #ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3)
-        #ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
-        #ACE     #ACE # Array indexes are zero-based
-        #ACE     #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
-        #ACE     #ACE RENAME_COLS = [ 'GENE'          , 'PROTEIN'    , 'PROT_TYPE' , 'ACC_ID'          , 'GENE_ID'
-        #ACE     #ACE               , 'HU_CHR_LOC'    , 'ORGANISM'   , 'MOD_RSD'   , 'SITE_GRP_ID'     , 'SITE_+/-7_AA'
-        #ACE     #ACE               , 'DOMAIN'        , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
-        #ACE     #ACE               , 'PMIDs'         , 'LT_LIT'     , 'MS_LIT'    , 'MS_CST'          , 'NOTES'
-        #ACE     #ACE               ]
-        #ACE     #ACE REGSITE_COL_SITE7AA = 9
-        #ACE     #ACE REGSITE_COL_PROTEIN = 1
-        #ACE     #ACE REGSITE_COL_DOMAIN = 10
-        #ACE     #ACE REGSITE_COL_PMIDs = 15
-
-        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
-
-
-        #keep only the human entries in dataframe
-        if len(species) > 0:
-            print('Limit PhosphoSitesPlus records to species "' + species + '"')
-            regsites_df = regsites_df[regsites_df.ORGANISM == species]
-
-        #merge the seq7 df with the regsites df based off of the sequence7
-        merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
-        #ACE print(merge_df.columns.tolist()) #ACE
-
-        #after merging df, select only the columns of interest - note that PROTEIN is absent here
-        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]]
-        #ACE print(merge_df.columns.tolist()) #ACE
-        #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
-        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
-
-        #remove the columns that were combined
-        merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]]
-
-        #ACE print(merge_df) #ACE
-        #ACE print(merge_df.columns.tolist()) #ACE
-
-        end_time = time.process_time() #timer
-        print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #cosmetic changes to Function Phosphoresidue column
-        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
-
-        end_time = time.process_time() #timer
-        print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        i = 0
-        while i < len(fp_series):
-            #remove the extra ";" so that it looks more professional
-            if fp_series[i] == "; ; ; ; ": #remove ; from empty hits
-                fp_series[i] = ""
-            while fp_series[i].endswith("; "): #remove ; from the ends
-                fp_series[i] = fp_series[i][:-2]
-            while fp_series[i].startswith("; "): #remove ; from the beginning
-                fp_series[i] = fp_series[i][2:]
-            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
-            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
-            fp_series[i] = fp_series[i].replace("; ; ", "; ")
-
-            #turn blanks into N_A to signify the info was searched for but cannot be found
-            if fp_series[i] == "":
-                fp_series[i] = N_A
-
-            i += 1
-        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
-
-        end_time = time.process_time() #timer
-        print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #merge uniprot df with merge df
-        uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left")
-
-        #collapse the merged df
-        uniprot_regsites_collapsed_df = pandas.DataFrame(
-            uniprot_regsites_merged_df
-            .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
-            .apply(lambda x: ppep_join(x)))
-            #.apply(lambda x: "%s" % ' | '.join(x)))
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column
-
-
-        #rename columns
-        uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']
-
-
-
-        #select columns to be merged to uniprot_df
-        #ACE cols = regsites_df.columns.tolist()
-        #ACE print(cols) #ACE
-        #ACE if len(cols) > 8:
-        #ACE     cols = [cols[9]]+[cols[1]]+cols[10:15]
-        #ACE     #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
-        #ACE     print(cols) #ACE
-        #ACE regsite_merge_df = regsites_df[cols]
-
-        end_time = time.process_time() #timer
-        print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #add columns based on Sequence7 matching site_+/-7_AA
-        uniprot_regsite_df = pandas.merge(
-            left=uniprot_df,
-            right=uniprot_regsites_collapsed_df,
-            how='left',
-            left_on=PHOSPHOPEPTIDE,
-            right_on='ppp')
-
-        end_time = time.process_time() #timer
-        print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        data_in.rename(
-            {'Protein description': PHOSPHOPEPTIDE},
-            axis='columns',
-            inplace=True
-            )
-
-
-
-        sort_start_time = time.process_time() #timer
-
-        #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
-        res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold())
-        data_in = data_in.loc[res2]
-
-        end_time = time.process_time() #timer
-        print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-
-        cols = [old_cols[0]] + old_cols[col_PKCalpha-1:]
-        upstream_data = upstream_data[cols]
-
-        end_time = time.process_time() #timer
-        print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        #### #rename upstream columns in new list
-        #### new_cols = []
-        #### for name in cols:
-        ####     if "_NetworKIN" in name:
-        ####         name = name.split("_")[0]
-        ####     if " motif" in name:
-        ####         name = name.split(" motif")[0]
-        ####     if " sequence " in name:
-        ####         name = name.split(" sequence")[0]
-        ####     if "_Phosida" in name:
-        ####         name = name.split("_")[0]
-        ####     if "_PhosphoSite" in name:
-        ####         name = name.split("_")[0]
-        ####     new_cols.append(name)
-
-        #rename upstream columns in new list
-        def col_rename(name):
-            if "_NetworKIN" in name:
-                name = name.split("_")[0]
-            if " motif" in name:
-                name = name.split(" motif")[0]
-            if " sequence " in name:
-                name = name.split(" sequence")[0]
-            if "_Phosida" in name:
-                name = name.split("_")[0]
-            if "_PhosphoSite" in name:
-                name = name.split("_")[0]
-            return name
-
-        new_cols = [col_rename(col) for col in cols]
-        upstream_data.columns = new_cols
-
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        # Create upstream_data_cast as a copy of upstream_data
-        #   but with first column substituted by the phosphopeptide sequence
-        upstream_data_cast = upstream_data.copy()
-        new_cols_cast = new_cols
-        new_cols_cast[0] = 'p_peptide'
-        upstream_data_cast.columns = new_cols_cast
-        upstream_data_cast['p_peptide'] = upstream_data.index
-        new_cols_cast0 = new_cols_cast[0]
-
-        # --- -------------- begin read upstream_data_melt ------------------------------------
-        ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
-        conn = sql.connect(uniprot_sqlite)
-        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
-        # Close SwissProt SQLite database
-        conn.close()
-        upstream_data_melt = upstream_data_melt_df.copy()
-        upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
-        upstream_data_melt['characterization'] = [
-                col_rename(s)
-                for s in upstream_data_melt['characterization']
-                ]
-
-        print('%0.6f upstream_data_melt_df initially has %d rows' %
-              (end_time - start_time, len(upstream_data_melt.axes[0]))
-              , file=sys.stderr)
-        # ref: https://stackoverflow.com/a/27360130/15509512
-        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
-        upstream_data_melt.drop(
-            upstream_data_melt[upstream_data_melt.X != 'X'].index,
-            inplace = True
-            )
-        print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
-              (end_time - start_time, len(upstream_data_melt.axes[0]))
-              , file=sys.stderr)
-        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
-        #ACE     print(upstream_data_melt)
-        ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
-        # ... -------------- end read upstream_data_melt --------------------------------------
-
-        end_time = time.process_time() #timer
-        print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer
-        # ... end read upstream_data_melt
-
-        upstream_data_melt_index = upstream_data_melt.index
-        upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']
-
-        end_time = time.process_time() #timer
-        print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
-
-        melt_dict = {}
-        for key in upstream_map_p_peptide_list:
-            melt_dict[key] = []
-
-        for el in upstream_delta_melt_LoL:
-            (p_peptide, characterization, X) = tuple(el)
-            if p_peptide in melt_dict:
-                melt_dict[p_peptide].append(characterization)
-            else:
-                exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer
-
-
-        # for key in upstream_map_p_peptide_list:
-        #     melt_dict[key] = ' | '.join(melt_dict[key])
-
-        for key in upstream_map_p_peptide_list:
-            melt_dict[key] = melt_join(melt_dict[key])
-
-        end_time = time.process_time() #timer
-        print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        # map_dict is a dictionary of dictionaries
-        map_dict = {}
-        for key in upstream_map_p_peptide_list:
-            map_dict[key] = {}
-            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
-
-
-        end_time = time.process_time() #timer
-        print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        #convert map_dict to dataframe
-        map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
-        map_df["p-peptide"] = map_df.index #make index a column too
-        cols_map_df = map_df.columns.tolist()
-        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
-        map_df = map_df[cols_map_df]
-
-        #join map_df to uniprot_regsite_df
-        output_df = uniprot_regsite_df.merge(
-            map_df,
-            how="left",
-            left_on=PHOSPHOPEPTIDE,
-            right_on="p-peptide")
-
-        output_df = output_df[
-            [  PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
-                UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
-                PUTATIVE_UPSTREAM_DOMAINS
-                ]
-            ]
-
-
-        # cols_output_prelim = output_df.columns.tolist()
-        #
-        # print("cols_output_prelim")
-        # print(cols_output_prelim)
-        #
-        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
-        #
-        # print("cols_output with p-peptide")
-        # print(cols_output)
-        #
-        # cols_output = [col for col in cols_output if not col == "p-peptide"]
-        #
-        # print("cols_output")
-        # print(cols_output)
-        #
-        # output_df = output_df[cols_output]
-
-        #join output_df back to quantitative columns in data_in df
-        quant_cols = data_in.columns.tolist()
-        quant_cols = quant_cols[1:]
-        quant_data = data_in[quant_cols]
-
-        ## ----------- Write merge/filter metadata to SQLite database (start) -----------
-        # Open SwissProt SQLite database
-        conn = sql.connect(output_sqlite)
-        cur  = conn.cursor()
-
-        cur.executescript(MRGFLTR_DDL)
-
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
-            )
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata', CITATION_INSERT_PSP)
-            )
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
-            )
-        cur.execute(
-            CITATION_INSERT_STMT,
-            ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
-            )
-
-        # Read ppep-to-sequence LUT
-        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
-        #ACE ppep_lut_df.info(verbose=True)
-        # write only metadata for merged/filtered records to SQLite
-        mrgfltr_metadata_df = output_df.copy()
-        # replace phosphopeptide seq with ppep.id
-        mrgfltr_metadata_df = ppep_lut_df.merge(
-            mrgfltr_metadata_df,
-            left_on='ppep_seq',
-            right_on=PHOSPHOPEPTIDE,
-            how='inner'
-            )
-        mrgfltr_metadata_df.drop(
-            columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
-            inplace=True
-            )
-        #rename columns
-        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
-        #ACE mrgfltr_metadata_df.info(verbose=True)
-        mrgfltr_metadata_df.to_sql(
-            'mrgfltr_metadata',
-            con=conn,
-            if_exists='append',
-            index=False,
-            method='multi'
-            )
-
-        # Close SwissProt SQLite database
-        conn.close()
-        ## ----------- Write merge/filter metadata to SQLite database (finish) -----------
-
-        output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
-        output_cols = output_df.columns.tolist()
-        output_cols = output_cols[:-1]
-        output_df = output_df[output_cols]
-
-        #cosmetic changes to Upstream column
-        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
-        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
-        i = 0
-        while i < len(us_series):
-            #turn blanks into N_A to signify the info was searched for but cannot be found
-            if us_series[i] == "":
-                us_series[i] = N_A
-            i += 1
-        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
-
-        end_time = time.process_time() #timer
-        print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        (output_rows, output_cols) = output_df.shape
-
-        #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
-        output_df = output_df.convert_dtypes(convert_integer=True)
-
-
-        #Output onto Final CSV file
-        output_df.to_csv(output_filename_csv, index=False)
-        output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)
-
-        end_time = time.process_time() #timer
-        print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer
-
-        print('{:>10} phosphopeptides written to output'.format(str(output_rows)))
-
-        end_time = time.process_time() #timer
-        print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer
-
-
-        #Rev. 7/1/2016
-        #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
-        #Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
-        #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
-        #                read from SwissProt SQLite database
-        #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
-
-        #############################################
-        # copied from Excel Output Script.ipynb END #
-        #############################################
-
-    try:
-        catch(mqpep_getswissprot,)
-        exit(0)
-    except Exception as e:
-        exit('Internal error running mqpep_getswissprot(): %s' % (e))
-
-if __name__ == "__main__":
-    __main__()
-
+#!/usr/bin/env python
+
+# Import the packages needed
+import argparse
+import operator  # for operator.itemgetter
+import os.path
+import re
+import shutil  # for shutil.copyfile(src, dest)
+import sqlite3 as sql
+import sys  # import the sys module for exc_info
+import time
+import traceback  # for formatting stack-trace
+from codecs import getreader as cx_getreader
+
+import numpy as np
+import pandas
+
+# global constants
+N_A = "N/A"
+
+
+# ref: https://stackoverflow.com/a/8915613/15509512
+#   answers: "How to handle exceptions in a list comprehensions"
+#   usage:
+#       from math import log
+#       eggs = [1,3,0,3,2]
+#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
+#   producing:
+#       for <built-in function log>
+#         with args (0,)
+#         exception: math domain error
+#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
+def catch(func, *args, handle=lambda e: e, **kwargs):
+
+    try:
+        return func(*args, **kwargs)
+    except Exception as e:
+        print("For %s" % str(func))
+        print("  with args %s" % str(args))
+        print("  caught exception: %s" % str(e))
+        (ty, va, tb) = sys.exc_info()
+        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
+        exit(-1)
+        return None
+
+
+def ppep_join(x):
+    x = [i for i in x if N_A != i]
+    result = "%s" % " | ".join(x)
+    if result != "":
+        return result
+    else:
+        return N_A
+
+
+def melt_join(x):
+    tmp = {key.lower(): key for key in x}
+    result = "%s" % " | ".join([tmp[key] for key in tmp])
+    return result
+
+
+def __main__():
+    # Parse Command Line
+    parser = argparse.ArgumentParser(
+        description="Phopsphoproteomic Enrichment Pipeline Merge and Filter."
+    )
+
+    # inputs:
+    #   Phosphopeptide data for experimental results, including the intensities
+    #   and the mapping to kinase domains, in tabular format.
+    parser.add_argument(
+        "--phosphopeptides",
+        "-p",
+        nargs=1,
+        required=True,
+        dest="phosphopeptides",
+        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
+    )
+    #   UniProtKB/SwissProt DB input, SQLite
+    parser.add_argument(
+        "--ppep_mapping_db",
+        "-d",
+        nargs=1,
+        required=True,
+        dest="ppep_mapping_db",
+        help="UniProtKB/SwissProt SQLite Database",
+    )
+    #   species to limit records chosed from PhosPhositesPlus
+    parser.add_argument(
+        "--species",
+        "-x",
+        nargs=1,
+        required=False,
+        default=[],
+        dest="species",
+        help="limit PhosphoSitePlus records to indicated species (field may be empty)",
+    )
+
+    # outputs:
+    #   tabular output
+    parser.add_argument(
+        "--mrgfltr_tab",
+        "-o",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_tab",
+        help="Tabular output file for results",
+    )
+    #   CSV output
+    parser.add_argument(
+        "--mrgfltr_csv",
+        "-c",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_csv",
+        help="CSV output file for results",
+    )
+    #   SQLite output
+    parser.add_argument(
+        "--mrgfltr_sqlite",
+        "-S",
+        nargs=1,
+        required=True,
+        dest="mrgfltr_sqlite",
+        help="SQLite output file for results",
+    )
+
+    # "Make it so!" (parse the arguments)
+    options = parser.parse_args()
+    print("options: " + str(options))
+
+    # determine phosphopeptide ("upstream map") input tabular file access
+    if options.phosphopeptides is None:
+        exit('Argument "phosphopeptides" is required but not supplied')
+    try:
+        upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
+        input_file = open(upstream_map_filename_tab, "r")
+        input_file.close()
+    except Exception as e:
+        exit("Error parsing phosphopeptides argument: %s" % str(e))
+
+    # determine input SQLite access
+    if options.ppep_mapping_db is None:
+        exit('Argument "ppep_mapping_db" is required but not supplied')
+    try:
+        uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
+        input_file = open(uniprot_sqlite, "rb")
+        input_file.close()
+    except Exception as e:
+        exit("Error parsing ppep_mapping_db argument: %s" % str(e))
+
+    # copy input SQLite dataset to output SQLite dataset
+    if options.mrgfltr_sqlite is None:
+        exit('Argument "mrgfltr_sqlite" is required but not supplied')
+    try:
+        output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
+        shutil.copyfile(uniprot_sqlite, output_sqlite)
+    except Exception as e:
+        exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e))
+
+    # determine species to limit records from PSP_Regulatory_Sites
+    if options.species is None:
+        exit(
+            'Argument "species" is required (and may be empty) but not supplied'
+        )
+    try:
+        if len(options.species) > 0:
+            species = options.species[0]
+        else:
+            species = ""
+    except Exception as e:
+        exit("Error parsing species argument: %s" % str(e))
+
+    # determine tabular output destination
+    if options.mrgfltr_tab is None:
+        exit('Argument "mrgfltr_tab" is required but not supplied')
+    try:
+        output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
+        output_file = open(output_filename_tab, "w")
+        output_file.close()
+    except Exception as e:
+        exit("Error parsing mrgfltr_tab argument: %s" % str(e))
+
+    # determine CSV output destination
+    if options.mrgfltr_csv is None:
+        exit('Argument "mrgfltr_csv" is required but not supplied')
+    try:
+        output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
+        output_file = open(output_filename_csv, "w")
+        output_file.close()
+    except Exception as e:
+        exit("Error parsing mrgfltr_csv argument: %s" % str(e))
+
+    def mqpep_getswissprot():
+
+        #
+        # copied from Excel Output Script.ipynb BEGIN #
+        #
+
+        #  String Constants  #################
+        DEPHOSPHOPEP = "DephosphoPep"
+        DESCRIPTION = "Description"
+        FUNCTION_PHOSPHORESIDUE = (
+            "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
+        )
+        GENE_NAME = "Gene_Name"  # Gene Name from UniProtKB
+        ON_FUNCTION = (
+            "ON_FUNCTION"  # ON_FUNCTION column from PSP_Regulatory_Sites
+        )
+        ON_NOTES = "NOTES"  # NOTES column from PSP_Regulatory_Sites
+        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
+        ON_PROCESS = (
+            "ON_PROCESS"  # ON_PROCESS column from PSP_Regulatory_Sites
+        )
+        ON_PROT_INTERACT = "ON_PROT_INTERACT"  # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+        PHOSPHOPEPTIDE = "Phosphopeptide"
+        PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
+        PHOSPHORESIDUE = "Phosphoresidue"
+        PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
+        SEQUENCE = "Sequence"
+        SEQUENCE10 = "Sequence10"
+        SEQUENCE7 = "Sequence7"
+        SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA"
+        UNIPROT_ID = "UniProt_ID"
+        UNIPROT_SEQ_AND_META_SQL = """
+            select    Uniprot_ID, Description, Gene_Name, Sequence,
+                      Organism_Name, Organism_ID, PE, SV
+                 from UniProtKB
+             order by Sequence, UniProt_ID
+        """
+        UNIPROT_UNIQUE_SEQ_SQL = """
+            select distinct Sequence
+                       from UniProtKB
+                   group by Sequence
+        """
+        PPEP_PEP_UNIPROTSEQ_SQL = """
+            select distinct phosphopeptide, peptide, sequence
+                       from uniprotkb_pep_ppep_view
+                   order by sequence
+        """
+        PPEP_MELT_SQL = """
+            SELECT DISTINCT
+                phospho_peptide AS 'p_peptide',
+                kinase_map AS 'characterization',
+                'X' AS 'X'
+            FROM ppep_gene_site_view
+        """
+        # CREATE TABLE PSP_Regulatory_site (
+        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+        #   domain             TEXT,
+        #   ON_FUNCTION        TEXT,
+        #   ON_PROCESS         TEXT,
+        #   ON_PROT_INTERACT   TEXT,
+        #   ON_OTHER_INTERACT  TEXT,
+        #   notes              TEXT,
+        #   organism           TEXT
+        # );
+        PSP_REGSITE_SQL = """
+            SELECT DISTINCT
+              SITE_PLUSMINUS_7AA ,
+              DOMAIN             ,
+              ON_FUNCTION        ,
+              ON_PROCESS         ,
+              ON_PROT_INTERACT   ,
+              ON_OTHER_INTERACT  ,
+              NOTES              ,
+              ORGANISM
+            FROM PSP_Regulatory_site
+        """
+        PPEP_ID_SQL = """
+            SELECT
+                id AS 'ppep_id',
+                seq AS 'ppep_seq'
+            FROM ppep
+        """
+        MRGFLTR_DDL = """
+        DROP VIEW  IF EXISTS mrgfltr_metadata_view;
+        DROP TABLE IF EXISTS mrgfltr_metadata;
+        CREATE TABLE mrgfltr_metadata
+          ( ppep_id                 INTEGER REFERENCES ppep(id)
+          , Sequence10              TEXT
+          , Sequence7               TEXT
+          , GeneName                TEXT
+          , Phosphoresidue          TEXT
+          , UniProtID               TEXT
+          , Description             TEXT
+          , FunctionPhosphoresidue  TEXT
+          , PutativeUpstreamDomains TEXT
+          , PRIMARY KEY (ppep_id)            ON CONFLICT IGNORE
+          )
+          ;
+        CREATE VIEW mrgfltr_metadata_view AS
+          SELECT DISTINCT
+              ppep.seq             AS phospho_peptide
+            , Sequence10
+            , Sequence7
+            , GeneName
+            , Phosphoresidue
+            , UniProtID
+            , Description
+            , FunctionPhosphoresidue
+            , PutativeUpstreamDomains
+          FROM
+            ppep, mrgfltr_metadata
+          WHERE
+              mrgfltr_metadata.ppep_id = ppep.id
+          ORDER BY
+            ppep.seq
+            ;
+        """
+
+        CITATION_INSERT_STMT = """
+          INSERT INTO Citation (
+            ObjectName,
+            CitationData
+          ) VALUES (?,?)
+          """
+        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
+        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
+
+        MRGFLTR_METADATA_COLUMNS = [
+            "ppep_id",
+            "Sequence10",
+            "Sequence7",
+            "GeneName",
+            "Phosphoresidue",
+            "UniProtID",
+            "Description",
+            "FunctionPhosphoresidue",
+            "PutativeUpstreamDomains",
+        ]
+
+        #  String Constants (end) ############
+
+        class Error(Exception):
+            """Base class for exceptions in this module."""
+
+            pass
+
+        class PreconditionError(Error):
+            """Exception raised for errors in the input.
+
+            Attributes:
+                expression -- input expression in which the error occurred
+                message -- explanation of the error
+            """
+
+            def __init__(self, expression, message):
+                self.expression = expression
+                self.message = message
+
+        # start_time = time.clock() #timer
+        start_time = time.process_time()  # timer
+
+        # get keys from upstream tabular file using readline()
+        # ref: https://stackoverflow.com/a/16713581/15509512
+        #      answer to "Use codecs to read file with correct encoding"
+        file1_encoded = open(upstream_map_filename_tab, "rb")
+        file1 = cx_getreader("latin-1")(file1_encoded)
+
+        count = 0
+        upstream_map_p_peptide_list = []
+        re_tab = re.compile("^[^\t]*")
+        while True:
+            count += 1
+            # Get next line from file
+            line = file1.readline()
+            # if line is empty
+            # end of file is reached
+            if not line:
+                break
+            if count > 1:
+                m = re_tab.match(line)
+                upstream_map_p_peptide_list.append(m[0])
+        file1.close()
+        file1_encoded.close()
+
+        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
+        re_phos = re.compile("p")
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,),
+            file=sys.stderr,
+        )
+
+        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # build UniProt sequence LUT and list of unique SwissProt sequences
+
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur = conn.cursor()
+
+        # Set up structures to hold SwissProt data
+
+        uniprot_Sequence_List = []
+        UniProtSeqLUT = {}
+
+        # Execute query for unique seqs without fetching the results yet
+        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
+
+        while 1:
+            batch = uniprot_unique_seq_cur.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for row in batch:
+                Sequence = row[0]
+                UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
+                UniProtSeqLUT[(Sequence, GENE_NAME)] = []
+                UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
+                UniProtSeqLUT[Sequence] = []
+
+        # Execute query for seqs and metadata without fetching the results yet
+        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
+
+        while 1:
+            batch = uniprot_seq_and_meta.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for (
+                UniProt_ID,
+                Description,
+                Gene_Name,
+                Sequence,
+                OS,
+                OX,
+                PE,
+                SV,
+            ) in batch:
+                uniprot_Sequence_List.append(Sequence)
+                UniProtSeqLUT[Sequence] = Sequence
+                UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
+                UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
+                if OS != N_A:
+                    Description += " OS=" + OS
+                if OX != N_A:
+                    Description += " OX=" + str(int(OX))
+                if Gene_Name != N_A:
+                    Description += " GN=" + Gene_Name
+                if PE != N_A:
+                    Description += " PE=" + PE
+                if SV != N_A:
+                    Description += " SV=" + SV
+                UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        Sequence = ""
+        UniProt_ID = ""
+        Description = ""
+        Gene_Name = ""
+
+        # ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,),
+            file=sys.stderr,
+        )
+
+        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # Open SwissProt SQLite database
+        conn = sql.connect(uniprot_sqlite)
+        cur = conn.cursor()
+
+        # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
+        DephosphoPep_UniProtSeq_LUT = {}
+
+        # Set up dictionary to accumulate results
+        PhosphoPep_UniProtSeq_LUT = {}
+
+        # Execute query for tuples without fetching the results yet
+        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
+
+        while 1:
+            batch = ppep_pep_uniprotseq_cur.fetchmany(size=50)
+            if not batch:
+                # handle case where no records are returned
+                break
+            for (phospho_pep, dephospho_pep, sequence) in batch:
+                # do interesting stuff here...
+                PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
+                PhosphoPep_UniProtSeq_LUT[
+                    (phospho_pep, DEPHOSPHOPEP)
+                ] = dephospho_pep
+                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, DESCRIPTION)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, GENE_NAME)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, UNIPROT_ID)
+                    ] = []
+                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
+                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
+
+                if (
+                    sequence
+                    not in DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, SEQUENCE)
+                    ]
+                ):
+                    DephosphoPep_UniProtSeq_LUT[
+                        (dephospho_pep, SEQUENCE)
+                    ].append(sequence)
+                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
+                    if phospho_pep != phospho_pep:
+                        print(
+                            "phospho_pep:'%s' phospho_pep:'%s'"
+                            % (phospho_pep, phospho_pep)
+                        )
+                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                        PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
+                        PhosphoPep_UniProtSeq_LUT[
+                            (phospho_pep, DEPHOSPHOPEP)
+                        ] = dephospho_pep
+                    r = list(
+                        zip(
+                            [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
+                            [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
+                            [
+                                s
+                                for s in UniProtSeqLUT[(sequence, DESCRIPTION)]
+                            ],
+                        )
+                    )
+                    # Sort by `UniProt_ID`
+                    #   ref: https://stackoverflow.com/a/4174955/15509512
+                    r = sorted(r, key=operator.itemgetter(0))
+                    # Get one tuple for each `phospho_pep`
+                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
+                    for (upid, gn, desc) in r:
+                        # Append pseudo-tuple per UniProt_ID but only when it is not present
+                        if (
+                            upid
+                            not in DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, UNIPROT_ID)
+                            ]
+                        ):
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, UNIPROT_ID)
+                            ].append(upid)
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, DESCRIPTION)
+                            ].append(desc)
+                            DephosphoPep_UniProtSeq_LUT[
+                                (dephospho_pep, GENE_NAME)
+                            ].append(gn)
+
+        # Close SwissProt SQLite database; clean up local variables
+        conn.close()
+        # wipe local variables
+        phospho_pep = dephospho_pep = sequence = 0
+        upid = gn = desc = r = ""
+
+        # ----------- Get SwissProt data from SQLite database (finish) -----------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f finished reading and decoding '%s' [0.4]"
+            % (end_time - start_time, upstream_map_filename_tab),
+            file=sys.stderr,
+        )
+
+        print(
+            "{:>10} unique upstream phosphopeptides tested".format(
+                str(len(upstream_map_p_peptide_list))
+            )
+        )
+
+        # Read in Upstream tabular file
+        # We are discarding the intensity data; so read it as text
+        upstream_data = pandas.read_table(
+            upstream_map_filename_tab, dtype="str", index_col=0
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f read Upstream Map from file [1g_1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        upstream_data.index = upstream_map_p_peptide_list
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added index to Upstream Map [1g_2]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # trim upstream_data to include only the upstream map columns
+        old_cols = upstream_data.columns.tolist()
+        i = 0
+        first_intensity = -1
+        last_intensity = -1
+        intensity_re = re.compile("Intensity.*")
+        for col_name in old_cols:
+            m = intensity_re.match(col_name)
+            if m:
+                last_intensity = i
+                if first_intensity == -1:
+                    first_intensity = i
+            i += 1
+        # print('last intensity = %d' % last_intensity)
+        col_PKCalpha = last_intensity + 2
+
+        data_in_cols = [old_cols[0]] + old_cols[
+            first_intensity: last_intensity + 1
+        ]
+
+        if upstream_data.empty:
+            print("upstream_data is empty")
+            exit(0)
+
+        data_in = upstream_data.copy(deep=True)[data_in_cols]
+
+        # Convert floating-point integers to int64 integers
+        #   ref: https://stackoverflow.com/a/68497603/15509512
+        data_in[list(data_in.columns[1:])] = (
+            data_in[list(data_in.columns[1:])]
+            .astype("float64")
+            .apply(np.int64)
+        )
+
+        # create another phosphopeptide column that will be used to join later;
+        #  MAY need to change depending on Phosphopeptide column position
+        # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
+        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Produce a dictionary of metadata for a single phosphopeptide.
+        #   This is a replacement of `UniProtInfo_subdict` in the original code.
+        def pseq_to_subdict(phospho_pep):
+            # Strip "p" from phosphopeptide sequence
+            dephospho_pep = re_phos.sub("", phospho_pep)
+
+            # Determine number of phosphoresidues in phosphopeptide
+            numps = len(phospho_pep) - len(dephospho_pep)
+
+            # Determine location(s) of phosphoresidue(s) in phosphopeptide
+            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
+            ploc = []  # list of p locations
+            i = 0
+            p = phospho_pep
+            while i < numps:
+                ploc.append(p.find("p"))
+                p = p[: p.find("p")] + p[p.find("p") + 1:]
+                i += 1
+
+            # Establish nested dictionary
+            result = {}
+            result[SEQUENCE] = []
+            result[UNIPROT_ID] = []
+            result[DESCRIPTION] = []
+            result[GENE_NAME] = []
+            result[PHOSPHORESIDUE] = []
+            result[SEQUENCE7] = []
+            result[SEQUENCE10] = []
+
+            # Add stripped sequence to dictionary
+            result[SEQUENCE].append(dephospho_pep)
+
+            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
+            # Caller may elect to:
+            # try:
+            #     ...
+            # except PreconditionError as pe:
+            #     print("'{expression}': {message}".format(
+            #             expression = pe.expression,
+            #             message = pe.message))
+            #             )
+            #         )
+            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
+                )
+            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
+                )
+            if (
+                dephospho_pep
+                != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+            ):
+                raise PreconditionError(
+                    dephospho_pep,
+                    "dephosphorylated phosphopeptide does not match "
+                    + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
+                    + PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)],
+                )
+            result[SEQUENCE] = [dephospho_pep]
+            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, UNIPROT_ID)
+            ]
+            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, DESCRIPTION)
+            ]
+            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, GENE_NAME)
+            ]
+            if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
+                raise PreconditionError(
+                    dephospho_pep,
+                    "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
+                )
+            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
+                (dephospho_pep, SEQUENCE)
+            ]
+            if len(UniProtSeqList) < 1:
+                print(
+                    "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
+                    % dephospho_pep
+                )
+                # raise PreconditionError(
+                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
+                #      'value has zero length'
+                #      )
+            for UniProtSeq in UniProtSeqList:
+                i = 0
+                phosphoresidues = []
+                seq7s_set = set()
+                seq7s = []
+                seq10s_set = set()
+                seq10s = []
+                while i < len(ploc):
+                    start = UniProtSeq.find(dephospho_pep)
+                    psite = (
+                        start + ploc[i]
+                    )  # location of phosphoresidue on protein sequence
+
+                    # add Phosphoresidue
+                    phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
+                    phosphoresidues.append(phosphosite)
+
+                    # Add Sequence7
+                    if psite < 7:  # phospho_pep at N terminus
+                        seq7 = str(UniProtSeq)[: psite + 8]
+                        if seq7[psite] == "S":  # if phosphosresidue is serine
+                            pres = "s"
+                        elif (
+                            seq7[psite] == "T"
+                        ):  # if phosphosresidue is threonine
+                            pres = "t"
+                        elif (
+                            seq7[psite] == "Y"
+                        ):  # if phosphoresidue is tyrosine
+                            pres = "y"
+                        else:  # if not pSTY
+                            pres = "?"
+                        seq7 = (
+                            seq7[:psite] + pres + seq7[psite + 1: psite + 8]
+                        )
+                        while (
+                            len(seq7) < 15
+                        ):  # add appropriate number of "_" to the front
+                            seq7 = "_" + seq7
+                    elif (
+                        len(UniProtSeq) - psite < 8
+                    ):  # phospho_pep at C terminus
+                        seq7 = str(UniProtSeq)[psite - 7:]
+                        if seq7[7] == "S":
+                            pres = "s"
+                        elif seq7[7] == "T":
+                            pres = "t"
+                        elif seq7[7] == "Y":
+                            pres = "y"
+                        else:
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                        while (
+                            len(seq7) < 15
+                        ):  # add appropriate number of "_" to the back
+                            seq7 = seq7 + "_"
+                    else:
+                        seq7 = str(UniProtSeq)[psite - 7: psite + 8]
+                        pres = ""  # phosphoresidue
+                        if seq7[7] == "S":  # if phosphosresidue is serine
+                            pres = "s"
+                        elif seq7[7] == "T":  # if phosphosresidue is threonine
+                            pres = "t"
+                        elif seq7[7] == "Y":  # if phosphoresidue is tyrosine
+                            pres = "y"
+                        else:  # if not pSTY
+                            pres = "?"
+                        seq7 = seq7[:7] + pres + seq7[8:]
+                    if seq7 not in seq7s_set:
+                        seq7s.append(seq7)
+                        seq7s_set.add(seq7)
+
+                    # add Sequence10
+                    if psite < 10:  # phospho_pep at N terminus
+                        seq10 = (
+                            str(UniProtSeq)[:psite]
+                            + "p"
+                            + str(UniProtSeq)[psite: psite + 11]
+                        )
+                    elif (
+                        len(UniProtSeq) - psite < 11
+                    ):  # phospho_pep at C terminus
+                        seq10 = (
+                            str(UniProtSeq)[psite - 10: psite]
+                            + "p"
+                            + str(UniProtSeq)[psite:]
+                        )
+                    else:
+                        seq10 = str(UniProtSeq)[psite - 10: psite + 11]
+                        seq10 = seq10[:10] + "p" + seq10[10:]
+                    if seq10 not in seq10s_set:
+                        seq10s.append(seq10)
+                        seq10s_set.add(seq10)
+
+                    i += 1
+
+                result[PHOSPHORESIDUE].append(phosphoresidues)
+                result[SEQUENCE7].append(seq7s)
+                # result[SEQUENCE10] is a list of lists of strings
+                result[SEQUENCE10].append(seq10s)
+
+            r = list(
+                zip(
+                    result[UNIPROT_ID],
+                    result[GENE_NAME],
+                    result[DESCRIPTION],
+                    result[PHOSPHORESIDUE],
+                )
+            )
+            # Sort by `UniProt_ID`
+            #   ref: https://stackoverflow.com//4174955/15509512
+            s = sorted(r, key=operator.itemgetter(0))
+
+            result[UNIPROT_ID] = []
+            result[GENE_NAME] = []
+            result[DESCRIPTION] = []
+            result[PHOSPHORESIDUE] = []
+
+            for r in s:
+                result[UNIPROT_ID].append(r[0])
+                result[GENE_NAME].append(r[1])
+                result[DESCRIPTION].append(r[2])
+                result[PHOSPHORESIDUE].append(r[3])
+
+            # convert lists to strings in the dictionary
+            for key, value in result.items():
+                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
+                    result[key] = "; ".join(map(str, value))
+                elif key in [SEQUENCE10]:
+                    # result[SEQUENCE10] is a list of lists of strings
+                    joined_value = ""
+                    joined_set = set()
+                    sep = ""
+                    for valL in value:
+                        # valL is a list of strings
+                        for val in valL:
+                            # val is a string
+                            if val not in joined_set:
+                                joined_set.add(val)
+                                # joined_value += sep + '; '.join(map(str, val))
+                                joined_value += sep + val
+                                sep = "; "
+                    # joined_value is a string
+                    result[key] = joined_value
+
+            newstring = "; ".join(
+                [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
+            )
+            # #separate the isoforms in PHOSPHORESIDUE column with ";"
+            # oldstring = result[PHOSPHORESIDUE]
+            # oldlist = list(oldstring)
+            # newstring = ""
+            # i = 0
+            # for e in oldlist:
+            #     if e == ";":
+            #         if numps > 1:
+            #             if i%numps:
+            #                 newstring = newstring + ";"
+            #             else:
+            #                 newstring = newstring + ","
+            #         else:
+            #             newstring = newstring + ";"
+            #         i +=1
+            #     else:
+            #         newstring = newstring + e
+            result[PHOSPHORESIDUE] = newstring
+
+            # separate sequence7's by |
+            oldstring = result[SEQUENCE7]
+            oldlist = oldstring
+            newstring = ""
+            for ol in oldlist:
+                for e in ol:
+                    if e == ";":
+                        newstring = newstring + " |"
+                    elif len(newstring) > 0 and 1 > newstring.count(e):
+                        newstring = newstring + " | " + e
+                    elif 1 > newstring.count(e):
+                        newstring = newstring + e
+            result[SEQUENCE7] = newstring
+
+            return [phospho_pep, result]
+
+        # Construct list of [string, dictionary] lists
+        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
+        result_list = [
+            catch(pseq_to_subdict, psequence)
+            for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added SwissProt annotations to phosphopeptides [B]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Construct dictionary from list of lists
+        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
+        UniProt_Info = {
+            result[0]: result[1]
+            for result in result_list
+            if result is not None
+        }
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # cosmetic: add N_A to phosphopeptide rows with no hits
+        p_peptide_list = []
+        for key in UniProt_Info:
+            p_peptide_list.append(key)
+            for nestedKey in UniProt_Info[key]:
+                if UniProt_Info[key][nestedKey] == "":
+                    UniProt_Info[key][nestedKey] = N_A
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # convert UniProt_Info dictionary to dataframe
+        uniprot_df = pandas.DataFrame.transpose(
+            pandas.DataFrame.from_dict(UniProt_Info)
+        )
+
+        # reorder columns to match expected output file
+        uniprot_df[
+            PHOSPHOPEPTIDE
+        ] = uniprot_df.index  # make index a column too
+
+        cols = uniprot_df.columns.tolist()
+        # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
+        # uniprot_df = uniprot_df[cols]
+        uniprot_df = uniprot_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE10,
+                SEQUENCE7,
+                GENE_NAME,
+                PHOSPHORESIDUE,
+                UNIPROT_ID,
+                DESCRIPTION,
+            ]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f reordered columns to match expected output file [1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # concat to split then groupby to collapse
+        seq7_df = pandas.concat(
+            [
+                pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
+                for _, row in uniprot_df.iterrows()
+            ]
+        ).reset_index()
+        seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
+
+        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
+        # read in PhosphoSitePlus Regulatory Sites dataset
+        # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
+
+        # keep only the human entries in dataframe
+        if len(species) > 0:
+            print(
+                'Limit PhosphoSitesPlus records to species "' + species + '"'
+            )
+            regsites_df = regsites_df[regsites_df.ORGANISM == species]
+
+        # merge the seq7 df with the regsites df based off of the sequence7
+        merge_df = seq7_df.merge(
+            regsites_df,
+            left_on=SEQUENCE7,
+            right_on=SITE_PLUSMINUS_7AA_SQL,
+            how="left",
+        )
+
+        # after merging df, select only the columns of interest - note that PROTEIN is absent here
+        merge_df = merge_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE7,
+                ON_FUNCTION,
+                ON_PROCESS,
+                ON_PROT_INTERACT,
+                ON_OTHER_INTERACT,
+                ON_NOTES,
+            ]
+        ]
+        # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column"
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
+            merge_df[ON_PROCESS], sep="; ", na_rep=""
+        )
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
+        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
+            FUNCTION_PHOSPHORESIDUE
+        ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
+
+        # remove the columns that were combined
+        merge_df = merge_df[
+            [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # cosmetic changes to Function Phosphoresidue column
+        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        i = 0
+        while i < len(fp_series):
+            # remove the extra ";" so that it looks more professional
+            if fp_series[i] == "; ; ; ; ":  # remove ; from empty hits
+                fp_series[i] = ""
+            while fp_series[i].endswith("; "):  # remove ; from the ends
+                fp_series[i] = fp_series[i][:-2]
+            while fp_series[i].startswith("; "):  # remove ; from the beginning
+                fp_series[i] = fp_series[i][2:]
+            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
+            fp_series[i] = fp_series[i].replace("; ; ", "; ")
+
+            # turn blanks into N_A to signify the info was searched for but cannot be found
+            if fp_series[i] == "":
+                fp_series[i] = N_A
+
+            i += 1
+        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # merge uniprot df with merge df
+        uniprot_regsites_merged_df = uniprot_df.merge(
+            merge_df,
+            left_on=PHOSPHOPEPTIDE,
+            right_on=PHOSPHOPEPTIDE,
+            how="left",
+        )
+
+        # collapse the merged df
+        uniprot_regsites_collapsed_df = pandas.DataFrame(
+            uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
+                FUNCTION_PHOSPHORESIDUE
+            ].apply(lambda x: ppep_join(x))
+        )
+        # .apply(lambda x: "%s" % ' | '.join(x)))
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        uniprot_regsites_collapsed_df[
+            PHOSPHOPEPTIDE
+        ] = (
+            uniprot_regsites_collapsed_df.index
+        )  # add df index as its own column
+
+        # rename columns
+        uniprot_regsites_collapsed_df.columns = [
+            FUNCTION_PHOSPHORESIDUE,
+            "ppp",
+        ]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f selected columns to be merged to uniprot_df [1e]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # add columns based on Sequence7 matching site_+/-7_AA
+        uniprot_regsite_df = pandas.merge(
+            left=uniprot_df,
+            right=uniprot_regsites_collapsed_df,
+            how="left",
+            left_on=PHOSPHOPEPTIDE,
+            right_on="ppp",
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        data_in.rename(
+            {"Protein description": PHOSPHOPEPTIDE},
+            axis="columns",
+            inplace=True,
+        )
+
+        # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
+        res2 = sorted(
+            data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
+        )
+        data_in = data_in.loc[res2]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f sorting time [1f]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
+        upstream_data = upstream_data[cols]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f refactored columns for Upstream Map [1g]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # #rename upstream columns in new list
+        # new_cols = []
+        # for name in cols:
+        #     if "_NetworKIN" in name:
+        #         name = name.split("_")[0]
+        #     if " motif" in name:
+        #         name = name.split(" motif")[0]
+        #     if " sequence " in name:
+        #         name = name.split(" sequence")[0]
+        #     if "_Phosida" in name:
+        #         name = name.split("_")[0]
+        #     if "_PhosphoSite" in name:
+        #         name = name.split("_")[0]
+        #     new_cols.append(name)
+
+        # rename upstream columns in new list
+        def col_rename(name):
+            if "_NetworKIN" in name:
+                name = name.split("_")[0]
+            if " motif" in name:
+                name = name.split(" motif")[0]
+            if " sequence " in name:
+                name = name.split(" sequence")[0]
+            if "_Phosida" in name:
+                name = name.split("_")[0]
+            if "_PhosphoSite" in name:
+                name = name.split("_")[0]
+            return name
+
+        new_cols = [col_rename(col) for col in cols]
+        upstream_data.columns = new_cols
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f renamed columns for Upstream Map [1h_1]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Create upstream_data_cast as a copy of upstream_data
+        #   but with first column substituted by the phosphopeptide sequence
+        upstream_data_cast = upstream_data.copy()
+        new_cols_cast = new_cols
+        new_cols_cast[0] = "p_peptide"
+        upstream_data_cast.columns = new_cols_cast
+        upstream_data_cast["p_peptide"] = upstream_data.index
+
+        # --- -------------- begin read upstream_data_melt ------------------------------------
+        # ----------- Get melted kinase mapping data from SQLite database (start) -----------
+        conn = sql.connect(uniprot_sqlite)
+        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
+        # Close SwissProt SQLite database
+        conn.close()
+        upstream_data_melt = upstream_data_melt_df.copy()
+        upstream_data_melt.columns = ["p_peptide", "characterization", "X"]
+        upstream_data_melt["characterization"] = [
+            col_rename(s) for s in upstream_data_melt["characterization"]
+        ]
+
+        print(
+            "%0.6f upstream_data_melt_df initially has %d rows"
+            % (end_time - start_time, len(upstream_data_melt.axes[0])),
+            file=sys.stderr,
+        )
+        # ref: https://stackoverflow.com/a/27360130/15509512
+        #      e.g. df.drop(df[df.score < 50].index, inplace=True)
+        upstream_data_melt.drop(
+            upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True
+        )
+        print(
+            "%0.6f upstream_data_melt_df pre-dedup has %d rows"
+            % (end_time - start_time, len(upstream_data_melt.axes[0])),
+            file=sys.stderr,
+        )
+        # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
+        # ... -------------- end read upstream_data_melt --------------------------------------
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+        # ... end read upstream_data_melt
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f indexed melted Upstream Map [1h_2a]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
+
+        melt_dict = {}
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = []
+
+        for el in upstream_delta_melt_LoL:
+            (p_peptide, characterization, X) = tuple(el)
+            if p_peptide in melt_dict:
+                melt_dict[p_peptide].append(characterization)
+            else:
+                exit(
+                    'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
+                    % (p_peptide)
+                )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f appended peptide characterizations [1h_2b]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # for key in upstream_map_p_peptide_list:
+        #     melt_dict[key] = ' | '.join(melt_dict[key])
+
+        for key in upstream_map_p_peptide_list:
+            melt_dict[key] = melt_join(melt_dict[key])
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f concatenated multiple characterizations [1h_2c]"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # map_dict is a dictionary of dictionaries
+        map_dict = {}
+        for key in upstream_map_p_peptide_list:
+            map_dict[key] = {}
+            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # convert map_dict to dataframe
+        map_df = pandas.DataFrame.transpose(
+            pandas.DataFrame.from_dict(map_dict)
+        )
+        map_df["p-peptide"] = map_df.index  # make index a column too
+        cols_map_df = map_df.columns.tolist()
+        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
+        map_df = map_df[cols_map_df]
+
+        # join map_df to uniprot_regsite_df
+        output_df = uniprot_regsite_df.merge(
+            map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide"
+        )
+
+        output_df = output_df[
+            [
+                PHOSPHOPEPTIDE,
+                SEQUENCE10,
+                SEQUENCE7,
+                GENE_NAME,
+                PHOSPHORESIDUE,
+                UNIPROT_ID,
+                DESCRIPTION,
+                FUNCTION_PHOSPHORESIDUE,
+                PUTATIVE_UPSTREAM_DOMAINS,
+            ]
+        ]
+
+        # cols_output_prelim = output_df.columns.tolist()
+        #
+        # print("cols_output_prelim")
+        # print(cols_output_prelim)
+        #
+        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
+        #
+        # print("cols_output with p-peptide")
+        # print(cols_output)
+        #
+        # cols_output = [col for col in cols_output if not col == "p-peptide"]
+        #
+        # print("cols_output")
+        # print(cols_output)
+        #
+        # output_df = output_df[cols_output]
+
+        # join output_df back to quantitative columns in data_in df
+        quant_cols = data_in.columns.tolist()
+        quant_cols = quant_cols[1:]
+        quant_data = data_in[quant_cols]
+
+        # ----------- Write merge/filter metadata to SQLite database (start) -----------
+        # Open SwissProt SQLite database
+        conn = sql.connect(output_sqlite)
+        cur = conn.cursor()
+
+        cur.executescript(MRGFLTR_DDL)
+
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ("mrgfltr_metadata_view", CITATION_INSERT_PSP),
+        )
+        cur.execute(
+            CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP)
+        )
+        cur.execute(
+            CITATION_INSERT_STMT,
+            ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),
+        )
+        cur.execute(
+            CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)
+        )
+
+        # Read ppep-to-sequence LUT
+        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
+        # write only metadata for merged/filtered records to SQLite
+        mrgfltr_metadata_df = output_df.copy()
+        # replace phosphopeptide seq with ppep.id
+        mrgfltr_metadata_df = ppep_lut_df.merge(
+            mrgfltr_metadata_df,
+            left_on="ppep_seq",
+            right_on=PHOSPHOPEPTIDE,
+            how="inner",
+        )
+        mrgfltr_metadata_df.drop(
+            columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True
+        )
+        # rename columns
+        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
+        mrgfltr_metadata_df.to_sql(
+            "mrgfltr_metadata",
+            con=conn,
+            if_exists="append",
+            index=False,
+            method="multi",
+        )
+
+        # Close SwissProt SQLite database
+        conn.close()
+        # ----------- Write merge/filter metadata to SQLite database (finish) -----------
+
+        output_df = output_df.merge(
+            quant_data,
+            how="right",
+            left_on=PHOSPHOPEPTIDE,
+            right_on=PHOSPHOPEPTIDE_MATCH,
+        )
+        output_cols = output_df.columns.tolist()
+        output_cols = output_cols[:-1]
+        output_df = output_df[output_cols]
+
+        # cosmetic changes to Upstream column
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
+            PUTATIVE_UPSTREAM_DOMAINS
+        ].fillna(
+            ""
+        )  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
+        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
+        i = 0
+        while i < len(us_series):
+            # turn blanks into N_A to signify the info was searched for but cannot be found
+            if us_series[i] == "":
+                us_series[i] = N_A
+            i += 1
+        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f establisheed output [3]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        (output_rows, output_cols) = output_df.shape
+
+        output_df = output_df.convert_dtypes(convert_integer=True)
+
+        # Output onto Final CSV file
+        output_df.to_csv(output_filename_csv, index=False)
+        output_df.to_csv(
+            output_filename_tab, quoting=None, sep="\t", index=False
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f wrote output [4]" % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        print(
+            "{:>10} phosphopeptides written to output".format(str(output_rows))
+        )
+
+        end_time = time.process_time()  # timer
+        print(
+            "%0.6f seconds of non-system CPU time were consumed"
+            % (end_time - start_time,),
+            file=sys.stderr,
+        )  # timer
+
+        # Rev. 7/1/2016
+        # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
+        # Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
+        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
+        #                read from SwissProt SQLite database
+        # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
+
+        #
+        # copied from Excel Output Script.ipynb END #
+        #
+
+    try:
+        catch(
+            mqpep_getswissprot,
+        )
+        exit(0)
+    except Exception as e:
+        exit("Internal error running mqpep_getswissprot(): %s" % (e))
+
+
+if __name__ == "__main__":
+    __main__()
author	eschen42
date	Thu, 10 Mar 2022 23:42:48 +0000
parents	c1403d18c189
children	922d309640db