mqppep_mrgfltr.py @ 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author:   eschen42
date:     Mon, 07 Mar 2022 19:05:01 +0000
children: d4d531006735
#!/usr/bin/env python

# Import the packages needed
import argparse
import os.path
import sys

import pandas
import re
import time
import sqlite3 as sql
from codecs import getreader as cx_getreader
import numpy as np

# for sorting list of lists using operator.itemgetter
import operator

# for formatting stack-trace
import traceback

# for Aho-Corasick search for fixed set of substrings
import ahocorasick
import hashlib

# for shutil.copyfile(src, dest)
import shutil

# global constants
N_A = 'N/A'

# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
#   usage:
#     from math import log
#     eggs = [1,3,0,3,2]
#     print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
#   producing:
#     for <built-in function log>
#       with args (0,)
#       exception: math domain error
#     [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print("  with args %s" % str(args))
        print("  caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
        exit(-1)
        return None  # was handle(e)

def ppep_join(x):
    x = [i for i in x if N_A != i]
    result = "%s" % ' | '.join(x)
    if result != "":
        return result
    else:
        return N_A


def melt_join(x):
    tmp = {key.lower(): key for key in x}
    result = "%s" % ' | '.join([tmp[key] for key in tmp])
    return result

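# Illustrative behavior of the two join helpers above (hypothetical inputs,
# not taken from the original script):
#   ppep_join(['a', 'N/A', 'b'])  ->  'a | b'   (N/A entries are dropped)
#   ppep_join(['N/A'])            ->  'N/A'     (an empty join falls back to N_A)
#   melt_join(['Abc', 'ABC'])     ->  'ABC'     (case-insensitive de-duplication,
#                                                keeping the spelling seen last)
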
def __main__():
    # Parse Command Line
    parser = argparse.ArgumentParser(
        description='Phosphoproteomic Enrichment Pipeline Merge and Filter.'
    )

    # inputs:
    #   Phosphopeptide data for experimental results, including the intensities
    #   and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        '--phosphopeptides', '-p',
        nargs=1,
        required=True,
        dest='phosphopeptides',
        help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format'
    )
    # UniProtKB/SwissProt DB input, SQLite
    parser.add_argument(
        '--ppep_mapping_db', '-d',
        nargs=1,
        required=True,
        dest='ppep_mapping_db',
        help='UniProtKB/SwissProt SQLite Database'
    )
    #ACE # PhosphoSitePlus DB input, csv
    #ACE parser.add_argument(
    #ACE     '--psp_regulatory_sites', '-s',
    #ACE     nargs=1,
    #ACE     required=True,
    #ACE     dest='psp_regulatory_sites_csv',
    #ACE     help='PhosphoSitePlus Regulatory Sites, in CSV format including three-line header'
    #ACE )
    # species to limit records chosen from PhosphoSitePlus
    parser.add_argument(
        '--species', '-x',
        nargs=1,
        required=False,
        default=[],
        dest='species',
        help='limit PhosphoSitePlus records to indicated species (field may be empty)'
    )

    # outputs:
    # tabular output
    parser.add_argument(
        '--mrgfltr_tab', '-o',
        nargs=1,
        required=True,
        dest='mrgfltr_tab',
        help='Tabular output file for results'
    )
    # CSV output
    parser.add_argument(
        '--mrgfltr_csv', '-c',
        nargs=1,
        required=True,
        dest='mrgfltr_csv',
        help='CSV output file for results'
    )
    # SQLite output
    parser.add_argument(
        '--mrgfltr_sqlite', '-S',
        nargs=1,
        required=True,
        dest='mrgfltr_sqlite',
        help='SQLite output file for results'
    )

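    # Example invocation (hypothetical file names, shown for orientation only;
    # in practice the Galaxy tool wrapper supplies these arguments):
    #   python mqppep_mrgfltr.py \
    #     --phosphopeptides ppep_intensities.tsv \
    #     --ppep_mapping_db ppep_mapping_sqlite.sqlite \
    #     --species human \
    #     --mrgfltr_tab mrgfltr.tsv \
    #     --mrgfltr_csv mrgfltr.csv \
    #     --mrgfltr_sqlite mrgfltr.sqlite
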
137 # "Make it so!" (parse the arguments) | |
138 options = parser.parse_args() | |
139 print("options: " + str(options)) | |
140 | |
141 # determine phosphopeptide ("upstream map") input tabular file access | |
142 if options.phosphopeptides is None: | |
143 exit('Argument "phosphopeptides" is required but not supplied') | |
144 try: | |
145 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) | |
146 input_file = open(upstream_map_filename_tab, 'r') | |
147 input_file.close() | |
148 except Exception as e: | |
149 exit('Error parsing phosphopeptides argument: %s' % str(e)) | |
150 | |
151 # determine input SQLite access | |
152 if options.ppep_mapping_db is None: | |
153 exit('Argument "ppep_mapping_db" is required but not supplied') | |
154 try: | |
155 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) | |
156 input_file = open(uniprot_sqlite, 'rb') | |
157 input_file.close() | |
158 except Exception as e: | |
159 exit('Error parsing ppep_mapping_db argument: %s' % str(e)) | |
160 | |
161 # copy input SQLite dataset to output SQLite dataset | |
162 if options.mrgfltr_sqlite is None: | |
163 exit('Argument "mrgfltr_sqlite" is required but not supplied') | |
164 try: | |
165 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) | |
166 shutil.copyfile(uniprot_sqlite, output_sqlite) | |
167 except Exception as e: | |
168 exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) | |
169 | |
170 #ACE # determine psp_regulatory_sites CSV access | |
171 #ACE if options.psp_regulatory_sites_csv is None: | |
172 #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied') | |
173 #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv) | |
174 #ACE try: | |
175 #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0]) | |
176 #ACE input_file = open(phosphosite_filename_csv, 'r') | |
177 #ACE input_file.close() | |
178 #ACE except Exception as e: | |
179 #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e)) | |
180 #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv) | |
181 | |
182 # determine species to limit records from PSP_Regulatory_Sites | |
183 if options.species is None: | |
184 exit('Argument "species" is required (and may be empty) but not supplied') | |
185 try: | |
186 if len(options.species) > 0: | |
187 species = options.species[0] | |
188 else: | |
189 species = '' | |
190 except Exception as e: | |
191 exit('Error parsing species argument: %s' % str(e)) | |
192 | |
193 # determine tabular output destination | |
194 if options.mrgfltr_tab is None: | |
195 exit('Argument "mrgfltr_tab" is required but not supplied') | |
196 try: | |
197 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) | |
198 output_file = open(output_filename_tab, 'w') | |
199 output_file.close() | |
200 except Exception as e: | |
201 exit('Error parsing mrgfltr_tab argument: %s' % str(e)) | |
202 | |
203 # determine CSV output destination | |
204 if options.mrgfltr_csv is None: | |
205 exit('Argument "mrgfltr_csv" is required but not supplied') | |
206 try: | |
207 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) | |
208 output_file = open(output_filename_csv, 'w') | |
209 output_file.close() | |
210 except Exception as e: | |
211 exit('Error parsing mrgfltr_csv argument: %s' % str(e)) | |
212 | |
213 | |
    def mqpep_getswissprot():

        ###############################################
        # copied from Excel Output Script.ipynb BEGIN #
        ###############################################

        ########### String Constants #################
        DEPHOSPHOPEP = 'DephosphoPep'
        DESCRIPTION = 'Description'
        FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)'
        GENE_NAME = 'Gene_Name'  # Gene Name from UniProtKB
        ON_FUNCTION = 'ON_FUNCTION'  # ON_FUNCTION column from PSP_Regulatory_Sites
        ON_NOTES = 'NOTES'  # NOTES column from PSP_Regulatory_Sites
        ON_OTHER_INTERACT = 'ON_OTHER_INTERACT'  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
        ON_PROCESS = 'ON_PROCESS'  # ON_PROCESS column from PSP_Regulatory_Sites
        ON_PROT_INTERACT = 'ON_PROT_INTERACT'  # ON_PROT_INTERACT column from PSP_Regulatory_Sites
        PHOSPHOPEPTIDE = 'Phosphopeptide'
        PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match'
        PHOSPHORESIDUE = 'Phosphoresidue'
        PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains'
        SEQUENCE = 'Sequence'
        SEQUENCE10 = 'Sequence10'
        SEQUENCE7 = 'Sequence7'
        SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA'
        SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA'
        UNIPROT_ID = 'UniProt_ID'
        UNIPROT_SEQ_AND_META_SQL = '''
            select Uniprot_ID, Description, Gene_Name, Sequence,
                   Organism_Name, Organism_ID, PE, SV
              from UniProtKB
              order by Sequence, UniProt_ID
        '''
        UNIPROT_UNIQUE_SEQ_SQL = '''
            select distinct Sequence
              from UniProtKB
              group by Sequence
        '''
        PPEP_PEP_UNIPROTSEQ_SQL = '''
            select distinct phosphopeptide, peptide, sequence
              from uniprotkb_pep_ppep_view
              order by sequence
        '''
        PPEP_MELT_SQL = '''
            SELECT DISTINCT
                phospho_peptide AS 'p_peptide',
                kinase_map AS 'characterization',
                'X' AS 'X'
            FROM ppep_gene_site_view
        '''
        # CREATE TABLE PSP_Regulatory_site (
        #   site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
        #   domain             TEXT,
        #   ON_FUNCTION        TEXT,
        #   ON_PROCESS         TEXT,
        #   ON_PROT_INTERACT   TEXT,
        #   ON_OTHER_INTERACT  TEXT,
        #   notes              TEXT,
        #   organism           TEXT
        # );
        PSP_REGSITE_SQL = '''
            SELECT DISTINCT
                SITE_PLUSMINUS_7AA,
                DOMAIN,
                ON_FUNCTION,
                ON_PROCESS,
                ON_PROT_INTERACT,
                ON_OTHER_INTERACT,
                NOTES,
                ORGANISM
            FROM PSP_Regulatory_site
        '''
        PPEP_ID_SQL = '''
            SELECT
                id AS 'ppep_id',
                seq AS 'ppep_seq'
            FROM ppep
        '''
        MRGFLTR_DDL = '''
            DROP VIEW IF EXISTS mrgfltr_metadata_view;
            DROP TABLE IF EXISTS mrgfltr_metadata;
            CREATE TABLE mrgfltr_metadata
              ( ppep_id                 INTEGER REFERENCES ppep(id)
              , Sequence10              TEXT
              , Sequence7               TEXT
              , GeneName                TEXT
              , Phosphoresidue          TEXT
              , UniProtID               TEXT
              , Description             TEXT
              , FunctionPhosphoresidue  TEXT
              , PutativeUpstreamDomains TEXT
              , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
              )
            ;
            CREATE VIEW mrgfltr_metadata_view AS
              SELECT DISTINCT
                  ppep.seq AS phospho_peptide
                , Sequence10
                , Sequence7
                , GeneName
                , Phosphoresidue
                , UniProtID
                , Description
                , FunctionPhosphoresidue
                , PutativeUpstreamDomains
              FROM
                  ppep, mrgfltr_metadata
              WHERE
                  mrgfltr_metadata.ppep_id = ppep.id
              ORDER BY
                  ppep.seq
            ;
        '''

        CITATION_INSERT_STMT = '''
            INSERT INTO Citation (
                ObjectName,
                CitationData
            ) VALUES (?,?)
        '''
        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'

        MRGFLTR_METADATA_COLUMNS = [
            'ppep_id',
            'Sequence10',
            'Sequence7',
            'GeneName',
            'Phosphoresidue',
            'UniProtID',
            'Description',
            'FunctionPhosphoresidue',
            'PutativeUpstreamDomains'
        ]
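        # These names correspond to the columns created by MRGFLTR_DDL above;
        # mrgfltr_metadata_df.to_sql(..., if_exists='append') later inserts
        # into the mrgfltr_metadata table by column name.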

        ########### String Constants (end) ############

        class Error(Exception):
            """Base class for exceptions in this module."""
            pass

        class PreconditionError(Error):
            """Exception raised for errors in the input.

            Attributes:
                expression -- input expression in which the error occurred
                message -- explanation of the error
            """

            def __init__(self, expression, message):
                self.expression = expression
                self.message = message

        # start_time = time.clock()  # timer
        start_time = time.process_time()  # timer

        # get keys from upstream tabular file using readline()
        #   ref: https://stackoverflow.com/a/16713581/15509512
        #        answer to "Use codecs to read file with correct encoding"
        file1_encoded = open(upstream_map_filename_tab, 'rb')
        file1 = cx_getreader("latin-1")(file1_encoded)

        count = 0
        upstream_map_p_peptide_list = []
        re_tab = re.compile('^[^\t]*')
        while True:
            count += 1
            # Get next line from file
            line = file1.readline()
            # if line is empty
            #   end of file is reached
            if not line:
                break
            if count > 1:
                m = re_tab.match(line)
                upstream_map_p_peptide_list.append(m[0])
        file1.close()
        file1_encoded.close()

        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
        re_phos = re.compile('p')
        dephospho_peptide_list = [re_phos.sub('', foo) for foo in upstream_map_p_peptide_list]

        end_time = time.process_time()  # timer
        print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr)

        ## ----------- Get SwissProt data from SQLite database (start) -----------
        # build UniProt sequence LUT and list of unique SwissProt sequences

        # Open SwissProt SQLite database
        conn = sql.connect(uniprot_sqlite)
        cur = conn.cursor()

        # Set up structures to hold SwissProt data

        uniprot_Sequence_List = []
        UniProtSeqLUT = {}

        # Execute query for unique seqs without fetching the results yet
        uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)

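        # Cursor.fetchmany() returns an empty list once all rows have been
        # consumed, which is what ends each "while batch := ..." loop below;
        # the "if None == batch" tests are purely defensive.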
        while batch := uniprot_unique_seq_cur.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for row in batch:
                Sequence = row[0]
                UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
                UniProtSeqLUT[(Sequence, GENE_NAME)] = []
                UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
                UniProtSeqLUT[Sequence] = []

        # Execute query for seqs and metadata without fetching the results yet
        uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)

        while batch := uniprot_seq_and_meta.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch:
                uniprot_Sequence_List.append(Sequence)
                UniProtSeqLUT[Sequence] = Sequence
                UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
                UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
                if OS != N_A:
                    Description += ' OS=' + OS
                if OX != N_A:
                    Description += ' OX=' + str(int(OX))
                if Gene_Name != N_A:
                    Description += ' GN=' + Gene_Name
                if PE != N_A:
                    Description += ' PE=' + PE
                if SV != N_A:
                    Description += ' SV=' + SV
                UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)

        # Close SwissProt SQLite database; clean up local variables
        conn.close()
        Sequence = ''
        UniProt_ID = ''
        Description = ''
        Gene_Name = ''

        ## ----------- Get SwissProt data from SQLite database (finish) -----------

        end_time = time.process_time()  # timer
        print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr)

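        # UniProtSeqLUT (built above) uses two kinds of keys: a bare protein
        # Sequence (mapping to itself) and (Sequence, field) tuples mapping to
        # lists of UniProt_ID, Gene_Name, or Description values for that sequence.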
        ## ----------- Get SwissProt data from SQLite database (start) -----------
        # build DephosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT
        #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) )

        # Open SwissProt SQLite database
        conn = sql.connect(uniprot_sqlite)
        cur = conn.cursor()

        # Set up dictionary to aggregate results for phosphopeptides corresponding to a dephosphopeptide
        DephosphoPep_UniProtSeq_LUT = {}

        # Set up dictionary to accumulate results
        PhosphoPep_UniProtSeq_LUT = {}

        # Execute query for tuples without fetching the results yet
        ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)

        while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50):
            if None == batch:
                # handle case where no records are returned
                break
            for (phospho_pep, dephospho_pep, sequence) in batch:
                # do interesting stuff here...
                PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
                PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] = dephospho_pep
                if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                    DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)] = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)] = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)] = []
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
                DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)

                #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence))
                if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)]:
                    DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)].append(sequence)
                for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
                    if phospho_pep != phospho_pep:
                        print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep))
                    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                        PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
                        PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] = dephospho_pep
                    r = list(zip(
                        [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
                        [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
                        [s for s in UniProtSeqLUT[(sequence, DESCRIPTION)]]
                    ))
                    # Sort by `UniProt_ID`
                    #   ref: https://stackoverflow.com/a/4174955/15509512
                    r = sorted(r, key=operator.itemgetter(0))
                    # Get one tuple for each `phospho_pep`
                    #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
                    for (upid, gn, desc) in r:
                        # Append pseudo-tuple per UniProt_ID but only when it is not present
                        if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)]:
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)].append(upid)
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)].append(desc)
                            DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)].append(gn)

        # Close SwissProt SQLite database; clean up local variables
        conn.close()
        # wipe local variables
        phospho_pep = dephospho_pep = sequence = 0
        upid = gn = desc = r = ''

        ## ----------- Get SwissProt data from SQLite database (finish) -----------

        end_time = time.process_time()  # timer
        print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time, upstream_map_filename_tab), file=sys.stderr)

        print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list))))

        # Read in Upstream tabular file
        #   We are discarding the intensity data; so read it as text
        upstream_data = pandas.read_table(
            upstream_map_filename_tab,
            dtype='str',
            index_col=0
        )

        end_time = time.process_time()  # timer
        print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr)  # timer

        upstream_data.index = upstream_map_p_peptide_list

        end_time = time.process_time()  # timer
        print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr)  # timer

        # trim upstream_data to include only the upstream map columns
        old_cols = upstream_data.columns.tolist()
        i = 0
        first_intensity = -1
        last_intensity = -1
        intensity_re = re.compile('Intensity.*')
        for col_name in old_cols:
            m = intensity_re.match(col_name)
            if m:
                last_intensity = i
                if first_intensity == -1:
                    first_intensity = i
            i += 1
        # print('last intensity = %d' % last_intensity)
        col_PKCalpha = last_intensity + 2
        col_firstIntensity = first_intensity

        data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity + 1]

        if upstream_data.empty:
            print("upstream_data is empty")
            exit(0)

        data_in = upstream_data.copy(deep=True)[data_in_cols]

        # Convert floating-point integers to int64 integers
        #   ref: https://stackoverflow.com/a/68497603/15509512
        data_in[list(data_in.columns[1:])] = data_in[
            list(data_in.columns[1:])].astype('float64').apply(np.int64)

        # create another phosphopeptide column that will be used to join later;
        #   MAY need to change depending on Phosphopeptide column position
        # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
        data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index

        end_time = time.process_time()  # timer
        print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr)  # timer

        # Produce a dictionary of metadata for a single phosphopeptide.
        #   This is a replacement of `UniProtInfo_subdict` in the original code.
        def pseq_to_subdict(phospho_pep):
            #ACE print("calling pseq_to_subdict, %s" % phospho_pep);
            # Strip "p" from phosphopeptide sequence
            dephospho_pep = re_phos.sub('', phospho_pep)

            # Determine number of phosphoresidues in phosphopeptide
            numps = len(phospho_pep) - len(dephospho_pep)

            # Determine location(s) of phosphoresidue(s) in phosphopeptide
            #   (used later for Phosphoresidue, Sequence7, and Sequence10)
            ploc = []  # list of p locations
            i = 0
            p = phospho_pep
            while i < numps:
                ploc.append(p.find("p"))
                p = p[:p.find("p")] + p[p.find("p") + 1:]
                i += 1

            # Establish nested dictionary
            result = {}
            result[SEQUENCE] = []
            result[UNIPROT_ID] = []
            result[DESCRIPTION] = []
            result[GENE_NAME] = []
            result[PHOSPHORESIDUE] = []
            result[SEQUENCE7] = []
            result[SEQUENCE10] = []
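            # Illustrative formats (for a hypothetical protein sequence): for each
            # protein containing the dephosphorylated peptide,
            #   - Phosphoresidue entries look like "pS123" (residue letter plus
            #     1-based position within the protein),
            #   - Sequence7 is a 15-residue window (7 residues on each side of the
            #     site, "_"-padded at protein termini, phosphoresidue in lower case),
            #   - Sequence10 is the site +/- 10 residues with a "p" inserted
            #     immediately before the phosphorylated residue.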

            # Add stripped sequence to dictionary
            result[SEQUENCE].append(dephospho_pep)

            # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT
            dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep]

            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
            ### Caller may elect to:
            ##   try:
            ##       ...
            ##   except PreconditionError as pe:
            ##       print("'{expression}': {message}".format(
            ##           expression = pe.expression,
            ##           message = pe.message))
            ##       )
            ##   )
            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT'
                )
            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT'
                )
            if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]:
                raise PreconditionError(
                    dephospho_pep,
                    "dephosphorylated phosphopeptide does not match " +
                    "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " +
                    PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
                )
            result[SEQUENCE] = [dephospho_pep]
            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)]
            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)]
            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)]
            if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT'
                )
            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)]
            if len(UniProtSeqList) < 1:
                print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep)
                # raise PreconditionError(
                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
                #     'value has zero length'
                # )
            for UniProtSeq in UniProtSeqList:
                i = 0
                phosphoresidues = []
                seq7s_set = set()
                seq7s = []
                seq10s_set = set()
                seq10s = []
                while i < len(ploc):
                    start = UniProtSeq.find(dephospho_pep)
                    psite = start + ploc[i]  # location of phosphoresidue on protein sequence

                    # add Phosphoresidue
                    phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
                    phosphoresidues.append(phosphosite)

                    # Add Sequence7
                    if psite < 7:  # phospho_pep at N terminus
                        seq7 = str(UniProtSeq)[:psite + 8]
                        if seq7[psite] == "S":  # if phosphoresidue is serine
                            pres = "s"
                        elif seq7[psite] == "T":  # if phosphoresidue is threonine
                            pres = "t"
                        elif seq7[psite] == "Y":  # if phosphoresidue is tyrosine
                            pres = "y"
                        else:  # if not pSTY
                            pres = "?"
                        seq7 = seq7[:psite] + pres + seq7[psite + 1:psite + 8]
                        while len(seq7) < 15:  # add appropriate number of "_" to the front
                            seq7 = "_" + seq7
                    elif len(UniProtSeq) - psite < 8:  # phospho_pep at C terminus
                        seq7 = str(UniProtSeq)[psite - 7:]
                        if seq7[7] == "S":
                            pres = "s"
                        elif seq7[7] == "T":
                            pres = "t"
                        elif seq7[7] == "Y":
                            pres = "y"
                        else:
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                        while len(seq7) < 15:  # add appropriate number of "_" to the back
                            seq7 = seq7 + "_"
                    else:
                        seq7 = str(UniProtSeq)[psite - 7:psite + 8]
                        pres = ""  # phosphoresidue
                        if seq7[7] == "S":  # if phosphoresidue is serine
                            pres = "s"
                        elif seq7[7] == "T":  # if phosphoresidue is threonine
                            pres = "t"
                        elif seq7[7] == "Y":  # if phosphoresidue is tyrosine
                            pres = "y"
                        else:  # if not pSTY
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                    if seq7 not in seq7s_set:
                        seq7s.append(seq7)
                        seq7s_set.add(seq7)

                    # add Sequence10
                    if psite < 10:  # phospho_pep at N terminus
                        seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite + 11]
                    elif len(UniProtSeq) - psite < 11:  # phospho_pep at C terminus
                        seq10 = str(UniProtSeq)[psite - 10:psite] + "p" + str(UniProtSeq)[psite:]
                    else:
                        seq10 = str(UniProtSeq)[psite - 10:psite + 11]
                        seq10 = seq10[:10] + "p" + seq10[10:]
                    if seq10 not in seq10s_set:
                        seq10s.append(seq10)
                        seq10s_set.add(seq10)

                    i += 1

                result[PHOSPHORESIDUE].append(phosphoresidues)
                result[SEQUENCE7].append(seq7s)
                # result[SEQUENCE10] is a list of lists of strings
                result[SEQUENCE10].append(seq10s)

            r = list(zip(
                result[UNIPROT_ID],
                result[GENE_NAME],
                result[DESCRIPTION],
                result[PHOSPHORESIDUE]
            ))
            # Sort by `UniProt_ID`
            #   ref: https://stackoverflow.com/a/4174955/15509512
            s = sorted(r, key=operator.itemgetter(0))

            result[UNIPROT_ID] = []
            result[GENE_NAME] = []
            result[DESCRIPTION] = []
            result[PHOSPHORESIDUE] = []

            for r in s:
                result[UNIPROT_ID].append(r[0])
                result[GENE_NAME].append(r[1])
                result[DESCRIPTION].append(r[2])
                result[PHOSPHORESIDUE].append(r[3])

            # convert lists to strings in the dictionary
            for key, value in result.items():
                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
                    result[key] = '; '.join(map(str, value))
                elif key in [SEQUENCE10]:
                    # result[SEQUENCE10] is a list of lists of strings
                    joined_value = ''
                    joined_set = set()
                    sep = ''
                    for valL in value:
                        # valL is a list of strings
                        for val in valL:
                            # val is a string
                            if val not in joined_set:
                                joined_set.add(val)
                                # joined_value += sep + '; '.join(map(str, val))
                                joined_value += sep + val
                                sep = '; '
                    # joined_value is a string
                    result[key] = joined_value

            newstring = '; '.join(
                [', '.join(l) for l in result[PHOSPHORESIDUE]]
            )
            ### # separate the isoforms in PHOSPHORESIDUE column with ";"
            ### oldstring = result[PHOSPHORESIDUE]
            ### oldlist = list(oldstring)
            ### newstring = ""
            ### i = 0
            ### for e in oldlist:
            ###     if e == ";":
            ###         if numps > 1:
            ###             if i%numps:
            ###                 newstring = newstring + ";"
            ###             else:
            ###                 newstring = newstring + ","
            ###         else:
            ###             newstring = newstring + ";"
            ###         i +=1
            ###     else:
            ###         newstring = newstring + e
            result[PHOSPHORESIDUE] = newstring

            # separate sequence7's by |
            oldstring = result[SEQUENCE7]
            oldlist = oldstring
            newstring = ""
            for l in oldlist:
                for e in l:
                    if e == ";":
                        newstring = newstring + " |"
                    elif len(newstring) > 0 and 1 > newstring.count(e):
                        newstring = newstring + " | " + e
                    elif 1 > newstring.count(e):
                        newstring = newstring + e
            result[SEQUENCE7] = newstring

            return [phospho_pep, result]

        # Construct list of [string, dictionary] lists
        #   where the dictionary provides the SwissProt metadata for a phosphopeptide
        result_list = [
            catch(pseq_to_subdict, psequence)
            for psequence
            in data_in[PHOSPHOPEPTIDE_MATCH]
        ]

        end_time = time.process_time()  # timer
        print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr)  # timer

        # Construct dictionary from list of lists
        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
        UniProt_Info = {
            result[0]: result[1]
            for result
            in result_list
            if result is not None
        }

        end_time = time.process_time()  # timer
        print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr)  # timer

        # cosmetic: add N_A to phosphopeptide rows with no hits
        p_peptide_list = []
        for key in UniProt_Info:
            p_peptide_list.append(key)
            for nestedKey in UniProt_Info[key]:
                if UniProt_Info[key][nestedKey] == "":
                    UniProt_Info[key][nestedKey] = N_A

        end_time = time.process_time()  # timer
        print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr)  # timer

        # convert UniProt_Info dictionary to dataframe
        uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info))

        # reorder columns to match expected output file
        uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index  # make index a column too

        cols = uniprot_df.columns.tolist()
        # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
        # uniprot_df = uniprot_df[cols]
        uniprot_df = uniprot_df[[
            PHOSPHOPEPTIDE,
            SEQUENCE10,
            SEQUENCE7,
            GENE_NAME,
            PHOSPHORESIDUE,
            UNIPROT_ID,
            DESCRIPTION
        ]]

        end_time = time.process_time()  # timer
        print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr)  # timer

        # concat to split then groupby to collapse
        seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | '))
                                 for _, row in uniprot_df.iterrows()]).reset_index()
        seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
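        # The concat/split above "melts" the pipe-separated Sequence7 strings so
        # that seq7_df holds one row per (Sequence7, Phosphopeptide) pair, ready
        # for the left merge against the PSP_Regulatory_site records below.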

        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
        # read in PhosphoSitePlus Regulatory Sites dataset
        #ACE if (True):
        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
        conn = sql.connect(uniprot_sqlite)
        regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
        # Close SwissProt SQLite database
        conn.close()
        #ACE # Array indexes are zero-based
        #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
        #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT'
        #ACE               , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM']
        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        #ACE     print(regsites_df)
        ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) -----------
        #ACE else:
        #ACE     regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3, skiprows=1-3)
        #ACE     SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA
        #ACE     #ACE # Array indexes are zero-based
        #ACE     #ACE #   ref: https://en.wikipedia.org/wiki/Python_(programming_language)
        #ACE     #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID'
        #ACE     #ACE               , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA'
        #ACE     #ACE               , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT'
        #ACE     #ACE               , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES'
        #ACE     #ACE               ]
        #ACE     #ACE REGSITE_COL_SITE7AA = 9
        #ACE     #ACE REGSITE_COL_PROTEIN = 1
        #ACE     #ACE REGSITE_COL_DOMAIN = 10
        #ACE     #ACE REGSITE_COL_PMIDs = 15

        # ... -------------- end read PSP_Regulatory_sites ------------------------------------

        # keep only the entries for the selected species (when a species was supplied)
        if len(species) > 0:
            print('Limit PhosphoSitePlus records to species "' + species + '"')
            regsites_df = regsites_df[regsites_df.ORGANISM == species]

        # merge the seq7 df with the regsites df based off of the sequence7
        merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left')
        #ACE print(merge_df.columns.tolist())  #ACE

        # after merging df, select only the columns of interest - note that PROTEIN is absent here
        merge_df = merge_df[[PHOSPHOPEPTIDE, SEQUENCE7, ON_FUNCTION, ON_PROCESS, ON_PROT_INTERACT, ON_OTHER_INTERACT, ON_NOTES]]
        #ACE print(merge_df.columns.tolist())  #ACE
        # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
        merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")

        # remove the columns that were combined
        merge_df = merge_df[[PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]]

        #ACE print(merge_df)  #ACE
        #ACE print(merge_df.columns.tolist())  #ACE

        end_time = time.process_time()  # timer
        print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr)  # timer

        # cosmetic changes to Function Phosphoresidue column
        fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])

        end_time = time.process_time()  # timer
        print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr)  # timer

        i = 0
        while i < len(fp_series):
            # remove the extra ";" so that it looks more professional
            if fp_series[i] == "; ; ; ; ":  # remove ; from empty hits
                fp_series[i] = ""
            while fp_series[i].endswith("; "):  # remove ; from the ends
                fp_series[i] = fp_series[i][:-2]
            while fp_series[i].startswith("; "):  # remove ; from the beginning
                fp_series[i] = fp_series[i][2:]
            fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
            fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
            fp_series[i] = fp_series[i].replace("; ; ", "; ")

            # turn blanks into N_A to signify the info was searched for but cannot be found
            if fp_series[i] == "":
                fp_series[i] = N_A

            i += 1
        merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series

        end_time = time.process_time()  # timer
        print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr)  # timer

        # merge uniprot df with merge df
        uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE, how="left")

        # collapse the merged df
        uniprot_regsites_collapsed_df = pandas.DataFrame(
            uniprot_regsites_merged_df
            .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE]
            .apply(lambda x: ppep_join(x)))
        # .apply(lambda x: "%s" % ' | '.join(x)))

        end_time = time.process_time()  # timer
        print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr)  # timer

        uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index  # add df index as its own column

        # rename columns
        uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp']

        # select columns to be merged to uniprot_df
        #ACE cols = regsites_df.columns.tolist()
        #ACE print(cols)  #ACE
        #ACE if len(cols) > 8:
        #ACE     cols = [cols[9]]+[cols[1]]+cols[10:15]
        #ACE     #ACE cols = [cols[9]]+[cols[1]]+cols[10:15]
        #ACE print(cols)  #ACE
        #ACE regsite_merge_df = regsites_df[cols]

        end_time = time.process_time()  # timer
        print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr)  # timer

        # add columns based on Sequence7 matching site_+/-7_AA
        uniprot_regsite_df = pandas.merge(
            left=uniprot_df,
            right=uniprot_regsites_collapsed_df,
            how='left',
            left_on=PHOSPHOPEPTIDE,
            right_on='ppp')

        end_time = time.process_time()  # timer
        print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr)  # timer

        data_in.rename(
            {'Protein description': PHOSPHOPEPTIDE},
            axis='columns',
            inplace=True
        )

        sort_start_time = time.process_time()  # timer

        # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
        res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold())
        data_in = data_in.loc[res2]

        end_time = time.process_time()  # timer
        print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr)  # timer

        cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
        upstream_data = upstream_data[cols]

        end_time = time.process_time()  # timer
        print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr)  # timer

        #### # rename upstream columns in new list
        #### new_cols = []
        #### for name in cols:
        ####     if "_NetworKIN" in name:
        ####         name = name.split("_")[0]
        ####     if " motif" in name:
        ####         name = name.split(" motif")[0]
        ####     if " sequence " in name:
        ####         name = name.split(" sequence")[0]
        ####     if "_Phosida" in name:
        ####         name = name.split("_")[0]
        ####     if "_PhosphoSite" in name:
        ####         name = name.split("_")[0]
        ####     new_cols.append(name)

        # rename upstream columns in new list
        def col_rename(name):
            if "_NetworKIN" in name:
                name = name.split("_")[0]
            if " motif" in name:
                name = name.split(" motif")[0]
            if " sequence " in name:
                name = name.split(" sequence")[0]
            if "_Phosida" in name:
                name = name.split("_")[0]
            if "_PhosphoSite" in name:
                name = name.split("_")[0]
            return name
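        # col_rename trims mapping-source suffixes from kinase/domain column
        # names, e.g. a hypothetical column "PKCalpha_NetworKIN" becomes "PKCalpha".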

        new_cols = [col_rename(col) for col in cols]
        upstream_data.columns = new_cols

        end_time = time.process_time()  # timer
        print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr)  # timer

        # Create upstream_data_cast as a copy of upstream_data
        #   but with first column substituted by the phosphopeptide sequence
        upstream_data_cast = upstream_data.copy()
        new_cols_cast = new_cols
        new_cols_cast[0] = 'p_peptide'
        upstream_data_cast.columns = new_cols_cast
        upstream_data_cast['p_peptide'] = upstream_data.index
        new_cols_cast0 = new_cols_cast[0]

        # --- -------------- begin read upstream_data_melt ------------------------------------
        ## ----------- Get melted kinase mapping data from SQLite database (start) -----------
        conn = sql.connect(uniprot_sqlite)
        upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
        # Close SwissProt SQLite database
        conn.close()
        upstream_data_melt = upstream_data_melt_df.copy()
        upstream_data_melt.columns = ['p_peptide', 'characterization', 'X']
        upstream_data_melt['characterization'] = [
            col_rename(s)
            for s in upstream_data_melt['characterization']
        ]

        print('%0.6f upstream_data_melt_df initially has %d rows' %
              (end_time - start_time, len(upstream_data_melt.axes[0])),
              file=sys.stderr)
        # ref: https://stackoverflow.com/a/27360130/15509512
        #   e.g. df.drop(df[df.score < 50].index, inplace=True)
        upstream_data_melt.drop(
            upstream_data_melt[upstream_data_melt.X != 'X'].index,
            inplace=True
        )
        print('%0.6f upstream_data_melt_df pre-dedup has %d rows' %
              (end_time - start_time, len(upstream_data_melt.axes[0])),
              file=sys.stderr)
        #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        #ACE     print(upstream_data_melt)
        ## ----------- Get melted kinase mapping data from SQLite database (finish) -----------
        # ... -------------- end read upstream_data_melt --------------------------------------

        end_time = time.process_time()  # timer
        print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr)  # timer
        # ... end read upstream_data_melt

        upstream_data_melt_index = upstream_data_melt.index
        upstream_data_melt_p_peptide = upstream_data_melt['p_peptide']

        end_time = time.process_time()  # timer
        print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr)  # timer

        upstream_delta_melt_LoL = upstream_data_melt.values.tolist()

        melt_dict = {}
        for key in upstream_map_p_peptide_list:
            melt_dict[key] = []

        for el in upstream_delta_melt_LoL:
            (p_peptide, characterization, X) = tuple(el)
            if p_peptide in melt_dict:
                melt_dict[p_peptide].append(characterization)
            else:
                exit('Phosphopeptide %s not found in ppep_mapping_db: "phosphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide))

        end_time = time.process_time()  # timer
        print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr)  # timer

        # for key in upstream_map_p_peptide_list:
        #     melt_dict[key] = ' | '.join(melt_dict[key])

        for key in upstream_map_p_peptide_list:
            melt_dict[key] = melt_join(melt_dict[key])

        end_time = time.process_time()  # timer
        print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr)  # timer

        # map_dict is a dictionary of dictionaries
        map_dict = {}
        for key in upstream_map_p_peptide_list:
            map_dict[key] = {}
            map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]

        end_time = time.process_time()  # timer
        print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr)  # timer

        # convert map_dict to dataframe
        map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict))
        map_df["p-peptide"] = map_df.index  # make index a column too
        cols_map_df = map_df.columns.tolist()
        cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
        map_df = map_df[cols_map_df]

        # join map_df to uniprot_regsite_df
        output_df = uniprot_regsite_df.merge(
            map_df,
            how="left",
            left_on=PHOSPHOPEPTIDE,
            right_on="p-peptide")

        output_df = output_df[
            [PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE,
             UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE,
             PUTATIVE_UPSTREAM_DOMAINS
             ]
        ]

        # cols_output_prelim = output_df.columns.tolist()
        #
        # print("cols_output_prelim")
        # print(cols_output_prelim)
        #
        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
        #
        # print("cols_output with p-peptide")
        # print(cols_output)
        #
        # cols_output = [col for col in cols_output if not col == "p-peptide"]
        #
        # print("cols_output")
        # print(cols_output)
        #
        # output_df = output_df[cols_output]

        # join output_df back to quantitative columns in data_in df
        quant_cols = data_in.columns.tolist()
        quant_cols = quant_cols[1:]
        quant_data = data_in[quant_cols]

        ## ----------- Write merge/filter metadata to SQLite database (start) -----------
        # Open SwissProt SQLite database
        conn = sql.connect(output_sqlite)
        cur = conn.cursor()

        cur.executescript(MRGFLTR_DDL)

        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata_view', CITATION_INSERT_PSP)
        )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata', CITATION_INSERT_PSP)
        )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF)
        )
        cur.execute(
            CITATION_INSERT_STMT,
            ('mrgfltr_metadata', CITATION_INSERT_PSP_REF)
        )

        # Read ppep-to-sequence LUT
        ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
        #ACE ppep_lut_df.info(verbose=True)
        # write only metadata for merged/filtered records to SQLite
        mrgfltr_metadata_df = output_df.copy()
        # replace phosphopeptide seq with ppep.id
        mrgfltr_metadata_df = ppep_lut_df.merge(
            mrgfltr_metadata_df,
            left_on='ppep_seq',
            right_on=PHOSPHOPEPTIDE,
            how='inner'
        )
        mrgfltr_metadata_df.drop(
            columns=[PHOSPHOPEPTIDE, 'ppep_seq'],
            inplace=True
        )
        # rename columns
        mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
        #ACE mrgfltr_metadata_df.info(verbose=True)
        mrgfltr_metadata_df.to_sql(
            'mrgfltr_metadata',
            con=conn,
            if_exists='append',
            index=False,
            method='multi'
        )

        # Close SwissProt SQLite database
        conn.close()
        ## ----------- Write merge/filter metadata to SQLite database (finish) -----------

        output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH)
        output_cols = output_df.columns.tolist()
        output_cols = output_cols[:-1]
        output_df = output_df[output_cols]

        # cosmetic changes to Upstream column:
        #   fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
        output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("")
        us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
        i = 0
        while i < len(us_series):
            # turn blanks into N_A to signify the info was searched for but cannot be found
            if us_series[i] == "":
                us_series[i] = N_A
            i += 1
        output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series

        end_time = time.process_time()  # timer
        print("%0.6f established output [3]" % (end_time - start_time,), file=sys.stderr)  # timer

        (output_rows, output_cols) = output_df.shape

        # output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True)
        output_df = output_df.convert_dtypes(convert_integer=True)

        # Output onto Final CSV file
        output_df.to_csv(output_filename_csv, index=False)
        output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False)

        end_time = time.process_time()  # timer
        print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr)  # timer

        print('{:>10} phosphopeptides written to output'.format(str(output_rows)))

        end_time = time.process_time()  # timer
        print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,), file=sys.stderr)  # timer

        # Rev. 7/1/2016
        # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
        # Rev. 7/3/2016 : renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching;
        #                 read from SwissProt SQLite database
        # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper

        #############################################
        # copied from Excel Output Script.ipynb END #
        #############################################

    try:
        catch(mqpep_getswissprot,)
        exit(0)
    except Exception as e:
        exit('Internal error running mqpep_getswissprot(): %s' % (e))


if __name__ == "__main__":
    __main__()