Mercurial > repos > eschen42 > mqppep_anova
diff mqppep_mrgfltr.py @ 5:d4d531006735 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 92e8ab6fc27a1f02583742715d644bc96418fbdf"
author | eschen42 |
---|---|
date | Thu, 10 Mar 2022 23:42:48 +0000 |
parents | c1403d18c189 |
children | 922d309640db |
line wrap: on
line diff
--- a/mqppep_mrgfltr.py Mon Mar 07 20:43:57 2022 +0000 +++ b/mqppep_mrgfltr.py Thu Mar 10 23:42:48 2022 +0000 @@ -1,1337 +1,1500 @@ -#!/usr/bin/env python - -# Import the packages needed -import argparse -import os.path -import sys - -import pandas -import re -import time -import sqlite3 as sql -from codecs import getreader as cx_getreader -import sys -import numpy as np - -# for sorting list of lists using operator.itemgetter -import operator - -# for formatting stack-trace -import traceback - -# for Aho-Corasick search for fixed set of substrings -import ahocorasick -import operator -import hashlib - -# for shutil.copyfile(src, dest) -import shutil - -# global constants -N_A = 'N/A' - -# ref: https://stackoverflow.com/a/8915613/15509512 -# answers: "How to handle exceptions in a list comprehensions" -# usage: -# from math import log -# eggs = [1,3,0,3,2] -# print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) -# producing: -# for <built-in function log> -# with args (0,) -# exception: math domain error -# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] -def catch(func, *args, handle=lambda e : e, **kwargs): - try: - return func(*args, **kwargs) - except Exception as e: - print("For %s" % str(func)) - print(" with args %s" % str(args)) - print(" caught exception: %s" % str(e)) - (ty, va, tb) = sys.exc_info() - print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) - exit(-1) - return None # was handle(e) - -def ppep_join(x): - x = [i for i in x if N_A != i] - result = "%s" % ' | '.join(x) - if result != "": - return result - else: - return N_A - -def melt_join(x): - tmp = {key.lower(): key for key in x} - result = "%s" % ' | '.join([tmp[key] for key in tmp]) - return result - -def __main__(): - # Parse Command Line - parser = argparse.ArgumentParser( - description='Phopsphoproteomic Enrichment Pipeline Merge and Filter.' - ) - - # inputs: - # Phosphopeptide data for experimental results, including the intensities - # and the mapping to kinase domains, in tabular format. - parser.add_argument( - '--phosphopeptides', '-p', - nargs=1, - required=True, - dest='phosphopeptides', - help='Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format' - ) - # UniProtKB/SwissProt DB input, SQLite - parser.add_argument( - '--ppep_mapping_db', '-d', - nargs=1, - required=True, - dest='ppep_mapping_db', - help='UniProtKB/SwissProt SQLite Database' - ) - #ACE # PhosPhositesPlus DB input, csv - #ACE parser.add_argument( - #ACE '--psp_regulatory_sites', '-s', - #ACE nargs=1, - #ACE required=True, - #ACE dest='psp_regulatory_sites_csv', - #ACE help='PhosphoSitesPlus Regulatory Sites, in CSV format including three-line header' - #ACE ) - # species to limit records chosed from PhosPhositesPlus - parser.add_argument( - '--species', '-x', - nargs=1, - required=False, - default=[], - dest='species', - help='limit PhosphoSitePlus records to indicated species (field may be empty)' - ) - - # outputs: - # tabular output - parser.add_argument( - '--mrgfltr_tab', '-o', - nargs=1, - required=True, - dest='mrgfltr_tab', - help='Tabular output file for results' - ) - # CSV output - parser.add_argument( - '--mrgfltr_csv', '-c', - nargs=1, - required=True, - dest='mrgfltr_csv', - help='CSV output file for results' - ) - # SQLite output - parser.add_argument( - '--mrgfltr_sqlite', '-S', - nargs=1, - required=True, - dest='mrgfltr_sqlite', - help='SQLite output file for results' - ) - - # "Make it so!" (parse the arguments) - options = parser.parse_args() - print("options: " + str(options)) - - # determine phosphopeptide ("upstream map") input tabular file access - if options.phosphopeptides is None: - exit('Argument "phosphopeptides" is required but not supplied') - try: - upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) - input_file = open(upstream_map_filename_tab, 'r') - input_file.close() - except Exception as e: - exit('Error parsing phosphopeptides argument: %s' % str(e)) - - # determine input SQLite access - if options.ppep_mapping_db is None: - exit('Argument "ppep_mapping_db" is required but not supplied') - try: - uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) - input_file = open(uniprot_sqlite, 'rb') - input_file.close() - except Exception as e: - exit('Error parsing ppep_mapping_db argument: %s' % str(e)) - - # copy input SQLite dataset to output SQLite dataset - if options.mrgfltr_sqlite is None: - exit('Argument "mrgfltr_sqlite" is required but not supplied') - try: - output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) - shutil.copyfile(uniprot_sqlite, output_sqlite) - except Exception as e: - exit('Error copying ppep_mapping_db to mrgfltr_sqlite: %s' % str(e)) - - #ACE # determine psp_regulatory_sites CSV access - #ACE if options.psp_regulatory_sites_csv is None: - #ACE exit('Argument "psp_regulatory_sites_csv" is required but not supplied') - #ACE #ACE print('options.psp_regulatory_sites_csv: ' + options.psp_regulatory_sites_csv) - #ACE try: - #ACE phosphosite_filename_csv = os.path.abspath(options.psp_regulatory_sites_csv[0]) - #ACE input_file = open(phosphosite_filename_csv, 'r') - #ACE input_file.close() - #ACE except Exception as e: - #ACE exit('Error parsing psp_regulatory_sites_csv argument: %s' % str(e)) - #ACE print('phosphosite_filename_csv: ' + phosphosite_filename_csv) - - # determine species to limit records from PSP_Regulatory_Sites - if options.species is None: - exit('Argument "species" is required (and may be empty) but not supplied') - try: - if len(options.species) > 0: - species = options.species[0] - else: - species = '' - except Exception as e: - exit('Error parsing species argument: %s' % str(e)) - - # determine tabular output destination - if options.mrgfltr_tab is None: - exit('Argument "mrgfltr_tab" is required but not supplied') - try: - output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) - output_file = open(output_filename_tab, 'w') - output_file.close() - except Exception as e: - exit('Error parsing mrgfltr_tab argument: %s' % str(e)) - - # determine CSV output destination - if options.mrgfltr_csv is None: - exit('Argument "mrgfltr_csv" is required but not supplied') - try: - output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) - output_file = open(output_filename_csv, 'w') - output_file.close() - except Exception as e: - exit('Error parsing mrgfltr_csv argument: %s' % str(e)) - - - def mqpep_getswissprot(): - - ############################################### - # copied from Excel Output Script.ipynb BEGIN # - ############################################### - - ########### String Constants ################# - DEPHOSPHOPEP = 'DephosphoPep' - DESCRIPTION = 'Description' - FUNCTION_PHOSPHORESIDUE = 'Function Phosphoresidue(PSP=PhosphoSitePlus.org)' - GENE_NAME = 'Gene_Name' # Gene Name from UniProtKB - ON_FUNCTION = 'ON_FUNCTION' # ON_FUNCTION column from PSP_Regulatory_Sites - ON_NOTES = 'NOTES' # NOTES column from PSP_Regulatory_Sites - ON_OTHER_INTERACT = 'ON_OTHER_INTERACT' # ON_OTHER_INTERACT column from PSP_Regulatory_Sites - ON_PROCESS = 'ON_PROCESS' # ON_PROCESS column from PSP_Regulatory_Sites - ON_PROT_INTERACT = 'ON_PROT_INTERACT' # ON_PROT_INTERACT column from PSP_Regulatory_Sites - PHOSPHOPEPTIDE = 'Phosphopeptide' - PHOSPHOPEPTIDE_MATCH = 'Phosphopeptide_match' - PHOSPHORESIDUE = 'Phosphoresidue' - PUTATIVE_UPSTREAM_DOMAINS = 'Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains' - SEQUENCE = 'Sequence' - SEQUENCE10 = 'Sequence10' - SEQUENCE7 = 'Sequence7' - SITE_PLUSMINUS_7AA = 'SITE_+/-7_AA' - SITE_PLUSMINUS_7AA_SQL = 'SITE_PLUSMINUS_7AA' - UNIPROT_ID = 'UniProt_ID' - UNIPROT_SEQ_AND_META_SQL = ''' - select Uniprot_ID, Description, Gene_Name, Sequence, - Organism_Name, Organism_ID, PE, SV - from UniProtKB - order by Sequence, UniProt_ID - ''' - UNIPROT_UNIQUE_SEQ_SQL = ''' - select distinct Sequence - from UniProtKB - group by Sequence - ''' - PPEP_PEP_UNIPROTSEQ_SQL = ''' - select distinct phosphopeptide, peptide, sequence - from uniprotkb_pep_ppep_view - order by sequence - ''' - PPEP_MELT_SQL = ''' - SELECT DISTINCT - phospho_peptide AS 'p_peptide', - kinase_map AS 'characterization', - 'X' AS 'X' - FROM ppep_gene_site_view - ''' - # CREATE TABLE PSP_Regulatory_site ( - # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, - # domain TEXT, - # ON_FUNCTION TEXT, - # ON_PROCESS TEXT, - # ON_PROT_INTERACT TEXT, - # ON_OTHER_INTERACT TEXT, - # notes TEXT, - # organism TEXT - # ); - PSP_REGSITE_SQL = ''' - SELECT DISTINCT - SITE_PLUSMINUS_7AA , - DOMAIN , - ON_FUNCTION , - ON_PROCESS , - ON_PROT_INTERACT , - ON_OTHER_INTERACT , - NOTES , - ORGANISM - FROM PSP_Regulatory_site - ''' - PPEP_ID_SQL =''' - SELECT - id AS 'ppep_id', - seq AS 'ppep_seq' - FROM ppep - ''' - MRGFLTR_DDL =''' - DROP VIEW IF EXISTS mrgfltr_metadata_view; - DROP TABLE IF EXISTS mrgfltr_metadata; - CREATE TABLE mrgfltr_metadata - ( ppep_id INTEGER REFERENCES ppep(id) - , Sequence10 TEXT - , Sequence7 TEXT - , GeneName TEXT - , Phosphoresidue TEXT - , UniProtID TEXT - , Description TEXT - , FunctionPhosphoresidue TEXT - , PutativeUpstreamDomains TEXT - , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE - ) - ; - CREATE VIEW mrgfltr_metadata_view AS - SELECT DISTINCT - ppep.seq AS phospho_peptide - , Sequence10 - , Sequence7 - , GeneName - , Phosphoresidue - , UniProtID - , Description - , FunctionPhosphoresidue - , PutativeUpstreamDomains - FROM - ppep, mrgfltr_metadata - WHERE - mrgfltr_metadata.ppep_id = ppep.id - ORDER BY - ppep.seq - ; - ''' - - CITATION_INSERT_STMT = ''' - INSERT INTO Citation ( - ObjectName, - CitationData - ) VALUES (?,?) - ''' - CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' - CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' - - MRGFLTR_METADATA_COLUMNS = [ - 'ppep_id', - 'Sequence10', - 'Sequence7', - 'GeneName', - 'Phosphoresidue', - 'UniProtID', - 'Description', - 'FunctionPhosphoresidue', - 'PutativeUpstreamDomains' - ] - - ########### String Constants (end) ############ - - class Error(Exception): - """Base class for exceptions in this module.""" - pass - - class PreconditionError(Error): - """Exception raised for errors in the input. - - Attributes: - expression -- input expression in which the error occurred - message -- explanation of the error - """ - - def __init__(self, expression, message): - self.expression = expression - self.message = message - - #start_time = time.clock() #timer - start_time = time.process_time() #timer - - #get keys from upstream tabular file using readline() - # ref: https://stackoverflow.com/a/16713581/15509512 - # answer to "Use codecs to read file with correct encoding" - file1_encoded = open(upstream_map_filename_tab, 'rb') - file1 = cx_getreader("latin-1")(file1_encoded) - - count = 0 - upstream_map_p_peptide_list = [] - re_tab = re.compile('^[^\t]*') - while True: - count += 1 - # Get next line from file - line = file1.readline() - # if line is empty - # end of file is reached - if not line: - break - if count > 1: - m = re_tab.match(line) - upstream_map_p_peptide_list.append(m[0]) - file1.close() - file1_encoded.close() - - # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed - re_phos = re.compile('p') - dephospho_peptide_list = [ re_phos.sub('',foo) for foo in upstream_map_p_peptide_list ] - - end_time = time.process_time() #timer - print("%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr) - - ## ----------- Get SwissProt data from SQLite database (start) ----------- - # build UniProt sequence LUT and list of unique SwissProt sequences - - # Open SwissProt SQLite database - conn = sql.connect(uniprot_sqlite) - cur = conn.cursor() - - # Set up structures to hold SwissProt data - - uniprot_Sequence_List = [] - UniProtSeqLUT = {} - - # Execute query for unique seqs without fetching the results yet - uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) - - while batch := uniprot_unique_seq_cur.fetchmany(size=50): - if None == batch: - # handle case where no records are returned - break - for row in batch: - Sequence = row[0] - UniProtSeqLUT[(Sequence,DESCRIPTION)] = [] - UniProtSeqLUT[(Sequence,GENE_NAME) ] = [] - UniProtSeqLUT[(Sequence,UNIPROT_ID) ] = [] - UniProtSeqLUT[ Sequence ] = [] - - # Execute query for seqs and metadata without fetching the results yet - uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) - - while batch := uniprot_seq_and_meta.fetchmany(size=50): - if None == batch: - # handle case where no records are returned - break - for UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV in batch: - uniprot_Sequence_List.append(Sequence) - UniProtSeqLUT[Sequence] = Sequence - UniProtSeqLUT[(Sequence,UNIPROT_ID) ].append(UniProt_ID) - UniProtSeqLUT[(Sequence,GENE_NAME) ].append(Gene_Name) - if OS != N_A: - Description += ' OS=' + OS - if OX != N_A: - Description += ' OX=' + str(int(OX)) - if Gene_Name != N_A: - Description += ' GN=' + Gene_Name - if PE != N_A: - Description += ' PE=' + PE - if SV != N_A: - Description += ' SV=' + SV - UniProtSeqLUT[(Sequence,DESCRIPTION)].append(Description) - - # Close SwissProt SQLite database; clean up local variables - conn.close() - Sequence = '' - UniProt_ID = '' - Description = '' - Gene_Name = '' - - ## ----------- Get SwissProt data from SQLite database (finish) ----------- - - end_time = time.process_time() #timer - print("%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr) - - ## ----------- Get SwissProt data from SQLite database (start) ----------- - # build PhosphoPep_UniProtSeq_LUT and PhosphoPep_UniProtSeq_LUT - #ACE_temp pepSeqList = list( zip(pepList, dephosphPepList, [seq]*len(pepList)) ) - - # Open SwissProt SQLite database - conn = sql.connect(uniprot_sqlite) - cur = conn.cursor() - - # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide - DephosphoPep_UniProtSeq_LUT = {} - - # Set up dictionary to accumulate results - PhosphoPep_UniProtSeq_LUT = {} - - # Execute query for tuples without fetching the results yet - ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) - - while batch := ppep_pep_uniprotseq_cur.fetchmany(size=50): - if None == batch: - # handle case where no records are returned - break - for (phospho_pep, dephospho_pep, sequence) in batch: - #do interesting stuff here... - PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep - PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep - if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: - DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] = [] - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] = [] - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] = [] - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] = [] - DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) - - #ACE print("ppep:'%s' dephospho_pep:'%s' sequence:'%s'" % (phospho_pep, dephospho_pep, sequence)) - if sequence not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)]: - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)].append(sequence) - for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: - if phospho_pep != phospho_pep: - print("phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep)) - if phospho_pep not in PhosphoPep_UniProtSeq_LUT: - PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep - PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = dephospho_pep - r = list(zip( - [s for s in UniProtSeqLUT[(sequence,UNIPROT_ID)]], - [s for s in UniProtSeqLUT[(sequence,GENE_NAME)]], - [s for s in UniProtSeqLUT[(sequence,DESCRIPTION)]] - )) - # Sort by `UniProt_ID` - # ref: https://stackoverflow.com/a/4174955/15509512 - r = sorted(r, key=operator.itemgetter(0)) - # Get one tuple for each `phospho_pep` - # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] - for (upid, gn, desc) in r: - # Append pseudo-tuple per UniProt_ID but only when it is not present - if upid not in DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)]: - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)].append(upid) - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)].append(desc) - DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)].append(gn) - - # Close SwissProt SQLite database; clean up local variables - conn.close() - # wipe local variables - phospho_pep = dephospho_pep = sequence = 0 - upid = gn = desc = r = '' - - ## ----------- Get SwissProt data from SQLite database (finish) ----------- - - end_time = time.process_time() #timer - print("%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time,upstream_map_filename_tab), file=sys.stderr) - - print('{:>10} unique upstream phosphopeptides tested'.format(str(len(upstream_map_p_peptide_list)))) - - #Read in Upstream tabular file - # We are discarding the intensity data; so read it as text - upstream_data = pandas.read_table( - upstream_map_filename_tab, - dtype='str', - index_col = 0 - ) - - end_time = time.process_time() #timer - print("%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr) #timer - - upstream_data.index = upstream_map_p_peptide_list - - - end_time = time.process_time() #timer - print("%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr) #timer - - - #trim upstream_data to include only the upstream map columns - old_cols = upstream_data.columns.tolist() - i = 0 - first_intensity = -1 - last_intensity = -1 - intensity_re = re.compile('Intensity.*') - for col_name in old_cols: - m = intensity_re.match(col_name) - if m: - last_intensity = i - if first_intensity == -1: - first_intensity = i - i += 1 - #print('last intensity = %d' % last_intensity) - col_PKCalpha = last_intensity + 2 - col_firstIntensity = first_intensity - - data_in_cols = [old_cols[0]] + old_cols[first_intensity:last_intensity+1] - - if upstream_data.empty: - print("upstream_data is empty") - exit(0) - - data_in = upstream_data.copy(deep=True)[data_in_cols] - - # Convert floating-point integers to int64 integers - # ref: https://stackoverflow.com/a/68497603/15509512 - data_in[list(data_in.columns[1:])] = data_in[ - list(data_in.columns[1:])].astype('float64').apply(np.int64) - - #create another phosphopeptide column that will be used to join later; - # MAY need to change depending on Phosphopeptide column position - #data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] - data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index - - - - - end_time = time.process_time() #timer - print("%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr) #timer - - # Produce a dictionary of metadata for a single phosphopeptide. - # This is a replacement of `UniProtInfo_subdict` in the original code. - def pseq_to_subdict(phospho_pep): - #ACE print("calling pseq_to_subdict, %s" % phospho_pep); - # Strip "p" from phosphopeptide sequence - dephospho_pep = re_phos.sub('',phospho_pep) - - # Determine number of phosphoresidues in phosphopeptide - numps = len(phospho_pep) - len(dephospho_pep) - - # Determine location(s) of phosphoresidue(s) in phosphopeptide - # (used later for Phosphoresidue, Sequence7, and Sequence10) - ploc = [] #list of p locations - i = 0 - p = phospho_pep - while i < numps: - ploc.append(p.find("p")) - p = p[:p.find("p")] + p[p.find("p")+1:] - i +=1 - - - # Establish nested dictionary - result = {} - result[SEQUENCE] = [] - result[UNIPROT_ID] = [] - result[DESCRIPTION] = [] - result[GENE_NAME] = [] - result[PHOSPHORESIDUE] = [] - result[SEQUENCE7] = [] - result[SEQUENCE10] = [] - - # Add stripped sequence to dictionary - result[SEQUENCE].append(dephospho_pep) - - # Locate dephospho_pep in DephosphoPep_UniProtSeq_LUT - dephos = DephosphoPep_UniProtSeq_LUT[dephospho_pep] - - # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT - ### Caller may elect to: - ## try: - ## ... - ## except PreconditionError as pe: - ## print("'{expression}': {message}".format( - ## expression = pe.expression, - ## message = pe.message)) - ## ) - ## ) - if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: - raise PreconditionError( dephospho_pep, - 'dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT' - ) - if phospho_pep not in PhosphoPep_UniProtSeq_LUT: - raise PreconditionError( dephospho_pep, - 'no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT' - ) - if dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)]: - raise PreconditionError( dephospho_pep, - "dephosphorylated phosphopeptide does not match " + - "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " + - PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] - ) - result[SEQUENCE] = [dephospho_pep] - result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,UNIPROT_ID)] - result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,DESCRIPTION)] - result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,GENE_NAME)] - if (dephospho_pep,SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: - raise PreconditionError( dephospho_pep, - 'no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT' - ) - UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep,SEQUENCE)] - if len (UniProtSeqList) < 1: - print("Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep) - # raise PreconditionError( - # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", - # 'value has zero length' - # ) - for UniProtSeq in UniProtSeqList: - i = 0 - phosphoresidues = [] - seq7s_set = set() - seq7s = [] - seq10s_set = set() - seq10s = [] - while i < len(ploc): - start = UniProtSeq.find(dephospho_pep) - psite = start+ploc[i] #location of phosphoresidue on protein sequence - - #add Phosphoresidue - phosphosite = "p"+str(UniProtSeq)[psite]+str(psite+1) - phosphoresidues.append(phosphosite) - - #Add Sequence7 - if psite < 7: #phospho_pep at N terminus - seq7 = str(UniProtSeq)[:psite+8] - if seq7[psite] == "S": #if phosphosresidue is serine - pres = "s" - elif seq7[psite] == "T": #if phosphosresidue is threonine - pres = "t" - elif seq7[psite] == "Y": #if phosphoresidue is tyrosine - pres = "y" - else: # if not pSTY - pres = "?" - seq7 = seq7[:psite] + pres + seq7[psite+1:psite+8] - while len(seq7) < 15: #add appropriate number of "_" to the front - seq7 = "_" + seq7 - elif len(UniProtSeq) - psite < 8: #phospho_pep at C terminus - seq7 = str(UniProtSeq)[psite-7:] - if seq7[7] == "S": - pres = "s" - elif seq7[7] == "T": - pres = "t" - elif seq7[7] == "Y": - pres = "y" - else: - pres = "?" - seq7 = seq7[:7] + pres + seq7[8:] - while len(seq7) < 15: #add appropriate number of "_" to the back - seq7 = seq7 + "_" - else: - seq7 = str(UniProtSeq)[psite-7:psite+8] - pres = "" #phosphoresidue - if seq7[7] == "S": #if phosphosresidue is serine - pres = "s" - elif seq7[7] == "T": #if phosphosresidue is threonine - pres = "t" - elif seq7[7] == "Y": #if phosphoresidue is tyrosine - pres = "y" - else: # if not pSTY - pres = "?" - seq7 = seq7[:7] + pres + seq7[8:] - if seq7 not in seq7s_set: - seq7s.append(seq7) - seq7s_set.add(seq7) - - #add Sequence10 - if psite < 10: #phospho_pep at N terminus - seq10 = str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite:psite+11] - elif len(UniProtSeq) - psite < 11: #phospho_pep at C terminus - seq10 = str(UniProtSeq)[psite-10:psite] + "p" + str(UniProtSeq)[psite:] - else: - seq10 = str(UniProtSeq)[psite-10:psite+11] - seq10 = seq10[:10] + "p" + seq10[10:] - if seq10 not in seq10s_set: - seq10s.append(seq10) - seq10s_set.add(seq10) - - i+=1 - - result[PHOSPHORESIDUE].append(phosphoresidues) - result[SEQUENCE7].append(seq7s) - # result[SEQUENCE10] is a list of lists of strings - result[SEQUENCE10].append(seq10s) - - - - - r = list(zip( - result[UNIPROT_ID], - result[GENE_NAME], - result[DESCRIPTION], - result[PHOSPHORESIDUE] - )) - # Sort by `UniProt_ID` - # ref: https://stackoverflow.com//4174955/15509512 - s = sorted(r, key=operator.itemgetter(0)) - - result[UNIPROT_ID] = [] - result[GENE_NAME] = [] - result[DESCRIPTION] = [] - result[PHOSPHORESIDUE] = [] - - for r in s: - result[UNIPROT_ID].append(r[0]) - result[GENE_NAME].append(r[1]) - result[DESCRIPTION].append(r[2]) - result[PHOSPHORESIDUE].append(r[3]) - - - #convert lists to strings in the dictionary - for key,value in result.items(): - if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: - result[key] = '; '.join(map(str, value)) - elif key in [SEQUENCE10]: - # result[SEQUENCE10] is a list of lists of strings - joined_value = '' - joined_set = set() - sep = '' - for valL in value: - # valL is a list of strings - for val in valL: - # val is a string - if val not in joined_set: - joined_set.add(val) - #joined_value += sep + '; '.join(map(str, val)) - joined_value += sep + val - sep = '; ' - # joined_value is a string - result[key] = joined_value - - - newstring = '; '.join( - [', '.join(l) for l in result[PHOSPHORESIDUE]] - ) - ### #separate the isoforms in PHOSPHORESIDUE column with ";" - ### oldstring = result[PHOSPHORESIDUE] - ### oldlist = list(oldstring) - ### newstring = "" - ### i = 0 - ### for e in oldlist: - ### if e == ";": - ### if numps > 1: - ### if i%numps: - ### newstring = newstring + ";" - ### else: - ### newstring = newstring + "," - ### else: - ### newstring = newstring + ";" - ### i +=1 - ### else: - ### newstring = newstring + e - result[PHOSPHORESIDUE] = newstring - - - #separate sequence7's by | - oldstring = result[SEQUENCE7] - oldlist = oldstring - newstring = "" - for l in oldlist: - for e in l: - if e == ";": - newstring = newstring + " |" - elif len(newstring) > 0 and 1 > newstring.count(e): - newstring = newstring + " | " + e - elif 1 > newstring.count(e): - newstring = newstring + e - result[SEQUENCE7] = newstring - - - return [phospho_pep, result] - - # Construct list of [string, dictionary] lists - # where the dictionary provides the SwissProt metadata for a phosphopeptide - result_list = [ - catch(pseq_to_subdict,psequence) - for psequence - in data_in[PHOSPHOPEPTIDE_MATCH] - ] - - - end_time = time.process_time() #timer - print("%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr) #timer - - # Construct dictionary from list of lists - # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ - UniProt_Info = { - result[0]:result[1] - for result - in result_list - if result is not None - } - - - end_time = time.process_time() #timer - print("%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr) #timer - - #cosmetic: add N_A to phosphopeptide rows with no hits - p_peptide_list = [] - for key in UniProt_Info: - p_peptide_list.append(key) - for nestedKey in UniProt_Info[key]: - if UniProt_Info[key][nestedKey] == "": - UniProt_Info[key][nestedKey] = N_A - - end_time = time.process_time() #timer - print("%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr) #timer - - #convert UniProt_Info dictionary to dataframe - uniprot_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(UniProt_Info)) - - #reorder columns to match expected output file - uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index #make index a column too - - - cols = uniprot_df.columns.tolist() - #cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] - #uniprot_df = uniprot_df[cols] - uniprot_df = uniprot_df[[ - PHOSPHOPEPTIDE, - SEQUENCE10, - SEQUENCE7, - GENE_NAME, - PHOSPHORESIDUE, - UNIPROT_ID, - DESCRIPTION - ]] - - - end_time = time.process_time() #timer - print("%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr) #timer - - #concat to split then groupby to collapse - seq7_df = pandas.concat([pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(' | ')) - for _, row in uniprot_df.iterrows()]).reset_index() - seq7_df.columns = [SEQUENCE7,PHOSPHOPEPTIDE] - - # --- -------------- begin read PSP_Regulatory_sites --------------------------------- - #read in PhosphoSitePlus Regulatory Sites dataset - #ACE if (True): - ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- - conn = sql.connect(uniprot_sqlite) - regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) - # Close SwissProt SQLite database - conn.close() - #ACE # Array indexes are zero-based - #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language) - #ACE RENAME_COLS = [ 'SITE_PLUSMINUS_7AA', 'DOMAIN', 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT' - #ACE , 'ON_OTHER_INTERACT' , 'NOTES' , 'ORGANISM'] - #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also - #ACE print(regsites_df) - ## ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (finish) ----------- - #ACE else: - #ACE regsites_df = pandas.read_csv(phosphosite_filename_csv, header=3,skiprows=1-3) - #ACE SITE_PLUSMINUS_7AA_SQL = SITE_PLUSMINUS_7AA - #ACE #ACE # Array indexes are zero-based - #ACE #ACE # ref: https://en.wikipedia.org/wiki/Python_(programming_language) - #ACE #ACE RENAME_COLS = [ 'GENE' , 'PROTEIN' , 'PROT_TYPE' , 'ACC_ID' , 'GENE_ID' - #ACE #ACE , 'HU_CHR_LOC' , 'ORGANISM' , 'MOD_RSD' , 'SITE_GRP_ID' , 'SITE_+/-7_AA' - #ACE #ACE , 'DOMAIN' , 'ON_FUNCTION', 'ON_PROCESS', 'ON_PROT_INTERACT', 'ON_OTHER_INTERACT' - #ACE #ACE , 'PMIDs' , 'LT_LIT' , 'MS_LIT' , 'MS_CST' , 'NOTES' - #ACE #ACE ] - #ACE #ACE REGSITE_COL_SITE7AA = 9 - #ACE #ACE REGSITE_COL_PROTEIN = 1 - #ACE #ACE REGSITE_COL_DOMAIN = 10 - #ACE #ACE REGSITE_COL_PMIDs = 15 - - # ... -------------- end read PSP_Regulatory_sites ------------------------------------ - - - #keep only the human entries in dataframe - if len(species) > 0: - print('Limit PhosphoSitesPlus records to species "' + species + '"') - regsites_df = regsites_df[regsites_df.ORGANISM == species] - - #merge the seq7 df with the regsites df based off of the sequence7 - merge_df = seq7_df.merge(regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how='left') - #ACE print(merge_df.columns.tolist()) #ACE - - #after merging df, select only the columns of interest - note that PROTEIN is absent here - merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,ON_FUNCTION,ON_PROCESS, ON_PROT_INTERACT,ON_OTHER_INTERACT,ON_NOTES]] - #ACE print(merge_df.columns.tolist()) #ACE - #combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(merge_df[ON_PROCESS], sep="; ", na_rep="") - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") - merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[FUNCTION_PHOSPHORESIDUE].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") - - #remove the columns that were combined - merge_df = merge_df[[PHOSPHOPEPTIDE,SEQUENCE7,FUNCTION_PHOSPHORESIDUE]] - - #ACE print(merge_df) #ACE - #ACE print(merge_df.columns.tolist()) #ACE - - end_time = time.process_time() #timer - print("%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr) #timer - - #cosmetic changes to Function Phosphoresidue column - fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) - - end_time = time.process_time() #timer - print("%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr) #timer - - i = 0 - while i < len(fp_series): - #remove the extra ";" so that it looks more professional - if fp_series[i] == "; ; ; ; ": #remove ; from empty hits - fp_series[i] = "" - while fp_series[i].endswith("; "): #remove ; from the ends - fp_series[i] = fp_series[i][:-2] - while fp_series[i].startswith("; "): #remove ; from the beginning - fp_series[i] = fp_series[i][2:] - fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") - fp_series[i] = fp_series[i].replace("; ; ; ", "; ") - fp_series[i] = fp_series[i].replace("; ; ", "; ") - - #turn blanks into N_A to signify the info was searched for but cannot be found - if fp_series[i] == "": - fp_series[i] = N_A - - i += 1 - merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series - - end_time = time.process_time() #timer - print("%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr) #timer - - #merge uniprot df with merge df - uniprot_regsites_merged_df = uniprot_df.merge(merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE,how="left") - - #collapse the merged df - uniprot_regsites_collapsed_df = pandas.DataFrame( - uniprot_regsites_merged_df - .groupby(PHOSPHOPEPTIDE)[FUNCTION_PHOSPHORESIDUE] - .apply(lambda x: ppep_join(x))) - #.apply(lambda x: "%s" % ' | '.join(x))) - - - end_time = time.process_time() #timer - print("%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr) #timer - - uniprot_regsites_collapsed_df[PHOSPHOPEPTIDE] = uniprot_regsites_collapsed_df.index #add df index as its own column - - - #rename columns - uniprot_regsites_collapsed_df.columns = [FUNCTION_PHOSPHORESIDUE, 'ppp'] - - - - #select columns to be merged to uniprot_df - #ACE cols = regsites_df.columns.tolist() - #ACE print(cols) #ACE - #ACE if len(cols) > 8: - #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] - #ACE #ACE cols = [cols[9]]+[cols[1]]+cols[10:15] - #ACE print(cols) #ACE - #ACE regsite_merge_df = regsites_df[cols] - - end_time = time.process_time() #timer - print("%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr) #timer - - #add columns based on Sequence7 matching site_+/-7_AA - uniprot_regsite_df = pandas.merge( - left=uniprot_df, - right=uniprot_regsites_collapsed_df, - how='left', - left_on=PHOSPHOPEPTIDE, - right_on='ppp') - - end_time = time.process_time() #timer - print("%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr) #timer - - data_in.rename( - {'Protein description': PHOSPHOPEPTIDE}, - axis='columns', - inplace=True - ) - - - - sort_start_time = time.process_time() #timer - - #data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') - res2 = sorted(data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key = lambda s: s.casefold()) - data_in = data_in.loc[res2] - - end_time = time.process_time() #timer - print("%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr) #timer - - - - cols = [old_cols[0]] + old_cols[col_PKCalpha-1:] - upstream_data = upstream_data[cols] - - end_time = time.process_time() #timer - print("%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr) #timer - - - #### #rename upstream columns in new list - #### new_cols = [] - #### for name in cols: - #### if "_NetworKIN" in name: - #### name = name.split("_")[0] - #### if " motif" in name: - #### name = name.split(" motif")[0] - #### if " sequence " in name: - #### name = name.split(" sequence")[0] - #### if "_Phosida" in name: - #### name = name.split("_")[0] - #### if "_PhosphoSite" in name: - #### name = name.split("_")[0] - #### new_cols.append(name) - - #rename upstream columns in new list - def col_rename(name): - if "_NetworKIN" in name: - name = name.split("_")[0] - if " motif" in name: - name = name.split(" motif")[0] - if " sequence " in name: - name = name.split(" sequence")[0] - if "_Phosida" in name: - name = name.split("_")[0] - if "_PhosphoSite" in name: - name = name.split("_")[0] - return name - - new_cols = [col_rename(col) for col in cols] - upstream_data.columns = new_cols - - - - end_time = time.process_time() #timer - print("%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr) #timer - - - # Create upstream_data_cast as a copy of upstream_data - # but with first column substituted by the phosphopeptide sequence - upstream_data_cast = upstream_data.copy() - new_cols_cast = new_cols - new_cols_cast[0] = 'p_peptide' - upstream_data_cast.columns = new_cols_cast - upstream_data_cast['p_peptide'] = upstream_data.index - new_cols_cast0 = new_cols_cast[0] - - # --- -------------- begin read upstream_data_melt ------------------------------------ - ## ----------- Get melted kinase mapping data from SQLite database (start) ----------- - conn = sql.connect(uniprot_sqlite) - upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) - # Close SwissProt SQLite database - conn.close() - upstream_data_melt = upstream_data_melt_df.copy() - upstream_data_melt.columns = ['p_peptide', 'characterization', 'X'] - upstream_data_melt['characterization'] = [ - col_rename(s) - for s in upstream_data_melt['characterization'] - ] - - print('%0.6f upstream_data_melt_df initially has %d rows' % - (end_time - start_time, len(upstream_data_melt.axes[0])) - , file=sys.stderr) - # ref: https://stackoverflow.com/a/27360130/15509512 - # e.g. df.drop(df[df.score < 50].index, inplace=True) - upstream_data_melt.drop( - upstream_data_melt[upstream_data_melt.X != 'X'].index, - inplace = True - ) - print('%0.6f upstream_data_melt_df pre-dedup has %d rows' % - (end_time - start_time, len(upstream_data_melt.axes[0])) - , file=sys.stderr) - #ACE with pandas.option_context('display.max_rows', None, 'display.max_columns', None): # more options can be specified also - #ACE print(upstream_data_melt) - ## ----------- Get melted kinase mapping data from SQLite database (finish) ----------- - # ... -------------- end read upstream_data_melt -------------------------------------- - - end_time = time.process_time() #timer - print("%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr) #timer - # ... end read upstream_data_melt - - upstream_data_melt_index = upstream_data_melt.index - upstream_data_melt_p_peptide = upstream_data_melt['p_peptide'] - - end_time = time.process_time() #timer - print("%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr) #timer - - upstream_delta_melt_LoL = upstream_data_melt.values.tolist() - - melt_dict = {} - for key in upstream_map_p_peptide_list: - melt_dict[key] = [] - - for el in upstream_delta_melt_LoL: - (p_peptide, characterization, X) = tuple(el) - if p_peptide in melt_dict: - melt_dict[p_peptide].append(characterization) - else: - exit('Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide)) - - - end_time = time.process_time() #timer - print("%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr) #timer - - - # for key in upstream_map_p_peptide_list: - # melt_dict[key] = ' | '.join(melt_dict[key]) - - for key in upstream_map_p_peptide_list: - melt_dict[key] = melt_join(melt_dict[key]) - - end_time = time.process_time() #timer - print("%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr) #timer - - # map_dict is a dictionary of dictionaries - map_dict = {} - for key in upstream_map_p_peptide_list: - map_dict[key] = {} - map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] - - - end_time = time.process_time() #timer - print("%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr) #timer - - #convert map_dict to dataframe - map_df = pandas.DataFrame.transpose(pandas.DataFrame.from_dict(map_dict)) - map_df["p-peptide"] = map_df.index #make index a column too - cols_map_df = map_df.columns.tolist() - cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] - map_df = map_df[cols_map_df] - - #join map_df to uniprot_regsite_df - output_df = uniprot_regsite_df.merge( - map_df, - how="left", - left_on=PHOSPHOPEPTIDE, - right_on="p-peptide") - - output_df = output_df[ - [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, - UNIPROT_ID, DESCRIPTION, FUNCTION_PHOSPHORESIDUE, - PUTATIVE_UPSTREAM_DOMAINS - ] - ] - - - # cols_output_prelim = output_df.columns.tolist() - # - # print("cols_output_prelim") - # print(cols_output_prelim) - # - # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] - # - # print("cols_output with p-peptide") - # print(cols_output) - # - # cols_output = [col for col in cols_output if not col == "p-peptide"] - # - # print("cols_output") - # print(cols_output) - # - # output_df = output_df[cols_output] - - #join output_df back to quantitative columns in data_in df - quant_cols = data_in.columns.tolist() - quant_cols = quant_cols[1:] - quant_data = data_in[quant_cols] - - ## ----------- Write merge/filter metadata to SQLite database (start) ----------- - # Open SwissProt SQLite database - conn = sql.connect(output_sqlite) - cur = conn.cursor() - - cur.executescript(MRGFLTR_DDL) - - cur.execute( - CITATION_INSERT_STMT, - ('mrgfltr_metadata_view', CITATION_INSERT_PSP) - ) - cur.execute( - CITATION_INSERT_STMT, - ('mrgfltr_metadata', CITATION_INSERT_PSP) - ) - cur.execute( - CITATION_INSERT_STMT, - ('mrgfltr_metadata_view', CITATION_INSERT_PSP_REF) - ) - cur.execute( - CITATION_INSERT_STMT, - ('mrgfltr_metadata', CITATION_INSERT_PSP_REF) - ) - - # Read ppep-to-sequence LUT - ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) - #ACE ppep_lut_df.info(verbose=True) - # write only metadata for merged/filtered records to SQLite - mrgfltr_metadata_df = output_df.copy() - # replace phosphopeptide seq with ppep.id - mrgfltr_metadata_df = ppep_lut_df.merge( - mrgfltr_metadata_df, - left_on='ppep_seq', - right_on=PHOSPHOPEPTIDE, - how='inner' - ) - mrgfltr_metadata_df.drop( - columns=[PHOSPHOPEPTIDE, 'ppep_seq'], - inplace=True - ) - #rename columns - mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS - #ACE mrgfltr_metadata_df.info(verbose=True) - mrgfltr_metadata_df.to_sql( - 'mrgfltr_metadata', - con=conn, - if_exists='append', - index=False, - method='multi' - ) - - # Close SwissProt SQLite database - conn.close() - ## ----------- Write merge/filter metadata to SQLite database (finish) ----------- - - output_df = output_df.merge(quant_data, how="right", left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE_MATCH) - output_cols = output_df.columns.tolist() - output_cols = output_cols[:-1] - output_df = output_df[output_cols] - - #cosmetic changes to Upstream column - output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[PUTATIVE_UPSTREAM_DOMAINS].fillna("") #fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping - us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) - i = 0 - while i < len(us_series): - #turn blanks into N_A to signify the info was searched for but cannot be found - if us_series[i] == "": - us_series[i] = N_A - i += 1 - output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series - - end_time = time.process_time() #timer - print("%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr) #timer - - (output_rows, output_cols) = output_df.shape - - #output_df = output_df[cols].convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True, convert_boolean=True, convert_floating=True) - output_df = output_df.convert_dtypes(convert_integer=True) - - - #Output onto Final CSV file - output_df.to_csv(output_filename_csv, index=False) - output_df.to_csv(output_filename_tab, quoting=None, sep='\t', index=False) - - end_time = time.process_time() #timer - print("%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr) #timer - - print('{:>10} phosphopeptides written to output'.format(str(output_rows))) - - end_time = time.process_time() #timer - print("%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,) , file=sys.stderr) #timer - - - #Rev. 7/1/2016 - #Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's - #Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS - #Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ - # read from SwissProt SQLite database - #Rev. 12/9/2021: Transfer code to Galaxy tool wrapper - - ############################################# - # copied from Excel Output Script.ipynb END # - ############################################# - - try: - catch(mqpep_getswissprot,) - exit(0) - except Exception as e: - exit('Internal error running mqpep_getswissprot(): %s' % (e)) - -if __name__ == "__main__": - __main__() - +#!/usr/bin/env python + +# Import the packages needed +import argparse +import operator # for operator.itemgetter +import os.path +import re +import shutil # for shutil.copyfile(src, dest) +import sqlite3 as sql +import sys # import the sys module for exc_info +import time +import traceback # for formatting stack-trace +from codecs import getreader as cx_getreader + +import numpy as np +import pandas + +# global constants +N_A = "N/A" + + +# ref: https://stackoverflow.com/a/8915613/15509512 +# answers: "How to handle exceptions in a list comprehensions" +# usage: +# from math import log +# eggs = [1,3,0,3,2] +# print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) +# producing: +# for <built-in function log> +# with args (0,) +# exception: math domain error +# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] +def catch(func, *args, handle=lambda e: e, **kwargs): + + try: + return func(*args, **kwargs) + except Exception as e: + print("For %s" % str(func)) + print(" with args %s" % str(args)) + print(" caught exception: %s" % str(e)) + (ty, va, tb) = sys.exc_info() + print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) + exit(-1) + return None + + +def ppep_join(x): + x = [i for i in x if N_A != i] + result = "%s" % " | ".join(x) + if result != "": + return result + else: + return N_A + + +def melt_join(x): + tmp = {key.lower(): key for key in x} + result = "%s" % " | ".join([tmp[key] for key in tmp]) + return result + + +def __main__(): + # Parse Command Line + parser = argparse.ArgumentParser( + description="Phopsphoproteomic Enrichment Pipeline Merge and Filter." + ) + + # inputs: + # Phosphopeptide data for experimental results, including the intensities + # and the mapping to kinase domains, in tabular format. + parser.add_argument( + "--phosphopeptides", + "-p", + nargs=1, + required=True, + dest="phosphopeptides", + help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", + ) + # UniProtKB/SwissProt DB input, SQLite + parser.add_argument( + "--ppep_mapping_db", + "-d", + nargs=1, + required=True, + dest="ppep_mapping_db", + help="UniProtKB/SwissProt SQLite Database", + ) + # species to limit records chosed from PhosPhositesPlus + parser.add_argument( + "--species", + "-x", + nargs=1, + required=False, + default=[], + dest="species", + help="limit PhosphoSitePlus records to indicated species (field may be empty)", + ) + + # outputs: + # tabular output + parser.add_argument( + "--mrgfltr_tab", + "-o", + nargs=1, + required=True, + dest="mrgfltr_tab", + help="Tabular output file for results", + ) + # CSV output + parser.add_argument( + "--mrgfltr_csv", + "-c", + nargs=1, + required=True, + dest="mrgfltr_csv", + help="CSV output file for results", + ) + # SQLite output + parser.add_argument( + "--mrgfltr_sqlite", + "-S", + nargs=1, + required=True, + dest="mrgfltr_sqlite", + help="SQLite output file for results", + ) + + # "Make it so!" (parse the arguments) + options = parser.parse_args() + print("options: " + str(options)) + + # determine phosphopeptide ("upstream map") input tabular file access + if options.phosphopeptides is None: + exit('Argument "phosphopeptides" is required but not supplied') + try: + upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) + input_file = open(upstream_map_filename_tab, "r") + input_file.close() + except Exception as e: + exit("Error parsing phosphopeptides argument: %s" % str(e)) + + # determine input SQLite access + if options.ppep_mapping_db is None: + exit('Argument "ppep_mapping_db" is required but not supplied') + try: + uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) + input_file = open(uniprot_sqlite, "rb") + input_file.close() + except Exception as e: + exit("Error parsing ppep_mapping_db argument: %s" % str(e)) + + # copy input SQLite dataset to output SQLite dataset + if options.mrgfltr_sqlite is None: + exit('Argument "mrgfltr_sqlite" is required but not supplied') + try: + output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) + shutil.copyfile(uniprot_sqlite, output_sqlite) + except Exception as e: + exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) + + # determine species to limit records from PSP_Regulatory_Sites + if options.species is None: + exit( + 'Argument "species" is required (and may be empty) but not supplied' + ) + try: + if len(options.species) > 0: + species = options.species[0] + else: + species = "" + except Exception as e: + exit("Error parsing species argument: %s" % str(e)) + + # determine tabular output destination + if options.mrgfltr_tab is None: + exit('Argument "mrgfltr_tab" is required but not supplied') + try: + output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) + output_file = open(output_filename_tab, "w") + output_file.close() + except Exception as e: + exit("Error parsing mrgfltr_tab argument: %s" % str(e)) + + # determine CSV output destination + if options.mrgfltr_csv is None: + exit('Argument "mrgfltr_csv" is required but not supplied') + try: + output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) + output_file = open(output_filename_csv, "w") + output_file.close() + except Exception as e: + exit("Error parsing mrgfltr_csv argument: %s" % str(e)) + + def mqpep_getswissprot(): + + # + # copied from Excel Output Script.ipynb BEGIN # + # + + # String Constants ################# + DEPHOSPHOPEP = "DephosphoPep" + DESCRIPTION = "Description" + FUNCTION_PHOSPHORESIDUE = ( + "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" + ) + GENE_NAME = "Gene_Name" # Gene Name from UniProtKB + ON_FUNCTION = ( + "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites + ) + ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites + ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites + ON_PROCESS = ( + "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites + ) + ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites + PHOSPHOPEPTIDE = "Phosphopeptide" + PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" + PHOSPHORESIDUE = "Phosphoresidue" + PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" + SEQUENCE = "Sequence" + SEQUENCE10 = "Sequence10" + SEQUENCE7 = "Sequence7" + SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" + UNIPROT_ID = "UniProt_ID" + UNIPROT_SEQ_AND_META_SQL = """ + select Uniprot_ID, Description, Gene_Name, Sequence, + Organism_Name, Organism_ID, PE, SV + from UniProtKB + order by Sequence, UniProt_ID + """ + UNIPROT_UNIQUE_SEQ_SQL = """ + select distinct Sequence + from UniProtKB + group by Sequence + """ + PPEP_PEP_UNIPROTSEQ_SQL = """ + select distinct phosphopeptide, peptide, sequence + from uniprotkb_pep_ppep_view + order by sequence + """ + PPEP_MELT_SQL = """ + SELECT DISTINCT + phospho_peptide AS 'p_peptide', + kinase_map AS 'characterization', + 'X' AS 'X' + FROM ppep_gene_site_view + """ + # CREATE TABLE PSP_Regulatory_site ( + # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, + # domain TEXT, + # ON_FUNCTION TEXT, + # ON_PROCESS TEXT, + # ON_PROT_INTERACT TEXT, + # ON_OTHER_INTERACT TEXT, + # notes TEXT, + # organism TEXT + # ); + PSP_REGSITE_SQL = """ + SELECT DISTINCT + SITE_PLUSMINUS_7AA , + DOMAIN , + ON_FUNCTION , + ON_PROCESS , + ON_PROT_INTERACT , + ON_OTHER_INTERACT , + NOTES , + ORGANISM + FROM PSP_Regulatory_site + """ + PPEP_ID_SQL = """ + SELECT + id AS 'ppep_id', + seq AS 'ppep_seq' + FROM ppep + """ + MRGFLTR_DDL = """ + DROP VIEW IF EXISTS mrgfltr_metadata_view; + DROP TABLE IF EXISTS mrgfltr_metadata; + CREATE TABLE mrgfltr_metadata + ( ppep_id INTEGER REFERENCES ppep(id) + , Sequence10 TEXT + , Sequence7 TEXT + , GeneName TEXT + , Phosphoresidue TEXT + , UniProtID TEXT + , Description TEXT + , FunctionPhosphoresidue TEXT + , PutativeUpstreamDomains TEXT + , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE + ) + ; + CREATE VIEW mrgfltr_metadata_view AS + SELECT DISTINCT + ppep.seq AS phospho_peptide + , Sequence10 + , Sequence7 + , GeneName + , Phosphoresidue + , UniProtID + , Description + , FunctionPhosphoresidue + , PutativeUpstreamDomains + FROM + ppep, mrgfltr_metadata + WHERE + mrgfltr_metadata.ppep_id = ppep.id + ORDER BY + ppep.seq + ; + """ + + CITATION_INSERT_STMT = """ + INSERT INTO Citation ( + ObjectName, + CitationData + ) VALUES (?,?) + """ + CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' + CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' + + MRGFLTR_METADATA_COLUMNS = [ + "ppep_id", + "Sequence10", + "Sequence7", + "GeneName", + "Phosphoresidue", + "UniProtID", + "Description", + "FunctionPhosphoresidue", + "PutativeUpstreamDomains", + ] + + # String Constants (end) ############ + + class Error(Exception): + """Base class for exceptions in this module.""" + + pass + + class PreconditionError(Error): + """Exception raised for errors in the input. + + Attributes: + expression -- input expression in which the error occurred + message -- explanation of the error + """ + + def __init__(self, expression, message): + self.expression = expression + self.message = message + + # start_time = time.clock() #timer + start_time = time.process_time() # timer + + # get keys from upstream tabular file using readline() + # ref: https://stackoverflow.com/a/16713581/15509512 + # answer to "Use codecs to read file with correct encoding" + file1_encoded = open(upstream_map_filename_tab, "rb") + file1 = cx_getreader("latin-1")(file1_encoded) + + count = 0 + upstream_map_p_peptide_list = [] + re_tab = re.compile("^[^\t]*") + while True: + count += 1 + # Get next line from file + line = file1.readline() + # if line is empty + # end of file is reached + if not line: + break + if count > 1: + m = re_tab.match(line) + upstream_map_p_peptide_list.append(m[0]) + file1.close() + file1_encoded.close() + + # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed + re_phos = re.compile("p") + + end_time = time.process_time() # timer + print( + "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), + file=sys.stderr, + ) + + # ----------- Get SwissProt data from SQLite database (start) ----------- + # build UniProt sequence LUT and list of unique SwissProt sequences + + # Open SwissProt SQLite database + conn = sql.connect(uniprot_sqlite) + cur = conn.cursor() + + # Set up structures to hold SwissProt data + + uniprot_Sequence_List = [] + UniProtSeqLUT = {} + + # Execute query for unique seqs without fetching the results yet + uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) + + while 1: + batch = uniprot_unique_seq_cur.fetchmany(size=50) + if not batch: + # handle case where no records are returned + break + for row in batch: + Sequence = row[0] + UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] + UniProtSeqLUT[(Sequence, GENE_NAME)] = [] + UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] + UniProtSeqLUT[Sequence] = [] + + # Execute query for seqs and metadata without fetching the results yet + uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) + + while 1: + batch = uniprot_seq_and_meta.fetchmany(size=50) + if not batch: + # handle case where no records are returned + break + for ( + UniProt_ID, + Description, + Gene_Name, + Sequence, + OS, + OX, + PE, + SV, + ) in batch: + uniprot_Sequence_List.append(Sequence) + UniProtSeqLUT[Sequence] = Sequence + UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) + UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) + if OS != N_A: + Description += " OS=" + OS + if OX != N_A: + Description += " OX=" + str(int(OX)) + if Gene_Name != N_A: + Description += " GN=" + Gene_Name + if PE != N_A: + Description += " PE=" + PE + if SV != N_A: + Description += " SV=" + SV + UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) + + # Close SwissProt SQLite database; clean up local variables + conn.close() + Sequence = "" + UniProt_ID = "" + Description = "" + Gene_Name = "" + + # ----------- Get SwissProt data from SQLite database (finish) ----------- + + end_time = time.process_time() # timer + print( + "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), + file=sys.stderr, + ) + + # ----------- Get SwissProt data from SQLite database (start) ----------- + # Open SwissProt SQLite database + conn = sql.connect(uniprot_sqlite) + cur = conn.cursor() + + # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide + DephosphoPep_UniProtSeq_LUT = {} + + # Set up dictionary to accumulate results + PhosphoPep_UniProtSeq_LUT = {} + + # Execute query for tuples without fetching the results yet + ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) + + while 1: + batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) + if not batch: + # handle case where no records are returned + break + for (phospho_pep, dephospho_pep, sequence) in batch: + # do interesting stuff here... + PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep + PhosphoPep_UniProtSeq_LUT[ + (phospho_pep, DEPHOSPHOPEP) + ] = dephospho_pep + if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: + DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, DESCRIPTION) + ] = [] + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, GENE_NAME) + ] = [] + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ] = [] + DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] + DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) + + if ( + sequence + not in DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, SEQUENCE) + ] + ): + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, SEQUENCE) + ].append(sequence) + for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: + if phospho_pep != phospho_pep: + print( + "phospho_pep:'%s' phospho_pep:'%s'" + % (phospho_pep, phospho_pep) + ) + if phospho_pep not in PhosphoPep_UniProtSeq_LUT: + PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep + PhosphoPep_UniProtSeq_LUT[ + (phospho_pep, DEPHOSPHOPEP) + ] = dephospho_pep + r = list( + zip( + [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], + [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], + [ + s + for s in UniProtSeqLUT[(sequence, DESCRIPTION)] + ], + ) + ) + # Sort by `UniProt_ID` + # ref: https://stackoverflow.com/a/4174955/15509512 + r = sorted(r, key=operator.itemgetter(0)) + # Get one tuple for each `phospho_pep` + # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] + for (upid, gn, desc) in r: + # Append pseudo-tuple per UniProt_ID but only when it is not present + if ( + upid + not in DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ] + ): + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ].append(upid) + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, DESCRIPTION) + ].append(desc) + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, GENE_NAME) + ].append(gn) + + # Close SwissProt SQLite database; clean up local variables + conn.close() + # wipe local variables + phospho_pep = dephospho_pep = sequence = 0 + upid = gn = desc = r = "" + + # ----------- Get SwissProt data from SQLite database (finish) ----------- + + end_time = time.process_time() # timer + print( + "%0.6f finished reading and decoding '%s' [0.4]" + % (end_time - start_time, upstream_map_filename_tab), + file=sys.stderr, + ) + + print( + "{:>10} unique upstream phosphopeptides tested".format( + str(len(upstream_map_p_peptide_list)) + ) + ) + + # Read in Upstream tabular file + # We are discarding the intensity data; so read it as text + upstream_data = pandas.read_table( + upstream_map_filename_tab, dtype="str", index_col=0 + ) + + end_time = time.process_time() # timer + print( + "%0.6f read Upstream Map from file [1g_1]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + upstream_data.index = upstream_map_p_peptide_list + + end_time = time.process_time() # timer + print( + "%0.6f added index to Upstream Map [1g_2]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # trim upstream_data to include only the upstream map columns + old_cols = upstream_data.columns.tolist() + i = 0 + first_intensity = -1 + last_intensity = -1 + intensity_re = re.compile("Intensity.*") + for col_name in old_cols: + m = intensity_re.match(col_name) + if m: + last_intensity = i + if first_intensity == -1: + first_intensity = i + i += 1 + # print('last intensity = %d' % last_intensity) + col_PKCalpha = last_intensity + 2 + + data_in_cols = [old_cols[0]] + old_cols[ + first_intensity: last_intensity + 1 + ] + + if upstream_data.empty: + print("upstream_data is empty") + exit(0) + + data_in = upstream_data.copy(deep=True)[data_in_cols] + + # Convert floating-point integers to int64 integers + # ref: https://stackoverflow.com/a/68497603/15509512 + data_in[list(data_in.columns[1:])] = ( + data_in[list(data_in.columns[1:])] + .astype("float64") + .apply(np.int64) + ) + + # create another phosphopeptide column that will be used to join later; + # MAY need to change depending on Phosphopeptide column position + # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] + data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index + + end_time = time.process_time() # timer + print( + "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # Produce a dictionary of metadata for a single phosphopeptide. + # This is a replacement of `UniProtInfo_subdict` in the original code. + def pseq_to_subdict(phospho_pep): + # Strip "p" from phosphopeptide sequence + dephospho_pep = re_phos.sub("", phospho_pep) + + # Determine number of phosphoresidues in phosphopeptide + numps = len(phospho_pep) - len(dephospho_pep) + + # Determine location(s) of phosphoresidue(s) in phosphopeptide + # (used later for Phosphoresidue, Sequence7, and Sequence10) + ploc = [] # list of p locations + i = 0 + p = phospho_pep + while i < numps: + ploc.append(p.find("p")) + p = p[: p.find("p")] + p[p.find("p") + 1:] + i += 1 + + # Establish nested dictionary + result = {} + result[SEQUENCE] = [] + result[UNIPROT_ID] = [] + result[DESCRIPTION] = [] + result[GENE_NAME] = [] + result[PHOSPHORESIDUE] = [] + result[SEQUENCE7] = [] + result[SEQUENCE10] = [] + + # Add stripped sequence to dictionary + result[SEQUENCE].append(dephospho_pep) + + # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT + # Caller may elect to: + # try: + # ... + # except PreconditionError as pe: + # print("'{expression}': {message}".format( + # expression = pe.expression, + # message = pe.message)) + # ) + # ) + if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: + raise PreconditionError( + dephospho_pep, + "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", + ) + if phospho_pep not in PhosphoPep_UniProtSeq_LUT: + raise PreconditionError( + dephospho_pep, + "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", + ) + if ( + dephospho_pep + != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] + ): + raise PreconditionError( + dephospho_pep, + "dephosphorylated phosphopeptide does not match " + + "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " + + PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)], + ) + result[SEQUENCE] = [dephospho_pep] + result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ] + result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, DESCRIPTION) + ] + result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, GENE_NAME) + ] + if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: + raise PreconditionError( + dephospho_pep, + "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", + ) + UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, SEQUENCE) + ] + if len(UniProtSeqList) < 1: + print( + "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" + % dephospho_pep + ) + # raise PreconditionError( + # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", + # 'value has zero length' + # ) + for UniProtSeq in UniProtSeqList: + i = 0 + phosphoresidues = [] + seq7s_set = set() + seq7s = [] + seq10s_set = set() + seq10s = [] + while i < len(ploc): + start = UniProtSeq.find(dephospho_pep) + psite = ( + start + ploc[i] + ) # location of phosphoresidue on protein sequence + + # add Phosphoresidue + phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1) + phosphoresidues.append(phosphosite) + + # Add Sequence7 + if psite < 7: # phospho_pep at N terminus + seq7 = str(UniProtSeq)[: psite + 8] + if seq7[psite] == "S": # if phosphosresidue is serine + pres = "s" + elif ( + seq7[psite] == "T" + ): # if phosphosresidue is threonine + pres = "t" + elif ( + seq7[psite] == "Y" + ): # if phosphoresidue is tyrosine + pres = "y" + else: # if not pSTY + pres = "?" + seq7 = ( + seq7[:psite] + pres + seq7[psite + 1: psite + 8] + ) + while ( + len(seq7) < 15 + ): # add appropriate number of "_" to the front + seq7 = "_" + seq7 + elif ( + len(UniProtSeq) - psite < 8 + ): # phospho_pep at C terminus + seq7 = str(UniProtSeq)[psite - 7:] + if seq7[7] == "S": + pres = "s" + elif seq7[7] == "T": + pres = "t" + elif seq7[7] == "Y": + pres = "y" + else: + pres = "?" + seq7 = seq7[:7] + pres + seq7[8:] + while ( + len(seq7) < 15 + ): # add appropriate number of "_" to the back + seq7 = seq7 + "_" + else: + seq7 = str(UniProtSeq)[psite - 7: psite + 8] + pres = "" # phosphoresidue + if seq7[7] == "S": # if phosphosresidue is serine + pres = "s" + elif seq7[7] == "T": # if phosphosresidue is threonine + pres = "t" + elif seq7[7] == "Y": # if phosphoresidue is tyrosine + pres = "y" + else: # if not pSTY + pres = "?" + seq7 = seq7[:7] + pres + seq7[8:] + if seq7 not in seq7s_set: + seq7s.append(seq7) + seq7s_set.add(seq7) + + # add Sequence10 + if psite < 10: # phospho_pep at N terminus + seq10 = ( + str(UniProtSeq)[:psite] + + "p" + + str(UniProtSeq)[psite: psite + 11] + ) + elif ( + len(UniProtSeq) - psite < 11 + ): # phospho_pep at C terminus + seq10 = ( + str(UniProtSeq)[psite - 10: psite] + + "p" + + str(UniProtSeq)[psite:] + ) + else: + seq10 = str(UniProtSeq)[psite - 10: psite + 11] + seq10 = seq10[:10] + "p" + seq10[10:] + if seq10 not in seq10s_set: + seq10s.append(seq10) + seq10s_set.add(seq10) + + i += 1 + + result[PHOSPHORESIDUE].append(phosphoresidues) + result[SEQUENCE7].append(seq7s) + # result[SEQUENCE10] is a list of lists of strings + result[SEQUENCE10].append(seq10s) + + r = list( + zip( + result[UNIPROT_ID], + result[GENE_NAME], + result[DESCRIPTION], + result[PHOSPHORESIDUE], + ) + ) + # Sort by `UniProt_ID` + # ref: https://stackoverflow.com//4174955/15509512 + s = sorted(r, key=operator.itemgetter(0)) + + result[UNIPROT_ID] = [] + result[GENE_NAME] = [] + result[DESCRIPTION] = [] + result[PHOSPHORESIDUE] = [] + + for r in s: + result[UNIPROT_ID].append(r[0]) + result[GENE_NAME].append(r[1]) + result[DESCRIPTION].append(r[2]) + result[PHOSPHORESIDUE].append(r[3]) + + # convert lists to strings in the dictionary + for key, value in result.items(): + if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: + result[key] = "; ".join(map(str, value)) + elif key in [SEQUENCE10]: + # result[SEQUENCE10] is a list of lists of strings + joined_value = "" + joined_set = set() + sep = "" + for valL in value: + # valL is a list of strings + for val in valL: + # val is a string + if val not in joined_set: + joined_set.add(val) + # joined_value += sep + '; '.join(map(str, val)) + joined_value += sep + val + sep = "; " + # joined_value is a string + result[key] = joined_value + + newstring = "; ".join( + [", ".join(prez) for prez in result[PHOSPHORESIDUE]] + ) + # #separate the isoforms in PHOSPHORESIDUE column with ";" + # oldstring = result[PHOSPHORESIDUE] + # oldlist = list(oldstring) + # newstring = "" + # i = 0 + # for e in oldlist: + # if e == ";": + # if numps > 1: + # if i%numps: + # newstring = newstring + ";" + # else: + # newstring = newstring + "," + # else: + # newstring = newstring + ";" + # i +=1 + # else: + # newstring = newstring + e + result[PHOSPHORESIDUE] = newstring + + # separate sequence7's by | + oldstring = result[SEQUENCE7] + oldlist = oldstring + newstring = "" + for ol in oldlist: + for e in ol: + if e == ";": + newstring = newstring + " |" + elif len(newstring) > 0 and 1 > newstring.count(e): + newstring = newstring + " | " + e + elif 1 > newstring.count(e): + newstring = newstring + e + result[SEQUENCE7] = newstring + + return [phospho_pep, result] + + # Construct list of [string, dictionary] lists + # where the dictionary provides the SwissProt metadata for a phosphopeptide + result_list = [ + catch(pseq_to_subdict, psequence) + for psequence in data_in[PHOSPHOPEPTIDE_MATCH] + ] + + end_time = time.process_time() # timer + print( + "%0.6f added SwissProt annotations to phosphopeptides [B]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # Construct dictionary from list of lists + # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ + UniProt_Info = { + result[0]: result[1] + for result in result_list + if result is not None + } + + end_time = time.process_time() # timer + print( + "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # cosmetic: add N_A to phosphopeptide rows with no hits + p_peptide_list = [] + for key in UniProt_Info: + p_peptide_list.append(key) + for nestedKey in UniProt_Info[key]: + if UniProt_Info[key][nestedKey] == "": + UniProt_Info[key][nestedKey] = N_A + + end_time = time.process_time() # timer + print( + "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # convert UniProt_Info dictionary to dataframe + uniprot_df = pandas.DataFrame.transpose( + pandas.DataFrame.from_dict(UniProt_Info) + ) + + # reorder columns to match expected output file + uniprot_df[ + PHOSPHOPEPTIDE + ] = uniprot_df.index # make index a column too + + cols = uniprot_df.columns.tolist() + # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] + # uniprot_df = uniprot_df[cols] + uniprot_df = uniprot_df[ + [ + PHOSPHOPEPTIDE, + SEQUENCE10, + SEQUENCE7, + GENE_NAME, + PHOSPHORESIDUE, + UNIPROT_ID, + DESCRIPTION, + ] + ] + + end_time = time.process_time() # timer + print( + "%0.6f reordered columns to match expected output file [1]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # concat to split then groupby to collapse + seq7_df = pandas.concat( + [ + pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | ")) + for _, row in uniprot_df.iterrows() + ] + ).reset_index() + seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] + + # --- -------------- begin read PSP_Regulatory_sites --------------------------------- + # read in PhosphoSitePlus Regulatory Sites dataset + # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- + conn = sql.connect(uniprot_sqlite) + regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) + # Close SwissProt SQLite database + conn.close() + # ... -------------- end read PSP_Regulatory_sites ------------------------------------ + + # keep only the human entries in dataframe + if len(species) > 0: + print( + 'Limit PhosphoSitesPlus records to species "' + species + '"' + ) + regsites_df = regsites_df[regsites_df.ORGANISM == species] + + # merge the seq7 df with the regsites df based off of the sequence7 + merge_df = seq7_df.merge( + regsites_df, + left_on=SEQUENCE7, + right_on=SITE_PLUSMINUS_7AA_SQL, + how="left", + ) + + # after merging df, select only the columns of interest - note that PROTEIN is absent here + merge_df = merge_df[ + [ + PHOSPHOPEPTIDE, + SEQUENCE7, + ON_FUNCTION, + ON_PROCESS, + ON_PROT_INTERACT, + ON_OTHER_INTERACT, + ON_NOTES, + ] + ] + # combine column values of interest into one FUNCTION_PHOSPHORESIDUE column" + merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat( + merge_df[ON_PROCESS], sep="; ", na_rep="" + ) + merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ + FUNCTION_PHOSPHORESIDUE + ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") + merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ + FUNCTION_PHOSPHORESIDUE + ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") + merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ + FUNCTION_PHOSPHORESIDUE + ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") + + # remove the columns that were combined + merge_df = merge_df[ + [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE] + ] + + end_time = time.process_time() # timer + print( + "%0.6f merge regsite metadata [1a]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # cosmetic changes to Function Phosphoresidue column + fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) + + end_time = time.process_time() # timer + print( + "%0.6f more cosmetic changes [1b]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + i = 0 + while i < len(fp_series): + # remove the extra ";" so that it looks more professional + if fp_series[i] == "; ; ; ; ": # remove ; from empty hits + fp_series[i] = "" + while fp_series[i].endswith("; "): # remove ; from the ends + fp_series[i] = fp_series[i][:-2] + while fp_series[i].startswith("; "): # remove ; from the beginning + fp_series[i] = fp_series[i][2:] + fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") + fp_series[i] = fp_series[i].replace("; ; ; ", "; ") + fp_series[i] = fp_series[i].replace("; ; ", "; ") + + # turn blanks into N_A to signify the info was searched for but cannot be found + if fp_series[i] == "": + fp_series[i] = N_A + + i += 1 + merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series + + end_time = time.process_time() # timer + print( + "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # merge uniprot df with merge df + uniprot_regsites_merged_df = uniprot_df.merge( + merge_df, + left_on=PHOSPHOPEPTIDE, + right_on=PHOSPHOPEPTIDE, + how="left", + ) + + # collapse the merged df + uniprot_regsites_collapsed_df = pandas.DataFrame( + uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[ + FUNCTION_PHOSPHORESIDUE + ].apply(lambda x: ppep_join(x)) + ) + # .apply(lambda x: "%s" % ' | '.join(x))) + + end_time = time.process_time() # timer + print( + "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + uniprot_regsites_collapsed_df[ + PHOSPHOPEPTIDE + ] = ( + uniprot_regsites_collapsed_df.index + ) # add df index as its own column + + # rename columns + uniprot_regsites_collapsed_df.columns = [ + FUNCTION_PHOSPHORESIDUE, + "ppp", + ] + + end_time = time.process_time() # timer + print( + "%0.6f selected columns to be merged to uniprot_df [1e]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # add columns based on Sequence7 matching site_+/-7_AA + uniprot_regsite_df = pandas.merge( + left=uniprot_df, + right=uniprot_regsites_collapsed_df, + how="left", + left_on=PHOSPHOPEPTIDE, + right_on="ppp", + ) + + end_time = time.process_time() # timer + print( + "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + data_in.rename( + {"Protein description": PHOSPHOPEPTIDE}, + axis="columns", + inplace=True, + ) + + # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') + res2 = sorted( + data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() + ) + data_in = data_in.loc[res2] + + end_time = time.process_time() # timer + print( + "%0.6f sorting time [1f]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:] + upstream_data = upstream_data[cols] + + end_time = time.process_time() # timer + print( + "%0.6f refactored columns for Upstream Map [1g]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # #rename upstream columns in new list + # new_cols = [] + # for name in cols: + # if "_NetworKIN" in name: + # name = name.split("_")[0] + # if " motif" in name: + # name = name.split(" motif")[0] + # if " sequence " in name: + # name = name.split(" sequence")[0] + # if "_Phosida" in name: + # name = name.split("_")[0] + # if "_PhosphoSite" in name: + # name = name.split("_")[0] + # new_cols.append(name) + + # rename upstream columns in new list + def col_rename(name): + if "_NetworKIN" in name: + name = name.split("_")[0] + if " motif" in name: + name = name.split(" motif")[0] + if " sequence " in name: + name = name.split(" sequence")[0] + if "_Phosida" in name: + name = name.split("_")[0] + if "_PhosphoSite" in name: + name = name.split("_")[0] + return name + + new_cols = [col_rename(col) for col in cols] + upstream_data.columns = new_cols + + end_time = time.process_time() # timer + print( + "%0.6f renamed columns for Upstream Map [1h_1]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # Create upstream_data_cast as a copy of upstream_data + # but with first column substituted by the phosphopeptide sequence + upstream_data_cast = upstream_data.copy() + new_cols_cast = new_cols + new_cols_cast[0] = "p_peptide" + upstream_data_cast.columns = new_cols_cast + upstream_data_cast["p_peptide"] = upstream_data.index + + # --- -------------- begin read upstream_data_melt ------------------------------------ + # ----------- Get melted kinase mapping data from SQLite database (start) ----------- + conn = sql.connect(uniprot_sqlite) + upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) + # Close SwissProt SQLite database + conn.close() + upstream_data_melt = upstream_data_melt_df.copy() + upstream_data_melt.columns = ["p_peptide", "characterization", "X"] + upstream_data_melt["characterization"] = [ + col_rename(s) for s in upstream_data_melt["characterization"] + ] + + print( + "%0.6f upstream_data_melt_df initially has %d rows" + % (end_time - start_time, len(upstream_data_melt.axes[0])), + file=sys.stderr, + ) + # ref: https://stackoverflow.com/a/27360130/15509512 + # e.g. df.drop(df[df.score < 50].index, inplace=True) + upstream_data_melt.drop( + upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True + ) + print( + "%0.6f upstream_data_melt_df pre-dedup has %d rows" + % (end_time - start_time, len(upstream_data_melt.axes[0])), + file=sys.stderr, + ) + # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- + # ... -------------- end read upstream_data_melt -------------------------------------- + + end_time = time.process_time() # timer + print( + "%0.6f melted and minimized Upstream Map dataframe [1h_2]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + # ... end read upstream_data_melt + + end_time = time.process_time() # timer + print( + "%0.6f indexed melted Upstream Map [1h_2a]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + upstream_delta_melt_LoL = upstream_data_melt.values.tolist() + + melt_dict = {} + for key in upstream_map_p_peptide_list: + melt_dict[key] = [] + + for el in upstream_delta_melt_LoL: + (p_peptide, characterization, X) = tuple(el) + if p_peptide in melt_dict: + melt_dict[p_peptide].append(characterization) + else: + exit( + 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' + % (p_peptide) + ) + + end_time = time.process_time() # timer + print( + "%0.6f appended peptide characterizations [1h_2b]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # for key in upstream_map_p_peptide_list: + # melt_dict[key] = ' | '.join(melt_dict[key]) + + for key in upstream_map_p_peptide_list: + melt_dict[key] = melt_join(melt_dict[key]) + + end_time = time.process_time() # timer + print( + "%0.6f concatenated multiple characterizations [1h_2c]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # map_dict is a dictionary of dictionaries + map_dict = {} + for key in upstream_map_p_peptide_list: + map_dict[key] = {} + map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] + + end_time = time.process_time() # timer + print( + "%0.6f instantiated map dictionary [2]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # convert map_dict to dataframe + map_df = pandas.DataFrame.transpose( + pandas.DataFrame.from_dict(map_dict) + ) + map_df["p-peptide"] = map_df.index # make index a column too + cols_map_df = map_df.columns.tolist() + cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] + map_df = map_df[cols_map_df] + + # join map_df to uniprot_regsite_df + output_df = uniprot_regsite_df.merge( + map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide" + ) + + output_df = output_df[ + [ + PHOSPHOPEPTIDE, + SEQUENCE10, + SEQUENCE7, + GENE_NAME, + PHOSPHORESIDUE, + UNIPROT_ID, + DESCRIPTION, + FUNCTION_PHOSPHORESIDUE, + PUTATIVE_UPSTREAM_DOMAINS, + ] + ] + + # cols_output_prelim = output_df.columns.tolist() + # + # print("cols_output_prelim") + # print(cols_output_prelim) + # + # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] + # + # print("cols_output with p-peptide") + # print(cols_output) + # + # cols_output = [col for col in cols_output if not col == "p-peptide"] + # + # print("cols_output") + # print(cols_output) + # + # output_df = output_df[cols_output] + + # join output_df back to quantitative columns in data_in df + quant_cols = data_in.columns.tolist() + quant_cols = quant_cols[1:] + quant_data = data_in[quant_cols] + + # ----------- Write merge/filter metadata to SQLite database (start) ----------- + # Open SwissProt SQLite database + conn = sql.connect(output_sqlite) + cur = conn.cursor() + + cur.executescript(MRGFLTR_DDL) + + cur.execute( + CITATION_INSERT_STMT, + ("mrgfltr_metadata_view", CITATION_INSERT_PSP), + ) + cur.execute( + CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP) + ) + cur.execute( + CITATION_INSERT_STMT, + ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF), + ) + cur.execute( + CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF) + ) + + # Read ppep-to-sequence LUT + ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) + # write only metadata for merged/filtered records to SQLite + mrgfltr_metadata_df = output_df.copy() + # replace phosphopeptide seq with ppep.id + mrgfltr_metadata_df = ppep_lut_df.merge( + mrgfltr_metadata_df, + left_on="ppep_seq", + right_on=PHOSPHOPEPTIDE, + how="inner", + ) + mrgfltr_metadata_df.drop( + columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True + ) + # rename columns + mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS + mrgfltr_metadata_df.to_sql( + "mrgfltr_metadata", + con=conn, + if_exists="append", + index=False, + method="multi", + ) + + # Close SwissProt SQLite database + conn.close() + # ----------- Write merge/filter metadata to SQLite database (finish) ----------- + + output_df = output_df.merge( + quant_data, + how="right", + left_on=PHOSPHOPEPTIDE, + right_on=PHOSPHOPEPTIDE_MATCH, + ) + output_cols = output_df.columns.tolist() + output_cols = output_cols[:-1] + output_df = output_df[output_cols] + + # cosmetic changes to Upstream column + output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ + PUTATIVE_UPSTREAM_DOMAINS + ].fillna( + "" + ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping + us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) + i = 0 + while i < len(us_series): + # turn blanks into N_A to signify the info was searched for but cannot be found + if us_series[i] == "": + us_series[i] = N_A + i += 1 + output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series + + end_time = time.process_time() # timer + print( + "%0.6f establisheed output [3]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + (output_rows, output_cols) = output_df.shape + + output_df = output_df.convert_dtypes(convert_integer=True) + + # Output onto Final CSV file + output_df.to_csv(output_filename_csv, index=False) + output_df.to_csv( + output_filename_tab, quoting=None, sep="\t", index=False + ) + + end_time = time.process_time() # timer + print( + "%0.6f wrote output [4]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + print( + "{:>10} phosphopeptides written to output".format(str(output_rows)) + ) + + end_time = time.process_time() # timer + print( + "%0.6f seconds of non-system CPU time were consumed" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # Rev. 7/1/2016 + # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's + # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS + # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ + # read from SwissProt SQLite database + # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper + + # + # copied from Excel Output Script.ipynb END # + # + + try: + catch( + mqpep_getswissprot, + ) + exit(0) + except Exception as e: + exit("Internal error running mqpep_getswissprot(): %s" % (e)) + + +if __name__ == "__main__": + __main__()