Mercurial > repos > eschen42 > mqppep_anova
view search_ppep.py @ 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author | eschen42 |
---|---|
date | Mon, 07 Mar 2022 19:05:01 +0000 |
parents | |
children | d4d531006735 |
line wrap: on
line source
#!/usr/bin/env python # Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB import argparse import os.path import sqlite3 import re from codecs import getreader as cx_getreader import time # For Aho-Corasick search for fixed set of substrings # - add_word # - make_automaton # - iter import ahocorasick # Support map over auto.iter(...) # - itemgetter import operator #import hashlib # ref: https://stackoverflow.com/a/8915613/15509512 # answers: "How to handle exceptions in a list comprehensions" # usage: # from math import log # eggs = [1,3,0,3,2] # print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) # producing: # for <built-in function log> # with args (0,) # exception: math domain error # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] def catch(func, *args, handle=lambda e : e, **kwargs): try: return func(*args, **kwargs) except Exception as e: print("For %s" % str(func)) print(" with args %s" % str(args)) print(" caught exception: %s" % str(e)) (ty, va, tb) = sys.exc_info() print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) #exit(-1) return None # was handle(e) def __main__(): ITEM_GETTER = operator.itemgetter(1) DROP_TABLES_SQL = ''' DROP VIEW IF EXISTS ppep_gene_site_view; DROP VIEW IF EXISTS uniprot_view; DROP VIEW IF EXISTS uniprotkb_pep_ppep_view; DROP VIEW IF EXISTS ppep_intensity_view; DROP VIEW IF EXISTS ppep_metadata_view; DROP TABLE IF EXISTS sample; DROP TABLE IF EXISTS ppep; DROP TABLE IF EXISTS site_type; DROP TABLE IF EXISTS deppep_UniProtKB; DROP TABLE IF EXISTS deppep; DROP TABLE IF EXISTS ppep_gene_site; DROP TABLE IF EXISTS ppep_metadata; DROP TABLE IF EXISTS ppep_intensity; ''' CREATE_TABLES_SQL = ''' CREATE TABLE deppep ( id INTEGER PRIMARY KEY , seq TEXT UNIQUE ON CONFLICT IGNORE ) ; CREATE TABLE deppep_UniProtKB ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE , pos_start INTEGER , pos_end INTEGER , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end) ON CONFLICT IGNORE ) ; CREATE TABLE ppep ( id INTEGER PRIMARY KEY , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE , seq TEXT UNIQUE ON CONFLICT IGNORE , scrubbed TEXT ); CREATE TABLE site_type ( id INTEGER PRIMARY KEY , type_name TEXT UNIQUE ON CONFLICT IGNORE ); CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed) ; CREATE TABLE sample ( id INTEGER PRIMARY KEY , name TEXT UNIQUE ON CONFLICT IGNORE ) ; CREATE VIEW uniprot_view AS SELECT DISTINCT Uniprot_ID , Description , Organism_Name , Organism_ID , Gene_Name , PE , SV , Sequence , Description || ' OS=' || Organism_Name || ' OX=' || Organism_ID || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END AS long_description , Database FROM UniProtKB ; CREATE VIEW uniprotkb_pep_ppep_view AS SELECT deppep_UniProtKB.UniprotKB_ID AS accession , deppep_UniProtKB.pos_start AS pos_start , deppep_UniProtKB.pos_end AS pos_end , deppep.seq AS peptide , ppep.seq AS phosphopeptide , ppep.scrubbed AS scrubbed , uniprot_view.Sequence AS sequence , uniprot_view.Description AS description , uniprot_view.long_description AS long_description , ppep.id AS ppep_id FROM ppep, deppep, deppep_UniProtKB, uniprot_view WHERE deppep.id = ppep.deppep_id AND deppep.id = deppep_UniProtKB.deppep_id AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID ORDER BY UniprotKB_ID, deppep.seq, ppep.seq ; CREATE TABLE ppep_gene_site ( ppep_id INTEGER REFERENCES ppep(id) , gene_names TEXT , site_type_id INTEGER REFERENCES site_type(id) , kinase_map TEXT , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE ) ; CREATE VIEW ppep_gene_site_view AS SELECT DISTINCT ppep.seq AS phospho_peptide , ppep_id , gene_names , type_name , kinase_map FROM ppep, ppep_gene_site, site_type WHERE ppep_gene_site.ppep_id = ppep.id AND ppep_gene_site.site_type_id = site_type.id ORDER BY ppep.seq ; CREATE TABLE ppep_metadata ( ppep_id INTEGER REFERENCES ppep(id) , protein_description TEXT , gene_name TEXT , FASTA_name TEXT , phospho_sites TEXT , motifs_unique TEXT , accessions TEXT , motifs_all_members TEXT , domain TEXT , ON_FUNCTION TEXT , ON_PROCESS TEXT , ON_PROT_INTERACT TEXT , ON_OTHER_INTERACT TEXT , notes TEXT , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE ) ; CREATE VIEW ppep_metadata_view AS SELECT DISTINCT ppep.seq AS phospho_peptide , protein_description , gene_name , FASTA_name , phospho_sites , motifs_unique , accessions , motifs_all_members , domain , ON_FUNCTION , ON_PROCESS , ON_PROT_INTERACT , ON_OTHER_INTERACT , notes FROM ppep, ppep_metadata WHERE ppep_metadata.ppep_id = ppep.id ORDER BY ppep.seq ; CREATE TABLE ppep_intensity ( ppep_id INTEGER REFERENCES ppep(id) , sample_id INTEGER , intensity INTEGER , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE ) ; CREATE VIEW ppep_intensity_view AS SELECT DISTINCT ppep.seq AS phospho_peptide , sample.name AS sample , intensity FROM ppep, sample, ppep_intensity WHERE ppep_intensity.sample_id = sample.id AND ppep_intensity.ppep_id = ppep.id ; ''' UNIPROT_SEQ_AND_ID_SQL = ''' select Sequence, Uniprot_ID from UniProtKB ''' # Parse Command Line parser = argparse.ArgumentParser( description='Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB).' ) # inputs: # Phosphopeptide data for experimental results, including the intensities # and the mapping to kinase domains, in tabular format. parser.add_argument( '--phosphopeptides', '-p', nargs=1, required=True, dest='phosphopeptides', help='Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool' ) parser.add_argument( '--uniprotkb', '-u', nargs=1, required=True, dest='uniprotkb', help='UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool' ) parser.add_argument( '--schema', action='store_true', dest='db_schema', help='show updated database schema' ) parser.add_argument( '--warn-duplicates', action='store_true', dest='warn_duplicates', help='show warnings for duplicated sequences' ) parser.add_argument( '--verbose', action='store_true', dest='verbose', help='show somewhat verbose program tracing' ) # "Make it so!" (parse the arguments) options = parser.parse_args() if options.verbose: print("options: " + str(options) + "\n") # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file if options.phosphopeptides is None: exit('Argument "phosphopeptides" is required but not supplied') try: f_name = os.path.abspath(options.phosphopeptides[0]) except Exception as e: exit('Error parsing phosphopeptides argument: %s' % (e)) # path to SQLite input/output tabular file if options.uniprotkb is None: exit('Argument "uniprotkb" is required but not supplied') try: db_name = os.path.abspath(options.uniprotkb[0]) except Exception as e: exit('Error parsing uniprotkb argument: %s' % (e)) # print("options.schema is %d" % options.db_schema) # db_name = "demo/test.sqlite" # f_name = "demo/test_input.txt" con = sqlite3.connect(db_name) cur = con.cursor() ker = con.cursor() cur.executescript(DROP_TABLES_SQL) # if options.db_schema: # print("\nAfter dropping tables/views that are to be created, schema is:") # cur.execute("SELECT * FROM sqlite_schema") # for row in cur.fetchall(): # if row[4] is not None: # print("%s;" % row[4]) cur.executescript(CREATE_TABLES_SQL) if options.db_schema: print("\nAfter creating tables/views that are to be created, schema is:") cur.execute("SELECT * FROM sqlite_schema") for row in cur.fetchall(): if row[4] is not None: print("%s;" % row[4]) def generate_ppep(f): #get keys from upstream tabular file using readline() # ref: https://stackoverflow.com/a/16713581/15509512 # answer to "Use codecs to read file with correct encoding" file1_encoded = open(f, 'rb') file1 = cx_getreader("latin-1")(file1_encoded) count = 0 re_tab = re.compile('^[^\t]*') re_quote = re.compile('"') while True: count += 1 # Get next line from file line = file1.readline() # if line is empty # end of file is reached if not line: break if count > 1: m = re_tab.match(line) m = re_quote.sub('',m[0]) yield m file1.close() file1_encoded.close() # Build an Aho-Corasick automaton from a trie # - ref: # - https://pypi.org/project/pyahocorasick/ # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm # - https://en.wikipedia.org/wiki/Trie auto = ahocorasick.Automaton() re_phos = re.compile('p') # scrub out unsearchable characters per section # "Match the p_peptides to the @sequences array:" # of the original # PhosphoPeptide Upstream Kinase Mapping.pl # which originally read # $tmp_p_peptide =~ s/#//g; # $tmp_p_peptide =~ s/\d//g; # $tmp_p_peptide =~ s/\_//g; # $tmp_p_peptide =~ s/\.//g; # re_scrub = re.compile('0-9_.#') ppep_count = 0 for ppep in generate_ppep(f_name): ppep_count += 1 add_to_trie = False #print(ppep) scrubbed = re_scrub.sub('',ppep) deppep = re_phos.sub('',scrubbed) if options.verbose: print("deppep: %s; scrubbed: %s" % (deppep,scrubbed)) #print(deppep) cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) if cur.fetchone() is None: add_to_trie = True cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,)) cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) deppep_id = cur.fetchone()[0] if add_to_trie: #print((deppep_id, deppep)) # Build the trie auto.add_word(deppep, (deppep_id, deppep)) cur.execute( "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)", (ppep, scrubbed, deppep_id) ) # def generate_deppep(): # cur.execute("SELECT seq FROM deppep") # for row in cur.fetchall(): # yield row[0] cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)") for row in cur.fetchall(): deppep_count = row[0] cur.execute("SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)") for row in cur.fetchall(): sequence_count = row[0] print( "%d phosphopeptides were read from input" % ppep_count ) print( "%d corresponding dephosphopeptides are represented in input" % deppep_count ) # Look for cases where both Gene_Name and Sequence are identical cur.execute(''' SELECT Uniprot_ID, Gene_Name, Sequence FROM UniProtKB WHERE Sequence IN ( SELECT Sequence FROM UniProtKB GROUP BY Sequence, Gene_Name HAVING count(*) > 1 ) ORDER BY Sequence ''') duplicate_count = 0 old_seq = '' for row in cur.fetchall(): if duplicate_count == 0: print("\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column).") if row[2] != old_seq: old_seq = row[2] duplicate_count += 1 if options.warn_duplicates: print("\n%s\t%s\t%s" % row) else: if options.warn_duplicates: print("%s\t%s" % (row[0], row[1])) if duplicate_count > 0: print("\n%d sequences have duplicated accession IDs\n" % duplicate_count) print( "%s accession sequences will be searched\n" % sequence_count ) #print(auto.dump()) # Convert the trie to an automaton (a finite-state machine) auto.make_automaton() # Execute query for seqs and metadata without fetching the results yet uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL) while batch := uniprot_seq_and_id.fetchmany(size=50): if None == batch: break for Sequence, UniProtKB_id in batch: if Sequence is not None: for end_index, (insert_order, original_value) in auto.iter(Sequence): ker.execute(''' INSERT INTO deppep_UniProtKB (deppep_id,UniProtKB_id,pos_start,pos_end) VALUES (?,?,?,?) ''', ( insert_order, UniProtKB_id, 1 + end_index - len(original_value), end_index ) ) else: raise ValueError("UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" % (UniProtKB_id,)) ker.execute(""" SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' FROM uniprotkb_pep_ppep_view """ ) for row in ker.fetchall(): print(row[0]) ker.execute(""" SELECT count(*) || ' accession matches were found', count(*) AS accession_count FROM ( SELECT accession FROM uniprotkb_pep_ppep_view GROUP BY accession ) """ ) for row in ker.fetchall(): print(row[0]) accession_count = row[1] ker.execute(""" SELECT count(*) || ' peptide matches were found' FROM ( SELECT peptide FROM uniprotkb_pep_ppep_view GROUP BY peptide ) """ ) for row in ker.fetchall(): print(row[0]) ker.execute(""" SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count FROM ( SELECT phosphopeptide FROM uniprotkb_pep_ppep_view GROUP BY phosphopeptide ) """ ) for row in ker.fetchall(): print(row[0]) phosphopeptide_count = row[1] con.commit() ker.execute('vacuum') con.close() if __name__ == "__main__": wrap_start_time = time.perf_counter() __main__() wrap_stop_time = time.perf_counter() # print(wrap_start_time) # print(wrap_stop_time) print("\nThe matching process took %d milliseconds to run.\n" % ((wrap_stop_time - wrap_start_time)*1000),) # vim: sw=4 ts=4 et ai :