changeset 17:ba5f14c2a4af draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit aea9ac5a98069c3c993bd7903eda732f8ae2109d"
author eschen42
date Wed, 06 Apr 2022 05:47:34 +0000
parents d9906288cc6a
children 196b84357e7e
files PhosphoPeptide_Upstream_Kinase_Mapping.pl macros.xml mqppep_mrgfltr.py search_ppep.py
diffstat 4 files changed, 2193 insertions(+), 2116 deletions(-) [+]
line wrap: on
line diff
--- a/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Tue Apr 05 01:47:37 2022 +0000
+++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl	Wed Apr 06 05:47:34 2022 +0000
@@ -1,2095 +1,2157 @@
-#!/usr/local/bin/perl
-###############################################################################################################################
-#    perl Kinase_enrichment_analysis_complete_v0.pl
-#
-#    Nick Graham, USC
-#    2016-02-27
-#
-#    Built from scripts written by NG at UCLA in Tom Graeber's lab:
-#        CombinePhosphoSites.pl
-#        Retrieve_p_motifs.pl
-#        NetworKIN_Motif_Finder_v7.pl
-#
-#    Given a list of phospho-peptides, find protein information and upstream kinases.
-#    Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
-#
-#    Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab:
-#        Added warnings and used strict;
-#        fixed some code paths resulting in more NetworKIN matches;
-#        applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow)
-#        to speed up "Match the non_p_peptides to the @sequences array";
-#        added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data;
-#        added support for SQLite output in addition to tabular files.
-#
-#
-###############################################################################################################################
-
-use strict;
-use warnings 'FATAL' => 'all';
-
-use Getopt::Std;
-use DBD::SQLite::Constants qw/:file_open/;
-use DBI qw(:sql_types);
-use File::Copy;
-use File::Basename;
-use POSIX qw(strftime);
-use Time::HiRes qw(gettimeofday);
-#use Data::Dump qw(dump);
-
-my $USE_SEARCH_PPEP_PY = 1;
-#my $FAILED_MATCH_SEQ = "Failed match";
-my $FAILED_MATCH_SEQ = 'No Sequence';
-my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name';
-
-my $dirname = dirname(__FILE__);
-my %opts;
-my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type);
-my $dbtype;
-my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);
-my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);
-my $line = 0;
-my @failed_match = ($FAILED_MATCH_SEQ);
-my @failed_matches;
-my (%all_data);
-my (@p_peptides, @non_p_peptides);
-my @parsed_fasta;
-my (@accessions, @names, @sequences, @databases, $database);
-my ($dbfile, $dbh, $stmth);
-my @col_names;
-my (%matched_sequences, %accessions,     %names,     %sites,   );
-my (@tmp_matches,       @tmp_accessions, @tmp_names, @tmp_sites);
-my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
-my (@kinases_observed, $kinases);
-my (@kinases_observed_lbl, @phosphosites_observed_lbl);
-my ($p_sequence_kinase, $p_sequence, $kinase);
-my (@motif_sequence, %motif_type, %motif_count);
-my (@kinases_PhosphoSite, $kinases_PhosphoSite);
-my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
-my (%regulatory_sites_PhosphoSite_hash);
-my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism);
-my (%unique_motifs);
-my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches);
-my %psp_regsite_protein_2;
-my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2);
-my @timeData;
-my $PhosphoSitePlusCitation;
-my %site_description;
-
-my %kinase_substrate_NetworKIN_matches;
-my %kinase_motif_matches;
-my $regulatory_sites_PhosphoSite;
-my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2);
-my %kinase_substrate_PhosphoSite_matches;
-my @formatted_sequence;
-my $pSTY_sequence;
-my $i;
-my @a;
-my $use_sqlite;
-my $verbose;
-
-##########
-## opts ##
-##########
-  ## input files
-    # i : path to input file, e.g., 'outputfile_STEP2.txt'
-    # f : path to UniProtKB/SwissProt FASTA
-    # s : optional species argument
-    # n : path to NetworKIN_201612_cutoffscore2.0.txt
-    # m : path to pSTY_Motifs.txt
-    # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
-    # r : path to 2017-03_PSP_Regulatory_sites.txt
-  ## options
-    # P : phospho_type
-    # F : function
-    # v : verbose output
-  ## output files
-    # o : path to output file
-    # O : path to "melted" output file
-    # D : path to output SQLite file
-
-sub usage()
-    {
-        print STDERR <<"EOH";
-    This program given a list of phospho-peptides, finds protein information and upstream kinases.
-    usage: $0 [-hvd] -f FASTA_file
-     -h : this (help) message
-     -v : slightly verbose
-     -a : use SQLite less
-     ## input files
-     -i : path to input file, e.g., 'outputfile_STEP2.txt'
-     -f : path to UniProtDB/SwissProt FASTA
-     -s : optional species filter argument for PSP records; defaults to 'human'
-     -n : path to NetworKIN_201612_cutoffscore2.0.txt
-     -m : path to pSTY_Motifs.txt
-     -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
-     -r : path to 2017-03_PSP_Regulatory_sites.txt
-     ## options
-     -P : phospho_type
-     -F : function
-     ## output files
-     -o : path to output file
-     -O : path to "melted" output file
-     -D : path to output SQLite file
-    example: $0
-EOH
-        exit;
-    }
-
-sub format_localtime_iso8601 {
-    # ref: https://perldoc.perl.org/Time::HiRes
-    my ($seconds, $microseconds) = gettimeofday;
-    # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html
-    return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000);
-}
-
-sub replace_pSpTpY {
-    my ($formatted_sequence, $phospho_type) = @_;
-    if ($phospho_type eq 'y') {
-        $formatted_sequence =~ s/pS/S/g;
-        $formatted_sequence =~ s/pT/T/g;
-        $formatted_sequence =~ s/pY/y/g;
-        }
-    elsif ($phospho_type eq "sty") {
-        $formatted_sequence =~ s/pS/s/g;
-        $formatted_sequence =~ s/pT/t/g;
-        $formatted_sequence =~ s/pY/y/g;
-        }
-    $formatted_sequence;
-}
-
-sub pseudo_sed()
-{
-    # Comments give the sed equivalent
-    my $s;
-    # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; };
-    unless (m / GN=/s)
-    {
-        $s = s :(OX=[^ \t]*):${1} GN=N/A:s;
-    }
-    # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; };
-    unless (m / PE=/s)
-    {
-        $s = s :(GN=[^ \t]*):${1} PE=N/A:s;
-    }
-    # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; };
-    unless (m / SV=/s)
-    {
-        $s = s :(PE=[^ \t]*):${1} SV=N/A:s;
-    }
-    # s/^sp.//;
-    $s = s /^sp.//s;
-    # s/[|]/\t/g;
-    $s = s /[|]/\t/sg;
-    # s/ OS=/\t/;
-    $s = s / OS=/\t/s;
-    # s/ OX=/\t/;
-    $s = s / OX=/\t/s;
-    # s/ GN=/\t/;
-    $s = s / GN=/\t/s;
-    # s/ PE=/\t/;
-    $s = s / PE=/\t/s;
-    # s/ SV=/\t/;
-    $s = s / SV=/\t/s;
-} # sub pseudo_sed
-
-getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ;
-
-
-if (exists($opts{'h'})) {
-    usage();
-}
-if (exists($opts{'a'})) {
-    $USE_SEARCH_PPEP_PY = 0;
-}
-if (exists($opts{'v'})) {
-    $verbose = 1;
-} else {
-    $verbose = 0;
-}
-if (!exists($opts{'i'}) || !-e $opts{'i'}) {
-    die('Input File not found');
-} else {
-    $file_in = $opts{'i'};
-}
-if (!exists($opts{'f'}) || !-e $opts{'f'}) {
-    die('FASTA not found');
-} else {
-    $fasta_in = $opts{'f'};
-    $use_sqlite = 0;
-}
-my $species;
-if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) {
-    $species = 'human';
-} else {
-    $species = $opts{'s'};
-    print "'-s' option is '$species'\n";
-}
-print "species filter is '$species'\n";
-
-if (!exists($opts{'n'}) || !-e $opts{'n'}) {
-    die('Input NetworKIN File not found');
-} else {
-    $networkin_in = $opts{'n'};
-}
-if (!exists($opts{'m'}) || !-e $opts{'m'}) {
-    die('Input pSTY_Motifs File not found');
-} else {
-    $motifs_in = $opts{'m'};
-}
-if (!exists($opts{'p'}) || !-e $opts{'p'}) {
-    die('Input PSP_Kinase_Substrate_Dataset File not found');
-} else {
-    $PSP_Kinase_Substrate_in = $opts{'p'};
-}
-if (!exists($opts{'r'}) || !-e $opts{'r'}) {
-    die('Input PSP_Regulatory_sites File not found');
-} else {
-    $PSP_Regulatory_Sites_in = $opts{'r'};
-}
-if (exists($opts{'P'})) {
-    $phospho_type = $opts{'P'};
-}
-else {
-    $phospho_type = "sty";
-}
-if (exists($opts{'F'})) {
-    $average_or_sum = $opts{'F'};
-}
-else {
-    $average_or_sum = "sum";
-}
-if (exists($opts{'D'})) {
-    $db_out = $opts{'D'};
-}
-else {
-    $db_out = "db_out.sqlite";
-}
-if (exists($opts{'O'})) {
-    $file_melt = $opts{'O'};
-}
-else {
-    $file_melt = "output_melt.tsv";
-}
-if (exists($opts{'o'})) {
-    $file_out = $opts{'o'};
-}
-else {
-    $file_out = "output.tsv";
-}
-
-
-###############################################################################################################################
-# Print the relevant file names to the screen
-###############################################################################################################################
-# print "\nData file:  $data_in\nFASTA file:  $fasta_in\nSpecies:  $species\nOutput file:  $motifs_out\n\n";
-print "\n--- parameters:\n";
-print "Data file:  $file_in\nAverage or sum identical p-sites?  $average_or_sum\nOutput file:  $file_out\nMelted map:  $file_melt\n";
-if ($use_sqlite == 0) {
-  print "Motifs file:  $motifs_in\nNetworKIN file:  networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file:  $fasta_in\nOutput SQLite file: $db_out\n";
-} else {
-  print "Motifs file:  $motifs_in\nNetworKIN file:  networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file:  $dbfile\nOutput SQLite file: $db_out\n";
-}
-print "...\n\n";
-
-print "Phospho-residues(s) = $phospho_type\n\n";
-if ($phospho_type ne 'y') {
-    if ($phospho_type ne 'sty') {
-        die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
-    }
-}
-
-###############################################################################################################################
-# read the input data file
-# average or sum identical phospho-sites, depending on the value of $average_or_sum
-###############################################################################################################################
-
-open (IN, "$file_in") or die "I couldn't find the input file:  $file_in\n";
-
-die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
-
-
-$line = 0;
-
-while (<IN>) {
-    chomp;
-    my @x = split(/\t/);
-    for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n]  =~ s/\n//g; $x[$n]  =~ s/\"//g;}
-
-    # Read in the samples
-    if ($line == 0) {
-        for my $n (1 .. $#x) {
-            push (@samples, $x[$n]);
-            $sample_id_lut{$x[$n]} = $n;
-        }
-        $line++;
-    } else {
-        # check whether we have already seen a phospho-peptide
-        if (exists($data{$x[0]})) {
-            if ($average_or_sum eq "sum") {        # add the data
-                # unload the data
-                @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
-                # add the new data and repack
-                for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; }
-                $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); }
-
-            } elsif ($average_or_sum eq "average") {        # average the data
-                # unload the data
-                @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); }
-                # average with the new data and repack
-                for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); }
-                $n{$x[0]}++;
-                $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
-            }
-        }
-        # if the phospho-sequence has not been seen, save the data
-        else {
-            for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
-            $n{$x[0]} = 1;
-        }
-    }
-}
-close(IN);
-
-
-###############################################################################################################################
-# Search the FASTA database for phospho-sites and motifs
-#
-# based on Retrieve_p_peptide_motifs_v2.pl
-###############################################################################################################################
-
-
-###############################################################################################################################
-#
-#    Read in the Data file:
-#        1) make @p_peptides array as in the original file
-#        2) make @non_p_peptides array w/o residue modifications (p, #, other)
-#
-###############################################################################################################################
-
-foreach my $peptide (keys %data) {
-    $peptide =~ s/s/pS/g;    $peptide =~ s/t/pT/g;    $peptide =~ s/y/pY/g;
-    push (@p_peptides, $peptide);
-    $peptide =~ s/p//g;
-    push(@non_p_peptides, $peptide);
-}
-
-if ($use_sqlite == 0) {
-  ###############################################################################################################################
-  #
-  #    Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database
-  #
-  ###############################################################################################################################
-
-  # e.g.
-  #   >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
-  #   MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
-  #   DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
-  #   EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
-  #   QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
-  #   EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
-  # accession: Q9Y3B9
-  # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
-  # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
-
-  open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n";
-  print "Reading FASTA file $fasta_in\n";
-  # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
-  #      "If the condition expression of a while statement is based on any of
-  #      a group of iterative expression types then it gets some magic treatment.
-  #      The affected iterative expression types are readline, the <FILEHANDLE>
-  #      input operator, readdir, glob, the <PATTERN> globbing operator, and
-  #      `each`. If the condition expression is one of these expression types,
-  #      then the value yielded by the iterative operator will be implicitly
-  #      assigned to `$_`."
-  while (<IN1>) {
-    chomp;
-    # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
-    #      "If only PATTERN is given, EXPR defaults to $_."
-    my (@x) = split(/\|/);
-    for my $i (0 .. $#x) {
-      $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g; }
-    if ($x[0] =~ /^>/) {
-      $x[0] =~ s/\>//g;
-      push (@databases, $x[0]);
-      push (@accessions, $x[1]);
-      push (@names, $x[2]);
-      pseudo_sed();
-      s/$/\t/;
-      push (@parsed_fasta, $_);
-    } elsif ($x[0] =~ /^\w/) {
-      if (defined $sequences[$#accessions]) {
-        $sequences[$#accessions] = $sequences[$#accessions].$x[0];
-      } else {
-        $sequences[$#accessions] = $x[0];
-      }
-      $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
-    }
-  }
-  close IN1;
-  print "Done Reading FASTA file $fasta_in\n";
-  $dbfile = $db_out;
-  print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
-  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
-  my $auto_commit = $dbh->{AutoCommit};
-  print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
-  $dbh->{AutoCommit} = 0;
-
-  # begin DDL-to-SQLite
-  # ---
-  $stmth = $dbh->prepare("
-    DROP TABLE IF EXISTS UniProtKB;
-    ");
-  $stmth->execute();
-
-  $stmth = $dbh->prepare("
-  CREATE TABLE UniProtKB (
-    Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
-    Description TEXT,
-    Organism_Name TEXT,
-    Organism_ID INTEGER,
-    Gene_Name TEXT,
-    PE TEXT,
-    SV TEXT,
-    Sequence TEXT,
-    Database TEXT
-  )
-  ");
-  $stmth->execute();
-  $stmth = $dbh->prepare("
-  CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
-  ");
-  $stmth->execute();
-  $stmth = $dbh->prepare("
-  CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
-  ");
-  $stmth->execute();
-  # ...
-  # end DDL-to-SQLite
-
-  # insert all rows
-  # begin store-to-SQLite "UniProtKB" table
-  # ---
-  $stmth = $dbh->prepare("
-  INSERT INTO UniProtKB (
-    Uniprot_ID,
-    Description,
-    Organism_Name,
-    Organism_ID,
-    Gene_Name,
-    PE,
-    SV,
-    Sequence,
-    Database
-  ) VALUES (?,?,?,?,?,?,?,?,?)
-  ");
-  my $row_count = 1;
-  my $row_string;
-  my (@row, @rows);
-  my $wrd;
-  while ( scalar @parsed_fasta > 0 ) {
-      $database = $databases[$#parsed_fasta];
-      #### print "parsed_fasta[-1]: " . $parsed_fasta[$#parsed_fasta] . "\n";
-      $row_string = pop(@parsed_fasta);
-      #### print "row_string: $row_string\n";
-      @row = (split /\t/, $row_string);
-      for $i (1..3,5..8) {
-          $stmth->bind_param($i, $row[$i]);
-      }
-      $stmth->bind_param(9, $database);
-      $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
-      if (not $stmth->execute()) {
-          print "Error in row $row_count: $stmth->errstr\n";
-      }
-      $row_count += 1;
-  }
-  # ...
-  # end store-to-SQLite "UniProtKB" table
-
-  print "begin commit at " . format_localtime_iso8601() . "\n";
-  $dbh->{AutoCommit} = $auto_commit;
-  print "auto_commit is now $auto_commit\n" if ($verbose);
-  $dbh->disconnect if ( defined $dbh );
-  print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n";
-  $dbtype = "FASTA";
-}
-
-if ($use_sqlite == 1) {
-  ###############################################################################################################################
-  #
-  #    Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
-  #
-  ###############################################################################################################################
-
-  copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";
-
-  # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
-  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
-    sqlite_open_flags => SQLITE_OPEN_READONLY,
-  });
-  print "DB connection $dbh is to $dbfile\n";
-
-  # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
-  $stmth = $dbh->prepare("
-  SELECT Uniprot_ID
-  , Description || ' OS=' || Organism_Name || ' OX=' || Organism_ID
-    || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END
-    || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE        END
-    || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV        END
-    AS Description
-  , Sequence
-  , Database
-  FROM
-    UniProtKB
-  ");
-  $stmth->execute();
-  @col_names = @{$stmth->{NAME}};
-  print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
-  while (my @row = $stmth->fetchrow_array) {
-    push (@names,              $row[1]); # redacted Description
-    push (@accessions,         $row[0]); # Uniprot_ID
-    $sequences[$#accessions] = $row[2];  # Sequence
-    push (@databases,          $row[3]); # Database (should be 'sp')
-  }
-
-  $dbh->disconnect if ( defined $dbh );
-
-  print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
-  $dbtype = "SQLite";
-}
-
-print "$#accessions accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";
-
-######################
-  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
-  $stmth = $dbh->prepare("
-  INSERT INTO UniProtKB (
-    Uniprot_ID,
-    Description,
-    Organism_Name,
-    Organism_ID,
-    Gene_Name,
-    PE,
-    SV,
-    Sequence,
-    Database
-  ) VALUES (
-    'No Uniprot_ID',
-    'NO_GENE_SYMBOL No Description',
-    'No Organism_Name',
-    0,
-    '$FAILED_MATCH_GENE_NAME',
-    '0',
-    '0',
-    '$FAILED_MATCH_SEQ',
-    'No Database'
-  )
-  ");
-  if (not $stmth->execute()) {
-      print "Error inserting dummy row into UniProtKB: $stmth->errstr\n";
-  }
-  $dbh->disconnect if ( defined $dbh );
-######################
-
-@timeData = localtime(time);
-print "\n--- Start search at " . format_localtime_iso8601() ."\n";
-
-print "    --> Calling 'search_ppep' script\n\n";
-if ($verbose) {
-  $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose");
-} else {
-  $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in");
-}
-if ($i) {
-  print "python $dirname/search_ppep.py -u $db_out -p $file_in\n  exited with exit code $i\n";
-  die "Search failed for phosphopeptides in SwissProt/SQLite file.";
-}
-print "    <-- Returned from 'search_ppep' script\n";
-
-@timeData = localtime(time);
-print "... Finished search at " . format_localtime_iso8601() ."\n\n";
-
-
-###############################################################################################################################
-#
-#    Match the non_p_peptides to the @sequences array:
-#        1) Format the motifs +/- 10 residues around the phospho-site
-#        2) Print the original data plus the phospho-motif to the output file
-#
-###############################################################################################################################
-
-
-print "--- Match the non_p_peptides to the \@sequences array:\n";
-
-if ($USE_SEARCH_PPEP_PY) {
-  print "Find the matching protein sequence(s) for the peptide using SQLite\n";
-} else {
-  print "Find the matching protein sequence(s) for the peptide using slow search\n";
-}
-
-# https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
-$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, {
-  sqlite_open_flags => SQLITE_OPEN_READONLY,
-});
-print "DB connection $dbh is to $db_out\n";
-
-# CREATE VIEW uniprotid_pep_ppep AS
-#   SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
-#          , deppep.seq                          AS peptide
-#          , ppep.seq                            AS phosphopeptide
-#          , UniProtKB.Sequence                  AS sequence
-#          , UniProtKB.Description               AS description
-#   FROM     ppep, deppep, deppep_UniProtKB, UniProtKB
-#   WHERE    deppep.id = ppep.deppep_id
-#   AND      deppep.id = deppep_UniProtKB.deppep_id
-#   AND      deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
-#   ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;
-
-my %ppep_to_count_lut;
-print "start select peptide counts " . format_localtime_iso8601() . "\n";
-my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
-    SELECT DISTINCT
-      phosphopeptide
-    , count(*) as i
-    FROM
-      uniprotkb_pep_ppep_view
-    GROUP BY
-      phosphopeptide
-    ORDER BY
-      phosphopeptide
-");
-if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
-    die "Error fetching peptide counts: $uniprotkb_pep_ppep_view_stmth->errstr\n";
-}
-while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
-  $ppep_to_count_lut{$row[0]} = $row[1];
-  #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
-}
-
-# accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
-# 0          1        2         3            4               5                 6          7        8         9
-my $COL_ACCESSION        = 0;
-my $COL_PEPTIDE          = 1;
-my $COL_SEQUENCE         = 2;
-my $COL_DESCRIPTION      = 3;
-my $COL_PHOSPHOPEPTIDE   = 4;
-my $COL_LONG_DESCRIPTION = 5;
-my $COL_POS_START        = 6;
-my $COL_POS_END          = 7;
-my $COL_SCRUBBED         = 8;
-my $COL_PPEP_ID          = 9;
-
-my %ppep_to_row_lut;
-print "start select all records without qualification " . format_localtime_iso8601() . "\n";
-$uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
-    SELECT DISTINCT
-      accession
-    , peptide
-    , sequence
-    , description
-    , phosphopeptide
-    , long_description
-    , pos_start
-    , pos_end
-    , scrubbed
-    , ppep_id
-    FROM
-      uniprotkb_pep_ppep_view
-    ORDER BY
-      phosphopeptide
-");
-if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
-    die "Error fetching all records without qualification: $uniprotkb_pep_ppep_view_stmth->errstr\n";
-}
-my $current_ppep;
-my $counter = 0;
-my $former_ppep = "";
-@tmp_matches = ();
-@tmp_accessions = ();
-@tmp_names = ();
-@tmp_sites = ();
-while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
-    # Identify phosphopeptide for current row;
-    #   it is an error for it to change when the counter is not zero.
-    $current_ppep = $row[$COL_PHOSPHOPEPTIDE];
-
-    # when counter is zero, prepare for a new phosphopeptide
-    if (not $current_ppep eq $former_ppep) {
-        die "counter is $counter instead of zero" if ($counter != 0);
-        $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
-        print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
-        $counter = $ppep_to_count_lut{$current_ppep};
-        @tmp_matches = ();
-        @tmp_accessions = ();
-        @tmp_names = ();
-        @tmp_sites = ();
-    }
-
-    if ($USE_SEARCH_PPEP_PY) {
-        push(@tmp_matches,    $row[ $COL_SEQUENCE         ]);
-        push(@tmp_accessions, $row[ $COL_ACCESSION        ]);
-        push(@tmp_names,      $row[ $COL_LONG_DESCRIPTION ]);
-        push(@tmp_sites,      $row[ $COL_POS_START        ]);
-    }
-
-    # Prepare counter and phosphopeptide tracker for next row
-    $former_ppep = $current_ppep;
-    $counter -= 1;
-
-    # Set trackers for later use after last instance of current phosphopeptide
-    if ($counter == 0) {
-        if ($USE_SEARCH_PPEP_PY) {
-            $matched_sequences{$current_ppep} = [ @tmp_matches ];
-            $accessions{       $current_ppep} = [ @tmp_accessions ];
-            $names{            $current_ppep} = [ @tmp_names ];
-            $sites{            $current_ppep} = [ @tmp_sites ];
-        }
-    }
-}
-
-
-print "end select all records without qualification " . format_localtime_iso8601() . "\n";
-
-for my $j (0 .. $#p_peptides) {
-
-    #Find the matching protein sequence(s) for the peptide using SQLite
-    my ($site, $sequence);
-    my (@row, @rows);
-    my $match = 0;
-    my $p_peptide = $p_peptides[$j];
-    @tmp_matches = ();
-    @tmp_accessions = ();
-    @tmp_names = ();
-    @tmp_sites = ();
-
-    #Find the matching protein sequence(s) for the peptide using slow search
-    $site = -1;
-    unless ($USE_SEARCH_PPEP_PY) {
-        for my $k (0 .. $#sequences) {
-            $site = index($sequences[$k], $non_p_peptides[$j]);
-            if ($site != -1) {
-                  push(@tmp_matches, $sequences[$k]);
-                  push(@tmp_accessions, $accessions[$k]);
-                  push(@tmp_names, $names[$k]);
-                  push(@tmp_sites, $site);
-                }
-                # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
-                $site = -1; $match++;
-                # print "tmp_accessions @tmp_accessions \n";
-        }
-        if ($match == 0) {    # Check to see if no match was found.  Skip to next if no match found.
-            print "Warning:  Failed match for $p_peptides[$j]\n";
-            $matched_sequences{$p_peptides[$j]} = \@failed_match;
-            push(@failed_matches,$p_peptides[$j]);
-            next;
-        } else {
-            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
-            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
-            $names{$p_peptides[$j]} = [ @tmp_names ];
-            $sites{$p_peptides[$j]} = [ @tmp_sites ];
-        }
-    }
-
-} # end for my $j (0 .. $#p_peptides)
-
-print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";
-
-print "--- Match the p_peptides to the \@sequences array:\n";
-
-for my $peptide_to_match ( keys %matched_sequences ) {
-    if (grep($peptide_to_match, @failed_matches)) {
-        print "Failed to match peptide $peptide_to_match\n";
-    }
-    next if (grep($peptide_to_match, @failed_matches));
-    my @matches = @{$matched_sequences{$peptide_to_match}};
-    @tmp_motifs_array = ();
-    for my $i (0 .. $#matches) {
-
-        # Find the location of the phospo-site in the sequence(s)
-        $tmp_site = 0; my $offset = 0;
-        my $tmp_p_peptide = $peptide_to_match;
-        $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
-
-        # Find all phosphorylated residues in the p_peptide
-        @p_sites = ();
-        while ($tmp_site != -1) {
-            $tmp_site = index($tmp_p_peptide, 'p', $offset);
-            if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
-            $offset = $tmp_site + 1;
-            $tmp_p_peptide =~ s/p//;
-        }
-        @tmp_p_residues = ();
-        for my $l (0 .. $#p_sites) {
-            next if not defined $sites{$peptide_to_match}[$i];
-
-            push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);
-
-            # Match the sequences around the phospho residues to find the motifs
-            my ($desired_residues_L, $desired_residues_R);
-            if ($tmp_p_residues[0] - 10 < 0) {    #check to see if there are fewer than 10 residues left of the first p-site
-                # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
-                $desired_residues_L = $tmp_p_residues[0];
-            }
-            else {
-                $desired_residues_L = 10;
-            }
-            my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
-            if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) {    #check to see if there are fewer than 10 residues right of the last p-site
-                $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
-                # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
-                # print "Line 170:  seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
-            }
-            else {
-                $desired_residues_R = 10;
-            }
-
-            my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
-            my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
-            my $arg1 = $matched_sequences{$peptide_to_match}[$i];
-
-            if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
-                $tmp_motif = substr($arg1, $arg2, $total_length);
-
-                # Put the "p" back in front of the appropriate phospho-residue(s).
-                my (@tmp_residues, $tmp_position);
-                for my $m (0 .. $#p_sites) {
-                    # print "Line 183: $p_sites[$m]\n";
-                    if ($m == 0) {
-                        $tmp_position = $desired_residues_L;
-                    } else {
-                        $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
-                    }
-                    if ($tmp_position < length($tmp_motif) + 1) {
-                        push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
-                        if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
-                        if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
-                        if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
-                    }
-                }
-
-                $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
-
-                # Comment out on 8.10.13 to remove the numbers from motifs
-                my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
-                my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
-                $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
-                push(@tmp_motifs_array, $tmp_motif);
-                $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
-                $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
-            }
-        }
-        $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
-    }  # end for my $i (0 .. $#matches)       ### this bracket could be in the wrong place
-}
-
-print "... Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n";
-
-###############################################################################################################################
-#
-#  Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
-#
-###############################################################################################################################
-
-
-print "--- Reading various site data:\n";
-
-###############################################################################################################################
-#
-#    Read the NetworKIN_predictions file:
-#        1) make a "kinases_observed" array
-#        2) annotate the phospho-substrates with the appropriate kinase
-#
-###############################################################################################################################
-my $SITE_KINASE_SUBSTRATE = 1;
-$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";
-
-open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
-print "Reading the NetworKIN data:  $networkin_in\n";
-while (<IN1>) {
-    chomp;
-    my (@x) = split(/\t/);
-    for my $i (0 .. $#x) {
-        $x[$i] =~ s/\r//g;     $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
-    }
-    next if ($x[0] eq "#substrate");
-    if (exists ($kinases -> {$x[2]})) {
-        #do nothing
-    }
-    else {
-        $kinases -> {$x[2]} = $x[2];
-        push (@kinases_observed, $x[2]);
-    }
-    my $tmp = $x[10]."_".$x[2];    #eg, REEILsEMKKV_PKCalpha
-    if (exists($p_sequence_kinase -> {$tmp})) {
-        #do nothing
-    }
-    else {
-        $p_sequence_kinase -> {$tmp} = $tmp;
-    }
-}
-close IN1;
-
-###############################################################################################################################
-#
-#    Read the Kinase motifs file:
-#        1) make a "motif_sequence" array
-#
-###############################################################################################################################
-
-# file format (tab separated):
-#   x[0] = primary key (character), e.g., '17' or '23a'
-#   x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
-#   x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'
-
-my $SITE_MOTIF = 2;
-$site_description{$SITE_MOTIF} = "motif";
-
-open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
-print "Reading the Motifs file:  $motifs_in\n";
-
-while (<IN2>) {
-    chomp;
-    my (@x) = split(/\t/);
-    for my $i (0 .. 2) {
-        $x[$i] =~ s/\r//g;
-        $x[$i]  =~ s/\n//g;
-        $x[$i]  =~ s/\"//g;
-        }
-    if (exists ($motif_type{$x[1]})) {
-        $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
-    } else {
-        $motif_type{$x[1]} = $x[2];
-        $motif_count{$x[1]} = 0;
-        push (@motif_sequence, $x[1]);
-    }
-}
-close (IN2);
-
-
-###############################################################################################################################
-#  6.28.2011
-#    Read PSP_Kinase_Substrate data:
-#        1) make a "kinases_PhosphoSite" array
-#        2) annotate the phospho-substrates with the appropriate kinase
-#
-#  Columns:
-#     (0) GENE
-#     (1) KINASE
-#     (2) KIN_ACC_ID
-#     (3) KIN_ORGANISM
-#     (4) SUBSTRATE
-#     (5) SUB_GENE_ID
-#     (6) SUB_ACC_ID
-#     (7) SUB_GENE
-#     (8) SUB_ORGANISM
-#     (9) SUB_MOD_RSD
-#     (10) SITE_GRP_ID
-#     (11) SITE_+/-7_AA
-#     (12) DOMAIN
-#     (13) IN_VIVO_RXN
-#     (14) IN_VITRO_RXN
-#     (15) CST_CAT#
-###############################################################################################################################
-
-my $SITE_PHOSPHOSITE = 3;
-$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";
-
-
-$line = 0;
-
-open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n";
-print "Reading the PhosphoSite Kinase-Substrate data:  $PSP_Kinase_Substrate_in\n";
-
-while (<IN3>) {
-    chomp;
-    my (@x) = split(/\t/);
-    for my $i (0 .. $#x) {
-        $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
-        }
-    if ($line != 0) {
-        if (($species eq $x[3]) && ($species eq $x[8])) {
-            if (exists ($kinases_PhosphoSite -> {$x[0]})) {
-                #do nothing
-            }
-            else {
-                $kinases_PhosphoSite -> {$x[0]} = $x[0];
-                push (@kinases_PhosphoSite, $x[0]);
-            }
-            my $offset = 0;
-            # Replace the superfluous lower case s, t and y
-            my @lowercase = ('s','t','y');
-            my @uppercase = ('S','T','Y');
-            for my $k (0 .. 2) {
-                my $site = 0;
-                while ($site != -1) {
-                    $site = index($x[11],$lowercase[$k], $offset);
-                    if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);}
-                    $offset = $site + 1;
-                }
-            }
-            my $tmp = $x[11]."_".$x[0];        #eg, RTPGRPLsSYGMDSR_PAK2
-            if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
-                #do nothing
-            }
-            else {
-                $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
-            }
-        }
-        else {
-            # do nothing
-            #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n";
-        }
-    }
-    $line++;
-}
-close IN3;
-
-
-###############################################################################################################################
-#  Read PhosphoSite regulatory site data:
-#        1) make a "regulatory_sites_PhosphoSite" hash
-#
-#  Columns:
-#    (0)  GENE
-#    (1)  PROTEIN           --> #ACE %psp_regsite_protein
-#    (2)  PROT_TYPE
-#    (3)  ACC_ID
-#    (4)  GENE_ID
-#    (5)  HU_CHR_LOC
-#    (6)  ORGANISM          --> %organism
-#    (7)  MOD_RSD
-#    (8)  SITE_GRP_ID
-#    (9)  SITE_+/-7_AA      --> %regulatory_sites_PhosphoSite_hash
-#    (10) DOMAIN            --> %domain
-#    (11) ON_FUNCTION       --> %ON_FUNCTION
-#    (12) ON_PROCESS        --> %ON_PROCESS
-#    (13) ON_PROT_INTERACT  --> %ON_PROT_INTERACT
-#    (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
-#    (15) PMIDs
-#    (16) LT_LIT
-#    (17) MS_LIT
-#    (18) MS_CST
-#    (19) NOTES             --> %notes
-###############################################################################################################################
-
-
-$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
-my $auto_commit = $dbh->{AutoCommit};
-$dbh->{AutoCommit} = 0;
-print "DB connection $dbh is to $db_out, opened for modification\n";
-
-# add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
-$stmth = $dbh->prepare("
-CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
-  SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
-  DOMAIN             TEXT,
-  ON_FUNCTION        TEXT,
-  ON_PROCESS         TEXT,
-  ON_PROT_INTERACT   TEXT,
-  ON_OTHER_INTERACT  TEXT,
-  NOTES              TEXT,
-  ORGANISM           TEXT,
-  PROTEIN            TEXT
-)
-");
-$stmth->execute();
-
-# add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite
-$stmth = $dbh->prepare("
-CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
-( ppep_id            INTEGER REFERENCES ppep(id)
-, site_plusminus_7AA TEXT    REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
-, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
-);
-");
-$stmth->execute();
-
-# $stmth = $dbh->prepare("
-# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
-#   ON PSP_Regulatory_site(site_plusminus_7AA);
-# ");
-# $stmth->execute();
-
-
-# add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
-my $citation_sql;
-$citation_sql = "
-CREATE TABLE IF NOT EXISTS Citation (
-  ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
-  CitationData TEXT,
-  PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
-)
-";
-$stmth = $dbh->prepare($citation_sql);
-$stmth->execute();
-
-
-open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n";
-print "Reading the PhosphoSite regulatory site data:  $PSP_Regulatory_Sites_in\n";
-
-
-$line = -1;
-while (<IN4>) {
-    $line++;
-    chomp;
-    if ($_ =~ m/PhosphoSitePlus/) {
-        #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g);
-        $PhosphoSitePlusCitation = $_;
-        $PhosphoSitePlusCitation =~ s/\t//g;
-        $PhosphoSitePlusCitation =~ s/\r//g;
-        $PhosphoSitePlusCitation =~ s/\n//g;
-        $PhosphoSitePlusCitation =~ s/""/"/g;
-        $PhosphoSitePlusCitation =~ s/^"//g;
-        $PhosphoSitePlusCitation =~ s/"$//g;
-        print "$PhosphoSitePlusCitation\n";
-        next;
-    }
-    my (@x) = split(/\t/);
-    for my $i (0 .. $#x) {
-        $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
-    }
-    my $found_GENE=0;
-    if ( (not exists($x[0])) ) {
-        next;
-    }
-    elsif ( ($x[0] eq "GENE") ) {
-        $found_GENE=1;
-        next;
-    }
-    if ( (not exists($x[9])) || ($x[9] eq "") ) {
-        if (exists($x[8]) && (not $x[8] eq "")) {
-            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
-        } else {
-            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
-                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
-                  if $found_GENE==1;
-            }
-            next;
-        }
-    }
-    elsif ($line != 0) {
-        if ($species ne $x[6]) {
-            # Do nothing - this record was filtered out by the species filter
-        }
-        elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
-            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
-                $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
-                $domain{$x[9]} = $x[10];
-                $ON_FUNCTION{$x[9]} = $x[11];
-                $ON_PROCESS{$x[9]} = $x[12];
-                $ON_PROT_INTERACT{$x[9]} = $x[13];
-                $ON_OTHER_INTERACT{$x[9]} = $x[14];
-                $notes{$x[9]} = $x[19];
-                $organism{$x[9]} = $x[6];
-            }
-        }
-        else {
-            # $domain
-            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
-                if ($x[10] ne "") {
-                  $domain{$x[9]} = $domain{$x[10]};
-                  }
-                else {
-                  # do nothing
-                  }
-            }
-            else {
-                if ($domain{$x[9]} =~ /$x[10]/) {
-                  # do nothing
-                  }
-                else {
-                  $domain{$x[9]} = $domain{$x[9]}." / ".$x[10];
-                  #print "INFO line $line - compound domain for 7aa:  GENE $x[0]   PROTEIN $x[1]   PROT_TYPE $x[2]   ACC_ID $x[3]   GENE_ID $x[4]   HU_CHR_LOC $x[5]   ORGANISM $x[6]   MOD_RSD $x[7]   SITE_GRP_ID $x[8]   SITE_+/-7_AA $x[9]   DOMAIN $domain{$x[9]}\n";
-                  }
-            }
-
-            # $ON_FUNCTION
-            if (!defined $ON_FUNCTION{$x[9]} || $ON_FUNCTION{$x[9]} eq "") {
-                $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]};
-            } elsif ($x[10] eq "") {
-                # do nothing
-            }
-            else {
-                $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10];
-            }
-
-            # $ON_PROCESS
-            if (!defined $ON_PROCESS{$x[9]} || $ON_PROCESS{$x[9]} eq "") {
-                $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]};
-            } elsif ($x[10] eq "") {
-                # do nothing
-            }
-            else {
-                $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10];
-            }
-
-            # $ON_PROT_INTERACT
-            if (!defined $ON_PROT_INTERACT{$x[9]}  || $ON_PROT_INTERACT{$x[9]} eq "") {
-                $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]};
-            } elsif ($x[10] eq "") {
-                # do nothing
-            }
-            else {
-                $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10];
-            }
-
-            # $ON_OTHER_INTERACT
-            if (!defined $ON_OTHER_INTERACT{$x[9]} || $ON_OTHER_INTERACT{$x[9]} eq "") {
-                $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]};
-            } elsif ($x[10] eq "") {
-                # do nothing
-            }
-            else {
-                $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10];
-            }
-
-            # $notes
-            if (!defined $notes{$x[9]} || $notes{$x[9]} eq "") {
-                $notes{$x[9]} = $notes{$x[10]};
-            } elsif ($x[10] eq "") {
-                # do nothing
-            }
-            else {
-                $notes{$x[9]} = $notes{$x[9]}." / ".$x[10];
-            }
-
-            # $organism
-            if (!defined $organism{$x[9]} || $organism{$x[9]} eq "") {
-                $organism{$x[9]} = $organism{$x[10]};
-            } elsif ($x[10] eq "") {
-                # do nothing
-            }
-            else {
-                $organism{$x[9]} = $organism{$x[9]}." / ".$x[10];
-            }
-        }
-    }
-}
-close IN4;
-
-print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";
-
-$stmth = $dbh->prepare("
-INSERT INTO Citation (
-  ObjectName,
-  CitationData
-) VALUES (?,?)
-");
-
-sub add_citation {
-    my ($cit_table, $cit_text, $cit_label) = @_;
-    $stmth->bind_param(1, $cit_table);
-    $stmth->bind_param(2, $cit_text);
-    if (not $stmth->execute()) {
-        print "Error writing $cit_label cit for table $cit_table: $stmth->errstr\n";
-    }
-}
-my ($citation_text, $citation_table); 
-
-# PSP regulatory or kinase/substrate site
-$citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."';
-$citation_table = "PSP_Regulatory_site";
-add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
-$citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122';
-$citation_table = "PSP_Regulatory_site";
-add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
-
-# NetworKIN site
-$citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052';
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "NetworkKIN");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "NetworkKIN");
-$citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296';
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "NetworkKIN");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "NetworkKIN");
-$citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093';
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "NetworkKIN");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "NetworkKIN");
-
-# pSTY motifs
-$citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285';
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
-$citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159';
-$citation_table = "psp_gene_site";
-add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
-$citation_table = "psp_gene_site_view";
-add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
-
-
-###############################################################################################################################
-#
-#    Read the data file:
-#        1) find sequences that match the NetworKIN predictions
-#        2) find motifs that match the observed sequences
-#
-###############################################################################################################################
-
-print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";
-
-my $ppep_regsite_LUT_stmth;
-$ppep_regsite_LUT_stmth = $dbh->prepare("
-  INSERT INTO ppep_regsite_LUT (
-    ppep_id,
-    site_plusminus_7AA
-  ) VALUES (?,?)
-");
-
-my ($start_seconds, $start_microseconds) = gettimeofday;
-
-foreach my $peptide (keys %data) {
-    # find the unique phospho-motifs for this $peptide
-    my @all_motifs = ();
-    my $have_all_motifs = 0;
-    for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
-        my $tmp_motif = $p_motifs{$peptide}[$i];
-        push(@all_motifs, $tmp_motif);
-        $have_all_motifs = 1;
-    }
-    if ($have_all_motifs == 1) {
-        for my $j (0 .. $#all_motifs) {
-            if (defined $all_motifs[$j]) {
-                $all_motifs[$j] =~ s/\d+-\[\s//;
-                $all_motifs[$j] =~ s/\s\]\-\d+//;
-            }
-        }
-    }
-    my %seen = ();
-    if ($have_all_motifs == 1) {
-        foreach my $a (@all_motifs) {
-            if (defined $a) {
-                if (exists($seen{$a})) {
-                    next;
-                } else {
-                    push(@{$unique_motifs{$peptide}}, $a);
-                    $seen{$a} = 1;
-                }
-            }
-            print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose);
-        }
-    }
-
-    # count the number of phospo-sites in the motif
-    my $number_pY = 0;
-    my $number_pSTY = 0;
-    if ($phospho_type eq 'y') {
-        if (defined(${$unique_motifs{$peptide}}[0])) {
-            while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {
-                $number_pY++;
-            }
-        }
-    }
-    if ($phospho_type eq 'sty') {
-        print "looking for unique_motifs for $peptide\n" if ($verbose);
-        if (defined(${$unique_motifs{$peptide}}[0])) {
-            while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {
-                $number_pSTY++;
-               print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose);
-            }
-        }
-    }
-
-
-    # search each of the unique motifs for matches
-    print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose);
-    for my $i (0 .. $#{$unique_motifs{$peptide}}) {
-        print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose);
-        my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
-        print "   --- matching unique motif $tmp_motif for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
-        my $formatted_sequence;
-        if (($number_pY == 1) || ($number_pSTY == 1)) {
-            my $seq_plus5aa = "";
-            my $seq_plus7aa = "";
-            $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
-            print "       a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
-            if ($phospho_type eq 'y') {
-                $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
-                $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
-            }
-            elsif ($phospho_type eq "sty") {
-                $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
-                $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
-            }
-
-            if (defined $seq_plus7aa) {
-                # commit the 7aa LUT records
-                $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} );
-                $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa             );
-                if (not $ppep_regsite_LUT_stmth->execute()) {
-                    print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n";
-                }
-            }
-            for my $i (0 .. $#kinases_observed) {
-                if (defined $seq_plus5aa) {
-                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];    #eg, should be PGRPLsSYGMD_PKCalpha
-                    if (exists($p_sequence_kinase -> {$tmp})) {
-                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE
-                    }
-                }
-            }
-            for my $i (0 .. $#motif_sequence) {
-                if ($peptide =~ /$motif_sequence[$i]/) {
-                    $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
-                }
-            }
-            for my $i (0 .. $#kinases_PhosphoSite) {
-                if (defined $seq_plus7aa) {
-                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];    #eg, should be RTPGRPLsSYGMDSR_PAK2
-                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
-                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
-                    }
-                }
-            }
-            if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
-                $seq_plus7aa_2{$peptide} = $seq_plus7aa;
-                $domain_2{$peptide} = $domain{$seq_plus7aa};
-                $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
-                $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
-                $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
-                $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
-                $notes_2{$peptide} = $notes{$seq_plus7aa};
-                $organism_2{$peptide} = $organism{$seq_plus7aa};
-            } else {
-            }
-        }
-        elsif (($number_pY > 1) || ($number_pSTY > 1)) {  #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
-            $formatted_sequence = $tmp_motif;
-            $seq_plus5aa = "";
-            $seq_plus7aa = "";
-            #Create the sequences with only one phosphorylation site
-            #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes  1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329  and  1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
-
-            my (@sites, $offset, $next_p_site);
-            $sites[0] = index($tmp_motif, "p");
-            $offset = $sites[0] + 1;
-            $next_p_site = 0;
-            while ($next_p_site != -1) {
-                $next_p_site = index($tmp_motif, "p", $offset);
-                if ($next_p_site != -1) {
-                    push (@sites, $next_p_site);
-                }
-                $offset = $next_p_site+1;
-            }
-
-            my @pSTY_sequences;
-            for my $n (0 .. $#sites) {
-                $pSTY_sequences[$n] = $tmp_motif;
-                for (my $m = $#sites; $m >= 0; $m--) {
-                    if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
-                }
-            }
-
-            my @formatted_sequences;
-            for my $k (0 .. $#sites) {
-                $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
-            }
-
-            for my $k (0 .. $#formatted_sequences) {
-                print "       b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
-                if ($phospho_type eq 'y') {
-                    $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1];
-                    $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1];
-                }
-                elsif ($phospho_type eq "sty") {
-                    $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1];
-                    $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1];
-                }
-                for my $i (0 .. $#kinases_observed) {
-                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];    #eg, should look like REEILsEMKKV_PKCalpha
-                    if (exists($p_sequence_kinase -> {$tmp})) {
-                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
-                    }
-                }
-                $pSTY_sequence = $formatted_sequences[$k];
-                for my $i (0 .. $#motif_sequence) {
-                    if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
-                        $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
-                    }
-                }
-                for my $i (0 .. $#kinases_PhosphoSite) {
-                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];    #eg, should be RTPGRPLsSYGMDSR_PAK2
-                    #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp";
-                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
-                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
-                    }
-                }
-                if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) {
-                    $seq_plus7aa_2{$peptide} = $seq_plus7aa;
-
-                    # $domain
-                    if ($domain_2{$peptide} eq "") {
-                        $domain_2{$peptide} = $domain{$seq_plus7aa};
-                    }
-                    elsif ($domain{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa};
-                    }
-
-
-                    # $ON_FUNCTION_2
-                    if ($ON_FUNCTION_2{$peptide} eq "") {
-                        $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
-                    }
-                    elsif ($ON_FUNCTION{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa};
-                    }
-
-                    # $ON_PROCESS_2
-                    if ($ON_PROCESS_2{$peptide} eq "") {
-                        $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
-                    }
-                    elsif ($ON_PROCESS{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa};
-                    }
-
-                    # $ON_PROT_INTERACT_2
-                    if ($ON_PROT_INTERACT_2{$peptide} eq "") {
-                        $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
-                    }
-                    elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa};
-                    }
-
-                    # $ON_OTHER_INTERACT_2
-                    if ($ON_OTHER_INTERACT_2{$peptide} eq "") {
-                        $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
-                    }
-                    elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa};
-                    }
-
-                    # $notes_2
-                    if ($notes_2{$peptide} eq "") {
-                        $notes_2{$peptide} = $notes{$seq_plus7aa};
-                    }
-                    elsif ($notes{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa};
-                    }
-                    $notes_2{$peptide} = $notes{$seq_plus7aa};
-
-                    # $organism_2
-                    if ($organism_2{$peptide} eq "") {
-                        $organism_2{$peptide} = $organism{$seq_plus7aa};
-                    }
-                    elsif ($organism{$seq_plus7aa} eq "") {
-                        # do nothing
-                    }
-                    else {
-                        $organism_2{$peptide} = $organism_2{$peptide}." / ".$organism{$seq_plus7aa};
-                    }
-                    $organism_2{$peptide} = $organism{$seq_plus7aa};
-                } else {
-                } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa}))
-            } # for my $k (0 .. $#formatted_sequences) 
-        } # if/else number of phosphosites
-    } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}})
-} # for each $peptide
-
-my ($end_seconds, $end_microseconds) = gettimeofday;
-
-my $delta_seconds = $end_seconds - $start_seconds;
-my $delta_microseconds = $end_microseconds - $start_microseconds;
-$delta_microseconds += 1000000 * $delta_seconds;
-my $key_count = keys(%data);
-print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($delta_microseconds / $key_count));
-
-($start_seconds, $start_microseconds) = gettimeofday;
-
-print "Writing PSP_Regulatory_site records\n";
-
-my $psp_regulatory_site_stmth = $dbh->prepare("
-    INSERT INTO PSP_Regulatory_site (
-      DOMAIN,
-      ON_FUNCTION,
-      ON_PROCESS,
-      ON_PROT_INTERACT,
-      ON_OTHER_INTERACT,
-      NOTES,
-      SITE_PLUSMINUS_7AA,
-      ORGANISM
-    ) VALUES (?,?,?,?,?,?,?,?)
-    ");
-
-foreach my $peptide (keys %data) {
-    if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
-        $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide});
-        $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide});
-        if (not $psp_regulatory_site_stmth->execute()) {
-            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n";
-        } else {
-        }
-    } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) {
-        print "\$domain_2{$peptide} is undefined\n";  #ACE
-    }
-}
-
-$dbh->{AutoCommit} = $auto_commit;
-# auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish;
-$dbh->disconnect if ( defined $dbh );
-
-
-($end_seconds, $end_microseconds) = gettimeofday;
-
-$delta_seconds = $end_seconds - $start_seconds;
-$delta_microseconds = $end_microseconds - $start_microseconds;
-$delta_microseconds += 1000000 * $delta_seconds;
-$key_count = keys(%data);
-print sprintf("Write time is %d microseconds\n", ($delta_microseconds));
-
-print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n";
-
-###############################################################################################################################
-#
-# Print to the output file
-#
-###############################################################################################################################
-open (OUT, ">$file_out") || die "could not open the fileout: $file_out";
-open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt";
-
-# print the header info
-print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n";
-print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
-
-# print the PhosphoSite regulatory data
-print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
-
-# print the sample names
-for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
-
-# print the kinases and groups
-for my $i (0 .. $#kinases_observed) {
-    my $temp = $kinases_observed[$i]."_NetworKIN";
-    print OUT "$temp\t";
-    push(@kinases_observed_lbl, $temp);
-}
-for my $i (0 .. $#motif_sequence) {
-    print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t";
-}
-for my $i (0 .. $#kinases_PhosphoSite) {
-    my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite";
-    if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
-    if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
-    push(@phosphosites_observed_lbl, $temp);
-}
-
-# begin DDL-to-SQLite
-# ---
-$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
-$auto_commit = $dbh->{AutoCommit};
-$dbh->{AutoCommit} = 0;
-print "DB connection $dbh is to $db_out, opened for modification\n";
-
-my $sample_stmth;
-$sample_stmth = $dbh->prepare("
-  INSERT INTO sample (
-    id,
-    name
-  ) VALUES (?,?)
-");
-
-my $ppep_intensity_stmth;
-$ppep_intensity_stmth = $dbh->prepare("
-  INSERT INTO ppep_intensity (
-    ppep_id,
-    sample_id,
-    intensity
-  ) VALUES (?,?,?)
-");
-
-my $site_type_stmth;
-$site_type_stmth = $dbh->prepare("
-  insert into site_type (
-    id,
-    type_name
-  ) values (?,?)
-");
-
-my $ppep_gene_site_stmth;
-$ppep_gene_site_stmth = $dbh->prepare("
-  insert into ppep_gene_site (
-    ppep_id,
-    gene_names,
-    kinase_map,
-    site_type_id
-  ) values (?,?,?,?)
-");
-
-my $ppep_metadata_stmth;
-$ppep_metadata_stmth = $dbh->prepare("
-  INSERT INTO ppep_metadata
-    ( ppep_id
-    , protein_description
-    , gene_name
-    , FASTA_name
-    , phospho_sites
-    , motifs_unique
-    , accessions
-    , motifs_all_members
-    , domain
-    , ON_FUNCTION
-    , ON_PROCESS
-    , ON_PROT_INTERACT
-    , ON_OTHER_INTERACT
-    , notes
-  ) VALUES (
-    ?,?,?,?,?,?,?
-  , ?,?,?,?,?,?,?
-  )
-");
-# end DDL-to-SQLite
-# ...
-
-# begin store-to-SQLite "sample" table
-# ---
-# %sample_id_lut maps name -> ID
-for my $sample_name (keys %sample_id_lut) {
-    $sample_stmth->bind_param( 2, $sample_name                 );
-    $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} );
-    if (not $sample_stmth->execute()) {
-        print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): $sample_stmth->errstr\n";
-    }
-}
-# end store-to-SQLite "sample" table
-# ...
-
-# begin store-to-SQLite "site_type" table
-# ---
-sub add_site_type {
-    my ($site_type_id, $site_type_type_name) = @_;
-    $site_type_stmth->bind_param( 2, $site_type_type_name );
-    $site_type_stmth->bind_param( 1, $site_type_id        );
-    if (not $site_type_stmth->execute()) {
-        die "Error writing tuple ($site_type_id,$site_type_type_name): $site_type_stmth->errstr\n";
-    }
-}
-add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE});
-add_site_type($SITE_MOTIF, $site_description{$SITE_MOTIF});
-add_site_type($SITE_PHOSPHOSITE, $site_description{$SITE_PHOSPHOSITE});
-# end store-to-SQLite "site_type" table
-# ...
-
-foreach my $peptide (sort(keys %data)) {
-    next if (grep($peptide, @failed_matches));
-    my $ppep_id = $ppep_id_lut{$peptide};
-    my @ppep_metadata = ();
-    my @ppep_intensity = ();
-    my @gene = ();
-    my $gene_names;
-    my $j;
-    # Print the peptide itself
-    #   column 1: p-peptide
-    print OUT "$peptide\t";
-    push (@ppep_metadata, $ppep_id);
-    push (@ppep_intensity, $peptide);
-
-    my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR';
-    # skip over failed matches
-    print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond;
-    if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) {
-        # column 2: Protein description
-        # column 3: Gene name(s)
-        # column 4: FASTA name
-        # column 5: phospho-residues
-        # Column 6: UNIQUE phospho-motifs
-        # Column 7: accessions
-        # Column 8: ALL motifs with residue numbers
-        #          2                                     3   4   5   6   7   8
-        print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
-        print "No match found for '$peptide' in sequence database\n";
-        $gene_names = '$FAILED_MATCH_GENE_NAME';
-    } else {
-        my @description = ();
-        my %seen = ();
-        # Print just the protein description
-        for $i (0 .. $#{$names{$peptide}}) {
-            my $long_name = $names{$peptide}[$i];
-            my @naming_parts = split(/\sOS/, $long_name);
-            my @front_half = split(/\s/, $naming_parts[0]);
-            push(@description, join(" ", @front_half[1..($#front_half)]));
-        }
-        # column 2: Protein description
-        print OUT join(" /// ", @description), "\t";
-        push (@ppep_metadata, join(" /// ", @description));
-
-        # Print just the gene name
-        for $i (0 .. $#{$names{$peptide}}) {
-            my $tmp_gene = $names{$peptide}[$i];
-            $tmp_gene =~ s/^.*GN=//;
-            $tmp_gene =~ s/\s.*//;
-            if (!exists($seen{$tmp_gene})) {
-                push(@gene, $tmp_gene);
-                $seen{$tmp_gene} = $tmp_gene;
-            }
-        }
-        # column 3: Gene name(s)
-        $gene_names = join(" /// ", @gene);
-        print OUT $gene_names, "\t";
-        push (@ppep_metadata, join(" /// ", @gene));
-
-        # column 4: FASTA name
-        print OUT join(" /// ", @{$names{$peptide}}), "\t";
-        push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));
-
-        # column 5: phospho-residues
-        my $tmp_for_insert = "";
-        my $foobar;
-        for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
-            print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond;
-            if ($i < $#{ $matched_sequences{$peptide} }) {
-                if (defined $p_residues{$peptide}{$i}) {
-                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
-                    for $j (0 .. $#tmp_p_residues) {
-                        if ($j < $#tmp_p_residues) {
-                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
-                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
-                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
-                        }
-                        elsif ($j == $#tmp_p_residues) {
-                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
-                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
-                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
-                        }
-                    }
-                }
-            }
-            elsif ($i == $#{ $matched_sequences{$peptide} }) {
-                my $there_were_sites = 0;
-                if (defined $p_residues{$peptide}{$i}) {
-                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
-                    if ($#tmp_p_residues > 0) {
-                        for my $j (0 .. $#tmp_p_residues) {
-                            if ($j < $#tmp_p_residues) {
-                                if (defined $p_residues{$peptide}{$i}[$j]) {
-                                    my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
-                                    $foobar = $residues{$peptide}{$i}[$j];
-                                    if (defined $foobar) {
-                                        print OUT "$foobar";
-                                        print OUT "$tmp_site_for_printing, ";
-                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
-                                        $there_were_sites = 1;
-                                    }
-                                }
-                            }
-                            elsif ($j == $#tmp_p_residues) {
-                                if (defined $p_residues{$peptide}{$i}[$j]) {
-                                    $foobar = $residues{$peptide}{$i}[$j];
-                                    if (defined $foobar) {
-                                        my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
-                                        print OUT "$foobar";
-                                        print OUT "$tmp_site_for_printing\t";
-                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
-                                        $there_were_sites = 1;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                if (0 == $there_were_sites) {
-                  print OUT "\t";
-                }
-            }
-        }
-        print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond;
-        push (@ppep_metadata, $tmp_for_insert);
-
-        # Column 6: UNIQUE phospho-motifs
-        print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
-        push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));
-
-        # Column 7: accessions
-        if (defined $accessions{$peptide}) {
-            print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
-            push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
-        } else {
-            print OUT "\t";
-            push (@ppep_metadata, "");
-        }
-
-        # Column 8: ALL motifs with residue numbers
-        if (defined $p_motifs{$peptide}) {
-            print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
-            push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
-        } else {
-            print OUT "\t";
-            push (@ppep_metadata, "");
-        }
-
-    }
-
-    # Print the PhosphoSite regulatory data
-
-    if (defined $domain_2{$peptide})            { print OUT "$domain_2{$peptide}\t";            } else { print OUT "\t"; }
-    if (defined $ON_FUNCTION_2{$peptide})       { print OUT "$ON_FUNCTION_2{$peptide}\t";       } else { print OUT "\t"; }
-    if (defined $ON_PROCESS_2{$peptide})        { print OUT "$ON_PROCESS_2{$peptide}\t";        } else { print OUT "\t"; }
-    if (defined $ON_PROT_INTERACT_2{$peptide})  { print OUT "$ON_PROT_INTERACT_2{$peptide}\t";  } else { print OUT "\t"; }
-    if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
-    if (defined $notes_2{$peptide})             { print OUT "$notes_2{$peptide}\t";             } else { print OUT "\t"; }
-
-    if (defined $domain_2{$peptide})            { push (@ppep_metadata, $domain_2{$peptide});            } else { push(@ppep_metadata, ""); }
-    if (defined $ON_FUNCTION_2{$peptide})       { push (@ppep_metadata, $ON_FUNCTION_2{$peptide});       } else { push(@ppep_metadata, ""); }
-    if (defined $ON_PROCESS_2{$peptide})        { push (@ppep_metadata, $ON_PROCESS_2{$peptide});        } else { push(@ppep_metadata, ""); }
-    if (defined $ON_PROT_INTERACT_2{$peptide})  { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide});  } else { push(@ppep_metadata, ""); }
-    if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); }
-    if (defined $notes_2{$peptide})             { push (@ppep_metadata, $notes_2{$peptide});             } else { push(@ppep_metadata, ""); }
-
-    # begin store-to-SQLite "ppep_metadata" table
-    # ---
-    for $i (1..14) {
-        $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]);
-    }
-    if (not $ppep_metadata_stmth->execute()) {
-        print "Error writing ppep_metadata row for phosphopeptide $ppep_metadata[$i]: $ppep_metadata_stmth->errstr\n";
-    }
-    # ...
-    # end store-to-SQLite "ppep_metadata" table
-
-    # Print the data
-    @tmp_data = ();
-    foreach (@{$data{$peptide}}) {
-        push(@tmp_data, $_);
-    }
-    print OUT join("\t", @tmp_data), "\t";
-
-    # begin store-to-SQLite "ppep_intensity" table
-    # ---
-    # commit the sample intensities
-    $i = 0;
-    foreach (@{$data{$peptide}}) {
-        my $intense = $_;
-        $ppep_intensity_stmth->bind_param( 1, $ppep_id                     );
-        $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} );
-        $ppep_intensity_stmth->bind_param( 3, $intense                     );
-        if (not $ppep_intensity_stmth->execute()) {
-            print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n";
-        }
-        $i += 1;
-    }
-    # ...
-    # end store-to-SQLite "ppep_intensity" table
-
-    # print the kinase-substrate data
-    for my $i (0 .. $#kinases_observed) {
-        if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
-            print OUT "X\t";
-            my $NetworKIN_label = $kinases_observed[$i]."_NetworKIN";
-            print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
-            # begin store-to-SQLite "ppep_gene_site" table
-            # ---
-            $ppep_gene_site_stmth->bind_param(1, $ppep_id);               # ppep_gene_site.ppep_id
-            $ppep_gene_site_stmth->bind_param(2, $gene_names);            # ppep_gene_site.gene_names
-            $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label);       # ppep_gene_site.kinase_map
-            $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id
-            if (not $ppep_gene_site_stmth->execute()) {
-                print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\n";
-            }
-            # ...
-            # end store-to-SQLite "ppep_gene_site" table
-        }
-        else { print OUT "\t";}
-    }
-    my %wrote_motif;
-    my $motif_parts_0;
-    for my $i (0 .. $#motif_sequence) {
-        if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {
-            print OUT "X\t";
-            $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
-            my $key = "$peptide\t$gene_names\t$motif_parts_0";
-            if (!exists($wrote_motif{$key})) {
-                $wrote_motif{$key} = $key;
-                print MELT "$peptide\t$gene_names\t$site_description{$SITE_MOTIF}\t$motif_parts_0\n";
-                # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n";            #debug
-                # begin store-to-SQLite "ppep_gene_site" table
-                # ---
-                $ppep_gene_site_stmth->bind_param(1, $ppep_id);        # ppep_gene_site.ppep_id
-                $ppep_gene_site_stmth->bind_param(2, $gene_names);     # ppep_gene_site.gene_names
-                $ppep_gene_site_stmth->bind_param(3, $motif_parts_0); # ppep_gene_site.kinase_map
-                $ppep_gene_site_stmth->bind_param(4, $SITE_MOTIF);     # ppep_gene_site.site_type_id
-                if (not $ppep_gene_site_stmth->execute()) {
-                    print "Error writing tuple ($peptide,$gene_names,$motif_parts_0): $ppep_gene_site_stmth->errstr\n";
-                }
-                # ...
-                # end store-to-SQLite "ppep_gene_site" table
-            }
-        }
-        else { print OUT "\t";}
-    }
-    for my $i (0 .. $#kinases_PhosphoSite) {
-        if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
-            print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
-            if ($i < $#kinases_PhosphoSite) {
-                print OUT "X\t";
-            }
-            else {
-                print OUT "X\n";
-            }
-            # begin store-to-SQLite "ppep_gene_site" table
-            # ---
-            $ppep_gene_site_stmth->bind_param(1, $ppep_id);                       # ppep_gene_site.ppep_id
-            $ppep_gene_site_stmth->bind_param(2, $gene_names);                    # ppep_gene_site.gene_names
-            $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map
-            $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE);              # ppep_gene_site.site_type_id
-            if (not $ppep_gene_site_stmth->execute()) {
-                print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\n";
-            }
-            # ...
-            # end store-to-SQLite "ppep_gene_site" table
-        }
-        else {
-            if ($i < $#kinases_PhosphoSite) {
-                print OUT "\t";
-            }
-            elsif ($i == $#kinases_PhosphoSite) {
-                print OUT "\n";
-            }
-        }
-    }
-}
-
-close OUT;
-close MELT;
-$ppep_gene_site_stmth->finish;
-print "begin DB commit at " . format_localtime_iso8601() . "\n";
-$dbh->{AutoCommit} = $auto_commit;
-$dbh->disconnect if ( defined $dbh );
-
-print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n";
-
-###############################################################################################################################
+#!/usr/local/bin/perl
+###############################################################################################################################
+#    perl Kinase_enrichment_analysis_complete_v0.pl
+#
+#    Nick Graham, USC
+#    2016-02-27
+#
+#    Built from scripts written by NG at UCLA in Tom Graeber's lab:
+#        CombinePhosphoSites.pl
+#        Retrieve_p_motifs.pl
+#        NetworKIN_Motif_Finder_v7.pl
+#
+#    Given a list of phospho-peptides, find protein information and upstream kinases.
+#    Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl
+#
+#    Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab:
+#        Added warnings and used strict;
+#        fixed some code paths resulting in more NetworKIN matches;
+#        applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow)
+#        to speed up "Match the non_p_peptides to the @sequences array";
+#        added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data;
+#        added support for SQLite output in addition to tabular files.
+#
+#
+###############################################################################################################################
+
+use strict;
+use warnings 'FATAL' => 'all';
+
+use Getopt::Std;
+use DBD::SQLite::Constants qw/:file_open/;
+use DBI qw(:sql_types);
+use File::Copy;
+use File::Basename;
+use POSIX qw(strftime);
+use Time::HiRes qw(gettimeofday);
+#use Data::Dump qw(dump);
+
+my $USE_SEARCH_PPEP_PY = 1;
+#my $FAILED_MATCH_SEQ = "Failed match";
+my $FAILED_MATCH_SEQ = 'No Sequence';
+my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name';
+
+my $dirname = dirname(__FILE__);
+my %opts;
+my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type);
+my $dbtype;
+my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in);
+my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n);
+my $line = 0;
+my @failed_match = ($FAILED_MATCH_SEQ);
+my @failed_matches;
+my (%all_data);
+my (@p_peptides, @non_p_peptides);
+my @parsed_fasta;
+my (@accessions, @names, @sequences, @databases, $database);
+my ($dbfile, $dbh, $stmth);
+my @col_names;
+my (%matched_sequences, %accessions,     %names,     %sites,   );
+my (@tmp_matches,       @tmp_accessions, @tmp_names, @tmp_sites);
+my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues);
+my (@kinases_observed, $kinases);
+my (@kinases_observed_lbl, @phosphosites_observed_lbl);
+my ($p_sequence_kinase, $p_sequence, $kinase);
+my (@motif_sequence, %motif_type, %motif_count);
+my (@kinases_PhosphoSite, $kinases_PhosphoSite);
+my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite);
+my (%regulatory_sites_PhosphoSite_hash);
+my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism);
+my (%unique_motifs);
+my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches);
+my %psp_regsite_protein_2;
+my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2);
+my @timeData;
+my $PhosphoSitePlusCitation;
+my %site_description;
+
+my %kinase_substrate_NetworKIN_matches;
+my %kinase_motif_matches;
+my $regulatory_sites_PhosphoSite;
+my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2);
+my %kinase_substrate_PhosphoSite_matches;
+my @formatted_sequence;
+my $pSTY_sequence;
+my $i;
+my @a;
+my $use_sqlite;
+my $verbose;
+
+##########
+## opts ##
+##########
+  ## input files
+    # i : path to input file, e.g., 'outputfile_STEP2.txt'
+    # f : path to UniProtKB/SwissProt FASTA
+    # s : optional species argument
+    # n : path to NetworKIN_201612_cutoffscore2.0.txt
+    # m : path to pSTY_Motifs.txt
+    # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+    # r : path to 2017-03_PSP_Regulatory_sites.txt
+  ## options
+    # P : phospho_type
+    # F : function
+    # v : verbose output
+  ## output files
+    # o : path to output file
+    # O : path to "melted" output file
+    # D : path to output SQLite file
+
+sub usage()
+    {
+        print STDERR <<"EOH";
+    This program given a list of phospho-peptides, finds protein information and upstream kinases.
+    usage: $0 [-hvd] -f FASTA_file
+     -h : this (help) message
+     -v : slightly verbose
+     -a : use SQLite less
+     ## input files
+     -i : path to input file, e.g., 'outputfile_STEP2.txt'
+     -f : path to UniProtDB/SwissProt FASTA
+     -s : optional species filter argument for PSP records; defaults to 'human'
+     -n : path to NetworKIN_201612_cutoffscore2.0.txt
+     -m : path to pSTY_Motifs.txt
+     -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt
+     -r : path to 2017-03_PSP_Regulatory_sites.txt
+     ## options
+     -P : phospho_type
+     -F : function
+     ## output files
+     -o : path to output file
+     -O : path to "melted" output file
+     -D : path to output SQLite file
+    example: $0
+EOH
+        exit;
+    }
+
+sub format_localtime_iso8601 {
+    # ref: https://perldoc.perl.org/Time::HiRes
+    my ($seconds, $microseconds) = gettimeofday;
+    # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html
+    return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000);
+}
+
+sub replace_pSpTpY {
+    my ($formatted_sequence, $phospho_type) = @_;
+    if ($phospho_type eq 'y') {
+        $formatted_sequence =~ s/pS/S/g;
+        $formatted_sequence =~ s/pT/T/g;
+        $formatted_sequence =~ s/pY/y/g;
+        }
+    elsif ($phospho_type eq "sty") {
+        $formatted_sequence =~ s/pS/s/g;
+        $formatted_sequence =~ s/pT/t/g;
+        $formatted_sequence =~ s/pY/y/g;
+        }
+    $formatted_sequence;
+}
+
+sub pseudo_sed
+{
+    # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
+    # Comments give the sed equivalent
+    my ($t) = @_;
+    my $s = $t;
+    # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; };
+    unless ($s =~ m / GN=/s)
+    {
+        $s =~ s :(OX=[^ \t]*):${1} GN=N/A:s;
+    }
+    # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; };
+    unless ($s =~ m / PE=/s)
+    {
+        $s =~ s :(GN=[^ \t]*):${1} PE=N/A:s;
+    }
+    # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; };
+    unless ($s =~ m / SV=/s)
+    {
+        $s =~ s :(PE=[^ \t]*):${1} SV=N/A:s;
+    }
+    # s/^sp.//;
+    $s =~ s :^...::s;
+    # s/[|]/\t/g;
+    $s =~ s :[|]:\t:sg;
+    if ( !($s =~ m/ OS=/s)
+      && !($s =~ m/ OX=/s)
+      && !($s =~ m/ GN=/s)
+      && !($s =~ m/ PE=/s)
+      && !($s =~ m/ SV=/s)
+    ) {
+      $s .= "\tN/A\t-1\tN/A\tN/A\tN/A";
+    } else {
+      # s/ OS=/\t/;
+      if ($s =~ m/ OS=/s) { $s =~ s: OS=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+      # s/ OX=/\t/;
+      if ($s =~ m/ OX=/s) { $s =~ s: OX=:\t:s; } else { $s =~ s:(.*)\t:$1\t-1\t:x; };
+      # s/ GN=/\t/;
+      if ($s =~ m/ GN=/s) { $s =~ s: GN=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+      # s/ PE=/\t/;
+      if ($s =~ m/ PE=/s) { $s =~ s: PE=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+      # s/ SV=/\t/;
+      if ($s =~ m/ SV=/s) { $s =~ s: SV=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; };
+    }
+    return $s;
+} # sub pseudo_sed
+
+getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ;
+
+
+if (exists($opts{'h'})) {
+    usage();
+}
+if (exists($opts{'a'})) {
+    $USE_SEARCH_PPEP_PY = 0;
+}
+if (exists($opts{'v'})) {
+    $verbose = 1;
+} else {
+    $verbose = 0;
+}
+if (!exists($opts{'i'}) || !-e $opts{'i'}) {
+    die('Input File not found');
+} else {
+    $file_in = $opts{'i'};
+}
+if (!exists($opts{'f'}) || !-e $opts{'f'}) {
+    die('FASTA not found');
+} else {
+    $fasta_in = $opts{'f'};
+    $use_sqlite = 0;
+}
+my $species;
+if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) {
+    $species = 'human';
+} else {
+    $species = $opts{'s'};
+    print "'-s' option is '$species'\n";
+}
+print "species filter is '$species'\n";
+
+if (!exists($opts{'n'}) || !-e $opts{'n'}) {
+    die('Input NetworKIN File not found');
+} else {
+    $networkin_in = $opts{'n'};
+}
+if (!exists($opts{'m'}) || !-e $opts{'m'}) {
+    die('Input pSTY_Motifs File not found');
+} else {
+    $motifs_in = $opts{'m'};
+}
+if (!exists($opts{'p'}) || !-e $opts{'p'}) {
+    die('Input PSP_Kinase_Substrate_Dataset File not found');
+} else {
+    $PSP_Kinase_Substrate_in = $opts{'p'};
+}
+if (!exists($opts{'r'}) || !-e $opts{'r'}) {
+    die('Input PSP_Regulatory_sites File not found');
+} else {
+    $PSP_Regulatory_Sites_in = $opts{'r'};
+}
+if (exists($opts{'P'})) {
+    $phospho_type = $opts{'P'};
+}
+else {
+    $phospho_type = "sty";
+}
+if (exists($opts{'F'})) {
+    $average_or_sum = $opts{'F'};
+}
+else {
+    $average_or_sum = "sum";
+}
+if (exists($opts{'D'})) {
+    $db_out = $opts{'D'};
+}
+else {
+    $db_out = "db_out.sqlite";
+}
+if (exists($opts{'O'})) {
+    $file_melt = $opts{'O'};
+}
+else {
+    $file_melt = "output_melt.tsv";
+}
+if (exists($opts{'o'})) {
+    $file_out = $opts{'o'};
+}
+else {
+    $file_out = "output.tsv";
+}
+
+
+###############################################################################################################################
+# Print the relevant file names to the screen
+###############################################################################################################################
+# print "\nData file:  $data_in\nFASTA file:  $fasta_in\nSpecies:  $species\nOutput file:  $motifs_out\n\n";
+print "\n--- parameters:\n";
+print "Data file:  $file_in\nAverage or sum identical p-sites?  $average_or_sum\nOutput file:  $file_out\nMelted map:  $file_melt\n";
+if ($use_sqlite == 0) {
+  print "Motifs file:  $motifs_in\nNetworKIN file:  networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file:  $fasta_in\nOutput SQLite file: $db_out\n";
+} else {
+  print "Motifs file:  $motifs_in\nNetworKIN file:  networkin_in\nPhosphosite kinase substrate data:  $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data:  $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file:  $dbfile\nOutput SQLite file: $db_out\n";
+}
+print "...\n\n";
+
+print "Phospho-residues(s) = $phospho_type\n\n";
+if ($phospho_type ne 'y') {
+    if ($phospho_type ne 'sty') {
+        die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
+    }
+}
+
+###############################################################################################################################
+# read the input data file
+# average or sum identical phospho-sites, depending on the value of $average_or_sum
+###############################################################################################################################
+
+open (IN, "$file_in") or die "I couldn't find the input file:  $file_in\n";
+
+die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;
+
+
+$line = 0;
+
+while (<IN>) {
+    chomp;
+    my @x = split(/\t/);
+    for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n]  =~ s/\n//g; $x[$n]  =~ s/\"//g;}
+
+    # Read in the samples
+    if ($line == 0) {
+        for my $n (1 .. $#x) {
+            push (@samples, $x[$n]);
+            $sample_id_lut{$x[$n]} = $n;
+        }
+        $line++;
+    } else {
+        # check whether we have already seen a phospho-peptide
+        if (exists($data{$x[0]})) {
+            if ($average_or_sum eq "sum") {        # add the data
+                # unload the data
+                @tmp_data = (); foreach (@{$data{$x[0]}}) { push(@tmp_data, $_); }
+                # add the new data and repack
+                for my $k (0 .. $#tmp_data) { $tmp_data[$k] = $tmp_data[$k] + $x[$k+1]; }
+                $all_data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$all_data{$x[0]}}, $tmp_data[$k]); }
+
+            } elsif ($average_or_sum eq "average") {        # average the data
+                # unload the data
+                @tmp_data = (); foreach (@{$all_data{$x[0]}}) { push(@tmp_data, $_); }
+                # average with the new data and repack
+                for my $k (0 .. $#tmp_data) { $tmp_data[$k] = ( $tmp_data[$k]*$n{$x[0]} + $x[0] ) / ($n{$x[0]} + 1); }
+                $n{$x[0]}++;
+                $data{$x[0]} = (); for my $k (0 .. $#tmp_data) { push(@{$data{$x[0]}}, $tmp_data[$k]); }
+            }
+        }
+        # if the phospho-sequence has not been seen, save the data
+        else {
+            for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
+            $n{$x[0]} = 1;
+        }
+    }
+}
+close(IN);
+
+
+###############################################################################################################################
+# Search the FASTA database for phospho-sites and motifs
+#
+# based on Retrieve_p_peptide_motifs_v2.pl
+###############################################################################################################################
+
+
+###############################################################################################################################
+#
+#    Read in the Data file:
+#        1) make @p_peptides array as in the original file
+#        2) make @non_p_peptides array w/o residue modifications (p, #, other)
+#
+###############################################################################################################################
+
+foreach my $peptide (keys %data) {
+    $peptide =~ s/s/pS/g;    $peptide =~ s/t/pT/g;    $peptide =~ s/y/pY/g;
+    push (@p_peptides, $peptide);
+    $peptide =~ s/p//g;
+    push(@non_p_peptides, $peptide);
+}
+
+if ($use_sqlite == 0) {
+  ###############################################################################################################################
+  #
+  #    Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database
+  #
+  ###############################################################################################################################
+
+  # e.g.
+  #   >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+  #   MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
+  #   DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
+  #   EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
+  #   QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
+  #   EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+  # accession: Q9Y3B9
+  # name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
+  # sequence: MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
+  #
+  # e.g.
+  #   >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
+  #   >gi|52001466|sp|P00366.2|DHE3_BOVIN Glutamate dehydrogenase 1, mitochondrial precursor (GDH) cRAP
+  #
+  # e.g.
+  #   >zs|P00009.24.AR-V2_1.zs|zs_peptide_0024_AR-V2_1
+
+
+  open (IN1, "$fasta_in") or die "I couldn't find $fasta_in\n";
+  print "Reading FASTA file $fasta_in\n";
+  # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
+  #      "If the condition expression of a while statement is based on any of
+  #      a group of iterative expression types then it gets some magic treatment.
+  #      The affected iterative expression types are readline, the <FILEHANDLE>
+  #      input operator, readdir, glob, the <PATTERN> globbing operator, and
+  #      `each`. If the condition expression is one of these expression types,
+  #      then the value yielded by the iterative operator will be implicitly
+  #      assigned to `$_`."
+  while (<IN1>) {
+    chomp;
+    # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
+    #      "If only PATTERN is given, EXPR defaults to $_."
+    my (@x) = split(/\|/);
+    # begin FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
+    if (@x > 3) {
+      @x = (">".$x[$#x - 2], $x[$#x - 1], $x[$#x]);
+      if ($_ =~ m/DHE3_BOVIN/) {
+        print "\$_ = $_\n";
+        for my $i (0 .. $#x) {
+          print "\$x[$i] = $x[$i]\n";
+        };
+      }
+    }
+    # end FIX >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
+    for my $i (0 .. $#x) {
+      $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g; }
+    # Use of uninitialized value $x[0] in pattern match (m//) at /home/rstudio/src/mqppep/tools/mqppep/PhosphoPeptide_Upstream_Kinase_Mapping.pl line 411, <IN1> line 3.
+    if (exists($x[0])) {
+      if ($x[0] =~ /^>/) {
+        # parsing header line
+        $x[0] =~ s/\>//g;
+        push (@databases, $x[0]);
+        push (@accessions, $x[1]);
+        push (@names, $x[2]);
+        # format tags of standard UniProtKB headers as tab-separated values
+        # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
+        $_ = pseudo_sed(join "\t", (">".$x[0], $x[1], $x[2]));
+        # append tab as separator between header and sequence
+        s/$/\t/;
+        # parsed_fasta gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
+        print "push (\@parsed_fasta, $_)\n" if (0 && $x[0] ne "zs");
+        push (@parsed_fasta, $_);
+      } elsif ($x[0] =~ /^\w/) {
+        # line is a portion of the sequence
+        if (defined $sequences[$#accessions]) {
+          $sequences[$#accessions] = $sequences[$#accessions].$x[0];
+        } else {
+          $sequences[$#accessions] = $x[0];
+        }
+        $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
+      }
+    }
+  }
+  close IN1;
+  print "Done Reading FASTA file $fasta_in\n";
+  $dbfile = $db_out;
+  print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
+  my $auto_commit = $dbh->{AutoCommit};
+  print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
+  $dbh->{AutoCommit} = 0;
+
+  # begin DDL-to-SQLite
+  # ---
+  $stmth = $dbh->prepare("
+    DROP TABLE IF EXISTS UniProtKB;
+    ");
+  $stmth->execute();
+
+  $stmth = $dbh->prepare("
+  CREATE TABLE UniProtKB (
+    Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
+    Description TEXT,
+    Organism_Name TEXT,
+    Organism_ID INTEGER,
+    Gene_Name TEXT,
+    PE TEXT,
+    SV TEXT,
+    Sequence TEXT,
+    Database TEXT
+  )
+  ");
+  $stmth->execute();
+  $stmth = $dbh->prepare("
+  CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
+  ");
+  $stmth->execute();
+  $stmth = $dbh->prepare("
+  CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
+  ");
+  $stmth->execute();
+  # ...
+  # end DDL-to-SQLite
+
+  # insert all rows
+  # begin store-to-SQLite "UniProtKB" table
+  # ---
+  $stmth = $dbh->prepare("
+  INSERT INTO UniProtKB (
+    Uniprot_ID,
+    Description,
+    Organism_Name,
+    Organism_ID,
+    Gene_Name,
+    PE,
+    SV,
+    Sequence,
+    Database
+  ) VALUES (?,?,?,?,?,?,?,?,?)
+  ");
+  my $row_count = 1;
+  my $row_string;
+  my (@row, @rows);
+  my $wrd;
+  while ( scalar @parsed_fasta > 0 ) {
+      $database = $databases[$#parsed_fasta];
+      #### print "parsed_fasta[-1]: " . $parsed_fasta[$#parsed_fasta] . "\n";
+      # row_string gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
+      #                  1           2            3   4   5   6   7   sequence database
+      $row_string = pop(@parsed_fasta);
+      @row = (split /\t/, $row_string);
+      if ((not exists($row[4])) || ($row[4] eq "")) {
+        die("invalid fasta line\n$row_string\n");
+      };
+      for $i (1..3,5..8) {
+          #BIND print "bind_param $i, $row[$i]\n";
+          $stmth->bind_param($i, $row[$i]);
+      }
+      #BIND print "bind_param 4, $row[4]\n";
+      $stmth->bind_param(9, $database);
+      #BIND print "bind_param 4, $row[4]\n";
+      $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
+      if (not $stmth->execute()) {
+          print "Error in row $row_count: " . $dbh->errstr . "\n";
+          print "Row $row_count: $row_string\n";
+          print "Row $row_count: " . ($row_string =~ s/\t/@/g) . "\n";
+      }
+      if (0 && $database ne "zs") {
+          print "row_count: $row_count\n";
+          #### print "row_string: $row_string\n";
+          print "Row $row_count: $row_string\n";
+          for $i (1..3,5..8) {
+              print "bind_param $i, $row[$i]\n" if (exists($row[$i]));
+          }
+          print "bind_param 4, $row[4]\n" if (exists($row[4]));
+          print "bind_param 9, $database\n";
+      };
+      $row_count += 1;
+  }
+  # ...
+  # end store-to-SQLite "UniProtKB" table
+
+  print "begin commit at " . format_localtime_iso8601() . "\n";
+  $dbh->{AutoCommit} = $auto_commit;
+  print "auto_commit is now $auto_commit\n" if ($verbose);
+  $dbh->disconnect if ( defined $dbh );
+  print "Finished writing $dbfile at " . format_localtime_iso8601() . "\n\n";
+  $dbtype = "FASTA";
+}
+
+if ($use_sqlite == 1) {
+  ###############################################################################################################################
+  #
+  #    Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
+  #
+  ###############################################################################################################################
+
+  copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";
+
+  # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
+    sqlite_open_flags => SQLITE_OPEN_READONLY,
+  });
+  print "DB connection $dbh is to $dbfile\n";
+
+  # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
+  $stmth = $dbh->prepare("
+  SELECT Uniprot_ID
+  , Description
+    || CASE WHEN Organism_Name = 'N/A' THEN '' ELSE ' OS=' || Organism_Name END
+    || CASE WHEN Organism_ID = -1      THEN '' ELSE ' OX=' || Organism_ID   END
+    || CASE WHEN Gene_Name = 'N/A'     THEN '' ELSE ' GN=' || Gene_Name     END
+    || CASE WHEN PE = 'N/A'            THEN '' ELSE ' PE=' || PE            END
+    || CASE WHEN SV = 'N/A'            THEN '' ELSE ' SV=' || SV            END
+    AS Description
+  , Sequence
+  , Database
+  FROM
+    UniProtKB
+  ");
+  $stmth->execute();
+  @col_names = @{$stmth->{NAME}};
+  print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
+  while (my @row = $stmth->fetchrow_array) {
+    push (@names,              $row[1]); # redacted Description
+    push (@accessions,         $row[0]); # Uniprot_ID
+    $sequences[$#accessions] = $row[2];  # Sequence
+    push (@databases,          $row[3]); # Database (should be 'sp')
+  }
+
+  $dbh->disconnect if ( defined $dbh );
+
+  print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
+  $dbtype = "SQLite";
+}
+
+print "$#accessions accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";
+
+######################
+  $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef);
+  $stmth = $dbh->prepare("
+  INSERT INTO UniProtKB (
+    Uniprot_ID,
+    Description,
+    Organism_Name,
+    Organism_ID,
+    Gene_Name,
+    PE,
+    SV,
+    Sequence,
+    Database
+  ) VALUES (
+    'No Uniprot_ID',
+    'NO_GENE_SYMBOL No Description',
+    'No Organism_Name',
+    0,
+    '$FAILED_MATCH_GENE_NAME',
+    '0',
+    '0',
+    '$FAILED_MATCH_SEQ',
+    'No Database'
+  )
+  ");
+  if (not $stmth->execute()) {
+      print "Error inserting dummy row into UniProtKB: $stmth->errstr\n";
+  }
+  $dbh->disconnect if ( defined $dbh );
+######################
+
+@timeData = localtime(time);
+print "\n--- Start search at " . format_localtime_iso8601() ."\n";
+
+print "    --> Calling 'search_ppep' script\n\n";
+if ($verbose) {
+  $i = system("python $dirname/search_ppep.py -u $db_out -p $file_in --verbose");
+} else {
+  $i = system("python $dirname/search_ppep.py -u $db_out -p $file_in");
+}
+if ($i) {
+  print "python $dirname/search_ppep.py -u $db_out -p $file_in\n  exited with exit code $i\n";
+  die "Search failed for phosphopeptides in SwissProt/SQLite file.";
+}
+print "    <-- Returned from 'search_ppep' script\n";
+
+@timeData = localtime(time);
+print "... Finished search at " . format_localtime_iso8601() ."\n\n";
+
+
+###############################################################################################################################
+#
+#    Match the non_p_peptides to the @sequences array:
+#        1) Format the motifs +/- 10 residues around the phospho-site
+#        2) Print the original data plus the phospho-motif to the output file
+#
+###############################################################################################################################
+
+
+print "--- Match the non_p_peptides to the \@sequences array:\n";
+
+if ($USE_SEARCH_PPEP_PY) {
+  print "Find the matching protein sequence(s) for the peptide using SQLite\n";
+} else {
+  print "Find the matching protein sequence(s) for the peptide using slow search\n";
+}
+
+# https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, {
+  sqlite_open_flags => SQLITE_OPEN_READONLY,
+});
+print "DB connection $dbh is to $db_out\n";
+
+# CREATE VIEW uniprotid_pep_ppep AS
+#   SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
+#          , deppep.seq                          AS peptide
+#          , ppep.seq                            AS phosphopeptide
+#          , UniProtKB.Sequence                  AS sequence
+#          , UniProtKB.Description               AS description
+#   FROM     ppep, deppep, deppep_UniProtKB, UniProtKB
+#   WHERE    deppep.id = ppep.deppep_id
+#   AND      deppep.id = deppep_UniProtKB.deppep_id
+#   AND      deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
+#   ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;
+
+my %ppep_to_count_lut;
+print "start select peptide counts " . format_localtime_iso8601() . "\n";
+my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
+    SELECT DISTINCT
+      phosphopeptide
+    , count(*) as i
+    FROM
+      uniprotkb_pep_ppep_view
+    GROUP BY
+      phosphopeptide
+    ORDER BY
+      phosphopeptide
+");
+if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
+    die "Error fetching peptide counts: $uniprotkb_pep_ppep_view_stmth->errstr\n";
+}
+while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
+  $ppep_to_count_lut{$row[0]} = $row[1];
+  #print "\$ppep_to_count_lut{$row[0]} = $ppep_to_count_lut{$row[0]}\n";
+}
+
+# accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
+# 0          1        2         3            4               5                 6          7        8         9
+my $COL_ACCESSION        = 0;
+my $COL_PEPTIDE          = 1;
+my $COL_SEQUENCE         = 2;
+my $COL_DESCRIPTION      = 3;
+my $COL_PHOSPHOPEPTIDE   = 4;
+my $COL_LONG_DESCRIPTION = 5;
+my $COL_POS_START        = 6;
+my $COL_POS_END          = 7;
+my $COL_SCRUBBED         = 8;
+my $COL_PPEP_ID          = 9;
+
+my %ppep_to_row_lut;
+print "start select all records without qualification " . format_localtime_iso8601() . "\n";
+$uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
+    SELECT DISTINCT
+      accession
+    , peptide
+    , sequence
+    , description
+    , phosphopeptide
+    , long_description
+    , pos_start
+    , pos_end
+    , scrubbed
+    , ppep_id
+    FROM
+      uniprotkb_pep_ppep_view
+    ORDER BY
+      phosphopeptide
+");
+if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
+    die "Error fetching all records without qualification: $uniprotkb_pep_ppep_view_stmth->errstr\n";
+}
+my $current_ppep;
+my $counter = 0;
+my $former_ppep = "";
+@tmp_matches = ();
+@tmp_accessions = ();
+@tmp_names = ();
+@tmp_sites = ();
+while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
+    # Identify phosphopeptide for current row;
+    #   it is an error for it to change when the counter is not zero.
+    $current_ppep = $row[$COL_PHOSPHOPEPTIDE];
+
+    # when counter is zero, prepare for a new phosphopeptide
+    if (not $current_ppep eq $former_ppep) {
+        die "counter is $counter instead of zero" if ($counter != 0);
+        $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
+        print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
+        $counter = $ppep_to_count_lut{$current_ppep};
+        @tmp_matches = ();
+        @tmp_accessions = ();
+        @tmp_names = ();
+        @tmp_sites = ();
+    }
+
+    if ($USE_SEARCH_PPEP_PY) {
+        push(@tmp_matches,    $row[ $COL_SEQUENCE         ]);
+        push(@tmp_accessions, $row[ $COL_ACCESSION        ]);
+        push(@tmp_names,      $row[ $COL_LONG_DESCRIPTION ]);
+        push(@tmp_sites,      $row[ $COL_POS_START        ]);
+    }
+
+    # Prepare counter and phosphopeptide tracker for next row
+    $former_ppep = $current_ppep;
+    $counter -= 1;
+
+    # Set trackers for later use after last instance of current phosphopeptide
+    if ($counter == 0) {
+        if ($USE_SEARCH_PPEP_PY) {
+            $matched_sequences{$current_ppep} = [ @tmp_matches ];
+            $accessions{       $current_ppep} = [ @tmp_accessions ];
+            $names{            $current_ppep} = [ @tmp_names ];
+            $sites{            $current_ppep} = [ @tmp_sites ];
+        }
+    }
+}
+
+
+print "end select all records without qualification " . format_localtime_iso8601() . "\n";
+
+for my $j (0 .. $#p_peptides) {
+
+    #Find the matching protein sequence(s) for the peptide using SQLite
+    my ($site, $sequence);
+    my (@row, @rows);
+    my $match = 0;
+    my $p_peptide = $p_peptides[$j];
+    @tmp_matches = ();
+    @tmp_accessions = ();
+    @tmp_names = ();
+    @tmp_sites = ();
+
+    #Find the matching protein sequence(s) for the peptide using slow search
+    $site = -1;
+    unless ($USE_SEARCH_PPEP_PY) {
+        for my $k (0 .. $#sequences) {
+            $site = index($sequences[$k], $non_p_peptides[$j]);
+            if ($site != -1) {
+                  push(@tmp_matches, $sequences[$k]);
+                  push(@tmp_accessions, $accessions[$k]);
+                  push(@tmp_names, $names[$k]);
+                  push(@tmp_sites, $site);
+                }
+                # print "Non-phosphpeptide $non_p_peptides[$j] matched accession $accessions[$k] ($names[$k]) at site $site\n";
+                $site = -1; $match++;
+                # print "tmp_accessions @tmp_accessions \n";
+        }
+        if ($match == 0) {    # Check to see if no match was found.  Skip to next if no match found.
+            print "Warning:  Failed match for $p_peptides[$j]\n";
+            $matched_sequences{$p_peptides[$j]} = \@failed_match;
+            push(@failed_matches,$p_peptides[$j]);
+            next;
+        } else {
+            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
+            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
+            $names{$p_peptides[$j]} = [ @tmp_names ];
+            $sites{$p_peptides[$j]} = [ @tmp_sites ];
+        }
+    }
+
+} # end for my $j (0 .. $#p_peptides)
+
+print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";
+
+print "--- Match the p_peptides to the \@sequences array:\n";
+
+for my $peptide_to_match ( keys %matched_sequences ) {
+    if (grep($peptide_to_match, @failed_matches)) {
+        print "Failed to match peptide $peptide_to_match\n";
+    }
+    next if (grep($peptide_to_match, @failed_matches));
+    my @matches = @{$matched_sequences{$peptide_to_match}};
+    @tmp_motifs_array = ();
+    for my $i (0 .. $#matches) {
+
+        # Find the location of the phospo-site in the sequence(s)
+        $tmp_site = 0; my $offset = 0;
+        my $tmp_p_peptide = $peptide_to_match;
+        $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;
+
+        # Find all phosphorylated residues in the p_peptide
+        @p_sites = ();
+        while ($tmp_site != -1) {
+            $tmp_site = index($tmp_p_peptide, 'p', $offset);
+            if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
+            $offset = $tmp_site + 1;
+            $tmp_p_peptide =~ s/p//;
+        }
+        @tmp_p_residues = ();
+        for my $l (0 .. $#p_sites) {
+            next if not defined $sites{$peptide_to_match}[$i];
+
+            push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);
+
+            # Match the sequences around the phospho residues to find the motifs
+            my ($desired_residues_L, $desired_residues_R);
+            if ($tmp_p_residues[0] - 10 < 0) {    #check to see if there are fewer than 10 residues left of the first p-site
+                # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
+                $desired_residues_L = $tmp_p_residues[0];
+            }
+            else {
+                $desired_residues_L = 10;
+            }
+            my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
+            if ($tmp_p_residues[$#tmp_p_residues] + 10 > $seq_length) {    #check to see if there are fewer than 10 residues right of the last p-site
+                $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
+                # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
+                # print "Line 170:  seq_length = $seq_length\tp_residue = $p_residues[$#p_residues]\n";
+            }
+            else {
+                $desired_residues_R = 10;
+            }
+
+            my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
+            my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
+            my $arg1 = $matched_sequences{$peptide_to_match}[$i];
+
+            if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
+                $tmp_motif = substr($arg1, $arg2, $total_length);
+
+                # Put the "p" back in front of the appropriate phospho-residue(s).
+                my (@tmp_residues, $tmp_position);
+                for my $m (0 .. $#p_sites) {
+                    # print "Line 183: $p_sites[$m]\n";
+                    if ($m == 0) {
+                        $tmp_position = $desired_residues_L;
+                    } else {
+                        $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
+                    }
+                    if ($tmp_position < length($tmp_motif) + 1) {
+                        push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
+                        if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
+                        if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
+                        if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
+                    }
+                }
+
+                $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;
+
+                # Comment out on 8.10.13 to remove the numbers from motifs
+                my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
+                my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
+                $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
+                push(@tmp_motifs_array, $tmp_motif);
+                $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
+                $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
+            }
+        }
+        $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
+    }  # end for my $i (0 .. $#matches)       ### this bracket could be in the wrong place
+}
+
+print "... Finished match the p_peptides to the \@sequences array at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
+#
+#  Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
+#
+###############################################################################################################################
+
+
+print "--- Reading various site data:\n";
+
+###############################################################################################################################
+#
+#    Read the NetworKIN_predictions file:
+#        1) make a "kinases_observed" array
+#        2) annotate the phospho-substrates with the appropriate kinase
+#
+###############################################################################################################################
+my $SITE_KINASE_SUBSTRATE = 1;
+$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";
+
+open (IN1, "$networkin_in") or die "I couldn't find $networkin_in\n";
+print "Reading the NetworKIN data:  $networkin_in\n";
+while (<IN1>) {
+    chomp;
+    my (@x) = split(/\t/);
+    for my $i (0 .. $#x) {
+        $x[$i] =~ s/\r//g;     $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+    }
+    next if ($x[0] eq "#substrate");
+    if (exists ($kinases -> {$x[2]})) {
+        #do nothing
+    }
+    else {
+        $kinases -> {$x[2]} = $x[2];
+        push (@kinases_observed, $x[2]);
+    }
+    my $tmp = $x[10]."_".$x[2];    #eg, REEILsEMKKV_PKCalpha
+    if (exists($p_sequence_kinase -> {$tmp})) {
+        #do nothing
+    }
+    else {
+        $p_sequence_kinase -> {$tmp} = $tmp;
+    }
+}
+close IN1;
+
+###############################################################################################################################
+#
+#    Read the Kinase motifs file:
+#        1) make a "motif_sequence" array
+#
+###############################################################################################################################
+
+# file format (tab separated):
+#   x[0] = primary key (character), e.g., '17' or '23a'
+#   x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
+#   x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'
+
+my $SITE_MOTIF = 2;
+$site_description{$SITE_MOTIF} = "motif";
+
+open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n";
+print "Reading the Motifs file:  $motifs_in\n";
+
+while (<IN2>) {
+    chomp;
+    my (@x) = split(/\t/);
+    for my $i (0 .. 2) {
+        $x[$i] =~ s/\r//g;
+        $x[$i]  =~ s/\n//g;
+        $x[$i]  =~ s/\"//g;
+        }
+    if (exists ($motif_type{$x[1]})) {
+        $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
+    } else {
+        $motif_type{$x[1]} = $x[2];
+        $motif_count{$x[1]} = 0;
+        push (@motif_sequence, $x[1]);
+    }
+}
+close (IN2);
+
+
+###############################################################################################################################
+#  6.28.2011
+#    Read PSP_Kinase_Substrate data:
+#        1) make a "kinases_PhosphoSite" array
+#        2) annotate the phospho-substrates with the appropriate kinase
+#
+#  Columns:
+#     (0) GENE
+#     (1) KINASE
+#     (2) KIN_ACC_ID
+#     (3) KIN_ORGANISM
+#     (4) SUBSTRATE
+#     (5) SUB_GENE_ID
+#     (6) SUB_ACC_ID
+#     (7) SUB_GENE
+#     (8) SUB_ORGANISM
+#     (9) SUB_MOD_RSD
+#     (10) SITE_GRP_ID
+#     (11) SITE_+/-7_AA
+#     (12) DOMAIN
+#     (13) IN_VIVO_RXN
+#     (14) IN_VITRO_RXN
+#     (15) CST_CAT#
+###############################################################################################################################
+
+my $SITE_PHOSPHOSITE = 3;
+$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";
+
+
+$line = 0;
+
+open (IN3, "$PSP_Kinase_Substrate_in") or die "I couldn't find $PSP_Kinase_Substrate_in\n";
+print "Reading the PhosphoSite Kinase-Substrate data:  $PSP_Kinase_Substrate_in\n";
+
+while (<IN3>) {
+    chomp;
+    my (@x) = split(/\t/);
+    for my $i (0 .. $#x) {
+        $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+        }
+    if ($line != 0) {
+        if (($species eq $x[3]) && ($species eq $x[8])) {
+            if (exists ($kinases_PhosphoSite -> {$x[0]})) {
+                #do nothing
+            }
+            else {
+                $kinases_PhosphoSite -> {$x[0]} = $x[0];
+                push (@kinases_PhosphoSite, $x[0]);
+            }
+            my $offset = 0;
+            # Replace the superfluous lower case s, t and y
+            my @lowercase = ('s','t','y');
+            my @uppercase = ('S','T','Y');
+            for my $k (0 .. 2) {
+                my $site = 0;
+                while ($site != -1) {
+                    $site = index($x[11],$lowercase[$k], $offset);
+                    if (($site != 7) && ($site != -1)) {substr($x[11], $site, 1, $uppercase[$k]);}
+                    $offset = $site + 1;
+                }
+            }
+            my $tmp = $x[11]."_".$x[0];        #eg, RTPGRPLsSYGMDSR_PAK2
+            if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                #do nothing
+            }
+            else {
+                $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
+            }
+        }
+        else {
+            # do nothing
+            #print "PSP_kinase_substrate line rejected because KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]': $line\n";
+        }
+    }
+    $line++;
+}
+close IN3;
+
+
+###############################################################################################################################
+#  Read PhosphoSite regulatory site data:
+#        1) make a "regulatory_sites_PhosphoSite" hash
+#
+#  Columns:
+#    (0)  GENE
+#    (1)  PROTEIN           --> #ACE %psp_regsite_protein
+#    (2)  PROT_TYPE
+#    (3)  ACC_ID
+#    (4)  GENE_ID
+#    (5)  HU_CHR_LOC
+#    (6)  ORGANISM          --> %organism
+#    (7)  MOD_RSD
+#    (8)  SITE_GRP_ID
+#    (9)  SITE_+/-7_AA      --> %regulatory_sites_PhosphoSite_hash
+#    (10) DOMAIN            --> %domain
+#    (11) ON_FUNCTION       --> %ON_FUNCTION
+#    (12) ON_PROCESS        --> %ON_PROCESS
+#    (13) ON_PROT_INTERACT  --> %ON_PROT_INTERACT
+#    (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
+#    (15) PMIDs
+#    (16) LT_LIT
+#    (17) MS_LIT
+#    (18) MS_CST
+#    (19) NOTES             --> %notes
+###############################################################################################################################
+
+
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
+my $auto_commit = $dbh->{AutoCommit};
+$dbh->{AutoCommit} = 0;
+print "DB connection $dbh is to $db_out, opened for modification\n";
+
+# add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
+$stmth = $dbh->prepare("
+CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
+  SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
+  DOMAIN             TEXT,
+  ON_FUNCTION        TEXT,
+  ON_PROCESS         TEXT,
+  ON_PROT_INTERACT   TEXT,
+  ON_OTHER_INTERACT  TEXT,
+  NOTES              TEXT,
+  ORGANISM           TEXT,
+  PROTEIN            TEXT
+)
+");
+$stmth->execute();
+
+# add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite
+$stmth = $dbh->prepare("
+CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
+( ppep_id            INTEGER REFERENCES ppep(id)
+, site_plusminus_7AA TEXT    REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
+, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
+);
+");
+$stmth->execute();
+
+# $stmth = $dbh->prepare("
+# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
+#   ON PSP_Regulatory_site(site_plusminus_7AA);
+# ");
+# $stmth->execute();
+
+
+# add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
+my $citation_sql;
+$citation_sql = "
+CREATE TABLE IF NOT EXISTS Citation (
+  ObjectName TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
+  CitationData TEXT,
+  PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
+)
+";
+$stmth = $dbh->prepare($citation_sql);
+$stmth->execute();
+
+
+open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n";
+print "Reading the PhosphoSite regulatory site data:  $PSP_Regulatory_Sites_in\n";
+
+
+$line = -1;
+while (<IN4>) {
+    $line++;
+    chomp;
+    if ($_ =~ m/PhosphoSitePlus/) {
+        #$PhosphoSitePlusCitation = ($_ =~ s/PhosphoSitePlus/FooBar/g);
+        $PhosphoSitePlusCitation = $_;
+        $PhosphoSitePlusCitation =~ s/\t//g;
+        $PhosphoSitePlusCitation =~ s/\r//g;
+        $PhosphoSitePlusCitation =~ s/\n//g;
+        $PhosphoSitePlusCitation =~ s/""/"/g;
+        $PhosphoSitePlusCitation =~ s/^"//g;
+        $PhosphoSitePlusCitation =~ s/"$//g;
+        print "$PhosphoSitePlusCitation\n";
+        next;
+    }
+    my (@x) = split(/\t/);
+    for my $i (0 .. $#x) {
+        $x[$i] =~ s/\r//g; $x[$i]  =~ s/\n//g; $x[$i]  =~ s/\"//g;
+    }
+    my $found_GENE=0;
+    if ( (not exists($x[0])) ) {
+        next;
+    }
+    elsif ( ($x[0] eq "GENE") ) {
+        $found_GENE=1;
+        next;
+    }
+    if ( (not exists($x[9])) || ($x[9] eq "") ) {
+        if (exists($x[8]) && (not $x[8] eq "")) {
+            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
+        } else {
+            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
+                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
+                  if $found_GENE==1;
+            }
+            next;
+        }
+    }
+    elsif ($line != 0) {
+        if ($species ne $x[6]) {
+            # Do nothing - this record was filtered out by the species filter
+        }
+        elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) {
+            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
+                $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9];
+                $domain{$x[9]} = $x[10];
+                $ON_FUNCTION{$x[9]} = $x[11];
+                $ON_PROCESS{$x[9]} = $x[12];
+                $ON_PROT_INTERACT{$x[9]} = $x[13];
+                $ON_OTHER_INTERACT{$x[9]} = $x[14];
+                $notes{$x[9]} = $x[19];
+                $organism{$x[9]} = $x[6];
+            }
+        }
+        else {
+            # $domain
+            if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") {
+                if ($x[10] ne "") {
+                  $domain{$x[9]} = $domain{$x[10]};
+                  }
+                else {
+                  # do nothing
+                  }
+            }
+            else {
+                if ($domain{$x[9]} =~ /$x[10]/) {
+                  # do nothing
+                  }
+                else {
+                  $domain{$x[9]} = $domain{$x[9]}." / ".$x[10];
+                  #print "INFO line $line - compound domain for 7aa:  GENE $x[0]   PROTEIN $x[1]   PROT_TYPE $x[2]   ACC_ID $x[3]   GENE_ID $x[4]   HU_CHR_LOC $x[5]   ORGANISM $x[6]   MOD_RSD $x[7]   SITE_GRP_ID $x[8]   SITE_+/-7_AA $x[9]   DOMAIN $domain{$x[9]}\n";
+                  }
+            }
+
+            # $ON_FUNCTION
+            if (!defined $ON_FUNCTION{$x[9]} || $ON_FUNCTION{$x[9]} eq "") {
+                $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_FUNCTION{$x[9]} = $ON_FUNCTION{$x[9]}." / ".$x[10];
+            }
+
+            # $ON_PROCESS
+            if (!defined $ON_PROCESS{$x[9]} || $ON_PROCESS{$x[9]} eq "") {
+                $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_PROCESS{$x[9]} = $ON_PROCESS{$x[9]}." / ".$x[10];
+            }
+
+            # $ON_PROT_INTERACT
+            if (!defined $ON_PROT_INTERACT{$x[9]}  || $ON_PROT_INTERACT{$x[9]} eq "") {
+                $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_PROT_INTERACT{$x[9]} = $ON_PROT_INTERACT{$x[9]}." / ".$x[10];
+            }
+
+            # $ON_OTHER_INTERACT
+            if (!defined $ON_OTHER_INTERACT{$x[9]} || $ON_OTHER_INTERACT{$x[9]} eq "") {
+                $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $ON_OTHER_INTERACT{$x[9]} = $ON_OTHER_INTERACT{$x[9]}." / ".$x[10];
+            }
+
+            # $notes
+            if (!defined $notes{$x[9]} || $notes{$x[9]} eq "") {
+                $notes{$x[9]} = $notes{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $notes{$x[9]} = $notes{$x[9]}." / ".$x[10];
+            }
+
+            # $organism
+            if (!defined $organism{$x[9]} || $organism{$x[9]} eq "") {
+                $organism{$x[9]} = $organism{$x[10]};
+            } elsif ($x[10] eq "") {
+                # do nothing
+            }
+            else {
+                $organism{$x[9]} = $organism{$x[9]}." / ".$x[10];
+            }
+        }
+    }
+}
+close IN4;
+
+print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";
+
+$stmth = $dbh->prepare("
+INSERT INTO Citation (
+  ObjectName,
+  CitationData
+) VALUES (?,?)
+");
+
+sub add_citation {
+    my ($cit_table, $cit_text, $cit_label) = @_;
+    $stmth->bind_param(1, $cit_table);
+    $stmth->bind_param(2, $cit_text);
+    if (not $stmth->execute()) {
+        print "Error writing $cit_label cit for table $cit_table: $stmth->errstr\n";
+    }
+}
+my ($citation_text, $citation_table);
+
+# PSP regulatory or kinase/substrate site
+$citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."';
+$citation_table = "PSP_Regulatory_site";
+add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
+$citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122';
+$citation_table = "PSP_Regulatory_site";
+add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
+
+# NetworKIN site
+$citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052';
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "NetworkKIN");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "NetworkKIN");
+$citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296';
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "NetworkKIN");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "NetworkKIN");
+$citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093';
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "NetworkKIN");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "NetworkKIN");
+
+# pSTY motifs
+$citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285';
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
+$citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159';
+$citation_table = "psp_gene_site";
+add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
+$citation_table = "psp_gene_site_view";
+add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
+
+
+###############################################################################################################################
+#
+#    Read the data file:
+#        1) find sequences that match the NetworKIN predictions
+#        2) find motifs that match the observed sequences
+#
+###############################################################################################################################
+
+print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";
+
+my $ppep_regsite_LUT_stmth;
+$ppep_regsite_LUT_stmth = $dbh->prepare("
+  INSERT INTO ppep_regsite_LUT (
+    ppep_id,
+    site_plusminus_7AA
+  ) VALUES (?,?)
+");
+
+my ($start_seconds, $start_microseconds) = gettimeofday;
+
+foreach my $peptide (keys %data) {
+    # find the unique phospho-motifs for this $peptide
+    my @all_motifs = ();
+    my $have_all_motifs = 0;
+    for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+        my $tmp_motif = $p_motifs{$peptide}[$i];
+        push(@all_motifs, $tmp_motif);
+        $have_all_motifs = 1;
+    }
+    if ($have_all_motifs == 1) {
+        for my $j (0 .. $#all_motifs) {
+            if (defined $all_motifs[$j]) {
+                $all_motifs[$j] =~ s/\d+-\[\s//;
+                $all_motifs[$j] =~ s/\s\]\-\d+//;
+            }
+        }
+    }
+    my %seen = ();
+    if ($have_all_motifs == 1) {
+        foreach my $a (@all_motifs) {
+            if (defined $a) {
+                if (exists($seen{$a})) {
+                    next;
+                } else {
+                    push(@{$unique_motifs{$peptide}}, $a);
+                    $seen{$a} = 1;
+                }
+            }
+            print "push(\@{\$unique_motifs{$peptide}}, $a);\n" if ($verbose);
+        }
+    }
+
+    # count the number of phospo-sites in the motif
+    my $number_pY = 0;
+    my $number_pSTY = 0;
+    if ($phospho_type eq 'y') {
+        if (defined(${$unique_motifs{$peptide}}[0])) {
+            while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {
+                $number_pY++;
+            }
+        }
+    }
+    if ($phospho_type eq 'sty') {
+        print "looking for unique_motifs for $peptide\n" if ($verbose);
+        if (defined(${$unique_motifs{$peptide}}[0])) {
+            while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {
+                $number_pSTY++;
+               print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose);
+            }
+        }
+    }
+
+
+    # search each of the unique motifs for matches
+    print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose);
+    for my $i (0 .. $#{$unique_motifs{$peptide}}) {
+        print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose);
+        my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
+        print "   --- matching unique motif $tmp_motif for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+        my $formatted_sequence;
+        if (($number_pY == 1) || ($number_pSTY == 1)) {
+            my $seq_plus5aa = "";
+            my $seq_plus7aa = "";
+            $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
+            print "       a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+            if ($phospho_type eq 'y') {
+                $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
+                $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
+            }
+            elsif ($phospho_type eq "sty") {
+                $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
+                $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
+            }
+
+            if (defined $seq_plus7aa) {
+                # commit the 7aa LUT records
+                $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} );
+                $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa             );
+                if (not $ppep_regsite_LUT_stmth->execute()) {
+                    print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n";
+                }
+            }
+            for my $i (0 .. $#kinases_observed) {
+                if (defined $seq_plus5aa) {
+                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];    #eg, should be PGRPLsSYGMD_PKCalpha
+                    if (exists($p_sequence_kinase -> {$tmp})) {
+                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE
+                    }
+                }
+            }
+            for my $i (0 .. $#motif_sequence) {
+                if ($peptide =~ /$motif_sequence[$i]/) {
+                    $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
+                }
+            }
+            for my $i (0 .. $#kinases_PhosphoSite) {
+                if (defined $seq_plus7aa) {
+                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];    #eg, should be RTPGRPLsSYGMDSR_PAK2
+                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
+                    }
+                }
+            }
+            if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
+                $seq_plus7aa_2{$peptide} = $seq_plus7aa;
+                $domain_2{$peptide} = $domain{$seq_plus7aa};
+                $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
+                $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
+                $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
+                $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
+                $notes_2{$peptide} = $notes{$seq_plus7aa};
+                $organism_2{$peptide} = $organism{$seq_plus7aa};
+            } else {
+            }
+        }
+        elsif (($number_pY > 1) || ($number_pSTY > 1)) {  #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
+            $formatted_sequence = $tmp_motif;
+            $seq_plus5aa = "";
+            $seq_plus7aa = "";
+            #Create the sequences with only one phosphorylation site
+            #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes  1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329  and  1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329
+
+            my (@sites, $offset, $next_p_site);
+            $sites[0] = index($tmp_motif, "p");
+            $offset = $sites[0] + 1;
+            $next_p_site = 0;
+            while ($next_p_site != -1) {
+                $next_p_site = index($tmp_motif, "p", $offset);
+                if ($next_p_site != -1) {
+                    push (@sites, $next_p_site);
+                }
+                $offset = $next_p_site+1;
+            }
+
+            my @pSTY_sequences;
+            for my $n (0 .. $#sites) {
+                $pSTY_sequences[$n] = $tmp_motif;
+                for (my $m = $#sites; $m >= 0; $m--) {
+                    if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
+                }
+            }
+
+            my @formatted_sequences;
+            for my $k (0 .. $#sites) {
+                $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
+            }
+
+            for my $k (0 .. $#formatted_sequences) {
+                print "       b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide  $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
+                if ($phospho_type eq 'y') {
+                    $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1];
+                    $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1];
+                }
+                elsif ($phospho_type eq "sty") {
+                    $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1];
+                    $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1];
+                }
+                for my $i (0 .. $#kinases_observed) {
+                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];    #eg, should look like REEILsEMKKV_PKCalpha
+                    if (exists($p_sequence_kinase -> {$tmp})) {
+                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
+                    }
+                }
+                $pSTY_sequence = $formatted_sequences[$k];
+                for my $i (0 .. $#motif_sequence) {
+                    if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
+                        $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X";
+                    }
+                }
+                for my $i (0 .. $#kinases_PhosphoSite) {
+                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];    #eg, should be RTPGRPLsSYGMDSR_PAK2
+                    #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp";
+                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
+                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
+                    }
+                }
+                if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) {
+                    $seq_plus7aa_2{$peptide} = $seq_plus7aa;
+
+                    # $domain
+                    if ($domain_2{$peptide} eq "") {
+                        $domain_2{$peptide} = $domain{$seq_plus7aa};
+                    }
+                    elsif ($domain{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa};
+                    }
+
+
+                    # $ON_FUNCTION_2
+                    if ($ON_FUNCTION_2{$peptide} eq "") {
+                        $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
+                    }
+                    elsif ($ON_FUNCTION{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $ON_FUNCTION_2{$peptide} = $ON_FUNCTION_2{$peptide}." / ".$ON_FUNCTION{$seq_plus7aa};
+                    }
+
+                    # $ON_PROCESS_2
+                    if ($ON_PROCESS_2{$peptide} eq "") {
+                        $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
+                    }
+                    elsif ($ON_PROCESS{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $ON_PROCESS_2{$peptide} = $ON_PROCESS_2{$peptide}." / ".$ON_PROCESS{$seq_plus7aa};
+                    }
+
+                    # $ON_PROT_INTERACT_2
+                    if ($ON_PROT_INTERACT_2{$peptide} eq "") {
+                        $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
+                    }
+                    elsif ($ON_PROT_INTERACT{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT_2{$peptide}." / ".$ON_PROT_INTERACT{$seq_plus7aa};
+                    }
+
+                    # $ON_OTHER_INTERACT_2
+                    if ($ON_OTHER_INTERACT_2{$peptide} eq "") {
+                        $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
+                    }
+                    elsif ($ON_OTHER_INTERACT{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT_2{$peptide}." / ".$ON_OTHER_INTERACT{$seq_plus7aa};
+                    }
+
+                    # $notes_2
+                    if ($notes_2{$peptide} eq "") {
+                        $notes_2{$peptide} = $notes{$seq_plus7aa};
+                    }
+                    elsif ($notes{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $notes_2{$peptide} = $notes_2{$peptide}." / ".$notes{$seq_plus7aa};
+                    }
+                    $notes_2{$peptide} = $notes{$seq_plus7aa};
+
+                    # $organism_2
+                    if ($organism_2{$peptide} eq "") {
+                        $organism_2{$peptide} = $organism{$seq_plus7aa};
+                    }
+                    elsif ($organism{$seq_plus7aa} eq "") {
+                        # do nothing
+                    }
+                    else {
+                        $organism_2{$peptide} = $organism_2{$peptide}." / ".$organism{$seq_plus7aa};
+                    }
+                    $organism_2{$peptide} = $organism{$seq_plus7aa};
+                } else {
+                } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa}))
+            } # for my $k (0 .. $#formatted_sequences)
+        } # if/else number of phosphosites
+    } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}})
+} # for each $peptide
+
+my ($end_seconds, $end_microseconds) = gettimeofday;
+
+my $delta_seconds = $end_seconds - $start_seconds;
+my $delta_microseconds = $end_microseconds - $start_microseconds;
+$delta_microseconds += 1000000 * $delta_seconds;
+my $key_count = keys(%data);
+print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($delta_microseconds / $key_count));
+
+($start_seconds, $start_microseconds) = gettimeofday;
+
+print "Writing PSP_Regulatory_site records\n";
+
+my $psp_regulatory_site_stmth = $dbh->prepare("
+    INSERT INTO PSP_Regulatory_site (
+      DOMAIN,
+      ON_FUNCTION,
+      ON_PROCESS,
+      ON_PROT_INTERACT,
+      ON_OTHER_INTERACT,
+      NOTES,
+      SITE_PLUSMINUS_7AA,
+      ORGANISM
+    ) VALUES (?,?,?,?,?,?,?,?)
+    ");
+
+foreach my $peptide (keys %data) {
+    if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
+        $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide});
+        $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide});
+        if (not $psp_regulatory_site_stmth->execute()) {
+            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n";
+        } else {
+        }
+    } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) {
+        print "\$domain_2{$peptide} is undefined\n";  #ACE
+    }
+}
+
+$dbh->{AutoCommit} = $auto_commit;
+# auto_commit implicitly finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish;
+$dbh->disconnect if ( defined $dbh );
+
+
+($end_seconds, $end_microseconds) = gettimeofday;
+
+$delta_seconds = $end_seconds - $start_seconds;
+$delta_microseconds = $end_microseconds - $start_microseconds;
+$delta_microseconds += 1000000 * $delta_seconds;
+$key_count = keys(%data);
+print sprintf("Write time is %d microseconds\n", ($delta_microseconds));
+
+print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
+#
+# Print to the output file
+#
+###############################################################################################################################
+open (OUT, ">$file_out") || die "could not open the fileout: $file_out";
+open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt";
+
+# print the header info
+print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n";
+print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";
+
+# print the PhosphoSite regulatory data
+print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";
+
+# print the sample names
+for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }
+
+# print the kinases and groups
+for my $i (0 .. $#kinases_observed) {
+    my $temp = $kinases_observed[$i]."_NetworKIN";
+    print OUT "$temp\t";
+    push(@kinases_observed_lbl, $temp);
+}
+for my $i (0 .. $#motif_sequence) {
+    print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t";
+}
+for my $i (0 .. $#kinases_PhosphoSite) {
+    my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite";
+    if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
+    if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
+    push(@phosphosites_observed_lbl, $temp);
+}
+
+# begin DDL-to-SQLite
+# ---
+$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef);
+$auto_commit = $dbh->{AutoCommit};
+$dbh->{AutoCommit} = 0;
+print "DB connection $dbh is to $db_out, opened for modification\n";
+
+my $sample_stmth;
+$sample_stmth = $dbh->prepare("
+  INSERT INTO sample (
+    id,
+    name
+  ) VALUES (?,?)
+");
+
+my $ppep_intensity_stmth;
+$ppep_intensity_stmth = $dbh->prepare("
+  INSERT INTO ppep_intensity (
+    ppep_id,
+    sample_id,
+    intensity
+  ) VALUES (?,?,?)
+");
+
+my $site_type_stmth;
+$site_type_stmth = $dbh->prepare("
+  insert into site_type (
+    id,
+    type_name
+  ) values (?,?)
+");
+
+my $ppep_gene_site_stmth;
+$ppep_gene_site_stmth = $dbh->prepare("
+  insert into ppep_gene_site (
+    ppep_id,
+    gene_names,
+    kinase_map,
+    site_type_id
+  ) values (?,?,?,?)
+");
+
+my $ppep_metadata_stmth;
+$ppep_metadata_stmth = $dbh->prepare("
+  INSERT INTO ppep_metadata
+    ( ppep_id
+    , protein_description
+    , gene_name
+    , FASTA_name
+    , phospho_sites
+    , motifs_unique
+    , accessions
+    , motifs_all_members
+    , domain
+    , ON_FUNCTION
+    , ON_PROCESS
+    , ON_PROT_INTERACT
+    , ON_OTHER_INTERACT
+    , notes
+  ) VALUES (
+    ?,?,?,?,?,?,?
+  , ?,?,?,?,?,?,?
+  )
+");
+# end DDL-to-SQLite
+# ...
+
+# begin store-to-SQLite "sample" table
+# ---
+# %sample_id_lut maps name -> ID
+for my $sample_name (keys %sample_id_lut) {
+    $sample_stmth->bind_param( 2, $sample_name                 );
+    $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} );
+    if (not $sample_stmth->execute()) {
+        print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): $sample_stmth->errstr\n";
+    }
+}
+# end store-to-SQLite "sample" table
+# ...
+
+# begin store-to-SQLite "site_type" table
+# ---
+sub add_site_type {
+    my ($site_type_id, $site_type_type_name) = @_;
+    $site_type_stmth->bind_param( 2, $site_type_type_name );
+    $site_type_stmth->bind_param( 1, $site_type_id        );
+    if (not $site_type_stmth->execute()) {
+        die "Error writing tuple ($site_type_id,$site_type_type_name): $site_type_stmth->errstr\n";
+    }
+}
+add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE});
+add_site_type($SITE_MOTIF, $site_description{$SITE_MOTIF});
+add_site_type($SITE_PHOSPHOSITE, $site_description{$SITE_PHOSPHOSITE});
+# end store-to-SQLite "site_type" table
+# ...
+
+foreach my $peptide (sort(keys %data)) {
+    next if (grep($peptide, @failed_matches));
+    my $ppep_id = $ppep_id_lut{$peptide};
+    my @ppep_metadata = ();
+    my @ppep_intensity = ();
+    my @gene = ();
+    my $gene_names;
+    my $j;
+    # Print the peptide itself
+    #   column 1: p-peptide
+    print OUT "$peptide\t";
+    push (@ppep_metadata, $ppep_id);
+    push (@ppep_intensity, $peptide);
+
+    my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR';
+    # skip over failed matches
+    print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond;
+    if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) {
+        # column 2: Protein description
+        # column 3: Gene name(s)
+        # column 4: FASTA name
+        # column 5: phospho-residues
+        # Column 6: UNIQUE phospho-motifs
+        # Column 7: accessions
+        # Column 8: ALL motifs with residue numbers
+        #          2                                     3   4   5   6   7   8
+        print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
+        print "No match found for '$peptide' in sequence database\n";
+        $gene_names = '$FAILED_MATCH_GENE_NAME';
+    } else {
+        my @description = ();
+        my %seen = ();
+        # Print just the protein description
+        for $i (0 .. $#{$names{$peptide}}) {
+            my $long_name = $names{$peptide}[$i];
+            my @naming_parts = split(/\sOS/, $long_name);
+            my @front_half = split(/\s/, $naming_parts[0]);
+            push(@description, join(" ", @front_half[1..($#front_half)]));
+        }
+        # column 2: Protein description
+        print OUT join(" /// ", @description), "\t";
+        push (@ppep_metadata, join(" /// ", @description));
+
+        # Print just the gene name
+        for $i (0 .. $#{$names{$peptide}}) {
+            my $tmp_gene = $names{$peptide}[$i];
+            $tmp_gene =~ s/^.*GN=//;
+            $tmp_gene =~ s/\s.*//;
+            if (!exists($seen{$tmp_gene})) {
+                push(@gene, $tmp_gene);
+                $seen{$tmp_gene} = $tmp_gene;
+            }
+        }
+        # column 3: Gene name(s)
+        $gene_names = join(" /// ", @gene);
+        print OUT $gene_names, "\t";
+        push (@ppep_metadata, join(" /// ", @gene));
+
+        # column 4: FASTA name
+        print OUT join(" /// ", @{$names{$peptide}}), "\t";
+        push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));
+
+        # column 5: phospho-residues
+        my $tmp_for_insert = "";
+        my $foobar;
+        for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
+            print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond;
+            if ($i < $#{ $matched_sequences{$peptide} }) {
+                if (defined $p_residues{$peptide}{$i}) {
+                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
+                    for $j (0 .. $#tmp_p_residues) {
+                        if ($j < $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                        }
+                        elsif ($j == $#tmp_p_residues) {
+                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
+                        }
+                    }
+                }
+            }
+            elsif ($i == $#{ $matched_sequences{$peptide} }) {
+                my $there_were_sites = 0;
+                if (defined $p_residues{$peptide}{$i}) {
+                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
+                    if ($#tmp_p_residues > 0) {
+                        for my $j (0 .. $#tmp_p_residues) {
+                            if ($j < $#tmp_p_residues) {
+                                if (defined $p_residues{$peptide}{$i}[$j]) {
+                                    my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                                    $foobar = $residues{$peptide}{$i}[$j];
+                                    if (defined $foobar) {
+                                        print OUT "$foobar";
+                                        print OUT "$tmp_site_for_printing, ";
+                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
+                                        $there_were_sites = 1;
+                                    }
+                                }
+                            }
+                            elsif ($j == $#tmp_p_residues) {
+                                if (defined $p_residues{$peptide}{$i}[$j]) {
+                                    $foobar = $residues{$peptide}{$i}[$j];
+                                    if (defined $foobar) {
+                                        my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;        # added 12.05.2012 for Justin's data
+                                        print OUT "$foobar";
+                                        print OUT "$tmp_site_for_printing\t";
+                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
+                                        $there_were_sites = 1;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                if (0 == $there_were_sites) {
+                  print OUT "\t";
+                }
+            }
+        }
+        print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond;
+        push (@ppep_metadata, $tmp_for_insert);
+
+        # Column 6: UNIQUE phospho-motifs
+        print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
+        push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));
+
+        # Column 7: accessions
+        if (defined $accessions{$peptide}) {
+            print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
+            push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
+        } else {
+            print OUT "\t";
+            push (@ppep_metadata, "");
+        }
+
+        # Column 8: ALL motifs with residue numbers
+        if (defined $p_motifs{$peptide}) {
+            print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
+            push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
+        } else {
+            print OUT "\t";
+            push (@ppep_metadata, "");
+        }
+
+    }
+
+    # Print the PhosphoSite regulatory data
+
+    if (defined $domain_2{$peptide})            { print OUT "$domain_2{$peptide}\t";            } else { print OUT "\t"; }
+    if (defined $ON_FUNCTION_2{$peptide})       { print OUT "$ON_FUNCTION_2{$peptide}\t";       } else { print OUT "\t"; }
+    if (defined $ON_PROCESS_2{$peptide})        { print OUT "$ON_PROCESS_2{$peptide}\t";        } else { print OUT "\t"; }
+    if (defined $ON_PROT_INTERACT_2{$peptide})  { print OUT "$ON_PROT_INTERACT_2{$peptide}\t";  } else { print OUT "\t"; }
+    if (defined $ON_OTHER_INTERACT_2{$peptide}) { print OUT "$ON_OTHER_INTERACT_2{$peptide}\t"; } else { print OUT "\t"; }
+    if (defined $notes_2{$peptide})             { print OUT "$notes_2{$peptide}\t";             } else { print OUT "\t"; }
+
+    if (defined $domain_2{$peptide})            { push (@ppep_metadata, $domain_2{$peptide});            } else { push(@ppep_metadata, ""); }
+    if (defined $ON_FUNCTION_2{$peptide})       { push (@ppep_metadata, $ON_FUNCTION_2{$peptide});       } else { push(@ppep_metadata, ""); }
+    if (defined $ON_PROCESS_2{$peptide})        { push (@ppep_metadata, $ON_PROCESS_2{$peptide});        } else { push(@ppep_metadata, ""); }
+    if (defined $ON_PROT_INTERACT_2{$peptide})  { push (@ppep_metadata, $ON_PROT_INTERACT_2{$peptide});  } else { push(@ppep_metadata, ""); }
+    if (defined $ON_OTHER_INTERACT_2{$peptide}) { push (@ppep_metadata, $ON_OTHER_INTERACT_2{$peptide}); } else { push(@ppep_metadata, ""); }
+    if (defined $notes_2{$peptide})             { push (@ppep_metadata, $notes_2{$peptide});             } else { push(@ppep_metadata, ""); }
+
+    # begin store-to-SQLite "ppep_metadata" table
+    # ---
+    for $i (1..14) {
+        $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]);
+    }
+    if (not $ppep_metadata_stmth->execute()) {
+        print "Error writing ppep_metadata row for phosphopeptide $ppep_metadata[$i]: $ppep_metadata_stmth->errstr\n";
+    }
+    # ...
+    # end store-to-SQLite "ppep_metadata" table
+
+    # Print the data
+    @tmp_data = ();
+    foreach (@{$data{$peptide}}) {
+        push(@tmp_data, $_);
+    }
+    print OUT join("\t", @tmp_data), "\t";
+
+    # begin store-to-SQLite "ppep_intensity" table
+    # ---
+    # commit the sample intensities
+    $i = 0;
+    foreach (@{$data{$peptide}}) {
+        my $intense = $_;
+        $ppep_intensity_stmth->bind_param( 1, $ppep_id                     );
+        $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} );
+        $ppep_intensity_stmth->bind_param( 3, $intense                     );
+        if (not $ppep_intensity_stmth->execute()) {
+            print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n";
+        }
+        $i += 1;
+    }
+    # ...
+    # end store-to-SQLite "ppep_intensity" table
+
+    # print the kinase-substrate data
+    for my $i (0 .. $#kinases_observed) {
+        if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
+            print OUT "X\t";
+            my $NetworKIN_label = $kinases_observed[$i]."_NetworKIN";
+            print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
+            # begin store-to-SQLite "ppep_gene_site" table
+            # ---
+            $ppep_gene_site_stmth->bind_param(1, $ppep_id);               # ppep_gene_site.ppep_id
+            $ppep_gene_site_stmth->bind_param(2, $gene_names);            # ppep_gene_site.gene_names
+            $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label);       # ppep_gene_site.kinase_map
+            $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE); # ppep_gene_site.site_type_id
+            if (not $ppep_gene_site_stmth->execute()) {
+                print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): $ppep_gene_site_stmth->errstr\n";
+            }
+            # ...
+            # end store-to-SQLite "ppep_gene_site" table
+        }
+        else { print OUT "\t";}
+    }
+    my %wrote_motif;
+    my $motif_parts_0;
+    for my $i (0 .. $#motif_sequence) {
+        if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) {
+            print OUT "X\t";
+            $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
+            my $key = "$peptide\t$gene_names\t$motif_parts_0";
+            if (!exists($wrote_motif{$key})) {
+                $wrote_motif{$key} = $key;
+                print MELT "$peptide\t$gene_names\t$site_description{$SITE_MOTIF}\t$motif_parts_0\n";
+                # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n";            #debug
+                # begin store-to-SQLite "ppep_gene_site" table
+                # ---
+                $ppep_gene_site_stmth->bind_param(1, $ppep_id);        # ppep_gene_site.ppep_id
+                $ppep_gene_site_stmth->bind_param(2, $gene_names);     # ppep_gene_site.gene_names
+                $ppep_gene_site_stmth->bind_param(3, $motif_parts_0); # ppep_gene_site.kinase_map
+                $ppep_gene_site_stmth->bind_param(4, $SITE_MOTIF);     # ppep_gene_site.site_type_id
+                if (not $ppep_gene_site_stmth->execute()) {
+                    print "Error writing tuple ($peptide,$gene_names,$motif_parts_0): $ppep_gene_site_stmth->errstr\n";
+                }
+                # ...
+                # end store-to-SQLite "ppep_gene_site" table
+            }
+        }
+        else { print OUT "\t";}
+    }
+    for my $i (0 .. $#kinases_PhosphoSite) {
+        if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
+            print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
+            if ($i < $#kinases_PhosphoSite) {
+                print OUT "X\t";
+            }
+            else {
+                print OUT "X\n";
+            }
+            # begin store-to-SQLite "ppep_gene_site" table
+            # ---
+            $ppep_gene_site_stmth->bind_param(1, $ppep_id);                       # ppep_gene_site.ppep_id
+            $ppep_gene_site_stmth->bind_param(2, $gene_names);                    # ppep_gene_site.gene_names
+            $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]); # ppep_gene_site.kinase_map
+            $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE);              # ppep_gene_site.site_type_id
+            if (not $ppep_gene_site_stmth->execute()) {
+                print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): $ppep_gene_site_stmth->errstr\n";
+            }
+            # ...
+            # end store-to-SQLite "ppep_gene_site" table
+        }
+        else {
+            if ($i < $#kinases_PhosphoSite) {
+                print OUT "\t";
+            }
+            elsif ($i == $#kinases_PhosphoSite) {
+                print OUT "\n";
+            }
+        }
+    }
+}
+
+close OUT;
+close MELT;
+$ppep_gene_site_stmth->finish;
+print "begin DB commit at " . format_localtime_iso8601() . "\n";
+$dbh->{AutoCommit} = $auto_commit;
+$dbh->disconnect if ( defined $dbh );
+
+print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n";
+
+###############################################################################################################################
--- a/macros.xml	Tue Apr 05 01:47:37 2022 +0000
+++ b/macros.xml	Wed Apr 06 05:47:34 2022 +0000
@@ -1,19 +1,17 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.1.7</token>
+    <token name="@TOOL_VERSION@">0.1.8</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="0.3.3"   >openblas</requirement>
             <requirement type="package" version="0.4.0"   >r-sass</requirement>
             <requirement type="package" version="1.14.2"  >r-data.table</requirement>
+            <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
+            <requirement type="package" version="5.26.2"  >perl</requirement>
+            <requirement type="package" version="0.3.3"   >openblas</requirement>
             <requirement type="package" version="1.22.2"  >numpy</requirement>
-            <requirement type="package" version="1.4.0"   >pyahocorasick</requirement>
-            <requirement type="package" version="1.4.0"   >r-stringr</requirement>
-            <requirement type="package" version="1.4.1"   >pandas</requirement>
-            <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
-            <requirement type="package" version="1.64"    >perl-dbd-sqlite</requirement>
-            <requirement type="package" version="1.7.1"   >r-optparse</requirement>
-            <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
+            <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
+            <requirement type="package" version="0.3.7"   >r-vioplot</requirement>
+            <requirement type="package" version="0.37"    >r-tinytex</requirement>
             <!--
             It would be nice to use conda-forge/texlive-core rather than r-tinytex because the
             former installs texlive when the package is built, but issue 23 blocked PDF-creation.
@@ -21,12 +19,14 @@
             with boxes) unless I specified the build as well as the version when building a
             conda environment, e.g.:  texlive-core=20210325=h97429d4_0
             -->
-            <requirement type="package" version="0.37"    >r-tinytex</requirement>
-            <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
             <requirement type="package" version="3.9.10"  >python</requirement>
-            <requirement type="package" version="5.26.2"  >perl</requirement>
             <requirement type="package" version="0.9.4"   >r-latex2exp</requirement>
-            <requirement type="package" version="0.3.7"   >r-vioplot</requirement>
+            <requirement type="package" version="1.4.0"   >r-stringr</requirement>
+            <requirement type="package" version="1.64"    >perl-dbd-sqlite</requirement>
+            <requirement type="package" version="1.4.0"   >pyahocorasick</requirement>
+            <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
+            <requirement type="package" version="1.4.1"   >pandas</requirement>
+            <requirement type="package" version="1.7.1"   >r-optparse</requirement>
         </requirements>
     </xml>
 </macros>
--- a/mqppep_mrgfltr.py	Tue Apr 05 01:47:37 2022 +0000
+++ b/mqppep_mrgfltr.py	Wed Apr 06 05:47:34 2022 +0000
@@ -448,8 +448,8 @@
                 UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
                 if OS != N_A:
                     Description += " OS=" + OS
-                if OX != N_A:
-                    Description += " OX=" + str(int(OX))
+                if OX != -1:
+                    Description += " OX=" + str(OX)
                 if Gene_Name != N_A:
                     Description += " GN=" + Gene_Name
                 if PE != N_A:
--- a/search_ppep.py	Tue Apr 05 01:47:37 2022 +0000
+++ b/search_ppep.py	Wed Apr 06 05:47:34 2022 +0000
@@ -103,12 +103,27 @@
             , PE
             , SV
             , Sequence
-            , Description || ' OS=' ||
-                Organism_Name || ' OX=' || Organism_ID ||
-                CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN='|| Gene_Name END ||
-                CASE WHEN PE = 'N/A' THEN '' ELSE ' PE='|| PE END ||
-                CASE WHEN SV = 'N/A' THEN '' ELSE ' SV='|| SV END
-                                                       AS long_description
+            , Description ||
+                CASE WHEN Organism_Name = 'N/A'
+                     THEN ''
+                     ELSE ' OS='|| Organism_Name
+                     END ||
+                CASE WHEN Organism_ID = -1
+                     THEN ''
+                     ELSE ' OX='|| Organism_ID
+                     END ||
+                CASE WHEN Gene_Name = 'N/A'
+                     THEN ''
+                     ELSE ' GN='|| Gene_Name
+                     END ||
+                CASE WHEN PE = 'N/A'
+                     THEN ''
+                     ELSE ' PE='|| PE
+                     END ||
+                CASE WHEN SV = 'N/A'
+                     THEN ''
+                     ELSE ' SV='|| SV
+                     END AS long_description
             , Database
           FROM UniProtKB
           ;