Mercurial > repos > eschen42 > mqppep_preproc
diff PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 23:7560a4e80a1e draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 0c7ca054e77e042c8a584c9903073da064df7d8b
| author | eschen42 |
|---|---|
| date | Thu, 30 Jun 2022 16:15:57 +0000 |
| parents | ee7dedf9c79b |
| children |
line wrap: on
line diff
--- a/PhosphoPeptide_Upstream_Kinase_Mapping.pl Wed Apr 13 19:48:01 2022 +0000 +++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Thu Jun 30 16:15:57 2022 +0000 @@ -62,18 +62,18 @@ my (@kinases_observed, $kinases); my (@kinases_observed_lbl, @phosphosites_observed_lbl); my ($p_sequence_kinase, $p_sequence, $kinase); -my (@motif_sequence, %motif_type, %motif_count); +my (@motif_sequence, @motif_description, @motif_type_key_ary, %motif_type, %motif_count); my (@kinases_PhosphoSite, $kinases_PhosphoSite); my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); my (%regulatory_sites_PhosphoSite_hash); my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism); my (%unique_motifs); -my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); +my ($kinase_substrate_NetworKIN_matches, $kinase_substrate_PhosphoSite_matches); my %psp_regsite_protein_2; my (%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2); my @timeData; my $PhosphoSitePlusCitation; -my %site_description; +my (%site_description, %site_id); my %kinase_substrate_NetworKIN_matches; my %kinase_motif_matches; @@ -997,12 +997,21 @@ ############################################################################################################################### # file format (tab separated): -# x[0] = primary key (character), e.g., '17' or '23a' +# x[0] = quasi-primary key (character), e.g., '17' or '23a' # x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)' # x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)' - -my $SITE_MOTIF = 2; -$site_description{$SITE_MOTIF} = "motif"; +# "counter" "pcre" "symbol" "description" "pubmed_id" "classification" "source" +# "1" "R.R..(pS|pT)(F|L)" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8985174" "kinase substrate" "HPRD" +# x[3] = old description, i.e., description in Amanchy (HPRD) and Phosida tables +# x[4] = pubmed id +# x[5] = classification +# x[6] = source (Phosida or HPRD) +my $SITE_HPRD = 2; +$site_description{$SITE_HPRD} = "HPRD"; +$site_id{$site_description{$SITE_HPRD}} = $SITE_HPRD; +my $SITE_PHOSIDA = 4; +$site_description{$SITE_PHOSIDA} = "Phosida"; +$site_id{$site_description{$SITE_PHOSIDA}} = $SITE_PHOSIDA; open (IN2, "$motifs_in") or die "I couldn't find $motifs_in\n"; print "Reading the Motifs file: $motifs_in\n"; @@ -1010,17 +1019,30 @@ while (<IN2>) { chomp; my (@x) = split(/\t/); + my $tmp_motif_description; + if ($#x == 6) { # weirdly, a @list of length seven has $#list == 6 + # remove double-quotes which are helpful or necessary for Excel + $x[6] =~ s/\"//g; + $tmp_motif_description = $x[6]; + } else { + $tmp_motif_description = "motif"; + } for my $i (0 .. 2) { + # remove any embedded CR or LF (none should exist) $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; + # remove double-quotes which are helpful or necessary for Excel $x[$i] =~ s/\"//g; } - if (exists ($motif_type{$x[1]})) { - $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2]; + if (exists ($motif_type{$x[2]})) { + #ACE-2022.06.20 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2]; + $motif_type{$x[2]} = $motif_type{$x[2]}."|".$x[2]; } else { - $motif_type{$x[1]} = $x[2]; + $motif_type{$x[2]} = $x[2]; $motif_count{$x[1]} = 0; push (@motif_sequence, $x[1]); + push (@motif_description, $tmp_motif_description); + push (@motif_type_key_ary, $x[2]) } } close (IN2); @@ -1111,7 +1133,6 @@ # # Columns: # (0) GENE -# (1) PROTEIN --> #ACE %psp_regsite_protein # (2) PROT_TYPE # (3) ACC_ID # (4) GENE_ID @@ -1503,8 +1524,9 @@ } } for my $i (0 .. $#motif_sequence) { + print "matching $motif_sequence[$i]" if ($verbose); if ($peptide =~ /$motif_sequence[$i]/) { - $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; + $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X"; } } for my $i (0 .. $#kinases_PhosphoSite) { @@ -1578,7 +1600,7 @@ $pSTY_sequence = $formatted_sequences[$k]; for my $i (0 .. $#motif_sequence) { if ($pSTY_sequence =~ /$motif_sequence[$i]/) { - $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; + $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X"; } } for my $i (0 .. $#kinases_PhosphoSite) { @@ -1741,6 +1763,8 @@ # Print to the output file # ############################################################################################################################### + + open (OUT, ">$file_out") || die "could not open the fileout: $file_out"; open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt"; @@ -1760,11 +1784,12 @@ print OUT "$temp\t"; push(@kinases_observed_lbl, $temp); } -for my $i (0 .. $#motif_sequence) { - print OUT "$motif_type{$motif_sequence[$i]} ($motif_sequence[$i])\t"; +my @motif_type_keys = keys %motif_type; +for my $i (1 .. $#motif_type_keys) { + print OUT "$motif_type{$motif_type_keys[$i]}\t"; } for my $i (0 .. $#kinases_PhosphoSite) { - my $temp = $kinases_PhosphoSite[$i]."_PhosphoSite"; + my $temp = $kinases_PhosphoSite[$i]; # ."_PhosphoSite"; if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; } if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; } push(@phosphosites_observed_lbl, $temp); @@ -1861,8 +1886,9 @@ } } add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE}); -add_site_type($SITE_MOTIF, $site_description{$SITE_MOTIF}); -add_site_type($SITE_PHOSPHOSITE, $site_description{$SITE_PHOSPHOSITE}); +add_site_type($SITE_HPRD , $site_description{$SITE_HPRD }); +add_site_type($SITE_PHOSIDA , $site_description{$SITE_PHOSIDA }); +add_site_type($SITE_PHOSPHOSITE , $site_description{$SITE_PHOSPHOSITE }); # end store-to-SQLite "site_type" table # ... @@ -2070,7 +2096,7 @@ for my $i (0 .. $#kinases_observed) { if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) { print OUT "X\t"; - my $NetworKIN_label = $kinases_observed[$i]."_NetworKIN"; + my $NetworKIN_label = $kinases_observed[$i]; #."_NetworKIN"; print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n"; # begin store-to-SQLite "ppep_gene_site" table # --- @@ -2088,26 +2114,36 @@ } my %wrote_motif; my $motif_parts_0; - for my $i (0 .. $#motif_sequence) { - if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) { + my @motif_split; + my $one_motif; + + for my $i (0 .. $#motif_type_keys) { + if (exists($kinase_motif_matches{$peptide}{$motif_type_keys[$i]})) { print OUT "X\t"; - $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i]; - my $key = "$peptide\t$gene_names\t$motif_parts_0"; - if (!exists($wrote_motif{$key})) { - $wrote_motif{$key} = $key; - print MELT "$peptide\t$gene_names\t$site_description{$SITE_MOTIF}\t$motif_parts_0\n"; - # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug - # begin store-to-SQLite "ppep_gene_site" table - # --- - $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id - $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names - $ppep_gene_site_stmth->bind_param(3, $motif_parts_0); # ppep_gene_site.kinase_map - $ppep_gene_site_stmth->bind_param(4, $SITE_MOTIF); # ppep_gene_site.site_type_id - if (not $ppep_gene_site_stmth->execute()) { - print "Error writing tuple ($peptide,$gene_names,$motif_parts_0): $ppep_gene_site_stmth->errstr\n"; + #ACE-2022.06.20 $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i]; + $motif_parts_0 = $motif_type{$motif_type_keys[$i]}; + @motif_split = split("[|]", $motif_parts_0); + #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0"; + for my $j (0 .. $#motif_split) { + $one_motif = $motif_split[$j]; + #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0"; + my $key = "$peptide\t$gene_names\t$one_motif"; + if (!exists($wrote_motif{$key})) { + $wrote_motif{$key} = $key; + print MELT "$peptide\t$gene_names\t$motif_description[$i]\t$one_motif\n"; + # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug + # begin store-to-SQLite "ppep_gene_site" table + # --- + $ppep_gene_site_stmth->bind_param(1, $ppep_id); # ppep_gene_site.ppep_id + $ppep_gene_site_stmth->bind_param(2, $gene_names); # ppep_gene_site.gene_names + $ppep_gene_site_stmth->bind_param(3, $one_motif); # ppep_gene_site.kinase_map + $ppep_gene_site_stmth->bind_param(4, $site_id{$motif_description[$i]}); # ppep_gene_site.site_type_id + if (not $ppep_gene_site_stmth->execute()) { + print "Error writing tuple ($peptide,$gene_names,$one_motif): $ppep_gene_site_stmth->errstr\n"; + } + # ... + # end store-to-SQLite "ppep_gene_site" table } - # ... - # end store-to-SQLite "ppep_gene_site" table } } else { print OUT "\t";}
