Mercurial > repos > eschen42 > mqppep_anova
diff PhosphoPeptide_Upstream_Kinase_Mapping.pl @ 7:d728198f1ba5 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9a0fa6d0f9aadc069a5551a54da6daf307885637"
author | eschen42 |
---|---|
date | Tue, 15 Mar 2022 00:35:16 +0000 |
parents | 922d309640db |
children |
line wrap: on
line diff
--- a/PhosphoPeptide_Upstream_Kinase_Mapping.pl Fri Mar 11 20:04:05 2022 +0000 +++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Tue Mar 15 00:35:16 2022 +0000 @@ -25,7 +25,6 @@ ############################################################################################################################### use strict; -#ACE use warnings; use warnings 'FATAL' => 'all'; use Getopt::Std; @@ -67,7 +66,6 @@ my (@kinases_PhosphoSite, $kinases_PhosphoSite); my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); my (%regulatory_sites_PhosphoSite_hash); -#ACE my %psp_regsite_protein; my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism); my (%unique_motifs); my ($kinase_substrate_NetworKIN_matches, $kinase_motif_matches, $kinase_substrate_PhosphoSite_matches); @@ -196,8 +194,6 @@ getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ; -#ACE print %opts; #ACE -#ACE print "\n"; #ACE if (exists($opts{'h'})) { usage(); @@ -221,15 +217,6 @@ $fasta_in = $opts{'f'}; $use_sqlite = 0; } -#ACE if (exists($opts{'s'}) && -e $opts{'s'}) { -#ACE $use_sqlite = 1; -#ACE $dbfile = $opts{'s'}; -#ACE } elsif (!exists($opts{'f'}) || !-e $opts{'f'}) { -#ACE die('Neither input FASTA file nor input SQLite file was found'); -#ACE } else { -#ACE $use_sqlite = 0; -#ACE $fasta_in = $opts{'f'}; -#ACE } my $species; if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) { $species = 'human'; @@ -426,8 +413,6 @@ push (@databases, $x[0]); push (@accessions, $x[1]); push (@names, $x[2]); - #ACE print "names $x[2]\n"; - #ACE print "--- $_\n"; pseudo_sed(); s/$/\t/; push (@parsed_fasta, $_); @@ -439,7 +424,6 @@ } $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0]; } - #ACE print "... '$parsed_fasta[$#accessions]'\n"; } close IN1; print "Done Reading FASTA file $fasta_in\n"; @@ -614,7 +598,6 @@ $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose"); } else { $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in"); - #ACE DELETEME $i = system("\$CONDA_PREFIX/bin/python $dirname/search_ppep.py -u $db_out -p $file_in --verbose"); } if ($i) { print "python $dirname/search_ppep.py -u $db_out -p $file_in\n exited with exit code $i\n"; @@ -634,7 +617,6 @@ # ############################################################################################################################### -#ACE print OUT "$headers\tFormatted Motifs\tUnique Motifs\tPhospho-site(s)\tAccessions(s)\tName(s)\n"; print "--- Match the non_p_peptides to the \@sequences array:\n"; @@ -750,7 +732,6 @@ } # Prepare counter and phosphopeptide tracker for next row - #ACE print "counter: $counter; phosphpepetide: $current_ppep\n"; $former_ppep = $current_ppep; $counter -= 1; @@ -822,15 +803,11 @@ my @matches = @{$matched_sequences{$peptide_to_match}}; @tmp_motifs_array = (); for my $i (0 .. $#matches) { - #ACE print "Matching $peptide_to_match to match $i\n"; - #ACE print "\$sites{\$peptide_to_match}[\$i] $sites{$peptide_to_match}[$i]\n"; # Find the location of the phospo-site in the sequence(s) $tmp_site = 0; my $offset = 0; my $tmp_p_peptide = $peptide_to_match; - #ACE print "peptide_to_match: $peptide_to_match at position $sites{$peptide_to_match}[$i] in sequence $matched_sequences{$peptide_to_match}[$i]\n"; $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g; - #ACE print "tmp_p_peptide: $tmp_p_peptide\n"; # Find all phosphorylated residues in the p_peptide @p_sites = (); @@ -870,9 +847,7 @@ my $arg1 = $matched_sequences{$peptide_to_match}[$i]; if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) { - #ACE print "\$tmp_motif = substr($arg1, $arg2, $total_length)\n"; $tmp_motif = substr($arg1, $arg2, $total_length); - #ACE print "tmp_motif = $tmp_motif\ti = $i\tpeptide_to_match = $peptide_to_match\tmatched_sequences{peptide_to_match}[i] = $matched_sequences{$peptide_to_match}[$i]\targ2 = $arg2\targ3 = $total_length\n"; # Put the "p" back in front of the appropriate phospho-residue(s). my (@tmp_residues, $tmp_position); @@ -883,7 +858,6 @@ } else { $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0]; } - #ACE print "Line 431: p_sites = $p_sites[$m]\ttmp_position = $tmp_position\ttmp_motif = $tmp_motif\n"; if ($tmp_position < length($tmp_motif) + 1) { push (@tmp_residues, substr($tmp_motif, $tmp_position, 1)); if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");} @@ -1032,9 +1006,7 @@ $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } if ($line != 0) { - #ACE FUE if (($species eq $species) && ($species eq $species)) { if (($species eq $x[3]) && ($species eq $x[8])) { - #ACE print "KIN_ORGANISM is '$x[3]' and SUB_ORGANISM is '$x[8]', line: $line\n"; if (exists ($kinases_PhosphoSite -> {$x[0]})) { #do nothing } @@ -1154,7 +1126,6 @@ open (IN4, "$PSP_Regulatory_Sites_in") or die "I couldn't find $PSP_Regulatory_Sites_in\n"; print "Reading the PhosphoSite regulatory site data: $PSP_Regulatory_Sites_in\n"; -#ACE $i = system("head -n 4 $PSP_Regulatory_Sites_in"); $line = -1; while (<IN4>) { @@ -1177,7 +1148,6 @@ $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g; } my $found_GENE=0; - #ACE print STDERR "line $line: $_\n"; if ( (not exists($x[0])) ) { next; } @@ -1197,19 +1167,13 @@ } } elsif ($line != 0) { - #ACE print "PSPReg $line: $_\n" if ($x[9] eq 'KGQKYFDsGDYNMAK'); - #ACE FUE if ($species ne $species) { if ($species ne $x[6]) { # Do nothing - this record was filtered out by the species filter - #ACE print "PSP_regsite line rejected: $line\n"; } elsif (!exists($regulatory_sites_PhosphoSite_hash{$x[9]})) { - #ACE print "testing \$domain{\$x[9]} for \$regulatory_sites_PhosphoSite_hash{$x[9]}\n" if ($x[9] eq 'KGQKYFDsGDYNMAK'); #ACE if (!defined $domain{$x[9]} || $domain{$x[9]} eq "") { - #ACE print "adding found \$regulatory_sites_PhosphoSite_hash{$x[9]}\n" if ($x[9] eq 'KGQKYFDsGDYNMAK'); #ACE $regulatory_sites_PhosphoSite_hash{$x[9]} = $x[9]; $domain{$x[9]} = $x[10]; - #ACE $psp_regsite_protein{$x[9]} = $x[1]; $ON_FUNCTION{$x[9]} = $x[11]; $ON_PROCESS{$x[9]} = $x[12]; $ON_PROT_INTERACT{$x[9]} = $x[13]; @@ -1226,12 +1190,9 @@ } else { # do nothing - #ACE print "WARNING line $line - no domain or 7aa for: GENE $x[0] PROTEIN $x[1] PROT_TYPE $x[2] ACC_ID $x[3] GENE_ID $x[4] HU_CHR_LOC $x[5] ORGANISM $x[6] MOD_RSD $x[7] SITE_GRP_ID $x[8] SITE_+/-7_AA $x[9] DOMAIN $x[10]\n"; - #ACE print "$_\n"; } } else { - #ACE print "Checking $domain{$x[9]} =~ /$x[10]/\n"; if ($domain{$x[9]} =~ /$x[10]/) { # do nothing } @@ -1453,10 +1414,8 @@ if (($number_pY == 1) || ($number_pSTY == 1)) { my $seq_plus5aa = ""; my $seq_plus7aa = ""; - #ACE print "tmp_motif is $tmp_motif before replacement\n"; $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type); print " a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); - #ACE print "formatted_sequence is $formatted_sequence after replacement\n"; if ($phospho_type eq 'y') { $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1]; $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1]; @@ -1474,14 +1433,10 @@ print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: $ppep_regsite_LUT_stmth->errstr\n"; } } - #ACE print "seq_plus5aa is $seq_plus5aa \n"; - #ACE print "seq_plus7aa is $seq_plus7aa \n"; for my $i (0 .. $#kinases_observed) { if (defined $seq_plus5aa) { my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should be PGRPLsSYGMD_PKCalpha if (exists($p_sequence_kinase -> {$tmp})) { - #ACE print($tmp."\t"); - #ACE print(($p_sequence_kinase -> {$tmp})."\n"); #ACE $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE } } @@ -1489,7 +1444,6 @@ for my $i (0 .. $#motif_sequence) { if ($peptide =~ /$motif_sequence[$i]/) { $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; - #ACE print "\$kinase_motif_matches{$peptide}{$motif_sequence[$i]} = 'X'; $motif_type{$motif_sequence[$i]}\n"; #ACE } } for my $i (0 .. $#kinases_PhosphoSite) { @@ -1500,12 +1454,9 @@ } } } - #ACE print "checking for existence of \$regulatory_sites_PhosphoSite_hash{$seq_plus7aa}\n"; #ACE if (exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) { - #ACE print "found regulatory_sites_PhosphoSite_hash{$seq_plus7aa}\n"; #ACE $seq_plus7aa_2{$peptide} = $seq_plus7aa; $domain_2{$peptide} = $domain{$seq_plus7aa}; - #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein{$seq_plus7aa}; $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa}; $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa}; $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa}; @@ -1513,12 +1464,10 @@ $notes_2{$peptide} = $notes{$seq_plus7aa}; $organism_2{$peptide} = $organism{$seq_plus7aa}; } else { - #ACE print "c not found \$regulatory_sites_PhosphoSite_hash{{$seq_plus7aa}\n"; #ACE } } elsif (($number_pY > 1) || ($number_pSTY > 1)) { #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2 $formatted_sequence = $tmp_motif; - #ACE print "formatted_sequence is $formatted_sequence \n"; $seq_plus5aa = ""; $seq_plus7aa = ""; #Create the sequences with only one phosphorylation site @@ -1546,14 +1495,11 @@ my @formatted_sequences; for my $k (0 .. $#sites) { - #ACE print "pSTY_sequences[k] is $pSTY_sequences[$k] before replacement\n"; $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type); - #ACE print "formatted_sequences[k] is $formatted_sequences[$k] \n"; } for my $k (0 .. $#formatted_sequences) { print " b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose); - #ACE print "formatted_sequences[k] for phosphotype $phospho_type is $formatted_sequences[$k] \n"; if ($phospho_type eq 'y') { $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1]; $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1]; @@ -1562,21 +1508,15 @@ $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1]; $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1]; } - #ACE print "seq_plus5aa is $seq_plus5aa \n"; - #ACE print "seq_plus7aa is $seq_plus7aa \n"; for my $i (0 .. $#kinases_observed) { my $tmp = $seq_plus5aa."_".$kinases_observed[$i]; #eg, should look like REEILsEMKKV_PKCalpha - #ACE print "seq_plus5aa._.kinases_observed[i] is $tmp\n"; #ACE if (exists($p_sequence_kinase -> {$tmp})) { $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; - #ACE print "$tmp matched\n"; } } $pSTY_sequence = $formatted_sequences[$k]; - #ACE print "trying pSTY_sequence $pSTY_sequence \n"; for my $i (0 .. $#motif_sequence) { if ($pSTY_sequence =~ /$motif_sequence[$i]/) { - #ACE print "match for pSTY_sequence $pSTY_sequence was $motif_sequence[$i]\n"; $kinase_motif_matches{$peptide}{$motif_sequence[$i]} = "X"; } } @@ -1585,11 +1525,9 @@ #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp"; if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) { $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X"; - #ACE print "$tmp matched \n"; } } if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) { - #ACE print "ACE processing seq_plus7aa '$domain{$seq_plus7aa}'\n"; #ACE $seq_plus7aa_2{$peptide} = $seq_plus7aa; # $domain @@ -1603,16 +1541,6 @@ $domain_2{$peptide} = $domain_2{$peptide}." / ".$domain{$seq_plus7aa}; } - #ACE # $psp_regsite_protein - #ACE if ($psp_regsite_protein_2{$peptide} eq "") { - #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein{$seq_plus7aa}; - #ACE } - #ACE elsif ($psp_regsite_protein{$seq_plus7aa} eq "") { - #ACE # do nothing - #ACE } - #ACE else { - #ACE $psp_regsite_protein_2{$peptide} = $psp_regsite_protein_2{$peptide}." / ".$psp_regsite_protein{$seq_plus7aa}; - #ACE } # $ON_FUNCTION_2 if ($ON_FUNCTION_2{$peptide} eq "") { @@ -1682,7 +1610,6 @@ } $organism_2{$peptide} = $organism{$seq_plus7aa}; } else { - #ACE print "d not found \$regulatory_sites_PhosphoSite_hash{{$seq_plus7aa}}\n"; } # if (exists($regulatory_sites_PhosphoSite -> {$seq_plus7aa})) } # for my $k (0 .. $#formatted_sequences) } # if/else number of phosphosites @@ -1716,7 +1643,6 @@ foreach my $peptide (keys %data) { if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) { - #ACE print "writing domain $domain_2{$peptide} for regulatory site(s) $seq_plus7aa_2{$peptide}\n"; #ACE $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide}); $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide}); $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide}); @@ -1728,7 +1654,6 @@ if (not $psp_regulatory_site_stmth->execute()) { print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$domain_2{$peptide}': $psp_regulatory_site_stmth->errstr\n"; } else { - #ACE print "added domain for $domain_2{$peptide}\n"; } } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) { print "\$domain_2{$peptide} is undefined\n"; #ACE @@ -1989,7 +1914,6 @@ my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1; # added 12.05.2012 for Justin's data print OUT "$foobar"; print OUT "$tmp_site_for_printing\t"; - #ACE print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing\t"; $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing"; $there_were_sites = 1; } @@ -2049,7 +1973,6 @@ # begin store-to-SQLite "ppep_metadata" table # --- for $i (1..14) { - #ACE print "\$ppep_metadata_stmth->bind_param($i, " . $ppep_metadata[$i-1] . ")\n"; $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]); } if (not $ppep_metadata_stmth->execute()) { @@ -2074,7 +1997,6 @@ $ppep_intensity_stmth->bind_param( 1, $ppep_id ); $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} ); $ppep_intensity_stmth->bind_param( 3, $intense ); - #ACE print "insert ($peptide, $samples[$i], $intense)\n"; if (not $ppep_intensity_stmth->execute()) { print "Error writing tuple ($peptide,$samples[$i],$intense): $ppep_intensity_stmth->errstr\n"; } @@ -2103,13 +2025,11 @@ } else { print OUT "\t";} } - #ACE my %wrote_motif = {}; my %wrote_motif; my $motif_parts_0; for my $i (0 .. $#motif_sequence) { if (exists($kinase_motif_matches{$peptide}{$motif_sequence[$i]})) { print OUT "X\t"; - #ACE my @motif_parts = split(/ motif /, $motif_type{$motif_sequence[$i]}); $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i]; my $key = "$peptide\t$gene_names\t$motif_parts_0"; if (!exists($wrote_motif{$key})) {