Mercurial > repos > devteam > snpfreq
annotate snpFreq2.pl @ 1:1e488c26e9d8 draft default tip
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
| author | devteam |
|---|---|
| date | Tue, 13 Oct 2015 12:52:33 -0400 |
| parents | afdfa7335319 |
| children |
| rev | line source |
|---|---|
| 0 | 1 #!/usr/bin/env perl |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 #using large SNP tables (~1G) may require large memory ~15G in R | |
| 7 | |
| 8 #expected input: path to file, cols of counts (2 sets of 3), threshold | |
| 9 if (!@ARGV or scalar @ARGV != 11) { | |
| 10 if (!@ARGV or scalar @ARGV != 6) { #snpTable usage | |
| 11 print "usage for tab separated allele counts\n", | |
| 12 "snpFreq.pl inputType #threshold /path/to/snps.txt outfile <6 column numbers(1 based) with counts for alleles, first one group then another>\n"; | |
| 13 print "OR for SNP tables\n"; | |
| 14 print "usage snpFreq.pl inputType #threshold /path/to/snpTable.txt outfile group1File group2File\n"; | |
| 15 exit 1; | |
| 16 } | |
| 17 } | |
| 18 | |
| 19 #get and verify inputs | |
| 20 my ($file, $a1, $a2, $a3, $b1, $b2, $b3, $thresh, $outfile); | |
| 21 if ($ARGV[0] eq 'tab') { | |
| 22 shift @ARGV; | |
| 23 $thresh = shift @ARGV; | |
| 24 if ($thresh !~ /^\d*\.?\d+$/) { | |
| 25 print "Error the threshold must be a number. Got $thresh\n"; | |
| 26 exit 1; | |
| 27 }elsif ($thresh > .3) { | |
| 28 print "Error the threshold can not be greater than 0.3 got $thresh\n"; | |
| 29 exit 1; | |
| 30 } | |
| 31 $file = shift @ARGV; | |
| 32 $outfile = shift @ARGV; | |
| 33 $a1 = shift @ARGV; | |
| 34 if ($a1 =~ /\D/ or $a1 < 1) { | |
| 35 print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n"; | |
| 36 exit 1; | |
| 37 } | |
| 38 $a2 = shift @ARGV; | |
| 39 if ($a2 =~ /\D/ or $a2 < 1) { | |
| 40 print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n"; | |
| 41 exit 1; | |
| 42 } | |
| 43 $a3 = shift @ARGV; | |
| 44 if ($a3 =~ /\D/ or $a3 < 1) { | |
| 45 print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n"; | |
| 46 exit 1; | |
| 47 } | |
| 48 $b1 = shift @ARGV; | |
| 49 if ($b1 =~ /\D/ or $b1 < 1) { | |
| 50 print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n"; | |
| 51 exit 1; | |
| 52 } | |
| 53 $b2 = shift @ARGV; | |
| 54 if ($b2 =~ /\D/ or $b2 < 1) { | |
| 55 print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n"; | |
| 56 exit 1; | |
| 57 } | |
| 58 $b3 = shift @ARGV; | |
| 59 if ($b3 =~ /\D/ or $b3 < 1) { | |
| 60 print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n"; | |
| 61 exit 1; | |
| 62 } | |
| 63 }else { #snp table convert and assign variables | |
| 64 #snpTable.txt #threshold outfile workingdir | |
| 65 shift @ARGV; | |
| 66 $thresh = shift @ARGV; | |
| 67 if ($thresh !~ /^\d*\.?\d+$/) { | |
| 68 print "Error the threshold must be a number. Got $thresh\n"; | |
| 69 exit 1; | |
| 70 }elsif ($thresh > .3) { | |
| 71 print "Error the threshold can not be greater than 0.3 got $thresh\n"; | |
| 72 exit 1; | |
| 73 } | |
| 74 $file = shift @ARGV; | |
| 75 $outfile = shift @ARGV; | |
| 76 my $grpFile = shift @ARGV; | |
| 77 my @g1; | |
| 78 open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n"; | |
| 79 while (<FH>) { | |
| 80 chomp; | |
| 81 if (/^(\d+)\s/) { push(@g1, $1); } | |
| 82 } | |
| 83 close FH or die "Couldn't close $grpFile, $!\n"; | |
| 84 $grpFile = shift @ARGV; | |
| 85 my @g2; | |
| 86 open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n"; | |
| 87 while (<FH>) { | |
| 88 chomp; | |
| 89 if (/^(\d+)\s/) { push(@g2, $1); } | |
| 90 } | |
| 91 close FH or die "Couldn't close $grpFile, $!\n"; | |
| 92 if ($file =~ /.gz$/) { | |
|
1
1e488c26e9d8
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
93 open(FH, "zcat < $file |") or die "Couldn't read $file, $!\n"; |
| 0 | 94 }else { |
| 95 open(FH, $file) or die "Couldn't read $file, $!\n"; | |
| 96 } | |
| 97 open(OUT, ">", "snpTable.txt") or die "Couldn't open snpTable.txt, $!\n"; | |
| 98 my $size; | |
| 99 while (<FH>) { | |
| 100 chomp; | |
| 101 if (/^#/) { next; } #header | |
| 102 my @f = split(/\t/); | |
| 103 $size = scalar @f; | |
| 104 my @gc1 = (0, 0, 0); | |
| 105 my @gc2 = (0, 0, 0); | |
| 106 foreach my $g (@g1) { | |
| 107 my $i = $g+1; #g is 1 based first col want 0 based snp call column | |
| 108 if ($i > $#f) { die "ERROR looking for index $i which is greater than the list $#f\n"; } | |
| 109 if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref | |
| 110 $gc1[0]++; | |
| 111 }elsif ($f[$i] == 1) { | |
| 112 $gc1[2]++; | |
| 113 }elsif ($f[$i] == 0) { | |
| 114 $gc1[1]++; | |
| 115 }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; } | |
| 116 } | |
| 117 foreach my $g (@g2) { | |
| 118 my $i = $g+1; #g is 1 based first col want 0 based snp call column | |
| 119 if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref | |
| 120 $gc2[0]++; | |
| 121 }elsif ($f[$i] == 1) { | |
| 122 $gc2[2]++; | |
| 123 }elsif ($f[$i] == 0) { | |
| 124 $gc2[1]++; | |
| 125 }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; } | |
| 126 } | |
| 127 print OUT join("\t", @f), "\t", join("\t", @gc1), "\t", join("\t", @gc2), | |
| 128 "\n"; | |
| 129 } | |
| 130 close FH or die "Couldn't close $file, $!\n"; | |
| 131 close OUT or die "Couldn't close snpTable.txt, $!\n"; | |
| 132 my $i = $size + 1; #next 1 based column after input data | |
| 133 $a1 = $i++; | |
| 134 $a2 = $i++; | |
| 135 $a3 = $i++; | |
| 136 $b1 = $i++; | |
| 137 $b2 = $i++; | |
| 138 $b3 = $i++; | |
| 139 $file = "snpTable.txt"; | |
| 140 } | |
| 141 | |
| 142 #run a fishers exact test (using R) on whole table | |
| 143 my $cmd = qq|options(warn=-1) | |
| 144 tab <- read.table('$file', sep="\t") | |
| 145 size <- length(tab[,1]) | |
| 146 width <- length(tab[1,]) | |
| 147 x <- 1:size | |
| 148 y <- matrix(data=0, nr=size, nc=6) | |
| 149 for(i in 1:size) { | |
| 150 m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2) | |
| 151 t <- fisher.test(m) | |
| 152 x[i] <- t\$p.value | |
| 153 if (x[i] >= 1) { | |
| 154 x[i] <- .999 | |
| 155 } | |
| 156 n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) | |
| 157 n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3]) | |
| 158 y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n | |
| 159 y[i,1] <- round(y[i,1],3) | |
| 160 y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n | |
| 161 y[i,2] <- round(y[i,2],3) | |
| 162 y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n | |
| 163 y[i,3] <- round(y[i,3],3) | |
| 164 n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) | |
| 165 y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n | |
| 166 y[i,4] <- round(y[i,4],3) | |
| 167 y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n | |
| 168 y[i,5] <- round(y[i,5],3) | |
| 169 y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n | |
| 170 y[i,6] <- round(y[i,6],3) | |
| 171 }|; | |
| 172 #results <- data.frame(tab[1:size,1:width], x[1:size]) | |
| 173 #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") | |
| 174 #q()|; | |
| 175 | |
| 176 #my $cmd2 = qq|suppressPackageStartupMessages(library(lib.loc='/afs/bx.psu.edu/home/giardine/lib/R', qvalue)) | |
| 177 my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue)) | |
| 178 qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE) | |
| 179 q <- qobj\$qvalues | |
| 180 results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size]) | |
| 181 write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") | |
| 182 q()|; | |
| 183 | |
| 184 #for TESTING | |
| 185 my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size]) | |
| 186 write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") | |
| 187 q()|; | |
| 188 | |
| 189 open(FT, "| R --slave --vanilla") | |
| 190 or die "Couldn't call fisher.text, $!\n"; | |
| 191 print FT $cmd, "\n"; #fisher test | |
| 192 print FT $cmd2, "\n"; #qvalues and results | |
| 193 #print FT $pr, "\n"; | |
| 194 close FT or die "Couldn't finish fisher.test, $!\n"; | |
| 195 | |
| 196 exit; |
