Mercurial > repos > greg > ideas_preprocessor
diff runproc.R @ 22:3651f1592f3f draft
Uploaded
author | greg |
---|---|
date | Wed, 31 Jan 2018 14:13:11 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/runproc.R Wed Jan 31 14:13:11 2018 -0500 @@ -0,0 +1,105 @@ +normdata<-function(case, control, method="nbinom") +{ d=dnorm(-10:10,0,10); + u= c(rep(0,10),control,rep(0,10)); + l=length(case); + nu=rep(0,l); + for(i in 1:21) + { nu = nu + u[i-1+1:l] * d[i]; + } + nu = nu / sum(d); + + t = which(case<quantile(case,prob=0.99)) + p=mean(case[t])/var(case[t]); + if(p>0.9) p=0.9; + if(p<0.1) p=0.1; + n=mean(case[t])*p/(1-p); + print(c(n,p)); + #nc = log2((case*20+1)/(nu/mean(nu)*mean(case)*20+1)+1)-1; + if(method=="nbinom") + { nc = -pnbinom(case,size=c((nu+1)/mean(nu+1)*n),prob=p,lower.tail=F,log.p=T)/log(10); + } else + { nc = -ppois(case,(nu+1)/mean(nu+1)*mean(case),lower.tail=F,log.p=T)/log(10); + } + return(nc); +} + +binsz=200 +folder="me66/Input/" +mbed="/gpfs/group/yzz2/default/IDEAS_server/data/mm10.bed" + +#read file list +args<-commandArgs(trailingOnly=TRUE); +flist=args[1]; +x=as.matrix(read.table(flist)); +if(length(args) > 1) +{ st=as.integer(args[2]); + ed=as.integer(args[3]); +} else +{ st=1; + ed=dim(x)[1]; +} +l=paste(x[,1],x[,2]); +ll=unique(l); +id=rep(0,length(l)); +for(i in ll) +{ t=which(l==i); + if(length(t)>1) + { id[t]=1:length(t); + } +} + +y=x[st:ed,1:3]; +if(dim(x)[2]==4) +{ y=rbind(y,x[st:ed,c(1:2,4)]); } +t=match(unique(y[,3]),y[,3]); +k=rapply(as.list(y[t,3]),function(z){tail(unlist(gregexpr("\\/",z)),n=1);}); +y=cbind(y[t,],substr(y[t,3],k+1,1000)); + +#fetch data and process data to windows mean +for(i in 1:dim(y)[1]) +{ fout = paste(folder, y[i,1],"_",y[i,2],"_",y[i,4],sep=""); + system(paste("wget -O",fout, y[i,3])); + tl = tail(unlist(gregexpr("\\.",fout)),n=1); + name = substr(fout,1,tl); + ftype = substr(fout, tl+1, 100000); + if(tolower(ftype) == "bam") + { system(paste("samtools index", fout)); + bw=paste(name,".bw",sep=""); + system(paste("bamCoverage --bam",fout,"-o",bw,"--binSize", binsz)); + } else + { bw=fout; + } + bd=paste(name,".bed",sep=""); + system(paste("/storage/home/yzz2/work/tools/bigWigAverageOverBed", bw, mbed, "stdout | cut -f5 >", bd)); + system(paste("gzip -f",bd)); + system(paste("rm ", name,".ba*",sep="")); + system(paste("rm ", name,".bw",sep="")); +} + +#normalize data if inputs are available, and prepare data file list +fname=NULL; +for(i in st:ed) +{ k=match(x[i,3],y[,3]); + f1=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep=""); + if(dim(x)[2]==4) + { k=match(x[i,4],y[,3]); + f0=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep=""); + if(id[i]>0) + { nf=paste(folder,x[i,1],".",x[i,2],"_",id[i],".norm.bed",sep=""); + } else + { nf=paste(folder,x[i,1],".",x[i,2],".norm.bed",sep=""); + } + x1=as.matrix(read.table(f1)); + x0=as.matrix(read.table(f0)); + nx=normdata(x1,x0); + write.table(round(nx*100)/100,nf,quote=F,row.names=F,col.names=F); + system(paste("gzip -f",nf)); + fname=rbind(fname,c(x[i,1:2],paste(nf,".gz",sep=""))); + } else + { nf=paste(fold,x[i,1],".",x[i,2],".bed.gz",sep=""); + system(paste("mv", f1, nf)); + fname=rbind(fname,c(x[i,1:2],nf)); + } +} + +write.table(fname,paste(folder,"tmp_",st,"_",ed,".input",sep=""),quote=F,row.names=F,col.names=F);