Mercurial > repos > greg > ideas_preprocessor
view runproc.R @ 22:3651f1592f3f draft
Uploaded
author | greg |
---|---|
date | Wed, 31 Jan 2018 14:13:11 -0500 |
parents | |
children |
line wrap: on
line source
normdata<-function(case, control, method="nbinom") { d=dnorm(-10:10,0,10); u= c(rep(0,10),control,rep(0,10)); l=length(case); nu=rep(0,l); for(i in 1:21) { nu = nu + u[i-1+1:l] * d[i]; } nu = nu / sum(d); t = which(case<quantile(case,prob=0.99)) p=mean(case[t])/var(case[t]); if(p>0.9) p=0.9; if(p<0.1) p=0.1; n=mean(case[t])*p/(1-p); print(c(n,p)); #nc = log2((case*20+1)/(nu/mean(nu)*mean(case)*20+1)+1)-1; if(method=="nbinom") { nc = -pnbinom(case,size=c((nu+1)/mean(nu+1)*n),prob=p,lower.tail=F,log.p=T)/log(10); } else { nc = -ppois(case,(nu+1)/mean(nu+1)*mean(case),lower.tail=F,log.p=T)/log(10); } return(nc); } binsz=200 folder="me66/Input/" mbed="/gpfs/group/yzz2/default/IDEAS_server/data/mm10.bed" #read file list args<-commandArgs(trailingOnly=TRUE); flist=args[1]; x=as.matrix(read.table(flist)); if(length(args) > 1) { st=as.integer(args[2]); ed=as.integer(args[3]); } else { st=1; ed=dim(x)[1]; } l=paste(x[,1],x[,2]); ll=unique(l); id=rep(0,length(l)); for(i in ll) { t=which(l==i); if(length(t)>1) { id[t]=1:length(t); } } y=x[st:ed,1:3]; if(dim(x)[2]==4) { y=rbind(y,x[st:ed,c(1:2,4)]); } t=match(unique(y[,3]),y[,3]); k=rapply(as.list(y[t,3]),function(z){tail(unlist(gregexpr("\\/",z)),n=1);}); y=cbind(y[t,],substr(y[t,3],k+1,1000)); #fetch data and process data to windows mean for(i in 1:dim(y)[1]) { fout = paste(folder, y[i,1],"_",y[i,2],"_",y[i,4],sep=""); system(paste("wget -O",fout, y[i,3])); tl = tail(unlist(gregexpr("\\.",fout)),n=1); name = substr(fout,1,tl); ftype = substr(fout, tl+1, 100000); if(tolower(ftype) == "bam") { system(paste("samtools index", fout)); bw=paste(name,".bw",sep=""); system(paste("bamCoverage --bam",fout,"-o",bw,"--binSize", binsz)); } else { bw=fout; } bd=paste(name,".bed",sep=""); system(paste("/storage/home/yzz2/work/tools/bigWigAverageOverBed", bw, mbed, "stdout | cut -f5 >", bd)); system(paste("gzip -f",bd)); system(paste("rm ", name,".ba*",sep="")); system(paste("rm ", name,".bw",sep="")); } #normalize data if inputs are available, and prepare data file list fname=NULL; for(i in st:ed) { k=match(x[i,3],y[,3]); f1=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep=""); if(dim(x)[2]==4) { k=match(x[i,4],y[,3]); f0=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep=""); if(id[i]>0) { nf=paste(folder,x[i,1],".",x[i,2],"_",id[i],".norm.bed",sep=""); } else { nf=paste(folder,x[i,1],".",x[i,2],".norm.bed",sep=""); } x1=as.matrix(read.table(f1)); x0=as.matrix(read.table(f0)); nx=normdata(x1,x0); write.table(round(nx*100)/100,nf,quote=F,row.names=F,col.names=F); system(paste("gzip -f",nf)); fname=rbind(fname,c(x[i,1:2],paste(nf,".gz",sep=""))); } else { nf=paste(fold,x[i,1],".",x[i,2],".bed.gz",sep=""); system(paste("mv", f1, nf)); fname=rbind(fname,c(x[i,1:2],nf)); } } write.table(fname,paste(folder,"tmp_",st,"_",ed,".input",sep=""),quote=F,row.names=F,col.names=F);