view runproc.R @ 22:3651f1592f3f draft

Uploaded
author greg
date Wed, 31 Jan 2018 14:13:11 -0500
parents
children
line wrap: on
line source

normdata<-function(case, control, method="nbinom")
{	d=dnorm(-10:10,0,10);
	u= c(rep(0,10),control,rep(0,10));
	l=length(case);
	nu=rep(0,l);
	for(i in 1:21)
	{	nu = nu + u[i-1+1:l] * d[i];
	}
	nu = nu / sum(d);

	t = which(case<quantile(case,prob=0.99))
	p=mean(case[t])/var(case[t]);
	if(p>0.9) p=0.9;
	if(p<0.1) p=0.1;
	n=mean(case[t])*p/(1-p);
	print(c(n,p));
	#nc = log2((case*20+1)/(nu/mean(nu)*mean(case)*20+1)+1)-1;
	if(method=="nbinom")
	{	nc = -pnbinom(case,size=c((nu+1)/mean(nu+1)*n),prob=p,lower.tail=F,log.p=T)/log(10);
	} else
	{	nc = -ppois(case,(nu+1)/mean(nu+1)*mean(case),lower.tail=F,log.p=T)/log(10);
	}
	return(nc);
}

binsz=200
folder="me66/Input/"
mbed="/gpfs/group/yzz2/default/IDEAS_server/data/mm10.bed"

#read file list
args<-commandArgs(trailingOnly=TRUE);
flist=args[1];
x=as.matrix(read.table(flist));
if(length(args) > 1)
{	st=as.integer(args[2]);
	ed=as.integer(args[3]);
} else
{	st=1;
	ed=dim(x)[1];
}
l=paste(x[,1],x[,2]);
ll=unique(l);
id=rep(0,length(l));
for(i in ll)
{	t=which(l==i);
	if(length(t)>1)
	{	id[t]=1:length(t);
	}
}

y=x[st:ed,1:3];
if(dim(x)[2]==4)
{ y=rbind(y,x[st:ed,c(1:2,4)]); }
t=match(unique(y[,3]),y[,3]);
k=rapply(as.list(y[t,3]),function(z){tail(unlist(gregexpr("\\/",z)),n=1);});
y=cbind(y[t,],substr(y[t,3],k+1,1000));

#fetch data and process data to windows mean
for(i in 1:dim(y)[1])
{	fout = paste(folder, y[i,1],"_",y[i,2],"_",y[i,4],sep="");	
	system(paste("wget -O",fout, y[i,3]));
	tl = tail(unlist(gregexpr("\\.",fout)),n=1);
	name = substr(fout,1,tl);
	ftype = substr(fout, tl+1, 100000);
	if(tolower(ftype) == "bam")
	{	system(paste("samtools index", fout));
		bw=paste(name,".bw",sep="");
		system(paste("bamCoverage --bam",fout,"-o",bw,"--binSize", binsz));
	} else
	{	bw=fout;
	}
	bd=paste(name,".bed",sep="");
	system(paste("/storage/home/yzz2/work/tools/bigWigAverageOverBed", bw, mbed, "stdout | cut -f5 >", bd));
	system(paste("gzip -f",bd));
	system(paste("rm ", name,".ba*",sep=""));
	system(paste("rm ", name,".bw",sep=""));
}

#normalize data if inputs are available, and prepare data file list
fname=NULL;
for(i in st:ed)
{	k=match(x[i,3],y[,3]);
	f1=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep="");
	if(dim(x)[2]==4)
	{	k=match(x[i,4],y[,3]);
		f0=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep="");
		if(id[i]>0) 
		{	nf=paste(folder,x[i,1],".",x[i,2],"_",id[i],".norm.bed",sep="");
		} else
		{	nf=paste(folder,x[i,1],".",x[i,2],".norm.bed",sep="");
		}	
		x1=as.matrix(read.table(f1));
		x0=as.matrix(read.table(f0));
		nx=normdata(x1,x0);
		write.table(round(nx*100)/100,nf,quote=F,row.names=F,col.names=F);
		system(paste("gzip -f",nf));
		fname=rbind(fname,c(x[i,1:2],paste(nf,".gz",sep="")));
	} else
	{	nf=paste(fold,x[i,1],".",x[i,2],".bed.gz",sep="");
		system(paste("mv", f1, nf));
		fname=rbind(fname,c(x[i,1:2],nf));
	}
}

write.table(fname,paste(folder,"tmp_",st,"_",ed,".input",sep=""),quote=F,row.names=F,col.names=F);