Mercurial > repos > greg > ideas_preprocessor
changeset 22:3651f1592f3f draft
Uploaded
author | greg |
---|---|
date | Wed, 31 Jan 2018 14:13:11 -0500 |
parents | 99102499271a |
children | 6306f535a658 |
files | ideas_preprocessor.R ideas_preprocessor.xml runproc.R test-data/IDEAS_input_config.txt test-data/chrom_windows.bed test-data/chromosome_windows.txt test-data/chromosomes.bed test-data/output.html test-data/output.ideaspre test-data/tmp.tar |
diffstat | 10 files changed, 189 insertions(+), 66 deletions(-) [+] |
line wrap: on
line diff
--- a/ideas_preprocessor.R Wed Jan 31 08:22:43 2018 -0500 +++ b/ideas_preprocessor.R Wed Jan 31 14:13:11 2018 -0500 @@ -45,6 +45,8 @@ cat(s, file=output); } +tmp_dir = "tmp"; + # Read the ideaspre_input_config text file which has this format: # "cell type name" "epigenetic factor name" "file path" "file name" "datatype" ideaspre_input_config = as.matrix(read.table(opt$ideaspre_input_config)); @@ -71,9 +73,10 @@ bigwig_file_name = file_path; } bed_file_name = paste(file_name, "bed", sep="."); - cmd = paste("bigWigAverageOverBed", bigwig_file_name, opt$chrom_bed_input, "stdout | cut -f5 >", bed_file_name); + bed_file_path = paste("tmp", bed_file_name, sep="/"); + cmd = paste("bigWigAverageOverBed", bigwig_file_name, opt$chrom_bed_input, "stdout | cut -f5 >", bed_file_path); system(cmd); - cmd = paste("gzip -f", bed_file_name); + cmd = paste("gzip -f", bed_file_path); system(cmd); } } @@ -81,8 +84,10 @@ # Create file1.txt. cmd = paste("cut -d' '", opt$ideaspre_input_config, "-f1,2 > file1.txt", sep=" "); system(cmd); -# Compress the bed files and create file2.txt. -cmd = "ls *.bed.gz > file2.txt"; +# Compress the bed files in the tmp directory. +tmp_gzipped_files = paste(tmp_dir, "*.bed.gz", sep="/"); +# Create file2.txt. +cmd = paste("ls", tmp_gzipped_files, "> file2.txt", sep=" "); system(cmd); # Create IDEAS_input_config.txt with the format required by IDEAS. ideas_input_config = "IDEAS_input_config.txt" @@ -91,7 +96,13 @@ # Move IDEAS_input_config.txt to the output directory. to_path = paste(opt$output_files_path, ideas_input_config, sep="/"); file.rename(ideas_input_config, to_path); -# Handle optional chrom_bed_input.txt and chromosomes.bed files. +# Archive the tmp directory. +cmd = "tar -cvf tmp.tar tmp"; +system(cmd); +# Move the tmp archive to the output directory. +to_path = paste(opt$output_files_path, "tmp.tar", sep="/"); +file.rename("tmp.tar", to_path); + if (!is.null(opt$chrom_bed_input) && !is.null(opt$chromosome_windows)) { # Renane opt$chrom_bed_input to be chromosomes.bed # and make a copy of it in the output directory.
--- a/ideas_preprocessor.xml Wed Jan 31 08:22:43 2018 -0500 +++ b/ideas_preprocessor.xml Wed Jan 31 14:13:11 2018 -0500 @@ -11,6 +11,8 @@ #set chromosome_windows = "chromosome_windows.txt" #set ideaspre_input_config = "ideaspre_input_config.txt" #set specify_chrom_windows = $specify_chrom_windows_cond.specify_chrom_windows +#set tmp_dir = "tmp" +mkdir $tmp_dir && mkdir $output.files_path && #if str($specify_chrom_windows) == "yes": ############################################## @@ -179,8 +181,13 @@ <test> <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/> <param name="specify_chrom_windows" value="yes"/> - <param name="chrom_bed_input" value="chrom_windows.bed" ftype="bed" dbkey="hg19"/> - <output name="output" file="output.ideaspre" ftype="ideaspre" /> + <param name="chrom_bed_input" value="chromosomes.bed" ftype="bed" dbkey="hg19"/> + <output name="output" file="output.html" ftype="ideaspre"> + <extra_files type="file" name="chromosomes.bed" value="chromosomes.bed"/> + <extra_files type="file" name="chromosome_windows.txt" value="chromosome_windows.txt"/> + <extra_files type="file" name="IDEAS_input_config.txt" value="IDEAS_input_config.txt"/> + <extra_files type="file" name="tmp.tar" value="tmp.tar" compare="sim_size"/> + </output> </test> </tests> <help>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/runproc.R Wed Jan 31 14:13:11 2018 -0500 @@ -0,0 +1,105 @@ +normdata<-function(case, control, method="nbinom") +{ d=dnorm(-10:10,0,10); + u= c(rep(0,10),control,rep(0,10)); + l=length(case); + nu=rep(0,l); + for(i in 1:21) + { nu = nu + u[i-1+1:l] * d[i]; + } + nu = nu / sum(d); + + t = which(case<quantile(case,prob=0.99)) + p=mean(case[t])/var(case[t]); + if(p>0.9) p=0.9; + if(p<0.1) p=0.1; + n=mean(case[t])*p/(1-p); + print(c(n,p)); + #nc = log2((case*20+1)/(nu/mean(nu)*mean(case)*20+1)+1)-1; + if(method=="nbinom") + { nc = -pnbinom(case,size=c((nu+1)/mean(nu+1)*n),prob=p,lower.tail=F,log.p=T)/log(10); + } else + { nc = -ppois(case,(nu+1)/mean(nu+1)*mean(case),lower.tail=F,log.p=T)/log(10); + } + return(nc); +} + +binsz=200 +folder="me66/Input/" +mbed="/gpfs/group/yzz2/default/IDEAS_server/data/mm10.bed" + +#read file list +args<-commandArgs(trailingOnly=TRUE); +flist=args[1]; +x=as.matrix(read.table(flist)); +if(length(args) > 1) +{ st=as.integer(args[2]); + ed=as.integer(args[3]); +} else +{ st=1; + ed=dim(x)[1]; +} +l=paste(x[,1],x[,2]); +ll=unique(l); +id=rep(0,length(l)); +for(i in ll) +{ t=which(l==i); + if(length(t)>1) + { id[t]=1:length(t); + } +} + +y=x[st:ed,1:3]; +if(dim(x)[2]==4) +{ y=rbind(y,x[st:ed,c(1:2,4)]); } +t=match(unique(y[,3]),y[,3]); +k=rapply(as.list(y[t,3]),function(z){tail(unlist(gregexpr("\\/",z)),n=1);}); +y=cbind(y[t,],substr(y[t,3],k+1,1000)); + +#fetch data and process data to windows mean +for(i in 1:dim(y)[1]) +{ fout = paste(folder, y[i,1],"_",y[i,2],"_",y[i,4],sep=""); + system(paste("wget -O",fout, y[i,3])); + tl = tail(unlist(gregexpr("\\.",fout)),n=1); + name = substr(fout,1,tl); + ftype = substr(fout, tl+1, 100000); + if(tolower(ftype) == "bam") + { system(paste("samtools index", fout)); + bw=paste(name,".bw",sep=""); + system(paste("bamCoverage --bam",fout,"-o",bw,"--binSize", binsz)); + } else + { bw=fout; + } + bd=paste(name,".bed",sep=""); + system(paste("/storage/home/yzz2/work/tools/bigWigAverageOverBed", bw, mbed, "stdout | cut -f5 >", bd)); + system(paste("gzip -f",bd)); + system(paste("rm ", name,".ba*",sep="")); + system(paste("rm ", name,".bw",sep="")); +} + +#normalize data if inputs are available, and prepare data file list +fname=NULL; +for(i in st:ed) +{ k=match(x[i,3],y[,3]); + f1=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep=""); + if(dim(x)[2]==4) + { k=match(x[i,4],y[,3]); + f0=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep=""); + if(id[i]>0) + { nf=paste(folder,x[i,1],".",x[i,2],"_",id[i],".norm.bed",sep=""); + } else + { nf=paste(folder,x[i,1],".",x[i,2],".norm.bed",sep=""); + } + x1=as.matrix(read.table(f1)); + x0=as.matrix(read.table(f0)); + nx=normdata(x1,x0); + write.table(round(nx*100)/100,nf,quote=F,row.names=F,col.names=F); + system(paste("gzip -f",nf)); + fname=rbind(fname,c(x[i,1:2],paste(nf,".gz",sep=""))); + } else + { nf=paste(fold,x[i,1],".",x[i,2],".bed.gz",sep=""); + system(paste("mv", f1, nf)); + fname=rbind(fname,c(x[i,1:2],nf)); + } +} + +write.table(fname,paste(folder,"tmp_",st,"_",ed,".input",sep=""),quote=F,row.names=F,col.names=F);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/IDEAS_input_config.txt Wed Jan 31 14:13:11 2018 -0500 @@ -0,0 +1,1 @@ +e001 h3k4me3 tmp/e001-h3k4me3.bed.gz
--- a/test-data/chrom_windows.bed Wed Jan 31 08:22:43 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -chr1 21819600 21819800 R100001 -chr1 21819800 21820000 R100002 -chr1 21820000 21820200 R100003 -chr1 21820200 21820400 R100004 -chr1 21820400 21820600 R100005 -chr1 21820600 21820800 R100006 -chr1 21820800 21821000 R100007 -chr1 21821000 21821200 R100008 -chr1 21821200 21821400 R100009 -chr1 21821400 21821600 R100010 -chr1 21821600 21821800 R100011 -chr1 21821800 21822000 R100012 -chr1 21822000 21822200 R100013 -chr1 21822200 21822400 R100014 -chr1 21822400 21822600 R100015 -chr1 21822600 21822800 R100016 -chr1 21822800 21823000 R100017 -chr1 21823000 21823200 R100018 -chr1 21823200 21823400 R100019 -chr1 21823400 21823600 R100020 -chr1 21823600 21823800 R100021 -chr1 21823800 21824000 R100022 -chr1 21824000 21824200 R100023 -chr1 21824200 21824400 R100024 -chr1 21824400 21824600 R100025 -chr1 21824600 21824800 R100026 -chr1 21824800 21825000 R100027 -chr1 21825000 21825200 R100028 -chr1 21825200 21825400 R100029 -chr1 21825400 21825600 R100030 -chr1 21825600 21825800 R100031 -chr1 21825800 21826000 R100032 -chr1 21826000 21826200 R100033 -chr1 21826200 21826400 R100034 -chr1 21826400 21826600 R100035 -chr1 21826600 21826800 R100036 -chr1 21826800 21827000 R100037 -chr1 21827000 21827200 R100038 -chr1 21827200 21827400 R100039 -chr1 21827400 21827600 R100040 -chr1 21827600 21827800 R100041 -chr1 21827800 21828000 R100042 -chr1 21828000 21828200 R100043 -chr1 21828200 21828400 R100044 -chr1 21828400 21828600 R100045 -chr1 21829000 21829200 R100046 -chr1 21829400 21829600 R100047 -chr1 21829600 21829800 R100048 -chr1 21829800 21830000 R100049 -chr1 21830000 21830200 R100050
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chromosome_windows.txt Wed Jan 31 14:13:11 2018 -0500 @@ -0,0 +1,1 @@ +chr1 0 50
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chromosomes.bed Wed Jan 31 14:13:11 2018 -0500 @@ -0,0 +1,50 @@ +chr1 21819600 21819800 R100001 +chr1 21819800 21820000 R100002 +chr1 21820000 21820200 R100003 +chr1 21820200 21820400 R100004 +chr1 21820400 21820600 R100005 +chr1 21820600 21820800 R100006 +chr1 21820800 21821000 R100007 +chr1 21821000 21821200 R100008 +chr1 21821200 21821400 R100009 +chr1 21821400 21821600 R100010 +chr1 21821600 21821800 R100011 +chr1 21821800 21822000 R100012 +chr1 21822000 21822200 R100013 +chr1 21822200 21822400 R100014 +chr1 21822400 21822600 R100015 +chr1 21822600 21822800 R100016 +chr1 21822800 21823000 R100017 +chr1 21823000 21823200 R100018 +chr1 21823200 21823400 R100019 +chr1 21823400 21823600 R100020 +chr1 21823600 21823800 R100021 +chr1 21823800 21824000 R100022 +chr1 21824000 21824200 R100023 +chr1 21824200 21824400 R100024 +chr1 21824400 21824600 R100025 +chr1 21824600 21824800 R100026 +chr1 21824800 21825000 R100027 +chr1 21825000 21825200 R100028 +chr1 21825200 21825400 R100029 +chr1 21825400 21825600 R100030 +chr1 21825600 21825800 R100031 +chr1 21825800 21826000 R100032 +chr1 21826000 21826200 R100033 +chr1 21826200 21826400 R100034 +chr1 21826400 21826600 R100035 +chr1 21826600 21826800 R100036 +chr1 21826800 21827000 R100037 +chr1 21827000 21827200 R100038 +chr1 21827200 21827400 R100039 +chr1 21827400 21827600 R100040 +chr1 21827600 21827800 R100041 +chr1 21827800 21828000 R100042 +chr1 21828000 21828200 R100043 +chr1 21828200 21828400 R100044 +chr1 21828400 21828600 R100045 +chr1 21829000 21829200 R100046 +chr1 21829400 21829600 R100047 +chr1 21829600 21829800 R100048 +chr1 21829800 21830000 R100049 +chr1 21830000 21830200 R100050
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.html Wed Jan 31 14:13:11 2018 -0500 @@ -0,0 +1,7 @@ +<html><head></head><body><h3>Files prepared for IDEAS</h3> +<ul> +<li><a href="chromosome_windows.txt">chromosome_windows.txt</a></li> +<li><a href="chromosomes.bed">chromosomes.bed</a></li> +<li><a href="IDEAS_input_config.txt">IDEAS_input_config.txt</a></li> +<li><a href="tmp.tar">tmp.tar</a></li> +</ul></body></html> \ No newline at end of file
--- a/test-data/output.ideaspre Wed Jan 31 08:22:43 2018 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -<html><head></head><body><h3>Files prepared for IDEAS</h3> -<ul> -<li><a href="chromosome_windows.txt">chromosome_windows.txt</a></li> -<li><a href="chromosomes.bed">chromosomes.bed</a></li> -<li><a href="IDEAS_input_config.txt">IDEAS_input_config.txt</a></li> -<li><a href="tmp">tmp</a></li> -</ul> -</body> -</html> \ No newline at end of file