changeset 22:3651f1592f3f draft

Uploaded
author greg
date Wed, 31 Jan 2018 14:13:11 -0500
parents 99102499271a
children 6306f535a658
files ideas_preprocessor.R ideas_preprocessor.xml runproc.R test-data/IDEAS_input_config.txt test-data/chrom_windows.bed test-data/chromosome_windows.txt test-data/chromosomes.bed test-data/output.html test-data/output.ideaspre test-data/tmp.tar
diffstat 10 files changed, 189 insertions(+), 66 deletions(-) [+]
line wrap: on
line diff
--- a/ideas_preprocessor.R	Wed Jan 31 08:22:43 2018 -0500
+++ b/ideas_preprocessor.R	Wed Jan 31 14:13:11 2018 -0500
@@ -45,6 +45,8 @@
     cat(s, file=output);
 }
 
+tmp_dir = "tmp";
+
 # Read the ideaspre_input_config text file which has this format:
 # "cell type name" "epigenetic factor name" "file path" "file name" "datatype"
 ideaspre_input_config = as.matrix(read.table(opt$ideaspre_input_config));
@@ -71,9 +73,10 @@
             bigwig_file_name = file_path;
         }
         bed_file_name = paste(file_name, "bed", sep=".");
-        cmd = paste("bigWigAverageOverBed", bigwig_file_name, opt$chrom_bed_input, "stdout | cut -f5 >", bed_file_name);
+        bed_file_path = paste("tmp", bed_file_name, sep="/");
+        cmd = paste("bigWigAverageOverBed", bigwig_file_name, opt$chrom_bed_input, "stdout | cut -f5 >", bed_file_path);
         system(cmd);
-        cmd = paste("gzip -f", bed_file_name);
+        cmd = paste("gzip -f", bed_file_path);
         system(cmd);
     }
 }
@@ -81,8 +84,10 @@
 # Create file1.txt.
 cmd = paste("cut -d' '", opt$ideaspre_input_config, "-f1,2 > file1.txt", sep=" ");
 system(cmd);
-# Compress the bed files and create file2.txt.
-cmd = "ls *.bed.gz > file2.txt";
+# Compress the bed files in the tmp directory.
+tmp_gzipped_files = paste(tmp_dir, "*.bed.gz", sep="/");
+# Create file2.txt.
+cmd = paste("ls", tmp_gzipped_files, "> file2.txt", sep=" ");
 system(cmd);
 # Create IDEAS_input_config.txt  with the format required by IDEAS.
 ideas_input_config = "IDEAS_input_config.txt"
@@ -91,7 +96,13 @@
 # Move IDEAS_input_config.txt to the output directory.
 to_path = paste(opt$output_files_path, ideas_input_config, sep="/");
 file.rename(ideas_input_config, to_path);
-# Handle optional chrom_bed_input.txt and chromosomes.bed files.
+# Archive the tmp directory.
+cmd = "tar -cvf tmp.tar tmp";
+system(cmd);
+# Move the tmp archive to the output directory.
+to_path = paste(opt$output_files_path, "tmp.tar", sep="/");
+file.rename("tmp.tar", to_path);
+
 if (!is.null(opt$chrom_bed_input) && !is.null(opt$chromosome_windows)) {
     # Renane opt$chrom_bed_input to be chromosomes.bed
     # and make a copy of it in the output directory.
--- a/ideas_preprocessor.xml	Wed Jan 31 08:22:43 2018 -0500
+++ b/ideas_preprocessor.xml	Wed Jan 31 14:13:11 2018 -0500
@@ -11,6 +11,8 @@
 #set chromosome_windows = "chromosome_windows.txt"
 #set ideaspre_input_config = "ideaspre_input_config.txt"
 #set specify_chrom_windows = $specify_chrom_windows_cond.specify_chrom_windows
+#set tmp_dir = "tmp"
+mkdir $tmp_dir &&
 mkdir $output.files_path &&
 #if str($specify_chrom_windows) == "yes":
     ##############################################
@@ -179,8 +181,13 @@
         <test>
             <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/>
             <param name="specify_chrom_windows" value="yes"/>
-            <param name="chrom_bed_input" value="chrom_windows.bed" ftype="bed" dbkey="hg19"/>
-            <output name="output" file="output.ideaspre" ftype="ideaspre" />
+            <param name="chrom_bed_input" value="chromosomes.bed" ftype="bed" dbkey="hg19"/>
+            <output name="output" file="output.html" ftype="ideaspre">
+                <extra_files type="file" name="chromosomes.bed" value="chromosomes.bed"/>
+                <extra_files type="file" name="chromosome_windows.txt" value="chromosome_windows.txt"/>
+                <extra_files type="file" name="IDEAS_input_config.txt" value="IDEAS_input_config.txt"/>
+                <extra_files type="file" name="tmp.tar" value="tmp.tar" compare="sim_size"/>
+            </output>
         </test>
     </tests>
     <help>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/runproc.R	Wed Jan 31 14:13:11 2018 -0500
@@ -0,0 +1,105 @@
+normdata<-function(case, control, method="nbinom")
+{	d=dnorm(-10:10,0,10);
+	u= c(rep(0,10),control,rep(0,10));
+	l=length(case);
+	nu=rep(0,l);
+	for(i in 1:21)
+	{	nu = nu + u[i-1+1:l] * d[i];
+	}
+	nu = nu / sum(d);
+
+	t = which(case<quantile(case,prob=0.99))
+	p=mean(case[t])/var(case[t]);
+	if(p>0.9) p=0.9;
+	if(p<0.1) p=0.1;
+	n=mean(case[t])*p/(1-p);
+	print(c(n,p));
+	#nc = log2((case*20+1)/(nu/mean(nu)*mean(case)*20+1)+1)-1;
+	if(method=="nbinom")
+	{	nc = -pnbinom(case,size=c((nu+1)/mean(nu+1)*n),prob=p,lower.tail=F,log.p=T)/log(10);
+	} else
+	{	nc = -ppois(case,(nu+1)/mean(nu+1)*mean(case),lower.tail=F,log.p=T)/log(10);
+	}
+	return(nc);
+}
+
+binsz=200
+folder="me66/Input/"
+mbed="/gpfs/group/yzz2/default/IDEAS_server/data/mm10.bed"
+
+#read file list
+args<-commandArgs(trailingOnly=TRUE);
+flist=args[1];
+x=as.matrix(read.table(flist));
+if(length(args) > 1)
+{	st=as.integer(args[2]);
+	ed=as.integer(args[3]);
+} else
+{	st=1;
+	ed=dim(x)[1];
+}
+l=paste(x[,1],x[,2]);
+ll=unique(l);
+id=rep(0,length(l));
+for(i in ll)
+{	t=which(l==i);
+	if(length(t)>1)
+	{	id[t]=1:length(t);
+	}
+}
+
+y=x[st:ed,1:3];
+if(dim(x)[2]==4)
+{ y=rbind(y,x[st:ed,c(1:2,4)]); }
+t=match(unique(y[,3]),y[,3]);
+k=rapply(as.list(y[t,3]),function(z){tail(unlist(gregexpr("\\/",z)),n=1);});
+y=cbind(y[t,],substr(y[t,3],k+1,1000));
+
+#fetch data and process data to windows mean
+for(i in 1:dim(y)[1])
+{	fout = paste(folder, y[i,1],"_",y[i,2],"_",y[i,4],sep="");	
+	system(paste("wget -O",fout, y[i,3]));
+	tl = tail(unlist(gregexpr("\\.",fout)),n=1);
+	name = substr(fout,1,tl);
+	ftype = substr(fout, tl+1, 100000);
+	if(tolower(ftype) == "bam")
+	{	system(paste("samtools index", fout));
+		bw=paste(name,".bw",sep="");
+		system(paste("bamCoverage --bam",fout,"-o",bw,"--binSize", binsz));
+	} else
+	{	bw=fout;
+	}
+	bd=paste(name,".bed",sep="");
+	system(paste("/storage/home/yzz2/work/tools/bigWigAverageOverBed", bw, mbed, "stdout | cut -f5 >", bd));
+	system(paste("gzip -f",bd));
+	system(paste("rm ", name,".ba*",sep=""));
+	system(paste("rm ", name,".bw",sep=""));
+}
+
+#normalize data if inputs are available, and prepare data file list
+fname=NULL;
+for(i in st:ed)
+{	k=match(x[i,3],y[,3]);
+	f1=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep="");
+	if(dim(x)[2]==4)
+	{	k=match(x[i,4],y[,3]);
+		f0=paste(folder,y[k,1],"_",y[k,2],"_",substr(y[k,4],1,nchar(y[k,4])-4),".bed.gz",sep="");
+		if(id[i]>0) 
+		{	nf=paste(folder,x[i,1],".",x[i,2],"_",id[i],".norm.bed",sep="");
+		} else
+		{	nf=paste(folder,x[i,1],".",x[i,2],".norm.bed",sep="");
+		}	
+		x1=as.matrix(read.table(f1));
+		x0=as.matrix(read.table(f0));
+		nx=normdata(x1,x0);
+		write.table(round(nx*100)/100,nf,quote=F,row.names=F,col.names=F);
+		system(paste("gzip -f",nf));
+		fname=rbind(fname,c(x[i,1:2],paste(nf,".gz",sep="")));
+	} else
+	{	nf=paste(fold,x[i,1],".",x[i,2],".bed.gz",sep="");
+		system(paste("mv", f1, nf));
+		fname=rbind(fname,c(x[i,1:2],nf));
+	}
+}
+
+write.table(fname,paste(folder,"tmp_",st,"_",ed,".input",sep=""),quote=F,row.names=F,col.names=F);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/IDEAS_input_config.txt	Wed Jan 31 14:13:11 2018 -0500
@@ -0,0 +1,1 @@
+e001 h3k4me3 tmp/e001-h3k4me3.bed.gz
--- a/test-data/chrom_windows.bed	Wed Jan 31 08:22:43 2018 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,50 +0,0 @@
-chr1	21819600	21819800	R100001
-chr1	21819800	21820000	R100002
-chr1	21820000	21820200	R100003
-chr1	21820200	21820400	R100004
-chr1	21820400	21820600	R100005
-chr1	21820600	21820800	R100006
-chr1	21820800	21821000	R100007
-chr1	21821000	21821200	R100008
-chr1	21821200	21821400	R100009
-chr1	21821400	21821600	R100010
-chr1	21821600	21821800	R100011
-chr1	21821800	21822000	R100012
-chr1	21822000	21822200	R100013
-chr1	21822200	21822400	R100014
-chr1	21822400	21822600	R100015
-chr1	21822600	21822800	R100016
-chr1	21822800	21823000	R100017
-chr1	21823000	21823200	R100018
-chr1	21823200	21823400	R100019
-chr1	21823400	21823600	R100020
-chr1	21823600	21823800	R100021
-chr1	21823800	21824000	R100022
-chr1	21824000	21824200	R100023
-chr1	21824200	21824400	R100024
-chr1	21824400	21824600	R100025
-chr1	21824600	21824800	R100026
-chr1	21824800	21825000	R100027
-chr1	21825000	21825200	R100028
-chr1	21825200	21825400	R100029
-chr1	21825400	21825600	R100030
-chr1	21825600	21825800	R100031
-chr1	21825800	21826000	R100032
-chr1	21826000	21826200	R100033
-chr1	21826200	21826400	R100034
-chr1	21826400	21826600	R100035
-chr1	21826600	21826800	R100036
-chr1	21826800	21827000	R100037
-chr1	21827000	21827200	R100038
-chr1	21827200	21827400	R100039
-chr1	21827400	21827600	R100040
-chr1	21827600	21827800	R100041
-chr1	21827800	21828000	R100042
-chr1	21828000	21828200	R100043
-chr1	21828200	21828400	R100044
-chr1	21828400	21828600	R100045
-chr1	21829000	21829200	R100046
-chr1	21829400	21829600	R100047
-chr1	21829600	21829800	R100048
-chr1	21829800	21830000	R100049
-chr1	21830000	21830200	R100050
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chromosome_windows.txt	Wed Jan 31 14:13:11 2018 -0500
@@ -0,0 +1,1 @@
+chr1 0 50
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/chromosomes.bed	Wed Jan 31 14:13:11 2018 -0500
@@ -0,0 +1,50 @@
+chr1	21819600	21819800	R100001
+chr1	21819800	21820000	R100002
+chr1	21820000	21820200	R100003
+chr1	21820200	21820400	R100004
+chr1	21820400	21820600	R100005
+chr1	21820600	21820800	R100006
+chr1	21820800	21821000	R100007
+chr1	21821000	21821200	R100008
+chr1	21821200	21821400	R100009
+chr1	21821400	21821600	R100010
+chr1	21821600	21821800	R100011
+chr1	21821800	21822000	R100012
+chr1	21822000	21822200	R100013
+chr1	21822200	21822400	R100014
+chr1	21822400	21822600	R100015
+chr1	21822600	21822800	R100016
+chr1	21822800	21823000	R100017
+chr1	21823000	21823200	R100018
+chr1	21823200	21823400	R100019
+chr1	21823400	21823600	R100020
+chr1	21823600	21823800	R100021
+chr1	21823800	21824000	R100022
+chr1	21824000	21824200	R100023
+chr1	21824200	21824400	R100024
+chr1	21824400	21824600	R100025
+chr1	21824600	21824800	R100026
+chr1	21824800	21825000	R100027
+chr1	21825000	21825200	R100028
+chr1	21825200	21825400	R100029
+chr1	21825400	21825600	R100030
+chr1	21825600	21825800	R100031
+chr1	21825800	21826000	R100032
+chr1	21826000	21826200	R100033
+chr1	21826200	21826400	R100034
+chr1	21826400	21826600	R100035
+chr1	21826600	21826800	R100036
+chr1	21826800	21827000	R100037
+chr1	21827000	21827200	R100038
+chr1	21827200	21827400	R100039
+chr1	21827400	21827600	R100040
+chr1	21827600	21827800	R100041
+chr1	21827800	21828000	R100042
+chr1	21828000	21828200	R100043
+chr1	21828200	21828400	R100044
+chr1	21828400	21828600	R100045
+chr1	21829000	21829200	R100046
+chr1	21829400	21829600	R100047
+chr1	21829600	21829800	R100048
+chr1	21829800	21830000	R100049
+chr1	21830000	21830200	R100050
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.html	Wed Jan 31 14:13:11 2018 -0500
@@ -0,0 +1,7 @@
+<html><head></head><body><h3>Files prepared for IDEAS</h3>
+<ul>
+<li><a href="chromosome_windows.txt">chromosome_windows.txt</a></li>
+<li><a href="chromosomes.bed">chromosomes.bed</a></li>
+<li><a href="IDEAS_input_config.txt">IDEAS_input_config.txt</a></li>
+<li><a href="tmp.tar">tmp.tar</a></li>
+</ul></body></html>
\ No newline at end of file
--- a/test-data/output.ideaspre	Wed Jan 31 08:22:43 2018 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-<html><head></head><body><h3>Files prepared for IDEAS</h3>
-<ul>
-<li><a href="chromosome_windows.txt">chromosome_windows.txt</a></li>
-<li><a href="chromosomes.bed">chromosomes.bed</a></li>
-<li><a href="IDEAS_input_config.txt">IDEAS_input_config.txt</a></li>
-<li><a href="tmp">tmp</a></li>
-</ul>
-</body>
-</html>
\ No newline at end of file
Binary file test-data/tmp.tar has changed