annotate ideas_preprocessor.R @ 32:ea739695619d draft default tip

Uploaded
author greg
date Fri, 16 Feb 2018 11:17:00 -0500
parents 0cb6ff8ba6df
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env Rscript
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
2
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
3 suppressPackageStartupMessages(library("data.table"))
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
4 suppressPackageStartupMessages(library("optparse"))
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
5
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
6 option_list <- list(
12
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
7 make_option(c("--chrom_bed_input"), action="store", dest="chrom_bed_input", defaul=NULL, help="Chromosome windows positions file"),
14
20c21d946a8e Uploaded
greg
parents: 12
diff changeset
8 make_option(c("--exclude_bed_input"), action="store", dest="exclude_bed_input", defaul=NULL, help="File(s) containing regions to exclude"),
3
e97851e8951b Uploaded
greg
parents: 1
diff changeset
9 make_option(c("--chrom_len_file"), action="store", dest="chrom_len_file", default=NULL, help="Chromosome lengths file"),
5
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
10 make_option(c("--ideaspre_input_config"), action="store", dest="ideaspre_input_config", help="Preprocessing input config file"),
3
e97851e8951b Uploaded
greg
parents: 1
diff changeset
11 make_option(c("--output"), action="store", dest="output", help="Primary output dataset"),
e97851e8951b Uploaded
greg
parents: 1
diff changeset
12 make_option(c("--output_files_path"), action="store", dest="output_files_path", help="Primary output dataset extra files path"),
5
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
13 make_option(c("--chromosome_windows"), action="store", dest="chromosome_windows", default=NULL, help="Windows positions by chroms config file"),
3
e97851e8951b Uploaded
greg
parents: 1
diff changeset
14 make_option(c("--window_size"), action="store", dest="window_size", type="integer", default=NULL, help="Window size in base pairs")
1
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
15 )
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
16
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
17 parser <- OptionParser(usage="%prog [options] file", option_list=option_list)
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
18 args <- parse_args(parser, positional_arguments=TRUE)
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
19 opt <- args$options
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
20
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
21 run_cmd = function(cmd) {
ea739695619d Uploaded
greg
parents: 30
diff changeset
22 rc = system(cmd);
ea739695619d Uploaded
greg
parents: 30
diff changeset
23 if (rc != 0) {
ea739695619d Uploaded
greg
parents: 30
diff changeset
24 quit(save="no", status=rc);
ea739695619d Uploaded
greg
parents: 30
diff changeset
25 }
ea739695619d Uploaded
greg
parents: 30
diff changeset
26 }
ea739695619d Uploaded
greg
parents: 30
diff changeset
27
22
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
28 tmp_dir = "tmp";
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
29 cbi_file = "chrom_bed_input.bed";
22
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
30
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
31 if (is.null(opt$chrom_bed_input)) {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
32 # Create a chromosome windows positions file
19881f817d25 Uploaded
greg
parents: 25
diff changeset
33 # using the received chromosome lengths file
19881f817d25 Uploaded
greg
parents: 25
diff changeset
34 # and the window size.
19881f817d25 Uploaded
greg
parents: 25
diff changeset
35 cmd = paste("bedtools makewindows -g", opt$chrom_len_file, "-w", opt$window_size, ">", cbi_file, sep=" ");
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
36 run_cmd(cmd);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
37 } else {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
38 if (!is.null(opt$exclude_bed_input)) {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
39 # Copy the received chrom_bed_input
19881f817d25 Uploaded
greg
parents: 25
diff changeset
40 # since we will alter it.
19881f817d25 Uploaded
greg
parents: 25
diff changeset
41 file.copy(opt$chrom_bed_input, cbi_file);
19881f817d25 Uploaded
greg
parents: 25
diff changeset
42 } else {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
43 cbi_file = opt$chrom_bed_input;
19881f817d25 Uploaded
greg
parents: 25
diff changeset
44 }
19881f817d25 Uploaded
greg
parents: 25
diff changeset
45 }
19881f817d25 Uploaded
greg
parents: 25
diff changeset
46 # Exclude regions if necessary.
19881f817d25 Uploaded
greg
parents: 25
diff changeset
47 if (!is.null(opt$exclude_bed_input)) {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
48 exclude_bed_inputs = as.character(opt$exclude_bed_input);
19881f817d25 Uploaded
greg
parents: 25
diff changeset
49 exclude_bed_files = strsplit(exclude_bed_inputs, ",");
19881f817d25 Uploaded
greg
parents: 25
diff changeset
50 tmp_file = paste("tmp", cbi_file, sep="_");
19881f817d25 Uploaded
greg
parents: 25
diff changeset
51 for (exclude_bed_file in exclude_bed_files) {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
52 cmd = paste("bedtools subtract -a", cbi_file, "-b", exclude_bed_file, ">", tmp_file, sep=" ");
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
53 run_cmd(cmd);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
54 cmd = paste("mv", tmp_file, cbi_file, sep=" ");
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
55 run_cmd(cmd);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
56 }
19881f817d25 Uploaded
greg
parents: 25
diff changeset
57 }
19881f817d25 Uploaded
greg
parents: 25
diff changeset
58 # Read the chromosome windows positions file
19881f817d25 Uploaded
greg
parents: 25
diff changeset
59 # to get the smallest window size in the file
19881f817d25 Uploaded
greg
parents: 25
diff changeset
60 # (i.e., the minimum of column 3 - column 2.
19881f817d25 Uploaded
greg
parents: 25
diff changeset
61 cbi = fread(cbi_file);
19881f817d25 Uploaded
greg
parents: 25
diff changeset
62 min_window_size = min(cbi[,3]-cbi[,2]);
5
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
63 # Read the ideaspre_input_config text file which has this format:
1
91c5dbb14a13 Uploaded
greg
parents:
diff changeset
64 # "cell type name" "epigenetic factor name" "file path" "file name" "datatype"
5
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
65 ideaspre_input_config = as.matrix(read.table(opt$ideaspre_input_config));
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
66 # Process data to windows mean.
19881f817d25 Uploaded
greg
parents: 25
diff changeset
67 for (i in 1:dim(ideaspre_input_config)[1]) {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
68 file_path = ideaspre_input_config[i, 3]
19881f817d25 Uploaded
greg
parents: 25
diff changeset
69 file_name = ideaspre_input_config[i, 4]
19881f817d25 Uploaded
greg
parents: 25
diff changeset
70 datatype = ideaspre_input_config[i, 5]
19881f817d25 Uploaded
greg
parents: 25
diff changeset
71 if (datatype == "bam") {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
72 cmd = paste("samtools index", file_path);
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
73 run_cmd(cmd);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
74 bigwig_file_name = paste(file_name, "bw", sep=".");
19881f817d25 Uploaded
greg
parents: 25
diff changeset
75 cmd = paste("bamCoverage --bam", file_path, "-o", bigwig_file_name, "--binSize", min_window_size);
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
76 run_cmd(cmd);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
77 } else {
19881f817d25 Uploaded
greg
parents: 25
diff changeset
78 bigwig_file_name = file_path;
19881f817d25 Uploaded
greg
parents: 25
diff changeset
79 }
19881f817d25 Uploaded
greg
parents: 25
diff changeset
80 bed_file_name = paste(file_name, "bed", sep=".");
19881f817d25 Uploaded
greg
parents: 25
diff changeset
81 bed_file_path = paste("tmp", bed_file_name, sep="/");
19881f817d25 Uploaded
greg
parents: 25
diff changeset
82 cmd = paste("bigWigAverageOverBed", bigwig_file_name, opt$chrom_bed_input, "stdout | cut -f5 >", bed_file_path);
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
83 run_cmd(cmd);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
84 cmd = paste("gzip -f", bed_file_path);
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
85 run_cmd(cmd);
14
20c21d946a8e Uploaded
greg
parents: 12
diff changeset
86 }
9
0cf23f33d13d Uploaded
greg
parents: 7
diff changeset
87 # Create file1.txt.
5
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
88 cmd = paste("cut -d' '", opt$ideaspre_input_config, "-f1,2 > file1.txt", sep=" ");
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
89 run_cmd(cmd);
22
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
90 # Compress the bed files in the tmp directory.
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
91 tmp_gzipped_files = paste(tmp_dir, "*.bed.gz", sep="/");
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
92 # Create file2.txt.
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
93 cmd = paste("ls", tmp_gzipped_files, "> file2.txt", sep=" ");
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
94 run_cmd(cmd);
12
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
95 # Create IDEAS_input_config.txt with the format required by IDEAS.
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
96 ideas_input_config = "IDEAS_input_config.txt"
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
97 cmd = paste("paste -d' ' file1.txt file2.txt >", ideas_input_config, sep=" " );
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
98 run_cmd(cmd);
12
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
99 # Move IDEAS_input_config.txt to the output directory.
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
100 to_path = paste(opt$output_files_path, ideas_input_config, sep="/");
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
101 file.rename(ideas_input_config, to_path);
22
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
102 # Archive the tmp directory.
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
103 cmd = "tar -cvf tmp.tar tmp";
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
104 run_cmd(cmd);
25
f7563bb242fc Uploaded
greg
parents: 22
diff changeset
105 # Compress the archive.
f7563bb242fc Uploaded
greg
parents: 22
diff changeset
106 cmd = "gzip tmp.tar";
32
ea739695619d Uploaded
greg
parents: 30
diff changeset
107 run_cmd(cmd);
22
3651f1592f3f Uploaded
greg
parents: 21
diff changeset
108 # Move the tmp archive to the output directory.
25
f7563bb242fc Uploaded
greg
parents: 22
diff changeset
109 to_path = paste(opt$output_files_path, "tmp.tar.gz", sep="/");
f7563bb242fc Uploaded
greg
parents: 22
diff changeset
110 file.rename("tmp.tar.gz", to_path);
27
19881f817d25 Uploaded
greg
parents: 25
diff changeset
111 # Handle file names for display in the primary dataset if necessary.
29
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
112 to_path = paste(opt$output_files_path, "chromosomes.bed", sep="/");
30
0cb6ff8ba6df Uploaded
greg
parents: 29
diff changeset
113 if (is.null(opt$chrom_bed_input)) {
29
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
114 # Move cbi_file to the output directory,
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
115 # naming it chromosomes.bed.
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
116 file.rename(cbi_file, to_path);
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
117 } else {
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
118 # Copy opt$chrom_bed_input to the output
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
119 # directory, naming it chromosomes.bed.
15
ce2021cd68d2 Uploaded
greg
parents: 14
diff changeset
120 file.copy(opt$chrom_bed_input, to_path);
29
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
121 }
3b3001355f44 Uploaded
greg
parents: 27
diff changeset
122 if (!is.null(opt$chromosome_windows)) {
12
ab0f306504a3 Uploaded
greg
parents: 10
diff changeset
123 # Move chromosome_windows.txt to the output directory.
5
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
124 to_path = paste(opt$output_files_path, opt$chromosome_windows, sep="/");
2b4e1bd725f7 Uploaded
greg
parents: 3
diff changeset
125 file.rename(opt$chromosome_windows, to_path);
3
e97851e8951b Uploaded
greg
parents: 1
diff changeset
126 }