Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/multiIntersectBed/multiIntersectBedMain.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 multiIntersectBedMain.cpp | |
| 3 | |
| 4 (c) 2010 - Assaf Gordon, CSHL | |
| 5 - Aaron Quinlan, UVA | |
| 6 Hall Laboratory | |
| 7 Department of Biochemistry and Molecular Genetics | |
| 8 University of Virginia | |
| 9 aaronquinlan@gmail.com | |
| 10 | |
| 11 Licensed under the GNU General Public License 2.0. | |
| 12 ******************************************************************************/ | |
| 13 #include <climits> | |
| 14 #include <cstring> | |
| 15 #include <cstdlib> | |
| 16 #include <vector> | |
| 17 #include <string> | |
| 18 #include <iostream> | |
| 19 #include <getopt.h> | |
| 20 #include <libgen.h> //for basename() | |
| 21 #include "version.h" | |
| 22 | |
| 23 #include "genomeFile.h" | |
| 24 #include "multiIntersectBed.h" | |
| 25 | |
| 26 using namespace std; | |
| 27 | |
// Program name as printed in the usage text.
#define PROGRAM_NAME "multiIntersectBed"

// Parameter-checking macro.
//
// Evaluates to true when the current argument, argv[i], is exactly `param`:
// its first `paramLen` characters match and its length (`actualLen`) equals
// `paramLen`. The macro deliberately reads `argv` and `i` from the scope in
// which it is expanded.
//
// The whole expansion and every macro argument are parenthesized so the macro
// can be negated or combined with other operators without precedence
// surprises (e.g. `!PARAMETER_CHECK(...)` negates the complete test).
#define PARAMETER_CHECK(param, paramLen, actualLen) \
    ((strncmp(argv[i], (param), std::min((actualLen), (paramLen))) == 0) && ((actualLen) == (paramLen)))

// STL-ized version of basename()
// (because POSIX basename() may modify the string it is given).
// Additionally: removes any extension the basename might have.
std::string stl_basename(const std::string& path);

// Function declarations.
void ShowHelp(void);      // prints the usage text to stderr
void ShowExamples(void);  // prints the worked examples to stderr
| 42 | |
| 43 | |
| 44 int main(int argc, char* argv[]) | |
| 45 { | |
| 46 bool haveFiles = false; | |
| 47 bool haveTitles = false; | |
| 48 bool haveGenome = false; | |
| 49 bool haveFiller = true; | |
| 50 bool printHeader = false; | |
| 51 bool printEmptyRegions = false; | |
| 52 bool showHelp = false; | |
| 53 string genomeFile; | |
| 54 string basePath; | |
| 55 string noCoverageValue = "0"; | |
| 56 vector<string> inputFiles; | |
| 57 vector<string> inputTitles; | |
| 58 | |
| 59 //Parse command line options | |
| 60 if(argc <= 1) | |
| 61 ShowHelp(); | |
| 62 | |
| 63 for(int i = 1; i < argc; i++) { | |
| 64 int parameterLength = (int)strlen(argv[i]); | |
| 65 | |
| 66 if((PARAMETER_CHECK("-h", 2, parameterLength)) || | |
| 67 (PARAMETER_CHECK("--help", 5, parameterLength))) { | |
| 68 showHelp = true; | |
| 69 } | |
| 70 } | |
| 71 | |
| 72 if(showHelp == true) { | |
| 73 ShowHelp(); | |
| 74 exit(1); | |
| 75 } | |
| 76 | |
| 77 // do some parsing (all of these parameters require 2 strings) | |
| 78 for(int i = 1; i < argc; i++) { | |
| 79 | |
| 80 int parameterLength = (int)strlen(argv[i]); | |
| 81 | |
| 82 if(PARAMETER_CHECK("-i", 2, parameterLength)) { | |
| 83 if ((i+1) < argc) { | |
| 84 haveFiles = true; | |
| 85 i = i+1; | |
| 86 string file = argv[i]; | |
| 87 while (file[0] != '-' && i < argc) { | |
| 88 inputFiles.push_back(file); | |
| 89 i++; | |
| 90 if (i < argc) | |
| 91 file = argv[i]; | |
| 92 } | |
| 93 i--; | |
| 94 } | |
| 95 } | |
| 96 else if(PARAMETER_CHECK("-names", 6, parameterLength)) { | |
| 97 if ((i+1) < argc) { | |
| 98 haveTitles = true; | |
| 99 i = i+1; | |
| 100 string title = argv[i]; | |
| 101 while (title[0] != '-' && i < argc) { | |
| 102 inputTitles.push_back(title); | |
| 103 i++; | |
| 104 if (i < argc) | |
| 105 title = argv[i]; | |
| 106 } | |
| 107 i--; | |
| 108 } | |
| 109 } | |
| 110 else if(PARAMETER_CHECK("-g", 2, parameterLength)) { | |
| 111 if ((i+1) < argc) { | |
| 112 haveGenome = true; | |
| 113 genomeFile = argv[i + 1]; | |
| 114 i++; | |
| 115 } | |
| 116 } | |
| 117 else if(PARAMETER_CHECK("-filler", 7, parameterLength)) { | |
| 118 if ((i+1) < argc) { | |
| 119 haveFiller = true; | |
| 120 noCoverageValue = argv[i + 1]; | |
| 121 i++; | |
| 122 } | |
| 123 } | |
| 124 else if(PARAMETER_CHECK("-header", 7, parameterLength)) { | |
| 125 printHeader = true; | |
| 126 } | |
| 127 else if(PARAMETER_CHECK("-empty", 6, parameterLength)) { | |
| 128 printEmptyRegions = true; | |
| 129 } | |
| 130 else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { | |
| 131 ShowHelp(); | |
| 132 ShowExamples(); | |
| 133 exit(1); | |
| 134 } | |
| 135 } | |
| 136 | |
| 137 //Sanity checks | |
| 138 if (inputFiles.empty() == true) { | |
| 139 cerr << "Error: missing BedGraph file names (-i) to combine." << endl; | |
| 140 exit(1); | |
| 141 } | |
| 142 if (inputFiles.size() == 1) { | |
| 143 cerr << "Error: Only a single BedGraph file was specified. Nothing to combine, exiting." << endl; | |
| 144 exit(1); | |
| 145 } | |
| 146 if (printEmptyRegions && (genomeFile.empty() == true)) { | |
| 147 cerr << "Error: when using -empty, the genome sizes file (-g) must be specified using '-g FILE'." << endl; | |
| 148 exit(1); | |
| 149 } | |
| 150 if ((haveTitles == true) && (inputFiles.size() != inputTitles.size())) { | |
| 151 cerr << "Error: The number of file titles (-names) does not match the number of files (-i)." << endl; | |
| 152 exit(1); | |
| 153 } | |
| 154 | |
| 155 MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); | |
| 156 if (printHeader) | |
| 157 mbi.PrintHeader(); | |
| 158 mbi.MultiIntersect(); | |
| 159 } | |
| 160 | |
| 161 void ShowHelp(void) { | |
| 162 | |
| 163 cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; | |
| 164 | |
| 165 cerr << "Authors: Assaf Gordon, CSHL" << endl; | |
| 166 cerr << " Aaron Quinlan (aaronquinlan@gmail.com)" << endl << endl; | |
| 167 | |
| 168 cerr << "Summary: Combines multiple BedGraph files into a single file," << endl; | |
| 169 cerr << "\t allowing coverage comparisons between them." << endl << endl; | |
| 170 | |
| 171 cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i FILE1 FILE2 .. FILEn" << endl; | |
| 172 cerr << "\t Assumes that each BedGraph file is sorted by chrom/start " << endl; | |
| 173 cerr << "\t and that the intervals in each are non-overlapping." << endl << endl; | |
| 174 | |
| 175 cerr << "Options: " << endl; | |
| 176 | |
| 177 cerr << "\t-header\t\t" << "Print a header line." << endl; | |
| 178 cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; | |
| 179 | |
| 180 cerr << "\t-names\t\t" << "A list of names (one / file) to describe each file in -i." << endl; | |
| 181 cerr << "\t\t\tThese names will be printed in the header line." << endl << endl; | |
| 182 | |
| 183 cerr << "\t-g\t\t" << "Use genome file to calculate empty regions." << endl; | |
| 184 cerr << "\t\t\t- STRING." << endl << endl; | |
| 185 | |
| 186 cerr << "\t-empty\t\t" << "Report empty regions (i.e., start/end intervals w/o" << endl; | |
| 187 cerr << "\t\t\tvalues in all files)." << endl; | |
| 188 cerr << "\t\t\t- Requires the '-g FILE' parameter.\n" << endl; | |
| 189 | |
| 190 cerr << "\t-filler TEXT\t" << "Use TEXT when representing intervals having no value." << endl; | |
| 191 cerr << "\t\t\t- Default is '0', but you can use 'N/A' or any other text." << endl << endl; | |
| 192 | |
| 193 cerr << "\t-examples\t" << "Show detailed usage examples." << endl << endl; | |
| 194 } | |
| 195 | |
| 196 | |
| 197 | |
// Writes a set of worked examples (sample inputs, command lines and the
// resulting tables) to stderr. It only prints; terminating the program is
// left to the caller.
//
// NOTE(review): the sample commands invoke `unionBedGraphs`, although this
// program's name is PROGRAM_NAME ("multiIntersectBed"); the text looks
// inherited from that tool, so confirm the commands and tables against this
// tool's real output before relying on them.
void ShowExamples()
{
    // Result rows shared verbatim by the first three examples.
    static const char* const combinedRows =
        " chr1 900 1000 0 60 0\n"
        " chr1 1000 1500 10 60 0\n"
        " chr1 1500 1600 0 60 0\n"
        " chr1 1700 1980 0 50 0\n"
        " chr1 1980 2000 0 50 80\n"
        " chr1 2000 2050 20 50 80\n"
        " chr1 2050 2070 20 0 80\n"
        " chr1 2070 2090 20 0 0\n"
        " chr1 2090 2100 20 0 20\n";

    std::cerr
        << "Example usage:\n\n"
           "== Input files: ==\n"
           "\n"
           " $ cat 1.bg\n"
           " chr1 1000 1500 10\n"
           " chr1 2000 2100 20\n"
           "\n"
           " $ cat 2.bg\n"
           " chr1 900 1600 60\n"
           " chr1 1700 2050 50\n"
           "\n"
           " $ cat 3.bg\n"
           " chr1 1980 2070 80\n"
           " chr1 2090 2100 20\n"
           "\n"
           " $ cat sizes.txt\n"
           " chr1 5000\n"
           "\n"

        << "== Union/combine the files: ==\n"
           "\n"
           " $ unionBedGraphs -i 1.bg 2.bg 3.bg\n"
        << combinedRows
        << "\n"

        << "== Union/combine the files, with a header line (titles are the file names): ==\n"
           "\n"
           " $ unionBedGraphs -header -i 1.bg 2.bg 3.bg\n"
           " chrom start end 1 2 3\n"
        << combinedRows
        << "\n"

        << "== Union/combine the files, with a header line and custom names: ==\n"
           "\n"
           " $ unionBedGraphs -header -i 1.bg 2.bg 3.bg -names WT-1 WT-2 KO-1\n"
           " chrom start end WT-1 WT-2 KO-1\n"
        << combinedRows
        << "\n"

        // The genome file is the `sizes.txt` shown above; the name must match
        // exactly, since file names are case-sensitive on most systems.
        << "== Union/combine, showing empty regions (note, requires -g): ==\n"
           "\n"
           " $ unionBedGraphs -header -empty -g sizes.txt -i 1.bg 2.bg 3.bg\n"
           " chrom start end 1 2 3\n"
           " chr1 0 900 0 0 0\n"
           " chr1 900 1000 0 60 0\n"
           " chr1 1000 1500 10 60 0\n"
           " chr1 1500 1600 0 60 0\n"
           " chr1 1600 1700 0 0 0\n"
           " chr1 1700 1980 0 50 0\n"
           " chr1 1980 2000 0 50 80\n"
           " chr1 2000 2050 20 50 80\n"
           " chr1 2050 2070 20 0 80\n"
           " chr1 2070 2090 20 0 0\n"
           " chr1 2090 2100 20 0 20\n"
           " chr1 2100 5000 0 0 0\n"
           "\n";
}
| 278 | |
// Returns the final component of `path` with its last extension removed,
// e.g. "/data/sample.sorted.bg" -> "sample.sorted".
//
// Only a '.' preceded by at least one non-dot character counts as the start
// of an extension. Names that are nothing but an "extension" - hidden files
// such as ".hidden", and the special entries "." and ".." - are therefore
// returned unchanged instead of being reduced to an empty or altered name.
// An empty `path` yields an empty string.
std::string stl_basename(const std::string& path)
{
    if (path.empty())
        return std::string();

    // basename() may modify the buffer it is given, so hand it a private,
    // writable, NUL-terminated copy that is released automatically.
    std::vector<char> scratch(path.c_str(), path.c_str() + path.size() + 1);
    std::string result = basename(&scratch[0]);

    const std::string::size_type lastDot     = result.find_last_of('.');
    const std::string::size_type firstNonDot = result.find_first_not_of('.');

    // firstNonDot is npos for all-dot names, so the comparison fails for them.
    if (lastDot != std::string::npos && firstNonDot < lastDot)
        result.erase(lastDot);

    return result;
}
| 294 |
