| 0 | 1 #!/usr/bin/python | 
|  | 2 | 
|  | 3 import os.path | 
|  | 4 import sys | 
|  | 5 import optparse | 
|  | 6 | 
|  | 7 import bedClass | 
|  | 8 from bedClass import * | 
|  | 9 | 
|  | 10 import vcfClass | 
|  | 11 from vcfClass import * | 
|  | 12 | 
|  | 13 import tools | 
|  | 14 from tools import * | 
|  | 15 | 
|  | 16 if __name__ == "__main__": | 
|  | 17   main() | 
|  | 18 | 
|  | 19 # Intersect two vcf files.  It is assumed that the two files are | 
|  | 20 # sorted by genomic coordinates and the reference sequences are | 
|  | 21 # in the same order. | 
|  | 22 def intersectVcf(v1, v2, priority, outputFile): | 
|  | 23   success1 = v1.getRecord() | 
|  | 24   success2 = v2.getRecord() | 
|  | 25   currentReferenceSequence = v1.referenceSequence | 
|  | 26 | 
|  | 27 # As soon as the end of either file is reached, there can be no | 
|  | 28 # more intersecting SNPs, so terminate. | 
|  | 29   while success1 and success2: | 
|  | 30     if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence: | 
|  | 31       if v1.position == v2.position: | 
|  | 32         writeVcfRecord(priority, v1, v2, outputFile) | 
|  | 33         success1 = v1.getRecord() | 
|  | 34         success2 = v2.getRecord() | 
|  | 35       elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None) | 
|  | 36       elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None) | 
|  | 37     else: | 
|  | 38       if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None) | 
|  | 39       elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None) | 
|  | 40 | 
|  | 41 # If the last record for a reference sequence is the same for both vcf | 
|  | 42 # files, they will both have referenceSequences different from the | 
|  | 43 # current reference sequence.  Change the reference sequence to reflect | 
|  | 44 # this and proceed. | 
|  | 45       else: | 
|  | 46         if v1.referenceSequence != v2.referenceSequence: | 
|  | 47           print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." | 
|  | 48           print >> sys.stderr, "Check that both files contain records for the following reference sequences:" | 
|  | 49           print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence | 
|  | 50           exit(1) | 
|  | 51       currentReferenceSequence = v1.referenceSequence | 
|  | 52 | 
|  | 53 # Intersect a vcf file and a bed file.  It is assumed that the | 
|  | 54 # two files are sorted by genomic coordinates and the reference | 
|  | 55 # sequences are in the same order. | 
|  | 56 def intersectVcfBed(v, b, outputFile): | 
|  | 57   successb = b.getRecord() | 
|  | 58   successv = v.getRecord() | 
|  | 59   currentReferenceSequence = v.referenceSequence | 
|  | 60 | 
|  | 61 # As soon as the end of the first file is reached, there are no | 
|  | 62 # more intersections and the program can terminate. | 
|  | 63   while successv: | 
|  | 64     if v.referenceSequence == b.referenceSequence: | 
|  | 65       if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None) | 
|  | 66       elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position) | 
|  | 67       else: | 
|  | 68         outputFile.write(v.record) | 
|  | 69         successv = v.getRecord() | 
|  | 70     else: | 
|  | 71       if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None) | 
|  | 72       if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position) | 
|  | 73       currentReferenceSequence = v.referenceSequence | 
|  | 74 | 
|  | 75 def main(): | 
|  | 76 | 
|  | 77 # Parse the command line options | 
|  | 78   usage = "Usage: vcfPytools.py intersect [options]" | 
|  | 79   parser = optparse.OptionParser(usage = usage) | 
|  | 80   parser.add_option("-i", "--in", | 
|  | 81                     action="append", type="string", | 
|  | 82                     dest="vcfFiles", help="input vcf files") | 
|  | 83   parser.add_option("-b", "--bed", | 
|  | 84                     action="store", type="string", | 
|  | 85                     dest="bedFile", help="input bed vcf file") | 
|  | 86   parser.add_option("-o", "--out", | 
|  | 87                     action="store", type="string", | 
|  | 88                     dest="output", help="output vcf file") | 
|  | 89   parser.add_option("-f", "--priority-file", | 
|  | 90                     action="store", type="string", | 
|  | 91                     dest="priorityFile", help="output records from this vcf file") | 
|  | 92 | 
|  | 93   (options, args) = parser.parse_args() | 
|  | 94 | 
|  | 95 # Check that a single  vcf file is given. | 
|  | 96   if options.vcfFiles == None: | 
|  | 97     parser.print_help() | 
|  | 98     print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection." | 
|  | 99     exit(1) | 
|  | 100   elif len(options.vcfFiles) > 2: | 
|  | 101     parser.print_help() | 
|  | 102     print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection." | 
|  | 103     exit(1) | 
|  | 104   elif len(options.vcfFiles) == 1 and not options.bedFile: | 
|  | 105     parser.print_help() | 
|  | 106     print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection." | 
|  | 107     exit(1) | 
|  | 108 | 
|  | 109 # Set the output file to stdout if no output file was specified. | 
|  | 110   outputFile, writeOut = setOutput(options.output) # tools.py | 
|  | 111 | 
|  | 112 # If intersecting with a bed file, call the bed intersection routine. | 
|  | 113   if options.bedFile: | 
|  | 114     v = vcf() # Define vcf object. | 
|  | 115     b = bed() # Define bed object. | 
|  | 116 | 
|  | 117 # Open the files. | 
|  | 118     v.openVcf(options.vcfFiles[0]) | 
|  | 119     b.openBed(options.bedFile) | 
|  | 120 | 
|  | 121 # Read in the header information. | 
|  | 122     v.parseHeader(options.vcfFiles[0], writeOut) | 
|  | 123     taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile | 
|  | 124     writeHeader(outputFile, v, False, taskDescriptor) # tools.py | 
|  | 125 | 
|  | 126 # Intersect the vcf file with the bed file. | 
|  | 127     intersectVcfBed(v, b, outputFile) | 
|  | 128 | 
|  | 129 # Check that the input files had the same list of reference sequences. | 
|  | 130 # If not, it is possible that there were some problems. | 
|  | 131     checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py | 
|  | 132 | 
|  | 133 # Close the files. | 
|  | 134     v.closeVcf(options.vcfFiles[0]) | 
|  | 135     b.closeBed(options.bedFile) | 
|  | 136 | 
|  | 137   else: | 
|  | 138     priority = setVcfPriority(options.priorityFile, options.vcfFiles) | 
|  | 139     v1 = vcf() # Define vcf object. | 
|  | 140     v2 = vcf() # Define vcf object. | 
|  | 141 | 
|  | 142 # Open the vcf files. | 
|  | 143     v1.openVcf(options.vcfFiles[0]) | 
|  | 144     v2.openVcf(options.vcfFiles[1]) | 
|  | 145 | 
|  | 146 # Read in the header information. | 
|  | 147     v1.parseHeader(options.vcfFiles[0], writeOut) | 
|  | 148     v2.parseHeader(options.vcfFiles[1], writeOut) | 
|  | 149     if priority == 3: | 
|  | 150       v3 = vcf() # Generate a new vcf object that will contain the header information of the new file. | 
|  | 151       mergeHeaders(v1, v2, v3) # tools.py | 
|  | 152       v1.processInfo = True | 
|  | 153       v2.processInfo = True | 
|  | 154     else: checkDataSets(v1, v2) | 
|  | 155 | 
|  | 156     #print v1.samplesList | 
|  | 157     #print v2.samplesList | 
|  | 158 | 
|  | 159 # Check that the header for the two files contain the same samples. | 
|  | 160     if v1.samplesList != v2.samplesList: | 
|  | 161       print >> sys.stderr, "vcf files contain different samples (or sample order)." | 
|  | 162       exit(1) | 
|  | 163     else: | 
|  | 164       taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename | 
|  | 165       if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor) | 
|  | 166       elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py | 
|  | 167       else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py | 
|  | 168 | 
|  | 169 # Intersect the two vcf files. | 
|  | 170     intersectVcf(v1, v2, priority, outputFile) | 
|  | 171 | 
|  | 172 # Check that the input files had the same list of reference sequences. | 
|  | 173 # If not, it is possible that there were some problems. | 
|  | 174     checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py | 
|  | 175 | 
|  | 176 # Close the vcf files. | 
|  | 177     v1.closeVcf(options.vcfFiles[0]) | 
|  | 178     v2.closeVcf(options.vcfFiles[1]) | 
|  | 179 | 
|  | 180 # End the program. | 
|  | 181   return 0 |