Mercurial > repos > devteam > vcf_intersect
comparison intersect.py @ 0:f5d5eed73180 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Thu, 23 Jan 2014 12:31:34 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f5d5eed73180 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import os.path | |
| 4 import sys | |
| 5 import optparse | |
| 6 | |
| 7 import bedClass | |
| 8 from bedClass import * | |
| 9 | |
| 10 import vcfClass | |
| 11 from vcfClass import * | |
| 12 | |
| 13 import tools | |
| 14 from tools import * | |
| 15 | |
| 16 if __name__ == "__main__": | |
| 17 main() | |
| 18 | |
| 19 # Intersect two vcf files. It is assumed that the two files are | |
| 20 # sorted by genomic coordinates and the reference sequences are | |
| 21 # in the same order. | |
| 22 def intersectVcf(v1, v2, priority, outputFile): | |
| 23 success1 = v1.getRecord() | |
| 24 success2 = v2.getRecord() | |
| 25 currentReferenceSequence = v1.referenceSequence | |
| 26 | |
| 27 # As soon as the end of either file is reached, there can be no | |
| 28 # more intersecting SNPs, so terminate. | |
| 29 while success1 and success2: | |
| 30 if v1.referenceSequence == v2.referenceSequence and v1.referenceSequence == currentReferenceSequence: | |
| 31 if v1.position == v2.position: | |
| 32 writeVcfRecord(priority, v1, v2, outputFile) | |
| 33 success1 = v1.getRecord() | |
| 34 success2 = v2.getRecord() | |
| 35 elif v2.position > v1.position: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None) | |
| 36 elif v1.position > v2.position: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None) | |
| 37 else: | |
| 38 if v1.referenceSequence == currentReferenceSequence: success1 = v1.parseVcf(v2.referenceSequence, v2.position, False, None) | |
| 39 elif v2.referenceSequence == currentReferenceSequence: success2 = v2.parseVcf(v1.referenceSequence, v1.position, False, None) | |
| 40 | |
| 41 # If the last record for a reference sequence is the same for both vcf | |
| 42 # files, they will both have referenceSequences different from the | |
| 43 # current reference sequence. Change the reference sequence to reflect | |
| 44 # this and proceed. | |
| 45 else: | |
| 46 if v1.referenceSequence != v2.referenceSequence: | |
| 47 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different." | |
| 48 print >> sys.stderr, "Check that both files contain records for the following reference sequences:" | |
| 49 print >> sys.stderr, "\t", v1.referenceSequence, " and ", v2.referenceSequence | |
| 50 exit(1) | |
| 51 currentReferenceSequence = v1.referenceSequence | |
| 52 | |
| 53 # Intersect a vcf file and a bed file. It is assumed that the | |
| 54 # two files are sorted by genomic coordinates and the reference | |
| 55 # sequences are in the same order. | |
| 56 def intersectVcfBed(v, b, outputFile): | |
| 57 successb = b.getRecord() | |
| 58 successv = v.getRecord() | |
| 59 currentReferenceSequence = v.referenceSequence | |
| 60 | |
| 61 # As soon as the end of the first file is reached, there are no | |
| 62 # more intersections and the program can terminate. | |
| 63 while successv: | |
| 64 if v.referenceSequence == b.referenceSequence: | |
| 65 if v.position < b.start: successv = v.parseVcf(b.referenceSequence, b.start, False, None) | |
| 66 elif v.position > b.end: successb = b.parseBed(v.referenceSequence, v.position) | |
| 67 else: | |
| 68 outputFile.write(v.record) | |
| 69 successv = v.getRecord() | |
| 70 else: | |
| 71 if v.referenceSequence == currentReferenceSequence: successv = v.parseVcf(b.referenceSequence, b.start, False, None) | |
| 72 if b.referenceSequence == currentReferenceSequence: successb = b.parseBed(v.referenceSequence, v.position) | |
| 73 currentReferenceSequence = v.referenceSequence | |
| 74 | |
| 75 def main(): | |
| 76 | |
| 77 # Parse the command line options | |
| 78 usage = "Usage: vcfPytools.py intersect [options]" | |
| 79 parser = optparse.OptionParser(usage = usage) | |
| 80 parser.add_option("-i", "--in", | |
| 81 action="append", type="string", | |
| 82 dest="vcfFiles", help="input vcf files") | |
| 83 parser.add_option("-b", "--bed", | |
| 84 action="store", type="string", | |
| 85 dest="bedFile", help="input bed vcf file") | |
| 86 parser.add_option("-o", "--out", | |
| 87 action="store", type="string", | |
| 88 dest="output", help="output vcf file") | |
| 89 parser.add_option("-f", "--priority-file", | |
| 90 action="store", type="string", | |
| 91 dest="priorityFile", help="output records from this vcf file") | |
| 92 | |
| 93 (options, args) = parser.parse_args() | |
| 94 | |
| 95 # Check that a single vcf file is given. | |
| 96 if options.vcfFiles == None: | |
| 97 parser.print_help() | |
| 98 print >> sys.stderr, "\nAt least one vcf file (--in, -i) is required for performing intersection." | |
| 99 exit(1) | |
| 100 elif len(options.vcfFiles) > 2: | |
| 101 parser.print_help() | |
| 102 print >> sys.stderr, "\nAt most, two vcf files (--in, -i) can be submitted for performing intersection." | |
| 103 exit(1) | |
| 104 elif len(options.vcfFiles) == 1 and not options.bedFile: | |
| 105 parser.print_help() | |
| 106 print >> sys.stderr, "\nIf only one vcf file (--in, -i) is specified, a bed file is also required for performing intersection." | |
| 107 exit(1) | |
| 108 | |
| 109 # Set the output file to stdout if no output file was specified. | |
| 110 outputFile, writeOut = setOutput(options.output) # tools.py | |
| 111 | |
| 112 # If intersecting with a bed file, call the bed intersection routine. | |
| 113 if options.bedFile: | |
| 114 v = vcf() # Define vcf object. | |
| 115 b = bed() # Define bed object. | |
| 116 | |
| 117 # Open the files. | |
| 118 v.openVcf(options.vcfFiles[0]) | |
| 119 b.openBed(options.bedFile) | |
| 120 | |
| 121 # Read in the header information. | |
| 122 v.parseHeader(options.vcfFiles[0], writeOut) | |
| 123 taskDescriptor = "##vcfPytools=intersect " + options.vcfFiles[0] + ", " + options.bedFile | |
| 124 writeHeader(outputFile, v, False, taskDescriptor) # tools.py | |
| 125 | |
| 126 # Intersect the vcf file with the bed file. | |
| 127 intersectVcfBed(v, b, outputFile) | |
| 128 | |
| 129 # Check that the input files had the same list of reference sequences. | |
| 130 # If not, it is possible that there were some problems. | |
| 131 checkReferenceSequenceLists(v.referenceSequenceList, b.referenceSequenceList) # tools.py | |
| 132 | |
| 133 # Close the files. | |
| 134 v.closeVcf(options.vcfFiles[0]) | |
| 135 b.closeBed(options.bedFile) | |
| 136 | |
| 137 else: | |
| 138 priority = setVcfPriority(options.priorityFile, options.vcfFiles) | |
| 139 v1 = vcf() # Define vcf object. | |
| 140 v2 = vcf() # Define vcf object. | |
| 141 | |
| 142 # Open the vcf files. | |
| 143 v1.openVcf(options.vcfFiles[0]) | |
| 144 v2.openVcf(options.vcfFiles[1]) | |
| 145 | |
| 146 # Read in the header information. | |
| 147 v1.parseHeader(options.vcfFiles[0], writeOut) | |
| 148 v2.parseHeader(options.vcfFiles[1], writeOut) | |
| 149 if priority == 3: | |
| 150 v3 = vcf() # Generate a new vcf object that will contain the header information of the new file. | |
| 151 mergeHeaders(v1, v2, v3) # tools.py | |
| 152 v1.processInfo = True | |
| 153 v2.processInfo = True | |
| 154 else: checkDataSets(v1, v2) | |
| 155 | |
| 156 #print v1.samplesList | |
| 157 #print v2.samplesList | |
| 158 | |
| 159 # Check that the header for the two files contain the same samples. | |
| 160 if v1.samplesList != v2.samplesList: | |
| 161 print >> sys.stderr, "vcf files contain different samples (or sample order)." | |
| 162 exit(1) | |
| 163 else: | |
| 164 taskDescriptor = "##vcfPytools=intersect " + v1.filename + ", " + v2.filename | |
| 165 if priority == 3: writeHeader(outputFile, v3, False, taskDescriptor) | |
| 166 elif (priority == 2 and v2.hasHeader) or not v1.hasHeader: writeHeader(outputFile, v2, False, taskDescriptor) # tools.py | |
| 167 else: writeHeader(outputFile, v1, False, taskDescriptor) # tools.py | |
| 168 | |
| 169 # Intersect the two vcf files. | |
| 170 intersectVcf(v1, v2, priority, outputFile) | |
| 171 | |
| 172 # Check that the input files had the same list of reference sequences. | |
| 173 # If not, it is possible that there were some problems. | |
| 174 checkReferenceSequenceLists(v1.referenceSequenceList, v2.referenceSequenceList) # tools.py | |
| 175 | |
| 176 # Close the vcf files. | |
| 177 v1.closeVcf(options.vcfFiles[0]) | |
| 178 v2.closeVcf(options.vcfFiles[1]) | |
| 179 | |
| 180 # End the program. | |
| 181 return 0 |
