Mercurial > repos > devteam > vcf_intersect
comparison vcfClass.py @ 0:f5d5eed73180 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Thu, 23 Jan 2014 12:31:34 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f5d5eed73180 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import os.path | |
| 4 import sys | |
| 5 import re | |
| 6 | |
| 7 class vcf: | |
| 8 def __init__(self): | |
| 9 | |
| 10 # Header info. | |
| 11 self.filename = "" | |
| 12 self.hasHeader = True | |
| 13 self.headerText = "" | |
| 14 self.headerTitles = "" | |
| 15 self.vcfFormat = "" | |
| 16 #self.headerInfoText = "" | |
| 17 #self.headerFormatText = "" | |
| 18 | |
| 19 # Store the info and format tags as well as the lines that describe | |
| 20 # them in a dictionary. | |
| 21 self.numberDataSets = 0 | |
| 22 self.includedDataSets = {} | |
| 23 self.infoHeaderTags = {} | |
| 24 self.infoHeaderString = {} | |
| 25 self.formatHeaderTags = {} | |
| 26 self.formatHeaderString = {} | |
| 27 | |
| 28 # Genotype information. | |
| 29 self.genotypes = False | |
| 30 self.infoField = {} | |
| 31 | |
| 32 # Reference sequence information. | |
| 33 self.referenceSequences = {} | |
| 34 self.referenceSequenceList = [] | |
| 35 self.referenceSequence = "" | |
| 36 | |
| 37 # Record information. | |
| 38 self.position = -1 | |
| 39 self.samplesList = [] | |
| 40 | |
| 41 # Determine which fields to process. | |
| 42 self.processInfo = False | |
| 43 self.processGenotypes = False | |
| 44 self.dbsnpVcf = False | |
| 45 self.hapmapVcf = False | |
| 46 | |
| 47 # Open a vcf file. | |
| 48 def openVcf(self, filename): | |
| 49 if filename == "stdin": | |
| 50 self.filehandle = sys.stdin | |
| 51 self.filename = "stdin" | |
| 52 else: | |
| 53 try: self.filehandle = open(filename,"r") | |
| 54 except IOError: | |
| 55 print >> sys.stderr, "Failed to find file: ",filename | |
| 56 exit(1) | |
| 57 self.filename = os.path.abspath(filename) | |
| 58 | |
| 59 # Parse the vcf header. | |
| 60 def parseHeader(self, filename, writeOut): | |
| 61 while self.getHeaderLine(filename, writeOut): | |
| 62 continue | |
| 63 | |
| 64 # Determine the type of information in the header line. | |
| 65 def getHeaderLine(self, filename, writeOut): | |
| 66 self.headerLine = self.filehandle.readline().rstrip("\n") | |
| 67 if self.headerLine.startswith("##fileformat"): success = self.getvcfFormat() | |
| 68 if self.headerLine.startswith("##INFO"): success = self.headerInfo(writeOut, "info") | |
| 69 elif self.headerLine.startswith("##FORMAT"): success = self.headerInfo(writeOut, "format") | |
| 70 elif self.headerLine.startswith("##FILE"): success = self.headerFiles(writeOut) | |
| 71 elif self.headerLine.startswith("##"): success = self.headerAdditional() | |
| 72 elif self.headerLine.startswith("#"): success = self.headerTitleString(filename, writeOut) | |
| 73 else: success = self.noHeader(filename, writeOut) | |
| 74 | |
| 75 return success | |
| 76 | |
| 77 # Read VCF format | |
| 78 def getvcfFormat(self): | |
| 79 try: | |
| 80 self.vcfFormat = self.headerLine.split("=",1)[1] | |
| 81 self.vcfFormat = float( self.vcfFormat.split("VCFv",1)[1] )## Extract the version number rather than the whole string | |
| 82 except IndexError: | |
| 83 print >> sys.stderr, "\nError parsing the fileformat" | |
| 84 print >> sys.stderr, "The following fileformat header is wrongly formatted: ", self.headerLine | |
| 85 exit(1) | |
| 86 return True | |
| 87 | |
| 88 | |
| 89 # Read information on an info field from the header line. | |
| 90 def headerInfo(self, writeOut, lineType): | |
| 91 tag = self.headerLine.split("=",1) | |
| 92 tagID = (tag[1].split("ID=",1))[1].split(",",1) | |
| 93 | |
| 94 # Check if this info field has already been defined. | |
| 95 if (lineType == "info" and self.infoHeaderTags.has_key(tagID[0])) or (lineType == "format" and self.formatHeaderTags.has_key(tagID[0])): | |
| 96 print >> sys.stderr, "Info tag \"", tagID[0], "\" is defined multiple times in the header." | |
| 97 exit(1) | |
| 98 | |
| 99 # Determine the number of entries, entry type and description. | |
| 100 tagNumber = (tagID[1].split("Number=",1))[1].split(",",1) | |
| 101 tagType = (tagNumber[1].split("Type=",1))[1].split(",",1) | |
| 102 try: tagDescription = ( ( (tagType[1].split("Description=\"",1))[1] ).split("\">") )[0] | |
| 103 except IndexError: tagDescription = "" | |
| 104 tagID = tagID[0]; tagNumber = tagNumber[0]; tagType = tagType[0] | |
| 105 | |
| 106 # Check that the number of fields associated with the tag is either | |
| 107 # an integer or a '.' to indicate variable number of entries. | |
| 108 if tagNumber == ".": tagNumber = "variable" | |
| 109 else: | |
| 110 if self.vcfFormat<4.1: | |
| 111 | |
| 112 try: | |
| 113 tagNumber = int(tagNumber) | |
| 114 | |
| 115 except ValueError: | |
| 116 print >> sys.stderr, "\nError parsing header. Problem with info tag:", tagID | |
| 117 print >> sys.stderr, "Number of fields associated with this tag is not an integer or '.'" | |
| 118 exit(1) | |
| 119 | |
| 120 if lineType == "info": | |
| 121 self.infoHeaderTags[tagID] = tagNumber, tagType, tagDescription | |
| 122 self.infoHeaderString[tagID] = self.headerLine | |
| 123 if lineType == "format": | |
| 124 self.formatHeaderTags[tagID] = tagNumber, tagType, tagDescription | |
| 125 self.formatHeaderString[tagID] = self.headerLine | |
| 126 | |
| 127 return True | |
| 128 | |
| 129 # Check to see if the records contain information from multiple different | |
| 130 # sources. If vcfPytools has been used to find the intersection or union | |
| 131 # of two vcf files, the records may have been merged to keep all the | |
| 132 # information available. If this is the case, there will be a ##FILE line | |
| 133 # for each set of information in the file. The order of these files needs | |
| 134 # to be maintained. | |
| 135 def headerFiles(self, writeOut): | |
| 136 fileID = (self.headerLine.split("ID=",1))[1].split(",",1) | |
| 137 filename = fileID[1].split("\"",2)[1] | |
| 138 try: fileID = int(fileID[0]) | |
| 139 except ValueError: | |
| 140 print >> sys.stderr, "File ID in ##FILE entry must be an integer." | |
| 141 print >> sys.stderr, self.headerLine | |
| 142 exit(1) | |
| 143 if self.includedDataSets.has_key(fileID): | |
| 144 print >> sys.stderr, "\nERROR: file " + self.filename | |
| 145 print >> sys.stderr, "Multiple files in the ##FILE list have identical ID values." | |
| 146 exit(1) | |
| 147 self.includedDataSets[fileID] = filename | |
| 148 | |
| 149 # Set the number of files with information in this vcf file. | |
| 150 if fileID > self.numberDataSets: self.numberDataSets = fileID | |
| 151 | |
| 152 return True | |
| 153 | |
| 154 # Read additional information contained in the header. | |
| 155 def headerAdditional(self): | |
| 156 self.headerText += self.headerLine + "\n" | |
| 157 | |
| 158 return True | |
| 159 | |
| 160 # Read in the column titles to check that all standard fields | |
| 161 # are present and read in all the samples. | |
| 162 def headerTitleString(self, filename, writeOut): | |
| 163 self.headerTitles = self.headerLine + "\n" | |
| 164 | |
| 165 # Strip the end of line character from the last infoFields entry. | |
| 166 infoFields = self.headerLine.split("\t") | |
| 167 if len(infoFields) > 8: | |
| 168 # if len(infoFields) - 9 == 1 and writeOut: print >> sys.stdout, len(infoFields) - 9, " sample present in vcf file: ", filename | |
| 169 # elif writeOut: print >> sys.stdout, len(infoFields) - 9, " samples present in vcf file: ", filename | |
| 170 self.samplesList = infoFields[9:] | |
| 171 self.genotypes = True | |
| 172 elif len(infoFields) == 8: | |
| 173 if writeOut: print >> sys.stdout, "No samples present in the header. No genotype information available." | |
| 174 else: | |
| 175 print self.headerLine, len(infoFields) | |
| 176 print >> sys.stderr, "Not all vcf standard fields are available." | |
| 177 exit(1) | |
| 178 | |
| 179 return False | |
| 180 | |
| 181 # If there is no header in the vcf file, close and reopen the | |
| 182 # file so that the first line is avaiable for parsing as a | |
| 183 # vcf record. | |
| 184 def noHeader(self, filename, writeOut): | |
| 185 if writeOut: print >> sys.stdout, "No header lines present in", filename | |
| 186 self.hasHeader = False | |
| 187 self.closeVcf(filename) | |
| 188 self.openVcf(filename) | |
| 189 | |
| 190 return False | |
| 191 | |
| 192 # Check that info fields exist. | |
| 193 def checkInfoFields(self, tag): | |
| 194 if self.infoHeaderTags.has_key(tag) == False: | |
| 195 print >> sys.stderr, "Info tag \"", tag, "\" does not exist in the header." | |
| 196 exit(1) | |
| 197 | |
| 198 # Get the next line of information from the vcf file. | |
| 199 def getRecord(self): | |
| 200 self.record = self.filehandle.readline() | |
| 201 if not self.record: return False | |
| 202 | |
| 203 # Set up and execute a regular expression match. | |
| 204 recordRe = re.compile(r"^(\S+)\t(\d+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)(\n|\t.+)$") | |
| 205 #recordRe = re.compile(r"^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(\n|\s+.+)$") | |
| 206 recordMatch = recordRe.match(self.record) | |
| 207 if recordMatch == None: | |
| 208 print >> sys.stderr, "Unable to resolve vcf record.\n" | |
| 209 print >> sys.stderr, self.record | |
| 210 exit(1) | |
| 211 | |
| 212 self.referenceSequence = recordMatch.group(1) | |
| 213 try: self.position = int(recordMatch.group(2)) | |
| 214 except ValueError: | |
| 215 text = "variant position is not an integer" | |
| 216 self.generalError(text, "", None) | |
| 217 self.rsid = recordMatch.group(3) | |
| 218 self.ref = recordMatch.group(4) | |
| 219 self.alt = recordMatch.group(5) | |
| 220 self.quality = recordMatch.group(6) | |
| 221 self.filters = recordMatch.group(7) | |
| 222 self.info = recordMatch.group(8) | |
| 223 self.genotypeString = recordMatch.group(9) | |
| 224 self.infoTags = {} | |
| 225 | |
| 226 # Check that the quality is an integer or a float. If not, set the quality | |
| 227 # to zero. | |
| 228 try: self.quality = float(self.quality) | |
| 229 except ValueError: self.quality = float(0.) | |
| 230 | |
| 231 # If recordMatch.group(9) is not the end of line character, there is | |
| 232 # genotype information with this record. | |
| 233 if self.genotypeString != "\n": self.hasGenotypes = True | |
| 234 else: self.hasGenotypes = False | |
| 235 | |
| 236 # Add the reference sequence to the dictionary. If it didn't previously | |
| 237 # exist append the reference sequence to the end of the list as well. | |
| 238 # This ensures that the order in which the reference sequences appeared | |
| 239 # in the header can be preserved. | |
| 240 if self.referenceSequence not in self.referenceSequences: | |
| 241 self.referenceSequences[self.referenceSequence] = True | |
| 242 self.referenceSequenceList.append(self.referenceSequence) | |
| 243 | |
| 244 # Check for multiple alternate alleles. | |
| 245 self.alternateAlleles = self.alt.split(",") | |
| 246 self.numberAlternateAlleles = len(self.alternateAlleles) | |
| 247 | |
| 248 # If required, process the info and genotypes. | |
| 249 if self.processInfo: self.processInfoFields() | |
| 250 if self.processGenotypes and self.hasGenotypes: self.processGenotypeFields() | |
| 251 | |
| 252 return True | |
| 253 | |
| 254 # Process the info string. | |
| 255 def processInfoFields(self): | |
| 256 | |
| 257 # First break the info string into its constituent elements. | |
| 258 infoEntries = self.info.split(";") | |
| 259 | |
| 260 # As long as some info fields exist, place them into a dictionary. | |
| 261 for entry in infoEntries: | |
| 262 infoEntry = entry.split("=") | |
| 263 | |
| 264 # If the entry is a flag, there will be no equals and the length of | |
| 265 # infoEntry will be 1. In this case, set the dictionary entry to the | |
| 266 # whole entry. If the vcf file has undergone a union or intersection | |
| 267 # operation and contains the information from multiple files, this may | |
| 268 # be a '/' seperate list of flags and so cannot be set to a Boolean value | |
| 269 # yet. | |
| 270 if len(infoEntry) == 1: self.infoTags[infoEntry[0]] = infoEntry[0] | |
| 271 elif len(infoEntry) > 1: self.infoTags[infoEntry[0]] = infoEntry[1] | |
| 272 | |
| 273 # Process the genotype formats and values. | |
| 274 def processGenotypeFields(self): | |
| 275 genotypeEntries = self.genotypeString.split("\t") | |
| 276 self.genotypeFormatString = genotypeEntries[1] | |
| 277 self.genotypes = list(genotypeEntries[2:]) | |
| 278 self.genotypeFormats = {} | |
| 279 self.genotypeFields = {} | |
| 280 self.genotypeFormats = self.genotypeFormatString.split(":") | |
| 281 | |
| 282 # Check that the number of genotype fields is equal to the number of samples | |
| 283 if len(self.samplesList) != len(self.genotypes): | |
| 284 text = "The number of genotypes is different to the number of samples" | |
| 285 self.generalError(text, "", "") | |
| 286 | |
| 287 # Add the genotype information to a dictionary. | |
| 288 for i in range( len(self.samplesList) ): | |
| 289 genotypeInfo = self.genotypes[i].split(":") | |
| 290 self.genotypeFields[ self.samplesList[i] ] = {} | |
| 291 | |
| 292 # Check that there are as many fields as in the format field. If not, this must | |
| 293 # be because the information is not known. In this case, it is permitted that | |
| 294 # the genotype information is either . or ./. | |
| 295 if genotypeInfo[0] == "./." or genotypeInfo[0] == "." and len(self.genotypeFormats) != len(genotypeInfo): | |
| 296 self.genotypeFields[ self.samplesList[i] ] = "." | |
| 297 else: | |
| 298 if len(self.genotypeFormats) != len(genotypeInfo): | |
| 299 text = "The number of genotype fields is different to the number specified in the format string" | |
| 300 self.generalError(text, "sample", self.samplesList[i]) | |
| 301 | |
| 302 for j in range( len(self.genotypeFormats) ): self.genotypeFields[ self.samplesList[i] ][ self.genotypeFormats[j] ] = genotypeInfo[j] | |
| 303 | |
| 304 # Parse through the vcf file until the correct reference sequence is | |
| 305 # encountered and the position is greater than or equal to that requested. | |
| 306 def parseVcf(self, referenceSequence, position, writeOut, outputFile): | |
| 307 success = True | |
| 308 if self.referenceSequence != referenceSequence: | |
| 309 while self.referenceSequence != referenceSequence and success: | |
| 310 if writeOut: outputFile.write(self.record) | |
| 311 success = self.getRecord() | |
| 312 | |
| 313 while self.referenceSequence == referenceSequence and self.position < position and success: | |
| 314 if writeOut: outputFile.write(self.record) | |
| 315 success = self.getRecord() | |
| 316 | |
| 317 return success | |
| 318 | |
| 319 # Get the information for a specific info tag. Also check that it contains | |
| 320 # the correct number and type of entries. | |
| 321 def getInfo(self, tag): | |
| 322 result = [] | |
| 323 | |
| 324 # Check if the tag exists in the header information. If so, | |
| 325 # determine the number and type of entries asscoiated with this | |
| 326 # tag. | |
| 327 if self.infoHeaderTags.has_key(tag): | |
| 328 infoNumber = self.infoHeaderTags[tag][0] | |
| 329 infoType = self.infoHeaderTags[tag][1] | |
| 330 numberValues = infoNumber | |
| 331 | |
| 332 # First check that the tag exists in the information string. Then split | |
| 333 # the entry on commas. For flag entries, do not perform the split. | |
| 334 if self.infoTags.has_key(tag): | |
| 335 if numberValues == 0 and infoType == "Flag": result = True | |
| 336 elif numberValues != 0 and infoType == "Flag": | |
| 337 print >> sys.stderr, "ERROR" | |
| 338 exit(1) | |
| 339 else: | |
| 340 fields = self.infoTags[tag].split(",") | |
| 341 if len(fields) != numberValues: | |
| 342 text = "Unexpected number of entries" | |
| 343 self.generalError(text, "information tag", tag) | |
| 344 | |
| 345 for i in range(infoNumber): | |
| 346 try: result.append(fields[i]) | |
| 347 except IndexError: | |
| 348 text = "Insufficient values. Expected: " + self.infoHeaderTags[tag][0] | |
| 349 self.generalError(text, "tag:", tag) | |
| 350 else: numberValues = 0 | |
| 351 | |
| 352 else: | |
| 353 text = "information field does not have a definition in the header" | |
| 354 self.generalError(text, "tag", tag) | |
| 355 | |
| 356 return numberValues, infoType, result | |
| 357 | |
| 358 # Get the genotype information. | |
| 359 def getGenotypeInfo(self, sample, tag): | |
| 360 result = [] | |
| 361 if self.formatHeaderTags.has_key(tag): | |
| 362 infoNumber = self.formatHeaderTags[tag][0] | |
| 363 infoType = self.formatHeaderTags[tag][1] | |
| 364 numberValues = infoNumber | |
| 365 | |
| 366 if self.genotypeFields[sample] == "." and len(self.genotypeFields[sample]) == 1: | |
| 367 numberValues = 0 | |
| 368 result = "." | |
| 369 else: | |
| 370 if self.genotypeFields[sample].has_key(tag): | |
| 371 if tag == "GT": | |
| 372 if len(self.genotypeFields[sample][tag]) != 3 and len(self.genotypeFields[sample][tag]) != 1: | |
| 373 text = "Unexected number of characters in genotype (GT) field" | |
| 374 self.generalError(text, "sample", sample) | |
| 375 | |
| 376 # If a diploid call, check whether or not the genotype is phased. | |
| 377 elif len(self.genotypeFields[sample][tag]) == 3: | |
| 378 self.phased = True if self.genotypeFields[sample][tag][1] == "|" else False | |
| 379 result.append( self.genotypeFields[sample][tag][0] ) | |
| 380 result.append( self.genotypeFields[sample][tag][2] ) | |
| 381 elif len(self.genotypeFields[sample][tag]) == 3: | |
| 382 result.append( self.genotypeFields[sample][tag][0] ) | |
| 383 else: | |
| 384 fields = self.genotypeFields[sample][tag].split(",") | |
| 385 if len(fields) != numberValues: | |
| 386 text = "Unexpected number of characters in " + tag + " field" | |
| 387 self.generalError(text, "sample", sample) | |
| 388 | |
| 389 for i in range(infoNumber): result.append(fields[i]) | |
| 390 else: | |
| 391 text = "genotype field does not have a definition in the header" | |
| 392 self.generalError(text, "tag", tag) | |
| 393 | |
| 394 return numberValues, result | |
| 395 | |
| 396 # Parse the dbsnp entry. If the entry conforms to the required variant type, | |
| 397 # return the dbsnp rsid value, otherwise ".". | |
| 398 def getDbsnpInfo(self): | |
| 399 | |
| 400 # First check that the variant class (VC) is listed as SNP. | |
| 401 vc = self.info.split("VC=",1) | |
| 402 if vc[1].find(";") != -1: snp = vc[1].split(";",1) | |
| 403 else: | |
| 404 snp = [] | |
| 405 snp.append(vc[1]) | |
| 406 | |
| 407 if snp[0].lower() == "snp": rsid = self.rsid | |
| 408 else: rsid = "." | |
| 409 | |
| 410 return rsid | |
| 411 | |
| 412 # Build a new vcf record. | |
| 413 def buildRecord(self, removeGenotypes): | |
| 414 record = self.referenceSequence + "\t" + \ | |
| 415 str(self.position) + "\t" + \ | |
| 416 self.rsid + "\t" + \ | |
| 417 self.ref + "\t" + \ | |
| 418 self.alt + "\t" + \ | |
| 419 str(self.quality) + "\t" + \ | |
| 420 self.filters + "\t" + \ | |
| 421 self.info | |
| 422 | |
| 423 if self.hasGenotypes and not removeGenotypes: record += self.genotypeString | |
| 424 | |
| 425 record += "\n" | |
| 426 | |
| 427 return record | |
| 428 | |
| 429 # Close the vcf file. | |
| 430 def closeVcf(self, filename): | |
| 431 self.filehandle.close() | |
| 432 | |
| 433 # Define error messages for different handled errors. | |
| 434 def generalError(self, text, field, fieldValue): | |
| 435 print >> sys.stderr, "\nError encountered when attempting to read:" | |
| 436 print >> sys.stderr, "\treference sequence :\t", self.referenceSequence | |
| 437 print >> sys.stderr, "\tposition :\t\t", self.position | |
| 438 if field != "": print >> sys.stderr, "\t", field, ":\t", fieldValue | |
| 439 print >> sys.stderr, "\n", text | |
| 440 exit(1) |
