vcf_intersect: tools.py comparison

comparison tools.py @ 0:f5d5eed73180 draft default tip

Imported from capsule None

author	devteam
date	Thu, 23 Jan 2014 12:31:34 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:f5d5eed73180
+#!/usr/bin/python
+import os.path
+import sys
+import vcfPytools
+from vcfPytools import __version__
+# Determine whether to output to a file or stdout.
+def setOutput(output):
+if output == None:
+outputFile = sys.stdout
+writeOut = False
+else:
+output = os.path.abspath(output)
+outputFile = open(output, 'w')
+writeOut = True
+return outputFile, writeOut
+# Determine which file has priority for writing out records.
+def setVcfPriority(priorityFile, vcfFiles):
+if priorityFile == None: priority = 0
+elif priorityFile == vcfFiles[0]: priority = 1
+elif priorityFile == vcfFiles[1]: priority = 2
+elif priorityFile.lower() == "merge": priority = 3
+else:
+print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge."
+exit(1)
+return priority
+# If the union or intersection of two vcf files is being performed
+# and the output vcf file is to contain the information from both
+# files, the headers need to be merged to ensure that all info and
+# format entries have an explanation.
+def mergeHeaders(v1, v2, v3):
+# If either file does not have a header, terminate the program.
+# In order to merge the headers, the different fields must be
+# checked to ensure the files are compatible.
+if not v1.hasHeader or not v2.hasHeader:
+print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."
+exit(1)
+v3.infoHeaderTags = v1.infoHeaderTags.copy()
+v3.formatHeaderTags = v1.formatHeaderTags.copy()
+v3.numberDataSets = v1.numberDataSets
+v3.includedDataSets = v1.includedDataSets.copy()
+v3.headerText = v1.headerText
+v3.headerTitles = v1.headerTitles
+v3.infoHeaderString = v1.infoHeaderString.copy()
+v3.formatHeaderString = v1.formatHeaderString.copy()
+# Merge the info field descriptions.
+for tag in v2.infoHeaderTags:
+if v1.infoHeaderTags.has_key(tag):
+if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \
+v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:
+print v1.infoHeaderTags[tag][0]
+print v1.infoHeaderTags[tag][1]
+print v1.infoHeaderTags[tag][2]
+print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
+exit(1)
+else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]
+# Merge the format field descriptions.
+for tag in v2.formatHeaderTags:
+if v1.formatHeaderTags.has_key(tag):
+if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \
+v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:
+print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
+exit(1)
+else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]
+# Now check to see if the vcf files contain information from multiple
+# records themselves and create an ordered list in which the data
+# will appear in the file.  For instance, of the first file has
+# already got two sets of data and is being intersected with a file
+# with one set of data, the order of data in the new vcf file will be
+# the two sets from the first file followed by the second, e.g.
+# AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the
+# value of AC from the second vcf.  The header will have a ##FILE for
+# each of the three files, so the origin if the data can be recovered.
+if v1.numberDataSets == 0:
+v3.includedDataSets[v3.numberDataSets + 1] = v1.filename
+v3.numberDataSets += 1
+if v2.numberDataSets == 0:
+v3.includedDataSets[v3.numberDataSets + 1] = v2.filename
+v3.numberDataSets += 1
+else:
+for i in range(1, v2.numberDataSets + 1):
+v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]
+v3.numberDataSets += 1
+# If either of the input files contain multiple data sets (e.g. multiple
+# vcf files have undergone intersection or union calculations and all
+# information has been retained) and the priority isn't set to 'merge',
+# terminate the program.  This is to ensure that the origin of the data
+# doesn't get confused.
+def checkDataSets(v1, v2):
+if v1.numberDataSets + v2.numberDataSets != 0:
+print >> sys.stderr, "\nERROR:"
+print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."
+print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"
+print >> sys.stderr, "Other tools may be incompatible with this format."
+exit(1)
+# Write the header to file.
+def writeHeader (outputFile, v, removeGenotypes, taskDescriptor):
+if not v.hasHeader:
+v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
+v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
+outputFile.write(v.headerText) if v.headerText != "" else None
+print >> outputFile, taskDescriptor
+for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]
+for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]
+# Write out a list of files indicating which data set belongs to which file.
+if v.numberDataSets != 0:
+for i in range(1, v.numberDataSets + 1):
+print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">"
+if removeGenotypes:
+line = v.headerTitles.rstrip("\n").split("\t")
+newHeaderTitles = line[0]
+for i in range(1,8):
+newHeaderTitles = newHeaderTitles + "\t" + line[i]
+newHeaderTitles = newHeaderTitles + "\n"
+outputFile.write( newHeaderTitles )
+else:
+outputFile.write( v.headerTitles )
+# Check that the two reference sequence lists are identical.
+# If there are a different number or order, the results may
+# not be as expected.
+def checkReferenceSequenceLists(list1, list2):
+errorMessage = False
+if len(list1) != len(list2):
+print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."
+errorMessage = True
+elif list1 != list2:
+print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."
+errorMessage = True
+if errorMessage:
+print >> sys.stderr, "Results may not be as expected."
+print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."
+print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2
+# Write out a vcf record to file.  The record written depends on the
+# value of 'priority' and could therefore be the record from either
+# of the vcf files, or a combination of them.
+def writeVcfRecord(priority, v1, v2, outputFile):
+if priority == 0:
+if v1.quality >= v2.quality: outputFile.write(v1.record)
+else: outputFile.write(v2.record)
+elif priority == 1: outputFile.write(v1.record)
+elif priority == 2: outputFile.write(v2.record)
+elif priority == 3:
+# Define the missing entry values (depends on the number of data sets
+# in the file).
+info = ""
+missingEntry1 = missingEntry2 = "."
+for i in range(1, v1.numberDataSets): missingEntry1 += "/."
+for i in range(1, v2.numberDataSets): missingEntry2 += "/."
+secondList = v2.infoTags.copy()
+# Build up the info field.
+for tag in v1.infoTags:
+if secondList.has_key(tag):
+if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"
+del secondList[tag]
+else:
+if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"
+# Now include the info tags that are not populated in the first vcf file.
+for tag in secondList:
+if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"
+# Build the complete record.
+info = info.rstrip(";")
+record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \
+v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info
+print >> outputFile, record
+else:
+print >> sys.sterr, "Unknown file priority."
+exit(1)

Mercurial > repos > devteam > vcf_intersect

comparison tools.py @ 0:f5d5eed73180 draft default tip