Mercurial > repos > devteam > vcf_intersect
comparison tools.py @ 0:f5d5eed73180 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Thu, 23 Jan 2014 12:31:34 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f5d5eed73180 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import os.path | |
| 4 import sys | |
| 5 import vcfPytools | |
| 6 from vcfPytools import __version__ | |
| 7 | |
| 8 # Determine whether to output to a file or stdout. | |
| 9 def setOutput(output): | |
| 10 if output == None: | |
| 11 outputFile = sys.stdout | |
| 12 writeOut = False | |
| 13 else: | |
| 14 output = os.path.abspath(output) | |
| 15 outputFile = open(output, 'w') | |
| 16 writeOut = True | |
| 17 | |
| 18 return outputFile, writeOut | |
| 19 | |
| 20 # Determine which file has priority for writing out records. | |
| 21 def setVcfPriority(priorityFile, vcfFiles): | |
| 22 if priorityFile == None: priority = 0 | |
| 23 elif priorityFile == vcfFiles[0]: priority = 1 | |
| 24 elif priorityFile == vcfFiles[1]: priority = 2 | |
| 25 elif priorityFile.lower() == "merge": priority = 3 | |
| 26 else: | |
| 27 print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge." | |
| 28 exit(1) | |
| 29 | |
| 30 return priority | |
| 31 | |
| 32 # If the union or intersection of two vcf files is being performed | |
| 33 # and the output vcf file is to contain the information from both | |
| 34 # files, the headers need to be merged to ensure that all info and | |
| 35 # format entries have an explanation. | |
| 36 def mergeHeaders(v1, v2, v3): | |
| 37 | |
| 38 # If either file does not have a header, terminate the program. | |
| 39 # In order to merge the headers, the different fields must be | |
| 40 # checked to ensure the files are compatible. | |
| 41 if not v1.hasHeader or not v2.hasHeader: | |
| 42 print >> sys.stderr, "Both vcf files must have a header in order to merge data sets." | |
| 43 exit(1) | |
| 44 | |
| 45 v3.infoHeaderTags = v1.infoHeaderTags.copy() | |
| 46 v3.formatHeaderTags = v1.formatHeaderTags.copy() | |
| 47 v3.numberDataSets = v1.numberDataSets | |
| 48 v3.includedDataSets = v1.includedDataSets.copy() | |
| 49 v3.headerText = v1.headerText | |
| 50 v3.headerTitles = v1.headerTitles | |
| 51 v3.infoHeaderString = v1.infoHeaderString.copy() | |
| 52 v3.formatHeaderString = v1.formatHeaderString.copy() | |
| 53 | |
| 54 # Merge the info field descriptions. | |
| 55 for tag in v2.infoHeaderTags: | |
| 56 if v1.infoHeaderTags.has_key(tag): | |
| 57 if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \ | |
| 58 v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]: | |
| 59 print v1.infoHeaderTags[tag][0] | |
| 60 print v1.infoHeaderTags[tag][1] | |
| 61 print v1.infoHeaderTags[tag][2] | |
| 62 print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field." | |
| 63 exit(1) | |
| 64 else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag] | |
| 65 | |
| 66 # Merge the format field descriptions. | |
| 67 for tag in v2.formatHeaderTags: | |
| 68 if v1.formatHeaderTags.has_key(tag): | |
| 69 if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \ | |
| 70 v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]: | |
| 71 print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field." | |
| 72 exit(1) | |
| 73 else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag] | |
| 74 | |
| 75 # Now check to see if the vcf files contain information from multiple | |
| 76 # records themselves and create an ordered list in which the data | |
| 77 # will appear in the file. For instance, of the first file has | |
| 78 # already got two sets of data and is being intersected with a file | |
| 79 # with one set of data, the order of data in the new vcf file will be | |
| 80 # the two sets from the first file followed by the second, e.g. | |
| 81 # AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the | |
| 82 # value of AC from the second vcf. The header will have a ##FILE for | |
| 83 # each of the three files, so the origin if the data can be recovered. | |
| 84 if v1.numberDataSets == 0: | |
| 85 v3.includedDataSets[v3.numberDataSets + 1] = v1.filename | |
| 86 v3.numberDataSets += 1 | |
| 87 if v2.numberDataSets == 0: | |
| 88 v3.includedDataSets[v3.numberDataSets + 1] = v2.filename | |
| 89 v3.numberDataSets += 1 | |
| 90 else: | |
| 91 for i in range(1, v2.numberDataSets + 1): | |
| 92 v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i] | |
| 93 v3.numberDataSets += 1 | |
| 94 | |
| 95 # If either of the input files contain multiple data sets (e.g. multiple | |
| 96 # vcf files have undergone intersection or union calculations and all | |
| 97 # information has been retained) and the priority isn't set to 'merge', | |
| 98 # terminate the program. This is to ensure that the origin of the data | |
| 99 # doesn't get confused. | |
| 100 def checkDataSets(v1, v2): | |
| 101 if v1.numberDataSets + v2.numberDataSets != 0: | |
| 102 print >> sys.stderr, "\nERROR:" | |
| 103 print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files." | |
| 104 print >> sys.stderr, "Further intersection or union operations must include --priority-file merge" | |
| 105 print >> sys.stderr, "Other tools may be incompatible with this format." | |
| 106 exit(1) | |
| 107 | |
| 108 # Write the header to file. | |
| 109 def writeHeader (outputFile, v, removeGenotypes, taskDescriptor): | |
| 110 if not v.hasHeader: | |
| 111 v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n" | |
| 112 v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n" | |
| 113 outputFile.write(v.headerText) if v.headerText != "" else None | |
| 114 print >> outputFile, taskDescriptor | |
| 115 for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag] | |
| 116 for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag] | |
| 117 | |
| 118 # Write out a list of files indicating which data set belongs to which file. | |
| 119 if v.numberDataSets != 0: | |
| 120 for i in range(1, v.numberDataSets + 1): | |
| 121 print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">" | |
| 122 | |
| 123 if removeGenotypes: | |
| 124 line = v.headerTitles.rstrip("\n").split("\t") | |
| 125 newHeaderTitles = line[0] | |
| 126 for i in range(1,8): | |
| 127 newHeaderTitles = newHeaderTitles + "\t" + line[i] | |
| 128 newHeaderTitles = newHeaderTitles + "\n" | |
| 129 outputFile.write( newHeaderTitles ) | |
| 130 else: | |
| 131 outputFile.write( v.headerTitles ) | |
| 132 | |
| 133 # Check that the two reference sequence lists are identical. | |
| 134 # If there are a different number or order, the results may | |
| 135 # not be as expected. | |
| 136 def checkReferenceSequenceLists(list1, list2): | |
| 137 errorMessage = False | |
| 138 if len(list1) != len(list2): | |
| 139 print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences." | |
| 140 errorMessage = True | |
| 141 elif list1 != list2: | |
| 142 print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences." | |
| 143 errorMessage = True | |
| 144 if errorMessage: | |
| 145 print >> sys.stderr, "Results may not be as expected." | |
| 146 print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order." | |
| 147 print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2 | |
| 148 | |
| 149 # Write out a vcf record to file. The record written depends on the | |
| 150 # value of 'priority' and could therefore be the record from either | |
| 151 # of the vcf files, or a combination of them. | |
| 152 | |
| 153 def writeVcfRecord(priority, v1, v2, outputFile): | |
| 154 if priority == 0: | |
| 155 if v1.quality >= v2.quality: outputFile.write(v1.record) | |
| 156 else: outputFile.write(v2.record) | |
| 157 elif priority == 1: outputFile.write(v1.record) | |
| 158 elif priority == 2: outputFile.write(v2.record) | |
| 159 elif priority == 3: | |
| 160 | |
| 161 # Define the missing entry values (depends on the number of data sets | |
| 162 # in the file). | |
| 163 info = "" | |
| 164 missingEntry1 = missingEntry2 = "." | |
| 165 for i in range(1, v1.numberDataSets): missingEntry1 += "/." | |
| 166 for i in range(1, v2.numberDataSets): missingEntry2 += "/." | |
| 167 secondList = v2.infoTags.copy() | |
| 168 | |
| 169 # Build up the info field. | |
| 170 for tag in v1.infoTags: | |
| 171 if secondList.has_key(tag): | |
| 172 if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";" | |
| 173 del secondList[tag] | |
| 174 else: | |
| 175 if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";" | |
| 176 | |
| 177 # Now include the info tags that are not populated in the first vcf file. | |
| 178 for tag in secondList: | |
| 179 if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";" | |
| 180 | |
| 181 # Build the complete record. | |
| 182 info = info.rstrip(";") | |
| 183 record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \ | |
| 184 v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info | |
| 185 print >> outputFile, record | |
| 186 else: | |
| 187 print >> sys.sterr, "Unknown file priority." | |
| 188 exit(1) |
