annotate extract.py @ 0:c6fb674dfda3 draft default tip

Imported from capsule None
author devteam
date Thu, 23 Jan 2014 12:31:12 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/python
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
2
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
3 import os.path
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
4 import sys
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
5 import optparse
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
6
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
7 import vcfClass
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
8 from vcfClass import *
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
9
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
10 import tools
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
11 from tools import *
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
12
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
13 if __name__ == "__main__":
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
14 main()
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
15
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
16 def main():
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
17
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
18 # Parse the command line options
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
19 usage = "Usage: vcfPytools.py extract [options]"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
20 parser = optparse.OptionParser(usage = usage)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
21 parser.add_option("-i", "--in",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
22 action="store", type="string",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
23 dest="vcfFile", help="input vcf file (stdin for piped vcf)")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
24 parser.add_option("-o", "--out",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
25 action="store", type="string",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
26 dest="output", help="output validation file")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
27 parser.add_option("-s", "--reference-sequence",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
28 action="store", type="string",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
29 dest="referenceSequence", help="extract records from this reference sequence")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
30 parser.add_option("-r", "--region",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
31 action="store", type="string",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
32 dest="region", help="extract records from this region")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
33 parser.add_option("-q", "--keep-quality",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
34 action="append", type="string", nargs=2,
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
35 dest="keepQuality", help="keep records containing this quality")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
36 parser.add_option("-k", "--keep-info",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
37 action="append", type="string",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
38 dest="infoKeep", help="keep records containing this info field")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
39 parser.add_option("-d", "--discard-info",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
40 action="append", type="string",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
41 dest="infoDiscard", help="discard records containing this info field")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
42 parser.add_option("-p", "--pass-filter",
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
43 action="store_true", default=False,
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
44 dest="passFilter", help="discard records whose filter field is not PASS")
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
45
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
46 (options, args) = parser.parse_args()
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
47
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
48 # Check that a vcf file is given.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
49 if options.vcfFile == None:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
50 parser.print_help()
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
51 print >> sys.stderr, "\nInput vcf file (--in, -i) is required."
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
52 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
53
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
54 # Check that either a reference sequence or region is specified,
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
55 # but not both if not dealing with info fields.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
56 if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
57 if not options.referenceSequence and not options.region:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
58 parser.print_help()
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
59 print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
60 print >> sys.stderr, "if not extracting records based on info strings."
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
61 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
62 if options.referenceSequence and options.region:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
63 parser.print_help()
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
64 print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both."
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
65 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
66
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
67 # If a region was supplied, check the format.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
68 if options.region:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
69 if options.region.find(":") == -1 or options.region.find("..") == -1:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
70 print >> sys.stderr, "\nIncorrect format for region string. Required: ref:start..end."
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
71 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
72 regionList = options.region.split(":",1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
73 referenceSequence = regionList[0]
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
74 try: start = int(regionList[1].split("..")[0])
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
75 except ValueError:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
76 print >> sys.stderr, "region start coordinate is not an integer"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
77 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
78 try: end = int(regionList[1].split("..")[1])
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
79 except ValueError:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
80 print >> sys.stderr, "region end coordinate is not an integer"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
81 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
82
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
83 # Ensure that discard-info and keep-info haven't both been defined.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
84 if options.infoKeep and options.infoDiscard:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
85 print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously."
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
86 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
87
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
88 # If the --keep-quality argument is used, check that a value and a logical
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
89 # argument are supplied and that the logical argument is valid.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
90
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
91 if options.keepQuality:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
92 for value, logic in options.keepQuality:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
93 if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge":
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
94 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
95 print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
96 print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
97 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
98 try: qualityValue = float(value)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
99 except ValueError:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
100 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
101 print >> sys.stderr, "Quality value must be an integer or float value."
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
102 exit(1)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
103 qualityLogic = logic
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
104
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
105 # Set the output file to stdout if no output file was specified.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
106 outputFile, writeOut = setOutput(options.output)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
107
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
108 v = vcf() # Define vcf object.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
109
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
110 # Set process info to True if info strings need to be parsed.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
111 if options.infoKeep or options.infoDiscard: v.processInfo = True
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
112
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
113 # Open the file.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
114 v.openVcf(options.vcfFile)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
115
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
116 # Read in the header information.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
117 v.parseHeader(options.vcfFile, writeOut)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
118 taskDescriptor = "##vcfPytools=extract data"
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
119 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
120
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
121 # Read through all the entries and write out records in the correct
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
122 # reference sequence.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
123 while v.getRecord():
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
124 writeRecord = True
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
125 if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
126 elif options.region:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
127 if v.referenceSequence != referenceSequence: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
128 elif v.position < start or v.position > end: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
129
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
130 # Only consider these fields if the record is contained within the
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
131 # specified region.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
132 if options.infoKeep and writeRecord:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
133 for tag in options.infoKeep:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
134 if v.infoTags.has_key(tag):
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
135 writeRecord = True
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
136 break
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
137 if not v.infoTags.has_key(tag): writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
138 if options.infoDiscard and writeRecord:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
139 for tag in options.infoDiscard:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
140 if v.infoTags.has_key(tag): writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
141 if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
142 if options.keepQuality:
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
143 if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
144 if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
145 if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
146 if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
147 if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
148
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
149 if writeRecord: outputFile.write(v.record)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
150
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
151 # Close the file.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
152 v.closeVcf(options.vcfFile)
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
153
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
154 # Terminate the program cleanly.
c6fb674dfda3 Imported from capsule None
devteam
parents:
diff changeset
155 return 0