Mercurial > repos > cropgeeks > flapjack
comparison FlapjackProject.py @ 76:84ce7c332dc4 draft
Uploaded
author | cropgeeks |
---|---|
date | Fri, 23 Feb 2018 10:10:08 -0500 |
parents | d7e91a614582 |
children | 6ace5881c494 |
comparison
equal
deleted
inserted
replaced
75:2547ac473687 | 76:84ce7c332dc4 |
---|---|
5 | 5 |
6 DNASampleSplitter is a description | 6 DNASampleSplitter is a description |
7 | 7 |
8 It defines classes_and_methods | 8 It defines classes_and_methods |
9 | 9 |
10 @author: John Carlos Ignacia, Milcah, Yaw Nti-Addae | 10 @author: John Carlos Ignacio, Milcah Kigoni, Yaw Nti-Addae |
11 | 11 |
12 @copyright: 2017 Cornell University. All rights reserved. | 12 @copyright: 2017 Cornell University. All rights reserved. |
13 | 13 |
14 @license: MIT License | 14 @license: MIT License |
15 | 15 |
19 | 19 |
20 import sys | 20 import sys |
21 import os | 21 import os |
22 import math | 22 import math |
23 import pandas as pd | 23 import pandas as pd |
24 import tempfile | |
24 | 25 |
25 from optparse import OptionParser | 26 from optparse import OptionParser |
26 from __builtin__ import str | 27 from __builtin__ import str |
27 from subprocess import call | 28 from subprocess import call |
28 | 29 |
29 __all__ = [] | 30 __all__ = [] |
30 __version__ = 0.1 | 31 __version__ = 0.2 |
31 __date__ = '2017-06-20' | 32 __date__ = '2017-06-20' |
32 __updated__ = '2017-06-20' | 33 __updated__ = '2017-06-27' |
33 | 34 |
34 DEBUG = 1 | 35 DEBUG = 1 |
35 TESTRUN = 0 | 36 TESTRUN = 0 |
36 PROFILE = 0 | 37 PROFILE = 0 |
37 | 38 |
38 parents = {} | 39 parents = {} |
39 | 40 |
40 def splitfile(my_file, sample_data): | 41 samplefiles = {} |
42 genotypefiles = {} | |
43 | |
44 def splitfile(my_file, sample_data, isSample): | |
45 temp_parents = parents | |
41 header = '' | 46 header = '' |
42 fj_header = '' | 47 fj_header = '' |
43 with open(my_file) as infile: | 48 with open(my_file) as infile: |
44 for line in infile: | 49 for line in infile: |
45 if line[:2] == '# ': | 50 if line[:2] == '# ': |
57 dnarun = lst[0] | 62 dnarun = lst[0] |
58 dnarun_data = sample_data[sample_data.dnarun_name == dnarun] | 63 dnarun_data = sample_data[sample_data.dnarun_name == dnarun] |
59 group = list(dnarun_data.dnasample_sample_group)[0] | 64 group = list(dnarun_data.dnasample_sample_group)[0] |
60 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0] | 65 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0] |
61 | 66 |
62 filename = my_file | |
63 isParent = False | 67 isParent = False |
64 for key in parents: | 68 for key in temp_parents: |
65 value = parents[key] | 69 value = temp_parents[key] |
66 if dnarun in value: | 70 if dnarun in value: |
67 filename = my_file+'_'+key+'.txt' | 71 name = my_file + "_" + key |
68 if not os.path.isfile(filename) : | 72 if isSample: |
73 continue | |
74 if name not in samplefiles: | |
75 filename = tempfile.NamedTemporaryFile(delete=False).name | |
76 print("sample file %s has filename %s" % (name, filename)) | |
77 samplefiles[name] = filename | |
69 f = open(filename, "w") | 78 f = open(filename, "w") |
70 f.write('%s' % header) | 79 f.write('%s' % header) |
71 else: | 80 else: |
81 filename = samplefiles.get(name) | |
72 f=open(filename, "a+") | 82 f=open(filename, "a+") |
73 f.write('%s' % line) | 83 f.write('%s' % line) |
74 isParent = True | 84 isParent = True |
75 | 85 |
76 if isParent: | 86 if isParent: |
77 continue | 87 continue |
78 | 88 |
79 if isinstance(group, float) and math.isnan(group): | 89 if isinstance(group, float) and math.isnan(group): |
80 continue | 90 continue |
81 else: | 91 elif isSample == 1: |
82 # get parent data # | 92 # get parent data # |
83 | 93 |
94 filename = tempfile.NamedTemporaryFile(delete=False).name | |
84 # get file name for genotype data | 95 # get file name for genotype data |
85 if isinstance(cycle, float) and math.isnan(cycle): | 96 if isinstance(cycle, float) and math.isnan(cycle): |
86 filename += '_'+group+'.txt' | 97 # save genotype data to file |
98 if my_file + "_" + group not in genotypefiles: | |
99 genotypefiles[my_file + "_" + group] = filename | |
100 print("genotype file %s has filename %s" % (my_file + "_" + group, filename)) | |
101 f = open(filename, "w") | |
102 f.write('%s' % header) | |
103 else : | |
104 f=open(filename, "a+") | |
105 f.write('%s' % line) | |
87 else: | 106 else: |
88 filename += '_'+group+'_'+cycle+'.txt' | 107 # save genotype data to file |
108 if group not in genotypefiles: | |
109 genotypefiles[my_file + "_" + group+'_'+cycle] = filename | |
110 f = open(filename, "w") | |
111 f.write('%s' % header) | |
112 else : | |
113 f=open(filename, "a+") | |
114 f.write('%s' % line) | |
89 | 115 |
90 # save genotype data to file | 116 |
91 if not os.path.isfile(filename) : | 117 |
92 f = open(filename, "w") | |
93 f.write('%s' % header) | |
94 else : | |
95 f=open(filename, "a+") | |
96 f.write('%s' % line) | |
97 | |
98 def splitData(samplefile, genofile): | 118 def splitData(samplefile, genofile): |
99 # Split sample file # | 119 # Split sample file # |
100 sample_data = pd.read_table(samplefile, dtype='str') | 120 sample_data = pd.read_table(samplefile, dtype='str') |
101 group_list = sample_data.dnasample_sample_group.drop_duplicates() | 121 group_list = sample_data.dnasample_sample_group.drop_duplicates() |
102 for index, item in group_list.iteritems(): | 122 for index, item in group_list.iteritems(): |
133 parents.update({subkey : lst1+lst2}) | 153 parents.update({subkey : lst1+lst2}) |
134 # df_sub = df[df.dnasample_sample_group_cycle == sub] | 154 # df_sub = df[df.dnasample_sample_group_cycle == sub] |
135 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | 155 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") |
136 | 156 |
137 # Split genotype file based on sample information # | 157 # Split genotype file based on sample information # |
138 splitfile(samplefile, sample_data) | 158 splitfile(samplefile, sample_data, 0) |
139 splitfile(genofile, sample_data) | 159 splitfile(samplefile, sample_data, 1) |
160 splitfile(genofile, sample_data, 0) | |
161 splitfile(genofile, sample_data, 1) | |
140 | 162 |
141 def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile): | 163 def createProjectFile(samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile, project): |
142 for key in groups: | 164 sample_data = pd.read_table(samplefile, dtype='str') |
143 sfile = samplefile+'_'+key+'.txt' | 165 groups = sample_data.dnasample_sample_group.drop_duplicates() |
144 gfile = genofile+'_'+key+'.txt.tmp' | 166 for index, key in groups.iteritems(): |
145 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile] | 167 if isinstance(key, float) and math.isnan(key): |
146 call(cmd) | 168 continue |
169 df = sample_data[sample_data.dnasample_sample_group == key] | |
170 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates() | |
171 for idx, sub in subgroup_list.iteritems(): | |
172 if isinstance(sub, float) and math.isnan(sub): | |
173 name = key | |
174 elif isinstance(sub, str) and not sub: | |
175 name = key | |
176 else: | |
177 name = key+'_'+sub | |
178 name = str(name) | |
179 sfile = samplefiles.get(samplefile + "_" + name) | |
180 gfile = genotypefiles.get(genofile + "_" + name) | |
181 gfile += '.tmp' | |
182 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',project,'-n',name,'-S',separator,'-M',missing,'-C'] | |
183 if qtlfile: | |
184 cmd += ['-q',qtlfile] | |
185 if mapfile: | |
186 cmd += ['-m',mapfile] | |
187 print(cmd) | |
188 call(cmd) | |
147 | 189 |
148 def createHeader(groups, samplefile, genofile, headerjar): | 190 def createHeader(samplefile, genofile, headerjar): |
149 for key in groups: | 191 sample_data = pd.read_table(samplefile, dtype='str') |
150 sfile = samplefile+'_'+key+'.txt' | 192 groups = sample_data.dnasample_sample_group.drop_duplicates() |
151 gfile = genofile+'_'+key+'.txt' | 193 for index, key in groups.iteritems(): |
152 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] | 194 if isinstance(key, float) and math.isnan(key): |
153 call(cmd) | 195 continue |
196 df = sample_data[sample_data.dnasample_sample_group == key] | |
197 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates() | |
198 for idx, sub in subgroup_list.iteritems(): | |
199 if isinstance(sub, float) and math.isnan(sub): | |
200 name = key | |
201 elif isinstance(sub, str) and not sub: | |
202 name = key | |
203 else: | |
204 name = key+'_'+sub | |
205 name = str(name) | |
206 print("samplefile %s name %s" % (samplefile, name)) | |
207 sfile = samplefiles.get(samplefile + "_" + name) | |
208 print("sfile %s" + sfile) | |
209 gfile = genotypefiles.get(genofile + "_" + name) | |
210 print("gfile %s" + gfile) | |
211 | |
212 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] | |
213 call(cmd) | |
154 | 214 |
155 def main(argv=None): | 215 def main(argv=None): |
156 '''Command line options.''' | 216 '''Command line options.''' |
157 | 217 |
158 program_name = os.path.basename(sys.argv[0]) | 218 program_name = os.path.basename(sys.argv[0]) |
172 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) | 232 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) |
173 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE") | 233 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE") |
174 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE") | 234 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE") |
175 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE") | 235 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE") |
176 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE") | 236 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE") |
177 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE") | 237 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE", default='jars/flapjack.jar') |
178 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE") | 238 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE", default='jars/pedigreeheader.jar') |
179 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]") | 239 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, \"\" for no separator [default: \"\"]", metavar="STRING", default='') |
180 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]") | 240 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]", metavar="STRING", default='NN') |
181 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]") | 241 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]") |
242 parser.add_option("-p", "--project", dest="project", help="name of output file [default: %default]") | |
182 | 243 |
183 # process options | 244 # process options |
184 (opts, args) = parser.parse_args(argv) | 245 (opts, args) = parser.parse_args(argv) |
185 | 246 |
186 if opts.verbose > 0: | 247 if opts.verbose > 0: |
187 print("verbosity level = %d" % opts.verbose) | 248 print("verbosity level = %d" % opts.verbose) |
188 if opts.genofile: | 249 if opts.genofile: |
189 print("genofile = %s" % opts.genofile) | 250 print("genofile = %s" % opts.genofile) |
190 else: | 251 else: |
191 sys.stderr.write("no genotype file detected!") | 252 sys.stderr.write("No genotype file detected!\n") |
253 sys.exit() | |
192 if opts.samplefile: | 254 if opts.samplefile: |
193 print("samplefile = %s" % opts.samplefile) | 255 print("samplefile = %s" % opts.samplefile) |
194 else: | 256 else: |
195 sys.stderr.write("no sample file detected!") | 257 sys.stderr.write("No sample file detected!\n") |
258 sys.exit() | |
196 if opts.mapfile: | 259 if opts.mapfile: |
197 print("mapfile = %s" % opts.mapfile) | 260 print("mapfile = %s" % opts.mapfile) |
198 else: | 261 else: |
199 sys.stderr.write("no map file detected!") | 262 sys.stderr.write("No map file detected!\n") |
200 if opts.qtlfile: | 263 if opts.qtlfile: |
201 print("qtlfile = %s" % opts.qtlfile) | 264 print("qtlfile = %s" % opts.qtlfile) |
202 else: | 265 else: |
203 sys.stderr.write("no QTL file detected!") | 266 sys.stderr.write("No QTL file detected!\n") |
204 if opts.jarfile: | 267 if opts.jarfile: |
205 print("jarfile = %s" % opts.jarfile) | 268 print("jarfile = %s" % opts.jarfile) |
206 else: | 269 else: |
207 sys.stderr.write("no Flapjack project creator jar file detected!") | 270 sys.stderr.write("No Flapjack project creator jar file detected!\n") |
208 if opts.headerjar: | 271 if opts.headerjar: |
209 print("headerjar = %s" % opts.headerjar) | 272 print("headerjar = %s" % opts.headerjar) |
210 else: | 273 else: |
211 sys.stderr.write("no Flapjack header creator jar file detected!") | 274 sys.stderr.write("No Flapjack header creator jar file detected!\n") |
212 | 275 |
213 # MAIN BODY # | 276 # MAIN BODY # |
214 splitData(samplefile=opts.samplefile, genofile=opts.genofile) | 277 splitData(samplefile=opts.samplefile, genofile=opts.genofile) |
215 createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) | 278 createHeader(samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) |
216 createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile) | 279 createProjectFile(samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile, project=opts.project) |
217 | 280 |
218 | 281 |
219 except Exception, e: | 282 except Exception, e: |
220 indent = len(program_name) * " " | 283 indent = len(program_name) * " " |
221 sys.stderr.write(program_name + ": " + repr(e) + "\n") | 284 sys.stderr.write(program_name + ": " + repr(e) + "\n") |