comparison FlapjackProject.py @ 76:84ce7c332dc4 draft

Uploaded
author cropgeeks
date Fri, 23 Feb 2018 10:10:08 -0500
parents d7e91a614582
children 6ace5881c494
comparison
equal deleted inserted replaced
75:2547ac473687 76:84ce7c332dc4
5 5
6 DNASampleSplitter is a description 6 DNASampleSplitter is a description
7 7
8 It defines classes_and_methods 8 It defines classes_and_methods
9 9
10 @author: John Carlos Ignacia, Milcah, Yaw Nti-Addae 10 @author: John Carlos Ignacio, Milcah Kigoni, Yaw Nti-Addae
11 11
12 @copyright: 2017 Cornell University. All rights reserved. 12 @copyright: 2017 Cornell University. All rights reserved.
13 13
14 @license: MIT License 14 @license: MIT License
15 15
19 19
20 import sys 20 import sys
21 import os 21 import os
22 import math 22 import math
23 import pandas as pd 23 import pandas as pd
24 import tempfile
24 25
25 from optparse import OptionParser 26 from optparse import OptionParser
26 from __builtin__ import str 27 from __builtin__ import str
27 from subprocess import call 28 from subprocess import call
28 29
29 __all__ = [] 30 __all__ = []
30 __version__ = 0.1 31 __version__ = 0.2
31 __date__ = '2017-06-20' 32 __date__ = '2017-06-20'
32 __updated__ = '2017-06-20' 33 __updated__ = '2017-06-27'
33 34
34 DEBUG = 1 35 DEBUG = 1
35 TESTRUN = 0 36 TESTRUN = 0
36 PROFILE = 0 37 PROFILE = 0
37 38
38 parents = {} 39 parents = {}
39 40
40 def splitfile(my_file, sample_data): 41 samplefiles = {}
42 genotypefiles = {}
43
44 def splitfile(my_file, sample_data, isSample):
45 temp_parents = parents
41 header = '' 46 header = ''
42 fj_header = '' 47 fj_header = ''
43 with open(my_file) as infile: 48 with open(my_file) as infile:
44 for line in infile: 49 for line in infile:
45 if line[:2] == '# ': 50 if line[:2] == '# ':
57 dnarun = lst[0] 62 dnarun = lst[0]
58 dnarun_data = sample_data[sample_data.dnarun_name == dnarun] 63 dnarun_data = sample_data[sample_data.dnarun_name == dnarun]
59 group = list(dnarun_data.dnasample_sample_group)[0] 64 group = list(dnarun_data.dnasample_sample_group)[0]
60 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0] 65 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0]
61 66
62 filename = my_file
63 isParent = False 67 isParent = False
64 for key in parents: 68 for key in temp_parents:
65 value = parents[key] 69 value = temp_parents[key]
66 if dnarun in value: 70 if dnarun in value:
67 filename = my_file+'_'+key+'.txt' 71 name = my_file + "_" + key
68 if not os.path.isfile(filename) : 72 if isSample:
73 continue
74 if name not in samplefiles:
75 filename = tempfile.NamedTemporaryFile(delete=False).name
76 print("sample file %s has filename %s" % (name, filename))
77 samplefiles[name] = filename
69 f = open(filename, "w") 78 f = open(filename, "w")
70 f.write('%s' % header) 79 f.write('%s' % header)
71 else: 80 else:
81 filename = samplefiles.get(name)
72 f=open(filename, "a+") 82 f=open(filename, "a+")
73 f.write('%s' % line) 83 f.write('%s' % line)
74 isParent = True 84 isParent = True
75 85
76 if isParent: 86 if isParent:
77 continue 87 continue
78 88
79 if isinstance(group, float) and math.isnan(group): 89 if isinstance(group, float) and math.isnan(group):
80 continue 90 continue
81 else: 91 elif isSample == 1:
82 # get parent data # 92 # get parent data #
83 93
94 filename = tempfile.NamedTemporaryFile(delete=False).name
84 # get file name for genotype data 95 # get file name for genotype data
85 if isinstance(cycle, float) and math.isnan(cycle): 96 if isinstance(cycle, float) and math.isnan(cycle):
86 filename += '_'+group+'.txt' 97 # save genotype data to file
98 if my_file + "_" + group not in genotypefiles:
99 genotypefiles[my_file + "_" + group] = filename
100 print("genotype file %s has filename %s" % (my_file + "_" + group, filename))
101 f = open(filename, "w")
102 f.write('%s' % header)
103 else :
104 f=open(filename, "a+")
105 f.write('%s' % line)
87 else: 106 else:
88 filename += '_'+group+'_'+cycle+'.txt' 107 # save genotype data to file
108 if group not in genotypefiles:
109 genotypefiles[my_file + "_" + group+'_'+cycle] = filename
110 f = open(filename, "w")
111 f.write('%s' % header)
112 else :
113 f=open(filename, "a+")
114 f.write('%s' % line)
89 115
90 # save genotype data to file 116
91 if not os.path.isfile(filename) : 117
92 f = open(filename, "w")
93 f.write('%s' % header)
94 else :
95 f=open(filename, "a+")
96 f.write('%s' % line)
97
98 def splitData(samplefile, genofile): 118 def splitData(samplefile, genofile):
99 # Split sample file # 119 # Split sample file #
100 sample_data = pd.read_table(samplefile, dtype='str') 120 sample_data = pd.read_table(samplefile, dtype='str')
101 group_list = sample_data.dnasample_sample_group.drop_duplicates() 121 group_list = sample_data.dnasample_sample_group.drop_duplicates()
102 for index, item in group_list.iteritems(): 122 for index, item in group_list.iteritems():
133 parents.update({subkey : lst1+lst2}) 153 parents.update({subkey : lst1+lst2})
134 # df_sub = df[df.dnasample_sample_group_cycle == sub] 154 # df_sub = df[df.dnasample_sample_group_cycle == sub]
135 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") 155 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
136 156
137 # Split genotype file based on sample information # 157 # Split genotype file based on sample information #
138 splitfile(samplefile, sample_data) 158 splitfile(samplefile, sample_data, 0)
139 splitfile(genofile, sample_data) 159 splitfile(samplefile, sample_data, 1)
160 splitfile(genofile, sample_data, 0)
161 splitfile(genofile, sample_data, 1)
140 162
141 def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile): 163 def createProjectFile(samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile, project):
142 for key in groups: 164 sample_data = pd.read_table(samplefile, dtype='str')
143 sfile = samplefile+'_'+key+'.txt' 165 groups = sample_data.dnasample_sample_group.drop_duplicates()
144 gfile = genofile+'_'+key+'.txt.tmp' 166 for index, key in groups.iteritems():
145 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile] 167 if isinstance(key, float) and math.isnan(key):
146 call(cmd) 168 continue
169 df = sample_data[sample_data.dnasample_sample_group == key]
170 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
171 for idx, sub in subgroup_list.iteritems():
172 if isinstance(sub, float) and math.isnan(sub):
173 name = key
174 elif isinstance(sub, str) and not sub:
175 name = key
176 else:
177 name = key+'_'+sub
178 name = str(name)
179 sfile = samplefiles.get(samplefile + "_" + name)
180 gfile = genotypefiles.get(genofile + "_" + name)
181 gfile += '.tmp'
182 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',project,'-n',name,'-S',separator,'-M',missing,'-C']
183 if qtlfile:
184 cmd += ['-q',qtlfile]
185 if mapfile:
186 cmd += ['-m',mapfile]
187 print(cmd)
188 call(cmd)
147 189
148 def createHeader(groups, samplefile, genofile, headerjar): 190 def createHeader(samplefile, genofile, headerjar):
149 for key in groups: 191 sample_data = pd.read_table(samplefile, dtype='str')
150 sfile = samplefile+'_'+key+'.txt' 192 groups = sample_data.dnasample_sample_group.drop_duplicates()
151 gfile = genofile+'_'+key+'.txt' 193 for index, key in groups.iteritems():
152 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] 194 if isinstance(key, float) and math.isnan(key):
153 call(cmd) 195 continue
196 df = sample_data[sample_data.dnasample_sample_group == key]
197 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
198 for idx, sub in subgroup_list.iteritems():
199 if isinstance(sub, float) and math.isnan(sub):
200 name = key
201 elif isinstance(sub, str) and not sub:
202 name = key
203 else:
204 name = key+'_'+sub
205 name = str(name)
206 print("samplefile %s name %s" % (samplefile, name))
207 sfile = samplefiles.get(samplefile + "_" + name)
208 print("sfile %s" + sfile)
209 gfile = genotypefiles.get(genofile + "_" + name)
210 print("gfile %s" + gfile)
211
212 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp']
213 call(cmd)
154 214
155 def main(argv=None): 215 def main(argv=None):
156 '''Command line options.''' 216 '''Command line options.'''
157 217
158 program_name = os.path.basename(sys.argv[0]) 218 program_name = os.path.basename(sys.argv[0])
172 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) 232 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
173 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE") 233 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE")
174 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE") 234 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE")
175 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE") 235 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE")
176 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE") 236 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE")
177 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE") 237 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE", default='jars/flapjack.jar')
178 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE") 238 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE", default='jars/pedigreeheader.jar')
179 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]") 239 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, \"\" for no separator [default: \"\"]", metavar="STRING", default='')
180 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]") 240 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]", metavar="STRING", default='NN')
181 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]") 241 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")
242 parser.add_option("-p", "--project", dest="project", help="name of output file [default: %default]")
182 243
183 # process options 244 # process options
184 (opts, args) = parser.parse_args(argv) 245 (opts, args) = parser.parse_args(argv)
185 246
186 if opts.verbose > 0: 247 if opts.verbose > 0:
187 print("verbosity level = %d" % opts.verbose) 248 print("verbosity level = %d" % opts.verbose)
188 if opts.genofile: 249 if opts.genofile:
189 print("genofile = %s" % opts.genofile) 250 print("genofile = %s" % opts.genofile)
190 else: 251 else:
191 sys.stderr.write("no genotype file detected!") 252 sys.stderr.write("No genotype file detected!\n")
253 sys.exit()
192 if opts.samplefile: 254 if opts.samplefile:
193 print("samplefile = %s" % opts.samplefile) 255 print("samplefile = %s" % opts.samplefile)
194 else: 256 else:
195 sys.stderr.write("no sample file detected!") 257 sys.stderr.write("No sample file detected!\n")
258 sys.exit()
196 if opts.mapfile: 259 if opts.mapfile:
197 print("mapfile = %s" % opts.mapfile) 260 print("mapfile = %s" % opts.mapfile)
198 else: 261 else:
199 sys.stderr.write("no map file detected!") 262 sys.stderr.write("No map file detected!\n")
200 if opts.qtlfile: 263 if opts.qtlfile:
201 print("qtlfile = %s" % opts.qtlfile) 264 print("qtlfile = %s" % opts.qtlfile)
202 else: 265 else:
203 sys.stderr.write("no QTL file detected!") 266 sys.stderr.write("No QTL file detected!\n")
204 if opts.jarfile: 267 if opts.jarfile:
205 print("jarfile = %s" % opts.jarfile) 268 print("jarfile = %s" % opts.jarfile)
206 else: 269 else:
207 sys.stderr.write("no Flapjack project creator jar file detected!") 270 sys.stderr.write("No Flapjack project creator jar file detected!\n")
208 if opts.headerjar: 271 if opts.headerjar:
209 print("headerjar = %s" % opts.headerjar) 272 print("headerjar = %s" % opts.headerjar)
210 else: 273 else:
211 sys.stderr.write("no Flapjack header creator jar file detected!") 274 sys.stderr.write("No Flapjack header creator jar file detected!\n")
212 275
213 # MAIN BODY # 276 # MAIN BODY #
214 splitData(samplefile=opts.samplefile, genofile=opts.genofile) 277 splitData(samplefile=opts.samplefile, genofile=opts.genofile)
215 createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) 278 createHeader(samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
216 createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile) 279 createProjectFile(samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile, project=opts.project)
217 280
218 281
219 except Exception, e: 282 except Exception, e:
220 indent = len(program_name) * " " 283 indent = len(program_name) * " "
221 sys.stderr.write(program_name + ": " + repr(e) + "\n") 284 sys.stderr.write(program_name + ": " + repr(e) + "\n")