65
|
1 #!/usr/bin/env python
|
|
2 # encoding: utf-8
|
|
3 '''
|
|
4 DNASampleSplitter -- shortdesc
|
|
5
|
|
6 DNASampleSplitter is a description
|
|
7
|
|
8 It defines classes_and_methods
|
|
9
|
76
|
10 @author: John Carlos Ignacio, Milcah Kigoni, Yaw Nti-Addae
|
65
|
11
|
|
12 @copyright: 2017 Cornell University. All rights reserved.
|
|
13
|
|
14 @license: MIT License
|
|
15
|
|
16 @contact: yn259@cornell.edu
|
|
17 @deffield updated: Updated
|
|
18 '''
|
|
19
|
|
20 import sys
|
|
21 import os
|
|
22 import math
|
|
23 import pandas as pd
|
76
|
24 import tempfile
|
65
|
25
|
|
26 from optparse import OptionParser
|
|
27 from __builtin__ import str
|
|
28 from subprocess import call
|
|
29
|
|
30 __all__ = []
|
76
|
31 __version__ = 0.2
|
65
|
32 __date__ = '2017-06-20'
|
76
|
33 __updated__ = '2017-06-27'
|
65
|
34
|
|
35 DEBUG = 1
|
|
36 TESTRUN = 0
|
|
37 PROFILE = 0
|
|
38
|
|
39 parents = {}
|
|
40
|
77
|
41 filenames = {}
|
76
|
42
|
79
|
43 favAlleleHeaders = []
|
|
44
|
76
|
45 def splitfile(my_file, sample_data, isSample):
|
|
46 temp_parents = parents
|
65
|
47 header = ''
|
|
48 fj_header = ''
|
|
49 with open(my_file) as infile:
|
|
50 for line in infile:
|
79
|
51 if line.lower().startswith("# fjfav"):
|
|
52 favAlleleHeaders.append(line)
|
|
53 continue
|
|
54 elif line[:2] == '# ':
|
65
|
55 fj_header += line
|
|
56 elif header == '':
|
|
57 if fj_header == '':
|
|
58 fj_header = '# fjFile = PHENOTYPE\n'
|
|
59 header_list = line.split('\t')
|
|
60 if header_list[0] != '':
|
|
61 header_list[0] = ''
|
|
62 line = "\t".join(header_list)
|
|
63 header = fj_header+line
|
|
64 else:
|
|
65 lst = line.split('\t')
|
|
66 dnarun = lst[0]
|
|
67 dnarun_data = sample_data[sample_data.dnarun_name == dnarun]
|
|
68 group = list(dnarun_data.dnasample_sample_group)[0]
|
|
69 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0]
|
|
70
|
|
71 isParent = False
|
76
|
72 for key in temp_parents:
|
|
73 value = temp_parents[key]
|
65
|
74 if dnarun in value:
|
76
|
75 name = my_file + "_" + key
|
|
76 if isSample:
|
|
77 continue
|
77
|
78 if name not in filenames:
|
76
|
79 filename = tempfile.NamedTemporaryFile(delete=False).name
|
77
|
80 filenames[name] = filename
|
65
|
81 f = open(filename, "w")
|
|
82 f.write('%s' % header)
|
|
83 else:
|
77
|
84 filename = filenames.get(name)
|
65
|
85 f=open(filename, "a+")
|
76
|
86 f.write('%s' % line)
|
65
|
87 isParent = True
|
|
88
|
|
89 if isParent:
|
|
90 continue
|
|
91
|
|
92 if isinstance(group, float) and math.isnan(group):
|
|
93 continue
|
76
|
94 elif isSample == 1:
|
65
|
95 # get parent data #
|
|
96
|
76
|
97 filename = tempfile.NamedTemporaryFile(delete=False).name
|
65
|
98 # get file name for genotype data
|
|
99 if isinstance(cycle, float) and math.isnan(cycle):
|
76
|
100 # save genotype data to file
|
77
|
101 if my_file + "_" + group not in filenames:
|
|
102 filenames[my_file + "_" + group] = filename
|
76
|
103 f = open(filename, "w")
|
|
104 f.write('%s' % header)
|
|
105 else :
|
77
|
106 filename = filenames.get(my_file + "_" + group)
|
76
|
107 f=open(filename, "a+")
|
|
108 f.write('%s' % line)
|
65
|
109 else:
|
76
|
110 # save genotype data to file
|
77
|
111 if my_file + "_" + group+'_'+cycle not in filenames:
|
|
112 filenames[my_file + "_" + group+'_'+cycle] = filename
|
76
|
113 f = open(filename, "w")
|
|
114 f.write('%s' % header)
|
|
115 else :
|
77
|
116 filename = filenames.get(my_file + "_" + group+'_'+cycle)
|
76
|
117 f=open(filename, "a+")
|
|
118 f.write('%s' % line)
|
79
|
119 for key in filenames:
|
|
120 fname = filenames[key]
|
|
121 f = open(fname, "a+")
|
|
122 for fav in favAlleleHeaders:
|
|
123 f.write(fav)
|
65
|
124
|
76
|
125
|
|
126
|
65
|
127 def splitData(samplefile, genofile):
|
|
128 # Split sample file #
|
|
129 sample_data = pd.read_table(samplefile, dtype='str')
|
|
130 group_list = sample_data.dnasample_sample_group.drop_duplicates()
|
|
131 for index, item in group_list.iteritems():
|
|
132 if isinstance(item, float):
|
|
133 if math.isnan(item):
|
|
134 continue
|
|
135 elif isinstance(item, str):
|
|
136 if not item:
|
|
137 continue
|
|
138 df = sample_data[sample_data.dnasample_sample_group == item]
|
|
139
|
|
140 # store dnaruns of parents in a dictionary
|
|
141 par1 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par1)))
|
|
142 par2 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par2)))
|
|
143 lst1 = list(sample_data.loc[sample_data.germplasm_name.isin(par1), 'dnarun_name'])
|
|
144 lst2 = list(sample_data.loc[sample_data.germplasm_name.isin(par2), 'dnarun_name'])
|
|
145 mergedlst = lst1 + lst2
|
|
146
|
|
147 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
|
|
148 for idx, sub in subgroup_list.iteritems():
|
|
149 if isinstance(sub, float):
|
|
150 if math.isnan(sub):
|
|
151 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
|
|
152 if not item in parents and mergedlst:
|
|
153 parents.update({item : mergedlst})
|
|
154 continue
|
|
155 elif isinstance(sub, str):
|
|
156 if not sub:
|
|
157 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
|
|
158 continue
|
|
159
|
|
160 subkey = item+'_'+sub
|
|
161 if not subkey in parents and mergedlst:
|
|
162 parents.update({subkey : lst1+lst2})
|
|
163 # df_sub = df[df.dnasample_sample_group_cycle == sub]
|
|
164 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
|
|
165
|
|
166 # Split genotype file based on sample information #
|
76
|
167 splitfile(samplefile, sample_data, 0)
|
|
168 splitfile(samplefile, sample_data, 1)
|
|
169 splitfile(genofile, sample_data, 0)
|
|
170 splitfile(genofile, sample_data, 1)
|
65
|
171
|
76
|
172 def createProjectFile(samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile, project):
|
|
173 sample_data = pd.read_table(samplefile, dtype='str')
|
|
174 groups = sample_data.dnasample_sample_group.drop_duplicates()
|
|
175 for index, key in groups.iteritems():
|
|
176 if isinstance(key, float) and math.isnan(key):
|
|
177 continue
|
|
178 df = sample_data[sample_data.dnasample_sample_group == key]
|
|
179 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
|
|
180 for idx, sub in subgroup_list.iteritems():
|
|
181 if isinstance(sub, float) and math.isnan(sub):
|
|
182 name = key
|
|
183 elif isinstance(sub, str) and not sub:
|
|
184 name = key
|
|
185 else:
|
|
186 name = key+'_'+sub
|
|
187 name = str(name)
|
77
|
188 sfile = filenames.get(samplefile + "_" + name)
|
|
189 gfile = filenames.get(genofile + "_" + name)
|
76
|
190 gfile += '.tmp'
|
|
191 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',project,'-n',name,'-S',separator,'-M',missing,'-C']
|
|
192 if qtlfile:
|
|
193 cmd += ['-q',qtlfile]
|
|
194 if mapfile:
|
|
195 cmd += ['-m',mapfile]
|
|
196 print(cmd)
|
|
197 call(cmd)
|
65
|
198
|
76
|
199 def createHeader(samplefile, genofile, headerjar):
|
|
200 sample_data = pd.read_table(samplefile, dtype='str')
|
|
201 groups = sample_data.dnasample_sample_group.drop_duplicates()
|
|
202 for index, key in groups.iteritems():
|
|
203 if isinstance(key, float) and math.isnan(key):
|
|
204 continue
|
|
205 df = sample_data[sample_data.dnasample_sample_group == key]
|
|
206 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
|
|
207 for idx, sub in subgroup_list.iteritems():
|
|
208 if isinstance(sub, float) and math.isnan(sub):
|
|
209 name = key
|
|
210 elif isinstance(sub, str) and not sub:
|
|
211 name = key
|
|
212 else:
|
|
213 name = key+'_'+sub
|
|
214 name = str(name)
|
77
|
215 sfile = filenames.get(samplefile + "_" + name)
|
|
216 gfile = filenames.get(genofile + "_" + name)
|
76
|
217
|
|
218 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp']
|
|
219 call(cmd)
|
65
|
220
|
|
221 def main(argv=None):
|
|
222 '''Command line options.'''
|
|
223
|
|
224 program_name = os.path.basename(sys.argv[0])
|
|
225 program_version = "v0.1"
|
|
226 program_build_date = "%s" % __updated__
|
|
227
|
|
228 program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
|
|
229 #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse
|
|
230 program_longdesc = '''''' # optional - give further explanation about what the program does
|
|
231 program_license = "Copyright 2017 user_name (organization_name) \
|
|
232 Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0"
|
|
233
|
|
234 if argv is None:
|
|
235 argv = sys.argv[1:]
|
|
236 try:
|
|
237 # setup option parser
|
|
238 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
|
|
239 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE")
|
|
240 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE")
|
|
241 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE")
|
|
242 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE")
|
76
|
243 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE", default='jars/flapjack.jar')
|
|
244 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE", default='jars/pedigreeheader.jar')
|
|
245 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, \"\" for no separator [default: \"\"]", metavar="STRING", default='')
|
|
246 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]", metavar="STRING", default='NN')
|
65
|
247 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")
|
76
|
248 parser.add_option("-p", "--project", dest="project", help="name of output file [default: %default]")
|
65
|
249
|
|
250 # process options
|
|
251 (opts, args) = parser.parse_args(argv)
|
|
252
|
|
253 if opts.verbose > 0:
|
|
254 print("verbosity level = %d" % opts.verbose)
|
|
255 if opts.genofile:
|
|
256 print("genofile = %s" % opts.genofile)
|
|
257 else:
|
76
|
258 sys.stderr.write("No genotype file detected!\n")
|
|
259 sys.exit()
|
65
|
260 if opts.samplefile:
|
|
261 print("samplefile = %s" % opts.samplefile)
|
|
262 else:
|
76
|
263 sys.stderr.write("No sample file detected!\n")
|
|
264 sys.exit()
|
65
|
265 if opts.mapfile:
|
|
266 print("mapfile = %s" % opts.mapfile)
|
|
267 else:
|
76
|
268 sys.stderr.write("No map file detected!\n")
|
65
|
269 if opts.qtlfile:
|
|
270 print("qtlfile = %s" % opts.qtlfile)
|
|
271 else:
|
76
|
272 sys.stderr.write("No QTL file detected!\n")
|
65
|
273 if opts.jarfile:
|
|
274 print("jarfile = %s" % opts.jarfile)
|
|
275 else:
|
76
|
276 sys.stderr.write("No Flapjack project creator jar file detected!\n")
|
65
|
277 if opts.headerjar:
|
|
278 print("headerjar = %s" % opts.headerjar)
|
|
279 else:
|
76
|
280 sys.stderr.write("No Flapjack header creator jar file detected!\n")
|
65
|
281
|
|
282 # MAIN BODY #
|
|
283 splitData(samplefile=opts.samplefile, genofile=opts.genofile)
|
76
|
284 createHeader(samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
|
|
285 createProjectFile(samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile, project=opts.project)
|
65
|
286
|
|
287
|
|
288 except Exception, e:
|
|
289 indent = len(program_name) * " "
|
|
290 sys.stderr.write(program_name + ": " + repr(e) + "\n")
|
|
291 sys.stderr.write(indent + " for help use --help")
|
|
292 return 2
|
|
293
|
|
294
|
|
295 if __name__ == "__main__":
|
|
296 # if DEBUG:
|
|
297 # sys.argv.append("-h")
|
|
298 if TESTRUN:
|
|
299 import doctest
|
|
300 doctest.testmod()
|
|
301 if PROFILE:
|
|
302 import cProfile
|
|
303 import pstats
|
|
304 profile_filename = 'DNASampleSplitter_profile.txt'
|
|
305 cProfile.run('main()', profile_filename)
|
|
306 statsfile = open("profile_stats.txt", "wb")
|
|
307 p = pstats.Stats(profile_filename, stream=statsfile)
|
|
308 stats = p.strip_dirs().sort_stats('cumulative')
|
|
309 stats.print_stats()
|
|
310 statsfile.close()
|
|
311 sys.exit(0)
|
|
312 sys.exit(main()) |