comparison FlapjackProject.py @ 65:d7e91a614582 draft

Uploaded
author cropgeeks
date Wed, 21 Feb 2018 11:54:51 -0500
parents
children 84ce7c332dc4
comparison
equal deleted inserted replaced
64:3b4e505bdad3 65:d7e91a614582
1 #!/usr/bin/env python
2 # encoding: utf-8
3 '''
4 DNASampleSplitter -- shortdesc
5
6 DNASampleSplitter is a description
7
8 It defines classes_and_methods
9
10 @author: John Carlos Ignacia, Milcah, Yaw Nti-Addae
11
12 @copyright: 2017 Cornell University. All rights reserved.
13
14 @license: MIT License
15
16 @contact: yn259@cornell.edu
17 @deffield updated: Updated
18 '''
19
20 import sys
21 import os
22 import math
23 import pandas as pd
24
25 from optparse import OptionParser
26 from __builtin__ import str
27 from subprocess import call
28
29 __all__ = []
30 __version__ = 0.1
31 __date__ = '2017-06-20'
32 __updated__ = '2017-06-20'
33
34 DEBUG = 1
35 TESTRUN = 0
36 PROFILE = 0
37
38 parents = {}
39
40 def splitfile(my_file, sample_data):
41 header = ''
42 fj_header = ''
43 with open(my_file) as infile:
44 for line in infile:
45 if line[:2] == '# ':
46 fj_header += line
47 elif header == '':
48 if fj_header == '':
49 fj_header = '# fjFile = PHENOTYPE\n'
50 header_list = line.split('\t')
51 if header_list[0] != '':
52 header_list[0] = ''
53 line = "\t".join(header_list)
54 header = fj_header+line
55 else:
56 lst = line.split('\t')
57 dnarun = lst[0]
58 dnarun_data = sample_data[sample_data.dnarun_name == dnarun]
59 group = list(dnarun_data.dnasample_sample_group)[0]
60 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0]
61
62 filename = my_file
63 isParent = False
64 for key in parents:
65 value = parents[key]
66 if dnarun in value:
67 filename = my_file+'_'+key+'.txt'
68 if not os.path.isfile(filename) :
69 f = open(filename, "w")
70 f.write('%s' % header)
71 else:
72 f=open(filename, "a+")
73 f.write('%s' % line)
74 isParent = True
75
76 if isParent:
77 continue
78
79 if isinstance(group, float) and math.isnan(group):
80 continue
81 else:
82 # get parent data #
83
84 # get file name for genotype data
85 if isinstance(cycle, float) and math.isnan(cycle):
86 filename += '_'+group+'.txt'
87 else:
88 filename += '_'+group+'_'+cycle+'.txt'
89
90 # save genotype data to file
91 if not os.path.isfile(filename) :
92 f = open(filename, "w")
93 f.write('%s' % header)
94 else :
95 f=open(filename, "a+")
96 f.write('%s' % line)
97
98 def splitData(samplefile, genofile):
99 # Split sample file #
100 sample_data = pd.read_table(samplefile, dtype='str')
101 group_list = sample_data.dnasample_sample_group.drop_duplicates()
102 for index, item in group_list.iteritems():
103 if isinstance(item, float):
104 if math.isnan(item):
105 continue
106 elif isinstance(item, str):
107 if not item:
108 continue
109 df = sample_data[sample_data.dnasample_sample_group == item]
110
111 # store dnaruns of parents in a dictionary
112 par1 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par1)))
113 par2 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par2)))
114 lst1 = list(sample_data.loc[sample_data.germplasm_name.isin(par1), 'dnarun_name'])
115 lst2 = list(sample_data.loc[sample_data.germplasm_name.isin(par2), 'dnarun_name'])
116 mergedlst = lst1 + lst2
117
118 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
119 for idx, sub in subgroup_list.iteritems():
120 if isinstance(sub, float):
121 if math.isnan(sub):
122 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
123 if not item in parents and mergedlst:
124 parents.update({item : mergedlst})
125 continue
126 elif isinstance(sub, str):
127 if not sub:
128 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
129 continue
130
131 subkey = item+'_'+sub
132 if not subkey in parents and mergedlst:
133 parents.update({subkey : lst1+lst2})
134 # df_sub = df[df.dnasample_sample_group_cycle == sub]
135 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
136
137 # Split genotype file based on sample information #
138 splitfile(samplefile, sample_data)
139 splitfile(genofile, sample_data)
140
141 def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile):
142 for key in groups:
143 sfile = samplefile+'_'+key+'.txt'
144 gfile = genofile+'_'+key+'.txt.tmp'
145 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile]
146 call(cmd)
147
148 def createHeader(groups, samplefile, genofile, headerjar):
149 for key in groups:
150 sfile = samplefile+'_'+key+'.txt'
151 gfile = genofile+'_'+key+'.txt'
152 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp']
153 call(cmd)
154
155 def main(argv=None):
156 '''Command line options.'''
157
158 program_name = os.path.basename(sys.argv[0])
159 program_version = "v0.1"
160 program_build_date = "%s" % __updated__
161
162 program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
163 #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse
164 program_longdesc = '''''' # optional - give further explanation about what the program does
165 program_license = "Copyright 2017 user_name (organization_name) \
166 Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0"
167
168 if argv is None:
169 argv = sys.argv[1:]
170 try:
171 # setup option parser
172 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
173 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE")
174 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE")
175 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE")
176 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE")
177 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE")
178 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE")
179 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]")
180 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]")
181 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")
182
183 # process options
184 (opts, args) = parser.parse_args(argv)
185
186 if opts.verbose > 0:
187 print("verbosity level = %d" % opts.verbose)
188 if opts.genofile:
189 print("genofile = %s" % opts.genofile)
190 else:
191 sys.stderr.write("no genotype file detected!")
192 if opts.samplefile:
193 print("samplefile = %s" % opts.samplefile)
194 else:
195 sys.stderr.write("no sample file detected!")
196 if opts.mapfile:
197 print("mapfile = %s" % opts.mapfile)
198 else:
199 sys.stderr.write("no map file detected!")
200 if opts.qtlfile:
201 print("qtlfile = %s" % opts.qtlfile)
202 else:
203 sys.stderr.write("no QTL file detected!")
204 if opts.jarfile:
205 print("jarfile = %s" % opts.jarfile)
206 else:
207 sys.stderr.write("no Flapjack project creator jar file detected!")
208 if opts.headerjar:
209 print("headerjar = %s" % opts.headerjar)
210 else:
211 sys.stderr.write("no Flapjack header creator jar file detected!")
212
213 # MAIN BODY #
214 splitData(samplefile=opts.samplefile, genofile=opts.genofile)
215 createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
216 createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile)
217
218
219 except Exception, e:
220 indent = len(program_name) * " "
221 sys.stderr.write(program_name + ": " + repr(e) + "\n")
222 sys.stderr.write(indent + " for help use --help")
223 return 2
224
225
226 if __name__ == "__main__":
227 # if DEBUG:
228 # sys.argv.append("-h")
229 if TESTRUN:
230 import doctest
231 doctest.testmod()
232 if PROFILE:
233 import cProfile
234 import pstats
235 profile_filename = 'DNASampleSplitter_profile.txt'
236 cProfile.run('main()', profile_filename)
237 statsfile = open("profile_stats.txt", "wb")
238 p = pstats.Stats(profile_filename, stream=statsfile)
239 stats = p.strip_dirs().sort_stats('cumulative')
240 stats.print_stats()
241 statsfile.close()
242 sys.exit(0)
243 sys.exit(main())