Mercurial > repos > cropgeeks > flapjack
comparison FlapjackProject.py @ 65:d7e91a614582 draft
Uploaded
author | cropgeeks |
---|---|
date | Wed, 21 Feb 2018 11:54:51 -0500 |
parents | |
children | 84ce7c332dc4 |
comparison
equal
deleted
inserted
replaced
64:3b4e505bdad3 | 65:d7e91a614582 |
---|---|
1 #!/usr/bin/env python | |
2 # encoding: utf-8 | |
3 ''' | |
4 DNASampleSplitter -- shortdesc | |
5 | |
6 DNASampleSplitter is a description | |
7 | |
8 It defines classes_and_methods | |
9 | |
10 @author: John Carlos Ignacia, Milcah, Yaw Nti-Addae | |
11 | |
12 @copyright: 2017 Cornell University. All rights reserved. | |
13 | |
14 @license: MIT License | |
15 | |
16 @contact: yn259@cornell.edu | |
17 @deffield updated: Updated | |
18 ''' | |
19 | |
20 import sys | |
21 import os | |
22 import math | |
23 import pandas as pd | |
24 | |
25 from optparse import OptionParser | |
26 from __builtin__ import str | |
27 from subprocess import call | |
28 | |
29 __all__ = [] | |
30 __version__ = 0.1 | |
31 __date__ = '2017-06-20' | |
32 __updated__ = '2017-06-20' | |
33 | |
34 DEBUG = 1 | |
35 TESTRUN = 0 | |
36 PROFILE = 0 | |
37 | |
38 parents = {} | |
39 | |
40 def splitfile(my_file, sample_data): | |
41 header = '' | |
42 fj_header = '' | |
43 with open(my_file) as infile: | |
44 for line in infile: | |
45 if line[:2] == '# ': | |
46 fj_header += line | |
47 elif header == '': | |
48 if fj_header == '': | |
49 fj_header = '# fjFile = PHENOTYPE\n' | |
50 header_list = line.split('\t') | |
51 if header_list[0] != '': | |
52 header_list[0] = '' | |
53 line = "\t".join(header_list) | |
54 header = fj_header+line | |
55 else: | |
56 lst = line.split('\t') | |
57 dnarun = lst[0] | |
58 dnarun_data = sample_data[sample_data.dnarun_name == dnarun] | |
59 group = list(dnarun_data.dnasample_sample_group)[0] | |
60 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0] | |
61 | |
62 filename = my_file | |
63 isParent = False | |
64 for key in parents: | |
65 value = parents[key] | |
66 if dnarun in value: | |
67 filename = my_file+'_'+key+'.txt' | |
68 if not os.path.isfile(filename) : | |
69 f = open(filename, "w") | |
70 f.write('%s' % header) | |
71 else: | |
72 f=open(filename, "a+") | |
73 f.write('%s' % line) | |
74 isParent = True | |
75 | |
76 if isParent: | |
77 continue | |
78 | |
79 if isinstance(group, float) and math.isnan(group): | |
80 continue | |
81 else: | |
82 # get parent data # | |
83 | |
84 # get file name for genotype data | |
85 if isinstance(cycle, float) and math.isnan(cycle): | |
86 filename += '_'+group+'.txt' | |
87 else: | |
88 filename += '_'+group+'_'+cycle+'.txt' | |
89 | |
90 # save genotype data to file | |
91 if not os.path.isfile(filename) : | |
92 f = open(filename, "w") | |
93 f.write('%s' % header) | |
94 else : | |
95 f=open(filename, "a+") | |
96 f.write('%s' % line) | |
97 | |
98 def splitData(samplefile, genofile): | |
99 # Split sample file # | |
100 sample_data = pd.read_table(samplefile, dtype='str') | |
101 group_list = sample_data.dnasample_sample_group.drop_duplicates() | |
102 for index, item in group_list.iteritems(): | |
103 if isinstance(item, float): | |
104 if math.isnan(item): | |
105 continue | |
106 elif isinstance(item, str): | |
107 if not item: | |
108 continue | |
109 df = sample_data[sample_data.dnasample_sample_group == item] | |
110 | |
111 # store dnaruns of parents in a dictionary | |
112 par1 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par1))) | |
113 par2 = list(set(filter(lambda x: str(x) != 'nan', df.germplasm_par2))) | |
114 lst1 = list(sample_data.loc[sample_data.germplasm_name.isin(par1), 'dnarun_name']) | |
115 lst2 = list(sample_data.loc[sample_data.germplasm_name.isin(par2), 'dnarun_name']) | |
116 mergedlst = lst1 + lst2 | |
117 | |
118 subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates() | |
119 for idx, sub in subgroup_list.iteritems(): | |
120 if isinstance(sub, float): | |
121 if math.isnan(sub): | |
122 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | |
123 if not item in parents and mergedlst: | |
124 parents.update({item : mergedlst}) | |
125 continue | |
126 elif isinstance(sub, str): | |
127 if not sub: | |
128 # df.to_csv(samplefile+"_"+item+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | |
129 continue | |
130 | |
131 subkey = item+'_'+sub | |
132 if not subkey in parents and mergedlst: | |
133 parents.update({subkey : lst1+lst2}) | |
134 # df_sub = df[df.dnasample_sample_group_cycle == sub] | |
135 # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") | |
136 | |
137 # Split genotype file based on sample information # | |
138 splitfile(samplefile, sample_data) | |
139 splitfile(genofile, sample_data) | |
140 | |
141 def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile): | |
142 for key in groups: | |
143 sfile = samplefile+'_'+key+'.txt' | |
144 gfile = genofile+'_'+key+'.txt.tmp' | |
145 cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile] | |
146 call(cmd) | |
147 | |
148 def createHeader(groups, samplefile, genofile, headerjar): | |
149 for key in groups: | |
150 sfile = samplefile+'_'+key+'.txt' | |
151 gfile = genofile+'_'+key+'.txt' | |
152 cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] | |
153 call(cmd) | |
154 | |
155 def main(argv=None): | |
156 '''Command line options.''' | |
157 | |
158 program_name = os.path.basename(sys.argv[0]) | |
159 program_version = "v0.1" | |
160 program_build_date = "%s" % __updated__ | |
161 | |
162 program_version_string = '%%prog %s (%s)' % (program_version, program_build_date) | |
163 #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse | |
164 program_longdesc = '''''' # optional - give further explanation about what the program does | |
165 program_license = "Copyright 2017 user_name (organization_name) \ | |
166 Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0" | |
167 | |
168 if argv is None: | |
169 argv = sys.argv[1:] | |
170 try: | |
171 # setup option parser | |
172 parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license) | |
173 parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE") | |
174 parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE") | |
175 parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE") | |
176 parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE") | |
177 parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE") | |
178 parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE") | |
179 parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]") | |
180 parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]") | |
181 parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]") | |
182 | |
183 # process options | |
184 (opts, args) = parser.parse_args(argv) | |
185 | |
186 if opts.verbose > 0: | |
187 print("verbosity level = %d" % opts.verbose) | |
188 if opts.genofile: | |
189 print("genofile = %s" % opts.genofile) | |
190 else: | |
191 sys.stderr.write("no genotype file detected!") | |
192 if opts.samplefile: | |
193 print("samplefile = %s" % opts.samplefile) | |
194 else: | |
195 sys.stderr.write("no sample file detected!") | |
196 if opts.mapfile: | |
197 print("mapfile = %s" % opts.mapfile) | |
198 else: | |
199 sys.stderr.write("no map file detected!") | |
200 if opts.qtlfile: | |
201 print("qtlfile = %s" % opts.qtlfile) | |
202 else: | |
203 sys.stderr.write("no QTL file detected!") | |
204 if opts.jarfile: | |
205 print("jarfile = %s" % opts.jarfile) | |
206 else: | |
207 sys.stderr.write("no Flapjack project creator jar file detected!") | |
208 if opts.headerjar: | |
209 print("headerjar = %s" % opts.headerjar) | |
210 else: | |
211 sys.stderr.write("no Flapjack header creator jar file detected!") | |
212 | |
213 # MAIN BODY # | |
214 splitData(samplefile=opts.samplefile, genofile=opts.genofile) | |
215 createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) | |
216 createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile) | |
217 | |
218 | |
219 except Exception, e: | |
220 indent = len(program_name) * " " | |
221 sys.stderr.write(program_name + ": " + repr(e) + "\n") | |
222 sys.stderr.write(indent + " for help use --help") | |
223 return 2 | |
224 | |
225 | |
226 if __name__ == "__main__": | |
227 # if DEBUG: | |
228 # sys.argv.append("-h") | |
229 if TESTRUN: | |
230 import doctest | |
231 doctest.testmod() | |
232 if PROFILE: | |
233 import cProfile | |
234 import pstats | |
235 profile_filename = 'DNASampleSplitter_profile.txt' | |
236 cProfile.run('main()', profile_filename) | |
237 statsfile = open("profile_stats.txt", "wb") | |
238 p = pstats.Stats(profile_filename, stream=statsfile) | |
239 stats = p.strip_dirs().sort_stats('cumulative') | |
240 stats.print_stats() | |
241 statsfile.close() | |
242 sys.exit(0) | |
243 sys.exit(main()) |