Mercurial > repos > cropgeeks > flapjack
changeset 76:84ce7c332dc4 draft
Uploaded
author | cropgeeks |
---|---|
date | Fri, 23 Feb 2018 10:10:08 -0500 |
parents | 2547ac473687 |
children | 6ace5881c494 |
files | FlapjackProject.py lib/pedigreeheader.jar splitbysample.xml |
diffstat | 3 files changed, 111 insertions(+), 47 deletions(-) [+] |
line wrap: on
line diff
--- a/FlapjackProject.py Thu Feb 22 10:41:54 2018 -0500 +++ b/FlapjackProject.py Fri Feb 23 10:10:08 2018 -0500 @@ -7,7 +7,7 @@ It defines classes_and_methods -@author: John Carlos Ignacia, Milcah, Yaw Nti-Addae +@author: John Carlos Ignacio, Milcah Kigoni, Yaw Nti-Addae @copyright: 2017 Cornell University. All rights reserved. @@ -21,15 +21,16 @@ import os import math import pandas as pd +import tempfile from optparse import OptionParser from __builtin__ import str from subprocess import call __all__ = [] -__version__ = 0.1 +__version__ = 0.2 __date__ = '2017-06-20' -__updated__ = '2017-06-20' +__updated__ = '2017-06-27' DEBUG = 1 TESTRUN = 0 @@ -37,7 +38,11 @@ parents = {} -def splitfile(my_file, sample_data): +samplefiles = {} +genotypefiles = {} + +def splitfile(my_file, sample_data, isSample): + temp_parents = parents header = '' fj_header = '' with open(my_file) as infile: @@ -59,18 +64,23 @@ group = list(dnarun_data.dnasample_sample_group)[0] cycle = list(dnarun_data.dnasample_sample_group_cycle)[0] - filename = my_file isParent = False - for key in parents: - value = parents[key] + for key in temp_parents: + value = temp_parents[key] if dnarun in value: - filename = my_file+'_'+key+'.txt' - if not os.path.isfile(filename) : + name = my_file + "_" + key + if isSample: + continue + if name not in samplefiles: + filename = tempfile.NamedTemporaryFile(delete=False).name + print("sample file %s has filename %s" % (name, filename)) + samplefiles[name] = filename f = open(filename, "w") f.write('%s' % header) else: + filename = samplefiles.get(name) f=open(filename, "a+") - f.write('%s' % line) + f.write('%s' % line) isParent = True if isParent: @@ -78,23 +88,33 @@ if isinstance(group, float) and math.isnan(group): continue - else: + elif isSample == 1: # get parent data # + filename = tempfile.NamedTemporaryFile(delete=False).name # get file name for genotype data if isinstance(cycle, float) and math.isnan(cycle): - filename += '_'+group+'.txt' + # save genotype data to file + if my_file + "_" + group not in genotypefiles: + genotypefiles[my_file + "_" + group] = filename + print("genotype file %s has filename %s" % (my_file + "_" + group, filename)) + f = open(filename, "w") + f.write('%s' % header) + else : + f=open(filename, "a+") + f.write('%s' % line) else: - filename += '_'+group+'_'+cycle+'.txt' + # save genotype data to file + if group not in genotypefiles: + genotypefiles[my_file + "_" + group+'_'+cycle] = filename + f = open(filename, "w") + f.write('%s' % header) + else : + f=open(filename, "a+") + f.write('%s' % line) - # save genotype data to file - if not os.path.isfile(filename) : - f = open(filename, "w") - f.write('%s' % header) - else : - f=open(filename, "a+") - f.write('%s' % line) - + + def splitData(samplefile, genofile): # Split sample file # sample_data = pd.read_table(samplefile, dtype='str') @@ -135,22 +155,62 @@ # df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n") # Split genotype file based on sample information # - splitfile(samplefile, sample_data) - splitfile(genofile, sample_data) + splitfile(samplefile, sample_data, 0) + splitfile(samplefile, sample_data, 1) + splitfile(genofile, sample_data, 0) + splitfile(genofile, sample_data, 1) -def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile): - for key in groups: - sfile = samplefile+'_'+key+'.txt' - gfile = genofile+'_'+key+'.txt.tmp' - cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile] - call(cmd) +def createProjectFile(samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile, project): + sample_data = pd.read_table(samplefile, dtype='str') + groups = sample_data.dnasample_sample_group.drop_duplicates() + for index, key in groups.iteritems(): + if isinstance(key, float) and math.isnan(key): + continue + df = sample_data[sample_data.dnasample_sample_group == key] + subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates() + for idx, sub in subgroup_list.iteritems(): + if isinstance(sub, float) and math.isnan(sub): + name = key + elif isinstance(sub, str) and not sub: + name = key + else: + name = key+'_'+sub + name = str(name) + sfile = samplefiles.get(samplefile + "_" + name) + gfile = genotypefiles.get(genofile + "_" + name) + gfile += '.tmp' + cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',project,'-n',name,'-S',separator,'-M',missing,'-C'] + if qtlfile: + cmd += ['-q',qtlfile] + if mapfile: + cmd += ['-m',mapfile] + print(cmd) + call(cmd) -def createHeader(groups, samplefile, genofile, headerjar): - for key in groups: - sfile = samplefile+'_'+key+'.txt' - gfile = genofile+'_'+key+'.txt' - cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] - call(cmd) +def createHeader(samplefile, genofile, headerjar): + sample_data = pd.read_table(samplefile, dtype='str') + groups = sample_data.dnasample_sample_group.drop_duplicates() + for index, key in groups.iteritems(): + if isinstance(key, float) and math.isnan(key): + continue + df = sample_data[sample_data.dnasample_sample_group == key] + subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates() + for idx, sub in subgroup_list.iteritems(): + if isinstance(sub, float) and math.isnan(sub): + name = key + elif isinstance(sub, str) and not sub: + name = key + else: + name = key+'_'+sub + name = str(name) + print("samplefile %s name %s" % (samplefile, name)) + sfile = samplefiles.get(samplefile + "_" + name) + print("sfile %s" + sfile) + gfile = genotypefiles.get(genofile + "_" + name) + print("gfile %s" + gfile) + + cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp'] + call(cmd) def main(argv=None): '''Command line options.''' @@ -174,11 +234,12 @@ parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE") parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE") parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE") - parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE") - parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE") - parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]") - parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]") + parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE", default='jars/flapjack.jar') + parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE", default='jars/pedigreeheader.jar') + parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, \"\" for no separator [default: \"\"]", metavar="STRING", default='') + parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]", metavar="STRING", default='NN') parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]") + parser.add_option("-p", "--project", dest="project", help="name of output file [default: %default]") # process options (opts, args) = parser.parse_args(argv) @@ -188,32 +249,34 @@ if opts.genofile: print("genofile = %s" % opts.genofile) else: - sys.stderr.write("no genotype file detected!") + sys.stderr.write("No genotype file detected!\n") + sys.exit() if opts.samplefile: print("samplefile = %s" % opts.samplefile) else: - sys.stderr.write("no sample file detected!") + sys.stderr.write("No sample file detected!\n") + sys.exit() if opts.mapfile: print("mapfile = %s" % opts.mapfile) else: - sys.stderr.write("no map file detected!") + sys.stderr.write("No map file detected!\n") if opts.qtlfile: print("qtlfile = %s" % opts.qtlfile) else: - sys.stderr.write("no QTL file detected!") + sys.stderr.write("No QTL file detected!\n") if opts.jarfile: print("jarfile = %s" % opts.jarfile) else: - sys.stderr.write("no Flapjack project creator jar file detected!") + sys.stderr.write("No Flapjack project creator jar file detected!\n") if opts.headerjar: print("headerjar = %s" % opts.headerjar) else: - sys.stderr.write("no Flapjack header creator jar file detected!") + sys.stderr.write("No Flapjack header creator jar file detected!\n") # MAIN BODY # splitData(samplefile=opts.samplefile, genofile=opts.genofile) - createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) - createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile) + createHeader(samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar) + createProjectFile(samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile, project=opts.project) except Exception, e: