changeset 76:84ce7c332dc4 draft

Uploaded
author cropgeeks
date Fri, 23 Feb 2018 10:10:08 -0500
parents 2547ac473687
children 6ace5881c494
files FlapjackProject.py lib/pedigreeheader.jar splitbysample.xml
diffstat 3 files changed, 111 insertions(+), 47 deletions(-) [+]
line wrap: on
line diff
--- a/FlapjackProject.py	Thu Feb 22 10:41:54 2018 -0500
+++ b/FlapjackProject.py	Fri Feb 23 10:10:08 2018 -0500
@@ -7,7 +7,7 @@
 
 It defines classes_and_methods
 
-@author:     John Carlos Ignacia, Milcah, Yaw Nti-Addae
+@author:     John Carlos Ignacio, Milcah Kigoni, Yaw Nti-Addae
 
 @copyright:  2017 Cornell University. All rights reserved.
 
@@ -21,15 +21,16 @@
 import os
 import math
 import pandas as pd
+import tempfile
 
 from optparse import OptionParser
 from __builtin__ import str
 from subprocess import call
 
 __all__ = []
-__version__ = 0.1
+__version__ = 0.2
 __date__ = '2017-06-20'
-__updated__ = '2017-06-20'
+__updated__ = '2017-06-27'
 
 DEBUG = 1
 TESTRUN = 0
@@ -37,7 +38,11 @@
 
 parents = {}
 
-def splitfile(my_file, sample_data):
+samplefiles = {}
+genotypefiles = {}
+
+def splitfile(my_file, sample_data, isSample):
+    temp_parents = parents
     header = ''
     fj_header = ''
     with open(my_file) as infile:
@@ -59,18 +64,23 @@
                 group = list(dnarun_data.dnasample_sample_group)[0]
                 cycle = list(dnarun_data.dnasample_sample_group_cycle)[0]
 
-                filename = my_file
                 isParent = False
-                for key in parents:
-                    value = parents[key]
+                for key in temp_parents:
+                    value = temp_parents[key]
                     if dnarun in value:
-                        filename = my_file+'_'+key+'.txt'
-                        if not os.path.isfile(filename) :
+                        name = my_file + "_" + key
+                        if isSample:
+                            continue
+                        if name not in samplefiles:
+                            filename = tempfile.NamedTemporaryFile(delete=False).name
+                            print("sample file %s has filename %s" % (name, filename))
+                            samplefiles[name] = filename
                             f = open(filename, "w")
                             f.write('%s' % header)
                         else:
+                            filename = samplefiles.get(name)
                             f=open(filename, "a+")
-                        f.write('%s' % line)
+                        f.write('%s' % line)                        
                         isParent = True
                 
                 if isParent:
@@ -78,23 +88,33 @@
                 
                 if isinstance(group, float) and math.isnan(group):
                     continue
-                else:
+                elif isSample == 1:
                     # get parent data #
                     
+                    filename = tempfile.NamedTemporaryFile(delete=False).name
                     # get file name for genotype data
                     if isinstance(cycle, float) and math.isnan(cycle):
-                        filename += '_'+group+'.txt'
+                        # save genotype data to file
+                        if my_file + "_" + group not in genotypefiles:
+                            genotypefiles[my_file + "_" + group] = filename
+                            print("genotype file %s has filename %s" % (my_file + "_" + group, filename))
+                            f = open(filename, "w")
+                            f.write('%s' % header)
+                        else :
+                            f=open(filename, "a+")
+                        f.write('%s' % line)
                     else:
-                        filename += '_'+group+'_'+cycle+'.txt'
+                        # save genotype data to file
+                        if group not in genotypefiles:
+                            genotypefiles[my_file + "_" + group+'_'+cycle] = filename
+                            f = open(filename, "w")
+                            f.write('%s' % header)
+                        else :
+                            f=open(filename, "a+")
+                        f.write('%s' % line)
                     
-                    # save genotype data to file
-                    if not os.path.isfile(filename) :
-                        f = open(filename, "w")
-                        f.write('%s' % header)
-                    else :
-                        f=open(filename, "a+")
-                    f.write('%s' % line)
-
+                    
+                    
 def splitData(samplefile, genofile):
     # Split sample file #
     sample_data = pd.read_table(samplefile, dtype='str')
@@ -135,22 +155,62 @@
 #             df_sub.to_csv(samplefile+"_"+item+"_"+sub+".txt", index=None, na_rep='', sep="\t", mode="w", line_terminator="\n")
     
     # Split genotype file based on sample information #
-    splitfile(samplefile, sample_data)
-    splitfile(genofile, sample_data)
+    splitfile(samplefile, sample_data, 0)
+    splitfile(samplefile, sample_data, 1)
+    splitfile(genofile, sample_data, 0)
+    splitfile(genofile, sample_data, 1)
     
-def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile):
-    for key in groups:
-        sfile = samplefile+'_'+key+'.txt'
-        gfile = genofile+'_'+key+'.txt.tmp'
-        cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',genofile+'.flapjack','-S',separator,'-M',missing,'-n',key,'-q',qtlfile,'-m',mapfile]
-        call(cmd)
+def createProjectFile(samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile, project):
+    sample_data = pd.read_table(samplefile, dtype='str')
+    groups = sample_data.dnasample_sample_group.drop_duplicates()
+    for index, key in groups.iteritems():
+        if isinstance(key, float) and math.isnan(key):
+            continue
+        df = sample_data[sample_data.dnasample_sample_group == key]
+        subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
+        for idx, sub in subgroup_list.iteritems():
+            if isinstance(sub, float) and math.isnan(sub):
+                name = key
+            elif isinstance(sub, str) and not sub:
+                name = key
+            else:
+                name = key+'_'+sub
+            name = str(name)
+            sfile = samplefiles.get(samplefile + "_" + name)
+            gfile = genotypefiles.get(genofile + "_" + name)
+            gfile += '.tmp'
+            cmd = ['java', '-cp',jarfile,'jhi.flapjack.io.cmd.CreateProject','-A','-g',gfile,'-t',sfile,'-p',project,'-n',name,'-S',separator,'-M',missing,'-C']
+            if qtlfile:
+                cmd += ['-q',qtlfile]
+            if mapfile:
+                cmd += ['-m',mapfile]
+            print(cmd)
+            call(cmd)
     
-def createHeader(groups, samplefile, genofile, headerjar):
-    for key in groups:
-        sfile = samplefile+'_'+key+'.txt'
-        gfile = genofile+'_'+key+'.txt'
-        cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp']
-        call(cmd)
+def createHeader(samplefile, genofile, headerjar):
+    sample_data = pd.read_table(samplefile, dtype='str')
+    groups = sample_data.dnasample_sample_group.drop_duplicates()
+    for index, key in groups.iteritems():
+        if isinstance(key, float) and math.isnan(key):
+            continue
+        df = sample_data[sample_data.dnasample_sample_group == key]
+        subgroup_list = df.dnasample_sample_group_cycle.drop_duplicates()
+        for idx, sub in subgroup_list.iteritems():
+            if isinstance(sub, float) and math.isnan(sub):
+                name = key
+            elif isinstance(sub, str) and not sub:
+                name = key
+            else:
+                name = key+'_'+sub
+            name = str(name)
+            print("samplefile %s name %s" % (samplefile, name))
+            sfile = samplefiles.get(samplefile + "_" + name)
+            print("sfile %s" + sfile)
+            gfile = genotypefiles.get(genofile + "_" + name)
+            print("gfile %s" + gfile)
+
+            cmd = ['java','-jar',headerjar,sfile,gfile,gfile+'.tmp']
+            call(cmd)
     
 def main(argv=None):
     '''Command line options.'''
@@ -174,11 +234,12 @@
         parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE")
         parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE")
         parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE")
-        parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE")
-        parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE")
-        parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]")
-        parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]")
+        parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE", default='jars/flapjack.jar')
+        parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE", default='jars/pedigreeheader.jar')
+        parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, \"\" for no separator [default: \"\"]", metavar="STRING", default='')
+        parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]", metavar="STRING", default='NN')
         parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")
+        parser.add_option("-p", "--project", dest="project", help="name of output file [default: %default]")
 
         # process options
         (opts, args) = parser.parse_args(argv)
@@ -188,32 +249,34 @@
         if opts.genofile:
             print("genofile = %s" % opts.genofile)
         else:
-            sys.stderr.write("no genotype file detected!")
+            sys.stderr.write("No genotype file detected!\n")
+            sys.exit()
         if opts.samplefile:
             print("samplefile = %s" % opts.samplefile)
         else:
-            sys.stderr.write("no sample file detected!")
+            sys.stderr.write("No sample file detected!\n")            
+            sys.exit()
         if opts.mapfile:
             print("mapfile = %s" % opts.mapfile)
         else:
-            sys.stderr.write("no map file detected!")
+            sys.stderr.write("No map file detected!\n")
         if opts.qtlfile:
             print("qtlfile = %s" % opts.qtlfile)
         else:
-            sys.stderr.write("no QTL file detected!")
+            sys.stderr.write("No QTL file detected!\n")
         if opts.jarfile:
             print("jarfile = %s" % opts.jarfile)
         else:
-            sys.stderr.write("no Flapjack project creator jar file detected!")
+            sys.stderr.write("No Flapjack project creator jar file detected!\n")
         if opts.headerjar:
             print("headerjar = %s" % opts.headerjar)
         else:
-            sys.stderr.write("no Flapjack header creator jar file detected!")
+            sys.stderr.write("No Flapjack header creator jar file detected!\n")
 
         # MAIN BODY #
         splitData(samplefile=opts.samplefile, genofile=opts.genofile)
-        createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
-        createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile)
+        createHeader(samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
+        createProjectFile(samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile, project=opts.project)
         
                             
     except Exception, e:
Binary file lib/pedigreeheader.jar has changed
--- a/splitbysample.xml	Thu Feb 22 10:41:54 2018 -0500
+++ b/splitbysample.xml	Fri Feb 23 10:10:08 2018 -0500
@@ -9,6 +9,7 @@
         -J $__tool_directory__/lib/pedigreeheader.jar
         -g '$genotypes'
         -s '$traits'
+        -p '$output'
         #if $map
             -m '$map'
         #end if