changeset 65:d7e91a614582 draft

Uploaded
author cropgeeks
date Wed, 21 Feb 2018 11:54:51 -0500
parents 3b4e505bdad3
children d709024d6bb4
files FlapjackProject.py favalleleheadercreator.xml lib/flapjack.jar lib/pedigreeheader.jar split-by-sample.xml
diffstat 5 files changed, 330 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/FlapjackProject.py	Wed Feb 21 11:54:51 2018 -0500
@@ -0,0 +1,243 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
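+DNASampleSplitter -- split sample and genotype files by sample group and build Flapjack projects
+
+DNASampleSplitter reads a tab-delimited sample file and a genotype file, writes
+one file per sample group (and group cycle), and then runs the Flapjack header
+creator and CreateProject jars on the files of every group that has parents.
+
+It defines splitfile, splitData, createProjectFile, createHeader and main.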
+
+@author:     John Carlos Ignacia, Milcah, Yaw Nti-Addae
+
+@copyright:  2017 Cornell University. All rights reserved.
+
+@license:    MIT License
+
+@contact:    yn259@cornell.edu
+@deffield    updated: Updated
+'''
+
+import sys
+import os
+import math
+import pandas as pd
+
+from optparse import OptionParser
+from __builtin__ import str
+from subprocess import call
+
+# Module metadata; __updated__ is shown in the --version string built by main().
+__all__ = []
+__version__ = 0.1
+__date__ = '2017-06-20'
+__updated__ = '2017-06-20'
+
+# Switches read by the __main__ guard at the bottom of the file: TESTRUN runs
+# doctests, PROFILE runs main() under cProfile. DEBUG is not read by any
+# active code in this file.
+DEBUG = 1
+TESTRUN = 0
+PROFILE = 0
+
+# Shared state: maps a group key ("<group>" or "<group>_<cycle>") to the list
+# of dnarun names of that group's parents. Filled by splitData(), read by
+# splitfile(), and passed as `groups` to createHeader()/createProjectFile().
+parents = {}
+
+def _append_row(filename, header, line):
+    '''Append one data row to filename, writing header first when the file is new.'''
+    is_new = not os.path.isfile(filename)
+    # The handle is closed after every row so the data is flushed and no file
+    # descriptors stay open, however many rows and output files there are.
+    with open(filename, "w" if is_new else "a") as f:
+        if is_new:
+            f.write(header)
+        f.write(line)
+
+
+def splitfile(my_file, sample_data):
+    '''
+    Split a tab-delimited file into one output file per sample group.
+
+    my_file     -- path of the file to split; its first column holds dnarun names.
+    sample_data -- pandas DataFrame with the columns dnarun_name,
+                   dnasample_sample_group and dnasample_sample_group_cycle.
+
+    Leading "# " lines are kept as the Flapjack header (a default
+    "# fjFile = PHENOTYPE" line is used when there are none), followed by the
+    column header line with its first cell blanked.
+
+    Each data row is appended to "<my_file>_<group>.txt" or
+    "<my_file>_<group>_<cycle>.txt". Rows whose dnarun is listed in the
+    module-level ``parents`` dictionary are instead copied to the file of
+    every key that lists them. Rows without a group are dropped.
+
+    Raises IndexError when a row's dnarun name is not present in sample_data.
+    '''
+    header = ''
+    fj_header = ''
+    with open(my_file) as infile:
+        for line in infile:
+            if line[:2] == '# ':
+                fj_header += line
+                continue
+
+            if header == '':
+                # first non-comment line: the column header
+                if fj_header == '':
+                    fj_header = '# fjFile = PHENOTYPE\n'
+                header_list = line.split('\t')
+                if header_list[0] != '':
+                    header_list[0] = ''
+                header = fj_header + "\t".join(header_list)
+                continue
+
+            dnarun = line.split('\t')[0]
+            dnarun_data = sample_data[sample_data.dnarun_name == dnarun]
+            group = list(dnarun_data.dnasample_sample_group)[0]
+            cycle = list(dnarun_data.dnasample_sample_group_cycle)[0]
+
+            # A parent row goes to every group that lists it as a parent and
+            # is not written to a file for its own group.
+            is_parent = False
+            for key in parents:
+                if dnarun in parents[key]:
+                    _append_row(my_file+'_'+key+'.txt', header, line)
+                    is_parent = True
+            if is_parent:
+                continue
+
+            # rows without a sample group are not written anywhere
+            if isinstance(group, float) and math.isnan(group):
+                continue
+
+            if isinstance(cycle, float) and math.isnan(cycle):
+                filename = my_file+'_'+group+'.txt'
+            else:
+                filename = my_file+'_'+group+'_'+cycle+'.txt'
+            _append_row(filename, header, line)
+
+def splitData(samplefile, genofile):
+    '''
+    Record the parent dnaruns of every sample group, then split both input files.
+
+    samplefile -- path of the tab-delimited sample file (all columns read as strings).
+    genofile   -- path of the genotype file, split with the same sample data.
+
+    Fills the module-level ``parents`` dictionary: each "<group>" or
+    "<group>_<cycle>" key maps to the dnarun names of the germplasm named in
+    the group's germplasm_par1 / germplasm_par2 columns. Keys are only added
+    when at least one parent dnarun was found.
+    '''
+    sample_data = pd.read_table(samplefile, dtype='str')
+
+    for _, group in sample_data.dnasample_sample_group.drop_duplicates().iteritems():
+        # ignore rows whose group is missing (NaN) or an empty string
+        if isinstance(group, float) and math.isnan(group):
+            continue
+        if isinstance(group, str) and not group:
+            continue
+
+        members = sample_data[sample_data.dnasample_sample_group == group]
+
+        # look up the dnaruns of both parents of this group
+        first_parents = list(set(p for p in members.germplasm_par1 if str(p) != 'nan'))
+        second_parents = list(set(p for p in members.germplasm_par2 if str(p) != 'nan'))
+        runs1 = list(sample_data.loc[sample_data.germplasm_name.isin(first_parents), 'dnarun_name'])
+        runs2 = list(sample_data.loc[sample_data.germplasm_name.isin(second_parents), 'dnarun_name'])
+        parent_runs = runs1 + runs2
+
+        for _, cycle in members.dnasample_sample_group_cycle.drop_duplicates().iteritems():
+            if isinstance(cycle, float) and math.isnan(cycle):
+                # no cycle: the parents are stored under the bare group name
+                if parent_runs and group not in parents:
+                    parents[group] = parent_runs
+                continue
+            if isinstance(cycle, str) and not cycle:
+                continue
+
+            key = group+'_'+cycle
+            if parent_runs and key not in parents:
+                parents[key] = runs1 + runs2
+
+    # Split the sample file and the genotype file using the sample information
+    splitfile(samplefile, sample_data)
+    splitfile(genofile, sample_data)
+    
+    
+def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile):
+    '''
+    Run Flapjack's CreateProject command once for every group.
+
+    groups     -- iterable of group keys (the keys of ``parents``).
+    samplefile -- base path; "<samplefile>_<key>.txt" is passed as -t.
+    genofile   -- base path; "<genofile>_<key>.txt.tmp" is passed as -g and
+                  "<genofile>.flapjack" as the -p project file.
+    jarfile    -- class path holding jhi.flapjack.io.cmd.CreateProject.
+    separator, missing, qtlfile, mapfile -- values for -S, -M, -q and -m;
+                  each flag is left out when its value is None.
+
+    A non-zero exit status of the Java process is reported on stderr; the
+    remaining groups are still processed.
+    '''
+    for key in groups:
+        sfile = samplefile+'_'+key+'.txt'
+        gfile = genofile+'_'+key+'.txt.tmp'
+        cmd = ['java', '-cp', jarfile, 'jhi.flapjack.io.cmd.CreateProject', '-A',
+               '-g', gfile, '-t', sfile, '-p', genofile+'.flapjack']
+        # Options that were not given on the command line arrive as None, and
+        # a None entry in the argument list makes subprocess raise TypeError,
+        # so optional flags are only added when a value is present. An empty
+        # string is still passed on (it means "no separator").
+        if separator is not None:
+            cmd += ['-S', separator]
+        if missing is not None:
+            cmd += ['-M', missing]
+        cmd += ['-n', key]
+        if qtlfile is not None:
+            cmd += ['-q', qtlfile]
+        if mapfile is not None:
+            cmd += ['-m', mapfile]
+        status = call(cmd)
+        if status != 0:
+            sys.stderr.write("Flapjack CreateProject exited with status %s for group %s\n" % (status, key))
+    
+    
+def createHeader(groups, samplefile, genofile, headerjar):
+    '''
+    Run the header creator jar once for every group.
+
+    groups     -- iterable of group keys (the keys of ``parents``).
+    samplefile -- base path; "<samplefile>_<key>.txt" is the first argument.
+    genofile   -- base path; "<genofile>_<key>.txt" is the second argument and
+                  "<genofile>_<key>.txt.tmp" the third. The ".tmp" file is
+                  what createProjectFile later passes to Flapjack as -g.
+    headerjar  -- path of the jar started with "java -jar".
+
+    A non-zero exit status of the Java process is reported on stderr; the
+    remaining groups are still processed.
+    '''
+    for key in groups:
+        sfile = samplefile+'_'+key+'.txt'
+        gfile = genofile+'_'+key+'.txt'
+        cmd = ['java', '-jar', headerjar, sfile, gfile, gfile+'.tmp']
+        status = call(cmd)
+        if status != 0:
+            sys.stderr.write("header creator exited with status %s for group %s\n" % (status, key))
+    
+    
+def main(argv=None):
+    '''
+    Command line entry point: parse options, split the inputs, build the project.
+
+    argv -- option list without the program name; sys.argv[1:] is used when None.
+
+    Returns 2 when any step raises an exception (the error is reported on
+    stderr). On success it falls through and returns None, which sys.exit
+    treats as exit status 0.
+    '''
+
+    program_name = os.path.basename(sys.argv[0])
+    program_version = "v0.1"
+    program_build_date = "%s" % __updated__
+
+    program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
+    #program_usage = '''usage: spam two eggs''' # optional - will be autogenerated by optparse
+    program_longdesc = '''''' # optional - give further explanation about what the program does
+    # NOTE(review): this text still holds template placeholders and names the
+    # Apache license while the module docstring says MIT - confirm which applies.
+    program_license = "Copyright 2017 user_name (organization_name)                                            \
+                Licensed under the Apache License 2.0\nhttp://www.apache.org/licenses/LICENSE-2.0"
+
+    if argv is None:
+        argv = sys.argv[1:]
+    try:
+        # setup option parser
+        parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
+        parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE")
+        parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE")
+        parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE")
+        parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE")
+        parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE")
+        parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE")
+        parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]")
+        parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]")
+        parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")
+
+        # process options
+        (opts, args) = parser.parse_args(argv)
+
+        # verbose is None when -v was not given, so test truthiness rather
+        # than comparing None with an integer
+        if opts.verbose:
+            print("verbosity level = %d" % opts.verbose)
+
+        # echo every file option, or report it as missing on its own line
+        file_options = (
+            ("genofile", opts.genofile, "no genotype file detected!"),
+            ("samplefile", opts.samplefile, "no sample file detected!"),
+            ("mapfile", opts.mapfile, "no map file detected!"),
+            ("qtlfile", opts.qtlfile, "no QTL file detected!"),
+            ("jarfile", opts.jarfile, "no Flapjack project creator jar file detected!"),
+            ("headerjar", opts.headerjar, "no Flapjack header creator jar file detected!"),
+        )
+        for label, value, missing_message in file_options:
+            if value:
+                print("%s = %s" % (label, value))
+            else:
+                sys.stderr.write(missing_message + "\n")
+
+        # Nothing can be split without both inputs; fail here with a clear
+        # message instead of part-way through the pipeline.
+        if not (opts.genofile and opts.samplefile):
+            raise ValueError("a genotype file (-g) and a sample file (-s) are required")
+
+        # MAIN BODY #
+        splitData(samplefile=opts.samplefile, genofile=opts.genofile)
+        createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
+        createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing,qtlfile=opts.qtlfile,mapfile=opts.mapfile)
+
+    except Exception as e:
+        # top-level boundary: report the error and signal failure to the caller
+        indent = len(program_name) * " "
+        sys.stderr.write(program_name + ": " + repr(e) + "\n")
+        sys.stderr.write(indent + "  for help use --help\n")
+        return 2
+
+
+# Script entry point. TESTRUN and PROFILE are the module-level switches.
+if __name__ == "__main__":
+#     if DEBUG:
+#         sys.argv.append("-h")
+    if TESTRUN:
+        # run the module's doctests before the normal entry point
+        import doctest
+        doctest.testmod()
+    if PROFILE:
+        # run main() under cProfile and write a cumulative-time report
+        import cProfile
+        import pstats
+        profile_filename = 'DNASampleSplitter_profile.txt'
+        cProfile.run('main()', profile_filename)
+        # 'with' closes the report file even if printing the stats fails
+        with open("profile_stats.txt", "wb") as statsfile:
+            p = pstats.Stats(profile_filename, stream=statsfile)
+            stats = p.strip_dirs().sort_stats('cumulative')
+            stats.print_stats()
+        sys.exit(0)
+    sys.exit(main())
\ No newline at end of file
--- a/favalleleheadercreator.xml	Wed Feb 21 09:33:24 2018 -0500
+++ b/favalleleheadercreator.xml	Wed Feb 21 11:54:51 2018 -0500
@@ -1,17 +1,17 @@
-<tool id="intertek_to_flapjack_fav_allele" name="Intertek -> Flapjack favourable allele header formate" version="0.0.1">
+<tool id="intertek_flapjack_snp_sample_processor" name="Intertek / Flapjack - SNP Sample File Processor" version="0.0.1">
 	<description>convert an HDF5 formatted genotype file into a Flapjack formatted genotype file</description>
 	<command><![CDATA[
 		java
 		#if $adv_opts.show_advanced
 			-Xmx$adv_opts.memory
 		#end if
-		-cp $__tool_directory__/lib/flapjack.jar jhi.flapjack.io.cmd.IntertekFavAlleleHeaderCreator
+		-cp $__tool_directory__/lib/flapjack.jar jhi.flapjack.io.cmd.IntertekFlapjackSnpSampleFileProcessor
 		-g $genotype
 		-i $intertek
 		-o $output
 	]]></command>
 	<inputs>
-		<param format="txt" name="intertek" type="data" label="Intertek favourable allele file"
+		<param format="txt" name="intertek" type="data" label="Favourable allele file"
 			help="An Intertek file with favourable and unfavourable allele information for markers."/>
 		<param format="fjgenotype" name="genotype" type="data" label="Flapjack genotype file"
 			help="A Flapjack formatted genotype file with markers which match those in the intertek file so that we can
Binary file lib/flapjack.jar has changed
Binary file lib/pedigreeheader.jar has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/split-by-sample.xml	Wed Feb 21 11:54:51 2018 -0500
@@ -0,0 +1,84 @@
+<tool id="flapjack_createproject" name="Flapjack CreateProject" version="0.0.1">
+    <description>create a Flapjack project file from Flapjack formatted input files.</description>
+    <command><![CDATA[
+        python $__tool_directory__/FlapjackProject.py
+        -j $__tool_directory__/lib/flapjack.jar
+        -g $genotypes
+        -s $traits
+        #if $map
+            -m $map
+        #end if
+        #if $name
+            -n $name
+        #end if
+        #if $qtls
+            -q $qtls
+        #end if
+        #if $adv_opts.show_advanced
+            -M "$adv_opts.missingData"
+            -S "$adv_opts.hetSep"
+        #end if
+    ]]>
+    </command>
+    <inputs>
+        <param format="fjgenotype" name="genotypes" type="data" label="Flapjack genotype file"
+            help="Flapjack-formatted (tab-delimited text) input file of genotype data"/>
+
+        <param format="fjphenotype" name="traits" type="data" label="Flapjack phenotype file"
+            help="Flapjack-formatted (tab-delimited text) input file of phenotype data"/>
+
+        <param format="fjmap" name="map" type="data" label="Flapjack map file"
+            help="Flapjack-formatted (tab-delimited text) input file of map data"
+            optional="true"/>
+
+        <param format="fjqtl" name="qtls" type="data" label="Flapjack QTL file"
+            help="Flapjack-formatted (tab-delimited text) input file of QTL data"
+            optional="true"/>
+
+        <!-- WARNING: Below is the advanced options section shared by all of our tools. If you alter it here you must update the other tools as well.-->
+        <conditional name="adv_opts">
+            <param name="show_advanced" type="boolean"
+               label="Enable advanced options"
+               truevalue="show" falsevalue="">
+            </param>
+            <when value="show">
+                <param name="missingData" type="text" value="-"
+                    label="Missing data string"
+                    help="Alleles with missing data are encoded using this string. Clear the box to use an empty string for missing data instead."/>
+
+                <param name="hetSep" type="text" value="/"
+                    label="Heterozygous allele separator string"
+                    help="Specifies the string used to separate heterozygous alleles (eg 'A/T'). Clear the box to specify no string is used (eg 'AT')"/>
+            </when>
+        </conditional>
+        <!-- WARNING: Above is the advanced options section shared by all of our tools. If you alter it here you must update the other tools as well.-->
+    </inputs>
+
+    <outputs>
+        <data name="output" format="flapjack" />
+    </outputs>
+  
+    <stdio>
+        <exit_code range="1:" />
+    </stdio>
+
+    <help><![CDATA[
+.. class:: infomark
+
+**What it does**
+
+Flapjack is a multi-platform application providing interactive visualizations of high-throughput genotype data,
+allowing for rapid navigation and comparisons between lines, markers and chromosomes.
+
+This tool creates a Flapjack project file from the supplied tab-delimited Flapjack input files.
+See Flapjack's documentation_ for more details on the various Flapjack data formats.
+More information on the CreateProject tool can be found here_
+
+.. _documentation: http://flapjack.hutton.ac.uk/en/latest/projects_&_data_formats.html
+.. _here: http://flapjack.hutton.ac.uk/en/latest/command_line_support.html#createproject-exe-jhi-flapjack-io-createproject
+    ]]></help>
+
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btq580</citation>
+    </citations>
+</tool>
\ No newline at end of file