Mercurial > repos > cropgeeks > flapjack
changeset 65:d7e91a614582 draft
Uploaded
author | cropgeeks |
---|---|
date | Wed, 21 Feb 2018 11:54:51 -0500 |
parents | 3b4e505bdad3 |
children | d709024d6bb4 |
files | FlapjackProject.py favalleleheadercreator.xml lib/flapjack.jar lib/pedigreeheader.jar split-by-sample.xml |
diffstat | 5 files changed, 330 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# encoding: utf-8
'''
DNASampleSplitter (FlapjackProject.py) -- split sample and genotype files by
sample group and build Flapjack projects from the pieces.

The sample file is read as a tab-delimited table.  For every sample group
(optionally refined by its cycle) the dnaruns of the group's parents are
collected, the sample and genotype files are split into one file per group,
and two Java tools are run per group: a header creator, then Flapjack's
CreateProject.

@author:     John Carlos Ignacia, Milcah, Yaw Nti-Addae

@copyright:  2017 Cornell University. All rights reserved.

@license:    MIT License

@contact:    yn259@cornell.edu
@deffield    updated: Updated
'''

import math
import os
import sys
from optparse import OptionParser
from subprocess import call

import pandas as pd

try:
    # Python 2: keep the original explicit binding of the builtin ``str``.
    from __builtin__ import str
except ImportError:
    # Python 3 has no ``__builtin__`` module; the builtin ``str`` is already in scope.
    pass

__all__ = ['splitfile', 'splitData', 'createProjectFile', 'createHeader', 'main']
__version__ = 0.1
__date__ = '2017-06-20'
__updated__ = '2017-06-20'

DEBUG = 1
TESTRUN = 0
PROFILE = 0

# Maps an output key ("<group>" or "<group>_<cycle>") to the list of dnarun
# names of that group's parents.  Filled by splitData(), read by splitfile()
# and passed by main() as the ``groups`` argument of the Java wrappers.
# It is module-level state and is never cleared between calls.
parents = {}


def _is_missing(value):
    '''Return True when ``value`` is a float NaN (how pandas marks an empty cell).'''
    return isinstance(value, float) and math.isnan(value)


def _append_record(filename, header, line):
    '''Append ``line`` to ``filename``; a file that does not exist yet gets ``header`` first.'''
    is_new = not os.path.isfile(filename)
    # The handle is closed right away so a long input cannot exhaust file descriptors.
    with open(filename, 'w' if is_new else 'a') as out:
        if is_new:
            out.write(header)
        out.write(line)


def splitfile(my_file, sample_data):
    '''Split a tab-delimited file into one file per sample group.

    Leading lines starting with "# " are kept as the Flapjack header; when
    there are none, "# fjFile = PHENOTYPE" is used.  The first other line is
    the column header, whose first cell is blanked.  Every later line is a
    record whose first column is a dnarun name.

    A record whose dnarun is listed in ``parents`` is written to
    ``<my_file>_<key>.txt`` for every matching key and to no other file.
    Any other record goes to ``<my_file>_<group>.txt`` or
    ``<my_file>_<group>_<cycle>.txt``; records without a group are dropped.
    Existing output files are appended to, not replaced.

    @param my_file: path of the file to split; also the prefix of the outputs.
    @param sample_data: DataFrame with the columns dnarun_name,
        dnasample_sample_group and dnasample_sample_group_cycle.
    @raise IndexError: a record's dnarun does not occur in ``sample_data``.
    '''
    header = ''
    fj_header = ''
    with open(my_file) as infile:
        for line in infile:
            if line[:2] == '# ':
                fj_header += line
                continue

            if header == '':
                if fj_header == '':
                    fj_header = '# fjFile = PHENOTYPE\n'
                header_list = line.split('\t')
                if header_list[0] != '':
                    header_list[0] = ''
                    line = "\t".join(header_list)
                header = fj_header + line
                continue

            if not line.strip():
                # A blank (typically trailing) line carries no record.
                continue

            dnarun = line.split('\t')[0]
            dnarun_data = sample_data[sample_data.dnarun_name == dnarun]
            if len(dnarun_data) == 0:
                raise IndexError("dnarun %r from %s not found in the sample data"
                                 % (dnarun, my_file))
            # The first matching sample row decides the group and cycle.
            group = dnarun_data.dnasample_sample_group.iloc[0]
            cycle = dnarun_data.dnasample_sample_group_cycle.iloc[0]

            is_parent = False
            for key in parents:
                if dnarun in parents[key]:
                    _append_record(my_file + '_' + key + '.txt', header, line)
                    is_parent = True

            # Parents are only written to the groups they are parents of.
            if is_parent or _is_missing(group):
                continue

            if _is_missing(cycle):
                filename = my_file + '_' + group + '.txt'
            else:
                filename = my_file + '_' + group + '_' + cycle + '.txt'
            _append_record(filename, header, line)


def splitData(samplefile, genofile):
    '''Record each group's parent dnaruns in ``parents`` and split both files.

    For every non-empty sample group the parents are the samples whose
    germplasm_name appears in the group's germplasm_par1 or germplasm_par2
    column.  Their dnarun names are stored in ``parents`` under "<group>"
    (rows without a cycle) and "<group>_<cycle>".  A key that already exists
    is left alone, and groups without any parent get no key.

    @param samplefile: tab-delimited sample file; its first column must hold
        the dnarun name because the file itself is split by that column.
    @param genofile: tab-delimited genotype file keyed by dnarun name.
    '''
    sample_data = pd.read_table(samplefile, dtype='str')

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for _, item in sample_data.dnasample_sample_group.drop_duplicates().items():
        if _is_missing(item) or (isinstance(item, str) and not item):
            continue
        df = sample_data[sample_data.dnasample_sample_group == item]

        par1 = list(df.germplasm_par1.dropna().unique())
        par2 = list(df.germplasm_par2.dropna().unique())
        lst1 = list(sample_data.loc[sample_data.germplasm_name.isin(par1), 'dnarun_name'])
        lst2 = list(sample_data.loc[sample_data.germplasm_name.isin(par2), 'dnarun_name'])
        mergedlst = lst1 + lst2

        for _, sub in df.dnasample_sample_group_cycle.drop_duplicates().items():
            if _is_missing(sub):
                key = item
            elif isinstance(sub, str) and not sub:
                continue
            else:
                key = item + '_' + sub
            if mergedlst and key not in parents:
                parents[key] = list(mergedlst)

    splitfile(samplefile, sample_data)
    splitfile(genofile, sample_data)


def _run(cmd):
    '''Run ``cmd``; report a non-zero exit status on stderr and return True on success.'''
    status = call(cmd)
    if status != 0:
        sys.stderr.write("command failed with exit status %s: %s\n"
                         % (status, ' '.join(cmd)))
    return status == 0


def createProjectFile(groups, samplefile, genofile, jarfile, separator, missing, qtlfile, mapfile):
    '''Run Flapjack's CreateProject once per group key.

    Each run reads ``<genofile>_<key>.txt.tmp`` and ``<samplefile>_<key>.txt``
    and is pointed at the project file ``<genofile>.flapjack``.

    @param groups: iterable of group keys (main() passes ``parents``).
    @param jarfile: class path holding jhi.flapjack.io.cmd.CreateProject.
    @param separator: genotype separator for -S; None omits the option,
        '' is passed through.
    @param missing: missing-genotype string for -M; None omits the option.
    @param qtlfile: QTL file for -q; omitted when empty or None.
    @param mapfile: map file for -m; omitted when empty or None.
    @return: number of runs that ended with a non-zero exit status.
    '''
    failures = 0
    for key in groups:
        sfile = samplefile + '_' + key + '.txt'
        gfile = genofile + '_' + key + '.txt.tmp'
        cmd = ['java', '-cp', jarfile, 'jhi.flapjack.io.cmd.CreateProject', '-A',
               '-g', gfile, '-t', sfile, '-p', genofile + '.flapjack']
        # subprocess rejects None arguments, so unset options are left out.
        if separator is not None:
            cmd += ['-S', separator]
        if missing is not None:
            cmd += ['-M', missing]
        cmd += ['-n', key]
        if qtlfile:
            cmd += ['-q', qtlfile]
        if mapfile:
            cmd += ['-m', mapfile]
        if not _run(cmd):
            failures += 1
    return failures


def createHeader(groups, samplefile, genofile, headerjar):
    '''Run the header-creator jar once per group key.

    Each run is given ``<samplefile>_<key>.txt``, ``<genofile>_<key>.txt`` and
    ``<genofile>_<key>.txt.tmp``, the file createProjectFile() reads afterwards.

    @param groups: iterable of group keys (main() passes ``parents``).
    @param headerjar: path of the executable header-creator jar.
    @return: number of runs that ended with a non-zero exit status.
    '''
    failures = 0
    for key in groups:
        sfile = samplefile + '_' + key + '.txt'
        gfile = genofile + '_' + key + '.txt'
        if not _run(['java', '-jar', headerjar, sfile, gfile, gfile + '.tmp']):
            failures += 1
    return failures


def main(argv=None):
    '''Parse the command line and run the split / header / project pipeline.

    @param argv: argument list without the program name; defaults to
        sys.argv[1:].
    @return: 0 on success, 1 when a Java step exited non-zero, 2 when a
        required input is missing or an exception occurred.
    '''
    program_name = os.path.basename(sys.argv[0])
    program_version = "v0.1"
    program_build_date = "%s" % __updated__

    program_version_string = '%%prog %s (%s)' % (program_version, program_build_date)
    program_longdesc = ''''''  # optional - give further explanation about what the program does
    program_license = ("Copyright 2017 user_name (organization_name) "
                       "Licensed under the Apache License 2.0\n"
                       "http://www.apache.org/licenses/LICENSE-2.0")

    if argv is None:
        argv = sys.argv[1:]
    try:
        # setup option parser
        parser = OptionParser(version=program_version_string, epilog=program_longdesc, description=program_license)
        parser.add_option("-g", "--geno", dest="genofile", help="set input genotype file path [default: %default]", metavar="FILE")
        parser.add_option("-s", "--sample", dest="samplefile", help="set input sample file path [default: %default]", metavar="FILE")
        parser.add_option("-m", "--mapfile", dest="mapfile", help="set input map file path [default: %default]", metavar="FILE")
        parser.add_option("-q", "--qtlfile", dest="qtlfile", help="set input QTL file path [default: %default]", metavar="FILE")
        parser.add_option("-j", "--jar", dest="jarfile", help="set Flapjack project creator jar file path [default: %default]", metavar="FILE")
        parser.add_option("-J", "--headerjar", dest="headerjar", help="set Flapjack header creator jar file path [default: %default]", metavar="FILE")
        parser.add_option("-S", "--separator", dest="separator", help="declare separator for genotypes, '' for no separator [default: %default]")
        parser.add_option("-M", "--missingGenotype", dest="missing", help="set missing genotype string [default: %default]")
        parser.add_option("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %default]")

        # process options
        (opts, args) = parser.parse_args(argv)

        # opts.verbose is None when -v is absent; None > 0 is an error on Python 3.
        if opts.verbose:
            print("verbosity level = %d" % opts.verbose)

        # (option dest, label used in the message, required for the pipeline)
        inputs = (
            ("genofile", "genotype file", True),
            ("samplefile", "sample file", True),
            ("mapfile", "map file", False),
            ("qtlfile", "QTL file", False),
            ("jarfile", "Flapjack project creator jar file", True),
            ("headerjar", "Flapjack header creator jar file", True),
        )
        missing_required = False
        for dest, label, required in inputs:
            value = getattr(opts, dest)
            if value:
                print("%s = %s" % (dest, value))
            else:
                sys.stderr.write("no %s detected!\n" % label)
                missing_required = missing_required or required
        if missing_required:
            # Every later step builds paths from these values, so stop here.
            sys.stderr.write(program_name + ": missing required input, for help use --help\n")
            return 2

        # MAIN BODY #
        splitData(samplefile=opts.samplefile, genofile=opts.genofile)
        failures = createHeader(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, headerjar=opts.headerjar)
        failures += createProjectFile(groups=parents, samplefile=opts.samplefile, genofile=opts.genofile, jarfile=opts.jarfile, separator=opts.separator, missing=opts.missing, qtlfile=opts.qtlfile, mapfile=opts.mapfile)
        return 1 if failures else 0

    except Exception as e:
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help\n")
        return 2


if __name__ == "__main__":
    if TESTRUN:
        import doctest
        doctest.testmod()
    if PROFILE:
        import cProfile
        import pstats
        profile_filename = 'DNASampleSplitter_profile.txt'
        cProfile.run('main()', profile_filename)
        # pstats writes text, so the stream is opened in text mode.
        with open("profile_stats.txt", "w") as statsfile:
            p = pstats.Stats(profile_filename, stream=statsfile)
            stats = p.strip_dirs().sort_stats('cumulative')
            stats.print_stats()
        sys.exit(0)
    sys.exit(main())
--- a/favalleleheadercreator.xml Wed Feb 21 09:33:24 2018 -0500 +++ b/favalleleheadercreator.xml Wed Feb 21 11:54:51 2018 -0500 @@ -1,17 +1,17 @@ -<tool id="intertek_to_flapjack_fav_allele" name="Intertek -> Flapjack favourable allele header formate" version="0.0.1"> +<tool id="intertek_flapjack_snp_sample_processor" name="Intertek / Flapjack - SNP Sample File Processor" version="0.0.1"> <description>convert an HDF5 formatted genotype file into a Flapjack formatted genotype file</description> <command><![CDATA[ java #if $adv_opts.show_advanced -Xmx$adv_opts.memory #end if - -cp $__tool_directory__/lib/flapjack.jar jhi.flapjack.io.cmd.IntertekFavAlleleHeaderCreator + -cp $__tool_directory__/lib/flapjack.jar jhi.flapjack.io.cmd.IntertekFlapjackSnpSampleFileProcessor -g $genotype -i $intertek -o $output ]]></command> <inputs> - <param format="txt" name="intertek" type="data" label="Intertek favourable allele file" + <param format="txt" name="intertek" type="data" label="Favourable allele file" help="An Intertek file with favourable and unfavourable allele information for markers."/> <param format="fjgenotype" name="genotype" type="data" label="Flapjack genotype file" help="A Flapjack formatted genotype file with markers which match those in the intertek file so that we can
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/split-by-sample.xml Wed Feb 21 11:54:51 2018 -0500 @@ -0,0 +1,84 @@ +<tool id="flapjack_createproject" name="Flapjack CreateProject" version="0.0.1"> + <description>create a Flapjack project file from Flapjack formatted input files.</description> + <command><![CDATA[ + python $__tool_directory__/FlapjackProject.py + -j $__tool_directory__/lib/flapjack.jar + -g $genotypes + -s $traits + #if $map + -m $map + #end if + #if $name + -n $name + #end if + #if $qtls + -q $qtls + #end if + #if $adv_opts.show_advanced + -M "$adv_opts.missingData" + -S "$adv_opts.hetSep" + #end if + ]]> + </command> + <inputs> + <param format="fjgenotype" name="genotypes" type="data" label="Flapjack genotype file" + help="Flapjack-formatted (tab-delimited text) input file of genotype data"/> + + <param format="fjphenotype" name="traits" type="data" label="Flapjack phenotype file" + help="Flapjack-formatted (tab-delimited text) input file of phenotype data"/> + + <param format="fjmap" name="map" type="data" label="Flapjack map file" + help="Flapjack-formatted (tab-delimited text) input file of map data" + optional="true"/> + + <param format="fjqtl" name="qtls" type="data" label="Flapjack QTL file" + help="Flapjack-formatted (tab-delimited text) input file of QTL data" + optional="true"/> + + <!-- WARNING: Below is the advanced options section shared by all of our tools. If you alter it here you must update the other tools as well.--> + <conditional name="adv_opts"> + <param name="show_advanced" type="boolean" + label="Enable advanced options" + truevalue="show" falsevalue=""> + </param> + <when value="show"> + <param name="missingData" type="text" value="-" + label="Missing data string" + help="Alleles with missing data are encoded using this string. 
Clear the box to use an empty string for missing data instead."/> + + <param name="hetSep" type="text" value="/" + label="Heterozygous allele separator string" + help="Specifies the string used to separate heterozygous alleles (eg 'A/T'). Clear the box to specify no string is used (eg 'AT')"/> + </when> + </conditional> + <!-- WARNING: Above is the advanced options section shared by all of our tools. If you alter it here you must update the other tools as well.--> + </inputs> + + <outputs> + <data name="output" format="flapjack" /> + </outputs> + + <stdio> + <exit_code range="1:" /> + </stdio> + + <help><![CDATA[ +.. class:: infomark + +**What it does** + +Flapjack is a multi-platform application providing interactive visualizations of high-throughput genotype data, +allowing for rapid navigation and comparisons between lines, markers and chromosomes. + +This tool creates a Flapjack project file from the supplied tab-delimited flapjack input files. +See Flapjack's documentation_ for more details on the various Flapjack data formats. +More information on the CreateProject tool can be found here_ + +.. _documentation: http://flapjack.hutton.ac.uk/en/latest/projects_&_data_formats.html +.. _here: http://flapjack.hutton.ac.uk/en/latest/command_line_support.html#createproject-exe-jhi-flapjack-io-createproject + ]]></help> + + <citations> + <citation type="doi">10.1093/bioinformatics/btq580</citation> + </citations> +</tool> \ No newline at end of file