Mercurial > repos > greg > genotype_population_info
changeset 7:899a1b99dcf0 draft
Uploaded
author | greg |
---|---|
date | Tue, 20 Nov 2018 15:12:30 -0500 |
parents | be5f52df9a25 |
children | 6fa8a923f96b |
files | genotype_population_info.py genotype_population_info.xml |
diffstat | 2 files changed, 10 insertions(+), 67 deletions(-) [+] |
line wrap: on
line diff
--- a/genotype_population_info.py Fri Nov 09 14:16:37 2018 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,53 +0,0 @@
-#!/usr/bin/env python
-"""
-Generate the genotype_population_info.txt file by parsing the information
-from a Affymetrix 96 well plate CSV file and an associated VCF file.
-"""
-import argparse
-import sys
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--input_csv', dest='input_csv', help='Affymetrix 96 well plate file')
-parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file')
-parser.add_argument('--output', dest='output', help='Output dataset'),
-args = parser.parse_args()
-
-# Parse the input_vcf file, looking for the first line
-# that starts with the string "#CHROM"
-with open(args.input_vcf, "r") as vcfh:
-    for line in vcfh:
-        if not line.startswith("#CHROM"):
-            continue
-        line = line.rstrip("\r\n")
-        # Example line:
-        # #CHROM 13704 13706 13708 13736 13748 13762 13782
-        items = line.split("\t")
-        sample_list = items[8:]
-        break
-
-# Parse the input_csv file to get the region for for
-# each sample_id in the sample_list. Initialize the
-# region_list to be the same as the sample_list to ensure
-# the same length.
-region_list = [x for x in sample_list]
-with open(args.input_csv, "r") as csvh:
-    for i, line in enumerate(csvh):
-        if i == 0:
-            # Skip the header.
-            continue
-        line = line.rstrip('\r\n')
-        items = line.split(',')
-        csv_sample_id = items[0]
-        csv_region = items[9]
-        # Make sure the csv_sample_id is in the sample_list.
-        try:
-            loc = sample_list.index(csv_sample_id)
-            region_list[loc] = csv_region
-        except Exception:
-            pass
-
-# The output file will consist of columns:
-# Item #, Sample ID, Region
-with open(args.output, "w") as outfh:
-    for i, sample_id in enumerate(sample_list):
-        outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[1]))
--- a/genotype_population_info.xml Fri Nov 09 14:16:37 2018 -0500
+++ b/genotype_population_info.xml Tue Nov 20 15:12:30 2018 -0500
@@ -1,14 +1,14 @@
 <tool id="genotype_population_info" name="Generate genotype population info" version="1.0.0">
-    <description>from Affymetrix data</description>
+    <description>from VCF data</description>
     <command detect_errors="exit_code"><![CDATA[
-python '$__tool_directory__/genotype_population_info.py'
---input_csv '$input_csv'
---input_vcf '$input_vcf'
---output '$output']]></command>
+#set header = 'header.txt'
+#set samples = 'samples.txt'
+grep "#CHROM" $input_vcf > $header &&
+tr '\t' '\n' < $header > $samples &&
+sed -i 1,9d $samples &&
+awk -F'\t' -v OFS='\t' 'NR==0 {print ; next}{print (NR),$0}' $samples > $output
+]]></command>
     <inputs>
-        <param name="input_csv" type="data" format="csv" label="Affymetrix 96 well plate CSV file">
-            <validator type="expression" message="96 well plate data must have 31 columns and 96 rows">value is not None and value.metadata.columns==31 and value.metadata.data_lines==96</validator>
-        </param>
         <param name="input_vcf" type="data" format="vcf" label="VCF file"/>
     </inputs>
     <outputs>
@@ -16,8 +16,7 @@
     </outputs>
     <tests>
         <test>
-            <param name="input_csv" value="96_well_plate.csv" ftype="csv"/>
-            <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/>
+            <param name="input_vcf" value="input.vcf" ftype="vcf"/>
             <output name="output" file="output.tabular" ftype="tabular"/>
         </test>
     </tests>
@@ -26,10 +25,7 @@
 Generates a file that contains the genotype population information that can be used as input to the multilocus_genotype tool. This tool can be used only within a Galaxy instance which
-includes the complementaty stag database.
------
-
-**Required options**
+includes the complementary stag database.
     </help>
     <citations>
     </citations>