Mercurial > repos > greg > genotype_population_info
changeset 1:4ac59322b173 draft
Uploaded
author | greg |
---|---|
date | Fri, 09 Nov 2018 14:04:53 -0500 |
parents | fa6ce3d66c4d |
children | 822169cfc355 |
files | genotype_population_info.py genotype_population_info.xml |
diffstat | 2 files changed, 47 insertions(+), 89 deletions(-) [+] |
line wrap: on
line diff
--- a/genotype_population_info.py Wed Oct 31 13:52:19 2018 -0400 +++ b/genotype_population_info.py Fri Nov 09 14:04:53 2018 -0500 @@ -1,97 +1,51 @@ #!/usr/bin/env python """ -Generate the genotype_population_info.txt file by parsing the information from a VCF -file and querying the stag database that is required to be available within the Galaxy -instance in which this tool is executing. PostgreSQL 9.1 or greater is required. +Generate the genotype_population_info.txt file by parsing the information +from a Affymetrix 96 well plate CSV file and an associated VCF file. """ import argparse import sys -import psycopg2 -from sqlalchemy import create_engine -from sqlalchemy.engine.url import make_url - - -class PopInfoGenerator(object): - - def __init__(self): - self.args = None - self.conn = None - self.parse_args() - self.connect_db() - self.engine = create_engine(self.args.database_connection_string) - - def parse_args(self): - parser = argparse.ArgumentParser() - parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), - parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file') - parser.add_argument('--output', dest='output', help='Output dataset'), - self.args = parser.parse_args() +parser = argparse.ArgumentParser() +parser.add_argument('--input_csv', dest='input_csv', help='Affymetrix 96 well plate file') +parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file') +parser.add_argument('--output', dest='output', help='Output dataset'), +self.args = parser.parse_args() - def connect_db(self): - url = make_url(self.args.database_connection_string) - self.log('Connecting to database with URL: %s' % url) - args = url.translate_connect_args(username='user') - args.update(url.query) - assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' - self.conn = psycopg2.connect(**args) - - def run(self): - self.gen_pop_info() - self.fh.flush() - self.fh.close() - - def shutdown(self): - self.conn.close() - - def stop_err(self, msg): - sys.stderr.write(msg) - - def log(self, msg): - self.fh.write("%s\n" % msg) - self.fh.flush() +# Parse the input_vcf file, looking for the first line +# that starts with the string "#CHROM" +with open(args.input_vcf, "r") as vcfh: + for line in vcfh: + if not line.startswith("#CHROM"): + continue + line = line.rstrip("\r\n") + # Example line: + # #CHROM 13704 13706 13708 13736 13748 13762 13782 + items = line.split("\t") + sample_list = items[8:] + break - def get_sample_list(self): - # Parse the input_vcf file, looking for the first line - # that starts with the string "#CHROM" - with open(self.args.input_vcf, "r") as vcfh: - for line in vcfh: - if not line.startswith("#CHROM"): - continue - line = line.rstrip("\r\n") - # Example line: - # #CHROM 13704 13706 13708 13736 13748 13762 13782 - items = line.split("\t") - sample_list = items[8:] - break - return sample_list +# Parse the input_csv file to get the region for for +# each sample_id in the sample_list. Initialize the +# region_list to be the same as the sample_list to ensure +# the same length. +region_list = [x for x in sample_list] +with open(args.input_csv, "r") as csvh: + for i, line in enumerate(csvh): + if i == 0: + # Skip the header. + continue + line = line.rstrip('\r\n') + items = line.split(',') + csv_sample_id = items[0] + csv_region = items[9] + # Make sure the csv_sample_id is in the sample_list. + loc = sample_list.index(csv_sample_id) + if loc >= 0: + region_list[loc] = csv_region - def get_region_list(self, sample_list): - # Retrieve the value of the region column in the reef table - # for each sample_id in the sample_list. - region_list = [] - for sample_id in sample_list: - sql = """SELECT reef.region - FROM reef - LEFT OUTER JOIN colony ON reef.id = colony.reef_id - LEFT OUTER JOIN sample ON sample.colony_id = colony.id - WHERE sample.id = '%s';""" % sample_id - cur = self.conn.cursor() - cur.execute(sql) - region_list.append(cur.fetchone()[0]) - return region_list - - def gen_pop_info(self): - sample_list = self.get_sample_list() - region_list = self.get_region_list(sample_list) - # The output file will consist of columns: - # Item # Sample ID Region - with open(self.args.output, "w") as outfh: - for i, sample_id in sample_list: - outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[1])) - - -if __name__ == '__main__': - pop_info_generator = PopInfoGenerator() - pop_info_generator.run() - pop_info_generator.shutdown() +# The output file will consist of columns: +# Item #, Sample ID, Region +with open(self.args.output, "w") as outfh: + for i, sample_id in sample_list: + outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[1]))
--- a/genotype_population_info.xml Wed Oct 31 13:52:19 2018 -0400 +++ b/genotype_population_info.xml Fri Nov 09 14:04:53 2018 -0500 @@ -2,10 +2,13 @@ <description>from VCF</description> <command detect_errors="exit_code"><![CDATA[ python '$__tool_directory__/genotype_population_info.py' ---database_connection_string '$__app__.config.corals_database_connection' +--input_csv '$input_csv' --input_vcf '$input_vcf' --output '$output']]></command> <inputs> + <param name="input_csv" type="data" format="csv" label="Affymetrix 96 well plate CSV file"> + <validator type="expression" message="96 well plate data must have 31 columns and 96 rows">value is not None and value.metadata.columns==31 and value.metadata.data_lines==96</validator> + </param> <param name="input_vcf" type="data" format="vcf" label="VCF file"/> </inputs> <outputs> @@ -13,6 +16,7 @@ </outputs> <tests> <test> + <param name="input_csv" value="96_well_plate.csv" ftype="csv"/> <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/> <output name="output" file="output.tabular" ftype="tabular"/> </test>