Mercurial > repos > greg > genotype_population_info
changeset 8:6fa8a923f96b draft
Uploaded
author | greg |
---|---|
date | Wed, 05 Dec 2018 09:31:35 -0500 |
parents | 899a1b99dcf0 |
children | 4e6f6e534929 |
files | genotype_population_info.py genotype_population_info.xml |
diffstat | 2 files changed, 106 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# genotype_population_info.py -- new file added by this changeset
# (--- /dev/null, +++ b/genotype_population_info.py), shown here without the
# leading "+" diff markers.
"""Complete a partial genotype population file from the corals database.

Each input line is tab-separated.  A two-column line (item number, affy_id)
is missing its user_specimen_id and region, which are looked up in the
database by affy_id.  A longer line already carries them in columns 3 and 9
(zero-based).  Every output line has four columns: item number, affy_id,
user_specimen_id and region.
"""
import argparse
import sys


class GenotypeInfoGenerator(object):
    """Command-line driver: parse arguments, connect, write the output."""

    def __init__(self):
        self.args = None
        self.conn = None
        self.outfh = None
        self.parse_args()
        # Connect before opening the output so that a failed connection does
        # not leave an open file handle behind.
        self.connect_db()
        self.outfh = open(self.args.output, "w")
        # NOTE: the unused SQLAlchemy engine / bound MetaData that used to be
        # created here were dropped; all queries go through self.conn.

    def parse_args(self):
        """Parse the command line into self.args."""
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--input_partial_info', dest='input_partial_info', required=True, help='Tabular file containing part of the genotype info')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def connect_db(self):
        """Open a psycopg2 connection described by the SQLAlchemy-style URL."""
        # Imported here, where they are needed, so that the module itself can
        # be imported (and --help shown) without the database drivers.
        import psycopg2
        from sqlalchemy.engine.url import make_url

        url = make_url(self.args.database_connection_string)
        # Explicit check instead of assert, which is stripped under -O.
        if url.get_dialect().name != 'postgresql':
            self.stop_err('This script can only be used with PostgreSQL.')
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        self.conn = psycopg2.connect(**args)

    def _fetch_missing_items(self, cur, sql, affy_id):
        """Return (user_specimen_id, region) for affy_id, or exit on failure."""
        try:
            # affy_id is bound by the driver, never interpolated into the SQL.
            cur.execute(sql, (affy_id,))
            row = cur.fetchone()
        except Exception as e:
            msg = "Error retrieving user_specimen_id and region from the database for affy_id %s: %s" % (affy_id, e)
            self.stop_err(msg)
        if row is None:
            self.stop_err("No sample found in the database for affy_id %s" % affy_id)
        user_specimen_id, region = row[0], row[1]
        # The LEFT OUTER JOINs can yield NULLs, which cannot be written out.
        if user_specimen_id is None or region is None:
            self.stop_err("Missing user_specimen_id or region in the database for affy_id %s" % affy_id)
        return user_specimen_id, region

    def run(self):
        """Read the partial info file and write the completed four-column file."""
        sql = """
            SELECT sample.user_specimen_id,
                   reef.region
            FROM sample
                 LEFT OUTER JOIN colony
                 ON sample.colony_id = colony.id
                 LEFT OUTER JOIN reef
                 ON reef.id = colony.reef_id
            WHERE sample.affy_id = %s;
        """
        cur = self.conn.cursor()
        try:
            with open(self.args.input_partial_info, "r") as fh:
                for line_number, line in enumerate(fh, 1):
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline).
                        continue
                    items = line.split('\t')
                    if len(items) == 2:
                        # Example line:
                        # 1 a100000-4368120-060520-256_I07.CEL
                        # The line is missing the user_specimen_id and
                        # region, so retrieve it from the database.
                        user_specimen_id, region = self._fetch_missing_items(cur, sql, items[1])
                    elif len(items) > 9:
                        # The line contains all of the information we need.
                        user_specimen_id, region = items[3], items[9]
                    else:
                        self.stop_err("Line %d of the input has %d columns; expected 2 or at least 10" % (line_number, len(items)))
                    # Item number, affy_id, user_specimen_id, region.
                    out_items = [items[0], items[1], user_specimen_id, region]
                    self.outfh.write("%s\n" % "\t".join(out_items))
        finally:
            cur.close()
            self.outfh.close()

    def shutdown(self):
        """Close the database connection."""
        self.conn.close()

    def stop_err(self, msg):
        """Write msg to stderr, close the output if open, and exit with 1."""
        sys.stderr.write(msg)
        if self.outfh is not None and not self.outfh.closed:
            self.outfh.flush()
            self.outfh.close()
        sys.exit(1)


if __name__ == '__main__':
    gig = GenotypeInfoGenerator()
    try:
        gig.run()
    finally:
        # Release the connection even when run() exits early.
        gig.shutdown()
--- a/genotype_population_info.xml Tue Nov 20 15:12:30 2018 -0500 +++ b/genotype_population_info.xml Wed Dec 05 09:31:35 2018 -0500 @@ -1,32 +1,32 @@ -<tool id="genotype_population_info" name="Generate genotype population info" version="1.0.0"> - <description>from VCF data</description> +<tool id="genotype_population_info" name="Generate genotype population" version="1.0.0"> + <description>information from a partial dataset</description> <command detect_errors="exit_code"><![CDATA[ -#set header = 'header.txt' -#set samples = 'samples.txt' -grep "#CHROM" $input_vcf > $header && -tr '\t' '\n' < $header > $samples && -sed -i 1,9d $samples && -awk -F'\t' -v OFS='\t' 'NR==0 {print ; next}{print (NR),$0}' $samples > $output +python $__tool_directory__/genotype_population_info.py +--database_connection_string '$__app__.config.corals_database_connection' +--input_partial_info '$input_partial_info' +--output '$output' ]]></command> <inputs> - <param name="input_vcf" type="data" format="vcf" label="VCF file"/> + <param name="input_partial_info" type="data" format="tabular" label="Partial genotype population file"/> </inputs> <outputs> <data name="output" format="tabular"/> </outputs> <tests> <test> - <param name="input_vcf" value="input.vcf" ftype="vcf"/> + <param name="input_partial_info" value="input.tabular" ftype="tabular"/> <output name="output" file="output.tabular" ftype="tabular"/> </test> </tests> <help> **What it does** -Generates a file that contains the genotype population information that can be used as input -to the multilocus_genotype tool. This tool can be used only within a Galaxy instance which -includes the complementary stag database. +Accepts a file that contains the Affymetrix ids and a subset of the user specimen ids and regions for genotyping. 
+The tool queries the stag database to retrieve the user specimen ids and regions that are missing for each Affymetrix +id and produces a complete set of genotype population information for use as input to the multilocus_genotype tool. +This tool can be used only within a Galaxy instance which includes the complementary corals (stag) database. </help> <citations> </citations> </tool> +