# HG changeset patch # User greg # Date 1544020295 18000 # Node ID 6fa8a923f96bcd9e3f4ab5df0553e1dbff20e9be # Parent 899a1b99dcf0dc3cc4e7eaef61c5d66ba99aeb2e Uploaded diff -r 899a1b99dcf0 -r 6fa8a923f96b genotype_population_info.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genotype_population_info.py Wed Dec 05 09:31:35 2018 -0500 @@ -0,0 +1,93 @@ +#!/usr/bin/env python +import argparse +import sys + +import psycopg2 +from sqlalchemy import create_engine, MetaData +from sqlalchemy.engine.url import make_url + + +class GenotypeInfoGenerator(object): + def __init__(self): + self.args = None + self.conn = None + self.parse_args() + self.outfh = open(self.args.output, "w") + self.connect_db() + self.engine = create_engine(self.args.database_connection_string) + self.metadata = MetaData(self.engine) + + def parse_args(self): + parser = argparse.ArgumentParser() + parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), + parser.add_argument('--input_partial_info', dest='input_partial_info', help='Tabular file containing part of the genotype info') + parser.add_argument('--output', dest='output', help='Output dataset'), + self.args = parser.parse_args() + + def connect_db(self): + url = make_url(self.args.database_connection_string) + args = url.translate_connect_args(username='user') + args.update(url.query) + assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' + self.conn = psycopg2.connect(**args) + + def run(self): + sql = """ + SELECT sample.user_specimen_id, + reef.region + FROM sample + LEFT OUTER JOIN colony + ON sample.colony_id = colony.id + LEFT OUTER JOIN reef + ON reef.id = colony.reef_id + WHERE sample.affy_id = '%s'; + """ + with open(self.args.input_partial_info, "r") as fh: + for line in fh: + line = line.strip() + out_items = [] + items = line.split('\t') + # Item number. 
+ out_items.append(items[0]) + affy_id = items[1] + out_items.append(affy_id) + if len(items) == 2: + # Example line: + # 1 a100000-4368120-060520-256_I07.CEL + # The line is missing the user_specimen_id and + # region, so retrieve it from the database. + query = sql % affy_id + cur = self.conn.cursor() + cur.execute(query) + try: + missing_items = cur.fetchone() + # user_specimen_id + out_items.append(missing_items[0]) + # region + out_items.append(missing_items[1]) + except Exception as e: + msg = "Error retrieving user_specimen_id and region from the database for affy_id %s: %s" % (affy_id, e) + self.stop_err(msg) + else: + # The line contains all of the information we need. + # user_specimen_id + out_items.append(items[3]) + # region + out_items.append(items[9]) + self.outfh.write("%s\n" % "\t".join(out_items)) + self.outfh.close() + + def shutdown(self): + self.conn.close() + + def stop_err(self, msg): + sys.stderr.write(msg) + self.outfh.flush() + self.outfh.close() + sys.exit(1) + + +if __name__ == '__main__': + gig = GenotypeInfoGenerator() + gig.run() + gig.shutdown() diff -r 899a1b99dcf0 -r 6fa8a923f96b genotype_population_info.xml --- a/genotype_population_info.xml Tue Nov 20 15:12:30 2018 -0500 +++ b/genotype_population_info.xml Wed Dec 05 09:31:35 2018 -0500 @@ -1,32 +1,32 @@ - - from VCF data + + information from a partial dataset $header && -tr '\t' '\n' < $header > $samples && -sed -i 1,9d $samples && -awk -F'\t' -v OFS='\t' 'NR==0 {print ; next}{print (NR),$0}' $samples > $output +python $__tool_directory__/genotype_population_info.py +--database_connection_string '$__app__.config.corals_database_connection' +--input_partial_info '$input_partial_info' +--output '$output' ]]> - + - + **What it does** -Generates a file that contains the genotype population information that can be used as input -to the multilocus_genotype tool. This tool can be used only within a Galaxy instance which -includes the complementary stag database. 
+Accepts a file that contains the Affymetrix ids and a subset of the user specimen ids and regions for genotyping. +The tool queries the stag database to retrieve the user specimen ids and regions that are missing for each Affymetrix +id and produces a complete set of genotype population information for use as input to the multilocus_genotype tool. +This tool can be used only within a Galaxy instance which includes the complementary corals (stag) database. +