# HG changeset patch
# User greg
# Date 1541790293 18000
# Node ID 4ac59322b173ba6eb80ff4b0aba6f9c29c9d3d4e
# Parent fa6ce3d66c4dc13a59dfe416f5ba17bbc50e0a99
Uploaded
diff -r fa6ce3d66c4d -r 4ac59322b173 genotype_population_info.py
--- a/genotype_population_info.py Wed Oct 31 13:52:19 2018 -0400
+++ b/genotype_population_info.py Fri Nov 09 14:04:53 2018 -0500
@@ -1,97 +1,51 @@
#!/usr/bin/env python
"""
-Generate the genotype_population_info.txt file by parsing the information from a VCF
-file and querying the stag database that is required to be available within the Galaxy
-instance in which this tool is executing. PostgreSQL 9.1 or greater is required.
+Generate the genotype_population_info.txt file by parsing the information
+from an Affymetrix 96 well plate CSV file and an associated VCF file.
"""
import argparse
import sys
-import psycopg2
-from sqlalchemy import create_engine
-from sqlalchemy.engine.url import make_url
-
-
-class PopInfoGenerator(object):
-
- def __init__(self):
- self.args = None
- self.conn = None
- self.parse_args()
- self.connect_db()
- self.engine = create_engine(self.args.database_connection_string)
-
- def parse_args(self):
- parser = argparse.ArgumentParser()
- parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
- parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file')
- parser.add_argument('--output', dest='output', help='Output dataset'),
- self.args = parser.parse_args()
+parser = argparse.ArgumentParser()
+parser.add_argument('--input_csv', dest='input_csv', help='Affymetrix 96 well plate file')
+parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file')
+parser.add_argument('--output', dest='output', help='Output dataset')
+args = parser.parse_args()
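+
+# Example invocation (the file names here are illustrative only):
+#   python genotype_population_info.py --input_csv affy_plate.csv \
+#       --input_vcf samples.vcf --output genotype_population_info.txt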
- def connect_db(self):
- url = make_url(self.args.database_connection_string)
- self.log('Connecting to database with URL: %s' % url)
- args = url.translate_connect_args(username='user')
- args.update(url.query)
- assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
- self.conn = psycopg2.connect(**args)
-
- def run(self):
- self.gen_pop_info()
- self.fh.flush()
- self.fh.close()
-
- def shutdown(self):
- self.conn.close()
-
- def stop_err(self, msg):
- sys.stderr.write(msg)
-
- def log(self, msg):
- self.fh.write("%s\n" % msg)
- self.fh.flush()
+# Parse the input_vcf file, looking for the first line
+# that starts with the string "#CHROM"
+with open(args.input_vcf, "r") as vcfh:
+    for line in vcfh:
+        if not line.startswith("#CHROM"):
+            continue
+        line = line.rstrip("\r\n")
+        # Example line:
+        # #CHROM 13704 13706 13708 13736 13748 13762 13782
+        items = line.split("\t")
+        sample_list = items[8:]
+        break
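+
+# sample_list now holds the sample ids parsed from the #CHROM header line.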
- def get_sample_list(self):
- # Parse the input_vcf file, looking for the first line
- # that starts with the string "#CHROM"
- with open(self.args.input_vcf, "r") as vcfh:
- for line in vcfh:
- if not line.startswith("#CHROM"):
- continue
- line = line.rstrip("\r\n")
- # Example line:
- # #CHROM 13704 13706 13708 13736 13748 13762 13782
- items = line.split("\t")
- sample_list = items[8:]
- break
- return sample_list
+# Parse the input_csv file to get the region for each
+# sample_id in the sample_list. Initialize region_list as a
+# copy of sample_list so the two lists have the same length
+# and stay positionally aligned.
+region_list = [x for x in sample_list]
+with open(args.input_csv, "r") as csvh:
+    for i, line in enumerate(csvh):
+        if i == 0:
+            # Skip the header.
+            continue
+        line = line.rstrip('\r\n')
+        items = line.split(',')
+        csv_sample_id = items[0]
+        csv_region = items[9]
+        # Record the region only if csv_sample_id is in the sample_list;
+        # list.index() raises ValueError for ids that are not present.
+        if csv_sample_id in sample_list:
+            loc = sample_list.index(csv_sample_id)
+            region_list[loc] = csv_region
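+
+# region_list is now positionally aligned with sample_list; samples with
+# no matching row in the CSV keep their sample id as a placeholder region.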
- def get_region_list(self, sample_list):
- # Retrieve the value of the region column in the reef table
- # for each sample_id in the sample_list.
- region_list = []
- for sample_id in sample_list:
- sql = """SELECT reef.region
- FROM reef
- LEFT OUTER JOIN colony ON reef.id = colony.reef_id
- LEFT OUTER JOIN sample ON sample.colony_id = colony.id
- WHERE sample.id = '%s';""" % sample_id
- cur = self.conn.cursor()
- cur.execute(sql)
- region_list.append(cur.fetchone()[0])
- return region_list
-
- def gen_pop_info(self):
- sample_list = self.get_sample_list()
- region_list = self.get_region_list(sample_list)
- # The output file will consist of columns:
- # Item # Sample ID Region
- with open(self.args.output, "w") as outfh:
- for i, sample_id in sample_list:
- outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[1]))
-
-
-if __name__ == '__main__':
- pop_info_generator = PopInfoGenerator()
- pop_info_generator.run()
- pop_info_generator.shutdown()
+# The output file will consist of columns:
+# Item #, Sample ID, Region
+with open(args.output, "w") as outfh:
+    for i, sample_id in enumerate(sample_list):
+        outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[i]))
diff -r fa6ce3d66c4d -r 4ac59322b173 genotype_population_info.xml
--- a/genotype_population_info.xml Wed Oct 31 13:52:19 2018 -0400
+++ b/genotype_population_info.xml Fri Nov 09 14:04:53 2018 -0500
@@ -2,10 +2,13 @@
from VCF
+
+ value is not None and value.metadata.columns==31 and value.metadata.data_lines==96
+
@@ -13,6 +16,7 @@
+