changeset 1:4ac59322b173 draft

Uploaded
author greg
date Fri, 09 Nov 2018 14:04:53 -0500
parents fa6ce3d66c4d
children 822169cfc355
files genotype_population_info.py genotype_population_info.xml
diffstat 2 files changed, 47 insertions(+), 89 deletions(-) [+]
line wrap: on
line diff
--- a/genotype_population_info.py	Wed Oct 31 13:52:19 2018 -0400
+++ b/genotype_population_info.py	Fri Nov 09 14:04:53 2018 -0500
@@ -1,97 +1,51 @@
 #!/usr/bin/env python
 """
-Generate the genotype_population_info.txt file by parsing the information from a VCF
-file and querying the stag database that is required to be available within the Galaxy
-instance in which this tool is executing.  PostgreSQL 9.1 or greater is required.
+Generate the genotype_population_info.txt file by parsing the information
+from a Affymetrix 96 well plate CSV file and an associated VCF file.
 """
 import argparse
 import sys
 
-import psycopg2
-from sqlalchemy import create_engine
-from sqlalchemy.engine.url import make_url
-
-
-class PopInfoGenerator(object):
-
-    def __init__(self):
-        self.args = None
-        self.conn = None
-        self.parse_args()
-        self.connect_db()
-        self.engine = create_engine(self.args.database_connection_string)
-
-    def parse_args(self):
-        parser = argparse.ArgumentParser()
-        parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
-        parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file')
-        parser.add_argument('--output', dest='output', help='Output dataset'),
-        self.args = parser.parse_args()
+parser = argparse.ArgumentParser()
+parser.add_argument('--input_csv', dest='input_csv', help='Affymetrix 96 well plate file')
+parser.add_argument('--input_vcf', dest='input_vcf', help='Input VCF file')
+parser.add_argument('--output', dest='output', help='Output dataset'),
+self.args = parser.parse_args()
 
-    def connect_db(self):
-        url = make_url(self.args.database_connection_string)
-        self.log('Connecting to database with URL: %s' % url)
-        args = url.translate_connect_args(username='user')
-        args.update(url.query)
-        assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
-        self.conn = psycopg2.connect(**args)
-
-    def run(self):
-        self.gen_pop_info()
-        self.fh.flush()
-        self.fh.close()
-
-    def shutdown(self):
-        self.conn.close()
-
-    def stop_err(self, msg):
-        sys.stderr.write(msg)
-
-    def log(self, msg):
-        self.fh.write("%s\n" % msg)
-        self.fh.flush()
+# Parse the input_vcf file, looking for the first line
+# that starts with the string "#CHROM"
+with open(args.input_vcf, "r") as vcfh:
+    for line in vcfh:
+        if not line.startswith("#CHROM"):
+            continue
+        line = line.rstrip("\r\n")
+        # Example line:
+        # #CHROM  13704   13706   13708   13736   13748   13762   13782
+        items = line.split("\t")
+        sample_list = items[8:]
+        break
 
-    def get_sample_list(self):
-        # Parse the input_vcf file, looking for the first line
-        # that starts with the string "#CHROM"
-        with open(self.args.input_vcf, "r") as vcfh:
-            for line in vcfh:
-                if not line.startswith("#CHROM"):
-                    continue
-                line = line.rstrip("\r\n")
-                # Example line:
-                # #CHROM  13704   13706   13708   13736   13748   13762   13782
-                items = line.split("\t")
-                sample_list = items[8:]
-                break
-        return sample_list
+# Parse the input_csv file to get the region for for
+# each sample_id in the sample_list.  Initialize the
+# region_list to be the same as the sample_list to ensure
+# the same length.
+region_list = [x for x in sample_list]
+with open(args.input_csv, "r") as csvh:
+    for i, line in enumerate(csvh):
+        if i == 0:
+            # Skip the header.
+            continue
+        line = line.rstrip('\r\n')
+        items = line.split(',')
+        csv_sample_id = items[0]
+        csv_region = items[9]
+        # Make sure the csv_sample_id is in the sample_list.
+        loc = sample_list.index(csv_sample_id)
+            if loc >= 0:
+                region_list[loc] = csv_region
 
-    def get_region_list(self, sample_list):
-        # Retrieve the value of the region column in the reef table
-        # for each sample_id in the sample_list.
-        region_list = []
-        for sample_id in sample_list:
-            sql = """SELECT reef.region
-                  FROM reef
-                  LEFT OUTER JOIN colony ON reef.id = colony.reef_id
-                  LEFT OUTER JOIN sample ON sample.colony_id = colony.id
-                  WHERE sample.id = '%s';""" % sample_id
-            cur = self.conn.cursor()
-            cur.execute(sql)
-            region_list.append(cur.fetchone()[0])
-        return region_list
-
-    def gen_pop_info(self):
-        sample_list = self.get_sample_list()
-        region_list = self.get_region_list(sample_list)
-        # The output file will consist of columns:
-        # Item # Sample ID Region
-        with open(self.args.output, "w") as outfh:
-            for i, sample_id in sample_list:
-                outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[1]))
-
-
-if __name__ == '__main__':
-    pop_info_generator = PopInfoGenerator()
-    pop_info_generator.run()
-    pop_info_generator.shutdown()
+# The output file will consist of columns:
+# Item #, Sample ID, Region
+with open(self.args.output, "w") as outfh:
+    for i, sample_id in sample_list:
+        outfh.write("%d\t%s\t%s\n" % (i, sample_id, region_list[1]))
--- a/genotype_population_info.xml	Wed Oct 31 13:52:19 2018 -0400
+++ b/genotype_population_info.xml	Fri Nov 09 14:04:53 2018 -0500
@@ -2,10 +2,13 @@
     <description>from VCF</description>
     <command detect_errors="exit_code"><![CDATA[
 python '$__tool_directory__/genotype_population_info.py'
---database_connection_string '$__app__.config.corals_database_connection'
+--input_csv '$input_csv'
 --input_vcf '$input_vcf'
 --output '$output']]></command>
     <inputs>
+        <param name="input_csv" type="data" format="csv" label="Affymetrix 96 well plate CSV file">
+            <validator type="expression" message="96 well plate data must have 31 columns and 96 rows">value is not None and value.metadata.columns==31 and value.metadata.data_lines==96</validator>
+        </param>
         <param name="input_vcf" type="data" format="vcf" label="VCF file"/>
     </inputs>
     <outputs>
@@ -13,6 +16,7 @@
     </outputs>
     <tests>
         <test>
+            <param name="input_csv" value="96_well_plate.csv" ftype="csv"/>
             <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/>
             <output name="output" file="output.tabular" ftype="tabular"/>
         </test>