Mercurial > repos > greg > genotype_population_info

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Wed Oct 31 13:52:19 2018 -0400
@@ -0,0 +1,11 @@
+name: genotype_population_info
+owner: greg
+description: |
+  Contains a tool that generates the genotype population information file for use as input to the multilocus_genotype tool.
+homepage_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/genotype_population_info
+long_description: |
+  Contains a tool that generates the genotype population information file for use as input to the multilocus_genotype tool.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/corals/genotype_population_info
+type: unrestricted
+categories:
+  - Micro-array Analysis
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genotype_population_info.py	Wed Oct 31 13:52:19 2018 -0400
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+"""
+Generate the genotype_population_info.txt file by parsing the information from a VCF
+file and querying the stag database that is required to be available within the Galaxy
+instance in which this tool is executing.  PostgreSQL 9.1 or greater is required.
+"""
+import argparse
+import sys
+
+import psycopg2
+from sqlalchemy import create_engine
+from sqlalchemy.engine.url import make_url
+
+
+class PopInfoGenerator(object):
+    """Generate the genotype population information file for a VCF file.
+
+    The sample ids are read from the "#CHROM" header line of the input VCF
+    file, the region of each sample is looked up in the database, and one
+    tab-separated line (item number, sample id, region) is written to the
+    output dataset for each sample.
+    """
+
+    def __init__(self):
+        """Parse the command line and open the database connection."""
+        self.args = None
+        self.conn = None
+        self.parse_args()
+        self.connect_db()
+        self.engine = create_engine(self.args.database_connection_string)
+
+    def parse_args(self):
+        """Parse the command line arguments and store them in self.args."""
+        parser = argparse.ArgumentParser()
+        # All three options are needed by the methods below, so let argparse
+        # report a missing one instead of failing later on a None value.
+        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
+        parser.add_argument('--input_vcf', dest='input_vcf', required=True, help='Input VCF file')
+        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
+        self.args = parser.parse_args()
+
+    def connect_db(self):
+        """Open a psycopg2 connection described by the connection string.
+
+        Raises:
+            ValueError: if the connection string is not for PostgreSQL.
+        """
+        url = make_url(self.args.database_connection_string)
+        # Validate with an explicit raise; an assert is stripped under -O.
+        if url.get_dialect().name != 'postgresql':
+            raise ValueError('This script can only be used with PostgreSQL.')
+        # Log repr(url) rather than str(url) so that the password is masked.
+        # NOTE(review): confirm masking for the installed SQLAlchemy version.
+        self.log('Connecting to database with URL: %r' % url)
+        args = url.translate_connect_args(username='user')
+        args.update(url.query)
+        self.conn = psycopg2.connect(**args)
+
+    def run(self):
+        """Generate the output dataset."""
+        # gen_pop_info() opens and closes the output file itself, so there
+        # is no other file handle to flush or close here.
+        self.gen_pop_info()
+
+    def shutdown(self):
+        """Close the database connection if one was opened."""
+        if self.conn is not None:
+            self.conn.close()
+
+    def stop_err(self, msg):
+        """Write msg to stderr.  This does not terminate the process."""
+        sys.stderr.write(msg)
+
+    def log(self, msg):
+        """Write msg, followed by a newline, to stdout and flush it."""
+        # Log to stdout: no log file handle is ever opened by this class,
+        # and the output dataset must contain only the generated table.
+        sys.stdout.write("%s\n" % msg)
+        sys.stdout.flush()
+
+    def get_sample_list(self):
+        """Return the sample ids named on the "#CHROM" line of the input VCF.
+
+        Returns:
+            The list of sample id strings, or an empty list if the file has
+            no "#CHROM" line or that line has no sample columns.
+        """
+        sample_list = []
+        # Parse the input_vcf file, looking for the first line
+        # that starts with the string "#CHROM"
+        with open(self.args.input_vcf, "r") as vcfh:
+            for line in vcfh:
+                if not line.startswith("#CHROM"):
+                    continue
+                # Example line (tab separated):
+                # #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 13704 13706
+                items = line.rstrip("\r\n").split("\t")
+                # The 8 fixed columns are followed by FORMAT, so the sample
+                # ids start at index 9.
+                sample_list = items[9:]
+                break
+        return sample_list
+
+    def get_region_list(self, sample_list):
+        """Return the reef region of each sample, in sample_list order.
+
+        Args:
+            sample_list: sample ids to look up.
+
+        Raises:
+            LookupError: if the query returns no row for a sample id.
+        """
+        # Retrieve the value of the region column in the reef table
+        # for each sample_id in the sample_list.  The sample id is passed
+        # as a query parameter so that the driver does the quoting.
+        sql = """SELECT reef.region
+              FROM reef
+              LEFT OUTER JOIN colony ON reef.id = colony.reef_id
+              LEFT OUTER JOIN sample ON sample.colony_id = colony.id
+              WHERE sample.id = %s;"""
+        region_list = []
+        cur = self.conn.cursor()
+        try:
+            for sample_id in sample_list:
+                cur.execute(sql, (sample_id,))
+                row = cur.fetchone()
+                if row is None:
+                    raise LookupError('No region found for sample id: %s' % sample_id)
+                region_list.append(row[0])
+        finally:
+            cur.close()
+        return region_list
+
+    def gen_pop_info(self):
+        """Write the item number, sample id and region of every sample."""
+        sample_list = self.get_sample_list()
+        region_list = self.get_region_list(sample_list)
+        # The output file will consist of columns:
+        # Item # Sample ID Region
+        with open(self.args.output, "w") as outfh:
+            for i, (sample_id, region) in enumerate(zip(sample_list, region_list)):
+                outfh.write("%d\t%s\t%s\n" % (i, sample_id, region))
+
+if __name__ == '__main__':
+    # Build the generator (parses the command line and connects to the
+    # database), write the output dataset, then release the connection.
+    pop_info_generator = PopInfoGenerator()
+    try:
+        pop_info_generator.run()
+    finally:
+        # Close the database connection even when run() raises.
+        pop_info_generator.shutdown()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genotype_population_info.xml	Wed Oct 31 13:52:19 2018 -0400
@@ -0,0 +1,32 @@
+<tool id="genotype_population_info" name="Generate genotype population info" version="1.0.0">
+    <description>from VCF</description>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/genotype_population_info.py'
+--database_connection_string '$__app__.config.corals_database_connection'
+--input_vcf '$input_vcf'
+--output '$output']]></command>
+    <inputs>
+        <param name="input_vcf" type="data" format="vcf" label="VCF file"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_vcf" value="baitssnv.recode.vcf" ftype="vcf"/>
+            <output name="output" file="output.tabular" ftype="tabular"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Generates a file that contains the genotype population information that can be used as input
+to the multilocus_genotype tool.  This tool can be used only within a Galaxy instance which
+includes the complementary stag database.
+
+-----
+
+**Required options**
+    </help>
+    <citations>
+    </citations>
+</tool>