changeset 6:9a4b0ae3d408 draft

Uploaded
author greg
date Tue, 05 Jun 2018 14:27:50 -0400
parents cb986be6355e
children 976184d4d5bb
files gene_family_scaffold_loader.py
diffstat 1 files changed, 13 insertions(+), 14 deletions(-) [+]
line wrap: on
line diff
--- a/gene_family_scaffold_loader.py	Tue Jun 05 10:56:33 2018 -0400
+++ b/gene_family_scaffold_loader.py	Tue Jun 05 14:27:50 2018 -0400
@@ -218,8 +218,8 @@
                 # Get the species_name from self.species_ids_dict.
                 species_name = self.species_ids_dict[species_code]
                 # Create a key for self.species_genes_dict, with the format:
-                # <clustering_method>^^<species_code>
-                species_genes_dict_key = "%s^^%s" % (clustering_method, species_code)
+                # <clustering_method>^^<species_name>
+                species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
                 # Add an entry to self.species_genes_dict, where the value
                 # is a list containing species_name and num_genes.
                 if species_genes_dict_key in self.species_genes_dict:
@@ -241,11 +241,10 @@
                 items = line.split("\t")
                 species_name = items[0]
                 i = 0
-                for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
-                    # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
+                for clustering_method in self.clustering_methods:
+                    species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
+                    # The format of species_genes_dict_key is <clustering_method>^^<species_name>.
                     species_genes_dict_key_items = species_genes_dict_key.split("^^")
-                    clustering_method = species_genes_dict_key_items[0]
-                    species_code = species_genes_dict_key_items[1]
                     # Get the scaffold_rec for the current scaffold_id and clustering_method.
                     # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
                     for scaffold_rec in self.scaffold_recs:
@@ -273,7 +272,7 @@
         1. Analyze all of the scaffold .fna and .faa files for each clustering
         method to populate the aa_dict and dna_dict sequence dictionaries.
         2. Use the populated sequence dictionaries to populate the plant_tribes_gene
-        and gene_scaffold_orthogroup_association tables.
+        and gene_scaffold_orthogroup_taxon_association tables.
         """
         scaffold_id = os.path.basename(self.args.scaffold_path)
         aa_dict = {}
@@ -317,9 +316,9 @@
                             sequence = adict[combined_id]
                             sequence = "%s%s" % (sequence, line)
                             adict[combined_id] = sequence
-        # Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables
+        # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables
         # from the contents of aa_dict and dna_dict.
-        self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.")
+        self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.")
         gi = 0
         for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
             # The dictionary keys combine the orthogroup_id, clustering method and
@@ -345,14 +344,14 @@
             cur.execute(sql)
             orthogroup_id_db = cur.fetchone()[0]
             # If the plant_tribes_gene table contains a row that has the gene_id,
-            # then we'll add a row only to the gene_scaffold_orthogroup_association table.
+            # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
             # Get the taxon_id  for the species_name from the plant_tribes_taxon table.
             sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
             cur = self.conn.cursor()
             cur.execute(sql)
             taxon_id_db = cur.fetchone()[0]
             # If the plant_tribes_gene table contains a row that has the gene_id,
-            # then we'll add a row only to the gene_scaffold_orthogroup_association table.
+            # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
             sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
             cur = self.conn.cursor()
             cur.execute(sql)
@@ -372,7 +371,7 @@
                 gi += 1
                 if gi % 1000 == 0:
                     self.log("Inserted 1000 more rows into the plant_tribes_gene table.")
-            # Insert a row into the gene_scaffold_orthogroup_association table.
+            # Insert a row into the gene_scaffold_orthogroup_taxon_association table.
             # Get the scaffold_rec for the current scaffold_id and clustering_method.
             # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
             for scaffold_rec in self.scaffold_recs:
@@ -386,9 +385,9 @@
             cur = self.update(sql, tuple(args))
             self.flush()
             if gsoai % 1000 == 0:
-                self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.")
+                self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.")
         self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
-        self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai)
+        self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai)
 
 if __name__ == '__main__':
     scaffold_loader = ScaffoldLoader()