changeset 2:751b36922d59 draft

Uploaded
author greg
date Fri, 25 May 2018 10:18:13 -0400
parents 488bf95641d2
children 48c13482e6c9
files gene_family_scaffold_loader.py
diffstat 1 files changed, 19 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/gene_family_scaffold_loader.py	Fri May 25 10:18:01 2018 -0400
+++ b/gene_family_scaffold_loader.py	Fri May 25 10:18:13 2018 -0400
@@ -120,7 +120,7 @@
             if clustering_method not in self.clustering_methods:
                 self.clustering_methods.append(clustering_method)
             # Insert a row in to the plant_tribes_scaffold table.
-            self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method))
+            self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
             args = [scaffold_id, clustering_method]
             sql = """
                 INSERT INTO plant_tribes_scaffold
@@ -132,8 +132,9 @@
             scaffold_id_db = cur.fetchone()[0]
             self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
             with open(file_name, "r") as fh:
-                for i, line in enumerate(fh):
-                    if i == 0:
+                i = 0
+                for i2, line in enumerate(fh):
+                    if i2 == 0:
                         # Skip first line.
                         continue
                     num_genes = 0
@@ -150,7 +151,6 @@
                             num_species += 1
                             num_genes += j_int
                     # Insert a row into the plant_tribes_orthogroup table.
-                    self.log("Inserting a row into the plant_tribes_orthogroup table...")
                     args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
                     for k in range(super_ortho_start_index, len(items)):
                         args.append('%s' % str(items[k]))
@@ -160,6 +160,8 @@
                     """
                     cur = self.update(sql, tuple(args))
                     self.flush()
+                    i += 1
+                self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
         for file_name in glob.glob(os.path.join(file_dir, "*list")):
             items = os.path.basename(file_name).split(".")
             clustering_method = items[0]
@@ -230,7 +232,7 @@
         file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
         self.log("Processing taxa lineage config: %s" % str(file_name))
         with open(file_name, "r") as fh:
-            for i, line in enumerate(fh):
+            for line in fh:
                 line = line.strip()
                 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
                     # Skip blank lines, comments and section headers.
@@ -238,7 +240,7 @@
                 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
                 items = line.split("\t")
                 species_name = items[0]
-                self.log("Calculating the number of genes for species_name: %s" % str(species_name))
+                i = 0
                 for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
                     # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
                     species_genes_dict_key_items = species_genes_dict_key.split("^^")
@@ -263,6 +265,8 @@
                     """
                     self.update(sql, tuple(args))
                     self.flush()
+                    i += 1
+                self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name)))
 
     def process_orthogroup_fasta_files(self):
         """
@@ -313,16 +317,17 @@
                             sequence = adict[combined_id]
                             sequence = "%s%s" % (sequence, line)
                             adict[combined_id] = sequence
-        # Populate the plant_tribes_gene and gen_scaffold_association tables
+        # Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables
         # from the contents of aa_dict and dna_dict.
-        for combined_id in sorted(dna_dict.keys()):
+        self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.")
+        gi = 0
+        for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
             # The dictionary keys combine the orthogroup_id, clustering method and
             # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
             items = combined_id.split("^^")
             orthogroup_id = items[0]
             clustering_method = items[1]
             gene_id = items[2]
-            self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id))
             # The value will be a list containing both
             # clustering_method and the dna string.
             dna_sequence = dna_dict[combined_id]
@@ -364,6 +369,7 @@
                 cur = self.update(sql, tuple(args))
                 self.flush()
                 gene_id_db = cur.fetchone()[0]
+                gi += 1
             # Insert a row into the gene_scaffold_orthogroup_association table.
             # Get the scaffold_rec for the current scaffold_id and clustering_method.
             # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
@@ -377,7 +383,10 @@
             """
             cur = self.update(sql, tuple(args))
             self.flush()
-
+            if gsoai % 1000 == 0:
+                self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.")
+        self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
+        self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai)
 
 if __name__ == '__main__':
     scaffold_loader = ScaffoldLoader()