Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
comparison gene_family_scaffold_loader.py @ 6:9a4b0ae3d408 draft
Uploaded
author | greg |
---|---|
date | Tue, 05 Jun 2018 14:27:50 -0400 |
parents | cb986be6355e |
children | 3841f7252b1d |
comparison
equal
deleted
inserted
replaced
5:cb986be6355e | 6:9a4b0ae3d408 |
---|---|
216 # Strip the version from the species_code. | 216 # Strip the version from the species_code. |
217 species_code = species_code[0:5] | 217 species_code = species_code[0:5] |
218 # Get the species_name from self.species_ids_dict. | 218 # Get the species_name from self.species_ids_dict. |
219 species_name = self.species_ids_dict[species_code] | 219 species_name = self.species_ids_dict[species_code] |
220 # Create a key for self.species_genes_dict, with the format: | 220 # Create a key for self.species_genes_dict, with the format: |
221 # <clustering_method>^^<species_code> | 221 # <clustering_method>^^<species_name> |
222 species_genes_dict_key = "%s^^%s" % (clustering_method, species_code) | 222 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name) |
223 # Add an entry to self.species_genes_dict, where the value | 223 # Add an entry to self.species_genes_dict, where the value |
224 # is a list containing species_name and num_genes. | 224 # is a list containing species_name and num_genes. |
225 if species_genes_dict_key in self.species_genes_dict: | 225 if species_genes_dict_key in self.species_genes_dict: |
226 tup = self.species_genes_dict[species_genes_dict_key] | 226 tup = self.species_genes_dict[species_genes_dict_key] |
227 tup[1] += 1 | 227 tup[1] += 1 |
239 continue | 239 continue |
240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots | 240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots |
241 items = line.split("\t") | 241 items = line.split("\t") |
242 species_name = items[0] | 242 species_name = items[0] |
243 i = 0 | 243 i = 0 |
244 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): | 244 for clustering_method in self.clustering_methods: |
245 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. | 245 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name) |
| 246 # The format of species_genes_dict_key is <clustering_method>^^<species_name>. |
246 species_genes_dict_key_items = species_genes_dict_key.split("^^") | 247 species_genes_dict_key_items = species_genes_dict_key.split("^^") |
247 clustering_method = species_genes_dict_key_items[0] | |
248 species_code = species_genes_dict_key_items[1] | |
249 # Get the scaffold_rec for the current scaffold_id and clustering_method. | 248 # Get the scaffold_rec for the current scaffold_id and clustering_method. |
250 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | 249 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] |
251 for scaffold_rec in self.scaffold_recs: | 250 for scaffold_rec in self.scaffold_recs: |
252 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | 251 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: |
253 scaffold_id_db = scaffold_rec[0] | 252 scaffold_id_db = scaffold_rec[0] |
271 def process_orthogroup_fasta_files(self): | 270 def process_orthogroup_fasta_files(self): |
272 """ | 271 """ |
273 1. Analyze all of the scaffold .fna and .faa files for each clustering | 272 1. Analyze all of the scaffold .fna and .faa files for each clustering |
274 method to populate the aa_dict and dna_dict sequence dictionaries. | 273 method to populate the aa_dict and dna_dict sequence dictionaries. |
275 2. Use the populated sequence dictionaries to populate the plant_tribes_gene | 274 2. Use the populated sequence dictionaries to populate the plant_tribes_gene |
276 and gene_scaffold_orthogroup_association tables. | 275 and gene_scaffold_orthogroup_taxon_association tables. |
277 """ | 276 """ |
278 scaffold_id = os.path.basename(self.args.scaffold_path) | 277 scaffold_id = os.path.basename(self.args.scaffold_path) |
279 aa_dict = {} | 278 aa_dict = {} |
280 dna_dict = {} | 279 dna_dict = {} |
281 # Populate aa_dict and dna_dict. | 280 # Populate aa_dict and dna_dict. |
315 # above will be the sequence associated with that gene until | 314 # above will be the sequence associated with that gene until |
316 # the next gene id line is encountered. | 315 # the next gene id line is encountered. |
317 sequence = adict[combined_id] | 316 sequence = adict[combined_id] |
318 sequence = "%s%s" % (sequence, line) | 317 sequence = "%s%s" % (sequence, line) |
319 adict[combined_id] = sequence | 318 adict[combined_id] = sequence |
320 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables | 319 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables |
321 # from the contents of aa_dict and dna_dict. | 320 # from the contents of aa_dict and dna_dict. |
322 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.") | 321 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.") |
323 gi = 0 | 322 gi = 0 |
324 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())): | 323 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())): |
325 # The dictionary keys combine the orthogroup_id, clustering method and | 324 # The dictionary keys combine the orthogroup_id, clustering method and |
326 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. | 325 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. |
327 items = combined_id.split("^^") | 326 items = combined_id.split("^^") |
343 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id | 342 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id |
344 cur = self.conn.cursor() | 343 cur = self.conn.cursor() |
345 cur.execute(sql) | 344 cur.execute(sql) |
346 orthogroup_id_db = cur.fetchone()[0] | 345 orthogroup_id_db = cur.fetchone()[0] |
347 # If the plant_tribes_gene table contains a row that has the gene_id, | 346 # If the plant_tribes_gene table contains a row that has the gene_id, |
348 # then we'll add a row only to the gene_scaffold_orthogroup_association table. | 347 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table. |
349 # Get the taxon_id for the species_name from the plant_tribes_taxon table. | 348 # Get the taxon_id for the species_name from the plant_tribes_taxon table. |
350 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name | 349 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name |
351 cur = self.conn.cursor() | 350 cur = self.conn.cursor() |
352 cur.execute(sql) | 351 cur.execute(sql) |
353 taxon_id_db = cur.fetchone()[0] | 352 taxon_id_db = cur.fetchone()[0] |
354 # If the plant_tribes_gene table contains a row that has the gene_id, | 353 # If the plant_tribes_gene table contains a row that has the gene_id, |
355 # then we'll add a row only to the gene_scaffold_orthogroup_association table. | 354 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table. |
356 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id | 355 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id |
357 cur = self.conn.cursor() | 356 cur = self.conn.cursor() |
358 cur.execute(sql) | 357 cur.execute(sql) |
359 try: | 358 try: |
360 gene_id_db = cur.fetchone()[0] | 359 gene_id_db = cur.fetchone()[0] |
370 self.flush() | 369 self.flush() |
371 gene_id_db = cur.fetchone()[0] | 370 gene_id_db = cur.fetchone()[0] |
372 gi += 1 | 371 gi += 1 |
373 if gi % 1000 == 0: | 372 if gi % 1000 == 0: |
374 self.log("Inserted 1000 more rows into the plant_tribes_gene table.") | 373 self.log("Inserted 1000 more rows into the plant_tribes_gene table.") |
375 # Insert a row into the gene_scaffold_orthogroup_association table. | 374 # Insert a row into the gene_scaffold_orthogroup_taxon_association table. |
376 # Get the scaffold_rec for the current scaffold_id and clustering_method. | 375 # Get the scaffold_rec for the current scaffold_id and clustering_method. |
377 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | 376 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] |
378 for scaffold_rec in self.scaffold_recs: | 377 for scaffold_rec in self.scaffold_recs: |
379 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | 378 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: |
380 scaffold_id_db = scaffold_rec[0] | 379 scaffold_id_db = scaffold_rec[0] |
384 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s); | 383 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s); |
385 """ | 384 """ |
386 cur = self.update(sql, tuple(args)) | 385 cur = self.update(sql, tuple(args)) |
387 self.flush() | 386 self.flush() |
388 if gsoai % 1000 == 0: | 387 if gsoai % 1000 == 0: |
389 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.") | 388 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.") |
390 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi) | 389 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi) |
391 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai) | 390 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai) |
392 | 391 |
393 if __name__ == '__main__': | 392 if __name__ == '__main__': |
394 scaffold_loader = ScaffoldLoader() | 393 scaffold_loader = ScaffoldLoader() |
395 scaffold_loader.run() | 394 scaffold_loader.run() |
396 scaffold_loader.shutdown() | 395 scaffold_loader.shutdown() |