comparison gene_family_scaffold_loader.py @ 6:9a4b0ae3d408 draft

Uploaded
author greg
date Tue, 05 Jun 2018 14:27:50 -0400
parents cb986be6355e
children 3841f7252b1d
comparison
equal deleted inserted replaced
5:cb986be6355e 6:9a4b0ae3d408
216 # Strip the version from the species_code. 216 # Strip the version from the species_code.
217 species_code = species_code[0:5] 217 species_code = species_code[0:5]
218 # Get the species_name from self.species_ids_dict. 218 # Get the species_name from self.species_ids_dict.
219 species_name = self.species_ids_dict[species_code] 219 species_name = self.species_ids_dict[species_code]
220 # Create a key for self.species_genes_dict, with the format: 220 # Create a key for self.species_genes_dict, with the format:
221 # <clustering_method>^^<species_code> 221 # <clustering_method>^^<species_name>
222 species_genes_dict_key = "%s^^%s" % (clustering_method, species_code) 222 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
223 # Add an entry to self.species_genes_dict, where the value 223 # Add an entry to self.species_genes_dict, where the value
224 # is a list containing species_name and num_genes. 224 # is a list containing species_name and num_genes.
225 if species_genes_dict_key in self.species_genes_dict: 225 if species_genes_dict_key in self.species_genes_dict:
226 tup = self.species_genes_dict[species_genes_dict_key] 226 tup = self.species_genes_dict[species_genes_dict_key]
227 tup[1] += 1 227 tup[1] += 1
239 continue 239 continue
240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots 240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
241 items = line.split("\t") 241 items = line.split("\t")
242 species_name = items[0] 242 species_name = items[0]
243 i = 0 243 i = 0
244 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): 244 for clustering_method in self.clustering_methods:
245 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. 245 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
246 # The format of species_genes_dict_key is <clustering_method>^^<species_name>.
246 species_genes_dict_key_items = species_genes_dict_key.split("^^") 247 species_genes_dict_key_items = species_genes_dict_key.split("^^")
247 clustering_method = species_genes_dict_key_items[0]
248 species_code = species_genes_dict_key_items[1]
249 # Get the scaffold_rec for the current scaffold_id and clustering_method. 248 # Get the scaffold_rec for the current scaffold_id and clustering_method.
250 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] 249 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
251 for scaffold_rec in self.scaffold_recs: 250 for scaffold_rec in self.scaffold_recs:
252 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: 251 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
253 scaffold_id_db = scaffold_rec[0] 252 scaffold_id_db = scaffold_rec[0]
271 def process_orthogroup_fasta_files(self): 270 def process_orthogroup_fasta_files(self):
272 """ 271 """
273 1. Analyze all of the scaffold .fna and .faa files for each clustering 272 1. Analyze all of the scaffold .fna and .faa files for each clustering
274 method to populate the aa_dict and dna_dict sequence dictionaries. 273 method to populate the aa_dict and dna_dict sequence dictionaries.
275 2. Use the populated sequence dictionaries to populate the plant_tribes_gene 274 2. Use the populated sequence dictionaries to populate the plant_tribes_gene
276 and gene_scaffold_orthogroup_association tables. 275 and gene_scaffold_orthogroup_taxon_association tables.
277 """ 276 """
278 scaffold_id = os.path.basename(self.args.scaffold_path) 277 scaffold_id = os.path.basename(self.args.scaffold_path)
279 aa_dict = {} 278 aa_dict = {}
280 dna_dict = {} 279 dna_dict = {}
281 # Populate aa_dict and dna_dict. 280 # Populate aa_dict and dna_dict.
315 # above will be the sequence associated with that gene until 314 # above will be the sequence associated with that gene until
316 # the next gene id line is encountered. 315 # the next gene id line is encountered.
317 sequence = adict[combined_id] 316 sequence = adict[combined_id]
318 sequence = "%s%s" % (sequence, line) 317 sequence = "%s%s" % (sequence, line)
319 adict[combined_id] = sequence 318 adict[combined_id] = sequence
320 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_association tables 319 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables
321 # from the contents of aa_dict and dna_dict. 320 # from the contents of aa_dict and dna_dict.
322 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables.") 321 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.")
323 gi = 0 322 gi = 0
324 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())): 323 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
325 # The dictionary keys combine the orthogroup_id, clustering method and 324 # The dictionary keys combine the orthogroup_id, clustering method and
326 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. 325 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
327 items = combined_id.split("^^") 326 items = combined_id.split("^^")
343 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id 342 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
344 cur = self.conn.cursor() 343 cur = self.conn.cursor()
345 cur.execute(sql) 344 cur.execute(sql)
346 orthogroup_id_db = cur.fetchone()[0] 345 orthogroup_id_db = cur.fetchone()[0]
347 # If the plant_tribes_gene table contains a row that has the gene_id, 346 # If the plant_tribes_gene table contains a row that has the gene_id,
348 # then we'll add a row only to the gene_scaffold_orthogroup_association table. 347 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
349 # Get the taxon_id for the species_name from the plant_tribes_taxon table. 348 # Get the taxon_id for the species_name from the plant_tribes_taxon table.
350 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name 349 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
351 cur = self.conn.cursor() 350 cur = self.conn.cursor()
352 cur.execute(sql) 351 cur.execute(sql)
353 taxon_id_db = cur.fetchone()[0] 352 taxon_id_db = cur.fetchone()[0]
354 # If the plant_tribes_gene table contains a row that has the gene_id, 353 # If the plant_tribes_gene table contains a row that has the gene_id,
355 # then we'll add a row only to the gene_scaffold_orthogroup_association table. 354 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
356 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id 355 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
357 cur = self.conn.cursor() 356 cur = self.conn.cursor()
358 cur.execute(sql) 357 cur.execute(sql)
359 try: 358 try:
360 gene_id_db = cur.fetchone()[0] 359 gene_id_db = cur.fetchone()[0]
370 self.flush() 369 self.flush()
371 gene_id_db = cur.fetchone()[0] 370 gene_id_db = cur.fetchone()[0]
372 gi += 1 371 gi += 1
373 if gi % 1000 == 0: 372 if gi % 1000 == 0:
374 self.log("Inserted 1000 more rows into the plant_tribes_gene table.") 373 self.log("Inserted 1000 more rows into the plant_tribes_gene table.")
375 # Insert a row into the gene_scaffold_orthogroup_association table. 374 # Insert a row into the gene_scaffold_orthogroup_taxon_association table.
376 # Get the scaffold_rec for the current scaffold_id and clustering_method. 375 # Get the scaffold_rec for the current scaffold_id and clustering_method.
377 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] 376 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
378 for scaffold_rec in self.scaffold_recs: 377 for scaffold_rec in self.scaffold_recs:
379 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: 378 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
380 scaffold_id_db = scaffold_rec[0] 379 scaffold_id_db = scaffold_rec[0]
384 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s); 383 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s);
385 """ 384 """
386 cur = self.update(sql, tuple(args)) 385 cur = self.update(sql, tuple(args))
387 self.flush() 386 self.flush()
388 if gsoai % 1000 == 0: 387 if gsoai % 1000 == 0:
389 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_association table.") 388 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.")
390 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi) 389 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
391 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_association table." % gsoai) 390 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai)
392 391
393 if __name__ == '__main__': 392 if __name__ == '__main__':
394 scaffold_loader = ScaffoldLoader() 393 scaffold_loader = ScaffoldLoader()
395 scaffold_loader.run() 394 scaffold_loader.run()
396 scaffold_loader.shutdown() 395 scaffold_loader.shutdown()