comparison add_scaffold.py @ 3:04ad7b5d22dd draft

Uploaded
author greg
date Tue, 22 May 2018 10:01:20 -0400
parents fdcced0f4ae4
children fa0822e74ed3
comparison
equal deleted inserted replaced
2:38b2da3cac1e 3:04ad7b5d22dd
16 class AddScaffold(object): 16 class AddScaffold(object):
17 def __init__(self): 17 def __init__(self):
18 self.args = None 18 self.args = None
19 self.clustering_methods = [] 19 self.clustering_methods = []
20 self.conn = None 20 self.conn = None
21 self.fh = None
22 self.gene_sequences_dict = {} 21 self.gene_sequences_dict = {}
23 self.scaffold_genes_dict = {} 22 self.scaffold_genes_dict = {}
24 self.scaffold_recs = [] 23 self.scaffold_recs = []
25 self.species_genes_dict = {} 24 self.species_genes_dict = {}
26 self.species_ids_dict = {} 25 self.species_ids_dict = {}
27 self.taxa_lineage_config = None 26 self.taxa_lineage_config = None
28 self.__parse_args() 27 self.parse_args()
29 self.__connect_db() 28 self.fh = open(self.args.output, "w")
30 29 self.connect_db()
31 def __parse_args(self): 30
31 def parse_args(self):
32 parser = argparse.ArgumentParser() 32 parser = argparse.ArgumentParser()
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
34 parser.add_argument('--output', dest='output', help='Output dataset'), 34 parser.add_argument('--output', dest='output', help='Output dataset'),
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
36 self.args = parser.parse_args() 36 self.args = parser.parse_args()
37 37
38 def stop_err(msg): 38 def stop_err(msg):
39 sys.stderr.write(msg) 39 sys.stderr.write(msg)
40 self.fh.flush()
41 self.fh.close()
40 sys.exit(1) 42 sys.exit(1)
41 43
42 def __connect_db(self): 44 def connect_db(self):
43 url = make_url(self.args.database_connection_string) 45 url = make_url(self.args.database_connection_string)
44 self.fh.write('Connecting to database with URL: %s' % url) 46 self.fh.write('Connecting to database with URL: %s' % url)
45 args = url.translate_connect_args(username='user') 47 args = url.translate_connect_args(username='user')
46 args.update(url.query) 48 args.update(url.query)
47 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' 49 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
48 self.conn = psycopg2.connect(**args) 50 self.conn = psycopg2.connect(**args)
49 51
50 def _flush(self): 52 def flush(self):
51 self.conn.commit() 53 self.conn.commit()
52 54
53 def _shutdown(self): 55 def shutdown(self):
54 self.conn.close() 56 self.conn.close()
55 57
56 def _update(self, sql, args): 58 def update(self, sql, args):
57 try: 59 try:
58 cur = self.conn.cursor() 60 cur = self.conn.cursor()
59 cur.execute(sql, args) 61 cur.execute(sql, args)
60 except Exception as e: 62 except Exception as e:
61 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args)) 63 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args))
75 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id) 77 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id)
76 except: 78 except:
77 # The scaffold has not yet been added. 79 # The scaffold has not yet been added.
78 pass 80 pass
79 81
80 def _run(self): 82 def run(self):
81 self.check_scaffold() 83 self.check_scaffold()
82 with open(self.args.output, "w") as fh: 84 self.process_annot_dir()
83 self.fh = fh 85 self.process_scaffold_config_files()
84 self.process_annot_dir(self.fh) 86 self.process_orthogroup_fasta_files()
85 self.fh.flush() 87 self.fh.flush()
86 self.process_scaffold_config_files(fh)
87 self.fh.flush()
88 self.process_orthogroup_fasta_files(fh)
89 self.fh.flush()
90 self.fh.close() 88 self.fh.close()
91 89
92 def process_annot_dir(self): 90 def process_annot_dir(self):
93 """ 91 """
94 First, parse all of the *.min_evalue.summary files in the 92 First, parse all of the *.min_evalue.summary files in the
119 sql = """ 117 sql = """
120 INSERT INTO plant_tribes_scaffold 118 INSERT INTO plant_tribes_scaffold
121 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) 119 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
122 RETURNING id; 120 RETURNING id;
123 """ 121 """
124 cur = self._update(sql, tuple(args)) 122 cur = self.update(sql, tuple(args))
125 self._flush() 123 self.flush()
126 scaffold_id_db = cur.fetchone()[0] 124 scaffold_id_db = cur.fetchone()[0]
127 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) 125 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
128 with open(file_name, "r") as fh: 126 with open(file_name, "r") as fh:
129 for i, line in enumerate(fh): 127 for i, line in enumerate(fh):
130 if i == 0: 128 if i == 0:
150 args.append('%s' % str(items[k])) 148 args.append('%s' % str(items[k]))
151 sql = """ 149 sql = """
152 INSERT INTO plant_tribes_orthogroup 150 INSERT INTO plant_tribes_orthogroup
153 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); 151 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
154 """ 152 """
155 cur = self._update(sql, tuple(args)) 153 cur = self.update(sql, tuple(args))
156 self._flush() 154 self.flush()
157 for file_name in glob.glob(os.path.join(file_dir, "*list")): 155 for file_name in glob.glob(os.path.join(file_dir, "*list")):
158 items = os.path.basename(file_name).split(".") 156 items = os.path.basename(file_name).split(".")
159 clustering_method = items[0] 157 clustering_method = items[0]
160 with open(file_name, "r") as fh: 158 with open(file_name, "r") as fh:
161 for i, line in enumerate(fh): 159 for i, line in enumerate(fh):
253 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] 251 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
254 sql = """ 252 sql = """
255 INSERT INTO plant_tribes_taxon 253 INSERT INTO plant_tribes_taxon
256 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); 254 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
257 """ 255 """
258 self._update(sql, tuple(args)) 256 self.update(sql, tuple(args))
259 self._flush() 257 self.flush()
260 258
261 def process_orthogroup_fasta_files(self): 259 def process_orthogroup_fasta_files(self):
262 scaffold_id = os.path.basename(self.args.scaffold_path) 260 scaffold_id = os.path.basename(self.args.scaffold_path)
263 aa_dict = {} 261 aa_dict = {}
264 dna_dict = {} 262 dna_dict = {}
347 sql = """ 345 sql = """
348 INSERT INTO plant_tribes_gene 346 INSERT INTO plant_tribes_gene
349 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s) 347 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s)
350 RETURNING id; 348 RETURNING id;
351 """ 349 """
352 cur = self._update(sql, tuple(args)) 350 cur = self.update(sql, tuple(args))
353 self._flush() 351 self.flush()
354 gene_id_db = cur.fetchone()[0] 352 gene_id_db = cur.fetchone()[0]
355 # Insert a row into the gene_scaffold_orthogroup_association table. 353 # Insert a row into the gene_scaffold_orthogroup_association table.
356 # Get the scaffold_rec for the current scaffold_id and clustering_method. 354 # Get the scaffold_rec for the current scaffold_id and clustering_method.
357 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] 355 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
358 for scaffold_rec in self.scaffold_recs: 356 for scaffold_rec in self.scaffold_recs:
361 args = [gene_id_db, scaffold_id_db, orthogroup_id_db] 359 args = [gene_id_db, scaffold_id_db, orthogroup_id_db]
362 sql = """ 360 sql = """
363 INSERT INTO gene_scaffold_orthogroup_association 361 INSERT INTO gene_scaffold_orthogroup_association
364 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); 362 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s);
365 """ 363 """
366 cur = self._update(sql, tuple(args)) 364 cur = self.update(sql, tuple(args))
367 self._flush() 365 self.flush()
368 366
369 367
370 if __name__ == '__main__': 368 if __name__ == '__main__':
371 add_scaffold = AddScaffold() 369 add_scaffold = AddScaffold()
372 add_scaffold._run() 370 add_scaffold.run()
373 add_scaffold._shutdown() 371 add_scaffold.shutdown()