Mercurial > repos > greg > plant_tribes_add_scaffold
comparison add_scaffold.py @ 3:04ad7b5d22dd draft
Uploaded
author | greg |
---|---|
date | Tue, 22 May 2018 10:01:20 -0400 |
parents | fdcced0f4ae4 |
children | fa0822e74ed3 |
comparison
equal
deleted
inserted
replaced
2:38b2da3cac1e | 3:04ad7b5d22dd |
---|---|
16 class AddScaffold(object): | 16 class AddScaffold(object): |
17 def __init__(self): | 17 def __init__(self): |
18 self.args = None | 18 self.args = None |
19 self.clustering_methods = [] | 19 self.clustering_methods = [] |
20 self.conn = None | 20 self.conn = None |
21 self.fh = None | |
22 self.gene_sequences_dict = {} | 21 self.gene_sequences_dict = {} |
23 self.scaffold_genes_dict = {} | 22 self.scaffold_genes_dict = {} |
24 self.scaffold_recs = [] | 23 self.scaffold_recs = [] |
25 self.species_genes_dict = {} | 24 self.species_genes_dict = {} |
26 self.species_ids_dict = {} | 25 self.species_ids_dict = {} |
27 self.taxa_lineage_config = None | 26 self.taxa_lineage_config = None |
28 self.__parse_args() | 27 self.parse_args() |
29 self.__connect_db() | 28 self.fh = open(self.args.output, "w") |
30 | 29 self.connect_db() |
31 def __parse_args(self): | 30 |
31 def parse_args(self): | |
32 parser = argparse.ArgumentParser() | 32 parser = argparse.ArgumentParser() |
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), |
34 parser.add_argument('--output', dest='output', help='Output dataset'), | 34 parser.add_argument('--output', dest='output', help='Output dataset'), |
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') | 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') |
36 self.args = parser.parse_args() | 36 self.args = parser.parse_args() |
37 | 37 |
38 def stop_err(msg): | 38 def stop_err(msg): |
39 sys.stderr.write(msg) | 39 sys.stderr.write(msg) |
40 self.fh.flush() | |
41 self.fh.close() | |
40 sys.exit(1) | 42 sys.exit(1) |
41 | 43 |
42 def __connect_db(self): | 44 def connect_db(self): |
43 url = make_url(self.args.database_connection_string) | 45 url = make_url(self.args.database_connection_string) |
44 self.fh.write('Connecting to database with URL: %s' % url) | 46 self.fh.write('Connecting to database with URL: %s' % url) |
45 args = url.translate_connect_args(username='user') | 47 args = url.translate_connect_args(username='user') |
46 args.update(url.query) | 48 args.update(url.query) |
47 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | 49 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' |
48 self.conn = psycopg2.connect(**args) | 50 self.conn = psycopg2.connect(**args) |
49 | 51 |
50 def _flush(self): | 52 def flush(self): |
51 self.conn.commit() | 53 self.conn.commit() |
52 | 54 |
53 def _shutdown(self): | 55 def shutdown(self): |
54 self.conn.close() | 56 self.conn.close() |
55 | 57 |
56 def _update(self, sql, args): | 58 def update(self, sql, args): |
57 try: | 59 try: |
58 cur = self.conn.cursor() | 60 cur = self.conn.cursor() |
59 cur.execute(sql, args) | 61 cur.execute(sql, args) |
60 except Exception as e: | 62 except Exception as e: |
61 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args)) | 63 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args)) |
75 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id) | 77 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id) |
76 except: | 78 except: |
77 # The scaffold has not yet been added. | 79 # The scaffold has not yet been added. |
78 pass | 80 pass |
79 | 81 |
80 def _run(self): | 82 def run(self): |
81 self.check_scaffold() | 83 self.check_scaffold() |
82 with open(self.args.output, "w") as fh: | 84 self.process_annot_dir() |
83 self.fh = fh | 85 self.process_scaffold_config_files() |
84 self.process_annot_dir(self.fh) | 86 self.process_orthogroup_fasta_files() |
85 self.fh.flush() | 87 self.fh.flush() |
86 self.process_scaffold_config_files(fh) | |
87 self.fh.flush() | |
88 self.process_orthogroup_fasta_files(fh) | |
89 self.fh.flush() | |
90 self.fh.close() | 88 self.fh.close() |
91 | 89 |
92 def process_annot_dir(self): | 90 def process_annot_dir(self): |
93 """ | 91 """ |
94 First, parse all of the *.min_evalue.summary files in the | 92 First, parse all of the *.min_evalue.summary files in the |
119 sql = """ | 117 sql = """ |
120 INSERT INTO plant_tribes_scaffold | 118 INSERT INTO plant_tribes_scaffold |
121 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | 119 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) |
122 RETURNING id; | 120 RETURNING id; |
123 """ | 121 """ |
124 cur = self._update(sql, tuple(args)) | 122 cur = self.update(sql, tuple(args)) |
125 self._flush() | 123 self.flush() |
126 scaffold_id_db = cur.fetchone()[0] | 124 scaffold_id_db = cur.fetchone()[0] |
127 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) | 125 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) |
128 with open(file_name, "r") as fh: | 126 with open(file_name, "r") as fh: |
129 for i, line in enumerate(fh): | 127 for i, line in enumerate(fh): |
130 if i == 0: | 128 if i == 0: |
150 args.append('%s' % str(items[k])) | 148 args.append('%s' % str(items[k])) |
151 sql = """ | 149 sql = """ |
152 INSERT INTO plant_tribes_orthogroup | 150 INSERT INTO plant_tribes_orthogroup |
153 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); | 151 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); |
154 """ | 152 """ |
155 cur = self._update(sql, tuple(args)) | 153 cur = self.update(sql, tuple(args)) |
156 self._flush() | 154 self.flush() |
157 for file_name in glob.glob(os.path.join(file_dir, "*list")): | 155 for file_name in glob.glob(os.path.join(file_dir, "*list")): |
158 items = os.path.basename(file_name).split(".") | 156 items = os.path.basename(file_name).split(".") |
159 clustering_method = items[0] | 157 clustering_method = items[0] |
160 with open(file_name, "r") as fh: | 158 with open(file_name, "r") as fh: |
161 for i, line in enumerate(fh): | 159 for i, line in enumerate(fh): |
253 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] | 251 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] |
254 sql = """ | 252 sql = """ |
255 INSERT INTO plant_tribes_taxon | 253 INSERT INTO plant_tribes_taxon |
256 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); | 254 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); |
257 """ | 255 """ |
258 self._update(sql, tuple(args)) | 256 self.update(sql, tuple(args)) |
259 self._flush() | 257 self.flush() |
260 | 258 |
261 def process_orthogroup_fasta_files(self): | 259 def process_orthogroup_fasta_files(self): |
262 scaffold_id = os.path.basename(self.args.scaffold_path) | 260 scaffold_id = os.path.basename(self.args.scaffold_path) |
263 aa_dict = {} | 261 aa_dict = {} |
264 dna_dict = {} | 262 dna_dict = {} |
347 sql = """ | 345 sql = """ |
348 INSERT INTO plant_tribes_gene | 346 INSERT INTO plant_tribes_gene |
349 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s) | 347 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s) |
350 RETURNING id; | 348 RETURNING id; |
351 """ | 349 """ |
352 cur = self._update(sql, tuple(args)) | 350 cur = self.update(sql, tuple(args)) |
353 self._flush() | 351 self.flush() |
354 gene_id_db = cur.fetchone()[0] | 352 gene_id_db = cur.fetchone()[0] |
355 # Insert a row into the gene_scaffold_orthogroup_association table. | 353 # Insert a row into the gene_scaffold_orthogroup_association table. |
356 # Get the scaffold_rec for the current scaffold_id and clustering_method. | 354 # Get the scaffold_rec for the current scaffold_id and clustering_method. |
357 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | 355 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] |
358 for scaffold_rec in self.scaffold_recs: | 356 for scaffold_rec in self.scaffold_recs: |
361 args = [gene_id_db, scaffold_id_db, orthogroup_id_db] | 359 args = [gene_id_db, scaffold_id_db, orthogroup_id_db] |
362 sql = """ | 360 sql = """ |
363 INSERT INTO gene_scaffold_orthogroup_association | 361 INSERT INTO gene_scaffold_orthogroup_association |
364 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); | 362 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); |
365 """ | 363 """ |
366 cur = self._update(sql, tuple(args)) | 364 cur = self.update(sql, tuple(args)) |
367 self._flush() | 365 self.flush() |
368 | 366 |
369 | 367 |
370 if __name__ == '__main__': | 368 if __name__ == '__main__': |
371 add_scaffold = AddScaffold() | 369 add_scaffold = AddScaffold() |
372 add_scaffold._run() | 370 add_scaffold.run() |
373 add_scaffold._shutdown() | 371 add_scaffold.shutdown() |