Mercurial > repos > greg > plant_tribes_add_scaffold
comparison add_scaffold.py @ 3:04ad7b5d22dd draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 22 May 2018 10:01:20 -0400 |
| parents | fdcced0f4ae4 |
| children | fa0822e74ed3 |
comparison
equal
deleted
inserted
replaced
| 2:38b2da3cac1e | 3:04ad7b5d22dd |
|---|---|
| 16 class AddScaffold(object): | 16 class AddScaffold(object): |
| 17 def __init__(self): | 17 def __init__(self): |
| 18 self.args = None | 18 self.args = None |
| 19 self.clustering_methods = [] | 19 self.clustering_methods = [] |
| 20 self.conn = None | 20 self.conn = None |
| 21 self.fh = None | |
| 22 self.gene_sequences_dict = {} | 21 self.gene_sequences_dict = {} |
| 23 self.scaffold_genes_dict = {} | 22 self.scaffold_genes_dict = {} |
| 24 self.scaffold_recs = [] | 23 self.scaffold_recs = [] |
| 25 self.species_genes_dict = {} | 24 self.species_genes_dict = {} |
| 26 self.species_ids_dict = {} | 25 self.species_ids_dict = {} |
| 27 self.taxa_lineage_config = None | 26 self.taxa_lineage_config = None |
| 28 self.__parse_args() | 27 self.parse_args() |
| 29 self.__connect_db() | 28 self.fh = open(self.args.output, "w") |
| 30 | 29 self.connect_db() |
| 31 def __parse_args(self): | 30 |
| | 31 def parse_args(self): |
| 32 parser = argparse.ArgumentParser() | 32 parser = argparse.ArgumentParser() |
| 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), |
| 34 parser.add_argument('--output', dest='output', help='Output dataset'), | 34 parser.add_argument('--output', dest='output', help='Output dataset'), |
| 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') | 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') |
| 36 self.args = parser.parse_args() | 36 self.args = parser.parse_args() |
| 37 | 37 |
| 38 def stop_err(msg): | 38 def stop_err(msg): |
| 39 sys.stderr.write(msg) | 39 sys.stderr.write(msg) |
| | 40 self.fh.flush() |
| | 41 self.fh.close() |
| 40 sys.exit(1) | 42 sys.exit(1) |
| 41 | 43 |
| 42 def __connect_db(self): | 44 def connect_db(self): |
| 43 url = make_url(self.args.database_connection_string) | 45 url = make_url(self.args.database_connection_string) |
| 44 self.fh.write('Connecting to database with URL: %s' % url) | 46 self.fh.write('Connecting to database with URL: %s' % url) |
| 45 args = url.translate_connect_args(username='user') | 47 args = url.translate_connect_args(username='user') |
| 46 args.update(url.query) | 48 args.update(url.query) |
| 47 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | 49 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' |
| 48 self.conn = psycopg2.connect(**args) | 50 self.conn = psycopg2.connect(**args) |
| 49 | 51 |
| 50 def _flush(self): | 52 def flush(self): |
| 51 self.conn.commit() | 53 self.conn.commit() |
| 52 | 54 |
| 53 def _shutdown(self): | 55 def shutdown(self): |
| 54 self.conn.close() | 56 self.conn.close() |
| 55 | 57 |
| 56 def _update(self, sql, args): | 58 def update(self, sql, args): |
| 57 try: | 59 try: |
| 58 cur = self.conn.cursor() | 60 cur = self.conn.cursor() |
| 59 cur.execute(sql, args) | 61 cur.execute(sql, args) |
| 60 except Exception as e: | 62 except Exception as e: |
| 61 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args)) | 63 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args)) |
| 75 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id) | 77 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id) |
| 76 except: | 78 except: |
| 77 # The scaffold has not yet been added. | 79 # The scaffold has not yet been added. |
| 78 pass | 80 pass |
| 79 | 81 |
| 80 def _run(self): | 82 def run(self): |
| 81 self.check_scaffold() | 83 self.check_scaffold() |
| 82 with open(self.args.output, "w") as fh: | 84 self.process_annot_dir() |
| 83 self.fh = fh | 85 self.process_scaffold_config_files() |
| 84 self.process_annot_dir(self.fh) | 86 self.process_orthogroup_fasta_files() |
| 85 self.fh.flush() | 87 self.fh.flush() |
| 86 self.process_scaffold_config_files(fh) | |
| 87 self.fh.flush() | |
| 88 self.process_orthogroup_fasta_files(fh) | |
| 89 self.fh.flush() | |
| 90 self.fh.close() | 88 self.fh.close() |
| 91 | 89 |
| 92 def process_annot_dir(self): | 90 def process_annot_dir(self): |
| 93 """ | 91 """ |
| 94 First, parse all of the *.min_evalue.summary files in the | 92 First, parse all of the *.min_evalue.summary files in the |
| 119 sql = """ | 117 sql = """ |
| 120 INSERT INTO plant_tribes_scaffold | 118 INSERT INTO plant_tribes_scaffold |
| 121 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | 119 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) |
| 122 RETURNING id; | 120 RETURNING id; |
| 123 """ | 121 """ |
| 124 cur = self._update(sql, tuple(args)) | 122 cur = self.update(sql, tuple(args)) |
| 125 self._flush() | 123 self.flush() |
| 126 scaffold_id_db = cur.fetchone()[0] | 124 scaffold_id_db = cur.fetchone()[0] |
| 127 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) | 125 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) |
| 128 with open(file_name, "r") as fh: | 126 with open(file_name, "r") as fh: |
| 129 for i, line in enumerate(fh): | 127 for i, line in enumerate(fh): |
| 130 if i == 0: | 128 if i == 0: |
| 150 args.append('%s' % str(items[k])) | 148 args.append('%s' % str(items[k])) |
| 151 sql = """ | 149 sql = """ |
| 152 INSERT INTO plant_tribes_orthogroup | 150 INSERT INTO plant_tribes_orthogroup |
| 153 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); | 151 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); |
| 154 """ | 152 """ |
| 155 cur = self._update(sql, tuple(args)) | 153 cur = self.update(sql, tuple(args)) |
| 156 self._flush() | 154 self.flush() |
| 157 for file_name in glob.glob(os.path.join(file_dir, "*list")): | 155 for file_name in glob.glob(os.path.join(file_dir, "*list")): |
| 158 items = os.path.basename(file_name).split(".") | 156 items = os.path.basename(file_name).split(".") |
| 159 clustering_method = items[0] | 157 clustering_method = items[0] |
| 160 with open(file_name, "r") as fh: | 158 with open(file_name, "r") as fh: |
| 161 for i, line in enumerate(fh): | 159 for i, line in enumerate(fh): |
| 253 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] | 251 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] |
| 254 sql = """ | 252 sql = """ |
| 255 INSERT INTO plant_tribes_taxon | 253 INSERT INTO plant_tribes_taxon |
| 256 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); | 254 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); |
| 257 """ | 255 """ |
| 258 self._update(sql, tuple(args)) | 256 self.update(sql, tuple(args)) |
| 259 self._flush() | 257 self.flush() |
| 260 | 258 |
| 261 def process_orthogroup_fasta_files(self): | 259 def process_orthogroup_fasta_files(self): |
| 262 scaffold_id = os.path.basename(self.args.scaffold_path) | 260 scaffold_id = os.path.basename(self.args.scaffold_path) |
| 263 aa_dict = {} | 261 aa_dict = {} |
| 264 dna_dict = {} | 262 dna_dict = {} |
| 347 sql = """ | 345 sql = """ |
| 348 INSERT INTO plant_tribes_gene | 346 INSERT INTO plant_tribes_gene |
| 349 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s) | 347 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s) |
| 350 RETURNING id; | 348 RETURNING id; |
| 351 """ | 349 """ |
| 352 cur = self._update(sql, tuple(args)) | 350 cur = self.update(sql, tuple(args)) |
| 353 self._flush() | 351 self.flush() |
| 354 gene_id_db = cur.fetchone()[0] | 352 gene_id_db = cur.fetchone()[0] |
| 355 # Insert a row into the gene_scaffold_orthogroup_association table. | 353 # Insert a row into the gene_scaffold_orthogroup_association table. |
| 356 # Get the scaffold_rec for the current scaffold_id and clustering_method. | 354 # Get the scaffold_rec for the current scaffold_id and clustering_method. |
| 357 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | 355 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] |
| 358 for scaffold_rec in self.scaffold_recs: | 356 for scaffold_rec in self.scaffold_recs: |
| 361 args = [gene_id_db, scaffold_id_db, orthogroup_id_db] | 359 args = [gene_id_db, scaffold_id_db, orthogroup_id_db] |
| 362 sql = """ | 360 sql = """ |
| 363 INSERT INTO gene_scaffold_orthogroup_association | 361 INSERT INTO gene_scaffold_orthogroup_association |
| 364 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); | 362 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); |
| 365 """ | 363 """ |
| 366 cur = self._update(sql, tuple(args)) | 364 cur = self.update(sql, tuple(args)) |
| 367 self._flush() | 365 self.flush() |
| 368 | 366 |
| 369 | 367 |
| 370 if __name__ == '__main__': | 368 if __name__ == '__main__': |
| 371 add_scaffold = AddScaffold() | 369 add_scaffold = AddScaffold() |
| 372 add_scaffold._run() | 370 add_scaffold.run() |
| 373 add_scaffold._shutdown() | 371 add_scaffold.shutdown() |
