Mercurial > repos > greg > plant_tribes_add_scaffold
comparison add_scaffold.py @ 0:fdcced0f4ae4 draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 22 May 2018 09:40:43 -0400 |
| parents | |
| children | 04ad7b5d22dd |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:fdcced0f4ae4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes | |
| 4 database efficiently by bypassing the Galaxy model and operating directly on the database. | |
| 5 PostgreSQL 9.1 or greater is required. | |
| 6 """ | |
| 7 import argparse | |
| 8 import glob | |
| 9 import os | |
| 10 import sys | |
| 11 | |
| 12 import psycopg2 | |
| 13 from sqlalchemy.engine.url import make_url | |
| 14 | |
| 15 | |
| 16 class AddScaffold(object): | |
| 17 def __init__(self): | |
| 18 self.args = None | |
| 19 self.clustering_methods = [] | |
| 20 self.conn = None | |
| 21 self.fh = None | |
| 22 self.gene_sequences_dict = {} | |
| 23 self.scaffold_genes_dict = {} | |
| 24 self.scaffold_recs = [] | |
| 25 self.species_genes_dict = {} | |
| 26 self.species_ids_dict = {} | |
| 27 self.taxa_lineage_config = None | |
| 28 self.__parse_args() | |
| 29 self.__connect_db() | |
| 30 | |
| 31 def __parse_args(self): | |
| 32 parser = argparse.ArgumentParser() | |
| 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | |
| 34 parser.add_argument('--output', dest='output', help='Output dataset'), | |
| 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') | |
| 36 self.args = parser.parse_args() | |
| 37 | |
| 38 def stop_err(msg): | |
| 39 sys.stderr.write(msg) | |
| 40 sys.exit(1) | |
| 41 | |
| 42 def __connect_db(self): | |
| 43 url = make_url(self.args.database_connection_string) | |
| 44 self.fh.write('Connecting to database with URL: %s' % url) | |
| 45 args = url.translate_connect_args(username='user') | |
| 46 args.update(url.query) | |
| 47 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | |
| 48 self.conn = psycopg2.connect(**args) | |
| 49 | |
| 50 def _flush(self): | |
| 51 self.conn.commit() | |
| 52 | |
| 53 def _shutdown(self): | |
| 54 self.conn.close() | |
| 55 | |
| 56 def _update(self, sql, args): | |
| 57 try: | |
| 58 cur = self.conn.cursor() | |
| 59 cur.execute(sql, args) | |
| 60 except Exception as e: | |
| 61 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args)) | |
| 62 self.stop_err('Caught exception executing SQL:\n%s\nException:\n%s\n' % (sql.format(args), e)) | |
| 63 return cur | |
| 64 | |
| 65 def check_scaffold(self): | |
| 66 """ | |
| 67 Make sure the scaffold has not already been added. | |
| 68 """ | |
| 69 scaffold_id = os.path.basename(self.args.scaffold_path) | |
| 70 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id | |
| 71 cur = self.conn.cursor() | |
| 72 cur.execute(sql) | |
| 73 try: | |
| 74 cur.fetchone()[0] | |
| 75 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id) | |
| 76 except: | |
| 77 # The scaffold has not yet been added. | |
| 78 pass | |
| 79 | |
| 80 def _run(self): | |
| 81 self.check_scaffold() | |
| 82 with open(self.args.output, "w") as fh: | |
| 83 self.fh = fh | |
| 84 self.process_annot_dir(self.fh) | |
| 85 self.fh.flush() | |
| 86 self.process_scaffold_config_files(fh) | |
| 87 self.fh.flush() | |
| 88 self.process_orthogroup_fasta_files(fh) | |
| 89 self.fh.flush() | |
| 90 self.fh.close() | |
| 91 | |
| 92 def process_annot_dir(self): | |
| 93 """ | |
| 94 First, parse all of the *.min_evalue.summary files in the | |
| 95 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate | |
| 96 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. | |
| 97 Next, parse all of the *.list files in the same directory to populate | |
| 98 self.scaffold_genes_dict. | |
| 99 """ | |
| 100 scaffold_id = os.path.basename(self.args.scaffold_path) | |
| 101 file_dir = os.path.join(self.args.scaffold_path, 'annot') | |
| 102 # The scaffol naming convention must follow this pattern: | |
| 103 # <integer1>Gv<integer2>.<integer3> | |
| 104 # where integer 1 is the number of genomes in the scaffold_id. For example: | |
| 105 # 22Gv1.1 -> 22 genomes | |
| 106 # 12Gv1.0 -> 12 genomes | |
| 107 # 26Gv2.0 -> 26 genomes, etc. | |
| 108 num_genomes = int(scaffold_id.split("Gv")[0]) | |
| 109 super_ortho_start_index = num_genomes + 1 | |
| 110 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")): | |
| 111 items = os.path.basename(file_name).split(".") | |
| 112 clustering_method = items[0] | |
| 113 # Save all clustering methods for later processing. | |
| 114 if clustering_method not in self.clustering_methods: | |
| 115 self.clustering_methods.append(clustering_method) | |
| 116 # Insert a row in to the plant_tribes_scaffold table. | |
| 117 self.fh.write("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) | |
| 118 args = [scaffold_id, clustering_method] | |
| 119 sql = """ | |
| 120 INSERT INTO plant_tribes_scaffold | |
| 121 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | |
| 122 RETURNING id; | |
| 123 """ | |
| 124 cur = self._update(sql, tuple(args)) | |
| 125 self._flush() | |
| 126 scaffold_id_db = cur.fetchone()[0] | |
| 127 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) | |
| 128 with open(file_name, "r") as fh: | |
| 129 for i, line in enumerate(fh): | |
| 130 if i == 0: | |
| 131 # Skip first line. | |
| 132 continue | |
| 133 num_genes = 0 | |
| 134 num_species = 0 | |
| 135 items = line.split("\t") | |
| 136 orthogroup_id = int(items[0]) | |
| 137 # Zero based items 1 to num_genomes consists of the | |
| 138 # number of species classified in the orthogroup (i.e., | |
| 139 # species with at least 1 gene in the orthogroup). | |
| 140 for j in range(1, num_genomes): | |
| 141 j_int = int(items[j]) | |
| 142 if j_int > 0: | |
| 143 # The species has at least 1 gene | |
| 144 num_species += 1 | |
| 145 num_genes += j_int | |
| 146 # Insert a row into the plant_tribes_orthogroup table. | |
| 147 self.fh.write("Inserting a row into the plant_tribes_orthogroup table...") | |
| 148 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] | |
| 149 for k in range(super_ortho_start_index, len(items)): | |
| 150 args.append('%s' % str(items[k])) | |
| 151 sql = """ | |
| 152 INSERT INTO plant_tribes_orthogroup | |
| 153 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); | |
| 154 """ | |
| 155 cur = self._update(sql, tuple(args)) | |
| 156 self._flush() | |
| 157 for file_name in glob.glob(os.path.join(file_dir, "*list")): | |
| 158 items = os.path.basename(file_name).split(".") | |
| 159 clustering_method = items[0] | |
| 160 with open(file_name, "r") as fh: | |
| 161 for i, line in enumerate(fh): | |
| 162 items = line.split("\t") | |
| 163 # The key will be a combination of clustering_method and | |
| 164 # orthogroup_id separated by "^^" for easy splitting later. | |
| 165 key = "%s^^%s" % (clustering_method, items[0]) | |
| 166 # The value is the gen_id with all white space replaced by "_". | |
| 167 val = items[1].replace("|", "_") | |
| 168 if key in self.scaffold_genes_dict: | |
| 169 self.scaffold_genes_dict[key].append(val) | |
| 170 else: | |
| 171 self.scaffold_genes_dict[key] = [val] | |
| 172 | |
| 173 def process_scaffold_config_files(self): | |
| 174 """ | |
| 175 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config | |
| 176 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate. | |
| 177 2. Calculate the number of genes found | |
| 178 for each species and add the number to self.species_genes_dict. | |
| 179 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to | |
| 180 populate the plant_tribes_taxon table. | |
| 181 """ | |
| 182 scaffold_id = os.path.basename(self.args.scaffold_path) | |
| 183 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) | |
| 184 self.fh.write("Processing rooting order config: %s" % str(file_name)) | |
| 185 # Populate self.species_ids_dict. | |
| 186 with open(file_name, "r") as fh: | |
| 187 for i, line in enumerate(fh): | |
| 188 line = line.strip() | |
| 189 if len(line) == 0 or line.startswith("#") or line.startswith("["): | |
| 190 # Skip blank lines, comments and section headers. | |
| 191 continue | |
| 192 # Example line: | |
| 193 # Physcomitrella patens=Phypa | |
| 194 items = line.split("=") | |
| 195 self.species_ids_dict[items[1]] = items[0] | |
| 196 # Get lineage information for orthogrpoup taxa. | |
| 197 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()): | |
| 198 # The format of key is <clustering_method>^^<orthogroup_id>. | |
| 199 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001" | |
| 200 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^") | |
| 201 clustering_method = scaffold_genes_dict_key_items[0] | |
| 202 # Get the list of genes for the current scaffold_genes_dict_key. | |
| 203 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key] | |
| 204 for gene_id in gene_list: | |
| 205 # Example species_code: Musac1.0, where | |
| 206 # species_name is Musac and version is 1.0. | |
| 207 species_code = gene_id.split("_")[1] | |
| 208 # Strip the version from the species_code. | |
| 209 species_code = species_code[0:5] | |
| 210 # Get the species_name from self.species_ids_dict. | |
| 211 species_name = self.species_ids_dict[species_code] | |
| 212 # Create a key for self.species_genes_dict, with the format: | |
| 213 # <clustering_method>^^<species_code> | |
| 214 species_genes_dict_key = "%s^^%s" % (clustering_method, species_code) | |
| 215 # Add an entry to self.species_genes_dict, where the value | |
| 216 # is a list containing species_name and num_genes. | |
| 217 if species_genes_dict_key in self.species_genes_dict: | |
| 218 tup = self.species_genes_dict[species_genes_dict_key] | |
| 219 tup[1] += 1 | |
| 220 self.species_genes_dict[species_genes_dict_key] = tup | |
| 221 else: | |
| 222 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] | |
| 223 # Populate the plant_tribes_taxon table. | |
| 224 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) | |
| 225 self.fh.write("Processing taxa lineage config: %s" % str(file_name)) | |
| 226 with open(file_name, "r") as fh: | |
| 227 for i, line in enumerate(fh): | |
| 228 line = line.strip() | |
| 229 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): | |
| 230 # Skip blank lines, comments and section headers. | |
| 231 continue | |
| 232 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots | |
| 233 items = line.split("\t") | |
| 234 species_name = items[0] | |
| 235 self.fh.write("Calculating the number of genes for species_name: %s" % str(species_name)) | |
| 236 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): | |
| 237 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. | |
| 238 species_genes_dict_key_items = species_genes_dict_key.split("^^") | |
| 239 clustering_method = species_genes_dict_key_items[0] | |
| 240 species_code = species_genes_dict_key_items[1] | |
| 241 # Get the scaffold_rec for the current scaffold_id and clustering_method. | |
| 242 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | |
| 243 for scaffold_rec in self.scaffold_recs: | |
| 244 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | |
| 245 scaffold_id_db = scaffold_rec[0] | |
| 246 # The value is a list containing species_name and num_genes. | |
| 247 val = self.species_genes_dict[species_genes_dict_key] | |
| 248 if species_name == val[0]: | |
| 249 num_genes = val[1] | |
| 250 else: | |
| 251 num_genes = 0 | |
| 252 # Insert a row in to the plant_tribes_scaffold table. | |
| 253 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] | |
| 254 sql = """ | |
| 255 INSERT INTO plant_tribes_taxon | |
| 256 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); | |
| 257 """ | |
| 258 self._update(sql, tuple(args)) | |
| 259 self._flush() | |
| 260 | |
| 261 def process_orthogroup_fasta_files(self): | |
| 262 scaffold_id = os.path.basename(self.args.scaffold_path) | |
| 263 aa_dict = {} | |
| 264 dna_dict = {} | |
| 265 # Populate aa_dict and dna_dict. | |
| 266 for clustering_method in self.clustering_methods: | |
| 267 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method) | |
| 268 for file_name in os.listdir(file_dir): | |
| 269 items = file_name.split(".") | |
| 270 orthogroup_id = items[0] | |
| 271 file_extension = items[1] | |
| 272 if file_extension == "fna": | |
| 273 adict = dna_dict | |
| 274 else: | |
| 275 adict = aa_dict | |
| 276 file_path = os.path.join(file_dir, file_name) | |
| 277 with open(file_path, "r") as fh: | |
| 278 for i, line in enumerate(fh): | |
| 279 line = line.strip() | |
| 280 if len(line) == 0: | |
| 281 # Skip blank lines (shoudn't happen). | |
| 282 continue | |
| 283 if line.startswith(">"): | |
| 284 # Example line: | |
| 285 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110 | |
| 286 gene_id = line.lstrip(">") | |
| 287 # The dictionary keys will combine the orthogroup_id, | |
| 288 # clustering method and gene id using the format | |
| 289 # ,orthogroup_id>^^<clustering_method>^^<gene_id>. | |
| 290 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id) | |
| 291 if combined_id not in adict: | |
| 292 # The value will be the dna sequence string.. | |
| 293 adict[combined_id] = "" | |
| 294 else: | |
| 295 # Example line: | |
| 296 # ATGGAGAAGGACTTT | |
| 297 # Here combined_id is set because the fasta format specifies | |
| 298 # that all lines following the gene id defined in the if block | |
| 299 # above will be the sequence associated with that gene until | |
| 300 # the next gene id line is encountered. | |
| 301 sequence = adict[combined_id] | |
| 302 sequence = "%s%s" % (sequence, line) | |
| 303 adict[combined_id] = sequence | |
| 304 # Populate the plant_tribes_gene and gen_scaffold_association tables | |
| 305 # from the contents of aa_dict and dna_dict. | |
| 306 for combined_id in sorted(dna_dict.keys()): | |
| 307 # The dictionary keys combine the orthogroup_id, clustering method and | |
| 308 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. | |
| 309 items = combined_id.split("^^") | |
| 310 orthogroup_id = items[0] | |
| 311 clustering_method = items[1] | |
| 312 gene_id = items[2] | |
| 313 self.fh.write("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) | |
| 314 # The value will be a list containing both | |
| 315 # clustering_method and the dna string. | |
| 316 dna_sequence = dna_dict[combined_id] | |
| 317 aa_sequence = aa_dict[combined_id] | |
| 318 # Get the species_code from the gene_id. | |
| 319 species_code = gene_id.split("_")[1] | |
| 320 # Strip the version from the species_code. | |
| 321 species_code = species_code[0:5] | |
| 322 # Get the species_name from self.species_ids_dict. | |
| 323 species_name = self.species_ids_dict[species_code] | |
| 324 # Get the plant_tribes_orthogroup primary key id for | |
| 325 # the orthogroup_id from the plant_tribes_orthogroup table. | |
| 326 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id | |
| 327 cur = self.conn.cursor() | |
| 328 cur.execute(sql) | |
| 329 orthogroup_id_db = cur.fetchone()[0] | |
| 330 # If the plant_tribes_gene table contains a row that has the gene_id, | |
| 331 # then we'll add a row only to the gene_scaffold_orthogroup_association table. | |
| 332 # Get the taxon_id for the species_name from the plant_tribes_taxon table. | |
| 333 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name | |
| 334 cur = self.conn.cursor() | |
| 335 cur.execute(sql) | |
| 336 taxon_id = cur.fetchone()[0] | |
| 337 # If the plant_tribes_gene table contains a row that has the gene_id, | |
| 338 # then we'll add a row only to the gene_scaffold_orthogroup_association table. | |
| 339 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id | |
| 340 cur = self.conn.cursor() | |
| 341 cur.execute(sql) | |
| 342 try: | |
| 343 gene_id_db = cur.fetchone()[0] | |
| 344 except: | |
| 345 # Insert a row into the plant_tribes_gene table. | |
| 346 args = [gene_id, taxon_id, dna_sequence, aa_sequence] | |
| 347 sql = """ | |
| 348 INSERT INTO plant_tribes_gene | |
| 349 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s) | |
| 350 RETURNING id; | |
| 351 """ | |
| 352 cur = self._update(sql, tuple(args)) | |
| 353 self._flush() | |
| 354 gene_id_db = cur.fetchone()[0] | |
| 355 # Insert a row into the gene_scaffold_orthogroup_association table. | |
| 356 # Get the scaffold_rec for the current scaffold_id and clustering_method. | |
| 357 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | |
| 358 for scaffold_rec in self.scaffold_recs: | |
| 359 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | |
| 360 scaffold_id_db = scaffold_rec[0] | |
| 361 args = [gene_id_db, scaffold_id_db, orthogroup_id_db] | |
| 362 sql = """ | |
| 363 INSERT INTO gene_scaffold_orthogroup_association | |
| 364 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s); | |
| 365 """ | |
| 366 cur = self._update(sql, tuple(args)) | |
| 367 self._flush() | |
| 368 | |
| 369 | |
| 370 if __name__ == '__main__': | |
| 371 add_scaffold = AddScaffold() | |
| 372 add_scaffold._run() | |
| 373 add_scaffold._shutdown() |
