Mercurial > repos > greg > plant_tribes_add_scaffold
comparison add_scaffold.py @ 9:3706dea6c2f1 draft default tip
Uploaded
| author | greg |
|---|---|
| date | Tue, 22 May 2018 10:49:03 -0400 |
| parents | 9b2ede3e7100 |
| children |
comparison
equal
deleted
inserted
replaced
| 8:9b2ede3e7100 | 9:3706dea6c2f1 |
|---|---|
| 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), |
| 34 parser.add_argument('--output', dest='output', help='Output dataset'), | 34 parser.add_argument('--output', dest='output', help='Output dataset'), |
| 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') | 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') |
| 36 self.args = parser.parse_args() | 36 self.args = parser.parse_args() |
| 37 | 37 |
| 38 def stop_err(self, msg): | |
| 39 sys.stderr.write(msg) | |
| 40 self.fh.flush() | |
| 41 self.fh.close() | |
| 42 sys.exit(1) | |
| 43 | |
| 44 def connect_db(self): | 38 def connect_db(self): |
| 45 url = make_url(self.args.database_connection_string) | 39 url = make_url(self.args.database_connection_string) |
| 46 self.fh.write('Connecting to database with URL: %s' % url) | 40 self.log('Connecting to database with URL: %s' % url) |
| 47 args = url.translate_connect_args(username='user') | 41 args = url.translate_connect_args(username='user') |
| 48 args.update(url.query) | 42 args.update(url.query) |
| 49 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | 43 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' |
| 50 self.conn = psycopg2.connect(**args) | 44 self.conn = psycopg2.connect(**args) |
| 51 | 45 |
| 61 cur.execute(sql, args) | 55 cur.execute(sql, args) |
| 62 except Exception as e: | 56 except Exception as e: |
| 63 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) | 57 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) |
| 64 self.stop_err(msg) | 58 self.stop_err(msg) |
| 65 return cur | 59 return cur |
| 60 | |
| 61 def stop_err(self, msg): | |
| 62 sys.stderr.write(msg) | |
| 63 self.fh.flush() | |
| 64 self.fh.close() | |
| 65 sys.exit(1) | |
| 66 | |
| 67 def log(self, msg): | |
| 68 self.fh.write("%s\n" % msg) | |
| 69 self.fh.flush() | |
| 66 | 70 |
| 67 @property | 71 @property |
| 68 def can_add_scaffold(self): | 72 def can_add_scaffold(self): |
| 69 """ | 73 """ |
| 70 Make sure the scaffold has not already been added. | 74 Make sure the scaffold has not already been added. |
| 114 clustering_method = items[0] | 118 clustering_method = items[0] |
| 115 # Save all clustering methods for later processing. | 119 # Save all clustering methods for later processing. |
| 116 if clustering_method not in self.clustering_methods: | 120 if clustering_method not in self.clustering_methods: |
| 117 self.clustering_methods.append(clustering_method) | 121 self.clustering_methods.append(clustering_method) |
| 118 # Insert a row in to the plant_tribes_scaffold table. | 122 # Insert a row in to the plant_tribes_scaffold table. |
| 119 self.fh.write("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) | 123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) |
| 120 args = [scaffold_id, clustering_method] | 124 args = [scaffold_id, clustering_method] |
| 121 sql = """ | 125 sql = """ |
| 122 INSERT INTO plant_tribes_scaffold | 126 INSERT INTO plant_tribes_scaffold |
| 123 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | 127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) |
| 124 RETURNING id; | 128 RETURNING id; |
| 144 if j_int > 0: | 148 if j_int > 0: |
| 145 # The species has at least 1 gene | 149 # The species has at least 1 gene |
| 146 num_species += 1 | 150 num_species += 1 |
| 147 num_genes += j_int | 151 num_genes += j_int |
| 148 # Insert a row into the plant_tribes_orthogroup table. | 152 # Insert a row into the plant_tribes_orthogroup table. |
| 149 self.fh.write("Inserting a row into the plant_tribes_orthogroup table...") | 153 self.log("Inserting a row into the plant_tribes_orthogroup table...") |
| 150 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] | 154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] |
| 151 for k in range(super_ortho_start_index, len(items)): | 155 for k in range(super_ortho_start_index, len(items)): |
| 152 args.append('%s' % str(items[k])) | 156 args.append('%s' % str(items[k])) |
| 153 sql = """ | 157 sql = """ |
| 154 INSERT INTO plant_tribes_orthogroup | 158 INSERT INTO plant_tribes_orthogroup |
| 181 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to | 185 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to |
| 182 populate the plant_tribes_taxon table. | 186 populate the plant_tribes_taxon table. |
| 183 """ | 187 """ |
| 184 scaffold_id = os.path.basename(self.args.scaffold_path) | 188 scaffold_id = os.path.basename(self.args.scaffold_path) |
| 185 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) | 189 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) |
| 186 self.fh.write("Processing rooting order config: %s" % str(file_name)) | 190 self.log("Processing rooting order config: %s" % str(file_name)) |
| 187 # Populate self.species_ids_dict. | 191 # Populate self.species_ids_dict. |
| 188 with open(file_name, "r") as fh: | 192 with open(file_name, "r") as fh: |
| 189 for i, line in enumerate(fh): | 193 for i, line in enumerate(fh): |
| 190 line = line.strip() | 194 line = line.strip() |
| 191 if len(line) == 0 or line.startswith("#") or line.startswith("["): | 195 if len(line) == 0 or line.startswith("#") or line.startswith("["): |
| 222 self.species_genes_dict[species_genes_dict_key] = tup | 226 self.species_genes_dict[species_genes_dict_key] = tup |
| 223 else: | 227 else: |
| 224 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] | 228 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] |
| 225 # Populate the plant_tribes_taxon table. | 229 # Populate the plant_tribes_taxon table. |
| 226 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) | 230 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) |
| 227 self.fh.write("Processing taxa lineage config: %s" % str(file_name)) | 231 self.log("Processing taxa lineage config: %s" % str(file_name)) |
| 228 with open(file_name, "r") as fh: | 232 with open(file_name, "r") as fh: |
| 229 for i, line in enumerate(fh): | 233 for i, line in enumerate(fh): |
| 230 line = line.strip() | 234 line = line.strip() |
| 231 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): | 235 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): |
| 232 # Skip blank lines, comments and section headers. | 236 # Skip blank lines, comments and section headers. |
| 233 continue | 237 continue |
| 234 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots | 238 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots |
| 235 items = line.split("\t") | 239 items = line.split("\t") |
| 236 species_name = items[0] | 240 species_name = items[0] |
| 237 self.fh.write("Calculating the number of genes for species_name: %s" % str(species_name)) | 241 self.log("Calculating the number of genes for species_name: %s" % str(species_name)) |
| 238 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): | 242 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): |
| 239 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. | 243 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. |
| 240 species_genes_dict_key_items = species_genes_dict_key.split("^^") | 244 species_genes_dict_key_items = species_genes_dict_key.split("^^") |
| 241 clustering_method = species_genes_dict_key_items[0] | 245 clustering_method = species_genes_dict_key_items[0] |
| 242 species_code = species_genes_dict_key_items[1] | 246 species_code = species_genes_dict_key_items[1] |
| 310 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. | 314 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. |
| 311 items = combined_id.split("^^") | 315 items = combined_id.split("^^") |
| 312 orthogroup_id = items[0] | 316 orthogroup_id = items[0] |
| 313 clustering_method = items[1] | 317 clustering_method = items[1] |
| 314 gene_id = items[2] | 318 gene_id = items[2] |
| 315 self.fh.write("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) | 319 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) |
| 316 # The value will be a list containing both | 320 # The value will be a list containing both |
| 317 # clustering_method and the dna string. | 321 # clustering_method and the dna string. |
| 318 dna_sequence = dna_dict[combined_id] | 322 dna_sequence = dna_dict[combined_id] |
| 319 aa_sequence = aa_dict[combined_id] | 323 aa_sequence = aa_dict[combined_id] |
| 320 # Get the species_code from the gene_id. | 324 # Get the species_code from the gene_id. |
