Mercurial > repos > greg > plant_tribes_add_scaffold
comparison add_scaffold.py @ 9:3706dea6c2f1 draft default tip
Uploaded
author | greg |
---|---|
date | Tue, 22 May 2018 10:49:03 -0400 |
parents | 9b2ede3e7100 |
children |
comparison
equal
deleted
inserted
replaced
8:9b2ede3e7100 | 9:3706dea6c2f1 |
---|---|
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), |
34 parser.add_argument('--output', dest='output', help='Output dataset'), | 34 parser.add_argument('--output', dest='output', help='Output dataset'), |
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') | 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') |
36 self.args = parser.parse_args() | 36 self.args = parser.parse_args() |
37 | 37 |
38 def stop_err(self, msg): | |
39 sys.stderr.write(msg) | |
40 self.fh.flush() | |
41 self.fh.close() | |
42 sys.exit(1) | |
43 | |
44 def connect_db(self): | 38 def connect_db(self): |
45 url = make_url(self.args.database_connection_string) | 39 url = make_url(self.args.database_connection_string) |
46 self.fh.write('Connecting to database with URL: %s' % url) | 40 self.log('Connecting to database with URL: %s' % url) |
47 args = url.translate_connect_args(username='user') | 41 args = url.translate_connect_args(username='user') |
48 args.update(url.query) | 42 args.update(url.query) |
49 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | 43 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' |
50 self.conn = psycopg2.connect(**args) | 44 self.conn = psycopg2.connect(**args) |
51 | 45 |
61 cur.execute(sql, args) | 55 cur.execute(sql, args) |
62 except Exception as e: | 56 except Exception as e: |
63 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) | 57 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) |
64 self.stop_err(msg) | 58 self.stop_err(msg) |
65 return cur | 59 return cur |
60 | |
61 def stop_err(self, msg): | |
62 sys.stderr.write(msg) | |
63 self.fh.flush() | |
64 self.fh.close() | |
65 sys.exit(1) | |
66 | |
67 def log(self, msg): | |
68 self.fh.write("%s\n" % msg) | |
69 self.fh.flush() | |
66 | 70 |
67 @property | 71 @property |
68 def can_add_scaffold(self): | 72 def can_add_scaffold(self): |
69 """ | 73 """ |
70 Make sure the scaffold has not already been added. | 74 Make sure the scaffold has not already been added. |
114 clustering_method = items[0] | 118 clustering_method = items[0] |
115 # Save all clustering methods for later processing. | 119 # Save all clustering methods for later processing. |
116 if clustering_method not in self.clustering_methods: | 120 if clustering_method not in self.clustering_methods: |
117 self.clustering_methods.append(clustering_method) | 121 self.clustering_methods.append(clustering_method) |
118 # Insert a row in to the plant_tribes_scaffold table. | 122 # Insert a row in to the plant_tribes_scaffold table. |
119 self.fh.write("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) | 123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) |
120 args = [scaffold_id, clustering_method] | 124 args = [scaffold_id, clustering_method] |
121 sql = """ | 125 sql = """ |
122 INSERT INTO plant_tribes_scaffold | 126 INSERT INTO plant_tribes_scaffold |
123 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | 127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) |
124 RETURNING id; | 128 RETURNING id; |
144 if j_int > 0: | 148 if j_int > 0: |
145 # The species has at least 1 gene | 149 # The species has at least 1 gene |
146 num_species += 1 | 150 num_species += 1 |
147 num_genes += j_int | 151 num_genes += j_int |
148 # Insert a row into the plant_tribes_orthogroup table. | 152 # Insert a row into the plant_tribes_orthogroup table. |
149 self.fh.write("Inserting a row into the plant_tribes_orthogroup table...") | 153 self.log("Inserting a row into the plant_tribes_orthogroup table...") |
150 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] | 154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] |
151 for k in range(super_ortho_start_index, len(items)): | 155 for k in range(super_ortho_start_index, len(items)): |
152 args.append('%s' % str(items[k])) | 156 args.append('%s' % str(items[k])) |
153 sql = """ | 157 sql = """ |
154 INSERT INTO plant_tribes_orthogroup | 158 INSERT INTO plant_tribes_orthogroup |
181 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to | 185 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to |
182 populate the plant_tribes_taxon table. | 186 populate the plant_tribes_taxon table. |
183 """ | 187 """ |
184 scaffold_id = os.path.basename(self.args.scaffold_path) | 188 scaffold_id = os.path.basename(self.args.scaffold_path) |
185 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) | 189 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) |
186 self.fh.write("Processing rooting order config: %s" % str(file_name)) | 190 self.log("Processing rooting order config: %s" % str(file_name)) |
187 # Populate self.species_ids_dict. | 191 # Populate self.species_ids_dict. |
188 with open(file_name, "r") as fh: | 192 with open(file_name, "r") as fh: |
189 for i, line in enumerate(fh): | 193 for i, line in enumerate(fh): |
190 line = line.strip() | 194 line = line.strip() |
191 if len(line) == 0 or line.startswith("#") or line.startswith("["): | 195 if len(line) == 0 or line.startswith("#") or line.startswith("["): |
222 self.species_genes_dict[species_genes_dict_key] = tup | 226 self.species_genes_dict[species_genes_dict_key] = tup |
223 else: | 227 else: |
224 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] | 228 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] |
225 # Populate the plant_tribes_taxon table. | 229 # Populate the plant_tribes_taxon table. |
226 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) | 230 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) |
227 self.fh.write("Processing taxa lineage config: %s" % str(file_name)) | 231 self.log("Processing taxa lineage config: %s" % str(file_name)) |
228 with open(file_name, "r") as fh: | 232 with open(file_name, "r") as fh: |
229 for i, line in enumerate(fh): | 233 for i, line in enumerate(fh): |
230 line = line.strip() | 234 line = line.strip() |
231 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): | 235 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): |
232 # Skip blank lines, comments and section headers. | 236 # Skip blank lines, comments and section headers. |
233 continue | 237 continue |
234 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots | 238 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots |
235 items = line.split("\t") | 239 items = line.split("\t") |
236 species_name = items[0] | 240 species_name = items[0] |
237 self.fh.write("Calculating the number of genes for species_name: %s" % str(species_name)) | 241 self.log("Calculating the number of genes for species_name: %s" % str(species_name)) |
238 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): | 242 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): |
239 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. | 243 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. |
240 species_genes_dict_key_items = species_genes_dict_key.split("^^") | 244 species_genes_dict_key_items = species_genes_dict_key.split("^^") |
241 clustering_method = species_genes_dict_key_items[0] | 245 clustering_method = species_genes_dict_key_items[0] |
242 species_code = species_genes_dict_key_items[1] | 246 species_code = species_genes_dict_key_items[1] |
310 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. | 314 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. |
311 items = combined_id.split("^^") | 315 items = combined_id.split("^^") |
312 orthogroup_id = items[0] | 316 orthogroup_id = items[0] |
313 clustering_method = items[1] | 317 clustering_method = items[1] |
314 gene_id = items[2] | 318 gene_id = items[2] |
315 self.fh.write("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) | 319 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) |
316 # The value will be a list containing both | 320 # The value will be a list containing both |
317 # clustering_method and the dna string. | 321 # clustering_method and the dna string. |
318 dna_sequence = dna_dict[combined_id] | 322 dna_sequence = dna_dict[combined_id] |
319 aa_sequence = aa_dict[combined_id] | 323 aa_sequence = aa_dict[combined_id] |
320 # Get the species_code from the gene_id. | 324 # Get the species_code from the gene_id. |