comparison add_scaffold.py @ 9:3706dea6c2f1 draft default tip

Uploaded
author greg
date Tue, 22 May 2018 10:49:03 -0400
parents 9b2ede3e7100
children
comparison
equal deleted inserted replaced
8:9b2ede3e7100 9:3706dea6c2f1
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), 33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
34 parser.add_argument('--output', dest='output', help='Output dataset'), 34 parser.add_argument('--output', dest='output', help='Output dataset'),
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') 35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
36 self.args = parser.parse_args() 36 self.args = parser.parse_args()
37 37
38 def stop_err(self, msg):
39 sys.stderr.write(msg)
40 self.fh.flush()
41 self.fh.close()
42 sys.exit(1)
43
44 def connect_db(self): 38 def connect_db(self):
45 url = make_url(self.args.database_connection_string) 39 url = make_url(self.args.database_connection_string)
46 self.fh.write('Connecting to database with URL: %s' % url) 40 self.log('Connecting to database with URL: %s' % url)
47 args = url.translate_connect_args(username='user') 41 args = url.translate_connect_args(username='user')
48 args.update(url.query) 42 args.update(url.query)
49 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' 43 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
50 self.conn = psycopg2.connect(**args) 44 self.conn = psycopg2.connect(**args)
51 45
61 cur.execute(sql, args) 55 cur.execute(sql, args)
62 except Exception as e: 56 except Exception as e:
63 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) 57 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e)
64 self.stop_err(msg) 58 self.stop_err(msg)
65 return cur 59 return cur
60
61 def stop_err(self, msg):
62 sys.stderr.write(msg)
63 self.fh.flush()
64 self.fh.close()
65 sys.exit(1)
66
67 def log(self, msg):
68 self.fh.write("%s\n" % msg)
69 self.fh.flush()
66 70
67 @property 71 @property
68 def can_add_scaffold(self): 72 def can_add_scaffold(self):
69 """ 73 """
70 Make sure the scaffold has not already been added. 74 Make sure the scaffold has not already been added.
114 clustering_method = items[0] 118 clustering_method = items[0]
115 # Save all clustering methods for later processing. 119 # Save all clustering methods for later processing.
116 if clustering_method not in self.clustering_methods: 120 if clustering_method not in self.clustering_methods:
117 self.clustering_methods.append(clustering_method) 121 self.clustering_methods.append(clustering_method)
118 # Insert a row in to the plant_tribes_scaffold table. 122 # Insert a row in to the plant_tribes_scaffold table.
119 self.fh.write("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method)) 123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method))
120 args = [scaffold_id, clustering_method] 124 args = [scaffold_id, clustering_method]
121 sql = """ 125 sql = """
122 INSERT INTO plant_tribes_scaffold 126 INSERT INTO plant_tribes_scaffold
123 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) 127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
124 RETURNING id; 128 RETURNING id;
144 if j_int > 0: 148 if j_int > 0:
145 # The species has at least 1 gene 149 # The species has at least 1 gene
146 num_species += 1 150 num_species += 1
147 num_genes += j_int 151 num_genes += j_int
148 # Insert a row into the plant_tribes_orthogroup table. 152 # Insert a row into the plant_tribes_orthogroup table.
149 self.fh.write("Inserting a row into the plant_tribes_orthogroup table...") 153 self.log("Inserting a row into the plant_tribes_orthogroup table...")
150 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] 154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
151 for k in range(super_ortho_start_index, len(items)): 155 for k in range(super_ortho_start_index, len(items)):
152 args.append('%s' % str(items[k])) 156 args.append('%s' % str(items[k]))
153 sql = """ 157 sql = """
154 INSERT INTO plant_tribes_orthogroup 158 INSERT INTO plant_tribes_orthogroup
181 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to 185 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
182 populate the plant_tribes_taxon table. 186 populate the plant_tribes_taxon table.
183 """ 187 """
184 scaffold_id = os.path.basename(self.args.scaffold_path) 188 scaffold_id = os.path.basename(self.args.scaffold_path)
185 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) 189 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
186 self.fh.write("Processing rooting order config: %s" % str(file_name)) 190 self.log("Processing rooting order config: %s" % str(file_name))
187 # Populate self.species_ids_dict. 191 # Populate self.species_ids_dict.
188 with open(file_name, "r") as fh: 192 with open(file_name, "r") as fh:
189 for i, line in enumerate(fh): 193 for i, line in enumerate(fh):
190 line = line.strip() 194 line = line.strip()
191 if len(line) == 0 or line.startswith("#") or line.startswith("["): 195 if len(line) == 0 or line.startswith("#") or line.startswith("["):
222 self.species_genes_dict[species_genes_dict_key] = tup 226 self.species_genes_dict[species_genes_dict_key] = tup
223 else: 227 else:
224 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] 228 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
225 # Populate the plant_tribes_taxon table. 229 # Populate the plant_tribes_taxon table.
226 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) 230 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
227 self.fh.write("Processing taxa lineage config: %s" % str(file_name)) 231 self.log("Processing taxa lineage config: %s" % str(file_name))
228 with open(file_name, "r") as fh: 232 with open(file_name, "r") as fh:
229 for i, line in enumerate(fh): 233 for i, line in enumerate(fh):
230 line = line.strip() 234 line = line.strip()
231 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): 235 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
232 # Skip blank lines, comments and section headers. 236 # Skip blank lines, comments and section headers.
233 continue 237 continue
234 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots 238 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
235 items = line.split("\t") 239 items = line.split("\t")
236 species_name = items[0] 240 species_name = items[0]
237 self.fh.write("Calculating the number of genes for species_name: %s" % str(species_name)) 241 self.log("Calculating the number of genes for species_name: %s" % str(species_name))
238 for species_genes_dict_key in sorted(self.species_genes_dict.keys()): 242 for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
239 # The format of species_genes_dict_key is <clustering_method>^^<species_code>. 243 # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
240 species_genes_dict_key_items = species_genes_dict_key.split("^^") 244 species_genes_dict_key_items = species_genes_dict_key.split("^^")
241 clustering_method = species_genes_dict_key_items[0] 245 clustering_method = species_genes_dict_key_items[0]
242 species_code = species_genes_dict_key_items[1] 246 species_code = species_genes_dict_key_items[1]
310 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. 314 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
311 items = combined_id.split("^^") 315 items = combined_id.split("^^")
312 orthogroup_id = items[0] 316 orthogroup_id = items[0]
313 clustering_method = items[1] 317 clustering_method = items[1]
314 gene_id = items[2] 318 gene_id = items[2]
315 self.fh.write("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id)) 319 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id))
316 # The value will be a list containing both 320 # The value will be a list containing both
317 # clustering_method and the dna string. 321 # clustering_method and the dna string.
318 dna_sequence = dna_dict[combined_id] 322 dna_sequence = dna_dict[combined_id]
319 aa_sequence = aa_dict[combined_id] 323 aa_sequence = aa_dict[combined_id]
320 # Get the species_code from the gene_id. 324 # Get the species_code from the gene_id.