comparison add_scaffold.py @ 0:fdcced0f4ae4 draft

Uploaded
author greg
date Tue, 22 May 2018 09:40:43 -0400
parents
children 04ad7b5d22dd
comparison
equal deleted inserted replaced
-1:000000000000 0:fdcced0f4ae4
1 #!/usr/bin/env python
2 """
3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes
4 database efficiently by bypassing the Galaxy model and operating directly on the database.
5 PostgreSQL 9.1 or greater is required.
6 """
7 import argparse
8 import glob
9 import os
10 import sys
11
12 import psycopg2
13 from sqlalchemy.engine.url import make_url
14
15
16 class AddScaffold(object):
17 def __init__(self):
18 self.args = None
19 self.clustering_methods = []
20 self.conn = None
21 self.fh = None
22 self.gene_sequences_dict = {}
23 self.scaffold_genes_dict = {}
24 self.scaffold_recs = []
25 self.species_genes_dict = {}
26 self.species_ids_dict = {}
27 self.taxa_lineage_config = None
28 self.__parse_args()
29 self.__connect_db()
30
31 def __parse_args(self):
32 parser = argparse.ArgumentParser()
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
34 parser.add_argument('--output', dest='output', help='Output dataset'),
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
36 self.args = parser.parse_args()
37
38 def stop_err(msg):
39 sys.stderr.write(msg)
40 sys.exit(1)
41
42 def __connect_db(self):
43 url = make_url(self.args.database_connection_string)
44 self.fh.write('Connecting to database with URL: %s' % url)
45 args = url.translate_connect_args(username='user')
46 args.update(url.query)
47 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
48 self.conn = psycopg2.connect(**args)
49
50 def _flush(self):
51 self.conn.commit()
52
53 def _shutdown(self):
54 self.conn.close()
55
56 def _update(self, sql, args):
57 try:
58 cur = self.conn.cursor()
59 cur.execute(sql, args)
60 except Exception as e:
61 self.fh.write("Caught exception executing SQL:\n%s\n" % sql.format(args))
62 self.stop_err('Caught exception executing SQL:\n%s\nException:\n%s\n' % (sql.format(args), e))
63 return cur
64
65 def check_scaffold(self):
66 """
67 Make sure the scaffold has not already been added.
68 """
69 scaffold_id = os.path.basename(self.args.scaffold_path)
70 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id
71 cur = self.conn.cursor()
72 cur.execute(sql)
73 try:
74 cur.fetchone()[0]
75 self.stop_err("The scaffold %s has already been added to the database." % scaffold_id)
76 except:
77 # The scaffold has not yet been added.
78 pass
79
80 def _run(self):
81 self.check_scaffold()
82 with open(self.args.output, "w") as fh:
83 self.fh = fh
84 self.process_annot_dir(self.fh)
85 self.fh.flush()
86 self.process_scaffold_config_files(fh)
87 self.fh.flush()
88 self.process_orthogroup_fasta_files(fh)
89 self.fh.flush()
90 self.fh.close()
91
92 def process_annot_dir(self):
93 """
94 First, parse all of the *.min_evalue.summary files in the
95 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
96 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
97 Next, parse all of the *.list files in the same directory to populate
98 self.scaffold_genes_dict.
99 """
100 scaffold_id = os.path.basename(self.args.scaffold_path)
101 file_dir = os.path.join(self.args.scaffold_path, 'annot')
102 # The scaffol naming convention must follow this pattern:
103 # <integer1>Gv<integer2>.<integer3>
104 # where integer 1 is the number of genomes in the scaffold_id. For example:
105 # 22Gv1.1 -> 22 genomes
106 # 12Gv1.0 -> 12 genomes
107 # 26Gv2.0 -> 26 genomes, etc.
108 num_genomes = int(scaffold_id.split("Gv")[0])
109 super_ortho_start_index = num_genomes + 1
110 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")):
111 items = os.path.basename(file_name).split(".")
112 clustering_method = items[0]
113 # Save all clustering methods for later processing.
114 if clustering_method not in self.clustering_methods:
115 self.clustering_methods.append(clustering_method)
116 # Insert a row in to the plant_tribes_scaffold table.
117 self.fh.write("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method))
118 args = [scaffold_id, clustering_method]
119 sql = """
120 INSERT INTO plant_tribes_scaffold
121 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
122 RETURNING id;
123 """
124 cur = self._update(sql, tuple(args))
125 self._flush()
126 scaffold_id_db = cur.fetchone()[0]
127 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
128 with open(file_name, "r") as fh:
129 for i, line in enumerate(fh):
130 if i == 0:
131 # Skip first line.
132 continue
133 num_genes = 0
134 num_species = 0
135 items = line.split("\t")
136 orthogroup_id = int(items[0])
137 # Zero based items 1 to num_genomes consists of the
138 # number of species classified in the orthogroup (i.e.,
139 # species with at least 1 gene in the orthogroup).
140 for j in range(1, num_genomes):
141 j_int = int(items[j])
142 if j_int > 0:
143 # The species has at least 1 gene
144 num_species += 1
145 num_genes += j_int
146 # Insert a row into the plant_tribes_orthogroup table.
147 self.fh.write("Inserting a row into the plant_tribes_orthogroup table...")
148 args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
149 for k in range(super_ortho_start_index, len(items)):
150 args.append('%s' % str(items[k]))
151 sql = """
152 INSERT INTO plant_tribes_orthogroup
153 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
154 """
155 cur = self._update(sql, tuple(args))
156 self._flush()
157 for file_name in glob.glob(os.path.join(file_dir, "*list")):
158 items = os.path.basename(file_name).split(".")
159 clustering_method = items[0]
160 with open(file_name, "r") as fh:
161 for i, line in enumerate(fh):
162 items = line.split("\t")
163 # The key will be a combination of clustering_method and
164 # orthogroup_id separated by "^^" for easy splitting later.
165 key = "%s^^%s" % (clustering_method, items[0])
166 # The value is the gen_id with all white space replaced by "_".
167 val = items[1].replace("|", "_")
168 if key in self.scaffold_genes_dict:
169 self.scaffold_genes_dict[key].append(val)
170 else:
171 self.scaffold_genes_dict[key] = [val]
172
173 def process_scaffold_config_files(self):
174 """
175 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config
176 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate.
177 2. Calculate the number of genes found
178 for each species and add the number to self.species_genes_dict.
179 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
180 populate the plant_tribes_taxon table.
181 """
182 scaffold_id = os.path.basename(self.args.scaffold_path)
183 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
184 self.fh.write("Processing rooting order config: %s" % str(file_name))
185 # Populate self.species_ids_dict.
186 with open(file_name, "r") as fh:
187 for i, line in enumerate(fh):
188 line = line.strip()
189 if len(line) == 0 or line.startswith("#") or line.startswith("["):
190 # Skip blank lines, comments and section headers.
191 continue
192 # Example line:
193 # Physcomitrella patens=Phypa
194 items = line.split("=")
195 self.species_ids_dict[items[1]] = items[0]
196 # Get lineage information for orthogrpoup taxa.
197 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()):
198 # The format of key is <clustering_method>^^<orthogroup_id>.
199 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001"
200 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^")
201 clustering_method = scaffold_genes_dict_key_items[0]
202 # Get the list of genes for the current scaffold_genes_dict_key.
203 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key]
204 for gene_id in gene_list:
205 # Example species_code: Musac1.0, where
206 # species_name is Musac and version is 1.0.
207 species_code = gene_id.split("_")[1]
208 # Strip the version from the species_code.
209 species_code = species_code[0:5]
210 # Get the species_name from self.species_ids_dict.
211 species_name = self.species_ids_dict[species_code]
212 # Create a key for self.species_genes_dict, with the format:
213 # <clustering_method>^^<species_code>
214 species_genes_dict_key = "%s^^%s" % (clustering_method, species_code)
215 # Add an entry to self.species_genes_dict, where the value
216 # is a list containing species_name and num_genes.
217 if species_genes_dict_key in self.species_genes_dict:
218 tup = self.species_genes_dict[species_genes_dict_key]
219 tup[1] += 1
220 self.species_genes_dict[species_genes_dict_key] = tup
221 else:
222 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
223 # Populate the plant_tribes_taxon table.
224 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
225 self.fh.write("Processing taxa lineage config: %s" % str(file_name))
226 with open(file_name, "r") as fh:
227 for i, line in enumerate(fh):
228 line = line.strip()
229 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
230 # Skip blank lines, comments and section headers.
231 continue
232 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
233 items = line.split("\t")
234 species_name = items[0]
235 self.fh.write("Calculating the number of genes for species_name: %s" % str(species_name))
236 for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
237 # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
238 species_genes_dict_key_items = species_genes_dict_key.split("^^")
239 clustering_method = species_genes_dict_key_items[0]
240 species_code = species_genes_dict_key_items[1]
241 # Get the scaffold_rec for the current scaffold_id and clustering_method.
242 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
243 for scaffold_rec in self.scaffold_recs:
244 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
245 scaffold_id_db = scaffold_rec[0]
246 # The value is a list containing species_name and num_genes.
247 val = self.species_genes_dict[species_genes_dict_key]
248 if species_name == val[0]:
249 num_genes = val[1]
250 else:
251 num_genes = 0
252 # Insert a row in to the plant_tribes_scaffold table.
253 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
254 sql = """
255 INSERT INTO plant_tribes_taxon
256 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
257 """
258 self._update(sql, tuple(args))
259 self._flush()
260
261 def process_orthogroup_fasta_files(self):
262 scaffold_id = os.path.basename(self.args.scaffold_path)
263 aa_dict = {}
264 dna_dict = {}
265 # Populate aa_dict and dna_dict.
266 for clustering_method in self.clustering_methods:
267 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method)
268 for file_name in os.listdir(file_dir):
269 items = file_name.split(".")
270 orthogroup_id = items[0]
271 file_extension = items[1]
272 if file_extension == "fna":
273 adict = dna_dict
274 else:
275 adict = aa_dict
276 file_path = os.path.join(file_dir, file_name)
277 with open(file_path, "r") as fh:
278 for i, line in enumerate(fh):
279 line = line.strip()
280 if len(line) == 0:
281 # Skip blank lines (shoudn't happen).
282 continue
283 if line.startswith(">"):
284 # Example line:
285 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110
286 gene_id = line.lstrip(">")
287 # The dictionary keys will combine the orthogroup_id,
288 # clustering method and gene id using the format
289 # ,orthogroup_id>^^<clustering_method>^^<gene_id>.
290 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id)
291 if combined_id not in adict:
292 # The value will be the dna sequence string..
293 adict[combined_id] = ""
294 else:
295 # Example line:
296 # ATGGAGAAGGACTTT
297 # Here combined_id is set because the fasta format specifies
298 # that all lines following the gene id defined in the if block
299 # above will be the sequence associated with that gene until
300 # the next gene id line is encountered.
301 sequence = adict[combined_id]
302 sequence = "%s%s" % (sequence, line)
303 adict[combined_id] = sequence
304 # Populate the plant_tribes_gene and gen_scaffold_association tables
305 # from the contents of aa_dict and dna_dict.
306 for combined_id in sorted(dna_dict.keys()):
307 # The dictionary keys combine the orthogroup_id, clustering method and
308 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
309 items = combined_id.split("^^")
310 orthogroup_id = items[0]
311 clustering_method = items[1]
312 gene_id = items[2]
313 self.fh.write("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id))
314 # The value will be a list containing both
315 # clustering_method and the dna string.
316 dna_sequence = dna_dict[combined_id]
317 aa_sequence = aa_dict[combined_id]
318 # Get the species_code from the gene_id.
319 species_code = gene_id.split("_")[1]
320 # Strip the version from the species_code.
321 species_code = species_code[0:5]
322 # Get the species_name from self.species_ids_dict.
323 species_name = self.species_ids_dict[species_code]
324 # Get the plant_tribes_orthogroup primary key id for
325 # the orthogroup_id from the plant_tribes_orthogroup table.
326 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
327 cur = self.conn.cursor()
328 cur.execute(sql)
329 orthogroup_id_db = cur.fetchone()[0]
330 # If the plant_tribes_gene table contains a row that has the gene_id,
331 # then we'll add a row only to the gene_scaffold_orthogroup_association table.
332 # Get the taxon_id for the species_name from the plant_tribes_taxon table.
333 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
334 cur = self.conn.cursor()
335 cur.execute(sql)
336 taxon_id = cur.fetchone()[0]
337 # If the plant_tribes_gene table contains a row that has the gene_id,
338 # then we'll add a row only to the gene_scaffold_orthogroup_association table.
339 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
340 cur = self.conn.cursor()
341 cur.execute(sql)
342 try:
343 gene_id_db = cur.fetchone()[0]
344 except:
345 # Insert a row into the plant_tribes_gene table.
346 args = [gene_id, taxon_id, dna_sequence, aa_sequence]
347 sql = """
348 INSERT INTO plant_tribes_gene
349 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s)
350 RETURNING id;
351 """
352 cur = self._update(sql, tuple(args))
353 self._flush()
354 gene_id_db = cur.fetchone()[0]
355 # Insert a row into the gene_scaffold_orthogroup_association table.
356 # Get the scaffold_rec for the current scaffold_id and clustering_method.
357 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
358 for scaffold_rec in self.scaffold_recs:
359 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
360 scaffold_id_db = scaffold_rec[0]
361 args = [gene_id_db, scaffold_id_db, orthogroup_id_db]
362 sql = """
363 INSERT INTO gene_scaffold_orthogroup_association
364 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s);
365 """
366 cur = self._update(sql, tuple(args))
367 self._flush()
368
369
370 if __name__ == '__main__':
371 add_scaffold = AddScaffold()
372 add_scaffold._run()
373 add_scaffold._shutdown()