0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes
|
|
4 database efficiently by bypassing the Galaxy model and operating directly on the database.
|
|
5 PostgreSQL 9.1 or greater is required.
|
|
6 """
|
|
7 import argparse
|
|
8 import glob
|
|
9 import os
|
|
10 import sys
|
|
11
|
|
12 import psycopg2
|
11
|
13 from sqlalchemy import create_engine, MetaData, Table
|
0
|
14 from sqlalchemy.engine.url import make_url
|
|
15
|
12
|
16 BLACKLIST_STRINGS = ['Unknown protein(s)',
|
|
17 'No TAIR description(s)',
|
|
18 'Representative annotation below 0.1%'
|
|
19 'Representative AHRD below 0.1%']
|
|
20
|
0
|
21
|
|
22 class ScaffoldLoader(object):
|
|
23 def __init__(self):
|
|
24 self.args = None
|
|
25 self.clustering_methods = []
|
|
26 self.conn = None
|
|
27 self.gene_sequences_dict = {}
|
|
28 self.scaffold_genes_dict = {}
|
|
29 self.scaffold_recs = []
|
|
30 self.species_genes_dict = {}
|
|
31 self.species_ids_dict = {}
|
|
32 self.taxa_lineage_config = None
|
|
33 self.parse_args()
|
|
34 self.fh = open(self.args.output, "w")
|
|
35 self.connect_db()
|
11
|
36 self.engine = create_engine(self.args.database_connection_string)
|
|
37 self.metadata = MetaData(self.engine)
|
0
|
38
|
|
39 def parse_args(self):
|
|
40 parser = argparse.ArgumentParser()
|
|
41 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
|
|
42 parser.add_argument('--output', dest='output', help='Output dataset'),
|
|
43 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
|
|
44 self.args = parser.parse_args()
|
|
45
|
|
46 def connect_db(self):
|
|
47 url = make_url(self.args.database_connection_string)
|
|
48 self.log('Connecting to database with URL: %s' % url)
|
|
49 args = url.translate_connect_args(username='user')
|
|
50 args.update(url.query)
|
|
51 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
|
|
52 self.conn = psycopg2.connect(**args)
|
|
53
|
|
54 def flush(self):
|
|
55 self.conn.commit()
|
|
56
|
|
57 def shutdown(self):
|
|
58 self.conn.close()
|
|
59
|
|
60 def update(self, sql, args):
|
|
61 try:
|
|
62 cur = self.conn.cursor()
|
|
63 cur.execute(sql, args)
|
|
64 except Exception as e:
|
|
65 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e)
|
|
66 self.stop_err(msg)
|
|
67 return cur
|
|
68
|
|
69 def stop_err(self, msg):
|
|
70 sys.stderr.write(msg)
|
|
71 self.fh.flush()
|
|
72 self.fh.close()
|
|
73 sys.exit(1)
|
|
74
|
|
75 def log(self, msg):
|
|
76 self.fh.write("%s\n" % msg)
|
|
77 self.fh.flush()
|
|
78
|
|
79 @property
|
|
80 def can_add_scaffold(self):
|
|
81 """
|
|
82 Make sure the scaffold has not already been added.
|
|
83 """
|
|
84 scaffold_id = os.path.basename(self.args.scaffold_path)
|
|
85 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id
|
|
86 cur = self.conn.cursor()
|
|
87 cur.execute(sql)
|
|
88 try:
|
|
89 cur.fetchone()[0]
|
|
90 # The scaffold has been added to the database.
|
|
91 return False
|
|
92 except:
|
|
93 # The scaffold has not yet been added.
|
|
94 return True
|
|
95
|
|
96 def run(self):
|
|
97 if self.can_add_scaffold:
|
|
98 self.process_annot_dir()
|
|
99 self.process_scaffold_config_files()
|
|
100 self.process_orthogroup_fasta_files()
|
|
101 self.fh.flush()
|
|
102 self.fh.close()
|
|
103 else:
|
|
104 self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path))
|
|
105
|
|
106 def process_annot_dir(self):
|
|
107 """
|
|
108 1. Parse all of the *.min_evalue.summary files in the
|
|
109 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
|
|
110 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
|
|
111 1. Parse all of the *.list files in the same directory to populate
|
|
112 self.scaffold_genes_dict.
|
|
113 """
|
11
|
114 self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True)
|
0
|
115 scaffold_id = os.path.basename(self.args.scaffold_path)
|
|
116 file_dir = os.path.join(self.args.scaffold_path, 'annot')
|
10
|
117 # The scaffold naming convention must follow this pattern:
|
0
|
118 # <integer1>Gv<integer2>.<integer3>
|
|
119 # where integer 1 is the number of genomes in the scaffold_id. For example:
|
|
120 # 22Gv1.1 -> 22 genomes
|
|
121 # 12Gv1.0 -> 12 genomes
|
|
122 # 26Gv2.0 -> 26 genomes, etc.
|
|
123 num_genomes = int(scaffold_id.split("Gv")[0])
|
|
124 super_ortho_start_index = num_genomes + 1
|
|
125 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")):
|
|
126 items = os.path.basename(file_name).split(".")
|
|
127 clustering_method = items[0]
|
|
128 # Save all clustering methods for later processing.
|
|
129 if clustering_method not in self.clustering_methods:
|
|
130 self.clustering_methods.append(clustering_method)
|
|
131 # Insert a row in to the plant_tribes_scaffold table.
|
2
|
132 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
|
0
|
133 args = [scaffold_id, clustering_method]
|
|
134 sql = """
|
|
135 INSERT INTO plant_tribes_scaffold
|
|
136 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
|
|
137 RETURNING id;
|
|
138 """
|
|
139 cur = self.update(sql, tuple(args))
|
|
140 self.flush()
|
|
141 scaffold_id_db = cur.fetchone()[0]
|
|
142 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
|
|
143 with open(file_name, "r") as fh:
|
2
|
144 i = 0
|
|
145 for i2, line in enumerate(fh):
|
|
146 if i2 == 0:
|
0
|
147 # Skip first line.
|
|
148 continue
|
10
|
149 line = line.rstrip('\n')
|
0
|
150 num_genes = 0
|
|
151 num_species = 0
|
|
152 items = line.split("\t")
|
|
153 orthogroup_id = int(items[0])
|
|
154 # Zero based items 1 to num_genomes consists of the
|
|
155 # number of species classified in the orthogroup (i.e.,
|
|
156 # species with at least 1 gene in the orthogroup).
|
|
157 for j in range(1, num_genomes):
|
|
158 j_int = int(items[j])
|
|
159 if j_int > 0:
|
|
160 # The species has at least 1 gene
|
|
161 num_species += 1
|
|
162 num_genes += j_int
|
11
|
163 # Get the auto-incremented row id to insert a row inot
|
|
164 # the plant_tribes_orthogroup table.
|
|
165 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');"
|
|
166 cur = self.conn.cursor()
|
|
167 cur.execute(sql)
|
|
168 plant_tribes_orthogroup_id = cur.fetchone()[0]
|
|
169 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
|
|
170 last_item = len(items)
|
|
171 for k in range(super_ortho_start_index, last_item):
|
12
|
172 bs_found = False
|
11
|
173 # The last 7 items in this range are as follows.
|
|
174 # items[last_item-6]: AHRD Descriptions
|
|
175 # items[last_item-5]: TAIR Gene(s) Descriptions
|
|
176 # items[last_item-4]: Pfam Domains
|
|
177 # items[last_item-3]: InterProScan Descriptions
|
|
178 # items[last_item-2]: GO Molecular Functions
|
|
179 # items[last_item-1]: GO Biological Processes
|
|
180 # items[last_item]: GO Cellular Components
|
|
181 # We'll translate each of these items into a JSON
|
|
182 # dictionary for inserting into the table.
|
|
183 if k >= (last_item-7) and k <= last_item:
|
12
|
184 json_str = str(items[k])
|
11
|
185 # Here is an example string:
|
|
186 # Phosphate transporter PHO1 [0.327] | Phosphate
|
12
|
187 for bs in BLACKLIST_STRINGS:
|
|
188 if json_str.find(bs) >= 0:
|
|
189 bs_found = True
|
|
190 args.append(None)
|
|
191 break
|
|
192 if not bs_found:
|
|
193 # We'll split the string on " | " to create each value.
|
|
194 # The keys will be zero-padded integers to enable sorting.
|
|
195 json_dict = dict()
|
|
196 json_vals = json_str.split(' | ')
|
|
197 for key_index, json_val in enumerate(json_vals):
|
|
198 # The zero-padded key is 1 based.
|
|
199 json_key = '%04d' % key_index
|
|
200 json_dict[json_key] = json_val
|
|
201 args.append(json_dict)
|
11
|
202 else:
|
|
203 args.append('%s' % str(items[k]))
|
|
204 sql = self.pto_table.insert().values(args)
|
|
205 try:
|
|
206 self.engine.execute(sql)
|
|
207 except Exception as e:
|
|
208 msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e)
|
|
209 self.stop_err(msg)
|
2
|
210 i += 1
|
|
211 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
|
0
|
212 for file_name in glob.glob(os.path.join(file_dir, "*list")):
|
|
213 items = os.path.basename(file_name).split(".")
|
|
214 clustering_method = items[0]
|
|
215 with open(file_name, "r") as fh:
|
|
216 for i, line in enumerate(fh):
|
|
217 items = line.split("\t")
|
|
218 # The key will be a combination of clustering_method and
|
|
219 # orthogroup_id separated by "^^" for easy splitting later.
|
|
220 key = "%s^^%s" % (clustering_method, items[0])
|
|
221 # The value is the gen_id with all white space replaced by "_".
|
|
222 val = items[1].replace("|", "_")
|
|
223 if key in self.scaffold_genes_dict:
|
|
224 self.scaffold_genes_dict[key].append(val)
|
|
225 else:
|
|
226 self.scaffold_genes_dict[key] = [val]
|
|
227
|
|
228 def process_scaffold_config_files(self):
|
|
229 """
|
|
230 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config
|
|
231 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate.
|
|
232 2. Calculate the number of genes found
|
|
233 for each species and add the number to self.species_genes_dict.
|
|
234 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
|
|
235 populate the plant_tribes_taxon table.
|
|
236 """
|
|
237 scaffold_id = os.path.basename(self.args.scaffold_path)
|
|
238 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
|
|
239 self.log("Processing rooting order config: %s" % str(file_name))
|
|
240 # Populate self.species_ids_dict.
|
|
241 with open(file_name, "r") as fh:
|
|
242 for i, line in enumerate(fh):
|
|
243 line = line.strip()
|
|
244 if len(line) == 0 or line.startswith("#") or line.startswith("["):
|
|
245 # Skip blank lines, comments and section headers.
|
|
246 continue
|
|
247 # Example line:
|
|
248 # Physcomitrella patens=Phypa
|
|
249 items = line.split("=")
|
|
250 self.species_ids_dict[items[1]] = items[0]
|
|
251 # Get lineage information for orthogrpoup taxa.
|
|
252 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()):
|
|
253 # The format of key is <clustering_method>^^<orthogroup_id>.
|
|
254 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001"
|
|
255 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^")
|
|
256 clustering_method = scaffold_genes_dict_key_items[0]
|
|
257 # Get the list of genes for the current scaffold_genes_dict_key.
|
|
258 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key]
|
|
259 for gene_id in gene_list:
|
|
260 # Example species_code: Musac1.0, where
|
|
261 # species_name is Musac and version is 1.0.
|
|
262 species_code = gene_id.split("_")[1]
|
|
263 # Strip the version from the species_code.
|
|
264 species_code = species_code[0:5]
|
|
265 # Get the species_name from self.species_ids_dict.
|
|
266 species_name = self.species_ids_dict[species_code]
|
|
267 # Create a key for self.species_genes_dict, with the format:
|
6
|
268 # <clustering_method>^^<species_name>
|
|
269 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
|
0
|
270 # Add an entry to self.species_genes_dict, where the value
|
|
271 # is a list containing species_name and num_genes.
|
|
272 if species_genes_dict_key in self.species_genes_dict:
|
|
273 tup = self.species_genes_dict[species_genes_dict_key]
|
|
274 tup[1] += 1
|
|
275 self.species_genes_dict[species_genes_dict_key] = tup
|
|
276 else:
|
|
277 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
|
|
278 # Populate the plant_tribes_taxon table.
|
|
279 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
|
|
280 self.log("Processing taxa lineage config: %s" % str(file_name))
|
|
281 with open(file_name, "r") as fh:
|
2
|
282 for line in fh:
|
0
|
283 line = line.strip()
|
|
284 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
|
|
285 # Skip blank lines, comments and section headers.
|
|
286 continue
|
|
287 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
|
|
288 items = line.split("\t")
|
|
289 species_name = items[0]
|
2
|
290 i = 0
|
6
|
291 for clustering_method in self.clustering_methods:
|
8
|
292 # The format of species_genes_dict_key is <clustering_method>^^<species_name>.
|
6
|
293 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
|
0
|
294 # Get the scaffold_rec for the current scaffold_id and clustering_method.
|
|
295 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
|
|
296 for scaffold_rec in self.scaffold_recs:
|
|
297 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
|
|
298 scaffold_id_db = scaffold_rec[0]
|
|
299 # The value is a list containing species_name and num_genes.
|
|
300 val = self.species_genes_dict[species_genes_dict_key]
|
|
301 if species_name == val[0]:
|
|
302 num_genes = val[1]
|
|
303 else:
|
|
304 num_genes = 0
|
|
305 # Insert a row in to the plant_tribes_scaffold table.
|
|
306 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
|
|
307 sql = """
|
|
308 INSERT INTO plant_tribes_taxon
|
|
309 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
|
|
310 """
|
|
311 self.update(sql, tuple(args))
|
|
312 self.flush()
|
2
|
313 i += 1
|
3
|
314 self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name)))
|
0
|
315
|
|
316 def process_orthogroup_fasta_files(self):
|
|
317 """
|
|
318 1. Analyze all of the scaffold .fna and .faa files for each clustering
|
|
319 method to populate the aa_dict and dna_dict sequence dictionaries.
|
|
320 2. Use the populated sequence dictionaries to populate the plant_tribes_gene
|
6
|
321 and gene_scaffold_orthogroup_taxon_association tables.
|
0
|
322 """
|
|
323 scaffold_id = os.path.basename(self.args.scaffold_path)
|
|
324 aa_dict = {}
|
|
325 dna_dict = {}
|
|
326 # Populate aa_dict and dna_dict.
|
|
327 for clustering_method in self.clustering_methods:
|
|
328 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method)
|
|
329 for file_name in os.listdir(file_dir):
|
|
330 items = file_name.split(".")
|
|
331 orthogroup_id = items[0]
|
|
332 file_extension = items[1]
|
|
333 if file_extension == "fna":
|
|
334 adict = dna_dict
|
|
335 else:
|
|
336 adict = aa_dict
|
|
337 file_path = os.path.join(file_dir, file_name)
|
|
338 with open(file_path, "r") as fh:
|
|
339 for i, line in enumerate(fh):
|
|
340 line = line.strip()
|
|
341 if len(line) == 0:
|
|
342 # Skip blank lines (shoudn't happen).
|
|
343 continue
|
|
344 if line.startswith(">"):
|
|
345 # Example line:
|
|
346 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110
|
|
347 gene_id = line.lstrip(">")
|
|
348 # The dictionary keys will combine the orthogroup_id,
|
|
349 # clustering method and gene id using the format
|
|
350 # ,orthogroup_id>^^<clustering_method>^^<gene_id>.
|
|
351 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id)
|
|
352 if combined_id not in adict:
|
|
353 # The value will be the dna sequence string..
|
|
354 adict[combined_id] = ""
|
|
355 else:
|
|
356 # Example line:
|
|
357 # ATGGAGAAGGACTTT
|
|
358 # Here combined_id is set because the fasta format specifies
|
|
359 # that all lines following the gene id defined in the if block
|
|
360 # above will be the sequence associated with that gene until
|
|
361 # the next gene id line is encountered.
|
|
362 sequence = adict[combined_id]
|
|
363 sequence = "%s%s" % (sequence, line)
|
|
364 adict[combined_id] = sequence
|
6
|
365 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables
|
0
|
366 # from the contents of aa_dict and dna_dict.
|
6
|
367 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.")
|
2
|
368 gi = 0
|
|
369 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
|
0
|
370 # The dictionary keys combine the orthogroup_id, clustering method and
|
|
371 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
|
|
372 items = combined_id.split("^^")
|
|
373 orthogroup_id = items[0]
|
|
374 clustering_method = items[1]
|
|
375 gene_id = items[2]
|
|
376 # The value will be a list containing both
|
|
377 # clustering_method and the dna string.
|
|
378 dna_sequence = dna_dict[combined_id]
|
|
379 aa_sequence = aa_dict[combined_id]
|
|
380 # Get the species_code from the gene_id.
|
|
381 species_code = gene_id.split("_")[1]
|
|
382 # Strip the version from the species_code.
|
|
383 species_code = species_code[0:5]
|
|
384 # Get the species_name from self.species_ids_dict.
|
|
385 species_name = self.species_ids_dict[species_code]
|
|
386 # Get the plant_tribes_orthogroup primary key id for
|
|
387 # the orthogroup_id from the plant_tribes_orthogroup table.
|
|
388 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
|
|
389 cur = self.conn.cursor()
|
|
390 cur.execute(sql)
|
|
391 orthogroup_id_db = cur.fetchone()[0]
|
|
392 # If the plant_tribes_gene table contains a row that has the gene_id,
|
6
|
393 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
|
0
|
394 # Get the taxon_id for the species_name from the plant_tribes_taxon table.
|
|
395 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
|
|
396 cur = self.conn.cursor()
|
|
397 cur.execute(sql)
|
5
|
398 taxon_id_db = cur.fetchone()[0]
|
0
|
399 # If the plant_tribes_gene table contains a row that has the gene_id,
|
6
|
400 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
|
0
|
401 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
|
|
402 cur = self.conn.cursor()
|
|
403 cur.execute(sql)
|
|
404 try:
|
|
405 gene_id_db = cur.fetchone()[0]
|
|
406 except:
|
|
407 # Insert a row into the plant_tribes_gene table.
|
5
|
408 args = [gene_id, dna_sequence, aa_sequence]
|
0
|
409 sql = """
|
|
410 INSERT INTO plant_tribes_gene
|
5
|
411 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s)
|
0
|
412 RETURNING id;
|
|
413 """
|
|
414 cur = self.update(sql, tuple(args))
|
|
415 self.flush()
|
|
416 gene_id_db = cur.fetchone()[0]
|
2
|
417 gi += 1
|
4
|
418 if gi % 1000 == 0:
|
|
419 self.log("Inserted 1000 more rows into the plant_tribes_gene table.")
|
6
|
420 # Insert a row into the gene_scaffold_orthogroup_taxon_association table.
|
0
|
421 # Get the scaffold_rec for the current scaffold_id and clustering_method.
|
|
422 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
|
|
423 for scaffold_rec in self.scaffold_recs:
|
|
424 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
|
|
425 scaffold_id_db = scaffold_rec[0]
|
5
|
426 args = [gene_id_db, scaffold_id_db, orthogroup_id_db, taxon_id_db]
|
0
|
427 sql = """
|
5
|
428 INSERT INTO gene_scaffold_orthogroup_taxon_association
|
|
429 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s);
|
0
|
430 """
|
|
431 cur = self.update(sql, tuple(args))
|
|
432 self.flush()
|
2
|
433 if gsoai % 1000 == 0:
|
6
|
434 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.")
|
2
|
435 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
|
6
|
436 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai)
|
0
|
437
|
8
|
438
|
0
|
439 if __name__ == '__main__':
|
|
440 scaffold_loader = ScaffoldLoader()
|
|
441 scaffold_loader.run()
|
|
442 scaffold_loader.shutdown()
|