annotate gene_family_scaffold_loader.py @ 14:4c88c8d793e8 draft default tip

Uploaded
author greg
date Thu, 18 Oct 2018 11:02:18 -0400
parents 6f1f5f242503
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
177b62db4320 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
177b62db4320 Uploaded
greg
parents:
diff changeset
2 """
177b62db4320 Uploaded
greg
parents:
diff changeset
3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes
177b62db4320 Uploaded
greg
parents:
diff changeset
4 database efficiently by bypassing the Galaxy model and operating directly on the database.
177b62db4320 Uploaded
greg
parents:
diff changeset
5 PostgreSQL 9.1 or greater is required.
177b62db4320 Uploaded
greg
parents:
diff changeset
6 """
177b62db4320 Uploaded
greg
parents:
diff changeset
7 import argparse
177b62db4320 Uploaded
greg
parents:
diff changeset
8 import glob
177b62db4320 Uploaded
greg
parents:
diff changeset
9 import os
177b62db4320 Uploaded
greg
parents:
diff changeset
10 import sys
177b62db4320 Uploaded
greg
parents:
diff changeset
11
177b62db4320 Uploaded
greg
parents:
diff changeset
12 import psycopg2
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
13 from sqlalchemy import create_engine, MetaData, Table
0
177b62db4320 Uploaded
greg
parents:
diff changeset
14 from sqlalchemy.engine.url import make_url
177b62db4320 Uploaded
greg
parents:
diff changeset
15
14
4c88c8d793e8 Uploaded
greg
parents: 13
diff changeset
16 BLACKLIST_STRINGS = ['NULL',
13
6f1f5f242503 Uploaded
greg
parents: 12
diff changeset
17 'Unknown protein',
6f1f5f242503 Uploaded
greg
parents: 12
diff changeset
18 'No TAIR description',
6f1f5f242503 Uploaded
greg
parents: 12
diff changeset
19 'Representative annotation below 0'
6f1f5f242503 Uploaded
greg
parents: 12
diff changeset
20 'Representative AHRD below 0']
12
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
21
0
177b62db4320 Uploaded
greg
parents:
diff changeset
22
177b62db4320 Uploaded
greg
parents:
diff changeset
23 class ScaffoldLoader(object):
177b62db4320 Uploaded
greg
parents:
diff changeset
24 def __init__(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
25 self.args = None
177b62db4320 Uploaded
greg
parents:
diff changeset
26 self.clustering_methods = []
177b62db4320 Uploaded
greg
parents:
diff changeset
27 self.conn = None
177b62db4320 Uploaded
greg
parents:
diff changeset
28 self.gene_sequences_dict = {}
177b62db4320 Uploaded
greg
parents:
diff changeset
29 self.scaffold_genes_dict = {}
177b62db4320 Uploaded
greg
parents:
diff changeset
30 self.scaffold_recs = []
177b62db4320 Uploaded
greg
parents:
diff changeset
31 self.species_genes_dict = {}
177b62db4320 Uploaded
greg
parents:
diff changeset
32 self.species_ids_dict = {}
177b62db4320 Uploaded
greg
parents:
diff changeset
33 self.taxa_lineage_config = None
177b62db4320 Uploaded
greg
parents:
diff changeset
34 self.parse_args()
177b62db4320 Uploaded
greg
parents:
diff changeset
35 self.fh = open(self.args.output, "w")
177b62db4320 Uploaded
greg
parents:
diff changeset
36 self.connect_db()
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
37 self.engine = create_engine(self.args.database_connection_string)
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
38 self.metadata = MetaData(self.engine)
0
177b62db4320 Uploaded
greg
parents:
diff changeset
39
177b62db4320 Uploaded
greg
parents:
diff changeset
40 def parse_args(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
41 parser = argparse.ArgumentParser()
177b62db4320 Uploaded
greg
parents:
diff changeset
42 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
177b62db4320 Uploaded
greg
parents:
diff changeset
43 parser.add_argument('--output', dest='output', help='Output dataset'),
177b62db4320 Uploaded
greg
parents:
diff changeset
44 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
177b62db4320 Uploaded
greg
parents:
diff changeset
45 self.args = parser.parse_args()
177b62db4320 Uploaded
greg
parents:
diff changeset
46
177b62db4320 Uploaded
greg
parents:
diff changeset
47 def connect_db(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
48 url = make_url(self.args.database_connection_string)
177b62db4320 Uploaded
greg
parents:
diff changeset
49 self.log('Connecting to database with URL: %s' % url)
177b62db4320 Uploaded
greg
parents:
diff changeset
50 args = url.translate_connect_args(username='user')
177b62db4320 Uploaded
greg
parents:
diff changeset
51 args.update(url.query)
177b62db4320 Uploaded
greg
parents:
diff changeset
52 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
177b62db4320 Uploaded
greg
parents:
diff changeset
53 self.conn = psycopg2.connect(**args)
177b62db4320 Uploaded
greg
parents:
diff changeset
54
177b62db4320 Uploaded
greg
parents:
diff changeset
55 def flush(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
56 self.conn.commit()
177b62db4320 Uploaded
greg
parents:
diff changeset
57
177b62db4320 Uploaded
greg
parents:
diff changeset
58 def shutdown(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
59 self.conn.close()
177b62db4320 Uploaded
greg
parents:
diff changeset
60
177b62db4320 Uploaded
greg
parents:
diff changeset
61 def update(self, sql, args):
177b62db4320 Uploaded
greg
parents:
diff changeset
62 try:
177b62db4320 Uploaded
greg
parents:
diff changeset
63 cur = self.conn.cursor()
177b62db4320 Uploaded
greg
parents:
diff changeset
64 cur.execute(sql, args)
177b62db4320 Uploaded
greg
parents:
diff changeset
65 except Exception as e:
177b62db4320 Uploaded
greg
parents:
diff changeset
66 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e)
177b62db4320 Uploaded
greg
parents:
diff changeset
67 self.stop_err(msg)
177b62db4320 Uploaded
greg
parents:
diff changeset
68 return cur
177b62db4320 Uploaded
greg
parents:
diff changeset
69
177b62db4320 Uploaded
greg
parents:
diff changeset
70 def stop_err(self, msg):
177b62db4320 Uploaded
greg
parents:
diff changeset
71 sys.stderr.write(msg)
177b62db4320 Uploaded
greg
parents:
diff changeset
72 self.fh.flush()
177b62db4320 Uploaded
greg
parents:
diff changeset
73 self.fh.close()
177b62db4320 Uploaded
greg
parents:
diff changeset
74 sys.exit(1)
177b62db4320 Uploaded
greg
parents:
diff changeset
75
177b62db4320 Uploaded
greg
parents:
diff changeset
76 def log(self, msg):
177b62db4320 Uploaded
greg
parents:
diff changeset
77 self.fh.write("%s\n" % msg)
177b62db4320 Uploaded
greg
parents:
diff changeset
78 self.fh.flush()
177b62db4320 Uploaded
greg
parents:
diff changeset
79
177b62db4320 Uploaded
greg
parents:
diff changeset
80 @property
177b62db4320 Uploaded
greg
parents:
diff changeset
81 def can_add_scaffold(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
82 """
177b62db4320 Uploaded
greg
parents:
diff changeset
83 Make sure the scaffold has not already been added.
177b62db4320 Uploaded
greg
parents:
diff changeset
84 """
177b62db4320 Uploaded
greg
parents:
diff changeset
85 scaffold_id = os.path.basename(self.args.scaffold_path)
177b62db4320 Uploaded
greg
parents:
diff changeset
86 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id
177b62db4320 Uploaded
greg
parents:
diff changeset
87 cur = self.conn.cursor()
177b62db4320 Uploaded
greg
parents:
diff changeset
88 cur.execute(sql)
177b62db4320 Uploaded
greg
parents:
diff changeset
89 try:
177b62db4320 Uploaded
greg
parents:
diff changeset
90 cur.fetchone()[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
91 # The scaffold has been added to the database.
177b62db4320 Uploaded
greg
parents:
diff changeset
92 return False
177b62db4320 Uploaded
greg
parents:
diff changeset
93 except:
177b62db4320 Uploaded
greg
parents:
diff changeset
94 # The scaffold has not yet been added.
177b62db4320 Uploaded
greg
parents:
diff changeset
95 return True
177b62db4320 Uploaded
greg
parents:
diff changeset
96
177b62db4320 Uploaded
greg
parents:
diff changeset
97 def run(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
98 if self.can_add_scaffold:
177b62db4320 Uploaded
greg
parents:
diff changeset
99 self.process_annot_dir()
177b62db4320 Uploaded
greg
parents:
diff changeset
100 self.process_scaffold_config_files()
177b62db4320 Uploaded
greg
parents:
diff changeset
101 self.process_orthogroup_fasta_files()
177b62db4320 Uploaded
greg
parents:
diff changeset
102 self.fh.flush()
177b62db4320 Uploaded
greg
parents:
diff changeset
103 self.fh.close()
177b62db4320 Uploaded
greg
parents:
diff changeset
104 else:
177b62db4320 Uploaded
greg
parents:
diff changeset
105 self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path))
177b62db4320 Uploaded
greg
parents:
diff changeset
106
177b62db4320 Uploaded
greg
parents:
diff changeset
107 def process_annot_dir(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
108 """
177b62db4320 Uploaded
greg
parents:
diff changeset
109 1. Parse all of the *.min_evalue.summary files in the
177b62db4320 Uploaded
greg
parents:
diff changeset
110 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
177b62db4320 Uploaded
greg
parents:
diff changeset
111 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
177b62db4320 Uploaded
greg
parents:
diff changeset
112 1. Parse all of the *.list files in the same directory to populate
177b62db4320 Uploaded
greg
parents:
diff changeset
113 self.scaffold_genes_dict.
177b62db4320 Uploaded
greg
parents:
diff changeset
114 """
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
115 self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True)
0
177b62db4320 Uploaded
greg
parents:
diff changeset
116 scaffold_id = os.path.basename(self.args.scaffold_path)
177b62db4320 Uploaded
greg
parents:
diff changeset
117 file_dir = os.path.join(self.args.scaffold_path, 'annot')
10
11a36e425c94 Uploaded
greg
parents: 8
diff changeset
118 # The scaffold naming convention must follow this pattern:
0
177b62db4320 Uploaded
greg
parents:
diff changeset
119 # <integer1>Gv<integer2>.<integer3>
177b62db4320 Uploaded
greg
parents:
diff changeset
120 # where integer 1 is the number of genomes in the scaffold_id. For example:
177b62db4320 Uploaded
greg
parents:
diff changeset
121 # 22Gv1.1 -> 22 genomes
177b62db4320 Uploaded
greg
parents:
diff changeset
122 # 12Gv1.0 -> 12 genomes
177b62db4320 Uploaded
greg
parents:
diff changeset
123 # 26Gv2.0 -> 26 genomes, etc.
177b62db4320 Uploaded
greg
parents:
diff changeset
124 num_genomes = int(scaffold_id.split("Gv")[0])
177b62db4320 Uploaded
greg
parents:
diff changeset
125 super_ortho_start_index = num_genomes + 1
177b62db4320 Uploaded
greg
parents:
diff changeset
126 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")):
177b62db4320 Uploaded
greg
parents:
diff changeset
127 items = os.path.basename(file_name).split(".")
177b62db4320 Uploaded
greg
parents:
diff changeset
128 clustering_method = items[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
129 # Save all clustering methods for later processing.
177b62db4320 Uploaded
greg
parents:
diff changeset
130 if clustering_method not in self.clustering_methods:
177b62db4320 Uploaded
greg
parents:
diff changeset
131 self.clustering_methods.append(clustering_method)
177b62db4320 Uploaded
greg
parents:
diff changeset
132 # Insert a row in to the plant_tribes_scaffold table.
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
133 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
0
177b62db4320 Uploaded
greg
parents:
diff changeset
134 args = [scaffold_id, clustering_method]
177b62db4320 Uploaded
greg
parents:
diff changeset
135 sql = """
177b62db4320 Uploaded
greg
parents:
diff changeset
136 INSERT INTO plant_tribes_scaffold
177b62db4320 Uploaded
greg
parents:
diff changeset
137 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
177b62db4320 Uploaded
greg
parents:
diff changeset
138 RETURNING id;
177b62db4320 Uploaded
greg
parents:
diff changeset
139 """
177b62db4320 Uploaded
greg
parents:
diff changeset
140 cur = self.update(sql, tuple(args))
177b62db4320 Uploaded
greg
parents:
diff changeset
141 self.flush()
177b62db4320 Uploaded
greg
parents:
diff changeset
142 scaffold_id_db = cur.fetchone()[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
143 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
177b62db4320 Uploaded
greg
parents:
diff changeset
144 with open(file_name, "r") as fh:
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
145 i = 0
751b36922d59 Uploaded
greg
parents: 0
diff changeset
146 for i2, line in enumerate(fh):
751b36922d59 Uploaded
greg
parents: 0
diff changeset
147 if i2 == 0:
0
177b62db4320 Uploaded
greg
parents:
diff changeset
148 # Skip first line.
177b62db4320 Uploaded
greg
parents:
diff changeset
149 continue
10
11a36e425c94 Uploaded
greg
parents: 8
diff changeset
150 line = line.rstrip('\n')
0
177b62db4320 Uploaded
greg
parents:
diff changeset
151 num_genes = 0
177b62db4320 Uploaded
greg
parents:
diff changeset
152 num_species = 0
177b62db4320 Uploaded
greg
parents:
diff changeset
153 items = line.split("\t")
177b62db4320 Uploaded
greg
parents:
diff changeset
154 orthogroup_id = int(items[0])
177b62db4320 Uploaded
greg
parents:
diff changeset
155 # Zero based items 1 to num_genomes consists of the
177b62db4320 Uploaded
greg
parents:
diff changeset
156 # number of species classified in the orthogroup (i.e.,
177b62db4320 Uploaded
greg
parents:
diff changeset
157 # species with at least 1 gene in the orthogroup).
177b62db4320 Uploaded
greg
parents:
diff changeset
158 for j in range(1, num_genomes):
177b62db4320 Uploaded
greg
parents:
diff changeset
159 j_int = int(items[j])
177b62db4320 Uploaded
greg
parents:
diff changeset
160 if j_int > 0:
177b62db4320 Uploaded
greg
parents:
diff changeset
161 # The species has at least 1 gene
177b62db4320 Uploaded
greg
parents:
diff changeset
162 num_species += 1
177b62db4320 Uploaded
greg
parents:
diff changeset
163 num_genes += j_int
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
164 # Get the auto-incremented row id to insert a row inot
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
165 # the plant_tribes_orthogroup table.
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
166 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');"
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
167 cur = self.conn.cursor()
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
168 cur.execute(sql)
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
169 plant_tribes_orthogroup_id = cur.fetchone()[0]
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
170 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
171 last_item = len(items)
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
172 for k in range(super_ortho_start_index, last_item):
12
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
173 bs_found = False
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
174 # The last 7 items in this range are as follows.
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
175 # items[last_item-6]: AHRD Descriptions
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
176 # items[last_item-5]: TAIR Gene(s) Descriptions
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
177 # items[last_item-4]: Pfam Domains
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
178 # items[last_item-3]: InterProScan Descriptions
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
179 # items[last_item-2]: GO Molecular Functions
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
180 # items[last_item-1]: GO Biological Processes
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
181 # items[last_item]: GO Cellular Components
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
182 # We'll translate each of these items into a JSON
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
183 # dictionary for inserting into the table.
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
184 if k >= (last_item-7) and k <= last_item:
12
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
185 json_str = str(items[k])
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
186 # Here is an example string:
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
187 # Phosphate transporter PHO1 [0.327] | Phosphate
12
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
188 for bs in BLACKLIST_STRINGS:
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
189 if json_str.find(bs) >= 0:
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
190 bs_found = True
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
191 args.append(None)
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
192 break
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
193 if not bs_found:
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
194 # We'll split the string on " | " to create each value.
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
195 # The keys will be zero-padded integers to enable sorting.
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
196 json_dict = dict()
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
197 json_vals = json_str.split(' | ')
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
198 for key_index, json_val in enumerate(json_vals):
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
199 # The zero-padded key is 1 based.
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
200 json_key = '%04d' % key_index
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
201 json_dict[json_key] = json_val
ef07c8756360 Uploaded
greg
parents: 11
diff changeset
202 args.append(json_dict)
11
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
203 else:
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
204 args.append('%s' % str(items[k]))
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
205 sql = self.pto_table.insert().values(args)
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
206 try:
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
207 self.engine.execute(sql)
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
208 except Exception as e:
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
209 msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e)
2fac73ec6ee8 Uploaded
greg
parents: 10
diff changeset
210 self.stop_err(msg)
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
211 i += 1
751b36922d59 Uploaded
greg
parents: 0
diff changeset
212 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
0
177b62db4320 Uploaded
greg
parents:
diff changeset
213 for file_name in glob.glob(os.path.join(file_dir, "*list")):
177b62db4320 Uploaded
greg
parents:
diff changeset
214 items = os.path.basename(file_name).split(".")
177b62db4320 Uploaded
greg
parents:
diff changeset
215 clustering_method = items[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
216 with open(file_name, "r") as fh:
177b62db4320 Uploaded
greg
parents:
diff changeset
217 for i, line in enumerate(fh):
177b62db4320 Uploaded
greg
parents:
diff changeset
218 items = line.split("\t")
177b62db4320 Uploaded
greg
parents:
diff changeset
219 # The key will be a combination of clustering_method and
177b62db4320 Uploaded
greg
parents:
diff changeset
220 # orthogroup_id separated by "^^" for easy splitting later.
177b62db4320 Uploaded
greg
parents:
diff changeset
221 key = "%s^^%s" % (clustering_method, items[0])
177b62db4320 Uploaded
greg
parents:
diff changeset
222 # The value is the gen_id with all white space replaced by "_".
177b62db4320 Uploaded
greg
parents:
diff changeset
223 val = items[1].replace("|", "_")
177b62db4320 Uploaded
greg
parents:
diff changeset
224 if key in self.scaffold_genes_dict:
177b62db4320 Uploaded
greg
parents:
diff changeset
225 self.scaffold_genes_dict[key].append(val)
177b62db4320 Uploaded
greg
parents:
diff changeset
226 else:
177b62db4320 Uploaded
greg
parents:
diff changeset
227 self.scaffold_genes_dict[key] = [val]
177b62db4320 Uploaded
greg
parents:
diff changeset
228
177b62db4320 Uploaded
greg
parents:
diff changeset
229 def process_scaffold_config_files(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
230 """
177b62db4320 Uploaded
greg
parents:
diff changeset
231 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config
177b62db4320 Uploaded
greg
parents:
diff changeset
232 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate.
177b62db4320 Uploaded
greg
parents:
diff changeset
233 2. Calculate the number of genes found
177b62db4320 Uploaded
greg
parents:
diff changeset
234 for each species and add the number to self.species_genes_dict.
177b62db4320 Uploaded
greg
parents:
diff changeset
235 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
177b62db4320 Uploaded
greg
parents:
diff changeset
236 populate the plant_tribes_taxon table.
177b62db4320 Uploaded
greg
parents:
diff changeset
237 """
177b62db4320 Uploaded
greg
parents:
diff changeset
238 scaffold_id = os.path.basename(self.args.scaffold_path)
177b62db4320 Uploaded
greg
parents:
diff changeset
239 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
177b62db4320 Uploaded
greg
parents:
diff changeset
240 self.log("Processing rooting order config: %s" % str(file_name))
177b62db4320 Uploaded
greg
parents:
diff changeset
241 # Populate self.species_ids_dict.
177b62db4320 Uploaded
greg
parents:
diff changeset
242 with open(file_name, "r") as fh:
177b62db4320 Uploaded
greg
parents:
diff changeset
243 for i, line in enumerate(fh):
177b62db4320 Uploaded
greg
parents:
diff changeset
244 line = line.strip()
177b62db4320 Uploaded
greg
parents:
diff changeset
245 if len(line) == 0 or line.startswith("#") or line.startswith("["):
177b62db4320 Uploaded
greg
parents:
diff changeset
246 # Skip blank lines, comments and section headers.
177b62db4320 Uploaded
greg
parents:
diff changeset
247 continue
177b62db4320 Uploaded
greg
parents:
diff changeset
248 # Example line:
177b62db4320 Uploaded
greg
parents:
diff changeset
249 # Physcomitrella patens=Phypa
177b62db4320 Uploaded
greg
parents:
diff changeset
250 items = line.split("=")
177b62db4320 Uploaded
greg
parents:
diff changeset
251 self.species_ids_dict[items[1]] = items[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
252 # Get lineage information for orthogrpoup taxa.
177b62db4320 Uploaded
greg
parents:
diff changeset
253 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()):
177b62db4320 Uploaded
greg
parents:
diff changeset
254 # The format of key is <clustering_method>^^<orthogroup_id>.
177b62db4320 Uploaded
greg
parents:
diff changeset
255 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001"
177b62db4320 Uploaded
greg
parents:
diff changeset
256 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^")
177b62db4320 Uploaded
greg
parents:
diff changeset
257 clustering_method = scaffold_genes_dict_key_items[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
258 # Get the list of genes for the current scaffold_genes_dict_key.
177b62db4320 Uploaded
greg
parents:
diff changeset
259 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key]
177b62db4320 Uploaded
greg
parents:
diff changeset
260 for gene_id in gene_list:
177b62db4320 Uploaded
greg
parents:
diff changeset
261 # Example species_code: Musac1.0, where
177b62db4320 Uploaded
greg
parents:
diff changeset
262 # species_name is Musac and version is 1.0.
177b62db4320 Uploaded
greg
parents:
diff changeset
263 species_code = gene_id.split("_")[1]
177b62db4320 Uploaded
greg
parents:
diff changeset
264 # Strip the version from the species_code.
177b62db4320 Uploaded
greg
parents:
diff changeset
265 species_code = species_code[0:5]
177b62db4320 Uploaded
greg
parents:
diff changeset
266 # Get the species_name from self.species_ids_dict.
177b62db4320 Uploaded
greg
parents:
diff changeset
267 species_name = self.species_ids_dict[species_code]
177b62db4320 Uploaded
greg
parents:
diff changeset
268 # Create a key for self.species_genes_dict, with the format:
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
269 # <clustering_method>^^<species_name>
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
270 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
0
177b62db4320 Uploaded
greg
parents:
diff changeset
271 # Add an entry to self.species_genes_dict, where the value
177b62db4320 Uploaded
greg
parents:
diff changeset
272 # is a list containing species_name and num_genes.
177b62db4320 Uploaded
greg
parents:
diff changeset
273 if species_genes_dict_key in self.species_genes_dict:
177b62db4320 Uploaded
greg
parents:
diff changeset
274 tup = self.species_genes_dict[species_genes_dict_key]
177b62db4320 Uploaded
greg
parents:
diff changeset
275 tup[1] += 1
177b62db4320 Uploaded
greg
parents:
diff changeset
276 self.species_genes_dict[species_genes_dict_key] = tup
177b62db4320 Uploaded
greg
parents:
diff changeset
277 else:
177b62db4320 Uploaded
greg
parents:
diff changeset
278 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
177b62db4320 Uploaded
greg
parents:
diff changeset
279 # Populate the plant_tribes_taxon table.
177b62db4320 Uploaded
greg
parents:
diff changeset
280 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
177b62db4320 Uploaded
greg
parents:
diff changeset
281 self.log("Processing taxa lineage config: %s" % str(file_name))
177b62db4320 Uploaded
greg
parents:
diff changeset
282 with open(file_name, "r") as fh:
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
283 for line in fh:
0
177b62db4320 Uploaded
greg
parents:
diff changeset
284 line = line.strip()
177b62db4320 Uploaded
greg
parents:
diff changeset
285 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
177b62db4320 Uploaded
greg
parents:
diff changeset
286 # Skip blank lines, comments and section headers.
177b62db4320 Uploaded
greg
parents:
diff changeset
287 continue
177b62db4320 Uploaded
greg
parents:
diff changeset
288 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
177b62db4320 Uploaded
greg
parents:
diff changeset
289 items = line.split("\t")
177b62db4320 Uploaded
greg
parents:
diff changeset
290 species_name = items[0]
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
291 i = 0
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
292 for clustering_method in self.clustering_methods:
8
3841f7252b1d Uploaded
greg
parents: 6
diff changeset
293 # The format of species_genes_dict_key is <clustering_method>^^<species_name>.
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
294 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
0
177b62db4320 Uploaded
greg
parents:
diff changeset
295 # Get the scaffold_rec for the current scaffold_id and clustering_method.
177b62db4320 Uploaded
greg
parents:
diff changeset
296 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
177b62db4320 Uploaded
greg
parents:
diff changeset
297 for scaffold_rec in self.scaffold_recs:
177b62db4320 Uploaded
greg
parents:
diff changeset
298 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
177b62db4320 Uploaded
greg
parents:
diff changeset
299 scaffold_id_db = scaffold_rec[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
300 # The value is a list containing species_name and num_genes.
177b62db4320 Uploaded
greg
parents:
diff changeset
301 val = self.species_genes_dict[species_genes_dict_key]
177b62db4320 Uploaded
greg
parents:
diff changeset
302 if species_name == val[0]:
177b62db4320 Uploaded
greg
parents:
diff changeset
303 num_genes = val[1]
177b62db4320 Uploaded
greg
parents:
diff changeset
304 else:
177b62db4320 Uploaded
greg
parents:
diff changeset
305 num_genes = 0
177b62db4320 Uploaded
greg
parents:
diff changeset
306 # Insert a row in to the plant_tribes_scaffold table.
177b62db4320 Uploaded
greg
parents:
diff changeset
307 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
177b62db4320 Uploaded
greg
parents:
diff changeset
308 sql = """
177b62db4320 Uploaded
greg
parents:
diff changeset
309 INSERT INTO plant_tribes_taxon
177b62db4320 Uploaded
greg
parents:
diff changeset
310 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
177b62db4320 Uploaded
greg
parents:
diff changeset
311 """
177b62db4320 Uploaded
greg
parents:
diff changeset
312 self.update(sql, tuple(args))
177b62db4320 Uploaded
greg
parents:
diff changeset
313 self.flush()
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
314 i += 1
3
48c13482e6c9 Uploaded
greg
parents: 2
diff changeset
315 self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name)))
0
177b62db4320 Uploaded
greg
parents:
diff changeset
316
177b62db4320 Uploaded
greg
parents:
diff changeset
317 def process_orthogroup_fasta_files(self):
177b62db4320 Uploaded
greg
parents:
diff changeset
318 """
177b62db4320 Uploaded
greg
parents:
diff changeset
319 1. Analyze all of the scaffold .fna and .faa files for each clustering
177b62db4320 Uploaded
greg
parents:
diff changeset
320 method to populate the aa_dict and dna_dict sequence dictionaries.
177b62db4320 Uploaded
greg
parents:
diff changeset
321 2. Use the populated sequence dictionaries to populate the plant_tribes_gene
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
322 and gene_scaffold_orthogroup_taxon_association tables.
0
177b62db4320 Uploaded
greg
parents:
diff changeset
323 """
177b62db4320 Uploaded
greg
parents:
diff changeset
324 scaffold_id = os.path.basename(self.args.scaffold_path)
177b62db4320 Uploaded
greg
parents:
diff changeset
325 aa_dict = {}
177b62db4320 Uploaded
greg
parents:
diff changeset
326 dna_dict = {}
177b62db4320 Uploaded
greg
parents:
diff changeset
327 # Populate aa_dict and dna_dict.
177b62db4320 Uploaded
greg
parents:
diff changeset
328 for clustering_method in self.clustering_methods:
177b62db4320 Uploaded
greg
parents:
diff changeset
329 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method)
177b62db4320 Uploaded
greg
parents:
diff changeset
330 for file_name in os.listdir(file_dir):
177b62db4320 Uploaded
greg
parents:
diff changeset
331 items = file_name.split(".")
177b62db4320 Uploaded
greg
parents:
diff changeset
332 orthogroup_id = items[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
333 file_extension = items[1]
177b62db4320 Uploaded
greg
parents:
diff changeset
334 if file_extension == "fna":
177b62db4320 Uploaded
greg
parents:
diff changeset
335 adict = dna_dict
177b62db4320 Uploaded
greg
parents:
diff changeset
336 else:
177b62db4320 Uploaded
greg
parents:
diff changeset
337 adict = aa_dict
177b62db4320 Uploaded
greg
parents:
diff changeset
338 file_path = os.path.join(file_dir, file_name)
177b62db4320 Uploaded
greg
parents:
diff changeset
339 with open(file_path, "r") as fh:
177b62db4320 Uploaded
greg
parents:
diff changeset
340 for i, line in enumerate(fh):
177b62db4320 Uploaded
greg
parents:
diff changeset
341 line = line.strip()
177b62db4320 Uploaded
greg
parents:
diff changeset
342 if len(line) == 0:
177b62db4320 Uploaded
greg
parents:
diff changeset
343 # Skip blank lines (shoudn't happen).
177b62db4320 Uploaded
greg
parents:
diff changeset
344 continue
177b62db4320 Uploaded
greg
parents:
diff changeset
345 if line.startswith(">"):
177b62db4320 Uploaded
greg
parents:
diff changeset
346 # Example line:
177b62db4320 Uploaded
greg
parents:
diff changeset
347 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110
177b62db4320 Uploaded
greg
parents:
diff changeset
348 gene_id = line.lstrip(">")
177b62db4320 Uploaded
greg
parents:
diff changeset
349 # The dictionary keys will combine the orthogroup_id,
177b62db4320 Uploaded
greg
parents:
diff changeset
350 # clustering method and gene id using the format
177b62db4320 Uploaded
greg
parents:
diff changeset
351 # ,orthogroup_id>^^<clustering_method>^^<gene_id>.
177b62db4320 Uploaded
greg
parents:
diff changeset
352 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id)
177b62db4320 Uploaded
greg
parents:
diff changeset
353 if combined_id not in adict:
177b62db4320 Uploaded
greg
parents:
diff changeset
354 # The value will be the dna sequence string..
177b62db4320 Uploaded
greg
parents:
diff changeset
355 adict[combined_id] = ""
177b62db4320 Uploaded
greg
parents:
diff changeset
356 else:
177b62db4320 Uploaded
greg
parents:
diff changeset
357 # Example line:
177b62db4320 Uploaded
greg
parents:
diff changeset
358 # ATGGAGAAGGACTTT
177b62db4320 Uploaded
greg
parents:
diff changeset
359 # Here combined_id is set because the fasta format specifies
177b62db4320 Uploaded
greg
parents:
diff changeset
360 # that all lines following the gene id defined in the if block
177b62db4320 Uploaded
greg
parents:
diff changeset
361 # above will be the sequence associated with that gene until
177b62db4320 Uploaded
greg
parents:
diff changeset
362 # the next gene id line is encountered.
177b62db4320 Uploaded
greg
parents:
diff changeset
363 sequence = adict[combined_id]
177b62db4320 Uploaded
greg
parents:
diff changeset
364 sequence = "%s%s" % (sequence, line)
177b62db4320 Uploaded
greg
parents:
diff changeset
365 adict[combined_id] = sequence
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
366 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables
0
177b62db4320 Uploaded
greg
parents:
diff changeset
367 # from the contents of aa_dict and dna_dict.
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
368 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.")
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
369 gi = 0
751b36922d59 Uploaded
greg
parents: 0
diff changeset
370 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
0
177b62db4320 Uploaded
greg
parents:
diff changeset
371 # The dictionary keys combine the orthogroup_id, clustering method and
177b62db4320 Uploaded
greg
parents:
diff changeset
372 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
177b62db4320 Uploaded
greg
parents:
diff changeset
373 items = combined_id.split("^^")
177b62db4320 Uploaded
greg
parents:
diff changeset
374 orthogroup_id = items[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
375 clustering_method = items[1]
177b62db4320 Uploaded
greg
parents:
diff changeset
376 gene_id = items[2]
177b62db4320 Uploaded
greg
parents:
diff changeset
377 # The value will be a list containing both
177b62db4320 Uploaded
greg
parents:
diff changeset
378 # clustering_method and the dna string.
177b62db4320 Uploaded
greg
parents:
diff changeset
379 dna_sequence = dna_dict[combined_id]
177b62db4320 Uploaded
greg
parents:
diff changeset
380 aa_sequence = aa_dict[combined_id]
177b62db4320 Uploaded
greg
parents:
diff changeset
381 # Get the species_code from the gene_id.
177b62db4320 Uploaded
greg
parents:
diff changeset
382 species_code = gene_id.split("_")[1]
177b62db4320 Uploaded
greg
parents:
diff changeset
383 # Strip the version from the species_code.
177b62db4320 Uploaded
greg
parents:
diff changeset
384 species_code = species_code[0:5]
177b62db4320 Uploaded
greg
parents:
diff changeset
385 # Get the species_name from self.species_ids_dict.
177b62db4320 Uploaded
greg
parents:
diff changeset
386 species_name = self.species_ids_dict[species_code]
177b62db4320 Uploaded
greg
parents:
diff changeset
387 # Get the plant_tribes_orthogroup primary key id for
177b62db4320 Uploaded
greg
parents:
diff changeset
388 # the orthogroup_id from the plant_tribes_orthogroup table.
177b62db4320 Uploaded
greg
parents:
diff changeset
389 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
177b62db4320 Uploaded
greg
parents:
diff changeset
390 cur = self.conn.cursor()
177b62db4320 Uploaded
greg
parents:
diff changeset
391 cur.execute(sql)
177b62db4320 Uploaded
greg
parents:
diff changeset
392 orthogroup_id_db = cur.fetchone()[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
393 # If the plant_tribes_gene table contains a row that has the gene_id,
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
394 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
0
177b62db4320 Uploaded
greg
parents:
diff changeset
395 # Get the taxon_id for the species_name from the plant_tribes_taxon table.
177b62db4320 Uploaded
greg
parents:
diff changeset
396 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
177b62db4320 Uploaded
greg
parents:
diff changeset
397 cur = self.conn.cursor()
177b62db4320 Uploaded
greg
parents:
diff changeset
398 cur.execute(sql)
5
cb986be6355e Uploaded
greg
parents: 4
diff changeset
399 taxon_id_db = cur.fetchone()[0]
0
177b62db4320 Uploaded
greg
parents:
diff changeset
400 # If the plant_tribes_gene table contains a row that has the gene_id,
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
401 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
0
177b62db4320 Uploaded
greg
parents:
diff changeset
402 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
177b62db4320 Uploaded
greg
parents:
diff changeset
403 cur = self.conn.cursor()
177b62db4320 Uploaded
greg
parents:
diff changeset
404 cur.execute(sql)
177b62db4320 Uploaded
greg
parents:
diff changeset
405 try:
177b62db4320 Uploaded
greg
parents:
diff changeset
406 gene_id_db = cur.fetchone()[0]
177b62db4320 Uploaded
greg
parents:
diff changeset
407 except:
177b62db4320 Uploaded
greg
parents:
diff changeset
408 # Insert a row into the plant_tribes_gene table.
5
cb986be6355e Uploaded
greg
parents: 4
diff changeset
409 args = [gene_id, dna_sequence, aa_sequence]
0
177b62db4320 Uploaded
greg
parents:
diff changeset
410 sql = """
177b62db4320 Uploaded
greg
parents:
diff changeset
411 INSERT INTO plant_tribes_gene
5
cb986be6355e Uploaded
greg
parents: 4
diff changeset
412 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s)
0
177b62db4320 Uploaded
greg
parents:
diff changeset
413 RETURNING id;
177b62db4320 Uploaded
greg
parents:
diff changeset
414 """
177b62db4320 Uploaded
greg
parents:
diff changeset
415 cur = self.update(sql, tuple(args))
177b62db4320 Uploaded
greg
parents:
diff changeset
416 self.flush()
177b62db4320 Uploaded
greg
parents:
diff changeset
417 gene_id_db = cur.fetchone()[0]
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
418 gi += 1
4
3c6514d810ae Uploaded
greg
parents: 3
diff changeset
419 if gi % 1000 == 0:
3c6514d810ae Uploaded
greg
parents: 3
diff changeset
420 self.log("Inserted 1000 more rows into the plant_tribes_gene table.")
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
421 # Insert a row into the gene_scaffold_orthogroup_taxon_association table.
0
177b62db4320 Uploaded
greg
parents:
diff changeset
422 # Get the scaffold_rec for the current scaffold_id and clustering_method.
177b62db4320 Uploaded
greg
parents:
diff changeset
423 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
177b62db4320 Uploaded
greg
parents:
diff changeset
424 for scaffold_rec in self.scaffold_recs:
177b62db4320 Uploaded
greg
parents:
diff changeset
425 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
177b62db4320 Uploaded
greg
parents:
diff changeset
426 scaffold_id_db = scaffold_rec[0]
5
cb986be6355e Uploaded
greg
parents: 4
diff changeset
427 args = [gene_id_db, scaffold_id_db, orthogroup_id_db, taxon_id_db]
0
177b62db4320 Uploaded
greg
parents:
diff changeset
428 sql = """
5
cb986be6355e Uploaded
greg
parents: 4
diff changeset
429 INSERT INTO gene_scaffold_orthogroup_taxon_association
cb986be6355e Uploaded
greg
parents: 4
diff changeset
430 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s);
0
177b62db4320 Uploaded
greg
parents:
diff changeset
431 """
177b62db4320 Uploaded
greg
parents:
diff changeset
432 cur = self.update(sql, tuple(args))
177b62db4320 Uploaded
greg
parents:
diff changeset
433 self.flush()
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
434 if gsoai % 1000 == 0:
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
435 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.")
2
751b36922d59 Uploaded
greg
parents: 0
diff changeset
436 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
6
9a4b0ae3d408 Uploaded
greg
parents: 5
diff changeset
437 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai)
0
177b62db4320 Uploaded
greg
parents:
diff changeset
438
8
3841f7252b1d Uploaded
greg
parents: 6
diff changeset
439
0
177b62db4320 Uploaded
greg
parents:
diff changeset
440 if __name__ == '__main__':
177b62db4320 Uploaded
greg
parents:
diff changeset
441 scaffold_loader = ScaffoldLoader()
177b62db4320 Uploaded
greg
parents:
diff changeset
442 scaffold_loader.run()
177b62db4320 Uploaded
greg
parents:
diff changeset
443 scaffold_loader.shutdown()