annotate add_scaffold.py @ 9:3706dea6c2f1 draft default tip

Uploaded
author greg
date Tue, 22 May 2018 10:49:03 -0400
parents 9b2ede3e7100
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
2 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
4 database efficiently by bypassing the Galaxy model and operating directly on the database.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
5 PostgreSQL 9.1 or greater is required.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
6 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
7 import argparse
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
8 import glob
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
9 import os
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
10 import sys
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
11
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
12 import psycopg2
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
13 from sqlalchemy.engine.url import make_url
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
14
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
15
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
16 class AddScaffold(object):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
17 def __init__(self):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
18 self.args = None
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
19 self.clustering_methods = []
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
20 self.conn = None
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
21 self.gene_sequences_dict = {}
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
22 self.scaffold_genes_dict = {}
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
23 self.scaffold_recs = []
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
24 self.species_genes_dict = {}
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
25 self.species_ids_dict = {}
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
26 self.taxa_lineage_config = None
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
27 self.parse_args()
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
28 self.fh = open(self.args.output, "w")
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
29 self.connect_db()
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
30
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
31 def parse_args(self):
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
32 parser = argparse.ArgumentParser()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
34 parser.add_argument('--output', dest='output', help='Output dataset'),
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
36 self.args = parser.parse_args()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
37
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
38 def connect_db(self):
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
39 url = make_url(self.args.database_connection_string)
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
40 self.log('Connecting to database with URL: %s' % url)
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
41 args = url.translate_connect_args(username='user')
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
42 args.update(url.query)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
43 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
44 self.conn = psycopg2.connect(**args)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
45
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
46 def flush(self):
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
47 self.conn.commit()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
48
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
49 def shutdown(self):
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
50 self.conn.close()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
51
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
52 def update(self, sql, args):
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
53 try:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
54 cur = self.conn.cursor()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
55 cur.execute(sql, args)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
56 except Exception as e:
4
fa0822e74ed3 Uploaded
greg
parents: 3
diff changeset
57 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e)
fa0822e74ed3 Uploaded
greg
parents: 3
diff changeset
58 self.stop_err(msg)
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
59 return cur
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
60
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
61 def stop_err(self, msg):
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
62 sys.stderr.write(msg)
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
63 self.fh.flush()
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
64 self.fh.close()
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
65 sys.exit(1)
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
66
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
67 def log(self, msg):
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
68 self.fh.write("%s\n" % msg)
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
69 self.fh.flush()
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
70
6
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
71 @property
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
72 def can_add_scaffold(self):
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
73 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
74 Make sure the scaffold has not already been added.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
75 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
76 scaffold_id = os.path.basename(self.args.scaffold_path)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
77 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
78 cur = self.conn.cursor()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
79 cur.execute(sql)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
80 try:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
81 cur.fetchone()[0]
7
82822d1e5119 Uploaded
greg
parents: 6
diff changeset
82 # The scaffold has been added to the database.
6
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
83 return False
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
84 except:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
85 # The scaffold has not yet been added.
6
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
86 return True
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
87
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
88 def run(self):
6
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
89 if self.can_add_scaffold:
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
90 self.process_annot_dir()
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
91 self.process_scaffold_config_files()
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
92 self.process_orthogroup_fasta_files()
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
93 self.fh.flush()
7973201f23e7 Uploaded
greg
parents: 5
diff changeset
94 self.fh.close()
7
82822d1e5119 Uploaded
greg
parents: 6
diff changeset
95 else:
8
9b2ede3e7100 Uploaded
greg
parents: 7
diff changeset
96 self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path))
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
97
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
98 def process_annot_dir(self):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
99 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
100 First, parse all of the *.min_evalue.summary files in the
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
101 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
102 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
103 Next, parse all of the *.list files in the same directory to populate
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
104 self.scaffold_genes_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
105 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
106 scaffold_id = os.path.basename(self.args.scaffold_path)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
107 file_dir = os.path.join(self.args.scaffold_path, 'annot')
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
108 # The scaffol naming convention must follow this pattern:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
109 # <integer1>Gv<integer2>.<integer3>
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
110 # where integer 1 is the number of genomes in the scaffold_id. For example:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
111 # 22Gv1.1 -> 22 genomes
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
112 # 12Gv1.0 -> 12 genomes
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
113 # 26Gv2.0 -> 26 genomes, etc.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
114 num_genomes = int(scaffold_id.split("Gv")[0])
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
115 super_ortho_start_index = num_genomes + 1
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
116 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
117 items = os.path.basename(file_name).split(".")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
118 clustering_method = items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
119 # Save all clustering methods for later processing.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
120 if clustering_method not in self.clustering_methods:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
121 self.clustering_methods.append(clustering_method)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
122 # Insert a row in to the plant_tribes_scaffold table.
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s..." % (scaffold_id, clustering_method))
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
124 args = [scaffold_id, clustering_method]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
125 sql = """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
126 INSERT INTO plant_tribes_scaffold
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
128 RETURNING id;
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
129 """
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
130 cur = self.update(sql, tuple(args))
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
131 self.flush()
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
132 scaffold_id_db = cur.fetchone()[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
134 with open(file_name, "r") as fh:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
135 for i, line in enumerate(fh):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
136 if i == 0:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
137 # Skip first line.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
138 continue
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
139 num_genes = 0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
140 num_species = 0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
141 items = line.split("\t")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
142 orthogroup_id = int(items[0])
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
143 # Zero based items 1 to num_genomes consists of the
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
144 # number of species classified in the orthogroup (i.e.,
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
145 # species with at least 1 gene in the orthogroup).
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
146 for j in range(1, num_genomes):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
147 j_int = int(items[j])
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
148 if j_int > 0:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
149 # The species has at least 1 gene
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
150 num_species += 1
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
151 num_genes += j_int
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
152 # Insert a row into the plant_tribes_orthogroup table.
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
153 self.log("Inserting a row into the plant_tribes_orthogroup table...")
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
155 for k in range(super_ortho_start_index, len(items)):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
156 args.append('%s' % str(items[k]))
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
157 sql = """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
158 INSERT INTO plant_tribes_orthogroup
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
160 """
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
161 cur = self.update(sql, tuple(args))
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
162 self.flush()
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
163 for file_name in glob.glob(os.path.join(file_dir, "*list")):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
164 items = os.path.basename(file_name).split(".")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
165 clustering_method = items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
166 with open(file_name, "r") as fh:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
167 for i, line in enumerate(fh):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
168 items = line.split("\t")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
169 # The key will be a combination of clustering_method and
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
170 # orthogroup_id separated by "^^" for easy splitting later.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
171 key = "%s^^%s" % (clustering_method, items[0])
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
172 # The value is the gen_id with all white space replaced by "_".
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
173 val = items[1].replace("|", "_")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
174 if key in self.scaffold_genes_dict:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
175 self.scaffold_genes_dict[key].append(val)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
176 else:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
177 self.scaffold_genes_dict[key] = [val]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
178
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
179 def process_scaffold_config_files(self):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
180 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
181 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
182 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
183 2. Calculate the number of genes found
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
184 for each species and add the number to self.species_genes_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
185 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
186 populate the plant_tribes_taxon table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
187 """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
188 scaffold_id = os.path.basename(self.args.scaffold_path)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
189 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
190 self.log("Processing rooting order config: %s" % str(file_name))
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
191 # Populate self.species_ids_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
192 with open(file_name, "r") as fh:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
193 for i, line in enumerate(fh):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
194 line = line.strip()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
195 if len(line) == 0 or line.startswith("#") or line.startswith("["):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
196 # Skip blank lines, comments and section headers.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
197 continue
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
198 # Example line:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
199 # Physcomitrella patens=Phypa
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
200 items = line.split("=")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
201 self.species_ids_dict[items[1]] = items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
202 # Get lineage information for orthogrpoup taxa.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
203 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
204 # The format of key is <clustering_method>^^<orthogroup_id>.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
205 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001"
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
206 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
207 clustering_method = scaffold_genes_dict_key_items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
208 # Get the list of genes for the current scaffold_genes_dict_key.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
209 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
210 for gene_id in gene_list:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
211 # Example species_code: Musac1.0, where
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
212 # species_name is Musac and version is 1.0.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
213 species_code = gene_id.split("_")[1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
214 # Strip the version from the species_code.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
215 species_code = species_code[0:5]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
216 # Get the species_name from self.species_ids_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
217 species_name = self.species_ids_dict[species_code]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
218 # Create a key for self.species_genes_dict, with the format:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
219 # <clustering_method>^^<species_code>
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
220 species_genes_dict_key = "%s^^%s" % (clustering_method, species_code)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
221 # Add an entry to self.species_genes_dict, where the value
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
222 # is a list containing species_name and num_genes.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
223 if species_genes_dict_key in self.species_genes_dict:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
224 tup = self.species_genes_dict[species_genes_dict_key]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
225 tup[1] += 1
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
226 self.species_genes_dict[species_genes_dict_key] = tup
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
227 else:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
228 self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
229 # Populate the plant_tribes_taxon table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
230 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
231 self.log("Processing taxa lineage config: %s" % str(file_name))
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
232 with open(file_name, "r") as fh:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
233 for i, line in enumerate(fh):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
234 line = line.strip()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
235 if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
236 # Skip blank lines, comments and section headers.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
237 continue
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
238 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
239 items = line.split("\t")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
240 species_name = items[0]
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
241 self.log("Calculating the number of genes for species_name: %s" % str(species_name))
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
242 for species_genes_dict_key in sorted(self.species_genes_dict.keys()):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
243 # The format of species_genes_dict_key is <clustering_method>^^<species_code>.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
244 species_genes_dict_key_items = species_genes_dict_key.split("^^")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
245 clustering_method = species_genes_dict_key_items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
246 species_code = species_genes_dict_key_items[1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
247 # Get the scaffold_rec for the current scaffold_id and clustering_method.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
248 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
249 for scaffold_rec in self.scaffold_recs:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
250 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
251 scaffold_id_db = scaffold_rec[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
252 # The value is a list containing species_name and num_genes.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
253 val = self.species_genes_dict[species_genes_dict_key]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
254 if species_name == val[0]:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
255 num_genes = val[1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
256 else:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
257 num_genes = 0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
258 # Insert a row in to the plant_tribes_scaffold table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
259 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
260 sql = """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
261 INSERT INTO plant_tribes_taxon
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
262 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
263 """
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
264 self.update(sql, tuple(args))
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
265 self.flush()
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
266
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
267 def process_orthogroup_fasta_files(self):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
268 scaffold_id = os.path.basename(self.args.scaffold_path)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
269 aa_dict = {}
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
270 dna_dict = {}
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
271 # Populate aa_dict and dna_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
272 for clustering_method in self.clustering_methods:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
273 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
274 for file_name in os.listdir(file_dir):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
275 items = file_name.split(".")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
276 orthogroup_id = items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
277 file_extension = items[1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
278 if file_extension == "fna":
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
279 adict = dna_dict
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
280 else:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
281 adict = aa_dict
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
282 file_path = os.path.join(file_dir, file_name)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
283 with open(file_path, "r") as fh:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
284 for i, line in enumerate(fh):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
285 line = line.strip()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
286 if len(line) == 0:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
287 # Skip blank lines (shoudn't happen).
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
288 continue
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
289 if line.startswith(">"):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
290 # Example line:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
291 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
292 gene_id = line.lstrip(">")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
293 # The dictionary keys will combine the orthogroup_id,
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
294 # clustering method and gene id using the format
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
295 # ,orthogroup_id>^^<clustering_method>^^<gene_id>.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
296 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
297 if combined_id not in adict:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
298 # The value will be the dna sequence string..
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
299 adict[combined_id] = ""
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
300 else:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
301 # Example line:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
302 # ATGGAGAAGGACTTT
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
303 # Here combined_id is set because the fasta format specifies
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
304 # that all lines following the gene id defined in the if block
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
305 # above will be the sequence associated with that gene until
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
306 # the next gene id line is encountered.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
307 sequence = adict[combined_id]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
308 sequence = "%s%s" % (sequence, line)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
309 adict[combined_id] = sequence
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
310 # Populate the plant_tribes_gene and gen_scaffold_association tables
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
311 # from the contents of aa_dict and dna_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
312 for combined_id in sorted(dna_dict.keys()):
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
313 # The dictionary keys combine the orthogroup_id, clustering method and
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
314 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
315 items = combined_id.split("^^")
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
316 orthogroup_id = items[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
317 clustering_method = items[1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
318 gene_id = items[2]
9
3706dea6c2f1 Uploaded
greg
parents: 8
diff changeset
319 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_association tables with gene %s, scaffold %s and orthogroup %s..." % (gene_id, scaffold_id, orthogroup_id))
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
320 # The value will be a list containing both
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
321 # clustering_method and the dna string.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
322 dna_sequence = dna_dict[combined_id]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
323 aa_sequence = aa_dict[combined_id]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
324 # Get the species_code from the gene_id.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
325 species_code = gene_id.split("_")[1]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
326 # Strip the version from the species_code.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
327 species_code = species_code[0:5]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
328 # Get the species_name from self.species_ids_dict.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
329 species_name = self.species_ids_dict[species_code]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
330 # Get the plant_tribes_orthogroup primary key id for
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
331 # the orthogroup_id from the plant_tribes_orthogroup table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
332 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
333 cur = self.conn.cursor()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
334 cur.execute(sql)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
335 orthogroup_id_db = cur.fetchone()[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
336 # If the plant_tribes_gene table contains a row that has the gene_id,
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
337 # then we'll add a row only to the gene_scaffold_orthogroup_association table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
338 # Get the taxon_id for the species_name from the plant_tribes_taxon table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
339 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
340 cur = self.conn.cursor()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
341 cur.execute(sql)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
342 taxon_id = cur.fetchone()[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
343 # If the plant_tribes_gene table contains a row that has the gene_id,
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
344 # then we'll add a row only to the gene_scaffold_orthogroup_association table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
345 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
346 cur = self.conn.cursor()
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
347 cur.execute(sql)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
348 try:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
349 gene_id_db = cur.fetchone()[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
350 except:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
351 # Insert a row into the plant_tribes_gene table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
352 args = [gene_id, taxon_id, dna_sequence, aa_sequence]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
353 sql = """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
354 INSERT INTO plant_tribes_gene
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
355 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s, %s)
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
356 RETURNING id;
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
357 """
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
358 cur = self.update(sql, tuple(args))
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
359 self.flush()
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
360 gene_id_db = cur.fetchone()[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
361 # Insert a row into the gene_scaffold_orthogroup_association table.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
362 # Get the scaffold_rec for the current scaffold_id and clustering_method.
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
363 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
364 for scaffold_rec in self.scaffold_recs:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
365 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
366 scaffold_id_db = scaffold_rec[0]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
367 args = [gene_id_db, scaffold_id_db, orthogroup_id_db]
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
368 sql = """
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
369 INSERT INTO gene_scaffold_orthogroup_association
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
370 VALUES (nextval('gene_scaffold_orthogroup_association_id_seq'), %s, %s, %s);
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
371 """
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
372 cur = self.update(sql, tuple(args))
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
373 self.flush()
0
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
374
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
375
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
376 if __name__ == '__main__':
fdcced0f4ae4 Uploaded
greg
parents:
diff changeset
377 add_scaffold = AddScaffold()
3
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
378 add_scaffold.run()
04ad7b5d22dd Uploaded
greg
parents: 0
diff changeset
379 add_scaffold.shutdown()