comparison gene_family_scaffold_loader.py @ 12:ef07c8756360 draft

Uploaded
author greg
date Wed, 17 Oct 2018 10:07:38 -0400
parents 2fac73ec6ee8
children 6f1f5f242503
comparison
equal deleted inserted replaced
11:2fac73ec6ee8 12:ef07c8756360
10 import sys 10 import sys
11 11
12 import psycopg2 12 import psycopg2
13 from sqlalchemy import create_engine, MetaData, Table 13 from sqlalchemy import create_engine, MetaData, Table
14 from sqlalchemy.engine.url import make_url 14 from sqlalchemy.engine.url import make_url
15
16 BLACKLIST_STRINGS = ['Unknown protein(s)',
17 'No TAIR description(s)',
18 'Representative annotation below 0.1%'
19 'Representative AHRD below 0.1%']
15 20
16 21
17 class ScaffoldLoader(object): 22 class ScaffoldLoader(object):
18 def __init__(self): 23 def __init__(self):
19 self.args = None 24 self.args = None
159 # the plant_tribes_orthogroup table. 164 # the plant_tribes_orthogroup table.
160 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');" 165 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');"
161 cur = self.conn.cursor() 166 cur = self.conn.cursor()
162 cur.execute(sql) 167 cur.execute(sql)
163 plant_tribes_orthogroup_id = cur.fetchone()[0] 168 plant_tribes_orthogroup_id = cur.fetchone()[0]
164
165 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] 169 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
166 last_item = len(items) 170 last_item = len(items)
167 for k in range(super_ortho_start_index, last_item): 171 for k in range(super_ortho_start_index, last_item):
172 bs_found = False
168 # The last 7 items in this range are as follows. 173 # The last 7 items in this range are as follows.
169 # items[last_item-6]: AHRD Descriptions 174 # items[last_item-6]: AHRD Descriptions
170 # items[last_item-5]: TAIR Gene(s) Descriptions 175 # items[last_item-5]: TAIR Gene(s) Descriptions
171 # items[last_item-4]: Pfam Domains 176 # items[last_item-4]: Pfam Domains
172 # items[last_item-3]: InterProScan Descriptions 177 # items[last_item-3]: InterProScan Descriptions
174 # items[last_item-1]: GO Biological Processes 179 # items[last_item-1]: GO Biological Processes
175 # items[last_item]: GO Cellular Components 180 # items[last_item]: GO Cellular Components
176 # We'll translate each of these items into a JSON 181 # We'll translate each of these items into a JSON
177 # dictionary for inserting into the table. 182 # dictionary for inserting into the table.
178 if k >= (last_item-7) and k <= last_item: 183 if k >= (last_item-7) and k <= last_item:
184 json_str = str(items[k])
179 # Here is an example string: 185 # Here is an example string:
180 # Phosphate transporter PHO1 [0.327] | Phosphate 186 # Phosphate transporter PHO1 [0.327] | Phosphate
181 # We'll split the string on " | " to create each value. 187 for bs in BLACKLIST_STRINGS:
182 # The keys will be zero-padded integers to enable sorting. 188 if json_str.find(bs) >= 0:
183 json_dict = dict() 189 bs_found = True
184 json_str = str(items[k]) 190 args.append(None)
185 json_vals = json_str.split(' | ') 191 break
186 for key_index, json_val in enumerate(json_vals): 192 if not bs_found:
187 # The zero-padded key is 0 based. 193 # We'll split the string on " | " to create each value.
188 json_key = '%04d' % key_index 194 # The keys will be zero-padded integers to enable sorting.
189 json_dict[json_key] = json_val 195 json_dict = dict()
190 args.append(json_dict) 196 json_vals = json_str.split(' | ')
197 for key_index, json_val in enumerate(json_vals):
198 # The zero-padded key is 0 based.
199 json_key = '%04d' % key_index
200 json_dict[json_key] = json_val
201 args.append(json_dict)
191 else: 202 else:
192 args.append('%s' % str(items[k])) 203 args.append('%s' % str(items[k]))
193 sql = self.pto_table.insert().values(args) 204 sql = self.pto_table.insert().values(args)
194 try: 205 try:
195 self.engine.execute(sql) 206 self.engine.execute(sql)