Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
comparison gene_family_scaffold_loader.py @ 12:ef07c8756360 draft
Uploaded
author | greg |
---|---|
date | Wed, 17 Oct 2018 10:07:38 -0400 |
parents | 2fac73ec6ee8 |
children | 6f1f5f242503 |
comparison
equal
deleted
inserted
replaced
11:2fac73ec6ee8 | 12:ef07c8756360 |
---|---|
10 import sys | 10 import sys |
11 | 11 |
12 import psycopg2 | 12 import psycopg2 |
13 from sqlalchemy import create_engine, MetaData, Table | 13 from sqlalchemy import create_engine, MetaData, Table |
14 from sqlalchemy.engine.url import make_url | 14 from sqlalchemy.engine.url import make_url |
15 | |
16 BLACKLIST_STRINGS = ['Unknown protein(s)', | |
17 'No TAIR description(s)', | |
18 'Representative annotation below 0.1%' | |
19 'Representative AHRD below 0.1%'] | |
15 | 20 |
16 | 21 |
17 class ScaffoldLoader(object): | 22 class ScaffoldLoader(object): |
18 def __init__(self): | 23 def __init__(self): |
19 self.args = None | 24 self.args = None |
159 # the plant_tribes_orthogroup table. | 164 # the plant_tribes_orthogroup table. |
160 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');" | 165 sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');" |
161 cur = self.conn.cursor() | 166 cur = self.conn.cursor() |
162 cur.execute(sql) | 167 cur.execute(sql) |
163 plant_tribes_orthogroup_id = cur.fetchone()[0] | 168 plant_tribes_orthogroup_id = cur.fetchone()[0] |
164 | |
165 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] | 169 args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] |
166 last_item = len(items) | 170 last_item = len(items) |
167 for k in range(super_ortho_start_index, last_item): | 171 for k in range(super_ortho_start_index, last_item): |
172 bs_found = False | |
168 # The last 7 items in this range are as follows. | 173 # The last 7 items in this range are as follows. |
169 # items[last_item-6]: AHRD Descriptions | 174 # items[last_item-6]: AHRD Descriptions |
170 # items[last_item-5]: TAIR Gene(s) Descriptions | 175 # items[last_item-5]: TAIR Gene(s) Descriptions |
171 # items[last_item-4]: Pfam Domains | 176 # items[last_item-4]: Pfam Domains |
172 # items[last_item-3]: InterProScan Descriptions | 177 # items[last_item-3]: InterProScan Descriptions |
174 # items[last_item-1]: GO Biological Processes | 179 # items[last_item-1]: GO Biological Processes |
175 # items[last_item]: GO Cellular Components | 180 # items[last_item]: GO Cellular Components |
176 # We'll translate each of these items into a JSON | 181 # We'll translate each of these items into a JSON |
177 # dictionary for inserting into the table. | 182 # dictionary for inserting into the table. |
178 if k >= (last_item-7) and k <= last_item: | 183 if k >= (last_item-7) and k <= last_item: |
184 json_str = str(items[k]) | |
179 # Here is an example string: | 185 # Here is an example string: |
180 # Phosphate transporter PHO1 [0.327] | Phosphate | 186 # Phosphate transporter PHO1 [0.327] | Phosphate |
181 # We'll split the string on " | " to create each value. | 187 for bs in BLACKLIST_STRINGS: |
182 # The keys will be zero-padded integers to enable sorting. | 188 if json_str.find(bs) >= 0: |
183 json_dict = dict() | 189 bs_found = True |
184 json_str = str(items[k]) | 190 args.append(None) |
185 json_vals = json_str.split(' | ') | 191 break |
186 for key_index, json_val in enumerate(json_vals): | 192 if not bs_found: |
187 # The zero-padded key is 0 based. | 193 # We'll split the string on " | " to create each value. |
188 json_key = '%04d' % key_index | 194 # The keys will be zero-padded integers to enable sorting. |
189 json_dict[json_key] = json_val | 195 json_dict = dict() |
190 args.append(json_dict) | 196 json_vals = json_str.split(' | ') |
197 for key_index, json_val in enumerate(json_vals): | |
198 # The zero-padded key is 0 based. | |
199 json_key = '%04d' % key_index | |
200 json_dict[json_key] = json_val | |
201 args.append(json_dict) | |
191 else: | 202 else: |
192 args.append('%s' % str(items[k])) | 203 args.append('%s' % str(items[k])) |
193 sql = self.pto_table.insert().values(args) | 204 sql = self.pto_table.insert().values(args) |
194 try: | 205 try: |
195 self.engine.execute(sql) | 206 self.engine.execute(sql) |