Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
changeset 12:ef07c8756360 draft
Uploaded
author | greg |
---|---|
date | Wed, 17 Oct 2018 10:07:38 -0400 |
parents | 2fac73ec6ee8 |
children | 6f1f5f242503 |
files | gene_family_scaffold_loader.py |
diffstat | 1 files changed, 22 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/gene_family_scaffold_loader.py Tue Oct 16 10:57:23 2018 -0400 +++ b/gene_family_scaffold_loader.py Wed Oct 17 10:07:38 2018 -0400 @@ -13,6 +13,11 @@ from sqlalchemy import create_engine, MetaData, Table from sqlalchemy.engine.url import make_url +BLACKLIST_STRINGS = ['Unknown protein(s)', + 'No TAIR description(s)', + 'Representative annotation below 0.1%' + 'Representative AHRD below 0.1%'] + class ScaffoldLoader(object): def __init__(self): @@ -161,10 +166,10 @@ cur = self.conn.cursor() cur.execute(sql) plant_tribes_orthogroup_id = cur.fetchone()[0] - args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] last_item = len(items) for k in range(super_ortho_start_index, last_item): + bs_found = False # The last 7 items in this range are as follows. # items[last_item-6]: AHRD Descriptions # items[last_item-5]: TAIR Gene(s) Descriptions @@ -176,18 +181,24 @@ # We'll translate each of these items into a JSON # dictionary for inserting into the table. if k >= (last_item-7) and k <= last_item: + json_str = str(items[k]) # Here is an example string: # Phosphate transporter PHO1 [0.327] | Phosphate - # We'll split the string on " | " to create each value. - # The keys will be zero-padded integers to enable sorting. - json_dict = dict() - json_str = str(items[k]) - json_vals = json_str.split(' | ') - for key_index, json_val in enumerate(json_vals): - # The zero-padded key is 1 based. - json_key = '%04d' % key_index - json_dict[json_key] = json_val - args.append(json_dict) + for bs in BLACKLIST_STRINGS: + if json_str.find(bs) >= 0: + bs_found = True + args.append(None) + break + if not bs_found: + # We'll split the string on " | " to create each value. + # The keys will be zero-padded integers to enable sorting. + json_dict = dict() + json_vals = json_str.split(' | ') + for key_index, json_val in enumerate(json_vals): + # The zero-padded key is 1 based. + json_key = '%04d' % key_index + json_dict[json_key] = json_val + args.append(json_dict) else: args.append('%s' % str(items[k])) sql = self.pto_table.insert().values(args)