# HG changeset patch # User greg # Date 1539785258 14400 # Node ID ef07c87563609cf6a158fcc6298506991d408cb6 # Parent 2fac73ec6ee8883e7734f8b211aa9c2f5f1f08dc Uploaded diff -r 2fac73ec6ee8 -r ef07c8756360 gene_family_scaffold_loader.py --- a/gene_family_scaffold_loader.py Tue Oct 16 10:57:23 2018 -0400 +++ b/gene_family_scaffold_loader.py Wed Oct 17 10:07:38 2018 -0400 @@ -13,6 +13,11 @@ from sqlalchemy import create_engine, MetaData, Table from sqlalchemy.engine.url import make_url +BLACKLIST_STRINGS = ['Unknown protein(s)', + 'No TAIR description(s)', + 'Representative annotation below 0.1%' + 'Representative AHRD below 0.1%'] + class ScaffoldLoader(object): def __init__(self): @@ -161,10 +166,10 @@ cur = self.conn.cursor() cur.execute(sql) plant_tribes_orthogroup_id = cur.fetchone()[0] - args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] last_item = len(items) for k in range(super_ortho_start_index, last_item): + bs_found = False # The last 7 items in this range are as follows. # items[last_item-6]: AHRD Descriptions # items[last_item-5]: TAIR Gene(s) Descriptions @@ -176,18 +181,24 @@ # We'll translate each of these items into a JSON # dictionary for inserting into the table. if k >= (last_item-7) and k <= last_item: + json_str = str(items[k]) # Here is an example string: # Phosphate transporter PHO1 [0.327] | Phosphate - # We'll split the string on " | " to create each value. - # The keys will be zero-padded integers to enable sorting. - json_dict = dict() - json_str = str(items[k]) - json_vals = json_str.split(' | ') - for key_index, json_val in enumerate(json_vals): - # The zero-padded key is 1 based. - json_key = '%04d' % key_index - json_dict[json_key] = json_val - args.append(json_dict) + for bs in BLACKLIST_STRINGS: + if json_str.find(bs) >= 0: + bs_found = True + args.append(None) + break + if not bs_found: + # We'll split the string on " | " to create each value. + # The keys will be zero-padded integers to enable sorting. + json_dict = dict() + json_vals = json_str.split(' | ') + for key_index, json_val in enumerate(json_vals): + # The zero-padded key is 1 based. + json_key = '%04d' % key_index + json_dict[json_key] = json_val + args.append(json_dict) else: args.append('%s' % str(items[k])) sql = self.pto_table.insert().values(args)