# HG changeset patch
# User greg
# Date 1539785258 14400
# Node ID ef07c87563609cf6a158fcc6298506991d408cb6
# Parent  2fac73ec6ee8883e7734f8b211aa9c2f5f1f08dc
Uploaded

diff -r 2fac73ec6ee8 -r ef07c8756360 gene_family_scaffold_loader.py
--- a/gene_family_scaffold_loader.py	Tue Oct 16 10:57:23 2018 -0400
+++ b/gene_family_scaffold_loader.py	Wed Oct 17 10:07:38 2018 -0400
@@ -13,6 +13,11 @@
 from sqlalchemy import create_engine, MetaData, Table
 from sqlalchemy.engine.url import make_url
 
+BLACKLIST_STRINGS = ['Unknown protein(s)',  # a field containing any of these is appended as None
+                     'No TAIR description(s)',
+                     'Representative annotation below 0.1%',
+                     'Representative AHRD below 0.1%']
+
 
 class ScaffoldLoader(object):
     def __init__(self):
@@ -161,10 +166,10 @@
                     cur = self.conn.cursor()
                     cur.execute(sql)
                     plant_tribes_orthogroup_id = cur.fetchone()[0]
-
                     args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes]
                     last_item = len(items)
                     for k in range(super_ortho_start_index, last_item):
+                        bs_found = False
                         # The last 7 items in this range are as follows.
                         # items[last_item-6]: AHRD Descriptions
                         # items[last_item-5]: TAIR Gene(s) Descriptions
@@ -176,18 +181,24 @@
                         # We'll translate each of these items into a JSON
                         # dictionary for inserting into the table.
                         if k >= (last_item-7) and k <= last_item:
+                            json_str = str(items[k])
                             # Here is an example string:
                             # Phosphate transporter PHO1 [0.327] | Phosphate
-                            # We'll split the string on " | " to create each value.
-                            # The keys will be zero-padded integers to enable sorting.
-                            json_dict = dict()
-                            json_str = str(items[k])
-                            json_vals = json_str.split(' | ')
-                            for key_index, json_val in enumerate(json_vals):
-                                # The zero-padded key is 1 based.
-                                json_key = '%04d' % key_index
-                                json_dict[json_key] = json_val
-                            args.append(json_dict)
+                            for bs in BLACKLIST_STRINGS:
+                                if json_str.find(bs) >= 0:
+                                    bs_found = True
+                                    args.append(None)
+                                    break
+                            if not bs_found:
+                                # We'll split the string on " | " to create each value.
+                                # The keys will be zero-padded integers to enable sorting.
+                                json_dict = dict()
+                                json_vals = json_str.split(' | ')
+                                for key_index, json_val in enumerate(json_vals):
+                                    # The zero-padded key is 0 based.
+                                    json_key = '%04d' % key_index
+                                    json_dict[json_key] = json_val
+                                args.append(json_dict)
                         else:
                             args.append('%s' % str(items[k]))
                     sql = self.pto_table.insert().values(args)