annotate ensure_synced.py @ 0:4f8134bfed02 draft default tip

Uploaded
author greg
date Thu, 15 Aug 2019 10:36:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4f8134bfed02 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
4f8134bfed02 Uploaded
greg
parents:
diff changeset
2 from __future__ import print_function
4f8134bfed02 Uploaded
greg
parents:
diff changeset
3
4f8134bfed02 Uploaded
greg
parents:
diff changeset
4 import argparse
4f8134bfed02 Uploaded
greg
parents:
diff changeset
5 import psycopg2
4f8134bfed02 Uploaded
greg
parents:
diff changeset
6 import sys
4f8134bfed02 Uploaded
greg
parents:
diff changeset
7
4f8134bfed02 Uploaded
greg
parents:
diff changeset
8 from sqlalchemy import create_engine
4f8134bfed02 Uploaded
greg
parents:
diff changeset
9 from sqlalchemy import MetaData
4f8134bfed02 Uploaded
greg
parents:
diff changeset
10 from sqlalchemy.engine.url import make_url
4f8134bfed02 Uploaded
greg
parents:
diff changeset
11
4f8134bfed02 Uploaded
greg
parents:
diff changeset
# Module-level SQLAlchemy MetaData instance, created without arguments.
metadata = MetaData()

# Line values that get_affy_ids_from_file() ignores when collecting ids:
# a stripped line equal to any of these entries is skipped.
SKIP_VALS = [
    '#CHROM',
    'POS',
    'ID',
    'REF',
    'ALT',
    'QUAL',
    'FILTER',
    'INFO',
    'FORMAT',
]
4f8134bfed02 Uploaded
greg
parents:
diff changeset
15
4f8134bfed02 Uploaded
greg
parents:
diff changeset
16
4f8134bfed02 Uploaded
greg
parents:
diff changeset
class EnsureSynced(object):
    """Check that the Affymetrix ids in a file match those in the database.

    The ids are read from the ``sample`` table (excluding samples whose
    genotype has ``coral_mlg_clonal_id = 'failed'``) and from a text file
    with one id per line.  A report is written to the ``--output`` file and
    the process exits with status 1 when the two sources disagree.
    """

    def __init__(self):
        """Parse the command line, open the output file and connect to the DB.

        Raises:
            AssertionError: if the connection string is not for PostgreSQL.
            Exception: whatever ``open``, SQLAlchemy or psycopg2 raise; the
                output file (and connection, if any) are closed first.
        """
        self.args = None
        self.conn = None
        self.parse_args()
        self.outfh = open(self.args.output, "w")
        try:
            self.connect_db()
            # NOTE(review): neither attribute is used elsewhere in this
            # class; passing an engine positionally to MetaData is legacy
            # "bound metadata" usage - confirm against the installed
            # SQLAlchemy version.
            self.engine = create_engine(self.args.database_connection_string)
            self.metadata = MetaData(self.engine)
        except Exception:
            # Do not leak the output handle or a half-initialised connection
            # when start-up fails.
            self.outfh.close()
            if self.conn is not None:
                self.conn.close()
            raise
        self.affy_ids_from_db = []
        self.affy_ids_from_file = []

    def connect_db(self):
        """Open a psycopg2 connection described by the connection string.

        Raises:
            AssertionError: if the URL's dialect is not ``postgresql``.
        """
        url = make_url(self.args.database_connection_string)
        if url.get_dialect().name != 'postgresql':
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped under ``python -O``; the exception type is kept
            # for backward compatibility.
            raise AssertionError('This script can only be used with PostgreSQL.')
        args = url.translate_connect_args(username='user')
        args.update(url.query)
        self.conn = psycopg2.connect(**args)

    def get_affy_ids_from_db(self):
        """Append every non-failed sample's ``affy_id`` to ``affy_ids_from_db``."""
        cmd = "SELECT affy_id FROM sample WHERE genotype_id NOT IN (SELECT id FROM genotype WHERE coral_mlg_clonal_id = 'failed') ORDER BY affy_id;"
        cur = self.conn.cursor()
        try:
            cur.execute(cmd)
            self.affy_ids_from_db.extend(row[0] for row in cur.fetchall())
        finally:
            # Release the cursor even when the query fails.
            cur.close()

    def get_affy_ids_from_file(self, f):
        """Append the ids found in file ``f`` to ``affy_ids_from_file`` and sort it.

        Lines are stripped; blank lines and lines equal to an entry of
        ``SKIP_VALS`` are ignored.

        Args:
            f: path of a text file holding one id per line.
        """
        with open(f) as fh:
            for line in fh:
                line = line.strip()
                # A blank (e.g. trailing) line is not an id; recording it
                # would produce a spurious mismatch.
                if not line or line in SKIP_VALS:
                    continue
                self.affy_ids_from_file.append(line)
        self.affy_ids_from_file.sort()

    def get_difference(self, list1, list2):
        """Return the items of the longer list that are absent from the other.

        When the lists have the same length, the items of ``list2`` missing
        from ``list1`` are returned.  The result is one-directional and in
        arbitrary order; ``run`` does not rely on it.
        """
        if len(list1) > len(list2):
            return list(set(list1) - set(list2))
        return list(set(list2) - set(list1))

    def _mismatch_report(self, db_ids, file_ids):
        """Return the log messages describing how the two id lists differ.

        The comparison ignores ordering, so it does not depend on the
        database collation matching Python's string ordering.

        Args:
            db_ids: ids read from the database.
            file_ids: ids read from the file.

        Returns:
            A list of messages, empty when both sides hold the same ids the
            same number of times.  Ids missing on either side are listed
            under their own header, each group sorted.
        """
        from collections import Counter

        if Counter(db_ids) == Counter(file_ids):
            return []
        db_header = "The database contains the following Affymetrix ids that are not in the file.\n"
        file_header = "The file contains the following Affymetrix ids that are not in the database.\n"
        # key=str keeps the sort safe should a non-string (e.g. NULL) id appear.
        only_in_db = sorted(set(db_ids) - set(file_ids), key=str)
        only_in_file = sorted(set(file_ids) - set(db_ids), key=str)
        messages = []
        if only_in_db:
            messages.append(db_header)
            messages.extend("%s\n" % affy_id for affy_id in only_in_db)
        if only_in_file:
            messages.append(file_header)
            messages.extend("%s\n" % affy_id for affy_id in only_in_file)
        if not messages:
            # Same distinct ids on both sides, so the difference is in how
            # often some of them occur.
            messages.append("Both sources contain the same Affymetrix ids, but some ids occur a different number of times.\n")
        return messages

    def log(self, msg):
        """Write ``msg`` followed by a newline to the output file."""
        self.outfh.write("%s\n" % msg)

    def parse_args(self):
        """Parse the command line into ``self.args``.

        All three options are required; argparse reports a missing one
        instead of the script failing later on a ``None`` value.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--database_connection_string', dest='database_connection_string', required=True, help='Postgres database connection string')
        parser.add_argument('--affy_ids_from_file', dest='affy_ids_from_file', required=True, help='Affy ids taken from all previously genotyped samples vcf file')
        parser.add_argument('--output', dest='output', required=True, help='Output dataset')
        self.args = parser.parse_args()

    def run(self):
        """Compare the database ids with the file ids and write the report.

        Returns normally when both sources agree.  Otherwise the ids missing
        on each side are logged, all resources are released via
        ``shutdown`` and the process exits with status 1.

        Raises:
            SystemExit: with code 1 when the file and database differ.
        """
        self.get_affy_ids_from_db()
        self.get_affy_ids_from_file(self.args.affy_ids_from_file)
        report = self._mismatch_report(self.affy_ids_from_db, self.affy_ids_from_file)
        in_sync = not report
        if in_sync:
            self.log("The selected file is in sync with the database.\n\n")
        else:
            self.log("The selected file is not in sync with the database.\n\n")
        self.log("Number of Affymetrix ids in the database: %d\n" % len(self.affy_ids_from_db))
        self.log("Number of Affymetrix ids in the file: %d\n" % len(self.affy_ids_from_file))
        if in_sync:
            return
        for message in report:
            self.log(message)
        # Close the output file and the database connection before exiting.
        self.shutdown()
        sys.exit(1)

    def shutdown(self):
        """Flush and close the output file, then close the DB connection."""
        self.outfh.flush()
        self.outfh.close()
        if self.conn is not None:
            self.conn.close()
4f8134bfed02 Uploaded
greg
parents:
diff changeset
98
4f8134bfed02 Uploaded
greg
parents:
diff changeset
99
4f8134bfed02 Uploaded
greg
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: constructing the checker parses the command line,
    # opens the output file and connects to the database; run() performs the
    # comparison and shutdown() releases the file and the connection.
    checker = EnsureSynced()
    checker.run()
    checker.shutdown()