Mercurial > repos > pimarin > data_manager_bakta
comparison data_manager/bakta_build_database.py @ 27:2879a0e702d5 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database commit 3bfd33ae9741216e50373ddd04914d82f9731883
author | pimarin |
---|---|
date | Wed, 23 Aug 2023 14:38:16 +0000 |
parents | 0408796bce2a |
children |
Legend for the comparison below: equal, deleted, inserted, replaced
26:0408796bce2a | 27:2879a0e702d5 |
---|---|
1 import argparse | 1 import argparse |
2 import hashlib | 2 import hashlib |
3 import json | 3 import json |
4 import os | 4 import os |
5 import re | |
5 import sys | 6 import sys |
6 import tarfile | 7 import tarfile |
7 from datetime import datetime | 8 from datetime import datetime |
8 from pathlib import Path | 9 from pathlib import Path |
9 | 10 |
14 class GetBaktaDatabaseInfo: | 15 class GetBaktaDatabaseInfo: |
15 """ | 16 """ |
16 Extract bakta database information to make a json file for data_manager | 17 Extract bakta database information to make a json file for data_manager |
17 """ | 18 """ |
18 | 19 |
19 def __init__(self, | 20 def __init__( |
20 data_table_name="bakta_database", | 21 self, |
21 db_name=Path.cwd().joinpath("db"), | 22 data_table_name="bakta_database", |
22 db_version="latest", | 23 db_version="latest", |
23 test_mode=False): | 24 tarball_name="db.tar.gz", |
25 test_mode=False, | |
26 ): | |
24 self.bakta_table_list = None | 27 self.bakta_table_list = None |
25 self.db_url = None | 28 self.db_url = None |
29 self.db_name = "bakta-db" | |
30 self.db_type = "" | |
26 self.data_table_entry = None | 31 self.data_table_entry = None |
27 self.data_table_name = data_table_name | 32 self.data_table_name = data_table_name |
28 self.db_name = db_name | 33 self.tar_name = tarball_name |
29 self.db_version = db_version | 34 self.db_version = db_version |
30 self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json' | 35 self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json" |
31 self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json' | 36 self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json" |
32 self.test_mode = test_mode | 37 self.test_mode = test_mode |
38 | |
39 def get_database_type(self): | |
40 self.light_db = bool(re.search(pattern="light", string=self.db_version)) | |
41 self.db_version = self.db_version.split(sep="_")[0] | |
42 if self.light_db: | |
43 self.db_type = "light" | |
44 self.tar_name = "db-light.tar.gz" | |
45 self.md5 = self.fetch_db_versions()["md5-light"] | |
46 else: | |
47 self.md5 = self.fetch_db_versions()["md5"] | |
33 | 48 |
34 def get_data_table_format(self): | 49 def get_data_table_format(self): |
35 """ | 50 """ |
36 Skeleton of a data_table format | 51 Skeleton of a data_table format |
37 return: a data table formated for json output | 52 return: a data table formated for json output |
38 """ | 53 """ |
39 self.data_table_entry = { | 54 self.data_table_entry = {"data_tables": {self.data_table_name: {}}} |
40 "data_tables": { | |
41 self.data_table_name: {} | |
42 } | |
43 } | |
44 return self.data_table_entry | 55 return self.data_table_entry |
45 | 56 |
46 def fetch_db_versions(self, db_version="latest"): | 57 def fetch_db_versions(self): |
47 """ | 58 """ |
48 List bakta database info related to the db_version selected | 59 List bakta database info related to the db_version selected |
49 """ | 60 """ |
50 if self.test_mode is True: | 61 |
62 if self.test_mode: | |
51 self.DB_VERSIONS_URL = self.DB_TEST_URL | 63 self.DB_VERSIONS_URL = self.DB_TEST_URL |
52 try: | 64 try: |
53 with requests.get(self.DB_VERSIONS_URL) as resp: | 65 with requests.get(self.DB_VERSIONS_URL) as resp: |
54 versions = json.loads(resp.content) | 66 versions = json.loads(resp.content) |
55 except IOError as e: | 67 except IOError as e: |
56 print(e, file=sys.stderr) | 68 print(e, file=sys.stderr) |
57 raise e | 69 raise e |
58 else: | 70 |
59 if db_version == "latest": | 71 if self.db_version == "latest": |
60 db_date_list = [] | 72 db_date_list = [] |
61 for db_dic in versions: | 73 for db_dic in versions: |
62 db_date_list.append(datetime.strptime(db_dic["date"], | 74 db_date_list.append( |
63 '%Y-%m-%d').date()) | 75 datetime.strptime(db_dic["date"], "%Y-%m-%d").date() |
64 filtered_version = max(versions, key=lambda x: x['date']) | 76 ) |
65 else: | 77 filtered_version = max(versions, key=lambda x: x["date"]) |
66 filtered_version = None | 78 else: |
67 for item in versions: | 79 filtered_version = None |
68 if '{0}.{1}'.format(item["major"], item["minor"]) == db_version: | 80 for item in versions: |
69 filtered_version = item | 81 if "{0}.{1}".format(item["major"], item["minor"]) == self.db_version: |
70 break | 82 filtered_version = item |
71 if filtered_version is None: | 83 break |
72 print("No matching version detected in the list") | 84 if filtered_version is None: |
73 if filtered_version is not None: | 85 print("No matching version detected in the list") |
74 self.db_url = f"https://zenodo.org/record/" \ | 86 else: |
75 f"{filtered_version['record']}/files/db.tar.gz" | 87 self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}" |
76 self.db_version = db_version | 88 return filtered_version |
77 return filtered_version | |
78 | 89 |
79 def get_data_manager(self, bakta_database_info): | 90 def get_data_manager(self, bakta_database_info): |
80 self.bakta_table_list = self.get_data_table_format() | 91 self.bakta_table_list = self.get_data_table_format() |
81 bakta_name = f"V{bakta_database_info['major']}." \ | 92 bakta_name = ( |
82 f"{bakta_database_info['minor']}_" \ | 93 f"V{bakta_database_info['major']}." |
83 f"{bakta_database_info['date']}" | 94 f"{bakta_database_info['minor']}{self.db_type}_" |
84 tool_version = str(f"{bakta_database_info['software-min']['major']}." | 95 f"{bakta_database_info['date']}" |
85 f"{bakta_database_info['software-min']['minor']}") | 96 ) |
86 data_info = dict(value=bakta_name, | 97 tool_version = str( |
87 dbkey=bakta_database_info['record'], | 98 f"{bakta_database_info['software-min']['major']}." |
88 bakta_version=tool_version, | 99 f"{bakta_database_info['software-min']['minor']}" |
89 path="db") | 100 ) |
101 data_info = dict( | |
102 value=bakta_name, | |
103 dbkey=bakta_database_info["record"], | |
104 bakta_version=tool_version, | |
105 path=self.db_name, | |
106 ) | |
90 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] | 107 self.bakta_table_list["data_tables"][self.data_table_name] = [data_info] |
91 return self.bakta_table_list | 108 return self.bakta_table_list |
92 | 109 |
93 | 110 |
94 class InstallBaktaDatabase(GetBaktaDatabaseInfo): | 111 class InstallBaktaDatabase(GetBaktaDatabaseInfo): |
96 Download the bakta database, | 113 Download the bakta database, |
97 check md5 sum, | 114 check md5 sum, |
98 untar the download db and update for the amrfinderplus database | 115 untar the download db and update for the amrfinderplus database |
99 """ | 116 """ |
100 | 117 |
101 def __init__(self, | 118 def __init__( |
102 db_dir=Path.cwd(), | 119 self, |
103 db_name="bakta", | 120 db_dir=Path.cwd(), |
104 tarball_name="db.tar.gz", | 121 db_name="bakta-db", |
105 test_mode=False): | 122 db_version="latest", |
123 test_mode=False | |
124 ): | |
106 super().__init__() | 125 super().__init__() |
107 self.md5 = None | 126 self.md5 = None |
127 self.db_version = db_version | |
108 self.db_dir = db_dir | 128 self.db_dir = db_dir |
109 self.db_name = db_name | 129 self.db_name = db_name |
110 self.tarball_name = tarball_name | 130 self.tarball_path = "" |
111 self.tarball_path = None | |
112 self.test_mode = test_mode | 131 self.test_mode = test_mode |
132 self.get_database_type() | |
113 | 133 |
114 def download(self): | 134 def download(self): |
115 self.db_name = f'{self.db_name}_{self.db_version}' | 135 #self.db_name = f"{self.db_name}_{self.db_version}{self.db_type}" |
116 bakta_path = Path(self.db_dir).joinpath(self.tarball_name) | 136 bakta_path = Path(self.db_dir).joinpath(self.tar_name) |
117 try: | 137 try: |
118 with bakta_path.open('wb') as fh_out, \ | 138 with bakta_path.open("wb") as fh_out, requests.get( |
119 requests.get(self.db_url, stream=True) as resp: | 139 self.db_url, stream=True) as resp: |
120 total_length = resp.headers.get('content-length') | 140 total_length = resp.headers.get("content-length") |
121 if total_length is None: # no content length header | 141 if total_length is None: # no content length header |
122 for data in resp.iter_content(chunk_size=1024 * 1024): | 142 for data in resp.iter_content(chunk_size=1024 * 1024): |
123 fh_out.write(data) | 143 fh_out.write(data) |
124 else: | 144 else: |
125 for data in resp.iter_content(chunk_size=1024 * 1024): | 145 for data in resp.iter_content(chunk_size=1024 * 1024): |
126 fh_out.write(data) | 146 fh_out.write(data) |
127 print(f'Download bakta database {self.db_version}') | 147 print(f"Download bakta database {self.db_version}") |
128 self.tarball_path = bakta_path | 148 self.tarball_path = bakta_path |
129 except IOError: | 149 except IOError: |
130 print(f'ERROR: Could not download file from Zenodo!' | 150 print( |
131 f' url={self.db_url}, path={self.tarball_name}') | 151 f"ERROR: Could not download file from Zenodo!" |
152 f" url={self.db_url}, to={self.tarball_path}" | |
153 ) | |
132 | 154 |
133 def untar(self): | 155 def untar(self): |
134 db_path = Path(self.db_dir).as_posix() | 156 db_path = Path(self.db_dir).joinpath(self.db_name) |
135 try: | 157 try: |
136 with self.tarball_path.open('rb') as fh_in, \ | 158 with self.tarball_path.open("rb") as fh_in, tarfile.open( |
137 tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file: | 159 fileobj=fh_in, mode="r:gz" |
160 ) as tar_file: | |
138 tar_file.extractall(path=db_path) | 161 tar_file.extractall(path=db_path) |
139 print(f'Untar the database in {db_path}') | 162 print(f"Untar the database in {db_path}") |
140 return db_path | 163 # return db_path |
141 except OSError: | 164 except OSError: |
142 sys.exit(f'ERROR: Could not extract {self.tarball_name} ' | 165 sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {db_path}") |
143 f'to {self.db_name}') | |
144 | 166 |
145 def calc_md5_sum(self, buffer_size=1048576): | 167 def calc_md5_sum(self, buffer_size=1048576): |
146 tarball_path = Path(self.db_dir).joinpath(self.tarball_name) | 168 tarball_path = Path(self.db_dir).joinpath(self.tar_name) |
147 self.md5 = self.fetch_db_versions(db_version=self.db_version)["md5"] | |
148 md5 = hashlib.md5() | 169 md5 = hashlib.md5() |
149 with tarball_path.open('rb') as fh: | 170 with tarball_path.open("rb") as fh: |
150 data = fh.read(buffer_size) | 171 data = fh.read(buffer_size) |
151 while data: | 172 while data: |
152 md5.update(data) | 173 md5.update(data) |
153 data = fh.read(buffer_size) | 174 data = fh.read(buffer_size) |
154 if md5.hexdigest() == self.md5: | 175 if md5.hexdigest() == self.md5: |
155 print('\t...md5 control database OK') | 176 print("\t...md5 control database OK") |
156 else: | 177 else: |
157 print(f"Error: corrupt database file! " | 178 print( |
158 f"calculated md5 = {md5.hexdigest()}" | 179 f"Error: corrupt database file! " |
159 f" different from {self.md5} ") | 180 f"calculated md5 = {md5.hexdigest()}" |
160 | 181 f" different from {self.md5} " |
161 | 182 ) |
162 """ | |
163 This is the method to download the amrfinderplus database need by bakta. | |
164 Deprecated to use the amrfinderplus data_manager | |
165 def update_amrfinderplus_db(self): | |
166 amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db" | |
167 if self.db_version == "test": | |
168 cmd = [ | |
169 'amrfinder_update', | |
170 '--database', str(amrfinderplus_db_path), | |
171 '--force_update', | |
172 '--help' | |
173 ] | |
174 else: | |
175 cmd = [ | |
176 'amrfinder_update', | |
177 '--database', str(amrfinderplus_db_path), | |
178 '--force_update' | |
179 ] | |
180 proc = sp.run( | |
181 cmd, | |
182 universal_newlines=True | |
183 ) | |
184 if proc.returncode != 0: | |
185 print(f"ERROR: AMRFinderPlus failed! " | |
186 f"command: 'amrfinder_update --force_update" | |
187 f" --database {amrfinderplus_db_path}'") | |
188 else: | |
189 print("AMRFinderPlus database download") | |
190 """ | |
191 | 183 |
192 | 184 |
193 def parse_arguments(): | 185 def parse_arguments(): |
194 # parse options and arguments | 186 # parse options and arguments |
195 arg_parser = argparse.ArgumentParser() | 187 arg_parser = argparse.ArgumentParser() |
196 arg_parser.add_argument("data_manager_json") | 188 arg_parser.add_argument("data_manager_json") |
197 arg_parser.add_argument("-d", "--database_version", | 189 arg_parser.add_argument( |
198 help='Select the database version ' | 190 "-d", |
199 '(major and minor eg. 4.0),' | 191 "--database_version", |
200 'default is the latest version', | 192 help="Select the database version " |
201 default="latest", | 193 "(major and minor eg. 4.0)," |
202 required=True) | 194 "default is the latest version", |
203 arg_parser.add_argument("-t", "--test", action='store_true', | 195 default="latest", |
204 help="option to test the script with an empty database") | 196 required=True, |
197 ) | |
198 arg_parser.add_argument( | |
199 "-t", | |
200 "--test", | |
201 action="store_true", | |
202 help="option to test the script with an empty database", | |
203 ) | |
205 return arg_parser.parse_args() | 204 return arg_parser.parse_args() |
206 | 205 |
207 | 206 |
208 def main(): | 207 def main(): |
209 all_args = parse_arguments() | 208 all_args = parse_arguments() |
210 with open(all_args.data_manager_json) as fh: | 209 with open(all_args.data_manager_json) as fh: |
211 params = json.load(fh) | 210 params = json.load(fh) |
212 target_dir = params['output_data'][0]['extra_files_path'] | 211 target_dir = params["output_data"][0]["extra_files_path"] |
213 os.makedirs(target_dir) | 212 os.makedirs(target_dir) |
214 # init the class to download bakta db | 213 # init the class to download bakta db |
215 bakta_upload = InstallBaktaDatabase(test_mode=all_args.test) | 214 bakta_upload = InstallBaktaDatabase( |
216 bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version) | 215 test_mode=all_args.test, db_version=all_args.database_version |
216 ) | |
217 bakta_db = bakta_upload.fetch_db_versions() | |
217 # update the path for galaxy | 218 # update the path for galaxy |
218 bakta_upload.db_dir = target_dir | 219 bakta_upload.db_dir = target_dir |
219 # download the database | 220 # download the database |
220 bakta_upload.download() | 221 bakta_upload.download() |
221 # check md5 sum | 222 # check md5 sum |
222 bakta_upload.calc_md5_sum() | 223 bakta_upload.calc_md5_sum() |
223 # untar db | 224 # untar db |
224 bakta_upload.untar() | 225 bakta_upload.untar() |
225 # make the data_manager metadata | 226 # make the data_manager metadata |
226 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) | 227 bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db) |
227 with open(all_args.data_manager_json, 'w') as fh: | 228 with open(all_args.data_manager_json, "w") as fh: |
228 json.dump(bakta_data_manager, fh, sort_keys=True) | 229 json.dump(bakta_data_manager, fh, sort_keys=True) |
229 | 230 |
230 | 231 |
231 if __name__ == '__main__': | 232 if __name__ == "__main__": |
232 main() | 233 main() |