comparison data_manager/bakta_build_database.py @ 0:9d08486abf8e draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_bakta commit 5f320f165a7f454193df36cc2ec77a87d50ec80f-dirty"
author pimarin
date Wed, 17 Aug 2022 14:42:22 +0000
parents
children faae5d8ce0cb
comparison
equal deleted inserted replaced
-1:000000000000 0:9d08486abf8e
1 import hashlib
2 import json
3 import logging
4 import os
5 import shutil
6 import subprocess as sp
7 import stat
8 import sys
9 import tarfile
10
11 from pathlib import Path
12
13 from alive_progress import alive_bar
14 import requests
15
16 import bakta
17 import bakta.config as cfg
18 import bakta.constants as bc
19 import bakta.utils as bu
20
21
# Module-level logger used by the database helpers in this file.
log = logging.getLogger('DB')


# Files that must be present and readable inside a database directory;
# check() tests each of them in this order and aborts at the first one missing.
FILE_NAMES = [
    'antifam.h3f',
    'antifam.h3i',
    'antifam.h3m',
    'antifam.h3p',
    'bakta.db',
    'expert-protein-sequences.dmnd',
    'ncRNA-genes.i1f',
    'ncRNA-genes.i1i',
    'ncRNA-genes.i1m',
    'ncRNA-genes.i1p',
    'ncRNA-regions.i1f',
    'ncRNA-regions.i1i',
    'ncRNA-regions.i1m',
    'ncRNA-regions.i1p',
    'oric.fna',
    'orit.fna',
    'pfam.h3f',
    'pfam.h3i',
    'pfam.h3m',
    'pfam.h3p',
    'psc.dmnd',
    'rfam-go.tsv',
    'rRNA.i1f',
    'rRNA.i1i',
    'rRNA.i1m',
    'rRNA.i1p',
    'sorf.dmnd'
]
54
55
def check(db_path: Path) -> dict:
    """Check if database directory exists, is accessible and contains necessary files.

    Args:
        db_path: Path of the database directory, or None if no directory was
            provided/detected.

    Returns:
        The parsed content of the database's version.json, which is guaranteed
        to contain the keys 'date', 'major' and 'minor'.

    Every failed check terminates the process via sys.exit() with an error
    message (SystemExit); nothing is returned in that case.
    """

    if(db_path is None):
        log.error('directory neither provided nor detected!')
        sys.exit('ERROR: database directory not provided nor detected! Please provide a valid path to the database directory.')

    # Read AND execute (search) permission are both required on the directory.
    # The flags have to be OR-ed: `os.R_OK & os.X_OK` evaluates to 0, which is
    # os.F_OK and therefore only tests whether the path exists.
    if(not os.access(str(db_path), os.R_OK | os.X_OK)):
        log.error('directory (%s) not readable/accessible!', db_path)
        sys.exit(f'ERROR: database directory ({db_path}) not readable/accessible!')

    version_path = db_path.joinpath('version.json')
    if(not os.access(str(version_path), os.R_OK) or not version_path.is_file()):
        log.error('version file not readable!')
        sys.exit('ERROR: database version file (version.json) not readable!')

    try:
        with version_path.open() as fh:
            db_info = json.load(fh)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError as well as decoding errors;
        # anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        log.exception('could not parse database version file!')
        sys.exit('ERROR: could not parse database version file!')

    for key in ['date', 'major', 'minor']:
        if(key not in db_info):
            log.error('wrong db version info file content! missed key=%s', key)
            sys.exit(f"ERROR: wrong db version info file format! Missed key '{key}' in JSON structure.")

    log.info('detected: major=%i, minor=%i, date=%s', db_info['major'], db_info['minor'], db_info['date'])
    if(db_info['major'] < bakta.__db_schema_version__):
        # database is older than the schema this Bakta version needs
        log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
        sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update the database from https://doi.org/10.5281/zenodo.4247253")
    elif(db_info['major'] > bakta.__db_schema_version__):
        # database is newer than the schema this Bakta version understands
        log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
        sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update Bakta or download a compatible database version from https://doi.org/10.5281/zenodo.4247253")

    for file_name in FILE_NAMES:
        path = db_path.joinpath(file_name)
        if(not os.access(str(path), os.R_OK) or not path.is_file()):
            log.error('file not readable! file=%s', file_name)
            sys.exit(f'ERROR: database file ({file_name}) not readable!')

    return db_info
99
100
def fetch_db_versions():
    """Fetch the list of published database versions.

    Requests bc.DB_VERSIONS_URL and parses the response body as JSON.

    Returns:
        The parsed JSON document. Callers in this file treat it as a list of
        dicts and read the keys 'major', 'minor', 'date', 'doi', 'record'
        and 'md5'.

    Raises:
        IOError: on connection problems or an HTTP error status (requests'
            exceptions derive from IOError). The error is printed to stderr
            before it is re-raised.
    """
    try:
        with requests.get(bc.DB_VERSIONS_URL) as resp:
            # Fail on 4xx/5xx instead of trying to parse an error page as JSON.
            resp.raise_for_status()
            versions = json.loads(resp.content)
    except IOError as e:
        print(e, file=sys.stderr)
        raise  # bare raise keeps the original traceback
    else:
        return versions
110
111
def download(db_url: str, tarball_path: Path):
    """Download db_url and stream it to tarball_path while showing a progress bar.

    Args:
        db_url: URL of the file to download.
        tarball_path: Destination file; it is created or overwritten.

    Exits the process via sys.exit() if the request fails, the server answers
    with an HTTP error status, or the file cannot be written.
    """
    chunk_size = 1024*1024
    try:
        # Issue the request first so that a failed request does not leave an
        # empty (or truncated) destination file behind.
        with requests.get(db_url, stream=True) as resp:
            # Without this, the body of a 4xx/5xx answer would be stored as
            # if it were the database tarball.
            resp.raise_for_status()
            with tarball_path.open('wb') as fh_out:
                total_length = resp.headers.get('content-length')
                if(total_length is None):  # no content length header
                    with alive_bar() as bar:
                        for data in resp.iter_content(chunk_size=chunk_size):
                            fh_out.write(data)
                            bar()
                else:
                    total_length = int(int(total_length)/1024)  # length in Kb
                    with alive_bar(total=total_length) as bar:
                        for data in resp.iter_content(chunk_size=chunk_size):
                            fh_out.write(data)
                            bar(incr=len(data)/1024)
    except IOError:
        # requests' exceptions (including HTTPError) derive from IOError
        sys.exit(f'ERROR: Could not download file from Zenodo! url={db_url}, path={tarball_path}')
129
130
def calc_md5_sum(tarball_path: Path, buffer_size: int=1024*1024) -> str:
    """Return the hexadecimal MD5 digest of a file.

    The file is read in pieces of buffer_size bytes, so it never has to fit
    into memory as a whole.

    Args:
        tarball_path: File to hash.
        buffer_size: Number of bytes requested per read call.

    Returns:
        The MD5 digest as a hex string.
    """
    digest = hashlib.md5()
    with tarball_path.open('rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: stream.read(buffer_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
139
140
def untar(tarball_path: Path, output_path: Path):
    """Extract a gzip-compressed tarball into output_path.

    Args:
        tarball_path: The .tar.gz file to extract.
        output_path: Directory the archive content is extracted into.

    Exits the process via sys.exit() if the archive cannot be read, is
    corrupt or truncated, contains an entry that would be written outside of
    output_path, or if writing to output_path fails.
    """
    output_root = os.path.realpath(str(output_path))

    def _checked_members(tar_file):
        # Yield members lazily so the archive is still read only once.
        # realpath() also resolves links created by previously extracted
        # members, so an entry cannot escape via an archived symlink.
        for member in tar_file:
            target = os.path.realpath(os.path.join(output_root, member.name))
            if(os.path.commonpath([output_root, target]) != output_root):
                raise tarfile.TarError(f'archive entry outside of output directory: {member.name}')
            yield member

    try:
        with tarball_path.open('rb') as fh_in, tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
            tar_file.extractall(path=str(output_path), members=_checked_members(tar_file))
    except (OSError, tarfile.TarError, EOFError):
        # tarfile.TarError (e.g. ReadError for a corrupt archive) and EOFError
        # (truncated gzip stream) are not OSError subclasses.
        sys.exit(f'ERROR: Could not extract {tarball_path} to {output_path}')
147
148
# Permission sets used while installing / replacing a database.
_MODE_DIR_OWNER_ONLY = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR
_MODE_FILE_OWNER_ONLY = stat.S_IRUSR | stat.S_IWUSR
_MODE_DIR_SHARED = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
_MODE_FILE_READ_ONLY = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH


def _describe_path(path: Path) -> str:
    """Return 'path=..., owner=..., group=..., permissions=...' for error messages; never raises."""
    try:
        owner = path.owner()
    except (KeyError, NotImplementedError, OSError):
        owner = 'unknown'
    try:
        group = path.group()
    except (KeyError, NotImplementedError, OSError):
        group = 'unknown'
    try:
        permissions = oct(path.stat().st_mode)[-3:]
    except OSError:
        permissions = 'unknown'
    return f'path={path}, owner={owner}, group={group}, permissions={permissions}'


def _fetch_compatible_versions() -> list:
    """Fetch all published versions and return those matching the required schema (major) version."""
    print('fetch DB versions...')
    versions = fetch_db_versions()
    compatible_versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]
    if(len(compatible_versions) == 0):
        sys.exit(f'Error: no compatible version available for current major db version {bakta.__db_schema_version__}')
    print(f'\t... compatible DB versions: {len(compatible_versions)}')
    return compatible_versions


def _download_verified_tarball(required_version: dict, target_path: Path) -> Path:
    """Download the tarball of required_version into target_path, verify its MD5 sum and return its path."""
    tarball_path = target_path.joinpath('db.tar.gz')
    db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
    print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
    download(db_url, tarball_path)
    print('\t... done')

    print('check MD5 sum...')
    md5_sum = calc_md5_sum(tarball_path)
    if(md5_sum != required_version['md5']):
        sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")
    print(f'\t...database file OK: {md5_sum}')
    return tarball_path


def _check_extracted_db(db_path: Path, required_version: dict):
    """Validate an extracted database and make sure it is exactly the version that was requested."""
    db_info = check(db_path)
    if(db_info['major'] != required_version['major']):
        sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_info['major']}")
    elif(db_info['minor'] != required_version['minor']):
        sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_info['minor']}")


def _run_list(args):
    """Print the available database versions (only compatible ones unless --all is given)."""
    print(f'Required database schema version: {bakta.__db_schema_version__}\n')
    versions = fetch_db_versions()
    if(not args.all):
        versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]

    print('Available DB versions:')
    for v in sorted(versions, key=lambda v: (v['major'], v['minor'])):
        print(f"{v['major']}.{v['minor']}\t{v['date']}\t{v['doi']}")


def _run_download(args):
    """Download, verify and extract a database into the output directory, then update the AMRFinderPlus database."""
    bu.test_dependency(bu.DEPENDENCY_AMRFINDERPLUS)
    output_path = cfg.check_output_path(args)

    compatible_versions = _fetch_compatible_versions()

    if(args.minor > 0):
        # Only look at compatible versions: minor numbers are not unique
        # across major versions, so searching all versions could select a
        # database of an incompatible major version.
        required_version = next((v for v in compatible_versions if v['minor'] == args.minor), None)
        if(required_version is None):
            sys.exit(f"requested DB minor version {args.minor} is not available. Please use 'bakta_db list' to get a list of available DB versions")
    else:
        required_version = max(compatible_versions, key=lambda v: v['minor'])

    tarball_path = _download_verified_tarball(required_version, output_path)

    print(f'extract DB tarball: file={tarball_path}, output={output_path}')
    untar(tarball_path, output_path)
    tarball_path.unlink()

    db_path = output_path.joinpath('db')
    _check_extracted_db(db_path, required_version)
    print('successfully downloaded Bakta database!')
    print(f"\tversion: {required_version['major']}.{required_version['minor']}")
    print(f"\tDOI: {required_version['doi']}")
    print(f'\tpath: {db_path}')

    try:
        db_path.chmod(_MODE_DIR_SHARED)
        for db_file_path in db_path.iterdir():
            db_file_path.chmod(_MODE_FILE_READ_ONLY)
    except OSError:
        sys.exit(f'ERROR: cannot set read|execute permissions on new database! {_describe_path(db_path)}')

    print('update AMRFinderPlus database...')
    update_amrfinderplus_db(db_path)
    print('\t... done')

    print(f"\nRun Bakta using '--db {db_path}' or set a BAKTA_DB environment variable: 'export BAKTA_DB={db_path}'")


def _run_update(args):
    """Replace an existing database by the most recent compatible version, then update the AMRFinderPlus database."""
    bu.test_dependency(bu.DEPENDENCY_AMRFINDERPLUS)
    tmp_path = cfg.check_tmp_path(args)
    db_old_path = cfg.check_db_path(args)
    db_old_info = check(db_old_path)
    print(f"existing database: v{db_old_info['major']}.{db_old_info['minor']}")

    compatible_versions = _fetch_compatible_versions()
    required_version = max(compatible_versions, key=lambda v: v['minor'])
    if(required_version['minor'] <= db_old_info['minor']):
        print(f"Database version {db_old_info['major']}.{db_old_info['minor']} is up-to-date")
        sys.exit()

    tarball_path = _download_verified_tarball(required_version, tmp_path)

    print(f'extract DB tarball: file={tarball_path}, output-directory={tmp_path}')
    untar(tarball_path, tmp_path)
    tarball_path.unlink()

    db_new_path = tmp_path.joinpath('db')
    _check_extracted_db(db_new_path, required_version)
    print('successfully downloaded Bakta DB:')
    print(f"\tversion: {required_version['major']}.{required_version['minor']}")
    print(f"\tDOI: {required_version['doi']}")
    print(f'\tpath: {db_new_path}')

    print('remove old database...')
    try:
        # the installed database is read-only; make it writable so it can be removed
        db_old_path.chmod(_MODE_DIR_OWNER_ONLY)
        for db_old_file_path in db_old_path.iterdir():
            if(db_old_file_path.is_dir()):
                db_old_file_path.chmod(_MODE_DIR_OWNER_ONLY)
            else:
                db_old_file_path.chmod(_MODE_FILE_OWNER_ONLY)
    except OSError:
        sys.exit(f'ERROR: cannot set read|write|execute permissions on old database! {_describe_path(db_old_path)}')
    try:
        shutil.rmtree(str(db_old_path))
    except OSError:
        sys.exit(f'ERROR: cannot remove old database! {_describe_path(db_old_path)}')
    db_old_path.mkdir()

    try:
        db_new_path.chmod(_MODE_DIR_OWNER_ONLY)
        for db_new_file_path in db_new_path.iterdir():
            db_new_file_path.chmod(_MODE_FILE_OWNER_ONLY)
    except OSError:
        sys.exit(f'ERROR: cannot set read|write|execute permissions on new database! {_describe_path(db_new_path)}')
    try:
        # move new db files into old (existing) db directory; the listing is
        # materialized first because entries are removed while iterating
        for db_new_file_path in list(db_new_path.iterdir()):
            shutil.move(str(db_new_file_path), str(db_old_path.joinpath(db_new_file_path.name)))
    except OSError:
        sys.exit(f'ERROR: cannot move new database to existing path! new-path={db_new_path}, existing-path={db_old_path.parent}')
    # NOTE(review): this removes the whole directory returned by
    # cfg.check_tmp_path(); confirm that this is a dedicated scratch directory
    # and never the user's own --tmp-dir / current working directory.
    shutil.rmtree(tmp_path)

    try:
        db_old_path.chmod(_MODE_DIR_SHARED)
        for db_old_file_path in db_old_path.iterdir():
            db_old_file_path.chmod(_MODE_FILE_READ_ONLY)
    except OSError:
        sys.exit(f'ERROR: cannot set read(|execute) permissions on new database! {_describe_path(db_old_path)}')

    print('\t... done')

    print('update AMRFinderPlus database...')
    update_amrfinderplus_db(db_old_path)
    print('\t... done')

    print(f"\nRun Bakta using '--db {db_old_path}' or set a BAKTA_DB environment variable: 'export BAKTA_DB={db_old_path}'")


def main():
    """Command line entry point: parse the arguments and run the 'list', 'download' or 'update' sub-command.

    Exits with an error message (after printing the help) if no sub-command
    was given. The sub-commands terminate the process via sys.exit() on any
    failure.
    """
    # parse options and arguments
    parser = bu.init_parser(sub_command='_db')
    group_runtime = parser.add_argument_group('Runtime & auxiliary options')
    group_runtime.add_argument('--help', '-h', action='help', help='Show this help message and exit')
    group_runtime.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}')

    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')
    parser_list = subparsers.add_parser('list', help='List available database versions')  # add list sub-command options
    parser_list.add_argument('--all', action='store_true', help='Show all versions including incompatible')

    parser_download = subparsers.add_parser('download', help='Download a database')  # add download sub-command options
    parser_download.add_argument('--output', '-o', action='store', default=Path.cwd(), help='output directory (default = current working directory)')
    parser_download.add_argument('--minor', '-n', action='store', type=int, default=0, help='Database minor version (default = most recent db minor version)')

    parser_update = subparsers.add_parser('update', help='Update an existing database to the most recent compatible version')  # add update sub-command options
    parser_update.add_argument('--db', '-d', action='store', default=None, help='Current database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.')
    parser_update.add_argument('--tmp-dir', '-t', action='store', dest='tmp_dir', default=Path.cwd(), help='Temporary directory to download & extract (default = current working directory)')

    args = parser.parse_args()
    if(args.subcommand == 'list'):
        _run_list(args)
    elif(args.subcommand == 'download'):
        _run_download(args)
    elif(args.subcommand == 'update'):
        _run_update(args)
    else:
        parser.print_help()
        sys.exit('Error: no subcommand provided!')
336
337
def update_amrfinderplus_db(db_path: Path):
    """Run amrfinder_update for the 'amrfinderplus-db' directory below db_path.

    Args:
        db_path: Database directory containing (or going to contain) the
            'amrfinderplus-db' sub-directory.

    Exits the process via sys.exit() if amrfinder_update returns a non-zero
    exit code; its captured stdout/stderr are written to the debug log.
    """
    target_dir = db_path.joinpath('amrfinderplus-db')
    command = ['amrfinder_update', '--database', str(target_dir), '--force_update']
    log.debug('cmd=%s', command)

    result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True)
    if(result.returncode == 0):
        return

    log.debug("stdout='%s', stderr='%s'", result.stdout, result.stderr)
    log.warning('AMRFinderPlus failed! amrfinder-error-code=%d', result.returncode)
    sys.exit(f"ERROR: AMRFinderPlus failed! command: 'amrfinder_update --force_update --database {target_dir}', error code: {result.returncode}")
356
357
# Run the command line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()