Mercurial > repos > pimarin > data_manager_bakta
comparison data_manager/bakta_build_database.py @ 0:9d08486abf8e draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_bakta commit 5f320f165a7f454193df36cc2ec77a87d50ec80f-dirty"
| author | pimarin |
|---|---|
| date | Wed, 17 Aug 2022 14:42:22 +0000 |
| parents | |
| children | faae5d8ce0cb |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9d08486abf8e |
|---|---|
| 1 import hashlib | |
| 2 import json | |
| 3 import logging | |
| 4 import os | |
| 5 import shutil | |
| 6 import subprocess as sp | |
| 7 import stat | |
| 8 import sys | |
| 9 import tarfile | |
| 10 | |
| 11 from pathlib import Path | |
| 12 | |
| 13 from alive_progress import alive_bar | |
| 14 import requests | |
| 15 | |
| 16 import bakta | |
| 17 import bakta.config as cfg | |
| 18 import bakta.constants as bc | |
| 19 import bakta.utils as bu | |
| 20 | |
| 21 | |
# Module-level logger registered under the name 'DB'.
log = logging.getLogger('DB')


# Names of the files that check() requires to be present as readable regular
# files directly inside the database directory.
FILE_NAMES = [
    'antifam.h3f',
    'antifam.h3i',
    'antifam.h3m',
    'antifam.h3p',
    'bakta.db',
    'expert-protein-sequences.dmnd',
    'ncRNA-genes.i1f',
    'ncRNA-genes.i1i',
    'ncRNA-genes.i1m',
    'ncRNA-genes.i1p',
    'ncRNA-regions.i1f',
    'ncRNA-regions.i1i',
    'ncRNA-regions.i1m',
    'ncRNA-regions.i1p',
    'oric.fna',
    'orit.fna',
    'pfam.h3f',
    'pfam.h3i',
    'pfam.h3m',
    'pfam.h3p',
    'psc.dmnd',
    'rfam-go.tsv',
    'rRNA.i1f',
    'rRNA.i1i',
    'rRNA.i1m',
    'rRNA.i1p',
    'sorf.dmnd'
]
| 54 | |
| 55 | |
def check(db_path: Path) -> dict:
    """Check if database directory exists, is accessible and contains necessary files.

    Every failed check is logged and terminates the process via ``sys.exit``
    with an error message.

    :param db_path: path of the database directory, or ``None`` if it was
        neither provided nor detected
    :return: the parsed content of the database's ``version.json``; it is
        guaranteed to contain the keys 'date', 'major' and 'minor'
    """

    if(db_path is None):
        log.error('directory neither provided nor detected!')
        sys.exit('ERROR: database directory not provided nor detected! Please provide a valid path to the database directory.')

    # The flags must be OR-ed: 'os.R_OK & os.X_OK' evaluates to 0 (== os.F_OK)
    # and would only test for existence instead of read/execute permissions.
    if(not os.access(str(db_path), os.R_OK | os.X_OK)):
        log.error('directory (%s) not readable/accessible!', db_path)
        sys.exit(f'ERROR: database directory ({db_path}) not readable/accessible!')

    version_path = db_path.joinpath('version.json')
    if(not os.access(str(version_path), os.R_OK) or not version_path.is_file()):
        log.error('version file not readable!')
        sys.exit('ERROR: database version file (version.json) not readable!')

    try:
        with version_path.open() as fh:
            db_info = json.load(fh)
    except (OSError, ValueError):  # unreadable file, invalid JSON or invalid text encoding
        log.exception('could not parse database version file!')
        sys.exit('ERROR: could not parse database version file!')

    for key in ['date', 'major', 'minor']:
        if(key not in db_info):
            log.error('wrong db version info file content! missed key=%s', key)
            sys.exit(f"ERROR: wrong db version info file format! Missed key '{key}' in JSON structure.")

    # The database major version has to match the schema version of the installed Bakta exactly.
    log.info('detected: major=%i, minor=%i, date=%s', db_info['major'], db_info['minor'], db_info['date'])
    if(db_info['major'] < bakta.__db_schema_version__):
        log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
        sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update the database from https://doi.org/10.5281/zenodo.4247253")
    elif(db_info['major'] > bakta.__db_schema_version__):
        log.error('wrong database version detected! required=%i, detected=%i', bakta.__db_schema_version__, db_info['major'])
        sys.exit(f"ERROR: wrong database version detected!\nBakta version {bakta.__version__} requires database version {bakta.__db_schema_version__}.x, but {db_info['major']}.{db_info['minor']} was detected. Please, update Bakta or download a compatible database version from https://doi.org/10.5281/zenodo.4247253")

    for file_name in FILE_NAMES:
        path = db_path.joinpath(file_name)
        if(not os.access(str(path), os.R_OK) or not path.is_file()):
            log.error('file not readable! file=%s', file_name)
            sys.exit(f'ERROR: database file ({file_name}) not readable!')

    return db_info
| 99 | |
| 100 | |
def fetch_db_versions():
    """Fetch the description of all published database versions.

    Requests ``bc.DB_VERSIONS_URL`` and parses the response body as JSON.

    :return: the parsed JSON document
    :raises IOError: if the request fails or the server answers with an HTTP
        error status; the error is printed to stderr before it is re-raised
    """
    try:
        with requests.get(bc.DB_VERSIONS_URL) as resp:
            # Fail on 4xx/5xx here instead of trying to parse an error page as JSON.
            resp.raise_for_status()
            versions = json.loads(resp.content)
    except IOError as e:  # requests' exceptions derive from IOError
        print(e, file=sys.stderr)
        raise
    return versions
| 110 | |
| 111 | |
def download(db_url: str, tarball_path: Path):
    """Stream a remote file to disk while showing a progress bar.

    Terminates the process via ``sys.exit`` if the request fails, the server
    answers with an HTTP error status, or the file cannot be written.

    :param db_url: URL of the file to download
    :param tarball_path: path of the local file to write (opened in 'wb' mode)
    """
    try:
        with tarball_path.open('wb') as fh_out, requests.get(db_url, stream=True) as resp:
            # Without this check the body of an HTTP error response would be stored as the tarball.
            resp.raise_for_status()
            total_length = resp.headers.get('content-length')
            if(total_length is None):  # no content length header
                with alive_bar() as bar:
                    for data in resp.iter_content(chunk_size=1024*1024):
                        fh_out.write(data)
                        bar()
            else:
                total_length = int(int(total_length)/1024)  # length in Kb
                with alive_bar(total=total_length) as bar:
                    for data in resp.iter_content(chunk_size=1024*1024):
                        fh_out.write(data)
                        bar(incr=len(data)/1024)  # progress is counted in Kb as well
    except IOError:  # covers file system errors and requests' exceptions
        sys.exit(f'ERROR: Could not download file from Zenodo! url={db_url}, path={tarball_path}')
| 129 | |
| 130 | |
def calc_md5_sum(tarball_path: Path, buffer_size: int=1024*1024) -> str:
    """Compute the MD5 checksum of a file.

    The file is read in chunks so that its size does not affect memory usage.

    :param tarball_path: path of the file to hash
    :param buffer_size: number of bytes read per chunk
    :return: the MD5 digest as a hexadecimal string
    """
    digest = hashlib.md5()
    with tarball_path.open('rb') as stream:
        # read() returns b'' at the end of the file, which ends the iteration
        for chunk in iter(lambda: stream.read(buffer_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
| 139 | |
| 140 | |
def untar(tarball_path: Path, output_path: Path):
    """Extract a gzip-compressed tarball into a directory.

    Terminates the process via ``sys.exit`` if the archive cannot be read or
    extracted.

    :param tarball_path: path of the ``.tar.gz`` file
    :param output_path: directory to extract the archive content into
    """
    try:
        with tarball_path.open('rb') as fh_in, tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
            if(hasattr(tarfile, 'data_filter')):
                # Interpreters with extraction filters: refuse members that would
                # be written outside of output_path (the archive was downloaded).
                tar_file.extractall(path=str(output_path), filter='data')
            else:
                tar_file.extractall(path=str(output_path))
    except (OSError, tarfile.TarError, EOFError):
        # tarfile.TarError (invalid archive) and EOFError (truncated gzip
        # stream) are not OSError subclasses and have to be listed explicitly.
        sys.exit(f'ERROR: Could not extract {tarball_path} to {output_path}')
| 147 | |
| 148 | |
def main():
    """Command line entry point to list, download and update Bakta databases.

    Sub-commands:

    - ``list``: print the available database versions
    - ``download``: download, verify and extract a database into an output
      directory
    - ``update``: replace the content of an existing database directory with
      the most recent compatible database version

    ``download`` and ``update`` finish by updating the AMRFinderPlus database
    inside the database directory. Every error terminates the process via
    ``sys.exit`` with an error message.
    """
    # parse options and arguments
    parser = bu.init_parser(sub_command='_db')
    group_runtime = parser.add_argument_group('Runtime & auxiliary options')
    group_runtime.add_argument('--help', '-h', action='help', help='Show this help message and exit')
    group_runtime.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}')

    subparsers = parser.add_subparsers(dest='subcommand', help='sub-command help')
    parser_list = subparsers.add_parser('list', help='List available database versions')  # add list sub-command options
    parser_list.add_argument('--all', action='store_true', help='Show all versions including incompatible')

    parser_download = subparsers.add_parser('download', help='Download a database')  # add download sub-command options
    parser_download.add_argument('--output', '-o', action='store', default=Path.cwd(), help='output directory (default = current working directory)')
    parser_download.add_argument('--minor', '-n', action='store', type=int, default=0, help='Database minor version (default = most recent db minor version)')

    parser_update = subparsers.add_parser('update', help='Update an existing database to the most recent compatible version')  # add update sub-command options
    parser_update.add_argument('--db', '-d', action='store', default=None, help='Current database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.')
    parser_update.add_argument('--tmp-dir', '-t', action='store', dest='tmp_dir', default=Path.cwd(), help='Temporary directory to download & extract (default = current working directory)')

    args = parser.parse_args()
    if(args.subcommand == 'list'):
        print(f'Required database schema version: {bakta.__db_schema_version__}\n')
        versions = fetch_db_versions()
        if(not args.all):
            versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]

        print('Available DB versions:')
        for v in sorted(versions, key=lambda v: (v['major'], v['minor'])):
            print(f"{v['major']}.{v['minor']}\t{v['date']}\t{v['doi']}")
    elif(args.subcommand == 'download'):
        bu.test_dependency(bu.DEPENDENCY_AMRFINDERPLUS)
        output_path = cfg.check_output_path(args)

        print('fetch DB versions...')
        versions = fetch_db_versions()
        compatible_versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]
        if(len(compatible_versions) == 0):
            sys.exit(f'Error: no compatible version available for current major db version {bakta.__db_schema_version__}')
        else:
            print(f'\t... compatible DB versions: {len(compatible_versions)}')

        required_version = None
        if(args.minor > 0):
            # Search the compatible versions only: a minor number alone could
            # also match a database of an incompatible major version.
            for v in compatible_versions:
                if(v['minor'] == args.minor):
                    required_version = v
                    break
            if(required_version is None):
                sys.exit(f"requested DB minor version {args.minor} is not available. Please use 'bakta_db list' to get a list of available DB versions")
        else:
            # no minor version requested: use the most recent compatible one
            compatible_sorted = sorted(compatible_versions, key=lambda v: v['minor'], reverse=True)
            required_version = compatible_sorted[0]

        tarball_path = output_path.joinpath('db.tar.gz')
        db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
        print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
        download(db_url, tarball_path)
        print('\t... done')

        print('check MD5 sum...')
        md5_sum = calc_md5_sum(tarball_path)
        if(md5_sum == required_version['md5']):
            print(f'\t...database file OK: {md5_sum}')
        else:
            sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")

        print(f'extract DB tarball: file={tarball_path}, output={output_path}')
        untar(tarball_path, output_path)
        tarball_path.unlink()

        # the extracted database is expected in the 'db' sub-directory
        db_path = output_path.joinpath('db')
        db_info = check(db_path)
        if(db_info['major'] != required_version['major']):
            sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_info['major']}")
        elif(db_info['minor'] != required_version['minor']):
            sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_info['minor']}")
        print('successfully downloaded Bakta database!')
        print(f"\tversion: {required_version['major']}.{required_version['minor']}")
        print(f"\tDOI: {required_version['doi']}")
        print(f'\tpath: {db_path}')

        try:
            # directory: rwxrwxr-x, contained entries: read-only for user, group and others
            db_path.chmod(stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
            for db_file_path in db_path.iterdir():
                db_file_path.chmod(stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        except OSError:
            sys.exit(f'ERROR: cannot set read|execute permissions on new database! path={db_path}, owner={db_path.owner()}, group={db_path.group()}, permissions={oct(db_path.stat().st_mode )[-3:]}')

        print('update AMRFinderPlus database...')
        update_amrfinderplus_db(db_path)
        print('\t... done')

        print(f"\nRun Bakta using '--db {db_path}' or set a BAKTA_DB environment variable: 'export BAKTA_DB={db_path}'")
    elif(args.subcommand == 'update'):
        bu.test_dependency(bu.DEPENDENCY_AMRFINDERPLUS)
        tmp_path = cfg.check_tmp_path(args)
        db_old_path = cfg.check_db_path(args)
        db_old_info = check(db_old_path)
        print(f"existing database: v{db_old_info['major']}.{db_old_info['minor']}")
        print('fetch DB versions...')
        versions = fetch_db_versions()
        compatible_versions = [v for v in versions if v['major'] == bakta.__db_schema_version__]
        if(len(compatible_versions) == 0):
            sys.exit(f'Error: no compatible version available for current major db version {bakta.__db_schema_version__}')
        else:
            print(f'\t... compatible DB versions: {len(compatible_versions)}')

        compatible_sorted = sorted(compatible_versions, key=lambda v: v['minor'], reverse=True)
        if(compatible_sorted[0]['minor'] <= db_old_info['minor']):
            print(f"Database version {db_old_info['major']}.{db_old_info['minor']} is up-to-date")
            sys.exit()
        required_version = compatible_sorted[0]

        tarball_path = tmp_path.joinpath('db.tar.gz')
        db_url = f"https://zenodo.org/record/{required_version['record']}/files/db.tar.gz"
        print(f"download database: v{required_version['major']}.{required_version['minor']}, {required_version['date']}, DOI: {required_version['doi']}, URL: {db_url}...")
        download(db_url, tarball_path)
        print('\t... done')

        print('check MD5 sum...')
        md5_sum = calc_md5_sum(tarball_path)
        if(md5_sum == required_version['md5']):
            print(f'\t...database file OK: {md5_sum}')
        else:
            sys.exit(f"Error: corrupt database file! MD5 should be '{required_version['md5']}' but is '{md5_sum}'")

        print(f'extract DB tarball: file={tarball_path}, output-directory={tmp_path}')
        untar(tarball_path, tmp_path)
        tarball_path.unlink()

        # the extracted database is expected in the 'db' sub-directory
        db_new_path = tmp_path.joinpath('db')
        db_new_info = check(db_new_path)
        if(db_new_info['major'] != required_version['major']):
            sys.exit(f"ERROR: wrong major db detected! required={required_version['major']}, detected={db_new_info['major']}")
        elif(db_new_info['minor'] != required_version['minor']):
            sys.exit(f"ERROR: wrong minor db detected! required={required_version['minor']}, detected={db_new_info['minor']}")
        print('successfully downloaded Bakta DB:')
        print(f"\tversion: {required_version['major']}.{required_version['minor']}")
        print(f"\tDOI: {required_version['doi']}")
        print(f'\tpath: {db_new_path}')
        print('remove old database...')
        try:
            db_old_path.chmod(stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)  # set write permissions on old directory
            for db_old_file_path in db_old_path.iterdir():
                if(db_old_file_path.is_dir()):
                    db_old_file_path.chmod(stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
                else:
                    db_old_file_path.chmod(stat.S_IRUSR | stat.S_IWUSR)
        except OSError:
            sys.exit(f'ERROR: cannot set read|write|execute permissions on old database! path={db_old_path}, owner={db_old_path.owner()}, group={db_old_path.group()}, permissions={oct(db_old_path.stat().st_mode )[-3:]}')
        try:
            shutil.rmtree(db_old_path)
        except OSError:
            sys.exit(f'ERROR: cannot remove old database! path={db_old_path}, owner={db_old_path.owner()}, group={db_old_path.group()}, permissions={oct(db_old_path.stat().st_mode )[-3:]}')
        db_old_path.mkdir()  # re-create the (now empty) directory at the existing database path

        try:
            db_new_path.chmod(stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)  # set write permissions on db_new_path directory
            for db_new_file_path in db_new_path.iterdir():
                db_new_file_path.chmod(stat.S_IRUSR | stat.S_IWUSR)
        except OSError:
            sys.exit(f'ERROR: cannot set read|write|execute permissions on new database! path={db_new_path}, owner={db_new_path.owner()}, group={db_new_path.group()}, permissions={oct(db_new_path.stat().st_mode )[-3:]}')
        try:
            for db_new_file_path in db_new_path.iterdir():  # move new db files into old (existing) db directory
                file_name = db_new_file_path.name
                shutil.move(db_new_file_path, db_old_path.joinpath(file_name))
        except OSError:
            sys.exit(f'ERROR: cannot move new database to existing path! new-path={db_new_path}, existing-path={db_old_path.parent}')
        # NOTE(review): this removes the whole temporary directory, not only
        # db_new_path, while --tmp-dir defaults to the current working
        # directory - confirm that cfg.check_tmp_path() returns a dedicated directory.
        shutil.rmtree(tmp_path)

        try:
            # directory: rwxrwxr-x, contained entries: read-only for user, group and others
            db_old_path.chmod(stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
            for db_old_file_path in db_old_path.iterdir():
                db_old_file_path.chmod(stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        except OSError:
            sys.exit(f'ERROR: cannot set read(|execute) permissions on new database! path={db_old_path}, owner={db_old_path.owner()}, group={db_old_path.group()}, permissions={oct(db_old_path.stat().st_mode )[-3:]}')

        print('\t... done')

        print('update AMRFinderPlus database...')
        update_amrfinderplus_db(db_old_path)
        print('\t... done')

        print(f"\nRun Bakta using '--db {db_old_path}' or set a BAKTA_DB environment variable: 'export BAKTA_DB={db_old_path}'")
    else:
        parser.print_help()
        sys.exit('Error: no subcommand provided!')
| 336 | |
| 337 | |
def update_amrfinderplus_db(db_path: Path):
    """Run ``amrfinder_update`` for the AMRFinderPlus database of a Bakta database.

    The AMRFinderPlus database directory is ``amrfinderplus-db`` inside
    ``db_path``. A non-zero exit code of the tool terminates the process via
    ``sys.exit``.

    :param db_path: path of the Bakta database directory
    """
    amrfinderplus_db_path = db_path.joinpath('amrfinderplus-db')
    command = ['amrfinder_update', '--database', str(amrfinderplus_db_path), '--force_update']
    log.debug('cmd=%s', command)

    # capture both output streams as text; they are only logged on failure
    result = sp.run(command, stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True)
    if(result.returncode == 0):
        return

    log.debug('stdout=\'%s\', stderr=\'%s\'', result.stdout, result.stderr)
    log.warning('AMRFinderPlus failed! amrfinder-error-code=%d', result.returncode)
    sys.exit(f"ERROR: AMRFinderPlus failed! command: 'amrfinder_update --force_update --database {amrfinderplus_db_path}', error code: {result.returncode}")
| 356 | |
| 357 | |
# Run the command line interface only when this file is executed as a script.
if __name__ == '__main__':
    main()
