comparison data_manager/fetch_mothur_reference_data.py @ 3:2004bb845685 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
| author | iuc |
|---|---|
| date | Fri, 25 Jun 2021 09:36:36 +0000 |
| parents | 2ffd2cdc5089 |
| children | |
| 2:2ffd2cdc5089 | 3:2004bb845685 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
| 2 # | 2 # |
| 3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools | 3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools |
| | 4 import io |
| 4 import json | 5 import json |
| 5 import optparse | 6 import optparse |
| 6 import os | 7 import os |
| 7 import shutil | 8 import shutil |
| 8 import sys | 9 import sys |
| 9 import tarfile | 10 import tarfile |
| 10 import tempfile | 11 import tempfile |
| 11 import urllib2 | 12 import urllib.error |
| | 13 import urllib.parse |
| | 14 import urllib.request |
| 12 import zipfile | 15 import zipfile |
| 13 from functools import reduce | 16 from functools import reduce |
| 14 | 17 |
| 15 # When extracting files from archives, skip names that | 18 # When extracting files from archives, skip names that |
| 16 # start with the following strings | 19 # start with the following strings |
| 36 "lookup_gs20": { | 39 "lookup_gs20": { |
| 37 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] | 40 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] |
| 38 }, | 41 }, |
| 39 # RDP reference files | 42 # RDP reference files |
| 40 # http://www.mothur.org/wiki/RDP_reference_files | 43 # http://www.mothur.org/wiki/RDP_reference_files |
| 44 "RDP_v18": { | |
| 45 "16S rRNA RDP training set 18": | |
| 46 [ | |
| 47 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ], | |
| 48 "16S rRNA PDS training set 18": | |
| 49 [ | |
| 50 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ], | |
| 51 }, | |
| 41 "RDP_v16": { | 52 "RDP_v16": { |
| 42 "16S rRNA RDP training set 16": | 53 "16S rRNA RDP training set 16": |
| 43 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], | 54 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], |
| 44 "16S rRNA PDS training set 16": | 55 "16S rRNA PDS training set 16": |
| 45 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], | 56 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], |
| 74 "RDP training set 6": | 85 "RDP training set 6": |
| 75 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], | 86 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], |
| 76 }, | 87 }, |
| 77 # Silva reference files | 88 # Silva reference files |
| 78 # http://www.mothur.org/wiki/Silva_reference_files | 89 # http://www.mothur.org/wiki/Silva_reference_files |
| 90 "silva_release_138.1": { | |
| 91 "SILVA release 138.1": | |
| 92 [ | |
| 93 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz", | |
| 94 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ], | |
| 95 }, | |
| 79 "silva_release_128": { | 96 "silva_release_128": { |
| 80 "SILVA release 128": | 97 "SILVA release 128": |
| 81 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", | 98 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", |
| 82 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], | 99 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], |
| 83 }, | 100 }, |
| 158 NB the directory pointed to by 'extra_files_path' | 175 NB the directory pointed to by 'extra_files_path' |
| 159 doesn't exist initially, it is the job of the script | 176 doesn't exist initially, it is the job of the script |
| 160 to create it if necessary. | 177 to create it if necessary. |
| 161 | 178 |
| 162 """ | 179 """ |
| 163 params = json.loads(open(jsonfile).read()) | 180 with open(jsonfile) as fh: |
| | 181 params = json.load(fh) |
| 164 return (params['param_dict'], | 182 return (params['param_dict'], |
| 165 params['output_data'][0]['extra_files_path']) | 183 params['output_data'][0]['extra_files_path']) |
| 166 | 184 |
| 167 | 185 |
| 168 # Utility functions for creating data table dictionaries | 186 # Utility functions for creating data table dictionaries |
| 170 # Example usage: | 188 # Example usage: |
| 171 # >>> d = create_data_tables_dict() | 189 # >>> d = create_data_tables_dict() |
| 172 # >>> add_data_table(d,'my_data') | 190 # >>> add_data_table(d,'my_data') |
| 173 # >>> add_data_table_entry(dict(dbkey='hg19',value='human')) | 191 # >>> add_data_table_entry(dict(dbkey='hg19',value='human')) |
| 174 # >>> add_data_table_entry(dict(dbkey='mm9',value='mouse')) | 192 # >>> add_data_table_entry(dict(dbkey='mm9',value='mouse')) |
| 175 # >>> print str(json.dumps(d)) | 193 # >>> print(json.dumps(d)) |
| 176 def create_data_tables_dict(): | 194 def create_data_tables_dict(): |
| 177 """Return a dictionary for storing data table information | 195 """Return a dictionary for storing data table information |
| 178 | 196 |
| 179 Returns a dictionary that can be used with 'add_data_table' | 197 Returns a dictionary that can be used with 'add_data_table' |
| 180 and 'add_data_table_entry' to store information about a | 198 and 'add_data_table_entry' to store information about a |
| 227 system. | 245 system. |
| 228 | 246 |
| 229 Returns the name that the file is saved with. | 247 Returns the name that the file is saved with. |
| 230 | 248 |
| 231 """ | 249 """ |
| 232 print("Downloading %s" % url) | 250 print(f"Downloading {url}") |
| 233 if not target: | 251 if not target: |
| 234 target = os.path.basename(url) | 252 target = os.path.basename(url) |
| 235 if wd: | 253 if wd: |
| 236 target = os.path.join(wd, target) | 254 target = os.path.join(wd, target) |
| 237 print("Saving to %s" % target) | 255 print(f"Saving to {target}") |
| 238 open(target, 'wb').write(urllib2.urlopen(url).read()) | 256 with open(target, 'wb') as fh: |
| | 257 url_h = urllib.request.urlopen(url) |
| | 258 while True: |
| | 259 buffer = url_h.read(io.DEFAULT_BUFFER_SIZE) |
| | 260 if buffer == b"": |
| | 261 break |
| | 262 fh.write(buffer) |
| 239 return target | 263 return target |
| 240 | 264 |
| 241 | 265 |
| 242 def unpack_zip_archive(filen, wd=None): | 266 def unpack_zip_archive(filen, wd=None): |
| 243 """Extract files from a ZIP archive | 267 """Extract files from a ZIP archive |
| 253 Once all the files are extracted the ZIP archive | 277 Once all the files are extracted the ZIP archive |
| 254 file is deleted from the file system. | 278 file is deleted from the file system. |
| 255 | 279 |
| 256 """ | 280 """ |
| 257 if not zipfile.is_zipfile(filen): | 281 if not zipfile.is_zipfile(filen): |
| 258 print("%s: not ZIP formatted file") | 282 print(f"{filen}: not ZIP formatted file") |
| 259 return [filen] | 283 return [filen] |
| 260 file_list = [] | 284 file_list = [] |
| 261 z = zipfile.ZipFile(filen) | 285 with zipfile.ZipFile(filen) as z: |
| 262 for name in z.namelist(): | 286 for name in z.namelist(): |
| 263 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): | 287 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): |
| 264 print("Ignoring %s" % name) | 288 print(f"Ignoring {name}") |
| 265 continue | 289 continue |
| 266 if wd: | 290 if wd: |
| 267 target = os.path.join(wd, name) | 291 target = os.path.join(wd, name) |
| 268 else: | 292 else: |
| 269 target = name | 293 target = name |
| 270 if name.endswith('/'): | 294 if name.endswith('/'): |
| 271 # Make directory | 295 # Make directory |
| 272 print("Creating dir %s" % target) | 296 print(f"Creating dir {target}") |
| 273 try: | 297 try: |
| 274 os.makedirs(target) | 298 os.makedirs(target) |
| 275 except OSError: | 299 except OSError: |
| 276 pass | 300 pass |
| 277 else: | 301 else: |
| 278 # Extract file | 302 # Extract file |
| 279 print("Extracting %s" % name) | 303 print("Extracting {target}") |
| 280 try: | 304 try: |
| 281 os.makedirs(os.path.dirname(target)) | 305 os.makedirs(os.path.dirname(target)) |
| 282 except OSError: | 306 except OSError: |
| 283 pass | 307 pass |
| 284 open(target, 'wb').write(z.read(name)) | 308 with open(target, 'wb') as fh: |
| 285 file_list.append(target) | 309 fh.write(z.read(name)) |
| 286 print("Removing %s" % filen) | 310 file_list.append(target) |
| 311 print(f"Removing {filen}") | |
| 287 os.remove(filen) | 312 os.remove(filen) |
| 288 return file_list | 313 return file_list |
| 289 | 314 |
| 290 | 315 |
| 291 def unpack_tar_archive(filen, wd=None): | 316 def unpack_tar_archive(filen, wd=None): |
| 304 file is deleted from the file system. | 329 file is deleted from the file system. |
| 305 | 330 |
| 306 """ | 331 """ |
| 307 file_list = [] | 332 file_list = [] |
| 308 if not tarfile.is_tarfile(filen): | 333 if not tarfile.is_tarfile(filen): |
| 309 print("%s: not TAR file") | 334 print(f"{filen}: not TAR file") |
| 310 return [filen] | 335 return [filen] |
| 311 t = tarfile.open(filen) | 336 with tarfile.open(filen) as t: |
| 312 for name in t.getnames(): | 337 for name in t.getnames(): |
| 313 # Check for unwanted files | 338 # Check for unwanted files |
| 314 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): | 339 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): |
| 315 print("Ignoring %s" % name) | 340 print(f"Ignoring {name}") |
| 316 continue | 341 continue |
| 317 # Extract file | 342 # Extract file |
| 318 print("Extracting %s" % name) | 343 print(f"Extracting {name}") |
| 319 t.extract(name, wd) | 344 t.extract(name, wd) |
| 320 if wd: | 345 if wd: |
| 321 target = os.path.join(wd, name) | 346 target = os.path.join(wd, name) |
| 322 else: | 347 else: |
| 323 target = name | 348 target = name |
| 324 file_list.append(target) | 349 file_list.append(target) |
| 325 print("Removing %s" % filen) | 350 print(f"Removing {filen}") |
| 326 os.remove(filen) | 351 os.remove(filen) |
| 327 return file_list | 352 return file_list |
| 328 | 353 |
| 329 | 354 |
| 330 def unpack_archive(filen, wd=None): | 355 def unpack_archive(filen, wd=None): |
| 338 'wd' specifies the working directory to extract | 363 'wd' specifies the working directory to extract |
| 339 the files to, otherwise they are extracted to the | 364 the files to, otherwise they are extracted to the |
| 340 current working directory. | 365 current working directory. |
| 341 | 366 |
| 342 """ | 367 """ |
| 343 print("Unpack %s" % filen) | 368 print(f"Unpack {filen}") |
| 344 ext = os.path.splitext(filen)[1] | 369 ext = os.path.splitext(filen)[1] |
| 345 print("Extension: %s" % ext) | 370 print(f"Extension: {ext}") |
| 346 if ext == ".zip": | 371 if ext == ".zip": |
| 347 return unpack_zip_archive(filen, wd=wd) | 372 return unpack_zip_archive(filen, wd=wd) |
| 348 elif ext == ".tgz": | 373 elif ext == ".tgz": |
| 349 return unpack_tar_archive(filen, wd=wd) | 374 return unpack_tar_archive(filen, wd=wd) |
| 350 else: | 375 else: |
| 381 """ | 406 """ |
| 382 ext = os.path.splitext(filen)[1] | 407 ext = os.path.splitext(filen)[1] |
| 383 try: | 408 try: |
| 384 return MOTHUR_FILE_TYPES[ext] | 409 return MOTHUR_FILE_TYPES[ext] |
| 385 except KeyError: | 410 except KeyError: |
| 386 print("WARNING: unknown file type for " + filen + ", skipping") | 411 print(f"WARNING: unknown file type for {filen}, skipping") |
| 387 return None | 412 return None |
| 388 | 413 |
| 389 | 414 |
| 390 def get_name(filen): | 415 def get_name(filen): |
| 391 """Generate a descriptive name based on the file name | 416 """Generate a descriptive name based on the file name |
| 414 datasets: a list of dataset names corresponding to keys in | 439 datasets: a list of dataset names corresponding to keys in |
| 415 the MOTHUR_REFERENCE_DATA dictionary | 440 the MOTHUR_REFERENCE_DATA dictionary |
| 416 """ | 441 """ |
| 417 # Make working dir | 442 # Make working dir |
| 418 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) | 443 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) |
| 419 print("Working dir %s" % wd) | 444 print(f"Working dir {wd}") |
| 420 # Iterate over all requested reference data URLs | 445 # Iterate over all requested reference data URLs |
| 421 for dataset in datasets: | 446 for dataset in datasets: |
| 422 print("Handling dataset '%s'" % dataset) | 447 print(f"Handling dataset '{dataset}'") |
| 423 for name in MOTHUR_REFERENCE_DATA[dataset]: | 448 for name in MOTHUR_REFERENCE_DATA[dataset]: |
| 424 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): | 449 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): |
| 425 type_ = identify_type(f) | 450 type_ = identify_type(f) |
| 426 entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name) | 451 name_from_file = os.path.splitext(os.path.basename(f))[0] |
| 427 print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))) | 452 entry_name = f"{name_from_file} ({name})" |
| | 453 print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}") |
| 428 if type_ is not None: | 454 if type_ is not None: |
| 429 # Move to target dir | 455 # Move to target dir |
| 430 ref_data_file = os.path.basename(f) | 456 ref_data_file = os.path.basename(f) |
| 431 f1 = os.path.join(target_dir, ref_data_file) | 457 f1 = os.path.join(target_dir, ref_data_file) |
| 432 print("Moving %s to %s" % (f, f1)) | 458 print(f"Moving {f} to {f1}") |
| 433 os.rename(f, f1) | 459 shutil.move(f, f1) |
| 434 # Add entry to data table | 460 # Add entry to data table |
| 435 table_name = "mothur_%s" % type_ | 461 table_name = f"mothur_{type_}" |
| 436 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) | 462 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) |
| 437 # Remove working dir | 463 # Remove working dir |
| 438 print("Removing %s" % wd) | 464 print(f"Removing {wd}") |
| 439 shutil.rmtree(wd) | 465 shutil.rmtree(wd) |
| 440 | 466 |
| 441 | 467 |
| 442 def files_from_filesystem_paths(paths): | 468 def files_from_filesystem_paths(paths): |
| 443 """Return list of file paths from arbitrary input paths | 469 """Return list of file paths from arbitrary input paths |
| 449 """ | 475 """ |
| 450 # Collect files to add | 476 # Collect files to add |
| 451 files = [] | 477 files = [] |
| 452 for path in paths: | 478 for path in paths: |
| 453 path = os.path.abspath(path) | 479 path = os.path.abspath(path) |
| 454 print("Examining '%s'..." % path) | 480 print(f"Examining '{path}'...") |
| 455 if os.path.isfile(path): | 481 if os.path.isfile(path): |
| 456 # Store full path for file | 482 # Store full path for file |
| 457 files.append(path) | 483 files.append(path) |
| 458 elif os.path.isdir(path): | 484 elif os.path.isdir(path): |
| 459 # Descend into directory and collect the files | 485 # Descend into directory and collect the files |
| 488 files = files_from_filesystem_paths(paths) | 514 files = files_from_filesystem_paths(paths) |
| 489 # Handle each file individually | 515 # Handle each file individually |
| 490 for f in files: | 516 for f in files: |
| 491 type_ = identify_type(f) | 517 type_ = identify_type(f) |
| 492 if type_ is None: | 518 if type_ is None: |
| 493 print("%s: unrecognised type, skipped" % f) | 519 print(f"{f}: unrecognised type, skipped") |
| 494 continue | 520 continue |
| 495 ref_data_file = os.path.basename(f) | 521 ref_data_file = os.path.basename(f) |
| 496 target_file = os.path.join(target_dir, ref_data_file) | 522 target_file = os.path.join(target_dir, ref_data_file) |
| 497 entry_name = "%s" % os.path.splitext(ref_data_file)[0] | 523 entry_name = "%s" % os.path.splitext(ref_data_file)[0] |
| 498 if description: | 524 if description: |
| 499 entry_name += " (%s)" % description | 525 entry_name += " (%s)" % description |
| 500 print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)) | 526 print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}") |
| 501 # Link to or copy the data | 527 # Link to or copy the data |
| 502 if link_to_data: | 528 if link_to_data: |
| 503 os.symlink(f, target_file) | 529 os.symlink(f, target_file) |
| 504 else: | 530 else: |
| 505 shutil.copyfile(f, target_file) | 531 shutil.copyfile(f, target_file) |
| 506 # Add entry to data table | 532 # Add entry to data table |
| 507 table_name = "mothur_%s" % type_ | 533 table_name = f"mothur_{type_}" |
| 508 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) | 534 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) |
| 509 | 535 |
| 510 | 536 |
| 511 if __name__ == "__main__": | 537 if __name__ == "__main__": |
| 512 print("Starting...") | 538 print("Starting...") |
| 517 parser.add_option('--datasets', action='store', dest='datasets', default='') | 543 parser.add_option('--datasets', action='store', dest='datasets', default='') |
| 518 parser.add_option('--paths', action='store', dest='paths', default=[]) | 544 parser.add_option('--paths', action='store', dest='paths', default=[]) |
| 519 parser.add_option('--description', action='store', dest='description', default='') | 545 parser.add_option('--description', action='store', dest='description', default='') |
| 520 parser.add_option('--link', action='store_true', dest='link_to_data') | 546 parser.add_option('--link', action='store_true', dest='link_to_data') |
| 521 options, args = parser.parse_args() | 547 options, args = parser.parse_args() |
| 522 print("options: %s" % options) | 548 print(f"options: {options}") |
| 523 print("args : %s" % args) | 549 print(f"args : {args}") |
| 524 | 550 |
| 525 # Check for JSON file | 551 # Check for JSON file |
| 526 if len(args) != 1: | 552 if len(args) != 1: |
| 527 sys.stderr.write("Need to supply JSON file name") | 553 sys.stderr.write("Need to supply JSON file name") |
| 528 sys.exit(1) | 554 sys.exit(1) |
| 531 | 557 |
| 532 # Read the input JSON | 558 # Read the input JSON |
| 533 params, target_dir = read_input_json(jsonfile) | 559 params, target_dir = read_input_json(jsonfile) |
| 534 | 560 |
| 535 # Make the target directory | 561 # Make the target directory |
| 536 print("Making %s" % target_dir) | 562 print(f"Making {target_dir}") |
| 537 os.mkdir(target_dir) | 563 os.mkdir(target_dir) |
| 538 | 564 |
| 539 # Set up data tables dictionary | 565 # Set up data tables dictionary |
| 540 data_tables = create_data_tables_dict() | 566 data_tables = create_data_tables_dict() |
| 541 add_data_table(data_tables, 'mothur_lookup') | 567 add_data_table(data_tables, 'mothur_lookup') |
| 554 # that might have been inserted by Galaxy) | 580 # that might have been inserted by Galaxy) |
| 555 paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split() | 581 paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split() |
| 556 import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data) | 582 import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data) |
| 557 # Write output JSON | 583 # Write output JSON |
| 558 print("Outputting JSON") | 584 print("Outputting JSON") |
| 559 print(json.dumps(data_tables)) | 585 with open(jsonfile, 'w') as fh: |
| 560 open(jsonfile, 'w').write(json.dumps(data_tables, sort_keys=True)) | 586 json.dump(data_tables, fh, sort_keys=True) |
| 561 print("Done.") | 587 print("Done.") |
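
For readers following the diff above, the sketch below isolates the buffered download pattern that the Python 3 rewrite of fetch_file() adopts: the response from urllib.request.urlopen() is streamed to disk in io.DEFAULT_BUFFER_SIZE chunks rather than read into memory in a single urllib2-style .read() call. This is a minimal illustration only; the helper name download_file and the example URL are hypothetical, and the real script plugs this logic into the Galaxy data manager flow shown in the diff.

```python
#!/usr/bin/env python3
# Minimal sketch (not the repository code verbatim): stream a URL to a local
# file in fixed-size chunks, as the updated fetch_file() does, instead of
# loading the whole body with urllib2.urlopen(url).read().
import io
import os
import urllib.request


def download_file(url, target=None, wd=None):
    """Download 'url' to 'target', optionally inside working directory 'wd'."""
    if not target:
        target = os.path.basename(url)
    if wd:
        target = os.path.join(wd, target)
    with urllib.request.urlopen(url) as response, open(target, "wb") as fh:
        while True:
            chunk = response.read(io.DEFAULT_BUFFER_SIZE)
            if not chunk:  # empty bytes object signals end of stream
                break
            fh.write(chunk)
    return target


if __name__ == "__main__":
    # Hypothetical usage; substitute one of the mothur reference URLs above.
    # download_file("https://example.org/reference_data.tgz", wd="/tmp")
    pass
```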
