comparison data_manager/fetch_mothur_reference_data.py @ 3:2004bb845685 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_mothur_toolsuite/ commit f845716f6ac93500f143a30abef97eaba406344e"
author iuc
date Fri, 25 Jun 2021 09:36:36 +0000
parents 2ffd2cdc5089
children
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 # 2 #
3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools 3 # Data manager for reference data for the 'mothur_toolsuite' Galaxy tools
4 import io
4 import json 5 import json
5 import optparse 6 import optparse
6 import os 7 import os
7 import shutil 8 import shutil
8 import sys 9 import sys
9 import tarfile 10 import tarfile
10 import tempfile 11 import tempfile
11 import urllib2 12 import urllib.error
13 import urllib.parse
14 import urllib.request
12 import zipfile 15 import zipfile
13 from functools import reduce 16 from functools import reduce
14 17
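The import block above is the core of the Python 2 to 3 port: urllib2 is split across urllib.request, urllib.error and urllib.parse in Python 3. A minimal sketch of the mapping this script relies on (illustration only, not part of the commit):

    import urllib.error
    import urllib.request

    try:
        # urllib2.urlopen(url) becomes urllib.request.urlopen(url)
        with urllib.request.urlopen("https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip") as response:
            data = response.read()
    except urllib.error.URLError as exc:
        # urllib2.URLError becomes urllib.error.URLError
        print(f"download failed: {exc}")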
15 # When extracting files from archives, skip names that 18 # When extracting files from archives, skip names that
16 # start with the following strings 19 # start with the following strings
36 "lookup_gs20": { 39 "lookup_gs20": {
37 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ] 40 "GS20": ["https://mothur.s3.us-east-2.amazonaws.com/wiki/lookup_gs20.zip", ]
38 }, 41 },
39 # RDP reference files 42 # RDP reference files
40 # http://www.mothur.org/wiki/RDP_reference_files 43 # http://www.mothur.org/wiki/RDP_reference_files
44 "RDP_v18": {
45 "16S rRNA RDP training set 18":
46 [
47 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.rdp.tgz", ],
48 "16S rRNA PDS training set 18":
49 [
50 "https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset18_062020.pds.tgz", ],
51 },
41 "RDP_v16": { 52 "RDP_v16": {
42 "16S rRNA RDP training set 16": 53 "16S rRNA RDP training set 16":
43 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ], 54 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.rdp.tgz", ],
44 "16S rRNA PDS training set 16": 55 "16S rRNA PDS training set 16":
45 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ], 56 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/trainset16_022016.pds.tgz", ],
74 "RDP training set 6": 85 "RDP training set 6":
75 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ], 86 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/rdptrainingset.zip", ],
76 }, 87 },
77 # Silva reference files 88 # Silva reference files
78 # http://www.mothur.org/wiki/Silva_reference_files 89 # http://www.mothur.org/wiki/Silva_reference_files
90 "silva_release_138.1": {
91 "SILVA release 138.1":
92 [
93 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v138_1.tgz",
94 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v138_1.tgz", ],
95 },
79 "silva_release_128": { 96 "silva_release_128": {
80 "SILVA release 128": 97 "SILVA release 128":
81 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz", 98 ["https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.nr_v128.tgz",
82 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ], 99 "https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.seed_v128.tgz", ],
83 }, 100 },
158 NB the directory pointed to by 'extra_files_path' 175 NB the directory pointed to by 'extra_files_path'
159 doesn't exist initially, it is the job of the script 176 doesn't exist initially, it is the job of the script
160 to create it if necessary. 177 to create it if necessary.
161 178
162 """ 179 """
163 params = json.loads(open(jsonfile).read()) 180 with open(jsonfile) as fh:
181 params = json.load(fh)
164 return (params['param_dict'], 182 return (params['param_dict'],
165 params['output_data'][0]['extra_files_path']) 183 params['output_data'][0]['extra_files_path'])
166 184
167 185
168 # Utility functions for creating data table dictionaries 186 # Utility functions for creating data table dictionaries
170 # Example usage: 188 # Example usage:
171 # >>> d = create_data_tables_dict() 189 # >>> d = create_data_tables_dict()
172 # >>> add_data_table(d,'my_data') 190 # >>> add_data_table(d,'my_data')
173 # >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human')) 191 # >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
174 # >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse')) 192 # >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
175 # >>> print str(json.dumps(d)) 193 # >>> print(json.dumps(d))
176 def create_data_tables_dict(): 194 def create_data_tables_dict():
177 """Return a dictionary for storing data table information 195 """Return a dictionary for storing data table information
178 196
179 Returns a dictionary that can be used with 'add_data_table' 197 Returns a dictionary that can be used with 'add_data_table'
180 and 'add_data_table_entry' to store information about a 198 and 'add_data_table_entry' to store information about a
227 system. 245 system.
228 246
229 Returns the name that the file is saved with. 247 Returns the name that the file is saved with.
230 248
231 """ 249 """
232 print("Downloading %s" % url) 250 print(f"Downloading {url}")
233 if not target: 251 if not target:
234 target = os.path.basename(url) 252 target = os.path.basename(url)
235 if wd: 253 if wd:
236 target = os.path.join(wd, target) 254 target = os.path.join(wd, target)
237 print("Saving to %s" % target) 255 print(f"Saving to {target}")
238 open(target, 'wb').write(urllib2.urlopen(url).read()) 256 with open(target, 'wb') as fh:
257 url_h = urllib.request.urlopen(url)
258 while True:
259 buffer = url_h.read(io.DEFAULT_BUFFER_SIZE)
260 if buffer == b"":
261 break
262 fh.write(buffer)
239 return target 263 return target
240 264
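The rewritten body of download_file streams the response in io.DEFAULT_BUFFER_SIZE chunks rather than reading the whole body into memory at once, which matters for the larger SILVA archives. A shorter equivalent of the same buffered copy, sketched here for comparison (the commit itself spells the loop out):

    import shutil
    import urllib.request

    def download_file_buffered(url, target):
        # shutil.copyfileobj runs the same read/write loop internally
        with urllib.request.urlopen(url) as response, open(target, 'wb') as fh:
            shutil.copyfileobj(response, fh)
        return target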
241 265
242 def unpack_zip_archive(filen, wd=None): 266 def unpack_zip_archive(filen, wd=None):
243 """Extract files from a ZIP archive 267 """Extract files from a ZIP archive
253 Once all the files are extracted the ZIP archive 277 Once all the files are extracted the ZIP archive
254 file is deleted from the file system. 278 file is deleted from the file system.
255 279
256 """ 280 """
257 if not zipfile.is_zipfile(filen): 281 if not zipfile.is_zipfile(filen):
258 print("%s: not ZIP formatted file") 282 print(f"{filen}: not ZIP formatted file")
259 return [filen] 283 return [filen]
260 file_list = [] 284 file_list = []
261 z = zipfile.ZipFile(filen) 285 with zipfile.ZipFile(filen) as z:
262 for name in z.namelist(): 286 for name in z.namelist():
263 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): 287 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
264 print("Ignoring %s" % name) 288 print(f"Ignoring {name}")
265 continue 289 continue
266 if wd: 290 if wd:
267 target = os.path.join(wd, name) 291 target = os.path.join(wd, name)
268 else: 292 else:
269 target = name 293 target = name
270 if name.endswith('/'): 294 if name.endswith('/'):
271 # Make directory 295 # Make directory
272 print("Creating dir %s" % target) 296 print(f"Creating dir {target}")
273 try: 297 try:
274 os.makedirs(target) 298 os.makedirs(target)
275 except OSError: 299 except OSError:
276 pass 300 pass
277 else: 301 else:
278 # Extract file 302 # Extract file
279 print("Extracting %s" % name) 303 print("Extracting {target}")
280 try: 304 try:
281 os.makedirs(os.path.dirname(target)) 305 os.makedirs(os.path.dirname(target))
282 except OSError: 306 except OSError:
283 pass 307 pass
284 open(target, 'wb').write(z.read(name)) 308 with open(target, 'wb') as fh:
285 file_list.append(target) 309 fh.write(z.read(name))
286 print("Removing %s" % filen) 310 file_list.append(target)
311 print(f"Removing {filen}")
287 os.remove(filen) 312 os.remove(filen)
288 return file_list 313 return file_list
289 314
290 315
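The reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False) expression used here and in unpack_tar_archive below answers one question: does the member name start with any prefix in IGNORE_PATHS (the list itself sits outside the hunks shown)? An any()-based spelling gives the identical result and reads more directly:

    # equivalent to: reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False)
    def is_ignored(name, ignore_paths):
        return any(name.startswith(prefix) for prefix in ignore_paths)

    # or, inline: name.startswith(tuple(IGNORE_PATHS))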
291 def unpack_tar_archive(filen, wd=None): 316 def unpack_tar_archive(filen, wd=None):
304 file is deleted from the file system. 329 file is deleted from the file system.
305 330
306 """ 331 """
307 file_list = [] 332 file_list = []
308 if not tarfile.is_tarfile(filen): 333 if not tarfile.is_tarfile(filen):
309 print("%s: not TAR file") 334 print(f"{filen}: not TAR file")
310 return [filen] 335 return [filen]
311 t = tarfile.open(filen) 336 with tarfile.open(filen) as t:
312 for name in t.getnames(): 337 for name in t.getnames():
313 # Check for unwanted files 338 # Check for unwanted files
314 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False): 339 if reduce(lambda x, y: x or name.startswith(y), IGNORE_PATHS, False):
315 print("Ignoring %s" % name) 340 print(f"Ignoring {name}")
316 continue 341 continue
317 # Extract file 342 # Extract file
318 print("Extracting %s" % name) 343 print(f"Extracting {name}")
319 t.extract(name, wd) 344 t.extract(name, wd)
320 if wd: 345 if wd:
321 target = os.path.join(wd, name) 346 target = os.path.join(wd, name)
322 else: 347 else:
323 target = name 348 target = name
324 file_list.append(target) 349 file_list.append(target)
325 print("Removing %s" % filen) 350 print(f"Removing {filen}")
326 os.remove(filen) 351 os.remove(filen)
327 return file_list 352 return file_list
328 353
329 354
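Note that tarfile.open() auto-detects compression from the stream, which is why the .tgz reference archives need no explicit mode. On Python 3.12+ the extraction could also use the filter argument added there to reject hostile member paths; a hedged sketch of that variant (not what this commit does):

    import tarfile

    def unpack_tar_archive_safe(filen, wd=None):
        with tarfile.open(filen) as t:  # compression auto-detected
            for name in t.getnames():
                # filter="data" (Python 3.12+) rejects absolute paths,
                # '..' components and special files during extraction
                t.extract(name, path=wd or ".", filter="data")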
330 def unpack_archive(filen, wd=None): 355 def unpack_archive(filen, wd=None):
338 'wd' specifies the working directory to extract 363 'wd' specifies the working directory to extract
339 the files to, otherwise they are extracted to the 364 the files to, otherwise they are extracted to the
340 current working directory. 365 current working directory.
341 366
342 """ 367 """
343 print("Unpack %s" % filen) 368 print(f"Unpack {filen}")
344 ext = os.path.splitext(filen)[1] 369 ext = os.path.splitext(filen)[1]
345 print("Extension: %s" % ext) 370 print(f"Extension: {ext}")
346 if ext == ".zip": 371 if ext == ".zip":
347 return unpack_zip_archive(filen, wd=wd) 372 return unpack_zip_archive(filen, wd=wd)
348 elif ext == ".tgz": 373 elif ext == ".tgz":
349 return unpack_tar_archive(filen, wd=wd) 374 return unpack_tar_archive(filen, wd=wd)
350 else: 375 else:
381 """ 406 """
382 ext = os.path.splitext(filen)[1] 407 ext = os.path.splitext(filen)[1]
383 try: 408 try:
384 return MOTHUR_FILE_TYPES[ext] 409 return MOTHUR_FILE_TYPES[ext]
385 except KeyError: 410 except KeyError:
386 print("WARNING: unknown file type for " + filen + ", skipping") 411 print(f"WARNING: unknown file type for {filen}, skipping")
387 return None 412 return None
388 413
389 414
390 def get_name(filen): 415 def get_name(filen):
391 """Generate a descriptive name based on the file name 416 """Generate a descriptive name based on the file name
414 datasets: a list of dataset names corresponding to keys in 439 datasets: a list of dataset names corresponding to keys in
415 the MOTHUR_REFERENCE_DATA dictionary 440 the MOTHUR_REFERENCE_DATA dictionary
416 """ 441 """
417 # Make working dir 442 # Make working dir
418 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd()) 443 wd = tempfile.mkdtemp(suffix=".mothur", dir=os.getcwd())
419 print("Working dir %s" % wd) 444 print(f"Working dir {wd}")
420 # Iterate over all requested reference data URLs 445 # Iterate over all requested reference data URLs
421 for dataset in datasets: 446 for dataset in datasets:
422 print("Handling dataset '%s'" % dataset) 447 print(f"Handling dataset '{dataset}'")
423 for name in MOTHUR_REFERENCE_DATA[dataset]: 448 for name in MOTHUR_REFERENCE_DATA[dataset]:
424 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd): 449 for f in fetch_files(MOTHUR_REFERENCE_DATA[dataset][name], wd=wd):
425 type_ = identify_type(f) 450 type_ = identify_type(f)
426 entry_name = "%s (%s)" % (os.path.splitext(os.path.basename(f))[0], name) 451 name_from_file = os.path.splitext(os.path.basename(f))[0]
427 print("%s\t\'%s'\t.../%s" % (type_, entry_name, os.path.basename(f))) 452 entry_name = f"{name_from_file} ({name})"
453 print(f"{type_}\t\'{entry_name}'\t.../{os.path.basename(f)}")
428 if type_ is not None: 454 if type_ is not None:
429 # Move to target dir 455 # Move to target dir
430 ref_data_file = os.path.basename(f) 456 ref_data_file = os.path.basename(f)
431 f1 = os.path.join(target_dir, ref_data_file) 457 f1 = os.path.join(target_dir, ref_data_file)
432 print("Moving %s to %s" % (f, f1)) 458 print(f"Moving {f} to {f1}")
433 os.rename(f, f1) 459 shutil.move(f, f1)
434 # Add entry to data table 460 # Add entry to data table
435 table_name = "mothur_%s" % type_ 461 table_name = f"mothur_{type_}"
436 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) 462 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
437 # Remove working dir 463 # Remove working dir
438 print("Removing %s" % wd) 464 print(f"Removing {wd}")
439 shutil.rmtree(wd) 465 shutil.rmtree(wd)
440 466
441 467
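The switch from os.rename to shutil.move when placing files into target_dir is more than cosmetic: os.rename raises OSError when source and destination live on different filesystems, which is common when a tempdir and a Galaxy data directory sit on separate mounts, while shutil.move falls back to a copy followed by a delete. A minimal illustration of that fallback:

    import os
    import shutil

    def move_file(src, dst):
        try:
            os.rename(src, dst)    # fails with EXDEV across filesystems
        except OSError:
            shutil.move(src, dst)  # copies the data, then removes src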
442 def files_from_filesystem_paths(paths): 468 def files_from_filesystem_paths(paths):
443 """Return list of file paths from arbitrary input paths 469 """Return list of file paths from arbitrary input paths
449 """ 475 """
450 # Collect files to add 476 # Collect files to add
451 files = [] 477 files = []
452 for path in paths: 478 for path in paths:
453 path = os.path.abspath(path) 479 path = os.path.abspath(path)
454 print("Examining '%s'..." % path) 480 print(f"Examining '{path}'...")
455 if os.path.isfile(path): 481 if os.path.isfile(path):
456 # Store full path for file 482 # Store full path for file
457 files.append(path) 483 files.append(path)
458 elif os.path.isdir(path): 484 elif os.path.isdir(path):
459 # Descend into directory and collect the files 485 # Descend into directory and collect the files
488 files = files_from_filesystem_paths(paths) 514 files = files_from_filesystem_paths(paths)
489 # Handle each file individually 515 # Handle each file individually
490 for f in files: 516 for f in files:
491 type_ = identify_type(f) 517 type_ = identify_type(f)
492 if type_ is None: 518 if type_ is None:
493 print("%s: unrecognised type, skipped" % f) 519 print(f"{f}: unrecognised type, skipped")
494 continue 520 continue
495 ref_data_file = os.path.basename(f) 521 ref_data_file = os.path.basename(f)
496 target_file = os.path.join(target_dir, ref_data_file) 522 target_file = os.path.join(target_dir, ref_data_file)
497 entry_name = "%s" % os.path.splitext(ref_data_file)[0] 523 entry_name = "%s" % os.path.splitext(ref_data_file)[0]
498 if description: 524 if description:
499 entry_name += " (%s)" % description 525 entry_name += " (%s)" % description
500 print("%s\t\'%s'\t.../%s" % (type_, entry_name, ref_data_file)) 526 print(f"{type_}\t\'{entry_name}'\t.../{ref_data_file}")
501 # Link to or copy the data 527 # Link to or copy the data
502 if link_to_data: 528 if link_to_data:
503 os.symlink(f, target_file) 529 os.symlink(f, target_file)
504 else: 530 else:
505 shutil.copyfile(f, target_file) 531 shutil.copyfile(f, target_file)
506 # Add entry to data table 532 # Add entry to data table
507 table_name = "mothur_%s" % type_ 533 table_name = f"mothur_{type_}"
508 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file)) 534 add_data_table_entry(data_tables, table_name, dict(name=entry_name, value=ref_data_file))
509 535
510 536
511 if __name__ == "__main__": 537 if __name__ == "__main__":
512 print("Starting...") 538 print("Starting...")
517 parser.add_option('--datasets', action='store', dest='datasets', default='') 543 parser.add_option('--datasets', action='store', dest='datasets', default='')
518 parser.add_option('--paths', action='store', dest='paths', default=[]) 544 parser.add_option('--paths', action='store', dest='paths', default=[])
519 parser.add_option('--description', action='store', dest='description', default='') 545 parser.add_option('--description', action='store', dest='description', default='')
520 parser.add_option('--link', action='store_true', dest='link_to_data') 546 parser.add_option('--link', action='store_true', dest='link_to_data')
521 options, args = parser.parse_args() 547 options, args = parser.parse_args()
522 print("options: %s" % options) 548 print(f"options: {options}")
523 print("args : %s" % args) 549 print(f"args : {args}")
524 550
525 # Check for JSON file 551 # Check for JSON file
526 if len(args) != 1: 552 if len(args) != 1:
527 sys.stderr.write("Need to supply JSON file name") 553 sys.stderr.write("Need to supply JSON file name")
528 sys.exit(1) 554 sys.exit(1)
531 557
532 # Read the input JSON 558 # Read the input JSON
533 params, target_dir = read_input_json(jsonfile) 559 params, target_dir = read_input_json(jsonfile)
534 560
535 # Make the target directory 561 # Make the target directory
536 print("Making %s" % target_dir) 562 print(f"Making {target_dir}")
537 os.mkdir(target_dir) 563 os.mkdir(target_dir)
538 564
539 # Set up data tables dictionary 565 # Set up data tables dictionary
540 data_tables = create_data_tables_dict() 566 data_tables = create_data_tables_dict()
541 add_data_table(data_tables, 'mothur_lookup') 567 add_data_table(data_tables, 'mothur_lookup')
554 # that might have been inserted by Galaxy) 580 # that might have been inserted by Galaxy)
555 paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split() 581 paths = options.paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
556 import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data) 582 import_from_server(data_tables, target_dir, paths, description, link_to_data=options.link_to_data)
557 # Write output JSON 583 # Write output JSON
558 print("Outputting JSON") 584 print("Outputting JSON")
559 print(json.dumps(data_tables)) 585 with open(jsonfile, 'w') as fh:
560 open(jsonfile, 'w').write(json.dumps(data_tables, sort_keys=True)) 586 json.dump(data_tables, fh, sort_keys=True)
561 print("Done.") 587 print("Done.")
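For orientation, the script's contract with Galaxy is a JSON round trip: it reads 'param_dict' and the first output's 'extra_files_path' from the job's JSON file (read_input_json above), then overwrites that same file with the collected data table entries. A sketch of the shapes involved, with hypothetical values and assuming the conventional data-manager layout (the dict built by create_data_tables_dict sits outside the hunks shown):

    import json

    # Input, as consumed by read_input_json():
    example_input = {
        "param_dict": {"datasets": "RDP_v18"},  # hypothetical parameters
        "output_data": [{"extra_files_path": "/tmp/mothur_refs"}],
    }
    target_dir = example_input["output_data"][0]["extra_files_path"]

    # Output, as written back at the end of main (assumed shape):
    data_tables = {"data_tables": {"mothur_lookup": [
        {"name": "example entry", "value": "example_file.pat"},
    ]}}
    print(json.dumps(data_tables, sort_keys=True))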