comparison: data_manager/data_manager.py @ 9:facf9e6c872c (draft, default, tip)
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2 commit df2dfeb75f88b326f567cab8df4e6c4a7f2e548c
| author | matthias |
|---|---|
| date | Tue, 15 Oct 2019 07:20:59 -0400 |
| parents | da93e6a3fe23 |
| children | |
| 8:da93e6a3fe23 (parent) | 9:facf9e6c872c (this revision) |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import json | 2 import json |
| 3 import os | 3 import os |
| 4 import shutil | |
| 5 import sys | |
| 6 import zipfile | |
| 7 try: | 4 try: |
| 8 # For Python 3.0 and later | 5 # For Python 3.0 and later |
| 9 from urllib.request import Request, urlopen | 6 from urllib.request import Request, urlopen |
| 10 except ImportError: | 7 except ImportError: |
| 11 # Fall back to Python 2 imports | 8 # Fall back to Python 2 imports |
| 12 from urllib2 import Request, urlopen | 9 from urllib2 import Request, urlopen |
| 13 | 10 |
| 14 DEFAULT_TAXLEVELS="Kingdom,Phylum,Class,Order,Family,Genus,Species" | 11 DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species" |
| 15 | 12 |
| 16 FILE2NAME = { | 13 FILE2NAME = { |
| 17 "silva_132":"Silva version 132", | 14 "silva_132": "Silva version 132", |
| 18 "silva_128":"Silva version 128", | 15 "silva_128": "Silva version 128", |
| 19 "rdp_16":"RDP trainset 16", | 16 "rdp_16": "RDP trainset 16", |
| 20 "rdp_14":"RDP trainset 14", | 17 "rdp_14": "RDP trainset 14", |
| 21 "greengenes_13.84":"GreenGenes version 13.84", | 18 "greengenes_13.84": "GreenGenes version 13.84", |
| 22 "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", | 19 "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", |
| 23 "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", | 20 "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", |
| 24 "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", | 21 "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", |
| 25 "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", | 22 "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", |
| 26 "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", | 23 "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", |
| 27 "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", | 24 "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", |
| 28 "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" | 25 "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" |
| 29 } | 26 } |
| 30 | 27 |
| 31 FILE2TAXURL = { | 28 FILE2TAXURL = { |
| 32 "silva_132":"https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", | 29 "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", |
| 33 "silva_128":"https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", | 30 "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", |
| 34 "rdp_16":"https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", | 31 "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", |
| 35 "rdp_14":"https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", | 32 "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", |
| 36 "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", | 33 "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", |
| 37 "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", | 34 "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", |
| 38 "greengenes_13.84":"https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", | 35 "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", |
| 39 "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", | 36 "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", |
| 40 "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", | 37 "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", |
| 41 "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", | 38 "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", |
| 42 "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", | 39 "silva_euk_18S_132": "https://zenodo.org/record/1447330/files/silva_132.18s.99_rep_set.dada2.fa.gz?download=1", |
| 43 "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" | 40 "PR2_4.11.1": "https://github.com/pr2database/pr2database/releases/download/4.11.1/pr2_version_4.11.1_dada2.fasta.gz" |
| 44 } | 41 } |
| 45 | 42 |
| 46 FILE2SPECIESURL = { | 43 FILE2SPECIESURL = { |
| 47 "silva_132":"https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", | 44 "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", |
| 48 "silva_128":"https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", | 45 "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", |
| 49 "rdp_16":"https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", | 46 "rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", |
| 50 "rdp_14":"https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" | 47 "rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" |
| 51 } | 48 } |
| 52 | 49 |
| 53 FILE2TAXLEVELS = { | 50 FILE2TAXLEVELS = { |
| 54 "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" | 51 "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" |
| 55 } | 52 } |
| | 53 |
| 56 | 54 |
| 57 def url_download(url, fname, workdir): | 55 def url_download(url, fname, workdir): |
| 58 """ | 56 """ |
| 59 download url to workdir/fname | 57 download url to workdir/fname |
| 60 """ | 58 """ |
| 75 break | 73 break |
| 76 finally: | 74 finally: |
| 77 if src: | 75 if src: |
| 78 src.close() | 76 src.close() |
| 79 | 77 |
| 80 #special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) | 78 # special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) |
| 81 if fname.startswith("unite"): | 79 if fname.startswith("unite"): |
| 82 import glob | 80 import glob |
| 83 import gzip | 81 import gzip |
| 84 import shutil | 82 import shutil |
| 85 import zipfile | 83 import zipfile |
| 86 # unzip download | 84 # unzip download |
| 87 zip_ref = zipfile.ZipFile(file_path, 'r') | 85 zip_ref = zipfile.ZipFile(file_path, 'r') |
| 88 zip_ref.extractall(workdir) | 86 zip_ref.extractall(workdir) |
| 89 zip_ref.close() | 87 zip_ref.close() |
| 90 # gzip top level fasta file | 88 # gzip top level fasta file |
| 91 fastas = glob.glob("%s/*fasta"%workdir) | 89 fastas = glob.glob("%s/*fasta" % workdir) |
| 92 if len(fastas) != 1: | 90 if len(fastas) != 1: |
| 93 msg = "UNITE download %s contained %d fasta file(s): %s"%(url, len(fastas), " ".join(fastas)) | 91 msg = "UNITE download %s contained %d fasta file(s): %s" % (url, len(fastas), " ".join(fastas)) |
| 94 raise Exception(msg) | 92 raise Exception(msg) |
| 95 with open(fastas[0], 'rb') as f_in: | 93 with open(fastas[0], 'rb') as f_in: |
| 96 with gzip.open(file_path, 'wb') as f_out: | 94 with gzip.open(file_path, 'wb') as f_out: |
| 97 shutil.copyfileobj(f_in, f_out) | 95 shutil.copyfileobj(f_in, f_out) |
| 98 | 96 |
| 102 with open(outjson) as jf: | 100 with open(outjson) as jf: |
| 103 params = json.loads(jf.read()) | 101 params = json.loads(jf.read()) |
| 104 | 102 |
| 105 workdir = params['output_data'][0]['extra_files_path'] | 103 workdir = params['output_data'][0]['extra_files_path'] |
| 106 os.mkdir(workdir) | 104 os.mkdir(workdir) |
| 107 url_download( FILE2TAXURL[dataset], dataset+".taxonomy", workdir) | 105 url_download( FILE2TAXURL[dataset], dataset + ".taxonomy", workdir) |
| 108 | 106 |
| 109 data_manager_json = {"data_tables":{}} | 107 data_manager_json = {"data_tables": {}} |
| 110 data_manager_entry = {} | 108 data_manager_entry = {} |
| 111 data_manager_entry['value'] = dataset | 109 data_manager_entry['value'] = dataset |
| 112 data_manager_entry['name'] = FILE2NAME[dataset] | 110 data_manager_entry['name'] = FILE2NAME[dataset] |
| 113 data_manager_entry['path'] = dataset+".taxonomy" | 111 data_manager_entry['path'] = dataset + ".taxonomy" |
| 114 data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) | 112 data_manager_entry['taxlevels'] = FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS) |
| 115 data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry | 113 data_manager_json["data_tables"]["dada2_taxonomy"] = data_manager_entry |
| 116 | 114 |
| 117 if FILE2SPECIESURL.get(dataset, False ): | 115 if FILE2SPECIESURL.get(dataset, False ): |
| 118 url_download( FILE2SPECIESURL[dataset], dataset+".species", workdir) | 116 url_download( FILE2SPECIESURL[dataset], dataset + ".species", workdir) |
| 119 data_manager_entry = {} | 117 data_manager_entry = {} |
| 120 data_manager_entry['value'] = dataset | 118 data_manager_entry['value'] = dataset |
| 121 data_manager_entry['name'] = FILE2NAME[dataset] | 119 data_manager_entry['name'] = FILE2NAME[dataset] |
| 122 data_manager_entry['path'] = dataset+".species" | 120 data_manager_entry['path'] = dataset + ".species" |
| 123 data_manager_json["data_tables"]["dada2_species"] = data_manager_entry | 121 data_manager_json["data_tables"]["dada2_species"] = data_manager_entry |
| 124 | 122 |
| 125 with file(outjson, 'w') as jf: | 123 with file(outjson, 'w') as jf: |
| 126 jf.write(json.dumps(data_manager_json)) | 124 jf.write(json.dumps(data_manager_json)) |
| | 125 |
| 127 | 126 |
| 128 if __name__ == '__main__': | 127 if __name__ == '__main__': |
| 129 parser = argparse.ArgumentParser(description='Create data manager json.') | 128 parser = argparse.ArgumentParser(description='Create data manager json.') |
| 130 parser.add_argument('--out', action='store', help='JSON filename') | 129 parser.add_argument('--out', action='store', help='JSON filename') |
| 131 parser.add_argument('--dataset', action='store', help='Download data set name') | 130 parser.add_argument('--dataset', action='store', help='Download data set name') |
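The UNITE branch of `url_download` in the diff above deserves a note: as the in-code comment says, the UNITE downloads are zip archives containing a top-level fasta plus a second copy under `developer/`, so the script unzips the archive, checks that exactly one top-level fasta came out, and re-gzips it over the original download path so the result matches the plain `.fa.gz` references. A standalone sketch of that step, with illustrative stand-in paths (not part of this commit):

```python
import glob
import gzip
import shutil
import zipfile

workdir = "workdir"                             # stand-in for extra_files_path
file_path = "workdir/unite_8.0_fungi.taxonomy"  # stand-in for the downloaded zip

# Unpack the UNITE archive next to the download.
with zipfile.ZipFile(file_path, "r") as zip_ref:
    zip_ref.extractall(workdir)

# Exactly one top-level fasta is expected (the second copy lives under developer/).
fastas = glob.glob("%s/*fasta" % workdir)
if len(fastas) != 1:
    raise Exception("expected 1 top-level fasta, found %d: %s" % (len(fastas), " ".join(fastas)))

# Re-compress the fasta over the original download path so it looks like the
# other *.fa.gz reference files.
with open(fastas[0], "rb") as f_in, gzip.open(file_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)
```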
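For orientation, this is roughly the JSON the script hands back to Galaxy once a download finishes, reconstructed from the dictionary-building code visible in the diff. The concrete dataset (`silva_132`) and its values are just an example; a dataset without an entry in `FILE2SPECIESURL` would omit the `dada2_species` table:

```python
import json

dataset = "silva_132"

# Mirrors the structure built by the script: one entry for the dada2_taxonomy
# data table, plus a dada2_species entry when a species-assignment URL exists.
data_manager_json = {
    "data_tables": {
        "dada2_taxonomy": {
            "value": dataset,
            "name": "Silva version 132",
            "path": dataset + ".taxonomy",
            "taxlevels": "Kingdom,Phylum,Class,Order,Family,Genus,Species",
        },
        "dada2_species": {
            "value": dataset,
            "name": "Silva version 132",
            "path": dataset + ".species",
        },
    }
}

print(json.dumps(data_manager_json, indent=2))
```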
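One portability wrinkle survives the cleanup: the output JSON is still opened with the Python 2 built-in `file()`, even though the `urllib` import block at the top goes out of its way to support both Python 2 and 3. Under Python 3, `file()` no longer exists and the write would fail with a `NameError`. A minimal, version-neutral sketch using `open()` (stand-in values, not part of this commit):

```python
import json

data_manager_json = {"data_tables": {}}  # stand-in for the dict built by the script
outjson = "data_manager_output.json"     # stand-in for the --out argument

# open() behaves the same on Python 2 and 3; the built-in file() was removed in Python 3.
with open(outjson, "w") as jf:
    jf.write(json.dumps(data_manager_json))
```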
