comparison data_manager/bakta_build_database.py @ 27:2879a0e702d5 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database commit 3bfd33ae9741216e50373ddd04914d82f9731883
author pimarin
date Wed, 23 Aug 2023 14:38:16 +0000
parents 0408796bce2a
children
comparison
equal deleted inserted replaced
26:0408796bce2a 27:2879a0e702d5
1 import argparse 1 import argparse
2 import hashlib 2 import hashlib
3 import json 3 import json
4 import os 4 import os
5 import re
5 import sys 6 import sys
6 import tarfile 7 import tarfile
7 from datetime import datetime 8 from datetime import datetime
8 from pathlib import Path 9 from pathlib import Path
9 10
class GetBaktaDatabaseInfo:
    """
    Extract bakta database information to make a json file for data_manager

    The class downloads the list of published bakta database versions,
    selects the entry matching ``db_version`` and builds the data table
    entry written back to the data manager json file.
    """

    def __init__(
        self,
        data_table_name="bakta_database",
        db_version="latest",
        tarball_name="db.tar.gz",
        test_mode=False,
    ):
        """
        data_table_name: name of the data table filled by get_data_manager
        db_version: "latest" or "<major>.<minor>"; get_database_type also
            looks for the substring "light" and strips a "_..." suffix
        tarball_name: file name of the archive appended to the download url
        test_mode: when true, the version list is read from DB_TEST_URL
        """
        self.bakta_table_list = None
        self.db_url = None
        self.db_name = "bakta-db"
        self.db_type = ""
        self.light_db = False
        self.md5 = None
        self.data_table_entry = None
        self.data_table_name = data_table_name
        self.tar_name = tarball_name
        self.db_version = db_version
        self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json"
        self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json"
        self.test_mode = test_mode

    def get_database_type(self):
        """
        Detect a "light" database request and load the expected md5 sum

        Sets light_db, db_type, tar_name (light only), db_version (suffix
        after "_" removed) and md5. Exits with an error message when the
        requested version, or its checksum, is not in the version list.
        """
        self.light_db = "light" in self.db_version
        self.db_version = self.db_version.split(sep="_")[0]
        if self.light_db:
            self.db_type = "light"
            # must be set before fetch_db_versions, which builds db_url from it
            self.tar_name = "db-light.tar.gz"
        version_info = self.fetch_db_versions()
        if version_info is None:
            sys.exit(
                f"ERROR: bakta database version '{self.db_version}' "
                f"not found in {self.DB_VERSIONS_URL}"
            )
        md5_key = "md5-light" if self.light_db else "md5"
        if md5_key not in version_info:
            sys.exit(
                f"ERROR: no '{md5_key}' checksum listed for bakta database "
                f"version '{self.db_version}'"
            )
        self.md5 = version_info[md5_key]

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formated for json output
        """
        self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
        return self.data_table_entry

    @staticmethod
    def _select_version(versions, db_version):
        """
        Pick one entry of the version list
        return: the entry with the greatest "date" for "latest", the first
        entry whose "<major>.<minor>" equals db_version otherwise, or None
        """
        if db_version == "latest":
            # dates are compared as strings, as in the "%Y-%m-%d" layout
            return max(versions, key=lambda entry: entry["date"], default=None)
        for entry in versions:
            if "{0}.{1}".format(entry["major"], entry["minor"]) == db_version:
                return entry
        return None

    def fetch_db_versions(self):
        """
        List bakta database info related to the db_version selected
        return: the matching entry of the version list, or None when no
        entry matches; db_url is updated only when an entry is found
        raise: IOError (after printing it on stderr) when the version list
        cannot be downloaded
        """
        if self.test_mode:
            self.DB_VERSIONS_URL = self.DB_TEST_URL
        try:
            with requests.get(self.DB_VERSIONS_URL) as resp:
                # an HTTP error page must not be parsed as the version list
                resp.raise_for_status()
                versions = json.loads(resp.content)
        except IOError as e:
            print(e, file=sys.stderr)
            raise

        filtered_version = self._select_version(versions, self.db_version)
        if filtered_version is None:
            print("No matching version detected in the list")
        else:
            self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}"
        return filtered_version

    def get_data_manager(self, bakta_database_info):
        """
        Build the data table content for one database entry
        bakta_database_info: an entry returned by fetch_db_versions
        return: the data table dictionary holding a single row
        """
        self.bakta_table_list = self.get_data_table_format()
        bakta_name = (
            f"V{bakta_database_info['major']}."
            f"{bakta_database_info['minor']}{self.db_type}_"
            f"{bakta_database_info['date']}"
        )
        tool_version = (
            f"{bakta_database_info['software-min']['major']}."
            f"{bakta_database_info['software-min']['minor']}"
        )
        data_info = dict(
            value=bakta_name,
            dbkey=bakta_database_info["record"],
            bakta_version=tool_version,
            path=self.db_name,
        )
        self.bakta_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.bakta_table_list
92 109
93 110
class InstallBaktaDatabase(GetBaktaDatabaseInfo):
    """
    Download the bakta database,
    check md5 sum,
    untar the download db
    """

    def __init__(
        self,
        db_dir=Path.cwd(),
        db_name="bakta-db",
        db_version="latest",
        test_mode=False
    ):
        """
        db_dir: directory receiving the tarball and the extracted database
        db_name: sub directory of db_dir where the tarball is extracted
        db_version: version string handed to get_database_type
        test_mode: when true, the test version list is used

        The constructor calls get_database_type, which fetches the version
        list to set the tarball name and the expected md5 sum.
        """
        super().__init__(db_version=db_version, test_mode=test_mode)
        self.md5 = None
        self.db_version = db_version
        self.db_dir = db_dir
        self.db_name = db_name
        self.tarball_path = ""
        self.test_mode = test_mode
        self.get_database_type()

    def download(self):
        """
        Stream the database tarball from db_url into db_dir
        Sets tarball_path on success and exits with an error message when
        the file cannot be written or the download fails.
        """
        bakta_path = Path(self.db_dir).joinpath(self.tar_name)
        try:
            with bakta_path.open("wb") as fh_out, requests.get(
                self.db_url, stream=True
            ) as resp:
                # do not store an HTTP error page as if it were the tarball
                resp.raise_for_status()
                for data in resp.iter_content(chunk_size=1024 * 1024):
                    fh_out.write(data)
        except IOError:
            # stop here: the following steps need a complete tarball
            sys.exit(
                f"ERROR: Could not download file from Zenodo!"
                f" url={self.db_url}, to={bakta_path}"
            )
        print(f"Download bakta database {self.db_version}")
        self.tarball_path = bakta_path

    def untar(self):
        """
        Extract the downloaded tarball into db_dir/db_name
        return: the extraction directory
        Exits with an error message when the archive cannot be read.
        """
        db_path = Path(self.db_dir).joinpath(self.db_name)
        # tarball_path is only set by download(); fall back to its location
        if self.tarball_path:
            tarball = Path(self.tarball_path)
        else:
            tarball = Path(self.db_dir).joinpath(self.tar_name)
        try:
            with tarball.open("rb") as fh_in, tarfile.open(
                fileobj=fh_in, mode="r:gz"
            ) as tar_file:
                # NOTE(review): extractall trusts the member paths of the
                # archive; the md5 check is the only guard before this step
                tar_file.extractall(path=db_path)
        except (OSError, EOFError, tarfile.TarError):
            # a corrupt archive raises tarfile.TarError, not OSError
            sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {db_path}")
        print(f"Untar the database in {db_path}")
        return db_path

    def calc_md5_sum(self, buffer_size=1048576):
        """
        Compare the md5 sum of the tarball in db_dir with the expected one
        buffer_size: number of bytes read per chunk
        return: True when the sums are equal, False otherwise
        """
        tarball_path = Path(self.db_dir).joinpath(self.tar_name)
        md5 = hashlib.md5()
        with tarball_path.open("rb") as fh:
            for chunk in iter(lambda: fh.read(buffer_size), b""):
                md5.update(chunk)
        digest = md5.hexdigest()
        is_valid = digest == self.md5
        if is_valid:
            print("\t...md5 control database OK")
        else:
            print(
                f"Error: corrupt database file! "
                f"calculated md5 = {digest}"
                f" different from {self.md5} "
            )
        return is_valid
191 183
192 184
def parse_arguments(argv=None):
    """
    Parse the command line of the data manager script

    argv: list of arguments to parse; None reads the process command line
    return: namespace with data_manager_json, database_version and test
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json")
    # not "required": the documented default must be reachable
    arg_parser.add_argument(
        "-d",
        "--database_version",
        help="Select the database version "
        "(major and minor eg. 4.0), "
        "default is the latest version",
        default="latest",
    )
    arg_parser.add_argument(
        "-t",
        "--test",
        action="store_true",
        help="option to test the script with an empty database",
    )
    return arg_parser.parse_args(argv)
206 205
207 206
def main():
    """
    Install a bakta database and write the data manager json file

    Reads the target directory from the json file given on the command
    line, downloads, checks and extracts the database there, then
    overwrites the json file with the data table entry.
    """
    all_args = parse_arguments()
    with open(all_args.data_manager_json) as fh:
        params = json.load(fh)
    target_dir = params["output_data"][0]["extra_files_path"]
    # the directory may already exist, e.g. when the job is run again
    os.makedirs(target_dir, exist_ok=True)
    # init the class to download bakta db
    bakta_upload = InstallBaktaDatabase(
        test_mode=all_args.test, db_version=all_args.database_version
    )
    bakta_db = bakta_upload.fetch_db_versions()
    # update the path for galaxy
    bakta_upload.db_dir = target_dir
    # download the database
    bakta_upload.download()
    # check md5 sum
    bakta_upload.calc_md5_sum()
    # untar db
    bakta_upload.untar()
    # make the data_manager metadata
    bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db)
    with open(all_args.data_manager_json, "w") as fh:
        json.dump(bakta_data_manager, fh, sort_keys=True)
229 230
230 231
# Run the data manager only when the file is executed as a script.
if __name__ == "__main__":
    main()