comparison data_manager/data_manager_build_amrfinderplus.py @ 2:8fa7efc32500 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_amrfinderplus commit 45dbbf06a59df43da2c321c272de11cc41e17d43
author iuc
date Sun, 23 Nov 2025 12:21:18 +0000
parents 585cdfaf6ddb
children
comparison
equal deleted inserted replaced
1:d4e3b8d47f49 2:8fa7efc32500
class GetAmrFinderPlusDataManager:
    """
    Create the json file with database information for galaxy data manager

    The instance stores the name of the data table to fill, the directory
    name of the database and its version strings, and turns them into the
    ``{"data_tables": {<table name>: [<entry>]}}`` structure.
    """

    def __init__(
        self,
        amrfinderplus_database="amrfinderplus_versioned_database",
        db_name="amrfinderplus-db",
        amrfinderplus_version="latest",
        date_version=None,
    ):
        """
        param amrfinderplus_database: name of the data table receiving the entry
        param db_name: value written in the "path" field of the entry
        param amrfinderplus_version: database version (e.g. 3.12) or "latest"
        param date_version: dated release inside the database version
        """
        self.data_table_name = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version
        self.data_table_entry = None
        # Declared up front so the attribute exists before get_data_manager()
        # has been called.
        self.amrfinderplus_table_list = None

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formatted for json output
        """
        self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
        return self.data_table_entry

    def get_data_manager(self):
        """
        Create the empty data table format and add all the information into
        return: The data table with database information
        """
        self.amrfinderplus_table_list = self.get_data_table_format()
        version = self._amrfinderplus_version
        date_version = self._amrfinderplus_date_version
        data_info = dict(
            value=f"amrfinderplus_V{version}_{date_version}",
            name=f"V{version}-{date_version}",
            db_version=version,
            path=self._db_name,
        )
        # The table holds a single entry, wrapped in a list.
        self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.amrfinderplus_table_list
56 60
57 61
58 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager): 62 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
60 Download the amrfinderplus database from the ncbi. 64 Download the amrfinderplus database from the ncbi.
61 Make the database available with hmm and indexed files 65 Make the database available with hmm and indexed files
62 Build the data manager infos for galaxy 66 Build the data manager infos for galaxy
63 """ 67 """
64 68
65 def __init__(self, 69 def __init__(
66 output_dir=Path.cwd(), 70 self,
67 ncbi_url="ftp.ncbi.nlm.nih.gov", 71 output_dir=Path.cwd(),
68 ftp_login="anonymous", 72 ncbi_url="ftp.ncbi.nlm.nih.gov",
69 ftp_password="anonymous", 73 ftp_login="anonymous",
70 amrfinderplus_database="amrfinderplus_database", 74 ftp_password="anonymous",
71 db_name="amrfinderplus-db", 75 amrfinderplus_database="amrfinderplus_database",
72 amrfinderplus_version="latest", 76 db_name="amrfinderplus-db",
73 json_file_path=None, 77 amrfinderplus_version="latest",
74 date_version=None, 78 json_file_path=None,
75 amrfinderplus_db_path=None, 79 date_version=None,
76 test_mode=False): 80 amrfinderplus_db_path=None,
81 test_mode=False,
82 ):
77 83
78 super().__init__() 84 super().__init__()
79 self.json_file_path = json_file_path 85 self.json_file_path = json_file_path
80 self._output_dir = output_dir 86 self._output_dir = output_dir
81 self._ncbi_ftp_url = ncbi_url 87 self._ncbi_ftp_url = ncbi_url
82 self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" 88 self._ncbi_database_path = (
89 "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
90 )
83 self._login = ftp_login 91 self._login = ftp_login
84 self._password = ftp_password 92 self._password = ftp_password
85 self._amrfinderplus_database = amrfinderplus_database 93 self._amrfinderplus_database = amrfinderplus_database
86 self._db_name = db_name 94 self._db_name = db_name
87 self._amrfinderplus_version = amrfinderplus_version 95 self._amrfinderplus_version = amrfinderplus_version
100 """ 108 """
101 cmd = [command] 109 cmd = [command]
102 [cmd.append(i) for i in args] 110 [cmd.append(i) for i in args]
103 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) 111 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
104 if proc.returncode != 0: 112 if proc.returncode != 0:
105 print(f'Error type {proc.returncode} with : \n {proc}') 113 print(f"Error type {proc.returncode} with : \n {proc}")
106 114
def download_amrfinderplus_db(self):
    """
    Download the amrfinderplus database from the ncbi ftp server

    The files are written under ``<output dir>/<db name>``. When the
    requested version is "latest", the real version and date are resolved
    first with get_amrfinderplus_version(). In test mode only four files are
    fetched one by one, otherwise the whole release directory is requested.
    """
    self.amrfinderplus_db_path = f"{self._output_dir}/{self._db_name}"
    # exist_ok: a directory left by a previous run must not abort the download.
    os.makedirs(self.amrfinderplus_db_path, exist_ok=True)

    if self._amrfinderplus_version == "latest":
        self.get_amrfinderplus_version()

    amrfinderplus_ftp_path = (
        f"ftp://{self._login}:"
        f"{self._password}@{self._ncbi_ftp_url}/"
        f"{self._ncbi_database_path}/"
        f"{self._amrfinderplus_version}/"
        f"{self._amrfinderplus_date_version}"
    )

    if self.test_mode is not True:
        self.subprocess_cmd(
            "wget",
            "-nd",
            "-np",
            "-r",
            amrfinderplus_ftp_path,
            "-P",
            self.amrfinderplus_db_path,
        )
        return

    # Every release before 4.0 uses the file names without extension, not
    # only 3.12 (the script used these names for all releases before the
    # .tsv/.fa names were introduced).
    major = str(self._amrfinderplus_version).split(".", 1)[0]
    legacy_names = major.isdigit() and int(major) < 4
    if legacy_names:
        taxa_group_file = "taxgroup.tab"
        test_dna_fasta = "AMR_DNA-Escherichia"
    else:
        taxa_group_file = "taxgroup.tsv"
        test_dna_fasta = "AMR_DNA-Escherichia.fa"

    file_list = [
        test_dna_fasta,
        "version.txt",
        taxa_group_file,
        "database_format_version.txt",
    ]
    for file in file_list:
        self.subprocess_cmd(
            "wget",
            "-nd",
            "-np",
            "-r",
            f"{amrfinderplus_ftp_path}/{file}",
            "-O",
            f"{self.amrfinderplus_db_path}/{file}",
        )
141 168
142 def make_hmm_profile(self): 169 def make_hmm_profile(self):
143 """ 170 """
144 Make the hmm profile using the AMR.LIB file previously download 171 Make the hmm profile using the AMR.LIB file previously download
145 """ 172 """
def extract_filelist_makeblast(self):
    """
    Extract the list of species which have a file in the database

    The taxgroup table of the downloaded database is read and only the
    groups with at least one nucleotide reference gene are kept. In test
    mode the list is further restricted to "Escherichia". The result is
    stored in self.species_list.
    return: a filtered species list of available species in the database
    (empty when the taxgroup file is missing)
    """
    # Every release before 4.0 names the table taxgroup.tab, not only 3.12.
    major = str(self._amrfinderplus_version).split(".", 1)[0]
    legacy_names = major.isdigit() and int(major) < 4
    taxa_group_file = "taxgroup.tab" if legacy_names else "taxgroup.tsv"
    taxa_group_path = Path(f"{self.amrfinderplus_db_path}/{taxa_group_file}")

    if not taxa_group_path.exists():
        print(f"{taxa_group_file} file is missing to list available species")
        # Keep the attribute defined so make_blastdb() can still iterate it.
        self.species_list = []
        return self.species_list

    taxa_table = pd.read_table(taxa_group_path)
    # The header of the file is replaced by fixed column names.
    taxa_table.columns = [
        "taxgroup",
        "gpipe_taxgroup",
        "number_of_nucl_ref_genes",
    ]
    species = taxa_table.loc[taxa_table.number_of_nucl_ref_genes > 0, "taxgroup"]
    if self.test_mode is True:
        species = species[species == "Escherichia"]
    self.species_list = list(species)
    return self.species_list
169 206
def make_blastdb(self):
    """
    Index fasta file for blast

    The species list is refreshed with extract_filelist_makeblast(), then
    makeblastdb is run (through subprocess_cmd) on the protein file and on
    every nucleotide file: one per species plus AMR_CDS when it exists.
    The working directory is changed to the database directory.
    """
    self.extract_filelist_makeblast()

    # Every release before 4.0 ships the fasta files without extension, not
    # only 3.12.
    major = str(self._amrfinderplus_version).split(".", 1)[0]
    legacy_names = major.isdigit() and int(major) < 4
    suffix = "" if legacy_names else ".fa"

    db_path = self.amrfinderplus_db_path
    nucl_file_db_list = [
        f"{db_path}/AMR_DNA-{specie}{suffix}" for specie in self.species_list
    ]
    amr_dna = f"{db_path}/AMR_CDS{suffix}"
    amr_prot = f"{db_path}/AMRProt{suffix}"

    os.chdir(db_path)
    if Path(amr_dna).exists():
        nucl_file_db_list.append(amr_dna)
    else:
        print("No file AMR_CDS detected for indexing")
    if Path(amr_prot).exists():
        self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
    else:
        print("No file AMRProt detected for indexing")
    # Plain loop: the calls are made for their side effect only.
    for file in nucl_file_db_list:
        self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl")
239
def get_amrfinderplus_version(
    self,
    version_file="version.txt",
    database_version_file="database_format_version.txt",
):
    """
    Check the version when latest is provided and update the number

    Both files are read from the directory of the requested version on the
    ftp server. The first line of version_file becomes the date version;
    the first two dot-separated fields of the first line of
    database_version_file become the version number.
    param version_file: name of the file containing version information
    param database_version_file: name of the file containing date version information
    """
    db_version = BytesIO()
    db_date_version = BytesIO()
    # Context manager: the connection is closed even if a transfer fails.
    with FTP(self._ncbi_ftp_url) as ftp:
        ftp.login(self._login, self._password)
        ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
        ftp.retrbinary(f"RETR {version_file}", db_version.write)
        ftp.retrbinary(f"RETR {database_version_file}", db_date_version.write)

    date_line = db_version.getvalue().decode("utf-8").splitlines()[0]
    format_line = db_date_version.getvalue().decode("utf-8").splitlines()[0]
    self._amrfinderplus_date_version = date_line
    self._amrfinderplus_version = ".".join(format_line.split(".")[:2])
206 263
def read_json_input_file(self):
    """
    Import the json file

    The directory given by output_data[0].extra_files_path is created and
    becomes the output directory of the download.
    """
    # make_blastdb() changes the working directory later on; store an
    # absolute path so that write_json_infos() still targets the same file.
    self.json_file_path = os.path.abspath(self.json_file_path)
    with open(self.json_file_path, encoding="utf-8") as fh:
        params = json.load(fh)
    target_dir = params["output_data"][0]["extra_files_path"]
    # exist_ok: do not fail when the directory is already there.
    os.makedirs(target_dir, exist_ok=True)
    self._output_dir = target_dir
216 273
def write_json_infos(self):
    """
    Write in the imported json file

    The content of the file is replaced by the data table returned by
    get_data_manager(), dumped with sorted keys.
    """
    # Build the content first: opening the file in "w" mode empties it, so
    # an error raised while building must happen before the file is opened.
    data_manager_infos = self.get_data_manager()
    with open(self.json_file_path, "w") as fh:
        json.dump(data_manager_infos, fh, sort_keys=True)
223 280
224 281
def parse_arguments(argv=None):
    """
    List of arguments provided by the user
    param argv: list of arguments to parse, the command line is used when None
    return: parsed arguments
    """
    # parse options and arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json", help="json file from galaxy")
    arg_parser.add_argument(
        "--db_version",
        default="latest",
        help="select the major version of the database (e.g. 3.10, 3.8), default is latest",
    )
    arg_parser.add_argument(
        "--db_date",
        help="select the date into the database version (e.g. 2022-10-11.2)",
    )
    arg_parser.add_argument(
        "--test",
        action="store_true",
        help="option to test the script with an lighted database",
    )
    # argv=None keeps the historical behaviour of reading sys.argv.
    return arg_parser.parse_args(argv)
241 305
242 306
def main():
    """
    Entry point of the script: parse the command line, build the
    downloader, then run the steps in order (read the galaxy json,
    download the database, make the hmm profile, index the fasta files,
    write the data table back in the json file).
    """
    cli = parse_arguments()
    downloader = DownloadAmrFinderPlusDatabase(
        json_file_path=cli.data_manager_json,
        amrfinderplus_version=cli.db_version,
        date_version=cli.db_date,
        test_mode=cli.test,
    )
    steps = (
        downloader.read_json_input_file,
        downloader.download_amrfinderplus_db,
        downloader.make_hmm_profile,
        downloader.make_blastdb,
        downloader.write_json_infos,
    )
    for step in steps:
        step()
254 320
255 321
256 if __name__ == '__main__': 322 if __name__ == "__main__":
257 main() 323 main()