Mercurial > repos > iuc > amrfinderplus_data_manager_build
comparison data_manager/data_manager_build_amrfinderplus.py @ 2:8fa7efc32500 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_amrfinderplus commit 45dbbf06a59df43da2c321c272de11cc41e17d43
| author | iuc |
|---|---|
| date | Sun, 23 Nov 2025 12:21:18 +0000 |
| parents | 585cdfaf6ddb |
| children | |
comparison
Legend: equal, deleted, inserted, replaced
| 1:d4e3b8d47f49 | 2:8fa7efc32500 |
|---|---|
| 12 class GetAmrFinderPlusDataManager: | 12 class GetAmrFinderPlusDataManager: |
| 13 """ | 13 """ |
| 14 Create the json file with database information for galaxy data manager | 14 Create the json file with database information for galaxy data manager |
| 15 """ | 15 """ |
| 16 | 16 |
| 17 def __init__(self, | 17 def __init__( |
| 18 amrfinderplus_database="amrfinderplus_database", | 18 self, |
| 19 db_name="amrfinderplus-db", | 19 amrfinderplus_database="amrfinderplus_versioned_database", |
| 20 amrfinderplus_version="latest", | 20 db_name="amrfinderplus-db", |
| 21 date_version=None): | 21 amrfinderplus_version="latest", |
| 22 date_version=None, | |
| 23 ): | |
| 22 self.data_table_name = amrfinderplus_database | 24 self.data_table_name = amrfinderplus_database |
| 23 self._db_name = db_name | 25 self._db_name = db_name |
| 24 self._amrfinderplus_version = amrfinderplus_version | 26 self._amrfinderplus_version = amrfinderplus_version |
| 25 self._amrfinderplus_date_version = date_version | 27 self._amrfinderplus_date_version = date_version |
| 26 self.data_table_entry = None | 28 self.data_table_entry = None |
| 29 def get_data_table_format(self): | 31 def get_data_table_format(self): |
| 30 """ | 32 """ |
| 31 Skeleton of a data_table format | 33 Skeleton of a data_table format |
| 32 return: a data table formatted for json output | 34 return: a data table formatted for json output |
| 33 """ | 35 """ |
| 34 self.data_table_entry = { | 36 self.data_table_entry = {"data_tables": {self.data_table_name: {}}} |
| 35 "data_tables": { | |
| 36 self.data_table_name: {} | |
| 37 } | |
| 38 } | |
| 39 return self.data_table_entry | 37 return self.data_table_entry |
| 40 | 38 |
| 41 def get_data_manager(self): | 39 def get_data_manager(self): |
| 42 """ | 40 """ |
| 43 Create the empty data table format and add all the information into | 41 Create the empty data table format and add all the information into |
| 44 return: The data table with database information | 42 return: The data table with database information |
| 45 """ | 43 """ |
| 46 self.amrfinderplus_table_list = self.get_data_table_format() | 44 self.amrfinderplus_table_list = self.get_data_table_format() |
| 47 amrfinderplus_value = f"amrfinderplus_V{self._amrfinderplus_version}" \ | 45 amrfinderplus_value = ( |
| 48 f"_{self._amrfinderplus_date_version}" | 46 f"amrfinderplus_V{self._amrfinderplus_version}" |
| 49 amrfinderplus_name = f"V{self._amrfinderplus_version}" \ | 47 f"_{self._amrfinderplus_date_version}" |
| 50 f"-{self._amrfinderplus_date_version}" | 48 ) |
| 51 data_info = dict(value=amrfinderplus_value, | 49 amrfinderplus_name = ( |
| 52 name=amrfinderplus_name, | 50 f"V{self._amrfinderplus_version}" f"-{self._amrfinderplus_date_version}" |
| 53 path=self._db_name) | 51 ) |
| 52 data_info = dict( | |
| 53 value=amrfinderplus_value, | |
| 54 name=amrfinderplus_name, | |
| 55 db_version=self._amrfinderplus_version, | |
| 56 path=self._db_name, | |
| 57 ) | |
| 54 self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info] | 58 self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info] |
| 55 return self.amrfinderplus_table_list | 59 return self.amrfinderplus_table_list |
| 56 | 60 |
| 57 | 61 |
| 58 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager): | 62 class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager): |
| 60 Download the amrfinderplus database from the ncbi. | 64 Download the amrfinderplus database from the ncbi. |
| 61 Make the database available with hmm and indexed files | 65 Make the database available with hmm and indexed files |
| 62 Build the data manager infos for galaxy | 66 Build the data manager infos for galaxy |
| 63 """ | 67 """ |
| 64 | 68 |
| 65 def __init__(self, | 69 def __init__( |
| 66 output_dir=Path.cwd(), | 70 self, |
| 67 ncbi_url="ftp.ncbi.nlm.nih.gov", | 71 output_dir=Path.cwd(), |
| 68 ftp_login="anonymous", | 72 ncbi_url="ftp.ncbi.nlm.nih.gov", |
| 69 ftp_password="anonymous", | 73 ftp_login="anonymous", |
| 70 amrfinderplus_database="amrfinderplus_database", | 74 ftp_password="anonymous", |
| 71 db_name="amrfinderplus-db", | 75 amrfinderplus_database="amrfinderplus_database", |
| 72 amrfinderplus_version="latest", | 76 db_name="amrfinderplus-db", |
| 73 json_file_path=None, | 77 amrfinderplus_version="latest", |
| 74 date_version=None, | 78 json_file_path=None, |
| 75 amrfinderplus_db_path=None, | 79 date_version=None, |
| 76 test_mode=False): | 80 amrfinderplus_db_path=None, |
| 81 test_mode=False, | |
| 82 ): | |
| 77 | 83 |
| 78 super().__init__() | 84 super().__init__() |
| 79 self.json_file_path = json_file_path | 85 self.json_file_path = json_file_path |
| 80 self._output_dir = output_dir | 86 self._output_dir = output_dir |
| 81 self._ncbi_ftp_url = ncbi_url | 87 self._ncbi_ftp_url = ncbi_url |
| 82 self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" | 88 self._ncbi_database_path = ( |
| 89 "pathogen/Antimicrobial_resistance/AMRFinderPlus/database" | |
| 90 ) | |
| 83 self._login = ftp_login | 91 self._login = ftp_login |
| 84 self._password = ftp_password | 92 self._password = ftp_password |
| 85 self._amrfinderplus_database = amrfinderplus_database | 93 self._amrfinderplus_database = amrfinderplus_database |
| 86 self._db_name = db_name | 94 self._db_name = db_name |
| 87 self._amrfinderplus_version = amrfinderplus_version | 95 self._amrfinderplus_version = amrfinderplus_version |
| 100 """ | 108 """ |
| 101 cmd = [command] | 109 cmd = [command] |
| 102 [cmd.append(i) for i in args] | 110 [cmd.append(i) for i in args] |
| 103 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) | 111 proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE) |
| 104 if proc.returncode != 0: | 112 if proc.returncode != 0: |
| 105 print(f'Error type {proc.returncode} with : \n {proc}') | 113 print(f"Error type {proc.returncode} with : \n {proc}") |
| 106 | 114 |
| 107 def download_amrfinderplus_db(self): | 115 def download_amrfinderplus_db(self): |
| 108 """ | 116 """ |
| 109 Download the amrfinderplus database from the ncbi ftp server | 117 Download the amrfinderplus database from the ncbi ftp server |
| 110 """ | 118 """ |
| 111 self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}' | 119 self.amrfinderplus_db_path = f"{self._output_dir}/{self._db_name}" |
| 112 os.makedirs(self.amrfinderplus_db_path) | 120 os.makedirs(self.amrfinderplus_db_path) |
| 113 if self._amrfinderplus_version == 'latest': | 121 |
| 122 if self._amrfinderplus_version == "latest": | |
| 114 self.get_amrfinderplus_version() | 123 self.get_amrfinderplus_version() |
| 115 | 124 |
| 116 amrfinderplus_ftp_path = f"ftp://{self._login}:" \ | 125 amrfinderplus_ftp_path = ( |
| 117 f"{self._password}@{self._ncbi_ftp_url}/" \ | 126 f"ftp://{self._login}:" |
| 118 f"{self._ncbi_database_path}/" \ | 127 f"{self._password}@{self._ncbi_ftp_url}/" |
| 119 f"{self._amrfinderplus_version}/" \ | 128 f"{self._ncbi_database_path}/" |
| 120 f"{self._amrfinderplus_date_version}" | 129 f"{self._amrfinderplus_version}/" |
| 130 f"{self._amrfinderplus_date_version}" | |
| 131 ) | |
| 132 | |
| 133 if self._amrfinderplus_version == "3.12": | |
| 134 taxa_group_file = "taxgroup.tab" | |
| 135 test_dna_fasta = "AMR_DNA-Escherichia" | |
| 136 else: | |
| 137 taxa_group_file = "taxgroup.tsv" | |
| 138 test_dna_fasta = "AMR_DNA-Escherichia.fa" | |
| 121 if self.test_mode is True: | 139 if self.test_mode is True: |
| 122 file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"] | 140 file_list = [ |
| 141 test_dna_fasta, | |
| 142 "version.txt", | |
| 143 taxa_group_file, | |
| 144 "database_format_version.txt", | |
| 145 ] | |
| 123 output_option = "-O" | 146 output_option = "-O" |
| 124 for file in file_list: | 147 for file in file_list: |
| 125 self.subprocess_cmd("wget", | 148 self.subprocess_cmd( |
| 126 "-nd", | 149 "wget", |
| 127 "-np", | 150 "-nd", |
| 128 "-r", | 151 "-np", |
| 129 f"{amrfinderplus_ftp_path}/{file}", | 152 "-r", |
| 130 output_option, | 153 f"{amrfinderplus_ftp_path}/{file}", |
| 131 f"{self.amrfinderplus_db_path}/{file}") | 154 output_option, |
| 155 f"{self.amrfinderplus_db_path}/{file}", | |
| 156 ) | |
| 132 else: | 157 else: |
| 133 output_option = "-P" | 158 output_option = "-P" |
| 134 self.subprocess_cmd("wget", | 159 self.subprocess_cmd( |
| 135 "-nd", | 160 "wget", |
| 136 "-np", | 161 "-nd", |
| 137 "-r", | 162 "-np", |
| 138 amrfinderplus_ftp_path, | 163 "-r", |
| 139 output_option, | 164 amrfinderplus_ftp_path, |
| 140 self.amrfinderplus_db_path) | 165 output_option, |
| 166 self.amrfinderplus_db_path, | |
| 167 ) | |
| 141 | 168 |
| 142 def make_hmm_profile(self): | 169 def make_hmm_profile(self): |
| 143 """ | 170 """ |
| 144 Make the hmm profile using the AMR.LIB file previously download | 171 Make the hmm profile using the AMR.LIB file previously download |
| 145 """ | 172 """ |
| 152 def extract_filelist_makeblast(self): | 179 def extract_filelist_makeblast(self): |
| 153 """ | 180 """ |
| 154 Extract le list of species which have file in the database | 181 Extract le list of species which have file in the database |
| 155 return: a filtered species list of available species in the database | 182 return: a filtered species list of available species in the database |
| 156 """ | 183 """ |
| 157 taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab") | 184 if self._amrfinderplus_version == "3.12": |
| 185 taxa_group_file = "taxgroup.tab" | |
| 186 else: | |
| 187 taxa_group_file = "taxgroup.tsv" | |
| 188 taxa_group_path = Path(f"{self.amrfinderplus_db_path}/{taxa_group_file}") | |
| 158 if Path.exists(taxa_group_path): | 189 if Path.exists(taxa_group_path): |
| 159 taxa_table = pd.read_table(taxa_group_path) | 190 taxa_table = pd.read_table(taxa_group_path) |
| 160 taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"] | 191 taxa_table.columns = [ |
| 161 taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(items=["taxgroup"], axis=1) | 192 "taxgroup", |
| 193 "gpipe_taxgroup", | |
| 194 "number_of_nucl_ref_genes", | |
| 195 ] | |
| 196 taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter( | |
| 197 items=["taxgroup"], axis=1 | |
| 198 ) | |
| 162 if self.test_mode is True: | 199 if self.test_mode is True: |
| 163 taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup | 200 taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup |
| 164 else: | 201 else: |
| 165 taxa_df = taxa_df.taxgroup | 202 taxa_df = taxa_df.taxgroup |
| 166 self.species_list = list(taxa_df) | 203 self.species_list = list(taxa_df) |
| 167 else: | 204 else: |
| 168 print("taxgroup.tab file is missing to list available species") | 205 print(f"{taxa_group_file} file is missing to list available species") |
| 169 | 206 |
| 170 def make_blastdb(self): | 207 def make_blastdb(self): |
| 171 """ | 208 """ |
| 172 Index fasta file for blast | 209 Index fasta file for blast |
| 173 """ | 210 """ |
| 174 self.extract_filelist_makeblast() | 211 self.extract_filelist_makeblast() |
| 175 nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in self.species_list] | 212 if self._amrfinderplus_version == "3.12": |
| 176 amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS' | 213 nucl_file_db_list = [ |
| 177 amr_prot = f'{self.amrfinderplus_db_path}/AMRProt' | 214 f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}" |
| 215 for specie in self.species_list | |
| 216 ] | |
| 217 amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS" | |
| 218 amr_prot = f"{self.amrfinderplus_db_path}/AMRProt" | |
| 219 else: | |
| 220 nucl_file_db_list = [ | |
| 221 f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}.fa" | |
| 222 for specie in self.species_list | |
| 223 ] | |
| 224 amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS.fa" | |
| 225 amr_prot = f"{self.amrfinderplus_db_path}/AMRProt.fa" | |
| 178 os.chdir(self.amrfinderplus_db_path) | 226 os.chdir(self.amrfinderplus_db_path) |
| 179 if Path(amr_dna).exists(): | 227 if Path(amr_dna).exists(): |
| 180 nucl_file_db_list.append(amr_dna) | 228 nucl_file_db_list.append(amr_dna) |
| 181 else: | 229 else: |
| 182 print("No file AMR_CDS detected for indexing") | 230 print("No file AMR_CDS detected for indexing") |
| 183 if Path(amr_prot).exists(): | 231 if Path(amr_prot).exists(): |
| 184 self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot") | 232 self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot") |
| 185 else: | 233 else: |
| 186 print("No file AMRProt detected for indexing") | 234 print("No file AMRProt detected for indexing") |
| 187 [self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl") for file in nucl_file_db_list] | 235 [ |
| 188 | 236 self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl") |
| 189 def get_amrfinderplus_version(self, version_file="version.txt", | 237 for file in nucl_file_db_list |
| 190 database_version_file="database_format_version.txt"): | 238 ] |
| 239 | |
| 240 def get_amrfinderplus_version( | |
| 241 self, | |
| 242 version_file="version.txt", | |
| 243 database_version_file="database_format_version.txt", | |
| 244 ): | |
| 191 """ | 245 """ |
| 192 Check the version when latest if provided and update the number | 246 Check the version when latest if provided and update the number |
| 193 param version_file: name of the file containing version information | 247 param version_file: name of the file containing version information |
| 194 param database_version_file: name of the file containing date version information | 248 param database_version_file: name of the file containing date version information |
| 195 """ | 249 """ |
| 196 ftp = FTP(self._ncbi_ftp_url) | 250 ftp = FTP(self._ncbi_ftp_url) |
| 197 ftp.login(self._login, self._password) | 251 ftp.login(self._login, self._password) |
| 198 ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}") | 252 ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}") |
| 199 db_version = BytesIO() | 253 db_version = BytesIO() |
| 200 db_date_version = BytesIO() | 254 db_date_version = BytesIO() |
| 201 ftp.retrbinary(f'RETR {version_file}', db_version.write) | 255 ftp.retrbinary(f"RETR {version_file}", db_version.write) |
| 202 ftp.retrbinary(f'RETR {database_version_file}', db_date_version.write) | 256 ftp.retrbinary(f"RETR {database_version_file}", db_date_version.write) |
| 203 self._amrfinderplus_date_version = db_version.getvalue().decode("utf-8").splitlines()[0] | 257 self._amrfinderplus_date_version = ( |
| 204 self._amrfinderplus_version = '.'.join( | 258 db_version.getvalue().decode("utf-8").splitlines()[0] |
| 205 db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2]) | 259 ) |
| 260 self._amrfinderplus_version = ".".join( | |
| 261 db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2] | |
| 262 ) | |
| 206 | 263 |
| 207 def read_json_input_file(self): | 264 def read_json_input_file(self): |
| 208 """ | 265 """ |
| 209 Import the json file | 266 Import the json file |
| 210 """ | 267 """ |
| 211 with open(self.json_file_path) as fh: | 268 with open(self.json_file_path) as fh: |
| 212 params = json.load(fh) | 269 params = json.load(fh) |
| 213 target_dir = params['output_data'][0]['extra_files_path'] | 270 target_dir = params["output_data"][0]["extra_files_path"] |
| 214 os.makedirs(target_dir) | 271 os.makedirs(target_dir) |
| 215 self._output_dir = target_dir | 272 self._output_dir = target_dir |
| 216 | 273 |
| 217 def write_json_infos(self): | 274 def write_json_infos(self): |
| 218 """ | 275 """ |
| 219 Write in the imported json file | 276 Write in the imported json file |
| 220 """ | 277 """ |
| 221 with open(self.json_file_path, 'w') as fh: | 278 with open(self.json_file_path, "w") as fh: |
| 222 json.dump(self.get_data_manager(), fh, sort_keys=True) | 279 json.dump(self.get_data_manager(), fh, sort_keys=True) |
| 223 | 280 |
| 224 | 281 |
| 225 def parse_arguments(): | 282 def parse_arguments(): |
| 226 """ | 283 """ |
| 227 List of arguments provided by the user | 284 List of arguments provided by the user |
| 228 return: parsed arguments | 285 return: parsed arguments |
| 229 """ | 286 """ |
| 230 # parse options and arguments | 287 # parse options and arguments |
| 231 arg_parser = argparse.ArgumentParser() | 288 arg_parser = argparse.ArgumentParser() |
| 232 arg_parser.add_argument("data_manager_json", | 289 arg_parser.add_argument("data_manager_json", help="json file from galaxy") |
| 233 help="json file from galaxy") | 290 arg_parser.add_argument( |
| 234 arg_parser.add_argument("--db_version", default="latest", | 291 "--db_version", |
| 235 help="select the major version of the database (e.g. 3.10, 3.8), default is latest") | 292 default="latest", |
| 236 arg_parser.add_argument("--db_date", | 293 help="select the major version of the database (e.g. 3.10, 3.8), default is latest", |
| 237 help="select the date into the database version (e.g. 2022-10-11.2)") | 294 ) |
| 238 arg_parser.add_argument("--test", action='store_true', | 295 arg_parser.add_argument( |
| 239 help="option to test the script with an lighted database") | 296 "--db_date", |
| 297 help="select the date into the database version (e.g. 2022-10-11.2)", | |
| 298 ) | |
| 299 arg_parser.add_argument( | |
| 300 "--test", | |
| 301 action="store_true", | |
| 302 help="option to test the script with an lighted database", | |
| 303 ) | |
| 240 return arg_parser.parse_args() | 304 return arg_parser.parse_args() |
| 241 | 305 |
| 242 | 306 |
| 243 def main(): | 307 def main(): |
| 244 all_args = parse_arguments() | 308 all_args = parse_arguments() |
| 245 amrfinderplus_download = DownloadAmrFinderPlusDatabase(amrfinderplus_version=all_args.db_version, | 309 amrfinderplus_download = DownloadAmrFinderPlusDatabase( |
| 246 date_version=all_args.db_date, | 310 amrfinderplus_version=all_args.db_version, |
| 247 json_file_path=all_args.data_manager_json, | 311 date_version=all_args.db_date, |
| 248 test_mode=all_args.test) | 312 json_file_path=all_args.data_manager_json, |
| 313 test_mode=all_args.test, | |
| 314 ) | |
| 249 amrfinderplus_download.read_json_input_file() | 315 amrfinderplus_download.read_json_input_file() |
| 250 amrfinderplus_download.download_amrfinderplus_db() | 316 amrfinderplus_download.download_amrfinderplus_db() |
| 251 amrfinderplus_download.make_hmm_profile() | 317 amrfinderplus_download.make_hmm_profile() |
| 252 amrfinderplus_download.make_blastdb() | 318 amrfinderplus_download.make_blastdb() |
| 253 amrfinderplus_download.write_json_infos() | 319 amrfinderplus_download.write_json_infos() |
| 254 | 320 |
| 255 | 321 |
| 256 if __name__ == '__main__': | 322 if __name__ == "__main__": |
| 257 main() | 323 main() |
