comparison data_manager/plasmidfinder_fetch_database.py @ 0:8ad77a9dc91d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_plasmidfinder commit f3383fdb9a17d1b69d05547cdb96534a5f762bec-dirty
author pimarin
date Tue, 14 Feb 2023 14:25:43 +0000
parents
children 9ef108e34063
comparison
equal deleted inserted replaced
-1:000000000000 0:8ad77a9dc91d
1 import argparse
2 import json
3 import os
4 import tarfile
5 from datetime import datetime
6 from pathlib import Path
7
8 import requests
9
10
class GetPlasmidfinderDataManager:
    """
    Assemble the json-ready data table describing a plasmidfinder database
    for the galaxy data manager
    """

    def __init__(self,
                 plasmidfinder_database="plasmidfinder_database",
                 db_name="plasmidfinder-db",
                 plasmidfinder_version="latest"):
        """
        @plasmidfinder_database: name of the data table the entry is stored under
        @db_name: value written in the "path" field of the entry
        @plasmidfinder_version: "latest" or an explicit version label
        """
        # Settings given by the caller
        self.data_table_name = plasmidfinder_database
        self._db_name = db_name
        self._plasmidfinder_version = plasmidfinder_version
        # Not known at construction time; get_data_manager() reads them as they are
        self._commit_number = None
        self._plasmidfinder_date_version = None
        # Filled by get_data_table_format() and get_data_manager()
        self.data_table_entry = None
        self.plasmidfinder_table_list = None

    def get_data_table_format(self):
        """
        Build the empty data table skeleton and remember it on the instance
        return: {"data_tables": {<data_table_name>: {}}}
        """
        skeleton = {"data_tables": {self.data_table_name: {}}}
        self.data_table_entry = skeleton
        return skeleton

    def get_data_manager(self):
        """
        Build a fresh skeleton and store a single database entry in it.
        The entry value always carries the commit number; the entry name
        starts with the commit number when "latest" was requested and with
        the version label otherwise.
        return: the data table holding the database information
        """
        table = self.get_data_table_format()
        self.plasmidfinder_table_list = table

        date_tag = self._plasmidfinder_date_version
        wants_latest = self._plasmidfinder_version == "latest"
        label = self._commit_number if wants_latest else self._plasmidfinder_version

        entry = {
            "value": f"plasmidfinder_{self._commit_number}_{date_tag}",
            "name": f"{label}_{date_tag}",
            "date": date_tag,
            "path": self._db_name,
        }
        table["data_tables"][self.data_table_name] = [entry]
        return table
61
62
class DownloadPlasmidfinderDatabase(GetPlasmidfinderDataManager):
    """
    Download the plasmidfinder database from the bitbucket repository.
    Build the data manager info for galaxy
    """

    # Archive name appended to the base url for each supported version.
    # Support a new database version by adding its entry here.
    _VERSION_ARCHIVES = {
        "latest": "master.gz",
        "2.1": "1307168.gz",
    }
    # Passed as `timeout` to requests.get so a stalled server cannot hang the job
    _REQUEST_TIMEOUT = 300

    def __init__(self,
                 output_dir=None,
                 plasmidfinder_url="https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/",
                 db_name="plasmidfinder-db",
                 db_tmp="tmp_database",
                 plasmidfinder_version="latest",
                 json_file_path=None,
                 date_version=None):
        """
        @output_dir: folder receiving the database (default: current working directory)
        @plasmidfinder_url: base url the archive name is appended to
        @db_name: name of the final database folder
        @db_tmp: name of the folder used for the download and the extraction
        @plasmidfinder_version: "latest" or "2.1"; None is handled as "latest"
        @json_file_path: json file read by read_json_input_file and rewritten by write_json_infos
        @date_version: date written in the data table (default: today as YYYY-MM-DD)
        """
        # The command line gives None when no version is requested
        if plasmidfinder_version is None:
            plasmidfinder_version = "latest"
        super().__init__(db_name=db_name,
                         plasmidfinder_version=plasmidfinder_version)
        self.json_file_path = json_file_path
        # Defaults are resolved here, at call time, and not once when the
        # class is defined
        self._output_dir = Path.cwd() if output_dir is None else output_dir
        self._plasmidfinder_url = plasmidfinder_url
        self._temporary_folder = db_tmp
        self._db_name_tar = f'{db_name}.gz'
        if date_version is None:
            date_version = datetime.now().strftime("%Y-%m-%d")
        self._plasmidfinder_date_version = date_version

    def extract_db_commit(self, request_header, title_name="content-disposition"):
        """
        Extract the commit id to add the information as identifier of the download
        @request_header: a request object obtained from requests.get()
        @title_name: the tag to search in the header of the requests object
        return: the value of the commit
        """
        db_info = request_header.headers[title_name]
        # The commit is the third "-" separated field, cut at its first "."
        commit_number = db_info.split("-")[2].split(".")[0]
        return commit_number

    def untar_files(self, file_path: Path, extracted_path_output: Path):
        """
        untar the download archive
        Exit with an error message if the archive cannot be read or extracted
        @file_path: input path of the tar.gz file
        @extracted_path_output: output path of the extract folder
        return: the path of the output
        """
        try:
            with file_path.open('rb') as fh_in, \
                    tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
                # The archive is downloaded: use the 'data' extraction filter
                # when this python provides it, the historical behaviour if not
                tar_file.extraction_filter = getattr(
                    tarfile, 'data_filter', (lambda member, path: member))
                tar_file.extractall(path=extracted_path_output)
        except (OSError, tarfile.TarError):
            # A corrupted archive raises tarfile.TarError, which is not an OSError
            raise SystemExit(f'ERROR: Could not extract {file_path}')
        print(f'Untar the database in {extracted_path_output}')
        return extracted_path_output

    def choose_db_version(self):
        """
        Update the url link depending on the version chosen by user.
        This method could be upgraded simply by adding the new versions
        in _VERSION_ARCHIVES
        Exit with an error message if the version is not supported
        """
        archive_name = self._VERSION_ARCHIVES.get(self._plasmidfinder_version)
        if archive_name is None:
            supported = ", ".join(self._VERSION_ARCHIVES)
            raise SystemExit(
                f"ERROR: Unsupported plasmidfinder database version "
                f"'{self._plasmidfinder_version}' (supported: {supported})")
        # Do not append the archive name a second time on repeated calls
        if not self._plasmidfinder_url.endswith(archive_name):
            self._plasmidfinder_url = f"{self._plasmidfinder_url}{archive_name}"

    def download_database(self):
        """
        Download the plasmidfinder database using requests lib
        Make the directory and temporary directory for download
        Untar the download files
        Exit with an error message if the download fails, so that no data
        table entry is written for a database that does not exist
        """
        self._output_dir = Path(self._output_dir)
        self.choose_db_version()
        try:
            request_info = requests.get(self._plasmidfinder_url,
                                        timeout=self._REQUEST_TIMEOUT)
            request_info.raise_for_status()
        except requests.exceptions.RequestException as request_error:
            print(f"Requests Error: {request_error}")
            raise SystemExit(f"Fail to import Plasmidfinder database from {self._plasmidfinder_url}")

        self._commit_number = self.extract_db_commit(request_info)
        output_tar_path = self._output_dir.joinpath(self._temporary_folder)
        output_tar_path_file = output_tar_path.joinpath(self._db_name_tar)
        output_path = self._output_dir.joinpath(self._db_name)
        os.makedirs(output_tar_path, exist_ok=True)
        os.makedirs(output_path, exist_ok=True)
        with open(output_tar_path_file, 'wb') as archive_fh:
            archive_fh.write(request_info.content)
        untar_output = self.untar_files(
            file_path=output_tar_path_file,
            extracted_path_output=output_tar_path.joinpath(self._db_name))
        self.moove_download_files(older_path=untar_output, new_path=output_path)

    def moove_download_files(self, older_path, new_path, expression_search="*fsa"):
        """
        Clean downloaded data by moving fasta files in the final folder
        Files are searched recursively; only the file name is kept, so two
        files with the same name end up as one file
        @older_path: previous path where the files are located
        @new_path: final path where files will be moved
        @expression_search: keep only file with this expression
        """
        # List the files before moving any of them out of the searched folder
        fasta_paths = [found for found in Path(older_path).rglob(expression_search)
                       if found.is_file()]
        for fasta_path in fasta_paths:
            self.keep_filename(pathname=fasta_path, output_path=new_path)

    def keep_filename(self, pathname, output_path):
        """
        Move a file in a folder, keeping its file name
        @pathname: previous path
        @output_path: final path
        """
        source_path = Path(pathname)
        source_path.replace(Path(output_path).joinpath(source_path.name))

    def read_json_input_file(self):
        """
        Import the json file, create the folder given as extra_files_path of
        its first output and use it as output directory
        """
        with open(self.json_file_path) as fh:
            params = json.load(fh)
        target_dir = params['output_data'][0]['extra_files_path']
        os.makedirs(target_dir, exist_ok=True)
        self._output_dir = target_dir

    def write_json_infos(self):
        """
        Write the data table in the imported json file, replacing its content
        """
        with open(self.json_file_path, 'w') as fh:
            json.dump(self.get_data_manager(), fh, sort_keys=True)
187
188
def parse_arguments(argv=None):
    """
    List of arguments provided by the user
    @argv: list of arguments to parse instead of the command line (default: sys.argv[1:])
    return: parsed arguments
    """
    # parse options and arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json",
                            help="json file from galaxy")
    # Without a default an omitted option gives None, which is neither
    # "latest" nor a known version number
    arg_parser.add_argument("-v", "--db_version",
                            default="latest",
                            help="version of the plasmidfinder (latest or 2.1)")
    return arg_parser.parse_args(argv)
201
202
def main():
    """
    Run the data manager: parse the command line, read the galaxy json file,
    download the database and write the data table back in the json file
    """
    cli_args = parse_arguments()
    downloader = DownloadPlasmidfinderDatabase(
        plasmidfinder_version=cli_args.db_version,
        json_file_path=cli_args.data_manager_json,
    )
    downloader.read_json_input_file()
    downloader.download_database()
    downloader.write_json_infos()
209
210
# Run main() only when the file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()