Mercurial > repos > pimarin > data_manager_fetch_plasmidfinder
comparison data_manager/plasmidfinder_fetch_database.py @ 5:60cfd33bc2fb draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_plasmidfinder commit 0a3992c5be846fc9f18b7ca18f0adcd78f5b9396-dirty
author | pimarin |
---|---|
date | Mon, 24 Jul 2023 10:00:33 +0000 |
parents | e05fd47bcca6 |
children | f99089461adb |
comparison
equal
deleted
inserted
replaced
4:e05fd47bcca6 | 5:60cfd33bc2fb |
---|---|
1 import argparse | 1 import argparse |
2 import json | 2 import json |
3 import os | 3 import os |
4 import tarfile | 4 import time |
5 from datetime import datetime | |
6 from pathlib import Path | 5 from pathlib import Path |
7 | 6 |
8 import requests | 7 |
8 import git | |
9 | 9 |
10 | 10 |
11 class GetPlasmidfinderDataManager: | 11 class GetPlasmidfinderDataManager: |
12 """ | 12 """ |
13 Create the json file with database information for galaxy data manager | 13 Create the json file with database information for galaxy data manager |
14 """ | 14 """ |
15 | 15 |
16 def __init__(self, | 16 def __init__(self, |
17 plasmidfinder_database="plasmidfinder_database", | 17 plasmidfinder_database="plasmidfinder_database", |
18 db_name="plasmidfinder-db", | 18 db_name="plasmidfinder_database", |
19 plasmidfinder_version="latest"): | 19 plasmidfinder_version="latest"): |
20 self.data_table_name = plasmidfinder_database | 20 self.data_table_name = plasmidfinder_database |
21 self._db_name = db_name | 21 self._db_name = db_name |
22 self._plasmidfinder_version = plasmidfinder_version | 22 self._plasmidfinder_version = plasmidfinder_version |
23 self._plasmidfinder_date_version = None | 23 self._plasmidfinder_date_version = None |
66 Build the data manager info for galaxy | 66 Build the data manager info for galaxy |
67 """ | 67 """ |
68 | 68 |
69 def __init__(self, | 69 def __init__(self, |
70 output_dir=Path.cwd(), | 70 output_dir=Path.cwd(), |
71 plasmidfinder_url="https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/", | 71 plasmidfinder_url="https://bitbucket.org/genomicepidemiology/plasmidfinder_db/src/master", |
72 db_name="plasmidfinder-db", | 72 db_name="plasmidfinder_database", |
73 db_tmp="tmp_database", | 73 db_tmp="tmp_database", |
74 plasmidfinder_version="latest", | 74 plasmidfinder_version="latest", |
75 json_file_path=None, | 75 json_file_path=None, |
76 date_version=datetime.now().strftime("%Y-%m-%d")): | 76 date_version=None): |
77 | 77 |
78 super().__init__() | 78 super().__init__() |
79 self.json_file_path = json_file_path | 79 self.json_file_path = json_file_path |
80 self._output_dir = output_dir | 80 self._output_dir = output_dir |
81 self._plasmidfinder_url = plasmidfinder_url | 81 self._plasmidfinder_url = plasmidfinder_url |
84 self._db_name_tar = f'{db_name}.gz' | 84 self._db_name_tar = f'{db_name}.gz' |
85 self._plasmidfinder_version = plasmidfinder_version | 85 self._plasmidfinder_version = plasmidfinder_version |
86 self._plasmidfinder_date_version = date_version | 86 self._plasmidfinder_date_version = date_version |
87 self._commit_number = None | 87 self._commit_number = None |
88 | 88 |
89 def extract_db_commit(self, request_header, title_name="content-disposition"): | 89 def git_clone(self): |
90 """ | 90 git.Repo.clone_from(url=self._plasmidfinder_url, to_path=self._output_dir) |
91 Extract the commit if to add the information as identifier of the download | 91 self._plasmidfinder_repository = git.Repo(path=self._output_dir) |
92 @request_header: a request object obtained from requests.get() | |
93 @title_name: the tag to search in the header of the requests object | |
94 return: the value of the commit | |
95 """ | |
96 db_info = request_header.headers[title_name] | |
97 commit_number = db_info.split("-")[2].split(".")[0] | |
98 return commit_number | |
99 | 92 |
100 def untar_files(self, file_path: Path, extracted_path_output: Path): | 93 def get_commit_number(self): |
101 """ | 94 sha = self._plasmidfinder_repository.head.commit.hexsha |
102 untar the download archive | 95 short_sha = self._plasmidfinder_repository.git.rev_parse(sha, short=7) |
103 @file_path: input path of the tar.gz file | 96 self._commit_number = short_sha |
104 @extracted_path_output: output path of the extract folder | |
105 return: the path of the output | |
106 """ | |
107 try: | |
108 with file_path.open('rb') as fh_in, \ | |
109 tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file: | |
110 tar_file.extractall(path=extracted_path_output) | |
111 print(f'Untar the database in {extracted_path_output}') | |
112 return extracted_path_output | |
113 except OSError: | |
114 os.sys.exit(f'ERROR: Could not extract {file_path}') | |
115 | 97 |
116 def choose_db_version(self): | 98 def get_commit_date(self): |
117 """ | 99 self._plasmidfinder_date_version = time.strftime("%Y_%m_%d", time.gmtime(self._plasmidfinder_repository.head.commit.committed_date)) |
118 Update the url link depending on the version choosen by user. | |
119 This method could be upgraded simply by adding the new versions | |
120 """ | |
121 if self._plasmidfinder_version == "latest": | |
122 self._plasmidfinder_url = f"{self._plasmidfinder_url}master.gz" | |
123 elif self._plasmidfinder_version == "2.1": | |
124 self._plasmidfinder_url = f"{self._plasmidfinder_url}1307168.gz" | |
125 | 100 |
126 def download_database(self): | 101 def download_database(self): |
127 """ | 102 """ |
128 Download the plasmidfinder database using requests lib | 103 Download the plasmidfinder database using git lib |
129 Make the directory and temporary directory for download | 104 Extract commit and commit date |
130 Untar the download files | |
131 """ | 105 """ |
132 self._output_dir = Path(self._output_dir) | 106 self._output_dir = Path(self._output_dir) |
133 self.choose_db_version() | 107 self.git_clone() |
134 try: | 108 if self._plasmidfinder_version != "latest": |
135 request_info = requests.get(self._plasmidfinder_url) | 109 self._plasmidfinder_repository.git.checkout(self._plasmidfinder_version) |
136 request_info.raise_for_status() | 110 self.get_commit_number() |
137 self._commit_number = self.extract_db_commit(request_info) | 111 self.get_commit_date() |
138 output_tar_path = self._output_dir.joinpath(self._temporary_folder) | |
139 output_tar_path_file = output_tar_path.joinpath(self._db_name_tar) | |
140 output_path = self._output_dir.joinpath(self._db_name) | |
141 os.makedirs(output_tar_path) | |
142 os.makedirs(output_path) | |
143 with open(output_tar_path_file, 'wb') as output_dir: | |
144 output_dir.write(request_info.content) | |
145 untar_output = self.untar_files(file_path=output_tar_path_file, extracted_path_output=output_tar_path.joinpath(self._db_name)) | |
146 self.moove_download_files(source=untar_output, destination=output_path) | |
147 except requests.exceptions.HTTPError as http_error: | |
148 print(f"Requests Error: {http_error}") | |
149 print(f"Fail to import Plasmidfinder database from {self._plasmidfinder_url}") | |
150 | |
151 def moove_download_files(self, source, destination, expression_search="*"): | |
152 """ | |
153 Clean downloaded data by mooving fasta files in the final folder | |
154 @older_path: previous path where the files are located | |
155 @new_path: final path where files will be mooved | |
156 @expression_search: keep only file with this expression | |
157 """ | |
158 fasta_files = Path(source).rglob(expression_search) | |
159 file_list_paths = [file for file in fasta_files if file.is_file()] | |
160 [self.keep_filename(pathname=path, output_path=destination) for path in file_list_paths] | |
161 | |
162 def keep_filename(self, pathname, output_path): | |
163 """ | |
164 Moove files | |
165 @pathname: previous path | |
166 @output_path: final path | |
167 """ | |
168 Path.replace(pathname, output_path.joinpath(pathname.name)) | |
169 | 112 |
170 def read_json_input_file(self): | 113 def read_json_input_file(self): |
171 """ | 114 """ |
172 Import the json file | 115 Import the json file |
173 """ | 116 """ |