comparison: uniprotxml_downloader.py @ 6:c4a0f3badafe (draft, default, tip)
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 91705a9789b30878a55d1044c654e39a7726cf60
| field | value |
|---|---|
| author | galaxyp |
| date | Wed, 11 Dec 2024 13:34:46 +0000 |
| parents | 7be8e30d536f |
| children | (none) |
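This changeset replaces the hand-rolled `TimeoutHTTPAdapter` (and its `requests.packages.urllib3` `Retry` import) with the `Retry` class re-exported by `requests.adapters`, mounted directly on a `requests.Session`. It also swaps the one-shot `/uniprotkb/stream` download for paginated `/uniprotkb/search` requests and adds a `tsv` output format with selectable columns (`--output_columns`); sketches of both follow the diff. First, a minimal sketch of the session-with-retries pattern the new revision adopts (retry parameters match the diff; note the explicit per-request timeout from the old adapter is dropped):

```python
import requests
from requests.adapters import HTTPAdapter, Retry

# Retry up to 5 times on transient server errors, with exponential backoff.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

# Every request made through this session now inherits the retry policy,
# e.g. (illustrative URL):
# response = session.get("https://rest.uniprot.org/uniprotkb/search?query=...")
```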
| 5:7be8e30d536f (previous) | 6:c4a0f3badafe (current) |
|---|---|
| 15 import re | 15 import re |
| 16 import sys | 16 import sys |
| 17 from urllib import parse | 17 from urllib import parse |
| 18 | 18 |
| 19 import requests | 19 import requests |
| 20 from requests.adapters import HTTPAdapter | 20 from requests.adapters import HTTPAdapter, Retry |
| 21 from requests.packages.urllib3.util.retry import Retry | |
| 22 | |
| 23 DEFAULT_TIMEOUT = 5 # seconds | |
| 24 retry_strategy = Retry( | |
| 25 total=5, | |
| 26 backoff_factor=2, | |
| 27 status_forcelist=[429, 500, 502, 503, 504], | |
| 28 allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] | |
| 29 ) | |
| 30 | |
| 31 | |
| 32 class TimeoutHTTPAdapter(HTTPAdapter): | |
| 33 def __init__(self, *args, **kwargs): | |
| 34 self.timeout = DEFAULT_TIMEOUT | |
| 35 if "timeout" in kwargs: | |
| 36 self.timeout = kwargs["timeout"] | |
| 37 del kwargs["timeout"] | |
| 38 super().__init__(*args, **kwargs) | |
| 39 | |
| 40 def send(self, request, **kwargs): | |
| 41 timeout = kwargs.get("timeout") | |
| 42 if timeout is None: | |
| 43 kwargs["timeout"] = self.timeout | |
| 44 return super().send(request, **kwargs) | |
| 45 | 21 |
| 46 | 22 |
| 47 def __main__(): | 23 def __main__(): |
| 48 # Parse Command Line | 24 # Parse Command Line |
| 49 parser = optparse.OptionParser() | 25 parser = optparse.OptionParser() |
| 50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search_ids') | 26 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of search_ids') |
| 51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search_ids') | 27 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains search_ids') |
| 52 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') | 28 parser.add_option('-s', '--search-id', dest='search_id', action='append', default=[], help='ID to search in Uniprot') |
| 53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') | 29 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') |
| 54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') | 30 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta', 'tsv'], default='xml', help='output format') |
| 55 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') | 31 parser.add_option('-k', '--field', dest='field', choices=['taxonomy_name', 'taxonomy_id', 'accession'], default='taxonomy_name', help='query field') |
| 56 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') | 32 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') |
| 33 parser.add_option('--output_columns', dest='output_columns', help='Columns to include in output (tsv)') | |
| 57 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') | 34 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') |
| 58 (options, args) = parser.parse_args() | 35 (options, args) = parser.parse_args() |
| 59 search_ids = set(options.search_id) | 36 search_ids = set(options.search_id) |
| 60 if options.input: | 37 if options.input: |
| 61 with open(options.input, 'r') as inputFile: | 38 with open(options.input, 'r') as inputFile: |
| 73 dest_path = options.output | 50 dest_path = options.output |
| 74 else: | 51 else: |
| 75 dest_path = "uniprot_%s.xml" % '_'.join(search_ids) | 52 dest_path = "uniprot_%s.xml" % '_'.join(search_ids) |
| 76 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' | 53 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' |
| 77 try: | 54 try: |
| 78 url = 'https://rest.uniprot.org/uniprotkb/stream' | 55 re_next_link = re.compile(r'<(.+)>; rel="next"') |
| 79 query = "%s%s" % (search_query, reviewed) | 56 retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504]) |
| 80 params = {'query': query, 'format': options.format} | 57 session = requests.Session() |
| 81 if options.debug: | 58 session.mount("https://", HTTPAdapter(max_retries=retries)) |
| 82 print("%s ? %s" % (url, params), file=sys.stderr) | |
| 83 data = parse.urlencode(params) | |
| 84 print(f"Retrieving: {url}?{data}") | |
| 85 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) | |
| 86 | 59 |
| 87 http = requests.Session() | 60 def get_next_link(headers): |
| 88 http.mount("https://", adapter) | 61 if "Link" in headers: |
| 89 response = http.get(url, params=params) | 62 match = re_next_link.match(headers["Link"]) |
| 90 http.close() | 63 if match: |
| 64 return match.group(1) | |
| 91 | 65 |
| 92 if response.status_code != 200: | 66 def get_batch(batch_url): |
| 93 exit(f"Request failed with status code {response.status_code}:\n{response.text}") | 67 while batch_url: |
| 68 response = session.get(batch_url) | |
| 69 response.raise_for_status() | |
| 70 total = response.headers["x-total-results"] | |
| 71 release = response.headers["x-uniprot-release"] | |
| 72 yield response, total, release | |
| 73 batch_url = get_next_link(response.headers) | |
| 74 | |
| 75 params = {'size': 500, 'format': options.format, 'query': search_query + reviewed} | |
| 76 if options.output_columns: | |
| 77 params['fields'] = options.output_columns | |
| 78 url = f'https://rest.uniprot.org/uniprotkb/search?{parse.urlencode(params)}' | |
| 79 print(f"Downloading from:{url}") | |
| 94 | 80 |
| 95 with open(dest_path, 'w') as fh: | 81 with open(dest_path, 'w') as fh: |
| 96 fh.write(response.text) | 82 for batch, total, release in get_batch(url): |
| 83 fh.write(batch.text) | |
| 97 | 84 |
| 98 if options.format == 'xml': | 85 if options.format == 'xml': |
| 99 with open(dest_path, 'r') as contents: | 86 with open(dest_path, 'r') as contents: |
| 100 while True: | 87 while True: |
| 101 line = contents.readline() | 88 line = contents.readline() |
| 110 if re.match(pattern, line): | 97 if re.match(pattern, line): |
| 111 break | 98 break |
| 112 else: | 99 else: |
| 113 print("failed: Not a uniprot xml file", file=sys.stderr) | 100 print("failed: Not a uniprot xml file", file=sys.stderr) |
| 114 exit(1) | 101 exit(1) |
| 115 print("Search IDs:%s" % search_ids, file=sys.stdout) | 102 print(f"Search IDs:{search_ids}") |
| 116 if 'X-UniProt-Release' in response.headers: | 103 print(f"UniProt-Release:{release}") |
| 117 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) | 104 print(f"Entries:{total}") |
| 118 if 'X-Total-Results' in response.headers: | |
| 119 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) | |
| 120 except Exception as e: | 105 except Exception as e: |
| 121 exit("%s" % e) | 106 exit("%s" % e) |
| 122 | 107 |
| 123 | 108 |
| 124 if __name__ == "__main__": | 109 if __name__ == "__main__": |
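The core of the rewrite is cursor pagination: `/uniprotkb/search` returns at most `size` entries per request and advertises the next page in an HTTP `Link: <url>; rel="next"` header, which `get_next_link` and `get_batch` follow until it disappears. A self-contained sketch of the same pattern (the query below is an illustrative assumption; the helpers mirror the diff):

```python
import re

import requests
from requests.adapters import HTTPAdapter, Retry

re_next_link = re.compile(r'<(.+)>; rel="next"')

session = requests.Session()
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))


def get_next_link(headers):
    # UniProt advertises the next page as: Link: <url>; rel="next"
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)


def get_batch(batch_url):
    # Follow the cursor until no further Link header is returned.
    while batch_url:
        response = session.get(batch_url)
        response.raise_for_status()
        yield response
        batch_url = get_next_link(response.headers)


# Illustrative query: reviewed human entries, 500 per page, FASTA format.
url = "https://rest.uniprot.org/uniprotkb/search?query=organism_id:9606+AND+reviewed:true&size=500&format=fasta"
for batch in get_batch(url):
    print(len(batch.text))
```

Note that the new code also reads `x-total-results` and `x-uniprot-release` from each batch's response headers, replacing the old post-download checks for `X-Total-Results` and `X-UniProt-Release`.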

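When `--format tsv` is selected, the new `--output_columns` value is passed through as the UniProt `fields` parameter, so the server returns only the requested columns. A small sketch of the URL construction the new code performs (the query and column list are illustrative assumptions, not the tool's defaults):

```python
from urllib.parse import urlencode

# Mirrors the params dict in the new revision; urlencode handles quoting.
params = {
    "size": 500,
    "format": "tsv",
    "query": "taxonomy_id:9606 AND reviewed:true",
    "fields": "accession,gene_names,organism_name",
}
print(f"https://rest.uniprot.org/uniprotkb/search?{urlencode(params)}")
```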