Mercurial > repos > bgruening > uniprot_rest_interface
comparison uniprot.py @ 7:b1cc2c5bde0e draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit ddbed5f0b0879b4a001d2da6a521b0c9a39c1e7b"
| author | bgruening |
|---|---|
| date | Thu, 22 Apr 2021 17:31:48 +0000 |
| parents | 6e9fabe16b0c |
| children | d2ad6e2c55d1 |
comparison
equal
deleted
inserted
replaced
| 6:f806bb47aff6 | 7:b1cc2c5bde0e |
|---|---|
| 5 | 5 |
| 6 Based on work from Jan Rudolph: https://github.com/jdrudolph/uniprot | 6 Based on work from Jan Rudolph: https://github.com/jdrudolph/uniprot |
| 7 available services: | 7 available services: |
| 8 map | 8 map |
| 9 retrieve | 9 retrieve |
| 10 | |
| 11 rewritten using inspiration from: https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/ | |
| 10 """ | 12 """ |
| 11 import argparse | 13 import argparse |
| 12 import sys | 14 import sys |
| 13 | 15 |
| 14 import requests | 16 import requests |
| 15 | 17 from requests.adapters import HTTPAdapter |
| 16 url = 'https://www.uniprot.org/' | 18 from requests.packages.urllib3.util.retry import Retry |
| 17 | 19 |
| 18 | 20 |
| 19 def _retrieve(query, format='txt'): | 21 DEFAULT_TIMEOUT = 5 # seconds |
| 20 """_retrieve is not meant for use with the python interface, use `retrieve` | 22 URL = 'https://www.uniprot.org/' |
| 21 instead""" | |
| 22 tool = 'uploadlists/' | |
| 23 | 23 |
| 24 query = list(set(query.split('\n'))) | 24 retry_strategy = Retry( |
| 25 queries = [query[i:i+100] for i in range(0, len(query), 100)] | 25 total=5, |
| 26 | 26 backoff_factor=2, |
| 27 data = { | 27 status_forcelist=[429, 500, 502, 503, 504], |
| 28 'format': format, | 28 allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] |
| 29 'from': 'ACC+ID', | 29 ) |
| 30 'to': 'ACC' | |
| 31 } | |
| 32 | |
| 33 responses = [requests.post(url + tool, data=data, files={'file': ' '.join(_)}) for _ in queries] | |
| 34 page = ''.join(response.text for response in responses) | |
| 35 return page | |
| 36 | 30 |
| 37 | 31 |
| 38 def _map(query, f, t, format='tab'): | 32 class TimeoutHTTPAdapter(HTTPAdapter): |
| 33 def __init__(self, *args, **kwargs): | |
| 34 self.timeout = DEFAULT_TIMEOUT | |
| 35 if "timeout" in kwargs: | |
| 36 self.timeout = kwargs["timeout"] | |
| 37 del kwargs["timeout"] | |
| 38 super().__init__(*args, **kwargs) | |
| 39 | |
| 40 def send(self, request, **kwargs): | |
| 41 timeout = kwargs.get("timeout") | |
| 42 if timeout is None: | |
| 43 kwargs["timeout"] = self.timeout | |
| 44 return super().send(request, **kwargs) | |
| 45 | |
| 46 | |
| 47 def _map(query, f, t, format='tab', chunk_size=100): | |
| 39 """ _map is not meant for use with the python interface, use `map` instead | 48 """ _map is not meant for use with the python interface, use `map` instead |
| 40 """ | 49 """ |
| 41 tool = 'uploadlists/' | 50 tool = 'uploadlists/' |
| 51 data = {'format': format, 'from': f, 'to': t} | |
| 42 | 52 |
| 43 data = { | 53 req = [] |
| 44 'from': f, | 54 for i in range(0, len(query), chunk_size): |
| 45 'to': t, | 55 q = query[i:i + chunk_size] |
| 46 'format': format, | 56 req.append(dict([("url", URL + tool), |
| 47 'query': query | 57 ('data', data), |
| 48 } | 58 ("files", {'file': ' '.join(q)})])) |
| 49 response = requests.post(url + tool, data=data) | 59 return req |
| 60 response = requests.post(URL + tool, data=data) | |
| 61 response.raise_for_status() | |
| 50 page = response.text | 62 page = response.text |
| 63 if "The service is temporarily unavailable" in page: | |
| 64 exit("The UNIPROT service is temporarily unavailable. Please try again later.") | |
| 51 return page | 65 return page |
| 52 | 66 |
| 53 | 67 |
| 54 if __name__ == '__main__': | 68 if __name__ == '__main__': |
| 55 parser = argparse.ArgumentParser(description='retrieve uniprot mapping') | 69 parser = argparse.ArgumentParser(description='retrieve uniprot mapping') |
| 70 retrieve.add_argument('out', nargs='?', type=argparse.FileType('w'), | 84 retrieve.add_argument('out', nargs='?', type=argparse.FileType('w'), |
| 71 default=sys.stdout, help='output file (default: stdout)') | 85 default=sys.stdout, help='output file (default: stdout)') |
| 72 retrieve.add_argument('-f', '--format', help='specify output format', default='txt') | 86 retrieve.add_argument('-f', '--format', help='specify output format', default='txt') |
| 73 | 87 |
| 74 args = parser.parse_args() | 88 args = parser.parse_args() |
| 75 query = args.inp.read() | 89 |
| 90 # get the IDs from the file as sorted list | |
| 91 # (sorted is convenient for testing) | |
| 92 query = set() | |
| 93 for line in args.inp: | |
| 94 query.add(line.strip()) | |
| 95 query = sorted(query) | |
| 76 | 96 |
| 77 if args.tool == 'map': | 97 if args.tool == 'map': |
| 78 args.out.write(_map(query, args.f, args.t, args.format)) | 98 pload = _map(query, args.f, args.t, chunk_size=100) |
| 99 elif args.tool == 'retrieve': | |
| 100 pload = _map(query, 'ACC+ID', 'ACC', args.format, chunk_size=100) | |
| 79 | 101 |
| 80 elif args.tool == 'retrieve': | 102 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) |
| 81 args.out.write(_retrieve(query, format=args.format)) | 103 http = requests.Session() |
| 104 http.mount("https://", adapter) | |
| 105 for i, p in enumerate(pload): | |
| 106 response = http.post(**p) | |
| 107 args.out.write(response.text) | |
| 108 http.close() |
