Mercurial > repos > galaxyp > uniprotxml_downloader
comparison uniprotxml_downloader.py @ 3:b0abab8e78eb draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 62afd9de6db50f4314e49d9f24881b6d3778a0a5"
| author | galaxyp |
|---|---|
| date | Tue, 01 Jun 2021 11:54:16 +0000 |
| parents | 366bf2635603 |
| children | e1ffb00a0436 |
comparison
equal
deleted
inserted
replaced
| 2:366bf2635603 | 3:b0abab8e78eb |
|---|---|
| 9 # | 9 # |
| 10 # James E Johnson | 10 # James E Johnson |
| 11 # | 11 # |
| 12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
| 13 """ | 13 """ |
| | 14 import optparse |
| | 15 import re |
| 14 import sys | 16 import sys |
| 15 import re | 17 from urllib import parse |
| 16 import optparse | 18 |
| 17 import urllib | 19 import requests |
| 18 import urllib2 | 20 from requests.adapters import HTTPAdapter |
| | 21 from requests.packages.urllib3.util.retry import Retry |
| | 22 |
| | 23 DEFAULT_TIMEOUT = 5 # seconds |
| | 24 retry_strategy = Retry( |
| | 25 total=5, |
| | 26 backoff_factor=2, |
| | 27 status_forcelist=[429, 500, 502, 503, 504], |
| | 28 allowed_methods=["HEAD", "GET", "OPTIONS", "POST"] |
| | 29 ) |
| | 30 |
| | 31 |
| | 32 class TimeoutHTTPAdapter(HTTPAdapter): |
| | 33 def __init__(self, *args, **kwargs): |
| | 34 self.timeout = DEFAULT_TIMEOUT |
| | 35 if "timeout" in kwargs: |
| | 36 self.timeout = kwargs["timeout"] |
| | 37 del kwargs["timeout"] |
| | 38 super().__init__(*args, **kwargs) |
| | 39 |
| | 40 def send(self, request, **kwargs): |
| | 41 timeout = kwargs.get("timeout") |
| | 42 if timeout is None: |
| | 43 kwargs["timeout"] = self.timeout |
| | 44 return super().send(request, **kwargs) |
| 19 | 45 |
| 20 | 46 |
| 21 def __main__(): | 47 def __main__(): |
| 22 # Parse Command Line | 48 # Parse Command Line |
| 23 parser = optparse.OptionParser() | 49 parser = optparse.OptionParser() |
| 24 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') | 50 parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs') |
| 25 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs' ) | 51 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs') |
| 26 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') | 52 parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download') |
| 27 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') | 53 parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries') |
| 28 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') | 54 parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format') |
| 29 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') | 55 parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml') |
| 30 parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Print UniProt Info') | |
| 31 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') | 56 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr') |
| 32 (options, args) = parser.parse_args() | 57 (options, args) = parser.parse_args() |
| 33 taxids = set(options.taxon) | 58 taxids = set(options.taxon) |
| 34 if options.input: | 59 if options.input: |
| 35 with open(options.input,'r') as inputFile: | 60 with open(options.input, 'r') as inputFile: |
| 36 for linenum,line in enumerate(inputFile): | 61 for linenum, line in enumerate(inputFile): |
| 37 if line.startswith('#'): | 62 if line.startswith('#'): |
| 38 continue | 63 continue |
| 39 fields = line.rstrip('\r\n').split('\t') | 64 fields = line.rstrip('\r\n').split('\t') |
| 40 if len(fields) > abs(options.column): | 65 if len(fields) > abs(options.column): |
| 41 taxid = fields[options.column].strip() | 66 taxid = fields[options.column].strip() |
| 42 if taxid: | 67 if taxid: |
| 43 taxids.add(taxid) | 68 taxids.add(taxid) |
| 44 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] | 69 taxon_queries = ['taxonomy:"%s"' % taxid for taxid in taxids] |
| 45 taxon_query = ' OR '.join(taxon_queries) | 70 taxon_query = ' OR '.join(taxon_queries) |
| 46 if options.output: | 71 if options.output: |
| 47 dest_path = options.output | 72 dest_path = options.output |
| 48 else: | 73 else: |
| 49 dest_path = "uniprot_%s.xml" % '_'.join(taxids) | 74 dest_path = "uniprot_%s.xml" % '_'.join(taxids) |
| 50 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' | 75 reviewed = " reviewed:%s" % options.reviewed if options.reviewed else '' |
| 51 try: | 76 try: |
| 52 def reporthook(n1,n2,n3): | |
| 53 pass | |
| 54 url = 'https://www.uniprot.org/uniprot/' | 77 url = 'https://www.uniprot.org/uniprot/' |
| 55 query = "%s%s" % (taxon_query, reviewed) | 78 query = "%s%s" % (taxon_query, reviewed) |
| 56 params = {'query' : query, 'force' : 'yes' , 'format' : options.format} | 79 params = {'query': query, 'force': 'yes', 'format': options.format} |
| 57 if options.debug: | 80 if options.debug: |
| 58 print >> sys.stderr, "%s ? %s" % (url,params) | 81 print("%s ? %s" % (url, params), file=sys.stderr) |
| 59 data = urllib.urlencode(params) | 82 data = parse.urlencode(params) |
| 60 (fname, msg) = urllib.urlretrieve(url, dest_path,reporthook,data) | 83 print(f"Retrieving: {url+data}") |
| 61 headers = {j[0]: j[1].strip() for j in [i.split(':', 1) for i in str(msg).strip().splitlines()]} | 84 adapter = TimeoutHTTPAdapter(max_retries=retry_strategy) |
| 62 if 'Content-Length' in headers and headers['Content-Length'] == 0: | 85 http = requests.Session() |
| 63 print >> sys.stderr, url | 86 http.mount("https://", adapter) |
| 64 print >> sys.stderr, msg | 87 response = http.post(url, data=params) |
| 65 exit(1) | 88 http.close() |
| | 89 with open(dest_path, 'w') as fh: |
| | 90 fh.write(response.text) |
| 66 if options.format == 'xml': | 91 if options.format == 'xml': |
| 67 with open(dest_path, 'r') as contents: | 92 with open(dest_path, 'r') as contents: |
| 68 while True: | 93 while True: |
| 69 line = contents.readline() | 94 line = contents.readline() |
| 70 if options.debug: | 95 if options.debug: |
| 71 print >> sys.stderr, line | 96 print(line, file=sys.stderr) |
| 72 if line is None: | 97 if line is None: |
| 73 break | 98 break |
| 74 if line.startswith('<?'): | 99 if line.startswith('<?'): |
| 75 continue | 100 continue |
| 76 # pattern match <root or <ns:root for any ns string | 101 # pattern match <root or <ns:root for any ns string |
| 77 pattern = '^<(\w*:)?uniprot' | 102 pattern = r'^<(\w*:)?uniprot' |
| 78 if re.match(pattern, line): | 103 if re.match(pattern, line): |
| 79 break | 104 break |
| 80 else: | 105 else: |
| 81 print >> sys.stderr, "failed: Not a uniprot xml file" | 106 print("failed: Not a uniprot xml file", file=sys.stderr) |
| 82 exit(1) | 107 exit(1) |
| 83 if options.verbose: | 108 print("NCBI Taxon ID:%s" % taxids, file=sys.stdout) |
| 84 print >> sys.stdout, "NCBI Taxon ID:%s" % taxids | 109 if 'X-UniProt-Release' in response.headers: |
| 85 if 'X-UniProt-Release' in headers: | 110 print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout) |
| 86 print >> sys.stdout, "UniProt-Release:%s" % headers['X-UniProt-Release'] | 111 if 'X-Total-Results' in response.headers: |
| 87 if 'X-Total-Results' in headers: | 112 print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout) |
| 88 print >> sys.stdout, "Entries:%s" % headers['X-Total-Results'] | 113 except Exception as e: |
| 89 print >> sys.stdout, "%s" % url | 114 exit("%s" % e) |
| 90 except Exception, e: | |
| 91 print >> sys.stderr, "failed: %s" % e | |
| 92 | 115 |
| 93 | 116 |
| 94 if __name__ == "__main__": | 117 if __name__ == "__main__": |
| 95 __main__() | 118 __main__() |
