uniprotxml_downloader: uniprotxml_downloader.py comparison

comparison uniprotxml_downloader.py @ 3:b0abab8e78eb draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/uniprotxml_downloader commit 62afd9de6db50f4314e49d9f24881b6d3778a0a5"

author	galaxyp
date	Tue, 01 Jun 2021 11:54:16 +0000
parents	366bf2635603
children	e1ffb00a0436

comparison

equal deleted inserted replaced

-:366bf2635603
+:b0abab8e78eb
 #
 #  James E Johnson
 #
 #------------------------------------------------------------------------------
 """
+import optparse
+import re
 import sys
-import re
+from urllib import parse
-import optparse
-import urllib
+import requests
-import urllib2
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
# Timeout, in seconds, used by TimeoutHTTPAdapter when a request does not
# carry its own.
DEFAULT_TIMEOUT = 5  # seconds

# Retry policy passed to the HTTP adapter as ``max_retries``: at most five
# retries with a backoff factor of 2, triggered by the listed HTTP status
# codes. POST is listed explicitly because the download is issued as a POST.
retry_strategy = Retry(
    allowed_methods=["HEAD", "GET", "OPTIONS", "POST"],
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=2,
    total=5,
)
class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTP adapter that supplies a default timeout for the requests it sends.

    An optional ``timeout`` keyword argument is consumed by the constructor
    and stored on the adapter; when it is absent, the module-level
    ``DEFAULT_TIMEOUT`` is used. All other arguments are forwarded to
    ``HTTPAdapter`` unchanged.
    """

    def __init__(self, *args, **kwargs):
        # Remove our own option before delegating, so the parent constructor
        # never receives a ``timeout`` keyword.
        self.timeout = kwargs.pop("timeout", DEFAULT_TIMEOUT)
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        """Send ``request``, filling in the adapter timeout if none was given."""
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.timeout
        return super().send(request, **kwargs)
 def _read_taxids(path, column):
     """Yield the non-empty taxon IDs found in one column of a tabular file.

     Lines starting with '#' are skipped. Rows that do not have the requested
     column are ignored; indexing is guarded with IndexError so that negative
     column numbers are handled at their boundary as well.
     """
     with open(path, 'r') as input_file:
         for line in input_file:
             if line.startswith('#'):
                 continue
             fields = line.rstrip('\r\n').split('\t')
             try:
                 taxid = fields[column].strip()
             except IndexError:
                 continue
             if taxid:
                 yield taxid


 def _is_uniprot_xml(path, debug=False):
     """Return True if the first non-declaration line opens a uniprot element.

     Lines starting with '<?' (XML declarations / processing instructions)
     are skipped. An empty file, or one holding only such lines, is reported
     as not being UniProt XML. With ``debug`` set, each inspected line is
     echoed to stderr.
     """
     # Undecodable bytes must not abort the check, hence errors='replace'.
     with open(path, 'r', encoding='utf-8', errors='replace') as contents:
         for line in contents:
             if debug:
                 print(line, file=sys.stderr)
             if line.startswith('<?'):
                 continue
             # pattern match <root or <ns:root for any ns string
             return re.match(r'^<(\w*:)?uniprot', line) is not None
     return False


 def __main__():
     """Download UniProt entries for a set of NCBI taxon IDs.

     Taxon IDs come from repeated ``--taxon`` options and/or one column of a
     tabular ``--input`` file. The query is POSTed to UniProt, the response
     body is written to ``--output`` (or ``uniprot_<ids>.xml``), and for the
     xml format the file is checked to start with a uniprot root element.
     Exits with status 1 if that check fails; any other exception terminates
     the program with the exception text as the exit message.
     """
     # Parse Command Line
     parser = optparse.OptionParser()
     parser.add_option('-i', '--input', dest='input', default=None, help='Tabular file containing a column of NCBI Taxon IDs')
     parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains Taxon IDs')
     parser.add_option('-t', '--taxon', dest='taxon', action='append', default=[], help='NCBI taxon ID to download')
     parser.add_option('-r', '--reviewed', dest='reviewed', help='Only uniprot reviewed entries')
     parser.add_option('-f', '--format', dest='format', choices=['xml', 'fasta'], default='xml', help='output format')
     parser.add_option('-o', '--output', dest='output', help='file path for the downloaded uniprot xml')
     parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stderr')
     (options, args) = parser.parse_args()

     taxids = set(options.taxon)
     if options.input:
         taxids.update(_read_taxids(options.input, options.column))

     # Sort so the query string and the default file name are reproducible;
     # iterating the set directly gives an arbitrary order.
     ordered_taxids = sorted(taxids)
     taxon_query = ' OR '.join('taxonomy:"%s"' % taxid for taxid in ordered_taxids)
     dest_path = options.output or "uniprot_%s.xml" % '_'.join(ordered_taxids)
     reviewed = " reviewed:%s" % options.reviewed if options.reviewed else ''
     try:
         url = 'https://www.uniprot.org/uniprot/'
         query = "%s%s" % (taxon_query, reviewed)
         params = {'query': query, 'force': 'yes', 'format': options.format}
         if options.debug:
             print("%s ? %s" % (url, params), file=sys.stderr)
         data = parse.urlencode(params)
         print(f"Retrieving: {url}?{data}")
         adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
         # The context manager closes the session even if the request raises.
         with requests.Session() as http:
             http.mount("https://", adapter)
             response = http.post(url, data=params)
         # Fail on HTTP error statuses instead of saving an error page as data.
         response.raise_for_status()
         # Write the raw bytes so the payload does not depend on the locale's
         # default text encoding.
         with open(dest_path, 'wb') as fh:
             fh.write(response.content)
         if options.format == 'xml' and not _is_uniprot_xml(dest_path, options.debug):
             print("failed: Not a uniprot xml file", file=sys.stderr)
             sys.exit(1)
         print("NCBI Taxon ID:%s" % taxids, file=sys.stdout)
         if 'X-UniProt-Release' in response.headers:
             print("UniProt-Release:%s" % response.headers['X-UniProt-Release'], file=sys.stdout)
         if 'X-Total-Results' in response.headers:
             print("Entries:%s" % response.headers['X-Total-Results'], file=sys.stdout)
     except Exception as e:
         # SystemExit is not an Exception subclass, so the exit above passes
         # through; everything else ends the run with its message.
         sys.exit("%s" % e)


 if __name__ == "__main__":
     __main__()

Mercurial > repos > galaxyp > uniprotxml_downloader

comparison uniprotxml_downloader.py @ 3:b0abab8e78eb draft