# HG changeset patch
# User bgruening
# Date 1722955774 0
# Node ID f31d8d59ffb62deeeda0cf320871fefa23c35b06
# Parent d2ad6e2c55d1e851ce2a32e9e4116390705637a5
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/uniprot_rest_interface commit 1c020106d4d7f957c9f1ec0d9885bbb2d56e70e7
diff -r d2ad6e2c55d1 -r f31d8d59ffb6 macros.xml
--- a/macros.xml Mon Nov 21 22:02:05 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,226 +0,0 @@
-
-1.0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r d2ad6e2c55d1 -r f31d8d59ffb6 test-data/test1_map.tab
--- a/test-data/test1_map.tab Mon Nov 21 22:02:05 2022 +0000
+++ b/test-data/test1_map.tab Tue Aug 06 14:49:34 2024 +0000
@@ -2,8 +2,6 @@
A0A077Z587 TTRE_0000309301
A0A077ZFY8 TTRE_0000758701
A0A077ZHN8 TTRE_0000819801
-M5B8V9 CMN_01519
-M5BAG7 cydC
O14639 ABLIM1
Q0P8A9 fdhC
Q13685 AAMP
diff -r d2ad6e2c55d1 -r f31d8d59ffb6 test-data/test2_map.tab
--- a/test-data/test2_map.tab Mon Nov 21 22:02:05 2022 +0000
+++ b/test-data/test2_map.tab Tue Aug 06 14:49:34 2024 +0000
@@ -1,4 +1,4 @@
-From To
-NM_001087 AAMP_HUMAN
-NM_130786 A1BG_HUMAN
-NM_130786 V9HWD8_HUMAN
+From Entry Entry Name Reviewed Protein names Gene Names Organism Length
+NM_001087 Q13685 AAMP_HUMAN reviewed Angio-associated migratory cell protein AAMP Homo sapiens (Human) 434
+NM_130786 P04217 A1BG_HUMAN reviewed Alpha-1B-glycoprotein (Alpha-1-B glycoprotein) A1BG Homo sapiens (Human) 495
+NM_130786 V9HWD8 V9HWD8_HUMAN unreviewed Epididymis secretory sperm binding protein Li 163pA HEL-S-163pA Homo sapiens (Human) 495
diff -r d2ad6e2c55d1 -r f31d8d59ffb6 test-data/test2_retrieve.gff
--- a/test-data/test2_retrieve.gff Mon Nov 21 22:02:05 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,107 +0,0 @@
-##gff-version 3
-##sequence-region M5BAG7 1 563
-M5BAG7 UniProtKB Transmembrane 21 43 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5BAG7 UniProtKB Transmembrane 49 71 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5BAG7 UniProtKB Transmembrane 132 153 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5BAG7 UniProtKB Transmembrane 159 181 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5BAG7 UniProtKB Transmembrane 236 259 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5BAG7 UniProtKB Transmembrane 274 296 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5BAG7 UniProtKB Domain 20 301 . . . Note=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929
-M5BAG7 UniProtKB Domain 345 559 . . . Note=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893
-M5BAG7 UniProtKB Nucleotide binding 379 386 . . . Note=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434
-M5BAG7 UniProtKB Region 317 337 . . . Note=Disordered;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:MobiDB-lite
-##sequence-region A0A077ZHN8 1 634
-A0A077ZHN8 UniProtKB Transmembrane 14 36 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077ZHN8 UniProtKB Transmembrane 56 80 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077ZHN8 UniProtKB Transmembrane 113 132 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077ZHN8 UniProtKB Transmembrane 290 310 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077ZHN8 UniProtKB Domain 312 364 . . . Note=HAMP;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50885
-A0A077ZHN8 UniProtKB Domain 369 598 . . . Note=Methyl-accepting transducer;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50111
-A0A077ZHN8 UniProtKB Coiled coil 170 204 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils
-A0A077ZHN8 UniProtKB Coiled coil 569 607 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils
-##sequence-region M5B8V9 1 582
-M5B8V9 UniProtKB Transmembrane 20 43 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5B8V9 UniProtKB Transmembrane 55 77 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5B8V9 UniProtKB Transmembrane 134 154 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5B8V9 UniProtKB Transmembrane 161 180 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5B8V9 UniProtKB Transmembrane 236 260 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-M5B8V9 UniProtKB Domain 20 302 . . . Note=ABC transmembrane type-1;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50929
-M5B8V9 UniProtKB Domain 340 570 . . . Note=ABC transporter;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50893
-M5B8V9 UniProtKB Nucleotide binding 372 379 . . . Note=ATP;Ontology_term=ECO:0000256;evidence=ECO:0000256|PROSITE-ProRule:PRU00434
-##sequence-region S0DS17 1 369
-S0DS17 UniProtKB Chain 1 369 . . . ID=PRO_0000437163;Note=Cytochrome P450 monooxygenase apf8
-S0DS17 UniProtKB Metal binding 303 303 . . . Note=Iron (heme axial ligand);Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:P04798
-##sequence-region A0A077Z587 1 772
-A0A077Z587 UniProtKB Transmembrane 593 617 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077Z587 UniProtKB Transmembrane 637 656 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077Z587 UniProtKB Transmembrane 668 692 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077Z587 UniProtKB Transmembrane 704 727 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077Z587 UniProtKB Transmembrane 733 755 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-A0A077Z587 UniProtKB Domain 20 94 . . . Note=PDZ;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50106
-A0A077Z587 UniProtKB Domain 552 761 . . . Note=Cytochrome b561;Ontology_term=ECO:0000259;evidence=ECO:0000259|PROSITE:PS50939
-##sequence-region Q0P8A9 1 310
-Q0P8A9 UniProtKB Transmembrane 55 78 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-Q0P8A9 UniProtKB Transmembrane 99 124 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-Q0P8A9 UniProtKB Transmembrane 136 156 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-Q0P8A9 UniProtKB Transmembrane 195 216 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-Q0P8A9 UniProtKB Transmembrane 244 264 . . . Note=Helical;Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Phobius
-Q0P8A9 UniProtKB Domain 93 274 . . . Note=Ni_hydr_CYTB;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF01292
-##sequence-region O14639 1 778
-O14639 UniProtKB Chain 1 778 . . . ID=PRO_0000075697;Note=Actin-binding LIM protein 1
-O14639 UniProtKB Domain 97 156 . . . Note=LIM zinc-binding 1;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125
-O14639 UniProtKB Domain 156 216 . . . Note=LIM zinc-binding 2;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125
-O14639 UniProtKB Domain 224 283 . . . Note=LIM zinc-binding 3;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125
-O14639 UniProtKB Domain 283 343 . . . Note=LIM zinc-binding 4;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00125
-O14639 UniProtKB Domain 710 778 . . . Note=HP;Ontology_term=ECO:0000255;evidence=ECO:0000255|PROSITE-ProRule:PRU00595
-O14639 UniProtKB Coiled coil 590 614 . . . Ontology_term=ECO:0000255;evidence=ECO:0000255
-O14639 UniProtKB Modified residue 216 216 . . . Note=Phosphoserine;Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:Q8K4G5
-O14639 UniProtKB Modified residue 367 367 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:19690332;Dbxref=PMID:19690332
-O14639 UniProtKB Modified residue 373 373 . . . Note=Phosphotyrosine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:19690332;Dbxref=PMID:19690332
-O14639 UniProtKB Modified residue 396 396 . . . Note=Phosphotyrosine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:15592455;Dbxref=PMID:15592455
-O14639 UniProtKB Modified residue 422 422 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:23186163;Dbxref=PMID:23186163
-O14639 UniProtKB Modified residue 426 426 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:19690332,ECO:0000244|PubMed:24275569;Dbxref=PMID:19690332,PMID:24275569
-O14639 UniProtKB Modified residue 431 431 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:19690332,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:19690332,PMID:23186163
-O14639 UniProtKB Modified residue 433 433 . . . Note=Phosphothreonine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:19690332;Dbxref=PMID:19690332
-O14639 UniProtKB Modified residue 435 435 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244,ECO:0000244,ECO:0000244,ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:19690332,ECO:0000244|PubMed:20068231,ECO:0000244|PubMed:21406692,ECO:0000244|PubMed:23186163,ECO:0000244|PubMed:24275569;Dbxref=PMID:18669648,PMID:19690332,PMID:20068231,PMID:21406692,PMID:23186163,PMID:24275569
-O14639 UniProtKB Modified residue 439 439 . . . Note=Phosphotyrosine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:15144186;Dbxref=PMID:15144186
-O14639 UniProtKB Modified residue 452 452 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:23186163;Dbxref=PMID:23186163
-O14639 UniProtKB Modified residue 455 455 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163,ECO:0000244|PubMed:24275569;Dbxref=PMID:18669648,PMID:23186163,PMID:24275569
-O14639 UniProtKB Modified residue 458 458 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:23186163
-O14639 UniProtKB Modified residue 498 498 . . . Note=Phosphoserine;Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:Q8K4G5
-O14639 UniProtKB Modified residue 587 587 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:23186163
-O14639 UniProtKB Modified residue 640 640 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:20068231;Dbxref=PMID:18669648,PMID:20068231
-O14639 UniProtKB Modified residue 655 655 . . . Note=Phosphoserine;Ontology_term=ECO:0000244,ECO:0000244;evidence=ECO:0000244|PubMed:18669648,ECO:0000244|PubMed:23186163;Dbxref=PMID:18669648,PMID:23186163
-O14639 UniProtKB Modified residue 677 677 . . . Note=Phosphoserine;Ontology_term=ECO:0000250;evidence=ECO:0000250|UniProtKB:Q8K4G5
-O14639 UniProtKB Modified residue 706 706 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569
-O14639 UniProtKB Cross-link 620 620 . . . Note=Glycyl lysine isopeptide (Lys-Gly) (interchain with G-Cter in SUMO2);Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:28112733;Dbxref=PMID:28112733
-O14639 UniProtKB Alternative sequence 1 316 . . . ID=VSP_012099;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005
-O14639 UniProtKB Alternative sequence 1 81 . . . ID=VSP_012100;Note=In isoform 2 and isoform 6. MPAFLGLKCLGKLCSSEKSKVTSSERTSARGSNRKRLIVEDRRVSGTSFTAHRRATITHLLYLCPKDYCPRGRVCNSVDPF->MLMTLEMTELTDPHHTMGDYK;Ontology_term=ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:7584044;Dbxref=PMID:14702039,PMID:7584044
-O14639 UniProtKB Alternative sequence 347 347 . . . ID=VSP_041185;Note=In isoform 5 and isoform 6. R->RLPNIRRSSSDFFYSKSLIRRTGRSPSLQ;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039
-O14639 UniProtKB Alternative sequence 348 373 . . . ID=VSP_012101;Note=In isoform 4. Missing;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:15489334;Dbxref=PMID:15489334
-O14639 UniProtKB Alternative sequence 480 514 . . . ID=VSP_012102;Note=In isoform 3%2C isoform 4 and isoform 5. Missing;Ontology_term=ECO:0000303,ECO:0000303,ECO:0000303;evidence=ECO:0000303|PubMed:14702039,ECO:0000303|PubMed:15489334,ECO:0000303|PubMed:17974005;Dbxref=PMID:14702039,PMID:15489334,PMID:17974005
-O14639 UniProtKB Alternative sequence 531 531 . . . ID=VSP_057209;Note=In isoform 6. H->HDA;Ontology_term=ECO:0000303;evidence=ECO:0000303|PubMed:14702039;Dbxref=PMID:14702039
-O14639 UniProtKB Natural variant 434 434 . . . ID=VAR_050141;Note=P->T;Dbxref=dbSNP:rs11593544
-O14639 UniProtKB Natural variant 637 637 . . . ID=VAR_050142;Note=R->G;Dbxref=dbSNP:rs7091419
-O14639 UniProtKB Sequence conflict 499 499 . . . Note=R->L;Ontology_term=ECO:0000305;evidence=ECO:0000305
-O14639 UniProtKB Sequence conflict 532 532 . . . Note=A->R;Ontology_term=ECO:0000305;evidence=ECO:0000305
-O14639 UniProtKB Sequence conflict 563 563 . . . Note=K->E;Ontology_term=ECO:0000305;evidence=ECO:0000305
-O14639 UniProtKB Sequence conflict 578 578 . . . Note=V->I;Ontology_term=ECO:0000305;evidence=ECO:0000305
-##sequence-region A0A077ZFY8 1 973
-A0A077ZFY8 UniProtKB Domain 1 89 . . . Note=Mur_ligase;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF01225
-A0A077ZFY8 UniProtKB Domain 96 279 . . . Note=Mur_ligase_M;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF08245
-A0A077ZFY8 UniProtKB Domain 300 349 . . . Note=Mur_ligase_C;Ontology_term=ECO:0000259;evidence=ECO:0000259|Pfam:PF02875
-A0A077ZFY8 UniProtKB Coiled coil 867 887 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils
-A0A077ZFY8 UniProtKB Coiled coil 951 971 . . . Ontology_term=ECO:0000256;evidence=ECO:0000256|SAM:Coils
-##sequence-region Q13685 1 434
-Q13685 UniProtKB Chain 1 434 . . . ID=PRO_0000050832;Note=Angio-associated migratory cell protein
-Q13685 UniProtKB Repeat 89 129 . . . Note=WD 1
-Q13685 UniProtKB Repeat 132 171 . . . Note=WD 2
-Q13685 UniProtKB Repeat 173 212 . . . Note=WD 3
-Q13685 UniProtKB Repeat 214 254 . . . Note=WD 4
-Q13685 UniProtKB Repeat 258 299 . . . Note=WD 5
-Q13685 UniProtKB Repeat 315 354 . . . Note=WD 6
-Q13685 UniProtKB Repeat 356 395 . . . Note=WD 7
-Q13685 UniProtKB Repeat 398 433 . . . Note=WD 8
-Q13685 UniProtKB Compositional bias 53 59 . . . Note=Poly-Glu
-Q13685 UniProtKB Modified residue 20 20 . . . Note=Phosphoserine;Ontology_term=ECO:0000244;evidence=ECO:0000244|PubMed:24275569;Dbxref=PMID:24275569
-Q13685 UniProtKB Natural variant 250 250 . . . ID=VAR_037061;Note=I->V;Dbxref=dbSNP:rs2305835
diff -r d2ad6e2c55d1 -r f31d8d59ffb6 uniprot.py
--- a/uniprot.py Mon Nov 21 22:02:05 2022 +0000
+++ b/uniprot.py Tue Aug 06 14:49:34 2024 +0000
@@ -1,108 +1,296 @@
-#!/usr/bin/env python
-"""
-uniprot python interface
-to access the uniprot database
-
-Based on work from Jan Rudolph: https://github.com/jdrudolph/uniprot
-available services:
- map
- retrieve
-
-rewitten using inspiration form: https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/
-"""
import argparse
+import json
+import re
import sys
+import time
+import zlib
+from time import sleep
+from urllib.parse import (
+ parse_qs,
+ urlencode,
+ urlparse,
+)
+from xml.etree import ElementTree
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
-
-
-DEFAULT_TIMEOUT = 5 # seconds
-URL = 'https://legacy.uniprot.org/'
-
-retry_strategy = Retry(
- total=5,
- backoff_factor=2,
- status_forcelist=[429, 500, 502, 503, 504],
- allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
+from requests.adapters import (
+ HTTPAdapter,
+ Retry,
)
-class TimeoutHTTPAdapter(HTTPAdapter):
- def __init__(self, *args, **kwargs):
- self.timeout = DEFAULT_TIMEOUT
- if "timeout" in kwargs:
- self.timeout = kwargs["timeout"]
- del kwargs["timeout"]
- super().__init__(*args, **kwargs)
+# Maximum number of IDs submitted per ID-mapping job; the main loop slices the
+# query list into chunks of this size.
+BATCH_SIZE = 50000 # Limit at UniProt is 100k
+# Seconds slept between two polls of a job that is still NEW or RUNNING.
+POLLING_INTERVAL = 5
+# Base URL that the idmapping endpoint paths below are appended to.
+API_URL = "https://rest.uniprot.org"
+
+
+# Retry policy handed to the HTTPS adapter of the shared session (at most 5
+# retries; the listed 5xx status codes are forced to count as retryable).
+# NOTE(review): no request timeout is configured anywhere in this setup.
+retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
+# Shared session; only "https://" URLs are routed through the retrying adapter.
+session = requests.Session()
+session.mount("https://", HTTPAdapter(max_retries=retries))
+
+
+def check_response(response):
+    """Raise the response's HTTP error, if it has one.
+
+    Thin wrapper around ``response.raise_for_status()``: any exception it
+    raises propagates unchanged, and nothing is returned on success.
+    """
+    response.raise_for_status()
+
+
+def submit_id_mapping(from_db, to_db, ids):
+    """Submit an ID-mapping job and return its job id.
+
+    from_db / to_db: names of the source and target database.
+    ids: iterable of identifier strings; they are sent comma-separated.
+
+    Raises whatever ``check_response`` raises for an HTTP error status.
+    """
+    print(f"{from_db} {to_db}")
+    # Use the shared session instead of bare ``requests`` so this call goes
+    # through the same connection pool and retry-enabled HTTPS adapter as
+    # every other request in this script.
+    request = session.post(
+        f"{API_URL}/idmapping/run",
+        data={"from": from_db, "to": to_db, "ids": ",".join(ids)},
+    )
+    check_response(request)
+    return request.json()["jobId"]
+
+
+def get_next_link(headers):
+    """Return the URL of the ``rel="next"`` entry of the ``Link`` header.
+
+    headers: mapping of response headers.
+    Returns the URL as a string, or ``None`` when there is no ``Link`` header
+    or it has no ``rel="next"`` entry.
+    """
+    if "Link" not in headers:
+        return None
+    # search() with a bracket-free capture picks the right entry even when the
+    # header lists several relations (e.g. rel="prev" before rel="next");
+    # an anchored greedy match would swallow the preceding entries.
+    match = re.search(r'<([^<>]+)>\s*;\s*rel="next"', headers["Link"])
+    return match.group(1) if match else None
+
+
+def check_id_mapping_results_ready(job_id):
+    """Poll the status of job ``job_id`` until it is no longer NEW/RUNNING.
+
+    Returns ``True`` when the final payload has a non-empty ``results`` or
+    ``failedIds`` entry, ``False`` when both are empty or missing.
+    Raises ``Exception`` carrying the job status for any status other than
+    NEW or RUNNING, and whatever ``check_response`` raises on HTTP errors.
+    """
+    while True:
+        request = session.get(f"{API_URL}/idmapping/status/{job_id}")
+        check_response(request)
+        j = request.json()
+        if "jobStatus" not in j:
+            # No status field means the payload is the result document itself.
+            # Either key may be absent, so look them up without indexing.
+            return bool(j.get("results") or j.get("failedIds"))
+        if j["jobStatus"] not in ["NEW", "RUNNING"]:
+            raise Exception(j["jobStatus"])
+        print(f"Retrying in {POLLING_INTERVAL}s")
+        time.sleep(POLLING_INTERVAL)
+
+
+def get_batch(batch_response, file_format, compressed):
+    """Yield the decoded content of every page that follows ``batch_response``.
+
+    Pages are located through the "next" link of the previous response's
+    headers; iteration stops at the first response without such a link.
+    """
+    page = batch_response
+    while True:
+        next_url = get_next_link(page.headers)
+        if not next_url:
+            return
+        page = session.get(next_url)
+        page.raise_for_status()
+        yield decode_results(page, file_format, compressed)
- def send(self, request, **kwargs):
- timeout = kwargs.get("timeout")
- if timeout is None:
- kwargs["timeout"] = self.timeout
- return super().send(request, **kwargs)
+
+def combine_batches(all_results, batch_results, file_format):
+    """Append one decoded page to the results accumulated so far.
+
+    * json: the ``results`` and ``failedIds`` lists of ``batch_results`` are
+      appended to the same keys of ``all_results``, which is updated in place.
+    * tsv: the first row of ``batch_results`` is dropped before concatenating.
+    * any other format: plain concatenation.
+
+    Returns the combined results.
+    """
+    if file_format == "json":
+        for key in ("results", "failedIds"):
+            if key not in batch_results or not batch_results[key]:
+                continue
+            if key not in all_results or all_results[key] is None:
+                # An earlier page may lack this key (e.g. no failed IDs yet);
+                # start a fresh list instead of raising KeyError.
+                all_results[key] = []
+            all_results[key] += batch_results[key]
+        return all_results
+    if file_format == "tsv":
+        return all_results + batch_results[1:]
+    return all_results + batch_results
+
+
+def get_id_mapping_results_link(job_id):
+    """Return the ``redirectURL`` listed in the details document of ``job_id``."""
+    details = session.get(f"{API_URL}/idmapping/details/{job_id}")
+    check_response(details)
+    return details.json()["redirectURL"]
+
+
+def decode_results(response, file_format, compressed):
+    """Convert a response body into the Python value used for ``file_format``.
+
+    json -> parsed object; tsv/gff -> list of the non-empty lines; xlsx ->
+    one-element list holding the raw bytes; xml -> one-element list holding
+    the text; any other format -> the text itself.  When ``compressed`` is
+    true the body is decompressed first and text is decoded as UTF-8.
+    """
+    if not compressed:
+        if file_format == "json":
+            return response.json()
+        if file_format == "xlsx":
+            return [response.content]
+        text = response.text
+    else:
+        # wbits of 16 + MAX_WBITS tells zlib to expect a gzip container.
+        payload = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
+        if file_format == "xlsx":
+            return [payload]
+        text = payload.decode("utf-8")
+        if file_format == "json":
+            return json.loads(text)
+    if file_format in ["tsv", "gff"]:
+        return [row for row in text.split("\n") if row]
+    if file_format == "xml":
+        return [text]
+    return text
+
+
+def get_xml_namespace(element):
+    """Return the text between the braces that prefix ``element.tag``.
+
+    Returns an empty string when the tag does not start with ``{...}``.
+    """
+    found = re.match(r"\{(.*)\}", element.tag)
+    if found is None:
+        return ""
+    return found.group(1)
+
+
+def merge_xml_results(xml_results):
+    """Merge several XML documents into one serialized document.
+
+    The first document supplies the root element; the ``entry`` children of
+    every further document are inserted into that root.  The namespace of the
+    root's first child is registered as the default namespace before the tree
+    is serialized as UTF-8 bytes with an XML declaration.
+    """
+    merged_root = ElementTree.fromstring(xml_results[0])
+    for document in xml_results[1:]:
+        entries = ElementTree.fromstring(document).findall(
+            "{http://uniprot.org/uniprot}entry"
+        )
+        for entry in entries:
+            # index -1 puts the entry in front of the root's current last child
+            merged_root.insert(-1, entry)
+    namespace = get_xml_namespace(merged_root[0])
+    ElementTree.register_namespace("", namespace)
+    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)
-def _map(query, f, t, format='tab', chunk_size=100):
- """ _map is not meant for use with the python interface, use `map` instead
- """
- tool = 'uploadlists/'
- data = {'format': format, 'from': f, 'to': t}
+def print_progress_batches(batch_index, size, total):
+    """Print the number of results fetched after batch ``batch_index``.
+
+    The count is ``(batch_index + 1) * size``, capped at ``total``.
+    """
+    fetched_so_far = (batch_index + 1) * size
+    if fetched_so_far > total:
+        fetched_so_far = total
+    print(f"Fetched: {fetched_so_far} / {total}")
+
- req = []
- for i in range(0, len(query), chunk_size):
- q = query[i:i + chunk_size]
- req.append(dict([("url", URL + tool),
- ('data', data),
- ("files", {'file': ' '.join(q)})]))
- return req
- response = requests.post(URL + tool, data=data)
- response.raise_for_status()
- page = response.text
- if "The service is temporarily unavailable" in page:
- exit("The UNIPROT service is temporarily unavailable. Please try again later.")
- return page
+def get_id_mapping_results_search(url, first):
+    """Download every page of an ID-mapping result and return it combined.
+
+    The output format, page size and compression flag are read from the query
+    string of ``url``; a page size of 500 is added when ``size`` is missing and
+    the format falls back to json when ``format`` is missing.
+
+    first: when false, the leading row of a tsv result is dropped so that the
+    caller can concatenate the results of several jobs.
+
+    Returns the combined value produced by ``decode_results`` and
+    ``combine_batches``; for xml the pages are merged by ``merge_xml_results``.
+    """
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query)
+    file_format = query["format"][0] if "format" in query else "json"
+    if "size" in query:
+        size = int(query["size"][0])
+    else:
+        size = 500
+        query["size"] = size
+    compressed = "compressed" in query and query["compressed"][0].lower() == "true"
+    url = parsed._replace(query=urlencode(query, doseq=True)).geturl()
+    request = session.get(url)
+    check_response(request)
+    results = decode_results(request, file_format, compressed)
+    total = int(request.headers["x-total-results"])
+    print_progress_batches(0, size, total)
+    for i, batch in enumerate(get_batch(request, file_format, compressed), 1):
+        results = combine_batches(results, batch, file_format)
+        print_progress_batches(i, size, total)
+    if file_format == "tsv" and not first:
+        # Drop the leading row even when it is the only row: a later job
+        # without hits must not leave a stray header in the combined output.
+        results = results[1:]
+    if file_format == "xml":
+        return merge_xml_results(results)
+    return results
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='retrieve uniprot mapping')
- subparsers = parser.add_subparsers(dest='tool')
+# print(results)
+# {'results': [{'from': 'P05067', 'to': 'CHEMBL2487'}], 'failedIds': ['P12345']}
+
+# Command-line entry point; the sub-command name is stored in ``args.tool``.
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="retrieve uniprot mapping")
+ subparsers = parser.add_subparsers(dest="tool")
- mapping = subparsers.add_parser('map')
- mapping.add_argument('f', help='from')
- mapping.add_argument('t', help='to')
- mapping.add_argument('inp', nargs='?', type=argparse.FileType('r'),
- default=sys.stdin, help='input file (default: stdin)')
- mapping.add_argument('out', nargs='?', type=argparse.FileType('w'),
- default=sys.stdout, help='output file (default: stdout)')
- mapping.add_argument('--format', default='tab', help='output format')
+    # "map" sub-command: source and target database, optional input and
+    # output files (stdin/stdout when omitted) and the output format.
+    mapping = subparsers.add_parser("map")
+    mapping.add_argument("f", help="from")
+    mapping.add_argument("t", help="to")
+    mapping.add_argument(
+        "inp",
+        nargs="?",
+        type=argparse.FileType("r"),
+        default=sys.stdin,
+        help="input file (default: stdin)",
+    )
+    mapping.add_argument(
+        "out",
+        nargs="?",
+        type=argparse.FileType("w"),
+        default=sys.stdout,
+        help="output file (default: stdout)",
+    )
+    # Default to "tsv": it is the tabular format name the result handling in
+    # this script checks for; the former default "tab" matched none of them.
+    mapping.add_argument("--format", default="tsv", help="output format")
- retrieve = subparsers.add_parser('retrieve')
- retrieve.add_argument('inp', metavar='in', nargs='?', type=argparse.FileType('r'),
- default=sys.stdin, help='input file (default: stdin)')
- retrieve.add_argument('out', nargs='?', type=argparse.FileType('w'),
- default=sys.stdout, help='output file (default: stdout)')
- retrieve.add_argument('-f', '--format', help='specify output format', default='txt')
+    # "retrieve" sub-command: optional input and output files (stdin/stdout
+    # when omitted) plus the output format.
+    retrieve = subparsers.add_parser("retrieve")
+    retrieve.add_argument(
+        "inp", metavar="in", nargs="?", default=sys.stdin,
+        type=argparse.FileType("r"), help="input file (default: stdin)",
+    )
+    retrieve.add_argument(
+        "out", nargs="?", default=sys.stdout,
+        type=argparse.FileType("w"), help="output file (default: stdout)",
+    )
+    retrieve.add_argument("-f", "--format", default="txt", help="specify output format")
+    # "menu" sub-command takes no arguments.
+    mapping = subparsers.add_parser("menu")
args = parser.parse_args()
+    # code for auto generating the from - to conditional
+    if args.tool == "menu":
+        from lxml import etree
+
+        request = session.get("https://rest.uniprot.org/configure/idmapping/fields")
+        check_response(request)
+        fields = request.json()
+
+        # rule id -> names of the databases that rule lists as targets
+        rules = {rule["ruleId"]: rule["tos"] for rule in fields["rules"]}
+
+        # name of every item flagged "to" -> "<group> - <display name>" label
+        tos = {}
+        for group in fields["groups"]:
+            group_name = group["groupName"].replace("databases", "DBs")
+            for item in group["items"]:
+                if item["to"]:
+                    tos[item["name"]] = f"{group_name} - {item['displayName']}"
+
+        from_cond = etree.Element("conditional", name="from_cond")
+        from_select = etree.SubElement(
+            from_cond, "param", name="from", type="select", label="Source database:"
+        )
+        # One <option> plus one <when> per item flagged "from"; each <when>
+        # holds the select of targets allowed by the item's rule.
+        for group in fields["groups"]:
+            group_name = group["groupName"].replace("databases", "DBs")
+            for item in group["items"]:
+                if item["from"]:
+                    option = etree.SubElement(from_select, "option", value=item["name"])
+                    option.text = f"{group_name} - {item['displayName']}"
+                    when = etree.SubElement(from_cond, "when", value=item["name"])
+                    to_select = etree.SubElement(
+                        when, "param", name="to", type="select", label="Target database:"
+                    )
+                    for to in rules[item["ruleId"]]:
+                        option = etree.SubElement(to_select, "option", value=to)
+                        option.text = tos[to]
+        etree.indent(from_cond, space=" ")
+        print(etree.tostring(from_cond, pretty_print=True, encoding="unicode"))
+        sys.exit(0)
+
# get the IDs from the file as sorted list
# (sorted is convenient for testing)
query = set()
for line in args.inp:
query.add(line.strip())
- query = sorted(query)
-
- if args.tool == 'map':
- pload = _map(query, args.f, args.t, chunk_size=100)
- elif args.tool == 'retrieve':
- pload = _map(query, 'ACC+ID', 'ACC', args.format, chunk_size=100)
+    query = list(query)
+    results = []
+    first = True  # if False the header is removed
+    # Submit the IDs in chunks of BATCH_SIZE, one ID-mapping job per chunk.
+    while len(query) > 0:
+        batch = query[:BATCH_SIZE]
+        query = query[BATCH_SIZE:]
+        print(f"processing {len(batch)} left {len(query)}")
+        if args.tool == "map":
+            job_id = submit_id_mapping(from_db=args.f, to_db=args.t, ids=batch)
+        elif args.tool == "retrieve":
+            job_id = submit_id_mapping(from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=batch)
- adapter = TimeoutHTTPAdapter(max_retries=retry_strategy)
- http = requests.Session()
- http.mount("https://", adapter)
- for i, p in enumerate(pload):
- response = http.post(**p)
- args.out.write(response.text)
- http.close()
+        if check_id_mapping_results_ready(job_id):
+            link = get_id_mapping_results_link(job_id)
+            link = f"{link}?format={args.format}"
+            print(link)
+            batch_results = get_id_mapping_results_search(link, first)
+            # list.extend() iterates its argument, so a str (txt/fasta), bytes
+            # (merged xml) or dict (json) payload would be split into single
+            # characters, ints or keys. Wrap such payloads as one text chunk.
+            if isinstance(batch_results, dict):
+                batch_results = json.dumps(batch_results)
+            elif isinstance(batch_results, bytes):
+                batch_results = batch_results.decode("utf-8")
+            if isinstance(batch_results, str):
+                # Chunks are re-joined with "\n" below; strip the trailing
+                # newline so no blank line appears between jobs.
+                batch_results = [batch_results.rstrip("\n")]
+            results.extend(batch_results)
+            first = False
+        print(f"got {len(results)} results so far")
+        if len(query):
+            sleep(5)
+
+    if not isinstance(results, str):
+        results = "\n".join(results)
+    args.out.write(f"{results}\n")
diff -r d2ad6e2c55d1 -r f31d8d59ffb6 uniprot.xml
--- a/uniprot.xml Mon Nov 21 22:02:05 2022 +0000
+++ b/uniprot.xml Tue Aug 06 14:49:34 2024 +0000
@@ -1,197 +1,824 @@
-
+
ID mapping and retrieval
-
- macros.xml
-
requests
echo "UniProt ID mapping for Galaxy in version 0.1"
id_file.tabular &&
- '$__tool_directory__/uniprot.py'
-
+ python '$__tool_directory__/uniprot.py'
#if $tool.tool_choice == "retrieve":
retrieve -f $tool.format id_file.tabular ./output
#elif $tool.tool_choice == "map":
map
-
- #if $tool.from.category_FROM == "uniprot"
- '${tool.from.db_uniprot_FROM}'
- #elif $tool.from.category_FROM == "oseqdb"
- ${tool.from.db_oseqdb}
- #elif $tool.from.category_FROM == "3Dstrdb"
- ${tool.from.db_3Dstrdb}
- #elif $tool.from.category_FROM == "ppidb"
- ${tool.from.db_ppidb}
- #elif $tool.from.category_FROM == "chemistry"
- ${tool.from.db_chemistry}
- #elif $tool.from.category_FROM == "protfgdb"
- ${tool.from.db_protfgdb}
- #elif $tool.from.category_FROM == "polymorphismANDmutation"
- ${tool.from.db_polymorphismANDmutation}
- #elif $tool.from.category_FROM == "2DgelDB"
- ${tool.from.db_2DgelDB}
- #elif $tool.from.category_FROM == "ProtocolsMaterialsDB"
- ${tool.from.db_ProtocolsMaterialsDB}
- #elif $tool.from.category_FROM == "GenomeAnnotationDB"
- ${tool.from.db_GenomeAnnotationDB}
- #elif $tool.from.category_FROM == "OrganismSpecificGeneDB"
- ${tool.from.db_OrganismSpecificGeneDB}
- #elif $tool.from.category_FROM == "phylogenomic"
- ${tool.from.db_phylogenomic}
- #elif $tool.from.category_FROM == "EnzymePathwayDB"
- ${tool.from.db_EnzymePathwayDB}
- #elif $tool.from.category_FROM == "GeneExpression"
- ${tool.from.db_GeneExpression}
- #elif $tool.from.category_FROM == "other"
- ${tool.from.db_other}
- #end if
-
- #if $tool.to.category_TO == "uniprot"
- ${tool.to.db_uniprot_TO}
- #elif $tool.to.category_TO == "oseqdb"
- ${tool.to.db_oseqdb}
- #elif $tool.to.category_TO == "3Dstrdb"
- ${tool.to.db_3Dstrdb}
- #elif $tool.to.category_TO == "ppidb"
- ${tool.to.db_ppidb}
- #elif $tool.to.category_TO == "chemistry"
- ${tool.to.db_chemistry}
- #elif $tool.to.category_TO == "protfgdb"
- ${tool.to.db_protfgdb}
- #elif $tool.to.category_TO == "polymorphismANDmutation"
- ${tool.to.db_polymorphismANDmutation}
- #elif $tool.to.category_TO == "2DgelDB"
- ${tool.to.db_2DgelDB}
- #elif $tool.to.category_TO == "ProtocolsMaterialsDB"
- ${tool.to.db_ProtocolsMaterialsDB}
- #elif $tool.to.category_TO == "GenomeAnnotationDB"
- ${tool.to.db_GenomeAnnotationDB}
- #elif $tool.to.category_TO == "OrganismSpecificGeneDB"
- ${tool.to.db_OrganismSpecificGeneDB}
- #elif $tool.to.category_TO == "phylogenomic"
- ${tool.to.db_phylogenomic}
- #elif $tool.to.category_TO == "EnzymePathwayDB"
- ${tool.to.db_EnzymePathwayDB}
- #elif $tool.to.category_TO == "GeneExpression"
- ${tool.to.db_GeneExpression}
- #elif $tool.to.category_TO == "other"
- ${tool.to.db_other}
- #end if
-
+ --format tsv
+ "$from_cond.from"
+ "$from_cond.to"
id_file.tabular
./output
#end if
-
]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -206,59 +833,60 @@
+ label="${tool.name} on ${on_string}: fasta">
tool['tool_choice'] == 'retrieve'
tool['format'] == 'fasta'
+ label="${tool.name} on ${on_string}: gff">
tool['tool_choice'] == 'retrieve'
tool['format'] == 'gff'
+ label="${tool.name} on ${on_string}: txt">
tool['tool_choice'] == 'retrieve'
tool['format'] == 'txt'
+ label="${tool.name} on ${on_string}: mapping">
tool['tool_choice'] == 'map'
-
+
-
+
-
+
-
+
-
-
-
-
-
+
+
+
-
+
-
-
-
-
-
+
+
+