Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 4:1137ffa5e479 draft
"planemo upload for repository http://unipept.ugent.be/apidocs commit b6707ea113b2a89b0bb8072dfcc9ceeef4a1b708"
| author | galaxyp |
|---|---|
| date | Tue, 06 Apr 2021 16:13:31 +0000 |
| parents | bbfabdc0701c |
| children | 42bc03ed02f6 |
comparison
equal
deleted
inserted
replaced
| 3:bbfabdc0701c | 4:1137ffa5e479 |
|---|---|
| 7 # | 7 # |
| 8 #------------------------------------------------------------------------------ | 8 #------------------------------------------------------------------------------ |
| 9 """ | 9 """ |
| 10 import json | 10 import json |
| 11 import optparse | 11 import optparse |
| 12 import re | |
| 12 import sys | 13 import sys |
| 13 import re | |
| 14 import urllib.error | 14 import urllib.error |
| 15 import urllib.parse | 15 import urllib.parse |
| 16 import urllib.request | 16 import urllib.request |
| 17 | 17 |
| 18 | 18 |
| 100 '6.5': 'form phosphoric ester bonds', | 100 '6.5': 'form phosphoric ester bonds', |
| 101 '6.6': 'form nitrogen-metal bonds', | 101 '6.6': 'form nitrogen-metal bonds', |
| 102 } | 102 } |
| 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] | 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] |
| 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] | 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] |
| 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[2:] |
| 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] | 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] |
| 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] | 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] |
| 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] | 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] |
| 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] | 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] |
| 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] | 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] |
| 485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') | 485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') |
| 486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | 486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') |
| 487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | 487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') |
| 488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') | 488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') |
| 489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') | 489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') |
| 490 parser.add_option('-P', '--peptide_match', dest='peptide_match', choices=['best', 'full', 'report'], default='best', help='Match whole peptide') | |
| 491 parser.add_option('--unmatched_aa', dest='unmatched_aa', default=None, help='Show unmatched AA in peptide as') | |
| 490 # debug | 492 # debug |
| 491 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') | 493 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') |
| 492 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') | 494 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') |
| 493 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') | 495 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') |
| 494 (options, args) = parser.parse_args() | 496 (options, args) = parser.parse_args() |
| 495 if options.version: | 497 if options.version: |
| 496 print('%s' % version) | 498 print('%s' % version) |
| 497 sys.exit(0) | 499 sys.exit(0) |
| 500 | |
| 501 def tryptic_match_string(peptide, tryptic_matches): | |
| 502 if options.unmatched_aa: | |
| 503 p = peptide.lower() | |
| 504 for m in tryptic_matches: | |
| 505 p = p.replace(m.lower(), m) | |
| 506 return re.sub('[a-z]', options.unmatched_aa, p) | |
| 507 else: | |
| 508 return ','.join(tryptic_matches) | |
| 509 | |
| 498 invalid_ec = 2 if options.strict else None | 510 invalid_ec = 2 if options.strict else None |
| 499 peptides = [] | 511 peptides = [] |
| 500 # Get peptide sequences | 512 # Get peptide sequences |
| 501 if options.mzid: | 513 if options.mzid: |
| 502 peptides += read_mzid(options.mzid) | 514 peptides += read_mzid(options.mzid) |
| 520 if options.extra or options.names: | 532 if options.extra or options.names: |
| 521 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order | 533 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order |
| 522 else: | 534 else: |
| 523 column_order = pept2lca_column_order | 535 column_order = pept2lca_column_order |
| 524 # map to tryptic peptides | 536 # map to tryptic peptides |
| 525 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} | 537 if options.peptide_match == 'full': |
| 538 pepToParts = {p: [p] for p in peptides} | |
| 539 else: | |
| 540 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} | |
| 541 if options.debug: | |
| 542 print("column_order: %s\n" % (column_order), file=sys.stderr) | |
| 526 partToPeps = {} | 543 partToPeps = {} |
| 527 for peptide, parts in pepToParts.items(): | 544 for peptide, parts in pepToParts.items(): |
| 528 if options.debug: | 545 if options.debug: |
| 529 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr) | 546 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr) |
| 530 for part in parts: | 547 for part in parts: |
| 589 for match in unipept_resp: | 606 for match in unipept_resp: |
| 590 mapping.setdefault(match['peptide'], []).append(match) | 607 mapping.setdefault(match['peptide'], []).append(match) |
| 591 for peptide in peptides: | 608 for peptide in peptides: |
| 592 # Get the intersection of matches to the tryptic parts | 609 # Get the intersection of matches to the tryptic parts |
| 593 keyToMatch = None | 610 keyToMatch = None |
| 611 tryptic_match = [] | |
| 594 for part in pepToParts[peptide]: | 612 for part in pepToParts[peptide]: |
| 595 if part in mapping: | 613 if part in mapping: |
| 614 tryptic_match.append(part) | |
| 596 temp = {match[dupkey]: match for match in mapping[part]} | 615 temp = {match[dupkey]: match for match in mapping[part]} |
| 597 if keyToMatch: | 616 if keyToMatch: |
| 598 dkeys = set(keyToMatch.keys()) - set(temp.keys()) | 617 dkeys = set(keyToMatch.keys()) - set(temp.keys()) |
| 599 for k in dkeys: | 618 for k in dkeys: |
| 600 del keyToMatch[k] | 619 del keyToMatch[k] |
| 603 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp | 622 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp |
| 604 if not keyToMatch: | 623 if not keyToMatch: |
| 605 unmatched_peptides.append(peptide) | 624 unmatched_peptides.append(peptide) |
| 606 else: | 625 else: |
| 607 for key, match in keyToMatch.items(): | 626 for key, match in keyToMatch.items(): |
| 627 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) | |
| 608 match['tryptic_peptide'] = match['peptide'] | 628 match['tryptic_peptide'] = match['peptide'] |
| 609 match['peptide'] = peptide | 629 match['peptide'] = peptide |
| 610 peptideMatches.append(match) | 630 peptideMatches.append(match) |
| 611 elif options.unipept in ['pept2lca', 'peptinfo']: | 631 elif options.unipept in ['pept2lca', 'peptinfo']: |
| 612 # should be one response per trypticPeptide for pep2lca | 632 # should be one response per trypticPeptide for pep2lca |
| 613 respMap = {v['peptide']: v for v in unipept_resp} | 633 respMap = {v['peptide']: v for v in unipept_resp} |
| 614 # map resp back to peptides | 634 # map resp back to peptides |
| 615 for peptide in peptides: | 635 for peptide in peptides: |
| 616 matches = list() | 636 matches = list() |
| 637 tryptic_match = [] | |
| 617 for part in pepToParts[peptide]: | 638 for part in pepToParts[peptide]: |
| 618 if part in respMap: | 639 if part in respMap: |
| 640 tryptic_match.append(part) | |
| 619 matches.append(respMap[part]) | 641 matches.append(respMap[part]) |
| 620 match = best_match(peptide, matches) | 642 match = best_match(peptide, matches) |
| 621 if not match: | 643 if not match: |
| 622 unmatched_peptides.append(peptide) | 644 unmatched_peptides.append(peptide) |
| 623 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | 645 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] |
| 624 match = {'peptide': longest_tryptic_peptide} | 646 match = {'peptide': longest_tryptic_peptide} |
| 647 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) | |
| 625 match['tryptic_peptide'] = match['peptide'] | 648 match['tryptic_peptide'] = match['peptide'] |
| 626 match['peptide'] = peptide | 649 match['peptide'] = peptide |
| 627 peptideMatches.append(match) | 650 peptideMatches.append(match) |
| 628 else: | 651 else: |
| 629 respMap = {v['peptide']: v for v in unipept_resp} | 652 respMap = {v['peptide']: v for v in unipept_resp} |
| 630 # map resp back to peptides | 653 # map resp back to peptides |
| 631 for peptide in peptides: | 654 for peptide in peptides: |
| 632 matches = list() | 655 matches = list() |
| 656 tryptic_match = [] | |
| 633 for part in pepToParts[peptide]: | 657 for part in pepToParts[peptide]: |
| 634 if part in respMap and 'total_protein_count' in respMap[part]: | 658 if part in respMap and 'total_protein_count' in respMap[part]: |
| 659 tryptic_match.append(part) | |
| 635 matches.append(respMap[part]) | 660 matches.append(respMap[part]) |
| 636 match = best_match(peptide, matches) | 661 match = best_match(peptide, matches) |
| 637 if not match: | 662 if not match: |
| 638 unmatched_peptides.append(peptide) | 663 unmatched_peptides.append(peptide) |
| 639 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | 664 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] |
| 640 match = {'peptide': longest_tryptic_peptide} | 665 match = {'peptide': longest_tryptic_peptide} |
| 666 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) | |
| 641 match['tryptic_peptide'] = match['peptide'] | 667 match['tryptic_peptide'] = match['peptide'] |
| 642 match['peptide'] = peptide | 668 match['peptide'] = peptide |
| 643 peptideMatches.append(match) | 669 peptideMatches.append(match) |
| 644 resp = peptideMatches | 670 resp = peptideMatches |
| 645 if options.debug: | 671 if options.debug: |
| 684 for i, pdict in enumerate(resp): | 710 for i, pdict in enumerate(resp): |
| 685 peptide = pdict['peptide'] | 711 peptide = pdict['peptide'] |
| 686 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' | 712 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' |
| 687 column_names = ['peptide', 'total_protein_count'] | 713 column_names = ['peptide', 'total_protein_count'] |
| 688 vals = [peptide, total_protein_count] | 714 vals = [peptide, total_protein_count] |
| 715 if options.peptide_match == 'report': | |
| 716 column_names = ['peptide', 'tryptic_match', 'total_protein_count'] | |
| 717 tryptic_match = pdict.get('tryptic_match', '') | |
| 718 vals = [peptide, tryptic_match, total_protein_count] | |
| 689 if ec_dict: | 719 if ec_dict: |
| 690 vals += ec_dict[peptide] | 720 vals += ec_dict[peptide] |
| 691 column_names += ec_cols | 721 column_names += ec_cols |
| 692 if go_dict: | 722 if go_dict: |
| 693 vals += go_dict[peptide] | 723 vals += go_dict[peptide] |
| 698 if taxa: | 728 if taxa: |
| 699 vals += taxa[peptide][1:] | 729 vals += taxa[peptide][1:] |
| 700 column_names += taxon_cols[1:] | 730 column_names += taxon_cols[1:] |
| 701 rows.append(vals) | 731 rows.append(vals) |
| 702 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: | 732 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: |
| 733 if options.peptide_match == 'report': | |
| 734 column_order.insert(1, 'tryptic_match') | |
| 703 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) | 735 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) |
| 704 column_names = taxon_cols | 736 column_names = taxon_cols |
| 705 rows = list(taxa.values()) | 737 rows = list(taxa.values()) |
| 706 for peptide, vals in taxa.items(): | |
| 707 rows.append(vals) | |
| 708 if options.tsv: | 738 if options.tsv: |
| 709 with open(options.tsv, 'w') as outputFile: | 739 with open(options.tsv, 'w') as outputFile: |
| 710 if column_names: | 740 if column_names: |
| 711 outputFile.write("#%s\n" % '\t'.join(column_names)) | 741 outputFile.write("#%s\n" % '\t'.join(column_names)) |
| 712 for vals in rows: | 742 for vals in rows: |
