unipept: unipept.py comparison

comparison unipept.py @ 4:1137ffa5e479 draft

"planemo upload for repository http://unipept.ugent.be/apidocs commit b6707ea113b2a89b0bb8072dfcc9ceeef4a1b708"

author	galaxyp
date	Tue, 06 Apr 2021 16:13:31 +0000
parents	bbfabdc0701c
children	42bc03ed02f6

comparison

equal deleted inserted replaced

-:bbfabdc0701c
+:1137ffa5e479
 #
 #------------------------------------------------------------------------------
 """
 import json
 import optparse
+import re
 import sys
-import re
 import urllib.error
 import urllib.parse
 import urllib.request
 '6.5': 'form phosphoric ester bonds',
 '6.6': 'form nitrogen-metal bonds',
 }
 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name']
 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma']
-pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[2:]
 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id']
 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids']
 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches')
 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/')
+parser.add_option('-P', '--peptide_match', dest='peptide_match', choices=['best', 'full', 'report'], default='best', help='Match whole peptide')
+parser.add_option('--unmatched_aa', dest='unmatched_aa', default=None, help='Show unmatched AA in peptide as')
 # debug
 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST')
 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging')
 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
 (options, args) = parser.parse_args()
 if options.version:
 print('%s' % version)
 sys.exit(0)
+def tryptic_match_string(peptide, tryptic_matches):
+if options.unmatched_aa:
+p = peptide.lower()
+for m in tryptic_matches:
+p = p.replace(m.lower(), m)
+return re.sub('[a-z]', options.unmatched_aa, p)
+else:
+return ','.join(tryptic_matches)
 invalid_ec = 2 if options.strict else None
 peptides = []
 # Get peptide sequences
 if options.mzid:
 peptides += read_mzid(options.mzid)
 if options.extra or options.names:
 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
 else:
 column_order = pept2lca_column_order
 # map to tryptic peptides
-pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
+if options.peptide_match == 'full':
+pepToParts = {p: [p] for p in peptides}
+else:
+pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
+if options.debug:
+print("column_order: %s\n" % (column_order), file=sys.stderr)
 partToPeps = {}
 for peptide, parts in pepToParts.items():
 if options.debug:
 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr)
 for part in parts:
 for match in unipept_resp:
 mapping.setdefault(match['peptide'], []).append(match)
 for peptide in peptides:
 # Get the intersection of matches to the tryptic parts
 keyToMatch = None
+tryptic_match = []
 for part in pepToParts[peptide]:
 if part in mapping:
+tryptic_match.append(part)
 temp = {match[dupkey]: match for match in mapping[part]}
 if keyToMatch:
 dkeys = set(keyToMatch.keys()) - set(temp.keys())
 for k in dkeys:
 del keyToMatch[k]
 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
 if not keyToMatch:
 unmatched_peptides.append(peptide)
 else:
 for key, match in keyToMatch.items():
+match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match)
 match['tryptic_peptide'] = match['peptide']
 match['peptide'] = peptide
 peptideMatches.append(match)
 elif options.unipept in ['pept2lca', 'peptinfo']:
 # should be one response per trypticPeptide for pep2lca
 respMap = {v['peptide']: v for v in unipept_resp}
 # map resp back to peptides
 for peptide in peptides:
 matches = list()
+tryptic_match = []
 for part in pepToParts[peptide]:
 if part in respMap:
+tryptic_match.append(part)
 matches.append(respMap[part])
 match = best_match(peptide, matches)
 if not match:
 unmatched_peptides.append(peptide)
 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
 match = {'peptide': longest_tryptic_peptide}
+match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match)
 match['tryptic_peptide'] = match['peptide']
 match['peptide'] = peptide
 peptideMatches.append(match)
 else:
 respMap = {v['peptide']: v for v in unipept_resp}
 # map resp back to peptides
 for peptide in peptides:
 matches = list()
+tryptic_match = []
 for part in pepToParts[peptide]:
 if part in respMap and 'total_protein_count' in respMap[part]:
+tryptic_match.append(part)
 matches.append(respMap[part])
 match = best_match(peptide, matches)
 if not match:
 unmatched_peptides.append(peptide)
 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
 match = {'peptide': longest_tryptic_peptide}
+match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match)
 match['tryptic_peptide'] = match['peptide']
 match['peptide'] = peptide
 peptideMatches.append(match)
 resp = peptideMatches
 if options.debug:
 for i, pdict in enumerate(resp):
 peptide = pdict['peptide']
 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
 column_names = ['peptide', 'total_protein_count']
 vals = [peptide, total_protein_count]
+if options.peptide_match == 'report':
+column_names = ['peptide', 'tryptic_match', 'total_protein_count']
+tryptic_match = pdict.get('tryptic_match', '')
+vals = [peptide, tryptic_match, total_protein_count]
 if ec_dict:
 vals += ec_dict[peptide]
 column_names += ec_cols
 if go_dict:
 vals += go_dict[peptide]
 if taxa:
 vals += taxa[peptide][1:]
 column_names += taxon_cols[1:]
 rows.append(vals)
 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
+if options.peptide_match == 'report':
+column_order.insert(1, 'tryptic_match')
 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
 column_names = taxon_cols
 rows = list(taxa.values())
-for peptide, vals in taxa.items():
-rows.append(vals)
 if options.tsv:
 with open(options.tsv, 'w') as outputFile:
 if column_names:
 outputFile.write("#%s\n" % '\t'.join(column_names))
 for vals in rows:

Mercurial > repos > galaxyp > unipept

comparison unipept.py @ 4:1137ffa5e479 draft