comparison unipept.py @ 0:b33376bf2290 draft

planemo upload for repository http://unipept.ugent.be/apidocs commit 704a0414b2547298b2596219998002491505d927-dirty
author galaxyp
date Wed, 24 Oct 2018 14:45:18 -0400
parents
children b65ee881ca64
comparison
equal deleted inserted replaced
-1:000000000000 0:b33376bf2290
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 import json
16 import logging
17 import optparse
18 from optparse import OptionParser
19 import os
20 import sys
21 import re
22 import urllib
23 import urllib2
24
25 """
26 pept2taxa json
27 pept2lca json
28 pept2prot
29 pept2ec ecjson ec
30 pept2go go
31 pept2funct go ec
32 peptinfo json ecjson ec go
33
34 """
35
36 try:
37 import xml.etree.cElementTree as ET
38 except ImportError:
39 import xml.etree.ElementTree as ET
40
41 def warn_err(msg,exit_code=1):
42 sys.stderr.write(msg)
43 if exit_code:
44 sys.exit(exit_code)
45
46 go_types = ['biological process', 'molecular function', 'cellular component']
47 ec_name_dict = {
48 '1' : 'Oxidoreductase',
49 '1.1' : 'act on the CH-OH group of donors',
50 '1.2' : 'act on the aldehyde or oxo group of donors',
51 '1.3' : 'act on the CH-CH group of donors',
52 '1.4' : 'act on the CH-NH2 group of donors',
53 '1.5' : 'act on CH-NH group of donors',
54 '1.6' : 'act on NADH or NADPH',
55 '1.7' : 'act on other nitrogenous compounds as donors',
56 '1.8' : 'act on a sulfur group of donors',
57 '1.9' : 'act on a heme group of donors',
58 '1.10' : 'act on diphenols and related substances as donors',
59 '1.11' : 'act on peroxide as an acceptor -- peroxidases',
60 '1.12' : 'act on hydrogen as a donor',
61 '1.13' : 'act on single donors with incorporation of molecular oxygen',
62 '1.14' : 'act on paired donors with incorporation of molecular oxygen',
63 '1.15' : 'act on superoxide radicals as acceptors',
64 '1.16' : 'oxidize metal ions',
65 '1.17' : 'act on CH or CH2 groups',
66 '1.18' : 'act on iron-sulfur proteins as donors',
67 '1.19' : 'act on reduced flavodoxin as donor',
68 '1.20' : 'act on phosphorus or arsenic as donors',
69 '1.21' : 'act on X-H and Y-H to form an X-Y bond',
70 '1.97' : 'other oxidoreductases',
71 '2' : 'Transferase',
72 '2.1' : 'transfer one-carbon groups, Methylase',
73 '2.2' : 'transfer aldehyde or ketone groups',
74 '2.3' : 'acyltransferases',
75 '2.4' : 'glycosyltransferases',
76 '2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
77 '2.6' : 'transfer nitrogenous groups',
78 '2.7' : 'transfer phosphorus-containing groups',
79 '2.8' : 'transfer sulfur-containing groups',
80 '2.9' : 'transfer selenium-containing groups',
81 '3' : 'Hydrolase',
82 '3.1' : 'act on ester bonds',
83 '3.2' : 'act on sugars - glycosylases',
84 '3.3' : 'act on ether bonds',
85 '3.4' : 'act on peptide bonds - Peptidase',
86 '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
87 '3.6' : 'act on acid anhydrides',
88 '3.7' : 'act on carbon-carbon bonds',
89 '3.8' : 'act on halide bonds',
90 '3.9' : 'act on phosphorus-nitrogen bonds',
91 '3.10' : 'act on sulfur-nitrogen bonds',
92 '3.11' : 'act on carbon-phosphorus bonds',
93 '3.12' : 'act on sulfur-sulfur bonds',
94 '3.13' : 'act on carbon-sulfur bonds',
95 '4' : 'Lyase',
96 '4.1' : 'carbon-carbon lyases',
97 '4.2' : 'carbon-oxygen lyases',
98 '4.3' : 'carbon-nitrogen lyases',
99 '4.4' : 'carbon-sulfur lyases',
100 '4.5' : 'carbon-halide lyases',
101 '4.6' : 'phosphorus-oxygen lyases',
102 '5' : 'Isomerase',
103 '5.1' : 'racemases and epimerases',
104 '5.2' : 'cis-trans-isomerases',
105 '5.3' : 'intramolecular oxidoreductases',
106 '5.4' : 'intramolecular transferases -- mutases',
107 '5.5' : 'intramolecular lyases',
108 '5.99' : 'other isomerases',
109 '6' : 'Ligase',
110 '6.1' : 'form carbon-oxygen bonds',
111 '6.2' : 'form carbon-sulfur bonds',
112 '6.3' : 'form carbon-nitrogen bonds',
113 '6.4' : 'form carbon-carbon bonds',
114 '6.5' : 'form phosphoric ester bonds',
115 '6.6' : 'form nitrogen-metal bonds',
116 }
117 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
118 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
119 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
120 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
121 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
122 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
123 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
124 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
125 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
126 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
127
128 def __main__():
129 version = '2.0'
130 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
131
132 def read_tabular(filepath,col):
133 peptides = []
134 with open(filepath) as fp:
135 for i,line in enumerate(fp):
136 if line.strip() == '' or line.startswith('#'):
137 continue
138 fields = line.rstrip('\n').split('\t')
139 peptide = fields[col]
140 if not re.match(pep_pat,peptide):
141 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
142 peptides.append(peptide)
143 return peptides
144
145 def get_fasta_entries(fp):
146 name, seq = None, []
147 for line in fp:
148 line = line.rstrip()
149 if line.startswith(">"):
150 if name: yield (name, ''.join(seq))
151 name, seq = line, []
152 else:
153 seq.append(line)
154 if name: yield (name, ''.join(seq))
155
156 def read_fasta(filepath):
157 peptides = []
158 with open(filepath) as fp:
159 for id, peptide in get_fasta_entries(fp):
160 if not re.match(pep_pat,peptide):
161 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
162 peptides.append(peptide)
163 return peptides
164
165 def read_mzid(fp):
166 peptides = []
167 for event, elem in ET.iterparse(fp):
168 if event == 'end':
169 if re.search('PeptideSequence',elem.tag):
170 peptides.append(elem.text)
171 return peptides
172
173 def read_pepxml(fp):
174 peptides = []
175 for event, elem in ET.iterparse(fp):
176 if event == 'end':
177 if re.search('search_hit',elem.tag):
178 peptides.append(elem.get('peptide'))
179 return peptides
180
181 def best_match(peptide,matches):
182 if not matches:
183 return None
184 elif len(matches) == 1:
185 return matches[0].copy()
186 else:
187 # find the most specific match (peptide is always the first column order field)
188 for col in reversed(pept2lca_extra_column_order[1:]):
189 col_id = col+"_id" if options.extra else col
190 for match in matches:
191 if 'taxon_rank' in match and match['taxon_rank'] == col:
192 return match.copy()
193 if col_id in match and match[col_id]:
194 return match.copy()
195 return None
196
197 def get_taxon_json(resp):
198 found_keys = set()
199 for i,pdict in enumerate(resp):
200 found_keys |= set(pdict.keys())
201 taxa_cols = []
202 for col in pept2lca_extra_column_order[-1:0:-1]:
203 if col+'_id' in found_keys:
204 taxa_cols.append(col)
205 id_to_node = dict()
206 def get_node(id,name,rank,child,seq):
207 if id not in id_to_node:
208 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
209 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
210 id_to_node[id] = node
211 else:
212 node = id_to_node[id]
213 node['data']['count'] += 1
214 if seq is not None and seq not in node['data']['sequences']:
215 node['data']['sequences'].append(seq)
216 if child is None:
217 node['data']['self_count'] += 1
218 elif child['id'] not in node['kids']:
219 node['kids'].append(child['id'])
220 node['children'].append(child)
221 return node
222 root = get_node(1,'root','no rank',None,None)
223 for i,pdict in enumerate(resp):
224 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
225 seq = sequence
226 child = None
227 for col in taxa_cols:
228 col_id = col+'_id'
229 if col_id in pdict and pdict.get(col_id):
230 col_name = col if col in found_keys else col+'_name'
231 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
232 seq = None
233 if child:
234 get_node(1,'root','no rank',child,None)
235 return root
236
237 def get_ec_json(resp):
238 ecMap = dict()
239 for pdict in resp:
240 if 'ec' in pdict:
241 for ec in pdict['ec']:
242 ec_number = ec['ec_number']
243 if ec_number not in ecMap:
244 ecMap[ec_number] = []
245 ecMap[ec_number].append(pdict)
246 def get_ids(ec):
247 ids = []
248 i = len(ec)
249 while i >= 0:
250 ids.append(ec[:i])
251 i = ec.rfind('.',0,i - 1)
252 return ids
253 id_to_node = dict()
254 def get_node(id,name,child,seq):
255 if id not in id_to_node:
256 data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
257 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
258 id_to_node[id] = node
259 else:
260 node = id_to_node[id]
261 node['data']['count'] += 1
262 if seq is not None and seq not in node['data']['sequences']:
263 node['data']['sequences'].append(seq)
264 if child is None:
265 node['data']['self_count'] += 1
266 elif child['id'] not in node['kids']:
267 node['kids'].append(child['id'])
268 node['children'].append(child)
269 return node
270 root = get_node(0,'-.-.-.-',None,None)
271 for i in range(1,7):
272 child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
273 get_node(0,'-.-.-.-',child,None)
274 for i,pdict in enumerate(resp):
275 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
276 seq = sequence
277 if 'ec' in pdict:
278 for ec in pdict['ec']:
279 child = None
280 protein_count = ec['protein_count']
281 ec_number = ec['ec_number']
282 for ec_id in get_ids(ec_number):
283 child = get_node(ec_id,ec_id,child,seq)
284 seq = None
285 if child:
286 get_node(0,'-.-.-.-',child,None)
287 return root
288
289 def get_taxon_dict(resp, column_order, extra=False, names=False):
290 found_keys = set()
291 results = []
292 for i,pdict in enumerate(resp):
293 results.append(pdict)
294 found_keys |= set(pdict.keys())
295 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
296 column_names = []
297 column_keys = []
298 for col in column_order:
299 if col in found_keys:
300 column_names.append(col)
301 column_keys.append(col)
302 elif names:
303 col_id = col+'_id'
304 col_name = col+'_name'
305 if extra:
306 if col_id in found_keys:
307 column_names.append(col_id)
308 column_keys.append(col_id)
309 if names:
310 if col_name in found_keys:
311 column_names.append(col)
312 column_keys.append(col_name)
313 else:
314 if col+'_name' in found_keys:
315 column_names.append(col)
316 column_keys.append(col+'_name')
317 elif col+'_id' in found_keys:
318 column_names.append(col)
319 column_keys.append(col+'_id')
320 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
321 taxa = dict() ## peptide : [taxonomy]
322 for i,pdict in enumerate(results):
323 peptide = pdict['peptide'] if 'peptide' in pdict else None
324 if peptide and peptide not in taxa:
325 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
326 taxa[peptide] = vals
327 return (taxa,column_names)
328
329 def get_ec_dict(resp, extra=False):
330 ec_cols = ['ec_numbers', 'ec_protein_counts']
331 if extra:
332 ec_cols.append('ec_names')
333 ec_dict = dict()
334 for i,pdict in enumerate(resp):
335 peptide = pdict['peptide']
336 ec_numbers = []
337 protein_counts = []
338 ec_names = []
339 if 'ec' in pdict:
340 for ec in pdict['ec']:
341 ec_numbers.append(ec['ec_number'])
342 protein_counts.append(str(ec['protein_count']))
343 if extra:
344 ec_names.append(ec['name'] if 'name' in ec else '')
345 vals = [','.join(ec_numbers),','.join(protein_counts)]
346 if extra:
347 vals.append(','.join(ec_names))
348 ec_dict[peptide] = vals
349 return (ec_dict, ec_cols)
350
351 def get_go_dict(resp, extra=False):
352 go_cols = ['go_terms', 'go_protein_counts']
353 if extra:
354 go_cols.append('go_names')
355 go_dict = dict()
356 for i,pdict in enumerate(resp):
357 peptide = pdict['peptide']
358 go_terms = []
359 protein_counts = []
360 go_names = []
361 if 'go' in pdict:
362 for go in pdict['go']:
363 if 'go_term' in go:
364 go_terms.append(go['go_term'])
365 protein_counts.append(str(go['protein_count']))
366 if extra:
367 go_names.append(go['name'] if 'name' in go else '')
368 else:
369 for go_type in go_types:
370 if go_type in go:
371 for _go in go[go_type]:
372 go_terms.append(_go['go_term'])
373 protein_counts.append(str(_go['protein_count']))
374 if extra:
375 go_names.append(_go['name'] if 'name' in _go else '')
376 vals = [','.join(go_terms),','.join(protein_counts)]
377 if extra:
378 vals.append(','.join(go_names))
379 go_dict[peptide] = vals
380 return (go_dict, go_cols)
381
382 def write_ec_table(outfile, resp, column_order):
383 with open(outfile,'w') as fh:
384 for i,pdict in enumerate(resp):
385 if 'ec' in pdict:
386 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
387 for ec in pdict['ec']:
388 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
389 fh.write('%s\n' % '\t'.join(tvals + vals))
390
391 def write_go_table(outfile, resp, column_order):
392 with open(outfile,'w') as fh:
393 for i,pdict in enumerate(resp):
394 if 'go' in pdict:
395 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
396 for go in pdict['go']:
397 if 'go_term' in go:
398 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
399 fh.write('%s\n' % '\t'.join(tvals + vals))
400 else:
401 for go_type in go_types:
402 if go_type in go:
403 for _go in go[go_type]:
404 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
405 vals.append(go_type)
406 fh.write('%s\n' % '\t'.join(tvals + vals))
407
408 #Parse Command Line
409 parser = optparse.OptionParser()
410 # unipept API choice
411 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
412 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
413 # input files
414 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
415 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
416 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
417 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
418 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
419 # Unipept Flags
420 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
421 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
422 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
423 parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' )
424 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
425
426 # output fields
427 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
428 # Warn vs Error Flag
429 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
430 # output files
431 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
432 parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results')
433 parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
434 parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
435 parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
436 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
437 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
438 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
439 parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
440 # debug
441 parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
442 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
443 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
444 (options, args) = parser.parse_args()
445 if options.version:
446 print >> sys.stdout,"%s" % version
447 sys.exit(0)
448 invalid_ec = 2 if options.strict else None
449 peptides = []
450 ## Get peptide sequences
451 if options.mzid:
452 peptides += read_mzid(options.mzid)
453 if options.pepxml:
454 peptides += read_pepxml(options.pepxml)
455 if options.tabular:
456 peptides += read_tabular(options.tabular,options.column)
457 if options.fasta:
458 peptides += read_fasta(options.fasta)
459 if args and len(args) > 0:
460 for i,peptide in enumerate(args):
461 if not re.match(pep_pat,peptide):
462 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
463 peptides.append(peptide)
464 if len(peptides) < 1:
465 warn_err("No peptides input!",exit_code=1)
466 column_order = pept2lca_column_order
467 if options.unipept == 'pept2prot':
468 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
469 else:
470 if options.extra or options.names:
471 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
472 else:
473 column_order = pept2lca_column_order
474 ## map to tryptic peptides
475 pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
476 partToPeps = {}
477 for peptide, parts in pepToParts.iteritems():
478 if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
479 for part in parts:
480 if len(part) > 50:
481 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
482 if 5 <= len(part) <= 50:
483 partToPeps.setdefault(part,[]).append(peptide)
484 trypticPeptides = partToPeps.keys()
485 ## unipept
486 unipept_resp = []
487 idx = range(0,len(trypticPeptides),options.max_request)
488 idx.append(len(trypticPeptides))
489 for i in range(len(idx)-1):
490 post_data = []
491 if options.equate_il:
492 post_data.append(("equate_il","true"))
493 if options.names or options.json:
494 post_data.append(("extra","true"))
495 post_data.append(("names","true"))
496 elif options.extra or options.json:
497 post_data.append(("extra","true"))
498 if options.domains:
499 post_data.append(("domains","true"))
500 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
501 if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
502 params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data])
503 #headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
504 headers = {'Accept': 'application/json'}
505 url = '%s/%s' % (options.url.rstrip('/'),options.unipept)
506 if options.get:
507 url = '%s.json?%s' % (url,params)
508 req = urllib2.Request( url )
509 else:
510 url = '%s.json' % (url)
511 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
512 if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
513 try:
514 resp = urllib2.urlopen( req )
515 if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode()))
516 if resp.getcode() == 200:
517 unipept_resp += json.loads( urllib2.urlopen( req ).read() )
518 except Exception, e:
519 warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
520 unmatched_peptides = []
521 peptideMatches = []
522 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
523 if options.unipept in ['pept2prot', 'pept2taxa']:
524 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
525 ## multiple entries per trypticPeptide for pep2prot or pep2taxa
526 mapping = {}
527 for match in unipept_resp:
528 mapping.setdefault(match['peptide'],[]).append(match)
529 for peptide in peptides:
530 # Get the intersection of matches to the tryptic parts
531 keyToMatch = None
532 for part in pepToParts[peptide]:
533 if part in mapping:
534 temp = {match[dupkey] : match for match in mapping[part]}
535 if keyToMatch:
536 dkeys = set(keyToMatch.keys()) - set(temp.keys())
537 for k in dkeys:
538 del keyToMatch[k]
539 else:
540 keyToMatch = temp
541 ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
542 if not keyToMatch:
543 unmatched_peptides.append(peptide)
544 else:
545 for key,match in keyToMatch.iteritems():
546 match['tryptic_peptide'] = match['peptide']
547 match['peptide'] = peptide
548 peptideMatches.append(match)
549 elif options.unipept in ['pept2lca']:
550 ## should be one response per trypticPeptide for pep2lca
551 respMap = {v['peptide']:v for v in unipept_resp}
552 ## map resp back to peptides
553 for peptide in peptides:
554 matches = list()
555 for part in pepToParts[peptide]:
556 if part in respMap:
557 matches.append(respMap[part])
558 match = best_match(peptide,matches)
559 if not match:
560 unmatched_peptides.append(peptide)
561 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
562 match = {'peptide' : longest_tryptic_peptide}
563 match['tryptic_peptide'] = match['peptide']
564 match['peptide'] = peptide
565 peptideMatches.append(match)
566 else:
567 respMap = {v['peptide']:v for v in unipept_resp}
568 ## map resp back to peptides
569 for peptide in peptides:
570 matches = list()
571 for part in pepToParts[peptide]:
572 if part in respMap and 'total_protein_count' in respMap[part]:
573 matches.append(respMap[part])
574 match = best_match(peptide,matches)
575 if not match:
576 unmatched_peptides.append(peptide)
577 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
578 match = {'peptide' : longest_tryptic_peptide}
579 match['tryptic_peptide'] = match['peptide']
580 match['peptide'] = peptide
581 peptideMatches.append(match)
582 resp = peptideMatches
583 if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
584 ## output results
585 if not (options.unmatched or options.json or options.tsv or options.csv):
586 print >> sys.stdout, str(resp)
587 if options.unmatched:
588 with open(options.unmatched,'w') as outputFile:
589 for peptide in peptides:
590 if peptide in unmatched_peptides:
591 outputFile.write("%s\n" % peptide)
592 if options.json:
593 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
594 root = get_taxon_json(resp)
595 with open(options.json,'w') as outputFile:
596 outputFile.write(json.dumps(root))
597 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
598 with open(options.json,'w') as outputFile:
599 outputFile.write(str(resp))
600 if options.ec_json:
601 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
602 root = get_ec_json(resp)
603 with open(options.ec_json,'w') as outputFile:
604 outputFile.write(json.dumps(root))
605 if options.tsv or options.csv:
606 rows = []
607 column_names = None
608 if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
609 taxa = None
610 ec_dict = None
611 go_dict = None
612 if options.unipept in ['peptinfo']:
613 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
614 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
615 (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
616 if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
617 (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
618 for i,pdict in enumerate(resp):
619 peptide = pdict['peptide']
620 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
621 column_names = ['peptide', 'total_protein_count']
622 vals = [peptide,total_protein_count]
623 if ec_dict:
624 vals += ec_dict[peptide]
625 column_names += ec_cols
626 if go_dict:
627 vals += go_dict[peptide]
628 column_names += go_cols
629 if taxa:
630 vals += taxa[peptide][1:]
631 column_names += taxon_cols[1:]
632 rows.append(vals)
633 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
634 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
635 column_names = taxon_cols
636 rows = taxa.values()
637 for peptide,vals in taxa.iteritems():
638 rows.append(vals)
639 if options.tsv:
640 with open(options.tsv,'w') as outputFile:
641 if column_names:
642 outputFile.write("#%s\n"% '\t'.join(column_names))
643 for vals in rows:
644 outputFile.write("%s\n"% '\t'.join(vals))
645 if options.csv:
646 with open(options.csv,'w') as outputFile:
647 if column_names:
648 outputFile.write("%s\n"% ','.join(column_names))
649 for vals in rows:
650 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
651 if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
652 column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
653 write_ec_table(options.ec_tsv, resp, column_order)
654 if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
655 column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
656 write_go_table(options.go_tsv, resp, column_order)
657
658 if __name__ == "__main__" : __main__()