galaxyp / unipept : unipept.py @ changeset 0:b33376bf2290 (draft)

commit:   planemo upload for repository http://unipept.ugent.be/apidocs commit 704a0414b2547298b2596219998002491505d927-dirty
author:   galaxyp
date:     Wed, 24 Oct 2018 14:45:18 -0400
parents:  (none)
children: b65ee881ca64
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
# University of Minnesota
# Copyright 2015, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------
"""

import json
import logging
import optparse
from optparse import OptionParser
import os
import sys
import re
import urllib
import urllib2

"""
pept2taxa json
pept2lca json
pept2prot
pept2ec ecjson ec
pept2go go
pept2funct go ec
peptinfo json ecjson ec go

"""

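# A sketch of the request/response round trip this script performs (values are
# illustrative, not taken from a real response). The request is built further
# below from the selected --api endpoint and the tryptic peptides:
#
#   POST http://api.unipept.ugent.be/api/v1/pept2lca.json
#        equate_il=true&extra=true&names=true&input[]=AALTER&input[]=MDGTEYIIVK
#
#   [{"peptide": "AALTER", "taxon_id": 1, "taxon_name": "root", "taxon_rank": "no rank"}, ...]
#
# The pept2lca fields shown match pept2lca_column_order defined below; the
# functional endpoints (pept2ec, pept2go, pept2funct, peptinfo) instead nest
# 'ec' and 'go' lists under each peptide, as the parsing code below assumes.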
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

def warn_err(msg,exit_code=1):
    sys.stderr.write(msg)
    if exit_code:
        sys.exit(exit_code)

go_types = ['biological process', 'molecular function', 'cellular component']
ec_name_dict = {
    '1' : 'Oxidoreductase',
    '1.1' : 'act on the CH-OH group of donors',
    '1.2' : 'act on the aldehyde or oxo group of donors',
    '1.3' : 'act on the CH-CH group of donors',
    '1.4' : 'act on the CH-NH2 group of donors',
    '1.5' : 'act on CH-NH group of donors',
    '1.6' : 'act on NADH or NADPH',
    '1.7' : 'act on other nitrogenous compounds as donors',
    '1.8' : 'act on a sulfur group of donors',
    '1.9' : 'act on a heme group of donors',
    '1.10' : 'act on diphenols and related substances as donors',
    '1.11' : 'act on peroxide as an acceptor -- peroxidases',
    '1.12' : 'act on hydrogen as a donor',
    '1.13' : 'act on single donors with incorporation of molecular oxygen',
    '1.14' : 'act on paired donors with incorporation of molecular oxygen',
    '1.15' : 'act on superoxide radicals as acceptors',
    '1.16' : 'oxidize metal ions',
    '1.17' : 'act on CH or CH2 groups',
    '1.18' : 'act on iron-sulfur proteins as donors',
    '1.19' : 'act on reduced flavodoxin as donor',
    '1.20' : 'act on phosphorus or arsenic as donors',
    '1.21' : 'act on X-H and Y-H to form an X-Y bond',
    '1.97' : 'other oxidoreductases',
    '2' : 'Transferase',
    '2.1' : 'transfer one-carbon groups, Methylase',
    '2.2' : 'transfer aldehyde or ketone groups',
    '2.3' : 'acyltransferases',
    '2.4' : 'glycosyltransferases',
    '2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
    '2.6' : 'transfer nitrogenous groups',
    '2.7' : 'transfer phosphorus-containing groups',
    '2.8' : 'transfer sulfur-containing groups',
    '2.9' : 'transfer selenium-containing groups',
    '3' : 'Hydrolase',
    '3.1' : 'act on ester bonds',
    '3.2' : 'act on sugars - glycosylases',
    '3.3' : 'act on ether bonds',
    '3.4' : 'act on peptide bonds - Peptidase',
    '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
    '3.6' : 'act on acid anhydrides',
    '3.7' : 'act on carbon-carbon bonds',
    '3.8' : 'act on halide bonds',
    '3.9' : 'act on phosphorus-nitrogen bonds',
    '3.10' : 'act on sulfur-nitrogen bonds',
    '3.11' : 'act on carbon-phosphorus bonds',
    '3.12' : 'act on sulfur-sulfur bonds',
    '3.13' : 'act on carbon-sulfur bonds',
    '4' : 'Lyase',
    '4.1' : 'carbon-carbon lyases',
    '4.2' : 'carbon-oxygen lyases',
    '4.3' : 'carbon-nitrogen lyases',
    '4.4' : 'carbon-sulfur lyases',
    '4.5' : 'carbon-halide lyases',
    '4.6' : 'phosphorus-oxygen lyases',
    '5' : 'Isomerase',
    '5.1' : 'racemases and epimerases',
    '5.2' : 'cis-trans-isomerases',
    '5.3' : 'intramolecular oxidoreductases',
    '5.4' : 'intramolecular transferases -- mutases',
    '5.5' : 'intramolecular lyases',
    '5.99' : 'other isomerases',
    '6' : 'Ligase',
    '6.1' : 'form carbon-oxygen bonds',
    '6.2' : 'form carbon-sulfur bonds',
    '6.3' : 'form carbon-nitrogen bonds',
    '6.4' : 'form carbon-carbon bonds',
    '6.5' : 'form phosphoric ester bonds',
    '6.6' : 'form nitrogen-metal bonds',
}
pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
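# For orientation: the tab-separated output written near the end of this script
# builds its header row from these column orders (restricted to the fields that
# actually appear in the response), e.g. a default pept2lca TSV typically starts
#   #peptide  taxon_rank  taxon_id  taxon_name   (tab-separated)
# while --extra/--names switch to the lineage columns in pept2lca_extra_column_order.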

def __main__():
    version = '2.0'
    pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'

    def read_tabular(filepath,col):
        peptides = []
        with open(filepath) as fp:
            for i,line in enumerate(fp):
                if line.strip() == '' or line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                peptide = fields[col]
                if not re.match(pep_pat,peptide):
                    warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def get_fasta_entries(fp):
        name, seq = None, []
        for line in fp:
            line = line.rstrip()
            if line.startswith(">"):
                if name: yield (name, ''.join(seq))
                name, seq = line, []
            else:
                seq.append(line)
        if name: yield (name, ''.join(seq))

    def read_fasta(filepath):
        peptides = []
        with open(filepath) as fp:
            for id, peptide in get_fasta_entries(fp):
                if not re.match(pep_pat,peptide):
                    warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def read_mzid(fp):
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('PeptideSequence',elem.tag):
                    peptides.append(elem.text)
        return peptides

    def read_pepxml(fp):
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('search_hit',elem.tag):
                    peptides.append(elem.get('peptide'))
        return peptides

    def best_match(peptide,matches):
        if not matches:
            return None
        elif len(matches) == 1:
            return matches[0].copy()
        else:
            # find the most specific match (peptide is always the first column order field)
            for col in reversed(pept2lca_extra_column_order[1:]):
                col_id = col+"_id" if options.extra else col
                for match in matches:
                    if 'taxon_rank' in match and match['taxon_rank'] == col:
                        return match.copy()
                    if col_id in match and match[col_id]:
                        return match.copy()
        return None
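    # Illustration (hypothetical matches): if one tryptic part resolved at
    # taxon_rank 'genus' and another at 'species', the reversed walk over
    # pept2lca_extra_column_order reaches 'species' first, so the species-level
    # match is returned as the most specific lowest common ancestor.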

    def get_taxon_json(resp):
        found_keys = set()
        for i,pdict in enumerate(resp):
            found_keys |= set(pdict.keys())
        taxa_cols = []
        for col in pept2lca_extra_column_order[-1:0:-1]:
            if col+'_id' in found_keys:
                taxa_cols.append(col)
        id_to_node = dict()
        def get_node(id,name,rank,child,seq):
            if id not in id_to_node:
                data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
                node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
                id_to_node[id] = node
            else:
                node = id_to_node[id]
            node['data']['count'] += 1
            if seq is not None and seq not in node['data']['sequences']:
                node['data']['sequences'].append(seq)
            if child is None:
                node['data']['self_count'] += 1
            elif child['id'] not in node['kids']:
                node['kids'].append(child['id'])
                node['children'].append(child)
            return node
        root = get_node(1,'root','no rank',None,None)
        for i,pdict in enumerate(resp):
            sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
            seq = sequence
            child = None
            for col in taxa_cols:
                col_id = col+'_id'
                if col_id in pdict and pdict.get(col_id):
                    col_name = col if col in found_keys else col+'_name'
                    child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
                    seq = None
            if child:
                get_node(1,'root','no rank',child,None)
        return root
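    # The structure returned above is a nested dict tree of the form (sketch):
    #   {'id': 1, 'name': 'root', 'children': [...], 'kids': [child ids],
    #    'data': {'count': n, 'self_count': n, 'valid_taxon': 1, 'rank': 'no rank', 'sequences': [...]}}
    # and is what gets json.dumps()-ed into the --json output file below.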

    def get_ec_json(resp):
        ecMap = dict()
        for pdict in resp:
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    ec_number = ec['ec_number']
                    if ec_number not in ecMap:
                        ecMap[ec_number] = []
                    ecMap[ec_number].append(pdict)
        def get_ids(ec):
            ids = []
            i = len(ec)
            while i >= 0:
                ids.append(ec[:i])
                i = ec.rfind('.',0,i - 1)
            return ids
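        # For example, get_ids('1.2.3.4') returns ['1.2.3.4', '1.2.3', '1.2', '1'],
        # walking from the full EC number back up to its top-level class.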
        id_to_node = dict()
        def get_node(id,name,child,seq):
            if id not in id_to_node:
                data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
                node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
                id_to_node[id] = node
            else:
                node = id_to_node[id]
            node['data']['count'] += 1
            if seq is not None and seq not in node['data']['sequences']:
                node['data']['sequences'].append(seq)
            if child is None:
                node['data']['self_count'] += 1
            elif child['id'] not in node['kids']:
                node['kids'].append(child['id'])
                node['children'].append(child)
            return node
        root = get_node(0,'-.-.-.-',None,None)
        for i in range(1,7):
            child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
            get_node(0,'-.-.-.-',child,None)
        for i,pdict in enumerate(resp):
            sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
            seq = sequence
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    child = None
                    protein_count = ec['protein_count']
                    ec_number = ec['ec_number']
                    for ec_id in get_ids(ec_number):
                        child = get_node(ec_id,ec_id,child,seq)
                        seq = None
                    if child:
                        get_node(0,'-.-.-.-',child,None)
        return root

    def get_taxon_dict(resp, column_order, extra=False, names=False):
        found_keys = set()
        results = []
        for i,pdict in enumerate(resp):
            results.append(pdict)
            found_keys |= set(pdict.keys())
            # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
        column_names = []
        column_keys = []
        for col in column_order:
            if col in found_keys:
                column_names.append(col)
                column_keys.append(col)
            elif names:
                col_id = col+'_id'
                col_name = col+'_name'
                if extra:
                    if col_id in found_keys:
                        column_names.append(col_id)
                        column_keys.append(col_id)
                    if names:
                        if col_name in found_keys:
                            column_names.append(col)
                            column_keys.append(col_name)
                else:
                    if col+'_name' in found_keys:
                        column_names.append(col)
                        column_keys.append(col+'_name')
            elif col+'_id' in found_keys:
                column_names.append(col)
                column_keys.append(col+'_id')
        # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
        taxa = dict() ## peptide : [taxonomy]
        for i,pdict in enumerate(results):
            peptide = pdict['peptide'] if 'peptide' in pdict else None
            if peptide and peptide not in taxa:
                vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
                taxa[peptide] = vals
        return (taxa,column_names)

    def get_ec_dict(resp, extra=False):
        ec_cols = ['ec_numbers', 'ec_protein_counts']
        if extra:
            ec_cols.append('ec_names')
        ec_dict = dict()
        for i,pdict in enumerate(resp):
            peptide = pdict['peptide']
            ec_numbers = []
            protein_counts = []
            ec_names = []
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    ec_numbers.append(ec['ec_number'])
                    protein_counts.append(str(ec['protein_count']))
                    if extra:
                        ec_names.append(ec['name'] if 'name' in ec else '')
            vals = [','.join(ec_numbers),','.join(protein_counts)]
            if extra:
                vals.append(','.join(ec_names))
            ec_dict[peptide] = vals
        return (ec_dict, ec_cols)

    def get_go_dict(resp, extra=False):
        go_cols = ['go_terms', 'go_protein_counts']
        if extra:
            go_cols.append('go_names')
        go_dict = dict()
        for i,pdict in enumerate(resp):
            peptide = pdict['peptide']
            go_terms = []
            protein_counts = []
            go_names = []
            if 'go' in pdict:
                for go in pdict['go']:
                    if 'go_term' in go:
                        go_terms.append(go['go_term'])
                        protein_counts.append(str(go['protein_count']))
                        if extra:
                            go_names.append(go['name'] if 'name' in go else '')
                    else:
                        for go_type in go_types:
                            if go_type in go:
                                for _go in go[go_type]:
                                    go_terms.append(_go['go_term'])
                                    protein_counts.append(str(_go['protein_count']))
                                    if extra:
                                        go_names.append(_go['name'] if 'name' in _go else '')
            vals = [','.join(go_terms),','.join(protein_counts)]
            if extra:
                vals.append(','.join(go_names))
            go_dict[peptide] = vals
        return (go_dict, go_cols)
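    # get_go_dict accepts two response shapes, inferred from how the fields are read
    # above: a flat list of {'go_term': ..., 'protein_count': ...} entries, or (with
    # --domains) entries grouped under the GO namespace keys listed in go_types, e.g.
    # (illustrative): {'go': [{'biological process': [{'go_term': 'GO:0008152', 'protein_count': 3}]}]}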

    def write_ec_table(outfile, resp, column_order):
        with open(outfile,'w') as fh:
            for i,pdict in enumerate(resp):
                if 'ec' in pdict:
                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
                    for ec in pdict['ec']:
                        vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
                        fh.write('%s\n' % '\t'.join(tvals + vals))

    def write_go_table(outfile, resp, column_order):
        with open(outfile,'w') as fh:
            for i,pdict in enumerate(resp):
                if 'go' in pdict:
                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
                    for go in pdict['go']:
                        if 'go_term' in go:
                            vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
                            fh.write('%s\n' % '\t'.join(tvals + vals))
                        else:
                            for go_type in go_types:
                                if go_type in go:
                                    for _go in go[go_type]:
                                        vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
                                        vals.append(go_type)
                                        fh.write('%s\n' % '\t'.join(tvals + vals))

    # Parse Command Line
    parser = optparse.OptionParser()
    # unipept API choice
    parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
                       help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
    # input files
    parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
    parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
    parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
    parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences' )
    parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepXML file containing peptide sequences' )
    # Unipept Flags
    parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
    parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
    parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
    parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namespace: biological process, molecular function, cellular component' )
    parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )

    # output fields
    parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='include fields: taxon_rank,taxon_id,taxon_name in csv and tsv outputs' )
    # Warn vs Error Flag
    parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print error and exit on invalid peptide' )
    # output files
    parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
    parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for EC json formatted results')
    parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
    parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptides with no matches' )
    parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
    # debug
    parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
    parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on debugging' )
    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
    (options, args) = parser.parse_args()
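    # Example invocation (hypothetical file names, shown for orientation only):
    #   unipept.py --api pept2lca --tabular peptides.tab --column 0 --equate_il \
    #              --extra --names --tsv pept2lca.tsv --unmatched unmatched.txt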
    if options.version:
        print >> sys.stdout,"%s" % version
        sys.exit(0)
    invalid_ec = 2 if options.strict else None
    peptides = []
    ## Get peptide sequences
    if options.mzid:
        peptides += read_mzid(options.mzid)
    if options.pepxml:
        peptides += read_pepxml(options.pepxml)
    if options.tabular:
        peptides += read_tabular(options.tabular,options.column)
    if options.fasta:
        peptides += read_fasta(options.fasta)
    if args and len(args) > 0:
        for i,peptide in enumerate(args):
            if not re.match(pep_pat,peptide):
                warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
            peptides.append(peptide)
    if len(peptides) < 1:
        warn_err("No peptides input!",exit_code=1)
    column_order = pept2lca_column_order
    if options.unipept == 'pept2prot':
        column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
    else:
        if options.extra or options.names:
            column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
        else:
            column_order = pept2lca_column_order
    ## map to tryptic peptides
    pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
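    # Illustration with a made-up peptide: 'SAMPLEKPEPTIDERSEQ' splits into
    # ['SAMPLEKPEPTIDER', 'SEQ'] -- no cut after the K because it is followed by P,
    # one cut after the R because it is followed by S.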
    partToPeps = {}
    for peptide, parts in pepToParts.iteritems():
        if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
        for part in parts:
            if len(part) > 50:
                warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
            if 5 <= len(part) <= 50:
                partToPeps.setdefault(part,[]).append(peptide)
    trypticPeptides = partToPeps.keys()
    ## unipept
    unipept_resp = []
    idx = range(0,len(trypticPeptides),options.max_request)
    idx.append(len(trypticPeptides))
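    # Batching sketch: with, say, 450 tryptic peptides and the default --max_request
    # of 200, idx becomes [0, 200, 400, 450], yielding slices [0:200], [200:400], [400:450].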
    for i in range(len(idx)-1):
        post_data = []
        if options.equate_il:
            post_data.append(("equate_il","true"))
        if options.names or options.json:
            post_data.append(("extra","true"))
            post_data.append(("names","true"))
        elif options.extra or options.json:
            post_data.append(("extra","true"))
        if options.domains:
            post_data.append(("domains","true"))
        post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
        if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
        params = '&'.join(['%s=%s' % (kv[0],kv[1]) for kv in post_data])
        #headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
        headers = {'Accept': 'application/json'}
        url = '%s/%s' % (options.url.rstrip('/'),options.unipept)
        if options.get:
            url = '%s.json?%s' % (url,params)
            req = urllib2.Request( url )
        else:
            url = '%s.json' % (url)
            req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
        if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
        try:
            resp = urllib2.urlopen( req )
            if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode()))
            if resp.getcode() == 200:
                unipept_resp += json.loads( resp.read() )
        except Exception, e:
            warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
    unmatched_peptides = []
    peptideMatches = []
    if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
    if options.unipept in ['pept2prot', 'pept2taxa']:
        dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
        ## multiple entries per trypticPeptide for pep2prot or pep2taxa
        mapping = {}
        for match in unipept_resp:
            mapping.setdefault(match['peptide'],[]).append(match)
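        # Intersection sketch (hypothetical accessions): if one tryptic part of a peptide
        # maps to uniprot_ids {A0A023, P12345} and another part maps to {P12345, Q67890},
        # only the shared P12345 survives in keyToMatch below, so every reported match is
        # supported by all of the peptide's tryptic parts that returned results.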
        for peptide in peptides:
            # Get the intersection of matches to the tryptic parts
            keyToMatch = None
            for part in pepToParts[peptide]:
                if part in mapping:
                    temp = {match[dupkey] : match for match in mapping[part]}
                    if keyToMatch:
                        dkeys = set(keyToMatch.keys()) - set(temp.keys())
                        for k in dkeys:
                            del keyToMatch[k]
                    else:
                        keyToMatch = temp
                    ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
            if not keyToMatch:
                unmatched_peptides.append(peptide)
            else:
                for key,match in keyToMatch.iteritems():
                    match['tryptic_peptide'] = match['peptide']
                    match['peptide'] = peptide
                    peptideMatches.append(match)
    elif options.unipept in ['pept2lca']:
        ## should be one response per trypticPeptide for pep2lca
        respMap = {v['peptide']:v for v in unipept_resp}
        ## map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap:
                    matches.append(respMap[part])
            match = best_match(peptide,matches)
            if not match:
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide' : longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    else:
        respMap = {v['peptide']:v for v in unipept_resp}
        ## map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap and 'total_protein_count' in respMap[part]:
                    matches.append(respMap[part])
            match = best_match(peptide,matches)
            if not match:
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide' : longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    resp = peptideMatches
    if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
    ## output results
    if not (options.unmatched or options.json or options.tsv or options.csv):
        print >> sys.stdout, str(resp)
    if options.unmatched:
        with open(options.unmatched,'w') as outputFile:
            for peptide in peptides:
                if peptide in unmatched_peptides:
                    outputFile.write("%s\n" % peptide)
    if options.json:
        if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
            root = get_taxon_json(resp)
            with open(options.json,'w') as outputFile:
                outputFile.write(json.dumps(root))
        elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
            with open(options.json,'w') as outputFile:
                outputFile.write(str(resp))
    if options.ec_json:
        if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
            root = get_ec_json(resp)
            with open(options.ec_json,'w') as outputFile:
                outputFile.write(json.dumps(root))
    if options.tsv or options.csv:
        rows = []
        column_names = None
        if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
            taxa = None
            ec_dict = None
            go_dict = None
            if options.unipept in ['peptinfo']:
                (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
            if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
                (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
            if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
                (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
            for i,pdict in enumerate(resp):
                peptide = pdict['peptide']
                total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
                column_names = ['peptide', 'total_protein_count']
                vals = [peptide,total_protein_count]
                if ec_dict:
                    vals += ec_dict[peptide]
                    column_names += ec_cols
                if go_dict:
                    vals += go_dict[peptide]
                    column_names += go_cols
                if taxa:
                    vals += taxa[peptide][1:]
                    column_names += taxon_cols[1:]
                rows.append(vals)
        elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
            (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
            column_names = taxon_cols
            for peptide,vals in taxa.iteritems():
                rows.append(vals)
        if options.tsv:
            with open(options.tsv,'w') as outputFile:
                if column_names:
                    outputFile.write("#%s\n"% '\t'.join(column_names))
                for vals in rows:
                    outputFile.write("%s\n"% '\t'.join(vals))
        if options.csv:
            with open(options.csv,'w') as outputFile:
                if column_names:
                    outputFile.write("%s\n"% ','.join(column_names))
                for vals in rows:
                    outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
    if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
        column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
        write_ec_table(options.ec_tsv, resp, column_order)
    if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
        column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
        write_go_table(options.go_tsv, resp, column_order)

if __name__ == "__main__" : __main__()