comparison unipept.py @ 0:b33376bf2290 draft
planemo upload for repository http://unipept.ugent.be/apidocs commit 704a0414b2547298b2596219998002491505d927-dirty
| author | galaxyp |
|---|---|
| date | Wed, 24 Oct 2018 14:45:18 -0400 |
| parents | |
| children | b65ee881ca64 |
#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
# University of Minnesota
# Copyright 2015, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------
"""

import json
import logging
import optparse
from optparse import OptionParser
import os
import sys
import re
import urllib
import urllib2

| 25 """ | |
| 26 pept2taxa json | |
| 27 pept2lca json | |
| 28 pept2prot | |
| 29 pept2ec ecjson ec | |
| 30 pept2go go | |
| 31 pept2funct go ec | |
| 32 peptinfo json ecjson ec go | |
| 33 | |
| 34 """ | |

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

def warn_err(msg,exit_code=1):
    sys.stderr.write(msg)
    if exit_code:
        sys.exit(exit_code)

go_types = ['biological process', 'molecular function', 'cellular component']
ec_name_dict = {
    '1' : 'Oxidoreductase',
    '1.1' : 'act on the CH-OH group of donors',
    '1.2' : 'act on the aldehyde or oxo group of donors',
    '1.3' : 'act on the CH-CH group of donors',
    '1.4' : 'act on the CH-NH2 group of donors',
    '1.5' : 'act on CH-NH group of donors',
    '1.6' : 'act on NADH or NADPH',
    '1.7' : 'act on other nitrogenous compounds as donors',
    '1.8' : 'act on a sulfur group of donors',
    '1.9' : 'act on a heme group of donors',
    '1.10' : 'act on diphenols and related substances as donors',
    '1.11' : 'act on peroxide as an acceptor -- peroxidases',
    '1.12' : 'act on hydrogen as a donor',
    '1.13' : 'act on single donors with incorporation of molecular oxygen',
    '1.14' : 'act on paired donors with incorporation of molecular oxygen',
    '1.15' : 'act on superoxide radicals as acceptors',
    '1.16' : 'oxidize metal ions',
    '1.17' : 'act on CH or CH2 groups',
    '1.18' : 'act on iron-sulfur proteins as donors',
    '1.19' : 'act on reduced flavodoxin as donor',
    '1.20' : 'act on phosphorus or arsenic as donors',
    '1.21' : 'act on X-H and Y-H to form an X-Y bond',
    '1.97' : 'other oxidoreductases',
    '2' : 'Transferase',
    '2.1' : 'transfer one-carbon groups, Methylase',
    '2.2' : 'transfer aldehyde or ketone groups',
    '2.3' : 'acyltransferases',
    '2.4' : 'glycosyltransferases',
    '2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
    '2.6' : 'transfer nitrogenous groups',
    '2.7' : 'transfer phosphorus-containing groups',
    '2.8' : 'transfer sulfur-containing groups',
    '2.9' : 'transfer selenium-containing groups',
    '3' : 'Hydrolase',
    '3.1' : 'act on ester bonds',
    '3.2' : 'act on sugars - glycosylases',
    '3.3' : 'act on ether bonds',
    '3.4' : 'act on peptide bonds - Peptidase',
    '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
    '3.6' : 'act on acid anhydrides',
    '3.7' : 'act on carbon-carbon bonds',
    '3.8' : 'act on halide bonds',
    '3.9' : 'act on phosphorus-nitrogen bonds',
    '3.10' : 'act on sulfur-nitrogen bonds',
    '3.11' : 'act on carbon-phosphorus bonds',
    '3.12' : 'act on sulfur-sulfur bonds',
    '3.13' : 'act on carbon-sulfur bonds',
    '4' : 'Lyase',
    '4.1' : 'carbon-carbon lyases',
    '4.2' : 'carbon-oxygen lyases',
    '4.3' : 'carbon-nitrogen lyases',
    '4.4' : 'carbon-sulfur lyases',
    '4.5' : 'carbon-halide lyases',
    '4.6' : 'phosphorus-oxygen lyases',
    '5' : 'Isomerase',
    '5.1' : 'racemases and epimerases',
    '5.2' : 'cis-trans-isomerases',
    '5.3' : 'intramolecular oxidoreductases',
    '5.4' : 'intramolecular transferases -- mutases',
    '5.5' : 'intramolecular lyases',
    '5.99' : 'other isomerases',
    '6' : 'Ligase',
    '6.1' : 'form carbon-oxygen bonds',
    '6.2' : 'form carbon-sulfur bonds',
    '6.3' : 'form carbon-nitrogen bonds',
    '6.4' : 'form carbon-carbon bonds',
    '6.5' : 'form phosphoric ester bonds',
    '6.6' : 'form nitrogen-metal bonds',
}
pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma']
pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
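# These lists name the fields expected in Unipept JSON responses and fix the column
# order of the tabular outputs; e.g. a pept2lca record supplies peptide, taxon_rank,
# taxon_id and taxon_name, plus per-rank lineage fields when extra/names are requested.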

def __main__():
    version = '2.0'
    pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'

    def read_tabular(filepath,col):
        peptides = []
        with open(filepath) as fp:
            for i,line in enumerate(fp):
                if line.strip() == '' or line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                peptide = fields[col]
                if not re.match(pep_pat,peptide):
                    warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def get_fasta_entries(fp):
        name, seq = None, []
        for line in fp:
            line = line.rstrip()
            if line.startswith(">"):
                if name: yield (name, ''.join(seq))
                name, seq = line, []
            else:
                seq.append(line)
        if name: yield (name, ''.join(seq))

    def read_fasta(filepath):
        peptides = []
        with open(filepath) as fp:
            for id, peptide in get_fasta_entries(fp):
                if not re.match(pep_pat,peptide):
                    warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def read_mzid(fp):
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('PeptideSequence',elem.tag):
                    peptides.append(elem.text)
        return peptides

    def read_pepxml(fp):
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('search_hit',elem.tag):
                    peptides.append(elem.get('peptide'))
        return peptides

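    # Pick the most informative of several candidate matches for an input peptide:
    # scan lineage ranks from most specific to least (per pept2lca_extra_column_order)
    # and return the first match whose taxon_rank, or populated rank field, is found.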
    def best_match(peptide,matches):
        if not matches:
            return None
        elif len(matches) == 1:
            return matches[0].copy()
        else:
            # find the most specific match (peptide is always the first column order field)
            for col in reversed(pept2lca_extra_column_order[1:]):
                col_id = col+"_id" if options.extra else col
                for match in matches:
                    if 'taxon_rank' in match and match['taxon_rank'] == col:
                        return match.copy()
                    if col_id in match and match[col_id]:
                        return match.copy()
        return None

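    # Convert per-peptide lineage assignments into a nested taxon tree rooted at
    # taxon_id 1 ('root'); each node tracks count, self_count and the peptide
    # sequences assigned to it, in the structure written to the --json output.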
    def get_taxon_json(resp):
        found_keys = set()
        for i,pdict in enumerate(resp):
            found_keys |= set(pdict.keys())
        taxa_cols = []
        for col in pept2lca_extra_column_order[-1:0:-1]:
            if col+'_id' in found_keys:
                taxa_cols.append(col)
        id_to_node = dict()
        def get_node(id,name,rank,child,seq):
            if id not in id_to_node:
                data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
                node = {'id' : id, 'name' : name, 'children' : [], 'kids': [], 'data' : data }
                id_to_node[id] = node
            else:
                node = id_to_node[id]
            node['data']['count'] += 1
            if seq is not None and seq not in node['data']['sequences']:
                node['data']['sequences'].append(seq)
            if child is None:
                node['data']['self_count'] += 1
            elif child['id'] not in node['kids']:
                node['kids'].append(child['id'])
                node['children'].append(child)
            return node
        root = get_node(1,'root','no rank',None,None)
        for i,pdict in enumerate(resp):
            sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
            seq = sequence
            child = None
            for col in taxa_cols:
                col_id = col+'_id'
                if col_id in pdict and pdict.get(col_id):
                    col_name = col if col in found_keys else col+'_name'
                    child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
                    seq = None
            if child:
                get_node(1,'root','no rank',child,None)
        return root

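    # Build an EC hierarchy (top-level class down to the full EC number) from the
    # per-peptide EC annotations, labeling the six top-level classes with
    # ec_name_dict; the nested node tree is written to the --ec_json output.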
    def get_ec_json(resp):
        ecMap = dict()
        for pdict in resp:
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    ec_number = ec['ec_number']
                    if ec_number not in ecMap:
                        ecMap[ec_number] = []
                    ecMap[ec_number].append(pdict)
        def get_ids(ec):
            ids = []
            i = len(ec)
            while i >= 0:
                ids.append(ec[:i])
                i = ec.rfind('.',0,i - 1)
            return ids
        id_to_node = dict()
        def get_node(id,name,child,seq):
            if id not in id_to_node:
                data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
                node = {'id' : id, 'name' : name, 'children' : [], 'kids': [], 'data' : data }
                id_to_node[id] = node
            else:
                node = id_to_node[id]
            node['data']['count'] += 1
            if seq is not None and seq not in node['data']['sequences']:
                node['data']['sequences'].append(seq)
            if child is None:
                node['data']['self_count'] += 1
            elif child['id'] not in node['kids']:
                node['kids'].append(child['id'])
                node['children'].append(child)
            return node
        root = get_node(0,'-.-.-.-',None,None)
        for i in range(1,7):
            child = get_node(str(i),'%s\n%s' % (str(i), ec_name_dict[str(i)]),None,None)
            get_node(0,'-.-.-.-',child,None)
        for i,pdict in enumerate(resp):
            sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
            seq = sequence
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    child = None
                    protein_count = ec['protein_count']
                    ec_number = ec['ec_number']
                    for ec_id in get_ids(ec_number):
                        child = get_node(ec_id,ec_id,child,seq)
                        seq = None
                    if child:
                        get_node(0,'-.-.-.-',child,None)
        return root

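    # Flatten taxonomy responses into one row of values per peptide, choosing
    # between plain, *_id and *_name fields according to the extra/names flags;
    # returns a (peptide -> values) dict plus the matching column names.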
    def get_taxon_dict(resp, column_order, extra=False, names=False):
        found_keys = set()
        results = []
        for i,pdict in enumerate(resp):
            results.append(pdict)
            found_keys |= set(pdict.keys())
            # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
        column_names = []
        column_keys = []
        for col in column_order:
            if col in found_keys:
                column_names.append(col)
                column_keys.append(col)
            elif names:
                col_id = col+'_id'
                col_name = col+'_name'
                if extra:
                    if col_id in found_keys:
                        column_names.append(col_id)
                        column_keys.append(col_id)
                if names:
                    if col_name in found_keys:
                        column_names.append(col)
                        column_keys.append(col_name)
            else:
                if col+'_name' in found_keys:
                    column_names.append(col)
                    column_keys.append(col+'_name')
                elif col+'_id' in found_keys:
                    column_names.append(col)
                    column_keys.append(col+'_id')
        # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
        taxa = dict()  ## peptide : [taxonomy]
        for i,pdict in enumerate(results):
            peptide = pdict['peptide'] if 'peptide' in pdict else None
            if peptide and peptide not in taxa:
                vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
                taxa[peptide] = vals
        return (taxa,column_names)

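    # Collapse per-peptide EC annotations into comma-separated strings
    # (ec_numbers, ec_protein_counts and, with extra, ec_names) keyed by peptide.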
    def get_ec_dict(resp, extra=False):
        ec_cols = ['ec_numbers', 'ec_protein_counts']
        if extra:
            ec_cols.append('ec_names')
        ec_dict = dict()
        for i,pdict in enumerate(resp):
            peptide = pdict['peptide']
            ec_numbers = []
            protein_counts = []
            ec_names = []
            if 'ec' in pdict:
                for ec in pdict['ec']:
                    ec_numbers.append(ec['ec_number'])
                    protein_counts.append(str(ec['protein_count']))
                    if extra:
                        ec_names.append(ec['name'] if 'name' in ec else '')
            vals = [','.join(ec_numbers),','.join(protein_counts)]
            if extra:
                vals.append(','.join(ec_names))
            ec_dict[peptide] = vals
        return (ec_dict, ec_cols)

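    # Same flattening for GO annotations; handles both the flat 'go' list and the
    # per-namespace grouping returned when the domains option is used.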
    def get_go_dict(resp, extra=False):
        go_cols = ['go_terms', 'go_protein_counts']
        if extra:
            go_cols.append('go_names')
        go_dict = dict()
        for i,pdict in enumerate(resp):
            peptide = pdict['peptide']
            go_terms = []
            protein_counts = []
            go_names = []
            if 'go' in pdict:
                for go in pdict['go']:
                    if 'go_term' in go:
                        go_terms.append(go['go_term'])
                        protein_counts.append(str(go['protein_count']))
                        if extra:
                            go_names.append(go['name'] if 'name' in go else '')
                    else:
                        for go_type in go_types:
                            if go_type in go:
                                for _go in go[go_type]:
                                    go_terms.append(_go['go_term'])
                                    protein_counts.append(str(_go['protein_count']))
                                    if extra:
                                        go_names.append(_go['name'] if 'name' in _go else '')
            vals = [','.join(go_terms),','.join(protein_counts)]
            if extra:
                vals.append(','.join(go_names))
            go_dict[peptide] = vals
        return (go_dict, go_cols)

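    # Write one TSV row per (peptide, EC number) pair using the given column layout.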
    def write_ec_table(outfile, resp, column_order):
        with open(outfile,'w') as fh:
            for i,pdict in enumerate(resp):
                if 'ec' in pdict:
                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
                    for ec in pdict['ec']:
                        vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
                        fh.write('%s\n' % '\t'.join(tvals + vals))

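    # Write one TSV row per (peptide, GO term) pair; when results are grouped by
    # namespace, the namespace is appended as a trailing column.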
    def write_go_table(outfile, resp, column_order):
        with open(outfile,'w') as fh:
            for i,pdict in enumerate(resp):
                if 'go' in pdict:
                    tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
                    for go in pdict['go']:
                        if 'go_term' in go:
                            vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
                            fh.write('%s\n' % '\t'.join(tvals + vals))
                        else:
                            for go_type in go_types:
                                if go_type in go:
                                    for _go in go[go_type]:
                                        vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
                                        vals.append(go_type)
                                        fh.write('%s\n' % '\t'.join(tvals + vals))

    # Parse Command Line
    parser = optparse.OptionParser()
    # unipept API choice
    parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
                       help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
    # input files
    parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
    parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
    parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
    parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences' )
    parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepXML file containing peptide sequences' )
    # Unipept Flags
    parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='Equate isoleucine (I) and leucine (L) when matching tryptic peptides to UniProt records' )
    parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='Return the complete lineage of the taxonomic lowest common ancestor' )
    parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='Return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
    parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='Group response by GO namespace: biological process, molecular function, cellular component' )
    parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )

    # output fields
    parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='Include the fields taxon_rank, taxon_id, and taxon_name in csv and tsv outputs' )
    # Warn vs Error Flag
    parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print an error and exit on an invalid peptide' )
    # output files
    parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
    parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for EC json formatted results')
    parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for lineage TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
    parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
    parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptides with no matches' )
    parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='The unipept server URL, default: http://api.unipept.ugent.be/api/v1/' )
    # debug
    parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
    parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on debugging' )
    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='Print version and exit' )
    (options, args) = parser.parse_args()
    if options.version:
        print >> sys.stdout, "%s" % version
        sys.exit(0)
    invalid_ec = 2 if options.strict else None
    peptides = []
    ## Get peptide sequences
    if options.mzid:
        peptides += read_mzid(options.mzid)
    if options.pepxml:
        peptides += read_pepxml(options.pepxml)
    if options.tabular:
        peptides += read_tabular(options.tabular,options.column)
    if options.fasta:
        peptides += read_fasta(options.fasta)
    if args and len(args) > 0:
        for i,peptide in enumerate(args):
            if not re.match(pep_pat,peptide):
                warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
            peptides.append(peptide)
    if len(peptides) < 1:
        warn_err("No peptides input!",exit_code=1)
    column_order = pept2lca_column_order
    if options.unipept == 'pept2prot':
        column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
    else:
        if options.extra or options.names:
            column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
        else:
            column_order = pept2lca_column_order
    ## map to tryptic peptides
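    ## Each peptide is cut at the trypsin cleavage rule (after K or R unless the next
    ## residue is P); only fragments of 5-50 residues are queried, and partToPeps
    ## records which input peptide(s) each fragment came from.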
    pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
    partToPeps = {}
    for peptide, parts in pepToParts.iteritems():
        if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
        for part in parts:
            if len(part) > 50:
                warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
            if 5 <= len(part) <= 50:
                partToPeps.setdefault(part,[]).append(peptide)
    trypticPeptides = partToPeps.keys()
    ## unipept
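    ## Query the chosen endpoint in batches of at most max_request fragments,
    ## POSTing form-encoded input[] values (or using GET when -g is set).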
    unipept_resp = []
    idx = range(0,len(trypticPeptides),options.max_request)
    idx.append(len(trypticPeptides))
    for i in range(len(idx)-1):
        post_data = []
        if options.equate_il:
            post_data.append(("equate_il","true"))
        if options.names or options.json:
            post_data.append(("extra","true"))
            post_data.append(("names","true"))
        elif options.extra or options.json:
            post_data.append(("extra","true"))
        if options.domains:
            post_data.append(("domains","true"))
        post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
        if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
        params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data])
        #headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
        headers = {'Accept': 'application/json'}
        url = '%s/%s' % (options.url.rstrip('/'),options.unipept)
        if options.get:
            url = '%s.json?%s' % (url,params)
            req = urllib2.Request( url )
        else:
            url = '%s.json' % (url)
            req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
        if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
        try:
            resp = urllib2.urlopen( req )
            if options.debug: print >> sys.stdout, "%s %s\n" % (url,str(resp.getcode()))
            if resp.getcode() == 200:
                # read the response that was already opened rather than issuing the request a second time
                unipept_resp += json.loads( resp.read() )
        except Exception, e:
            warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
    unmatched_peptides = []
    peptideMatches = []
    if options.debug: print >> sys.stdout, "unipept response: %s\n" % str(unipept_resp)
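    ## Map tryptic-fragment responses back onto the original input peptides:
    ## pept2prot/pept2taxa keep the intersection of hits across a peptide's fragments,
    ## the other endpoints keep the single most specific fragment match (best_match).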
    if options.unipept in ['pept2prot', 'pept2taxa']:
        dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id'  ## should only keep one of these per input peptide
        ## multiple entries per trypticPeptide for pep2prot or pep2taxa
        mapping = {}
        for match in unipept_resp:
            mapping.setdefault(match['peptide'],[]).append(match)
        for peptide in peptides:
            # Get the intersection of matches to the tryptic parts
            keyToMatch = None
            for part in pepToParts[peptide]:
                if part in mapping:
                    temp = {match[dupkey] : match for match in mapping[part]}
                    if keyToMatch:
                        dkeys = set(keyToMatch.keys()) - set(temp.keys())
                        for k in dkeys:
                            del keyToMatch[k]
                    else:
                        keyToMatch = temp
                    ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
            if not keyToMatch:
                unmatched_peptides.append(peptide)
            else:
                for key,match in keyToMatch.iteritems():
                    match['tryptic_peptide'] = match['peptide']
                    match['peptide'] = peptide
                    peptideMatches.append(match)
    elif options.unipept in ['pept2lca']:
        ## should be one response per trypticPeptide for pep2lca
        respMap = {v['peptide']:v for v in unipept_resp}
        ## map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap:
                    matches.append(respMap[part])
            match = best_match(peptide,matches)
            if not match:
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide' : longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    else:
        respMap = {v['peptide']:v for v in unipept_resp}
        ## map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap and 'total_protein_count' in respMap[part]:
                    matches.append(respMap[part])
            match = best_match(peptide,matches)
            if not match:
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide' : longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    resp = peptideMatches
    if options.debug: print >> sys.stdout, "\nmapped response: %s\n" % str(resp)
    ## output results
    if not (options.unmatched or options.json or options.tsv or options.csv):
        print >> sys.stdout, str(resp)
    if options.unmatched:
        with open(options.unmatched,'w') as outputFile:
            for peptide in peptides:
                if peptide in unmatched_peptides:
                    outputFile.write("%s\n" % peptide)
    if options.json:
        if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
            root = get_taxon_json(resp)
            with open(options.json,'w') as outputFile:
                outputFile.write(json.dumps(root))
        elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
            with open(options.json,'w') as outputFile:
                # write valid JSON rather than the Python repr of the response list
                outputFile.write(json.dumps(resp))
    if options.ec_json:
        if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
            root = get_ec_json(resp)
            with open(options.ec_json,'w') as outputFile:
                outputFile.write(json.dumps(root))
    if options.tsv or options.csv:
        rows = []
        column_names = None
        if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
            taxa = None
            ec_dict = None
            go_dict = None
            if options.unipept in ['peptinfo']:
                (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
            if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
                (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
            if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
                (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
            for i,pdict in enumerate(resp):
                peptide = pdict['peptide']
                total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
                column_names = ['peptide', 'total_protein_count']
                vals = [peptide,total_protein_count]
                if ec_dict:
                    vals += ec_dict[peptide]
                    column_names += ec_cols
                if go_dict:
                    vals += go_dict[peptide]
                    column_names += go_cols
                if taxa:
                    vals += taxa[peptide][1:]
                    column_names += taxon_cols[1:]
                rows.append(vals)
        elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
            (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
            column_names = taxon_cols
            # taxa already holds one row of values per peptide; appending its values
            # again would duplicate every output row
            rows = taxa.values()
        if options.tsv:
            with open(options.tsv,'w') as outputFile:
                if column_names:
                    outputFile.write("#%s\n" % '\t'.join(column_names))
                for vals in rows:
                    outputFile.write("%s\n" % '\t'.join(vals))
        if options.csv:
            with open(options.csv,'w') as outputFile:
                if column_names:
                    outputFile.write("%s\n" % ','.join(column_names))
                for vals in rows:
                    outputFile.write("%s\n" % ','.join(['"%s"' % (v if v else '') for v in vals]))
    if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
        column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
        write_ec_table(options.ec_tsv, resp, column_order)
    if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
        column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
        write_go_table(options.go_tsv, resp, column_order)

if __name__ == "__main__" : __main__()
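
For reference, a minimal sketch of the trypsin-cleavage rule the script applies before querying Unipept; the peptide string is hypothetical and the snippet is illustrative only, not part of the committed file:

    import re

    peptide = 'MFLTRPYKAAELK'
    # split after K or R, except when the next residue is P (no cleavage before proline)
    parts = re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', peptide))
    # parts == ['MFLTRPYK', 'AAELK']; only fragments of 5-50 residues are submitted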
