Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 2:dca8a1fe0bf3 draft
"planemo upload for repository http://unipept.ugent.be/apidocs commit dd464f03c32f657fc555081117da18ba4c091af6"
| author | galaxyp |
|---|---|
| date | Wed, 29 Apr 2020 23:52:22 +0000 |
| parents | b65ee881ca64 |
| children | bbfabdc0701c |
comparison
equal
deleted
inserted
replaced
| 1:b65ee881ca64 | 2:dca8a1fe0bf3 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 """ | 2 """ |
| 3 # | 3 # |
| 4 #------------------------------------------------------------------------------ | |
| 5 # University of Minnesota | |
| 6 # Copyright 2015, Regents of the University of Minnesota | |
| 7 #------------------------------------------------------------------------------ | |
| 8 # Author: | 4 # Author: |
| 9 # | 5 # |
| 10 # James E Johnson | 6 # James E Johnson |
| 11 # | 7 # |
| 12 #------------------------------------------------------------------------------ | 8 #------------------------------------------------------------------------------ |
| 13 """ | 9 """ |
| 14 | |
| 15 import json | 10 import json |
| 16 import logging | |
| 17 import optparse | 11 import optparse |
| 18 from optparse import OptionParser | |
| 19 import os | |
| 20 import sys | 12 import sys |
| 21 import re | 13 import re |
| 22 import urllib | 14 import urllib.error |
| 23 import urllib2 | 15 import urllib.parse |
| 24 | 16 import urllib.request |
| 25 """ | 17 |
| 26 pept2taxa json | |
| 27 pept2lca json | |
| 28 pept2prot | |
| 29 pept2ec ecjson ec | |
| 30 pept2go go | |
| 31 pept2funct go ec | |
| 32 peptinfo json ecjson ec go | |
| 33 | |
| 34 """ | |
| 35 | 18 |
| 36 try: | 19 try: |
| 37 import xml.etree.cElementTree as ET | 20 import xml.etree.cElementTree as ET |
| 38 except ImportError: | 21 except ImportError: |
| 39 import xml.etree.ElementTree as ET | 22 import xml.etree.ElementTree as ET |
| 40 | 23 |
| 41 def warn_err(msg,exit_code=1): | 24 |
| 25 def warn_err(msg, exit_code=1): | |
| 42 sys.stderr.write(msg) | 26 sys.stderr.write(msg) |
| 43 if exit_code: | 27 if exit_code: |
| 44 sys.exit(exit_code) | 28 sys.exit(exit_code) |
| 29 | |
| 45 | 30 |
| 46 go_types = ['biological process', 'molecular function', 'cellular component'] | 31 go_types = ['biological process', 'molecular function', 'cellular component'] |
| 32 ipr_types = ['Domain', 'Family', 'Homologous_superfamily', 'Repeat', 'Conserved_site', 'Active_site', 'Binding_site', 'PTM'] | |
| 47 ec_name_dict = { | 33 ec_name_dict = { |
| 48 '1' : 'Oxidoreductase', | 34 '1': 'Oxidoreductase', |
| 49 '1.1' : 'act on the CH-OH group of donors', | 35 '1.1': 'act on the CH-OH group of donors', |
| 50 '1.2' : 'act on the aldehyde or oxo group of donors', | 36 '1.2': 'act on the aldehyde or oxo group of donors', |
| 51 '1.3' : 'act on the CH-CH group of donors', | 37 '1.3': 'act on the CH-CH group of donors', |
| 52 '1.4' : 'act on the CH-NH2 group of donors', | 38 '1.4': 'act on the CH-NH2 group of donors', |
| 53 '1.5' : 'act on CH-NH group of donors', | 39 '1.5': 'act on CH-NH group of donors', |
| 54 '1.6' : 'act on NADH or NADPH', | 40 '1.6': 'act on NADH or NADPH', |
| 55 '1.7' : 'act on other nitrogenous compounds as donors', | 41 '1.7': 'act on other nitrogenous compounds as donors', |
| 56 '1.8' : 'act on a sulfur group of donors', | 42 '1.8': 'act on a sulfur group of donors', |
| 57 '1.9' : 'act on a heme group of donors', | 43 '1.9': 'act on a heme group of donors', |
| 58 '1.10' : 'act on diphenols and related substances as donors', | 44 '1.10': 'act on diphenols and related substances as donors', |
| 59 '1.11' : 'act on peroxide as an acceptor -- peroxidases', | 45 '1.11': 'act on peroxide as an acceptor -- peroxidases', |
| 60 '1.12' : 'act on hydrogen as a donor', | 46 '1.12': 'act on hydrogen as a donor', |
| 61 '1.13' : 'act on single donors with incorporation of molecular oxygen', | 47 '1.13': 'act on single donors with incorporation of molecular oxygen', |
| 62 '1.14' : 'act on paired donors with incorporation of molecular oxygen', | 48 '1.14': 'act on paired donors with incorporation of molecular oxygen', |
| 63 '1.15' : 'act on superoxide radicals as acceptors', | 49 '1.15': 'act on superoxide radicals as acceptors', |
| 64 '1.16' : 'oxidize metal ions', | 50 '1.16': 'oxidize metal ions', |
| 65 '1.17' : 'act on CH or CH2 groups', | 51 '1.17': 'act on CH or CH2 groups', |
| 66 '1.18' : 'act on iron-sulfur proteins as donors', | 52 '1.18': 'act on iron-sulfur proteins as donors', |
| 67 '1.19' : 'act on reduced flavodoxin as donor', | 53 '1.19': 'act on reduced flavodoxin as donor', |
| 68 '1.20' : 'act on phosphorus or arsenic as donors', | 54 '1.20': 'act on phosphorus or arsenic as donors', |
| 69 '1.21' : 'act on X-H and Y-H to form an X-Y bond', | 55 '1.21': 'act on X-H and Y-H to form an X-Y bond', |
| 70 '1.97' : 'other oxidoreductases', | 56 '1.97': 'other oxidoreductases', |
| 71 '2' : 'Transferase', | 57 '2': 'Transferase', |
| 72 '2.1' : 'transfer one-carbon groups, Methylase', | 58 '2.1': 'transfer one-carbon groups, Methylase', |
| 73 '2.2' : 'transfer aldehyde or ketone groups', | 59 '2.2': 'transfer aldehyde or ketone groups', |
| 74 '2.3' : 'acyltransferases', | 60 '2.3': 'acyltransferases', |
| 75 '2.4' : 'glycosyltransferases', | 61 '2.4': 'glycosyltransferases', |
| 76 '2.5' : 'transfer alkyl or aryl groups, other than methyl groups', | 62 '2.5': 'transfer alkyl or aryl groups, other than methyl groups', |
| 77 '2.6' : 'transfer nitrogenous groups', | 63 '2.6': 'transfer nitrogenous groups', |
| 78 '2.7' : 'transfer phosphorus-containing groups', | 64 '2.7': 'transfer phosphorus-containing groups', |
| 79 '2.8' : 'transfer sulfur-containing groups', | 65 '2.8': 'transfer sulfur-containing groups', |
| 80 '2.9' : 'transfer selenium-containing groups', | 66 '2.9': 'transfer selenium-containing groups', |
| 81 '3' : 'Hydrolase', | 67 '3': 'Hydrolase', |
| 82 '3.1' : 'act on ester bonds', | 68 '3.1': 'act on ester bonds', |
| 83 '3.2' : 'act on sugars - glycosylases', | 69 '3.2': 'act on sugars - glycosylases', |
| 84 '3.3' : 'act on ether bonds', | 70 '3.3': 'act on ether bonds', |
| 85 '3.4' : 'act on peptide bonds - Peptidase', | 71 '3.4': 'act on peptide bonds - Peptidase', |
| 86 '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds', | 72 '3.5': 'act on carbon-nitrogen bonds, other than peptide bonds', |
| 87 '3.6' : 'act on acid anhydrides', | 73 '3.6': 'act on acid anhydrides', |
| 88 '3.7' : 'act on carbon-carbon bonds', | 74 '3.7': 'act on carbon-carbon bonds', |
| 89 '3.8' : 'act on halide bonds', | 75 '3.8': 'act on halide bonds', |
| 90 '3.9' : 'act on phosphorus-nitrogen bonds', | 76 '3.9': 'act on phosphorus-nitrogen bonds', |
| 91 '3.10' : 'act on sulfur-nitrogen bonds', | 77 '3.10': 'act on sulfur-nitrogen bonds', |
| 92 '3.11' : 'act on carbon-phosphorus bonds', | 78 '3.11': 'act on carbon-phosphorus bonds', |
| 93 '3.12' : 'act on sulfur-sulfur bonds', | 79 '3.12': 'act on sulfur-sulfur bonds', |
| 94 '3.13' : 'act on carbon-sulfur bonds', | 80 '3.13': 'act on carbon-sulfur bonds', |
| 95 '4' : 'Lyase', | 81 '4': 'Lyase', |
| 96 '4.1' : 'carbon-carbon lyases', | 82 '4.1': 'carbon-carbon lyases', |
| 97 '4.2' : 'carbon-oxygen lyases', | 83 '4.2': 'carbon-oxygen lyases', |
| 98 '4.3' : 'carbon-nitrogen lyases', | 84 '4.3': 'carbon-nitrogen lyases', |
| 99 '4.4' : 'carbon-sulfur lyases', | 85 '4.4': 'carbon-sulfur lyases', |
| 100 '4.5' : 'carbon-halide lyases', | 86 '4.5': 'carbon-halide lyases', |
| 101 '4.6' : 'phosphorus-oxygen lyases', | 87 '4.6': 'phosphorus-oxygen lyases', |
| 102 '5' : 'Isomerase', | 88 '5': 'Isomerase', |
| 103 '5.1' : 'racemases and epimerases', | 89 '5.1': 'racemases and epimerases', |
| 104 '5.2' : 'cis-trans-isomerases', | 90 '5.2': 'cis-trans-isomerases', |
| 105 '5.3' : 'intramolecular oxidoreductases', | 91 '5.3': 'intramolecular oxidoreductases', |
| 106 '5.4' : 'intramolecular transferases -- mutases', | 92 '5.4': 'intramolecular transferases -- mutases', |
| 107 '5.5' : 'intramolecular lyases', | 93 '5.5': 'intramolecular lyases', |
| 108 '5.99' : 'other isomerases', | 94 '5.99': 'other isomerases', |
| 109 '6' : 'Ligase', | 95 '6': 'Ligase', |
| 110 '6.1' : 'form carbon-oxygen bonds', | 96 '6.1': 'form carbon-oxygen bonds', |
| 111 '6.2' : 'form carbon-sulfur bonds', | 97 '6.2': 'form carbon-sulfur bonds', |
| 112 '6.3' : 'form carbon-nitrogen bonds', | 98 '6.3': 'form carbon-nitrogen bonds', |
| 113 '6.4' : 'form carbon-carbon bonds', | 99 '6.4': 'form carbon-carbon bonds', |
| 114 '6.5' : 'form phosphoric ester bonds', | 100 '6.5': 'form phosphoric ester bonds', |
| 115 '6.6' : 'form nitrogen-metal bonds', | 101 '6.6': 'form nitrogen-metal bonds', |
| 116 } | 102 } |
| 117 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] | 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] |
| 118 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] | 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] |
| 119 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] |
| 120 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] | 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] |
| 121 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] | 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] |
| 122 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] | 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] |
| 123 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] | 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] |
| 124 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] | 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] |
| 125 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] | 111 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] |
| 126 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go'] | 112 pept2interpro_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count']] |
| 113 pept2interpro_extra_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count', 'type', 'name']] | |
| 114 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go', 'ipr'] | |
| 115 | |
| 127 | 116 |
| 128 def __main__(): | 117 def __main__(): |
| 129 version = '2.0' | 118 version = '4.3' |
| 130 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' | 119 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' |
| 131 | 120 |
| 132 def read_tabular(filepath,col): | 121 def read_tabular(filepath, col): |
| 122 peptides = [] | |
| 123 with open(filepath) as fp: | |
| 124 for i, line in enumerate(fp): | |
| 125 if line.strip() == '' or line.startswith('#'): | |
| 126 continue | |
| 127 fields = line.rstrip('\n').split('\t') | |
| 128 peptide = fields[col] | |
| 129 if not re.match(pep_pat, peptide): | |
| 130 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide, i, col, filepath), exit_code=invalid_ec) | |
| 131 peptides.append(peptide) | |
| 132 return peptides | |
| 133 | |
| 134 def get_fasta_entries(fp): | |
| 135 name, seq = None, [] | |
| 136 for line in fp: | |
| 137 line = line.rstrip() | |
| 138 if line.startswith(">"): | |
| 139 if name: | |
| 140 yield (name, ''.join(seq)) | |
| 141 name, seq = line, [] | |
| 142 else: | |
| 143 seq.append(line) | |
| 144 if name: | |
| 145 yield (name, ''.join(seq)) | |
| 146 | |
| 147 def read_fasta(filepath): | |
| 148 peptides = [] | |
| 149 with open(filepath) as fp: | |
| 150 for id, peptide in get_fasta_entries(fp): | |
| 151 if not re.match(pep_pat, peptide): | |
| 152 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide, id, filepath), exit_code=invalid_ec) | |
| 153 peptides.append(peptide) | |
| 154 return peptides | |
| 155 | |
| 156 def read_mzid(fp): | |
| 157 peptides = [] | |
| 158 for event, elem in ET.iterparse(fp): | |
| 159 if event == 'end': | |
| 160 if re.search('PeptideSequence', elem.tag): | |
| 161 peptides.append(elem.text) | |
| 162 return peptides | |
| 163 | |
| 164 def read_pepxml(fp): | |
| 165 peptides = [] | |
| 166 for event, elem in ET.iterparse(fp): | |
| 167 if event == 'end': | |
| 168 if re.search('search_hit', elem.tag): | |
| 169 peptides.append(elem.get('peptide')) | |
| 170 return peptides | |
| 171 | |
| 172 def best_match(peptide, matches): | |
| 173 if not matches: | |
| 174 return None | |
| 175 elif len(matches) == 1: | |
| 176 return matches[0].copy() | |
| 177 elif 'taxon_rank' in matches[0]: | |
| 178 # find the most specific match (peptide is always the first column order field) | |
| 179 for col in reversed(pept2lca_extra_column_order[1:]): | |
| 180 col_id = col + "_id" if options.extra else col | |
| 181 for match in matches: | |
| 182 if 'taxon_rank' in match and match['taxon_rank'] == col: | |
| 183 return match.copy() | |
| 184 if col_id in match and match[col_id]: | |
| 185 return match.copy() | |
| 186 else: | |
| 187 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy() | |
| 188 return None | |
| 189 | |
| 190 def get_taxon_json(resp): | |
| 191 found_keys = set() | |
| 192 for i, pdict in enumerate(resp): | |
| 193 found_keys |= set(pdict.keys()) | |
| 194 taxa_cols = [] | |
| 195 for col in pept2lca_extra_column_order[-1:0:-1]: | |
| 196 if col + '_id' in found_keys: | |
| 197 taxa_cols.append(col) | |
| 198 id_to_node = dict() | |
| 199 | |
| 200 def get_node(id, name, rank, child, seq): | |
| 201 if id not in id_to_node: | |
| 202 data = {'count': 0, 'self_count': 0, 'valid_taxon': 1, 'rank': rank, 'sequences': []} | |
| 203 node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data} | |
| 204 id_to_node[id] = node | |
| 205 else: | |
| 206 node = id_to_node[id] | |
| 207 node['data']['count'] += 1 | |
| 208 if seq is not None and seq not in node['data']['sequences']: | |
| 209 node['data']['sequences'].append(seq) | |
| 210 if child is None: | |
| 211 node['data']['self_count'] += 1 | |
| 212 elif child['id'] not in node['kids']: | |
| 213 node['kids'].append(child['id']) | |
| 214 node['children'].append(child) | |
| 215 return node | |
| 216 root = get_node(1, 'root', 'no rank', None, None) | |
| 217 for i, pdict in enumerate(resp): | |
| 218 sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None)) | |
| 219 seq = sequence | |
| 220 child = None | |
| 221 for col in taxa_cols: | |
| 222 col_id = col + '_id' | |
| 223 if col_id in pdict and pdict.get(col_id): | |
| 224 col_name = col if col in found_keys else col + '_name' | |
| 225 child = get_node(pdict.get(col_id, None), pdict.get(col_name, ''), col, child, seq) | |
| 226 seq = None | |
| 227 if child is not None: | |
| 228 get_node(1, 'root', 'no rank', child, None) | |
| 229 return root | |
| 230 | |
| 231 def get_ec_json(resp): | |
| 232 ecMap = dict() | |
| 233 for pdict in resp: | |
| 234 if 'ec' in pdict: | |
| 235 for ec in pdict['ec']: | |
| 236 ec_number = ec['ec_number'] | |
| 237 if ec_number not in ecMap: | |
| 238 ecMap[ec_number] = [] | |
| 239 ecMap[ec_number].append(pdict) | |
| 240 | |
| 241 def get_ids(ec): | |
| 242 ids = [] | |
| 243 i = len(ec) | |
| 244 while i >= 0: | |
| 245 ids.append(ec[:i]) | |
| 246 i = ec.rfind('.', 0, i - 1) | |
| 247 return ids | |
| 248 id_to_node = dict() | |
| 249 | |
| 250 def get_node(id, name, child, seq): | |
| 251 if id not in id_to_node: | |
| 252 data = {'count': 0, 'self_count': 0, 'sequences': []} | |
| 253 node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data} | |
| 254 id_to_node[id] = node | |
| 255 else: | |
| 256 node = id_to_node[id] | |
| 257 node['data']['count'] += 1 | |
| 258 if seq is not None and seq not in node['data']['sequences']: | |
| 259 node['data']['sequences'].append(seq) | |
| 260 if child is None: | |
| 261 node['data']['self_count'] += 1 | |
| 262 elif child['id'] not in node['kids']: | |
| 263 node['kids'].append(child['id']) | |
| 264 node['children'].append(child) | |
| 265 return node | |
| 266 root = get_node(0, '-.-.-.-', None, None) | |
| 267 for i in range(1, 7): | |
| 268 child = get_node(str(i), '%s\n%s' % (str(i), ec_name_dict[str(i)]), None, None) | |
| 269 get_node(0, '-.-.-.-', child, None) | |
| 270 for i, pdict in enumerate(resp): | |
| 271 sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None)) | |
| 272 seq = sequence | |
| 273 if 'ec' in pdict: | |
| 274 for ec in pdict['ec']: | |
| 275 child = None | |
| 276 ec_number = ec['ec_number'] | |
| 277 for ec_id in get_ids(ec_number): | |
| 278 ec_name = str(ec_id) | |
| 279 child = get_node(ec_id, ec_name, child, seq) | |
| 280 seq = None | |
| 281 if child: | |
| 282 get_node(0, '-.-.-.-', child, None) | |
| 283 return root | |
| 284 | |
| 285 def get_taxon_dict(resp, column_order, extra=False, names=False): | |
| 286 found_keys = set() | |
| 287 results = [] | |
| 288 for i, pdict in enumerate(resp): | |
| 289 results.append(pdict) | |
| 290 found_keys |= set(pdict.keys()) | |
| 291 # print >> sys.stderr, "%s\n%s" % (pdict.keys(), found_keys) | |
| 292 column_names = [] | |
| 293 column_keys = [] | |
| 294 for col in column_order: | |
| 295 if col in found_keys: | |
| 296 column_names.append(col) | |
| 297 column_keys.append(col) | |
| 298 elif names: | |
| 299 col_id = col + '_id' | |
| 300 col_name = col + '_name' | |
| 301 if extra: | |
| 302 if col_id in found_keys: | |
| 303 column_names.append(col_id) | |
| 304 column_keys.append(col_id) | |
| 305 if names: | |
| 306 if col_name in found_keys: | |
| 307 column_names.append(col) | |
| 308 column_keys.append(col_name) | |
| 309 else: | |
| 310 if col + '_name' in found_keys: | |
| 311 column_names.append(col) | |
| 312 column_keys.append(col + '_name') | |
| 313 elif col + '_id' in found_keys: | |
| 314 column_names.append(col) | |
| 315 column_keys.append(col + '_id') | |
| 316 # print >> sys.stderr, "%s\n%s" % (column_names, column_keys) | |
| 317 taxa = dict() # peptide: [taxonomy] | |
| 318 for i, pdict in enumerate(results): | |
| 319 peptide = pdict['peptide'] if 'peptide' in pdict else None | |
| 320 if peptide and peptide not in taxa: | |
| 321 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] | |
| 322 taxa[peptide] = vals | |
| 323 return (taxa, column_names) | |
| 324 | |
| 325 def get_ec_dict(resp, extra=False): | |
| 326 ec_cols = ['ec_numbers', 'ec_protein_counts'] | |
| 327 if extra: | |
| 328 ec_cols.append('ec_names') | |
| 329 ec_dict = dict() | |
| 330 for i, pdict in enumerate(resp): | |
| 331 peptide = pdict['peptide'] | |
| 332 ec_numbers = [] | |
| 333 protein_counts = [] | |
| 334 ec_names = [] | |
| 335 if 'ec' in pdict: | |
| 336 for ec in pdict['ec']: | |
| 337 ec_numbers.append(ec['ec_number']) | |
| 338 protein_counts.append(str(ec['protein_count'])) | |
| 339 if extra: | |
| 340 ec_names.append(ec['name'] if 'name' in ec else '') | |
| 341 vals = [','.join(ec_numbers), ','.join(protein_counts)] | |
| 342 if extra: | |
| 343 vals.append(','.join(ec_names)) | |
| 344 ec_dict[peptide] = vals | |
| 345 return (ec_dict, ec_cols) | |
| 346 | |
| 347 def get_go_dict(resp, extra=False): | |
| 348 go_cols = ['go_terms', 'go_protein_counts'] | |
| 349 if extra: | |
| 350 go_cols.append('go_names') | |
| 351 go_dict = dict() | |
| 352 for i, pdict in enumerate(resp): | |
| 353 peptide = pdict['peptide'] | |
| 354 go_terms = [] | |
| 355 protein_counts = [] | |
| 356 go_names = [] | |
| 357 if 'go' in pdict: | |
| 358 for go in pdict['go']: | |
| 359 if 'go_term' in go: | |
| 360 go_terms.append(go['go_term']) | |
| 361 protein_counts.append(str(go['protein_count'])) | |
| 362 if extra: | |
| 363 go_names.append(go['name'] if 'name' in go else '') | |
| 364 else: | |
| 365 for go_type in go_types: | |
| 366 if go_type in go: | |
| 367 for _go in go[go_type]: | |
| 368 go_terms.append(_go['go_term']) | |
| 369 protein_counts.append(str(_go['protein_count'])) | |
| 370 if extra: | |
| 371 go_names.append(_go['name'] if 'name' in _go else '') | |
| 372 vals = [','.join(go_terms), ','.join(protein_counts)] | |
| 373 if extra: | |
| 374 vals.append(','.join(go_names)) | |
| 375 go_dict[peptide] = vals | |
| 376 return (go_dict, go_cols) | |
| 377 | |
| 378 def get_ipr_dict(resp, extra=False): | |
| 379 ipr_cols = ['ipr_codes', 'ipr_protein_counts'] | |
| 380 if extra: | |
| 381 ipr_cols.append('ipr_types') | |
| 382 ipr_cols.append('ipr_names') | |
| 383 ipr_dict = dict() | |
| 384 for i, pdict in enumerate(resp): | |
| 385 peptide = pdict['peptide'] | |
| 386 ipr_codes = [] | |
| 387 protein_counts = [] | |
| 388 ipr_names = [] | |
| 389 ipr_types = [] | |
| 390 if 'ipr' in pdict: | |
| 391 for ipr in pdict['ipr']: | |
| 392 if 'code' in ipr: | |
| 393 ipr_codes.append(ipr['code']) | |
| 394 protein_counts.append(str(ipr['protein_count'])) | |
| 395 if extra: | |
| 396 ipr_types.append(ipr['type'] if 'type' in ipr else '') | |
| 397 ipr_names.append(ipr['name'] if 'name' in ipr else '') | |
| 398 else: | |
| 399 for ipr_type in ipr_types: | |
| 400 if ipr_type in ipr: | |
| 401 for _ipr in ipr[ipr_type]: | |
| 402 ipr_codes.append(_ipr['code']) | |
| 403 protein_counts.append(str(_ipr['protein_count'])) | |
| 404 if extra: | |
| 405 ipr_types.append(ipr_type) | |
| 406 ipr_names.append(_ipr['name'] if 'name' in _ipr else '') | |
| 407 vals = [','.join(ipr_codes), ','.join(protein_counts)] | |
| 408 if extra: | |
| 409 vals.append(','.join(ipr_types)) | |
| 410 vals.append(','.join(ipr_names)) | |
| 411 ipr_dict[peptide] = vals | |
| 412 return (ipr_dict, ipr_cols) | |
| 413 | |
| 414 def write_ec_table(outfile, resp, column_order): | |
| 415 with open(outfile, 'w') as fh: | |
| 416 for i, pdict in enumerate(resp): | |
| 417 if 'ec' in pdict: | |
| 418 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
| 419 for ec in pdict['ec']: | |
| 420 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]] | |
| 421 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
| 422 | |
| 423 def write_go_table(outfile, resp, column_order): | |
| 424 with open(outfile, 'w') as fh: | |
| 425 for i, pdict in enumerate(resp): | |
| 426 if 'go' in pdict: | |
| 427 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
| 428 for go in pdict['go']: | |
| 429 if 'go_term' in go: | |
| 430 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]] | |
| 431 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
| 432 else: | |
| 433 for go_type in go_types: | |
| 434 if go_type in go: | |
| 435 for _go in go[go_type]: | |
| 436 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]] | |
| 437 vals.append(go_type) | |
| 438 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
| 439 | |
| 440 def write_ipr_table(outfile, resp, column_order): | |
| 441 with open(outfile, 'w') as fh: | |
| 442 for i, pdict in enumerate(resp): | |
| 443 if 'ipr' in pdict: | |
| 444 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
| 445 for ipr in pdict['ipr']: | |
| 446 if 'code' in ipr: | |
| 447 vals = [str(ipr[x]) if x in ipr and ipr[x] else '' for x in column_order[-1]] | |
| 448 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
| 449 else: | |
| 450 for ipr_type in ipr_types: | |
| 451 if ipr_type in ipr: | |
| 452 for _ipr in ipr[ipr_type]: | |
| 453 vals = [str(_ipr[x]) if x in _ipr and _ipr[x] else '' for x in column_order[-1]] | |
| 454 vals.append(ipr_type) | |
| 455 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
| 456 | |
| 457 # Parse Command Line | |
| 458 parser = optparse.OptionParser() | |
| 459 # unipept API choice | |
| 460 parser.add_option('-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca', 'pept2taxa', 'pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo'], | |
| 461 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo') | |
| 462 # input files | |
| 463 parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column') | |
| 464 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences') | |
| 465 parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences') | |
| 466 parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences') | |
| 467 parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences') | |
| 468 # Unipept Flags | |
| 469 parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records') | |
| 470 parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor') | |
| 471 parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor') | |
| 472 parser.add_option('-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component') | |
| 473 parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request') | |
| 474 | |
| 475 # output fields | |
| 476 parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs') | |
| 477 # Warn vs Error Flag | |
| 478 parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide') | |
| 479 # output files | |
| 480 parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results') | |
| 481 parser.add_option('-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results') | |
| 482 parser.add_option('-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results') | |
| 483 parser.add_option('-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results') | |
| 484 parser.add_option('-I', '--ipr_tsv', dest='ipr_tsv', default=None, help='Output file path for InterPro TAB-separated-values (.tsv) formatted results') | |
| 485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') | |
| 486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | |
| 487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | |
| 488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') | |
| 489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') | |
| 490 # debug | |
| 491 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') | |
| 492 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') | |
| 493 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') | |
| 494 (options, args) = parser.parse_args() | |
| 495 if options.version: | |
| 496 print('%s' % version) | |
| 497 sys.exit(0) | |
| 498 invalid_ec = 2 if options.strict else None | |
| 133 peptides = [] | 499 peptides = [] |
| 134 with open(filepath) as fp: | 500 # Get peptide sequences |
| 135 for i,line in enumerate(fp): | 501 if options.mzid: |
| 136 if line.strip() == '' or line.startswith('#'): | 502 peptides += read_mzid(options.mzid) |
| 137 continue | 503 if options.pepxml: |
| 138 fields = line.rstrip('\n').split('\t') | 504 peptides += read_pepxml(options.pepxml) |
| 139 peptide = fields[col] | 505 if options.tabular: |
| 140 if not re.match(pep_pat,peptide): | 506 peptides += read_tabular(options.tabular, options.column) |
| 141 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec) | 507 if options.fasta: |
| 142 peptides.append(peptide) | 508 peptides += read_fasta(options.fasta) |
| 143 return peptides | 509 if args and len(args) > 0: |
| 144 | 510 for i, peptide in enumerate(args): |
| 145 def get_fasta_entries(fp): | 511 if not re.match(pep_pat, peptide): |
| 146 name, seq = None, [] | 512 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec) |
| 147 for line in fp: | 513 peptides.append(peptide) |
| 148 line = line.rstrip() | 514 if len(peptides) < 1: |
| 149 if line.startswith(">"): | 515 warn_err("No peptides input!", exit_code=1) |
| 150 if name: yield (name, ''.join(seq)) | 516 column_order = pept2lca_column_order |
| 151 name, seq = line, [] | 517 if options.unipept == 'pept2prot': |
| 152 else: | 518 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order |
| 153 seq.append(line) | |
| 154 if name: yield (name, ''.join(seq)) | |
| 155 | |
| 156 def read_fasta(filepath): | |
| 157 peptides = [] | |
| 158 with open(filepath) as fp: | |
| 159 for id, peptide in get_fasta_entries(fp): | |
| 160 if not re.match(pep_pat,peptide): | |
| 161 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec) | |
| 162 peptides.append(peptide) | |
| 163 return peptides | |
| 164 | |
| 165 def read_mzid(fp): | |
| 166 peptides = [] | |
| 167 for event, elem in ET.iterparse(fp): | |
| 168 if event == 'end': | |
| 169 if re.search('PeptideSequence',elem.tag): | |
| 170 peptides.append(elem.text) | |
| 171 return peptides | |
| 172 | |
| 173 def read_pepxml(fp): | |
| 174 peptides = [] | |
| 175 for event, elem in ET.iterparse(fp): | |
| 176 if event == 'end': | |
| 177 if re.search('search_hit',elem.tag): | |
| 178 peptides.append(elem.get('peptide')) | |
| 179 return peptides | |
| 180 | |
def best_match(peptide, matches):
    """Pick the best unipept response dict among ``matches`` for one peptide.

    For taxonomy responses (dicts carrying 'taxon_rank') the most specific
    rank wins, scanning ranks from most to least specific; otherwise the
    match for the longest tryptic fragment is used. Returns a copy of the
    chosen dict, or None when nothing matches.
    """
    if not matches:
        return None
    if len(matches) == 1:
        return matches[0].copy()
    if 'taxon_rank' in matches[0]:
        # Walk the lineage columns from most specific to least specific
        # (the peptide column itself is always first, so it is skipped).
        for col in reversed(pept2lca_extra_column_order[1:]):
            col_id = col + "_id" if options.extra else col
            for candidate in matches:
                if candidate.get('taxon_rank') == col:
                    return candidate.copy()
                if candidate.get(col_id):
                    return candidate.copy()
        return None
    # Non-taxonomy responses: prefer the longest matched tryptic peptide.
    return sorted(matches, key=lambda m: len(m['peptide']))[-1].copy()
def get_taxon_json(resp):
    """Build a unipept-treeview style taxonomy tree (nested dicts) from responses.

    Each node carries 'id', 'name', 'children', plus a 'data' dict with
    'count' (total hits in the subtree path), 'self_count' (leaf hits),
    'rank', and the peptide 'sequences' attributed to it.
    """
    # Collect every key seen across all response dicts so we know which
    # lineage columns are actually present.
    found_keys = set()
    for i,pdict in enumerate(resp):
        found_keys |= set(pdict.keys())
    # Lineage columns present in the responses, least-specific last
    # (iterated [-1:0:-1] so the peptide column itself is excluded).
    taxa_cols = []
    for col in pept2lca_extra_column_order[-1:0:-1]:
        if col+'_id' in found_keys:
            taxa_cols.append(col)
    id_to_node = dict()
    def get_node(id,name,rank,child,seq):
        # Create-or-update a tree node; every call bumps its 'count',
        # links 'child' under it (deduplicated via 'kids'), and records
        # the peptide sequence the first time it is seen on this node.
        if id not in id_to_node:
            data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
            node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
            id_to_node[id] = node
        else:
            node = id_to_node[id]
        node['data']['count'] += 1
        if seq is not None and seq not in node['data']['sequences']:
            node['data']['sequences'].append(seq)
        if child is None:
            # No child means this call is for the terminal (most specific) rank.
            node['data']['self_count'] += 1
        elif child['id'] not in node['kids']:
            node['kids'].append(child['id'])
            node['children'].append(child)
        return node
    # NCBI taxon id 1 is the root of the taxonomy.
    root = get_node(1,'root','no rank',None,None)
    for i,pdict in enumerate(resp):
        sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
        seq = sequence
        child = None
        # Build the chain bottom-up: most specific rank first, each step
        # becoming the child of the next (less specific) rank.
        for col in taxa_cols:
            col_id = col+'_id'
            if col_id in pdict and pdict.get(col_id):
                col_name = col if col in found_keys else col+'_name'
                child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
                # Only attribute the sequence to the deepest node in the chain.
                seq = None
        if child:
            # Attach the completed lineage chain under the root.
            get_node(1,'root','no rank',child,None)
    return root
def get_ec_json(resp):
    """Build a treeview-style EC-number hierarchy (nested dicts) from responses.

    The tree root is the synthetic EC '-.-.-.-'; the first level holds the six
    top EC classes (named via the module-level ``ec_name_dict``); deeper levels
    are the successive EC prefixes of each annotated number.
    """
    # Index response dicts by each EC number they carry.
    ecMap = dict()
    for pdict in resp:
        if 'ec' in pdict:
            for ec in pdict['ec']:
                ec_number = ec['ec_number']
                if ec_number not in ecMap:
                    ecMap[ec_number] = []
                ecMap[ec_number].append(pdict)
    def get_ids(ec):
        # Expand '1.2.3.4' into its prefixes ['1.2.3.4', '1.2.3', '1.2', '1']
        # by repeatedly cutting at the last '.' before the current cut point.
        ids = []
        i = len(ec)
        while i >= 0:
            ids.append(ec[:i])
            i = ec.rfind('.',0,i - 1)
        return ids
    id_to_node = dict()
    def get_node(id,name,child,seq):
        # Create-or-update a tree node; every call bumps 'count', links the
        # child (deduplicated via 'kids'), and records the peptide sequence
        # the first time it is seen on this node.
        if id not in id_to_node:
            data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
            node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
            id_to_node[id] = node
        else:
            node = id_to_node[id]
        node['data']['count'] += 1
        if seq is not None and seq not in node['data']['sequences']:
            node['data']['sequences'].append(seq)
        if child is None:
            node['data']['self_count'] += 1
        elif child['id'] not in node['kids']:
            node['kids'].append(child['id'])
            node['children'].append(child)
        return node
    # Synthetic root plus the six fixed top-level EC classes.
    root = get_node(0,'-.-.-.-',None,None)
    for i in range(1,7):
        child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
        get_node(0,'-.-.-.-',child,None)
    for i,pdict in enumerate(resp):
        sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
        seq = sequence
        if 'ec' in pdict:
            for ec in pdict['ec']:
                child = None
                protein_count = ec['protein_count']
                ec_number = ec['ec_number']
                # Chain nodes bottom-up: full number first, then each prefix.
                for ec_id in get_ids(ec_number):
                    ec_name = str(ec_id)
                    child = get_node(ec_id,ec_name,child,seq)
                    # Attribute the sequence only to the deepest node.
                    seq = None
                if child:
                    # Attach the completed chain under the synthetic root.
                    get_node(0,'-.-.-.-',child,None)
    return root
def get_taxon_dict(resp, column_order, extra=False, names=False):
    """Flatten taxonomy responses into a per-peptide row table.

    Returns ``(taxa, column_names)`` where ``taxa`` maps each peptide to a
    list of stringified values (missing/falsy values become '') and
    ``column_names`` lists the output column headers actually present.
    """
    found_keys = set()
    results = []
    for pdict in resp:
        results.append(pdict)
        found_keys |= set(pdict.keys())
    column_names = []
    column_keys = []
    # Decide, per requested column, which response key feeds it.
    for col in column_order:
        if col in found_keys:
            column_names.append(col)
            column_keys.append(col)
        elif names:
            col_id = col + '_id'
            col_name = col + '_name'
            if extra:
                # With extra+names both the id column and the name column appear.
                if col_id in found_keys:
                    column_names.append(col_id)
                    column_keys.append(col_id)
                if col_name in found_keys:
                    column_names.append(col)
                    column_keys.append(col_name)
            else:
                # Names only: prefer the *_name key, fall back to *_id.
                if col_name in found_keys:
                    column_names.append(col)
                    column_keys.append(col_name)
                elif col_id in found_keys:
                    column_names.append(col)
                    column_keys.append(col_id)
    taxa = dict()  # peptide -> row of stringified values
    for pdict in results:
        peptide = pdict.get('peptide')
        # First response per peptide wins; later duplicates are ignored.
        if peptide and peptide not in taxa:
            taxa[peptide] = [str(pdict[k]) if pdict.get(k) else '' for k in column_keys]
    return (taxa, column_names)
def get_ec_dict(resp, extra=False):
    """Collapse per-peptide EC annotations into comma-joined columns.

    Returns ``(ec_dict, ec_cols)``: peptide -> [numbers, counts(, names)]
    where each element is a comma-joined string; ``ec_names`` is only
    produced when ``extra`` is set.
    """
    ec_cols = ['ec_numbers', 'ec_protein_counts']
    if extra:
        ec_cols.append('ec_names')
    ec_dict = dict()
    for pdict in resp:
        numbers = []
        counts = []
        labels = []
        # Responses without an 'ec' key still get an (empty) row.
        for ec in pdict.get('ec', []):
            numbers.append(ec['ec_number'])
            counts.append(str(ec['protein_count']))
            if extra:
                labels.append(ec.get('name', ''))
        row = [','.join(numbers), ','.join(counts)]
        if extra:
            row.append(','.join(labels))
        ec_dict[pdict['peptide']] = row
    return (ec_dict, ec_cols)
def get_go_dict(resp, extra=False):
    """Collapse per-peptide GO annotations into comma-joined columns.

    Handles both the flat response form (entries carrying 'go_term')
    and the domain-grouped form (one nested list per GO namespace in
    the module-level ``go_types``). Returns ``(go_dict, go_cols)``.
    """
    go_cols = ['go_terms', 'go_protein_counts']
    if extra:
        go_cols.append('go_names')
    go_dict = dict()
    for pdict in resp:
        terms = []
        counts = []
        labels = []
        for go in pdict.get('go', []):
            if 'go_term' in go:
                terms.append(go['go_term'])
                counts.append(str(go['protein_count']))
                if extra:
                    labels.append(go.get('name', ''))
            else:
                # Domain-grouped response: unwrap each namespace's list.
                for go_type in go_types:
                    if go_type in go:
                        for sub in go[go_type]:
                            terms.append(sub['go_term'])
                            counts.append(str(sub['protein_count']))
                            if extra:
                                labels.append(sub.get('name', ''))
        row = [','.join(terms), ','.join(counts)]
        if extra:
            row.append(','.join(labels))
        go_dict[pdict['peptide']] = row
    return (go_dict, go_cols)
def write_ec_table(outfile, resp, column_order):
    """Write one TSV line per (peptide, EC annotation) pair to ``outfile``.

    ``column_order[0]`` lists the peptide-level keys and ``column_order[-1]``
    the EC-entry keys; missing/falsy values are written as ''.
    """
    with open(outfile, 'w') as fh:
        for pdict in resp:
            if 'ec' not in pdict:
                continue
            left = [str(pdict[k]) if pdict.get(k) else '' for k in column_order[0]]
            for ec in pdict['ec']:
                right = [str(ec[k]) if ec.get(k) else '' for k in column_order[-1]]
                fh.write('%s\n' % '\t'.join(left + right))
def write_go_table(outfile, resp, column_order):
    """Write one TSV line per (peptide, GO annotation) pair to ``outfile``.

    Accepts both flat GO entries (carrying 'go_term') and domain-grouped
    entries keyed by the module-level ``go_types``; grouped lines get the
    namespace appended as an extra trailing column.
    """
    with open(outfile, 'w') as fh:
        for pdict in resp:
            if 'go' not in pdict:
                continue
            left = [str(pdict[k]) if pdict.get(k) else '' for k in column_order[0]]
            for go in pdict['go']:
                if 'go_term' in go:
                    right = [str(go[k]) if go.get(k) else '' for k in column_order[-1]]
                    fh.write('%s\n' % '\t'.join(left + right))
                else:
                    for go_type in go_types:
                        if go_type in go:
                            for sub in go[go_type]:
                                right = [str(sub[k]) if sub.get(k) else '' for k in column_order[-1]]
                                right.append(go_type)
                                fh.write('%s\n' % '\t'.join(left + right))
| 413 #Parse Command Line | 614 # map resp back to peptides |
| 414 parser = optparse.OptionParser() | 615 for peptide in peptides: |
| 415 # unipept API choice | 616 matches = list() |
| 416 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'], | 617 for part in pepToParts[peptide]: |
| 417 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' ) | 618 if part in respMap: |
| 418 # input files | 619 matches.append(respMap[part]) |
| 419 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) | 620 match = best_match(peptide, matches) |
| 420 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) | 621 if not match: |
| 421 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) | 622 unmatched_peptides.append(peptide) |
| 422 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) | 623 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] |
| 423 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) | 624 match = {'peptide': longest_tryptic_peptide} |
| 424 # Unipept Flags | 625 match['tryptic_peptide'] = match['peptide'] |
| 425 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) | 626 match['peptide'] = peptide |
| 426 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) | 627 peptideMatches.append(match) |
| 427 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) | |
| 428 parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' ) | |
| 429 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) | |
| 430 | |
| 431 # output fields | |
| 432 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) | |
| 433 # Warn vs Error Flag | |
| 434 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) | |
| 435 # output files | |
| 436 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') | |
| 437 parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results') | |
| 438 parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results') | |
| 439 parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results') | |
| 440 parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') | |
| 441 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | |
| 442 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | |
| 443 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) | |
| 444 parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' ) | |
| 445 # debug | |
| 446 parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' ) | |
| 447 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) | |
| 448 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) | |
| 449 (options, args) = parser.parse_args() | |
| 450 if options.version: | |
| 451 print >> sys.stdout,"%s" % version | |
| 452 sys.exit(0) | |
# Warn-only (None) unless --strict, in which case invalid peptides exit(2).
invalid_ec = 2 if options.strict else None
peptides = []
# Gather peptide sequences from every requested input source.
if options.mzid:
    peptides += read_mzid(options.mzid)
if options.pepxml:
    peptides += read_pepxml(options.pepxml)
if options.tabular:
    peptides += read_tabular(options.tabular, options.column)
if options.fasta:
    peptides += read_fasta(options.fasta)
if args and len(args) > 0:
    # Positional arguments are taken as literal peptide sequences.
    for i, peptide in enumerate(args):
        if not re.match(pep_pat, peptide):
            warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec)
        peptides.append(peptide)
if len(peptides) < 1:
    warn_err("No peptides input!", exit_code=1)
# Select the output column layout for the chosen unipept application.
column_order = pept2lca_column_order
if options.unipept == 'pept2prot':
    column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
else:
    if options.extra or options.names:
        column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
    else:
        column_order = pept2lca_column_order
# Map each input peptide to its tryptic fragments (cleave after K/R unless
# followed by P), since unipept indexes tryptic peptides.
pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
partToPeps = {}
for peptide, parts in pepToParts.items():
    if options.debug:
        print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr)
    for part in parts:
        if len(part) > 50:
            warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide, len(part), part), exit_code=None)
        # unipept only accepts tryptic peptides of length 5..50.
        if 5 <= len(part) <= 50:
            partToPeps.setdefault(part, []).append(peptide)
trypticPeptides = list(partToPeps.keys())
| 490 ## unipept | 641 match['tryptic_peptide'] = match['peptide'] |
| 491 unipept_resp = [] | 642 match['peptide'] = peptide |
| 492 idx = range(0,len(trypticPeptides),options.max_request) | 643 peptideMatches.append(match) |
| 493 idx.append(len(trypticPeptides)) | 644 resp = peptideMatches |
| 494 for i in range(len(idx)-1): | 645 if options.debug: |
| 495 post_data = [] | 646 print("\nmapped response: %s\n" % str(resp), file=sys.stderr) |
| 496 if options.equate_il: | 647 # output results |
| 497 post_data.append(("equate_il","true")) | 648 if not (options.unmatched or options.json or options.tsv or options.csv): |
| 498 if options.names or options.json: | 649 print(str(resp)) |
| 499 post_data.append(("extra","true")) | 650 if options.unmatched: |
| 500 post_data.append(("names","true")) | 651 with open(options.unmatched, 'w') as outputFile: |
| 501 elif options.extra or options.json: | 652 for peptide in peptides: |
| 502 post_data.append(("extra","true")) | 653 if peptide in unmatched_peptides: |
| 503 if options.domains: | 654 outputFile.write("%s\n" % peptide) |
| 504 post_data.append(("domains","true")) | 655 if options.json: |
| 505 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] | 656 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']: |
| 506 if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data)) | 657 root = get_taxon_json(resp) |
| 507 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} | 658 with open(options.json, 'w') as outputFile: |
| 508 ## headers = {'Accept': 'application/json'} | 659 outputFile.write(json.dumps(root)) |
| 509 url = '%s/%s' % (options.url.rstrip('/'),options.unipept) | 660 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct']: |
| 510 if options.get: | 661 with open(options.json, 'w') as outputFile: |
| 511 params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data]) | 662 outputFile.write(str(resp)) |
| 512 url = '%s.json?%s' % (url,params) | 663 if options.ec_json: |
| 513 req = urllib2.Request( url ) | 664 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: |
| 514 else: | 665 root = get_ec_json(resp) |
| 515 url = '%s.json' % (url) | 666 with open(options.ec_json, 'w') as outputFile: |
| 516 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) | 667 outputFile.write(json.dumps(root)) |
| 517 if options.debug: print >> sys.stdout, "url: %s\n" % (str(url)) | 668 if options.tsv or options.csv: |
| 518 try: | 669 rows = [] |
| 519 resp = urllib2.urlopen( req ) | 670 column_names = None |
| 520 if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode())) | 671 if options.unipept in ['pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo']: |
| 521 if resp.getcode() == 200: | 672 taxa = None |
| 522 unipept_resp += json.loads( urllib2.urlopen( req ).read() ) | 673 ec_dict = None |
| 523 except Exception, e: | 674 go_dict = None |
| 524 warn_err('HTTP Error %s\n' % (str(e)),exit_code=None) | 675 ipr_dict = None |
# ---------------------------------------------------------------------------
# Map the per-tryptic-fragment responses back onto the original peptides.
# ---------------------------------------------------------------------------
unmatched_peptides = []
peptideMatches = []
if options.debug:
    print("unipept response: %s\n" % str(unipept_resp), file=sys.stderr)
if options.unipept in ['pept2prot', 'pept2taxa']:
    # Multiple entries per tryptic peptide; keep only entries whose dupkey
    # is shared by ALL tryptic parts of an input peptide (set intersection).
    dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id'
    mapping = {}
    for match in unipept_resp:
        mapping.setdefault(match['peptide'], []).append(match)
    for peptide in peptides:
        keyToMatch = None
        for part in pepToParts[peptide]:
            if part in mapping:
                temp = {match[dupkey]: match for match in mapping[part]}
                if keyToMatch:
                    # Drop keys not confirmed by this tryptic part.
                    for k in set(keyToMatch.keys()) - set(temp.keys()):
                        del keyToMatch[k]
                else:
                    keyToMatch = temp
        if not keyToMatch:
            unmatched_peptides.append(peptide)
        else:
            for key, match in keyToMatch.items():
                match['tryptic_peptide'] = match['peptide']
                match['peptide'] = peptide
                peptideMatches.append(match)
elif options.unipept in ['pept2lca', 'peptinfo']:
    # One response per tryptic peptide; pick the most specific via best_match.
    respMap = {v['peptide']: v for v in unipept_resp}
    for peptide in peptides:
        matches = [respMap[part] for part in pepToParts[peptide] if part in respMap]
        match = best_match(peptide, matches)
        if not match:
            # Record as unmatched but still emit a placeholder row.
            unmatched_peptides.append(peptide)
            longest_tryptic_peptide = sorted(pepToParts[peptide], key=len)[-1]
            match = {'peptide': longest_tryptic_peptide}
        match['tryptic_peptide'] = match['peptide']
        match['peptide'] = peptide
        peptideMatches.append(match)
else:
    # Functional endpoints: only responses with a total_protein_count count.
    respMap = {v['peptide']: v for v in unipept_resp}
    for peptide in peptides:
        matches = [respMap[part] for part in pepToParts[peptide]
                   if part in respMap and 'total_protein_count' in respMap[part]]
        match = best_match(peptide, matches)
        if not match:
            unmatched_peptides.append(peptide)
            longest_tryptic_peptide = sorted(pepToParts[peptide], key=len)[-1]
            match = {'peptide': longest_tryptic_peptide}
        match['tryptic_peptide'] = match['peptide']
        match['peptide'] = peptide
        peptideMatches.append(match)
resp = peptideMatches
if options.debug:
    print("\nmapped response: %s\n" % str(resp), file=sys.stderr)
| 589 ## output results | |
| 590 if not (options.unmatched or options.json or options.tsv or options.csv): | |
| 591 print >> sys.stdout, str(resp) | |
| 592 if options.unmatched: | |
| 593 with open(options.unmatched,'w') as outputFile: | |
| 594 for peptide in peptides: | |
| 595 if peptide in unmatched_peptides: | |
| 596 outputFile.write("%s\n" % peptide) | |
| 597 if options.json: | |
| 598 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']: | |
| 599 root = get_taxon_json(resp) | |
| 600 with open(options.json,'w') as outputFile: | |
| 601 outputFile.write(json.dumps(root)) | |
| 602 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']: | |
| 603 with open(options.json,'w') as outputFile: | |
| 604 outputFile.write(str(resp)) | |
| 605 if options.ec_json: | |
| 606 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: | |
| 607 root = get_ec_json(resp) | |
| 608 with open(options.ec_json,'w') as outputFile: | |
| 609 outputFile.write(json.dumps(root)) | |
| 610 if options.tsv or options.csv: | |
| 611 rows = [] | |
| 612 column_names = None | |
| 613 if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']: | |
| 614 taxa = None | |
| 615 ec_dict = None | |
| 616 go_dict = None | |
| 617 if options.unipept in ['peptinfo']: | |
| 618 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) | |
| 619 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: | |
| 620 (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra) | |
| 621 if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']: | |
| 622 (go_dict,go_cols) = get_go_dict(resp, extra=options.extra) | |
| 623 for i,pdict in enumerate(resp): | |
| 624 peptide = pdict['peptide'] | |
| 625 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' | |
| 626 column_names = ['peptide', 'total_protein_count'] | |
| 627 vals = [peptide,total_protein_count] | |
| 628 if ec_dict: | |
| 629 vals += ec_dict[peptide] | |
| 630 column_names += ec_cols | |
| 631 if go_dict: | |
| 632 vals += go_dict[peptide] | |
| 633 column_names += go_cols | |
| 634 if taxa: | |
| 635 vals += taxa[peptide][1:] | |
| 636 column_names += taxon_cols[1:] | |
| 637 rows.append(vals) | |
| 638 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: | |
| 639 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) | |
| 640 column_names = taxon_cols | |
| 641 rows = taxa.values() | |
| 642 for peptide,vals in taxa.iteritems(): | |
| 643 rows.append(vals) | |
| 644 if options.tsv: | |
| 645 with open(options.tsv,'w') as outputFile: | |
| 646 if column_names: | |
| 647 outputFile.write("#%s\n"% '\t'.join(column_names)) | |
| 648 for vals in rows: | |
| 649 outputFile.write("%s\n"% '\t'.join(vals)) | |
| 650 if options.csv: | |
| 651 with open(options.csv,'w') as outputFile: | |
| 652 if column_names: | |
| 653 outputFile.write("%s\n"% ','.join(column_names)) | |
| 654 for vals in rows: | |
| 655 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals])) | |
| 656 if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: | |
| 657 column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order | |
| 658 write_ec_table(options.ec_tsv, resp, column_order) | |
| 659 if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']: | |
| 660 column_order = pept2go_extra_column_order if options.extra else pept2go_column_order | |
| 661 write_go_table(options.go_tsv, resp, column_order) | |
| 662 | |
| 663 if __name__ == "__main__" : __main__() |
