annotate unipept.py @ 0:c643e1fe2faa draft default tip

Imported from capsule None
author jjohnson
date Mon, 30 Mar 2015 18:00:54 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
1 #!/usr/bin/env python
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
2 """
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
3 #
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
4 #------------------------------------------------------------------------------
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
5 # University of Minnesota
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
6 # Copyright 2015, Regents of the University of Minnesota
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
7 #------------------------------------------------------------------------------
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
8 # Author:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
9 #
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
10 # James E Johnson
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
11 #
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
12 #------------------------------------------------------------------------------
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
13 """
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
14
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
15 import json
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
16 import logging
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
17 import optparse
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
18 from optparse import OptionParser
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
19 import os
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
20 import sys
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
21 import re
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
22 import urllib
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
23 import urllib2
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
24 try:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
25 import xml.etree.cElementTree as ET
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
26 except ImportError:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
27 import xml.etree.ElementTree as ET
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
28
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
29 def warn_err(msg,exit_code=1):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
30 sys.stderr.write(msg)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
31 if exit_code:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
32 sys.exit(exit_code)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
33
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
34 def read_fasta(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
35 name, seq = None, []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
36 for line in fp:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
37 line = line.rstrip()
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
38 if line.startswith(">"):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
39 if name: yield (name, ''.join(seq))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
40 name, seq = line, []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
41 else:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
42 seq.append(line)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
43 if name: yield (name, ''.join(seq))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
44
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
45 def read_mzid(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
46 peptides = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
47 for event, elem in ET.iterparse(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
48 if event == 'end':
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
49 if re.search('PeptideSequence',elem.tag):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
50 peptides.append(elem.text)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
51 return peptides
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
52
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
53 def read_pepxml(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
54 peptides = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
55 for event, elem in ET.iterparse(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
56 if event == 'end':
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
57 if re.search('search_hit',elem.tag):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
58 peptides.append(elem.get('peptide'))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
59 return peptides
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
60
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
61 def __main__():
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
62 #Parse Command Line
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
63 parser = optparse.OptionParser()
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
64 # unipept API
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
65 parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
66 # files
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
67 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
68 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
69 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
70 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
71 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
72 # Unipept Flags
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
73 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
74 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
75 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
76 # Warn vs Error Flag
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
77 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
78 # outputs
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
79 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
80 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
81 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
82 parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
83 (options, args) = parser.parse_args()
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
84 invalid_ec = 2 if options.strict else None
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
85 peptides = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
86 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
87 ## Get peptide sequences
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
88 if options.mzid:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
89 peptides += read_mzid(options.mzid)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
90 if options.pepxml:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
91 peptides += read_pepxml(options.pepxml)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
92 if options.tabular:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
93 with open(options.tabular) as fp:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
94 for i,line in enumerate(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
95 if line.strip() == '' or line.startswith('#'):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
96 continue
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
97 fields = line.rstrip('\n').split('\t')
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
98 peptide = fields[options.column]
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
99 if not re.match(pep_pat,peptide):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
100 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
101 peptides.append(peptide)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
102 if options.fasta:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
103 with open(options.fasta) as fp:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
104 for id, peptide in read_fasta(fp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
105 if not re.match(pep_pat,peptide):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
106 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
107 peptides.append(peptide)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
108 if args and len(args) > 0:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
109 for i,peptide in enumerate(args):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
110 if not re.match(pep_pat,peptide):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
111 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
112 peptides.append(peptide)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
113 if len(peptides) < 1:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
114 warn_err("No peptides input!",exit_code=1)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
115 ## unipept
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
116 post_data = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
117 if options.equate_il:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
118 post_data.append(("equate_il","true"))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
119 if options.names:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
120 post_data.append(("extra","true"))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
121 post_data.append(("names","true"))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
122 elif options.extra:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
123 post_data.append(("extra","true"))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
124 post_data += [('input[]', x) for x in peptides]
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
125 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
126 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
127 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
128 resp = json.loads( urllib2.urlopen( req ).read() )
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
129 ## output results
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
130 if not (options.mismatch or options.json or options.tsv or options.csv):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
131 print >> sys.stdout, str(resp)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
132 if options.mismatch:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
133 peptides_matched = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
134 for i,pdict in enumerate(resp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
135 peptides_matched.append(pdict['peptide'])
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
136 with open(options.mismatch,'w') as outputFile:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
137 for peptide in peptides:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
138 if not peptide in peptides_matched:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
139 outputFile.write("%s\n" % peptide)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
140 if options.json:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
141 with open(options.json,'w') as outputFile:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
142 outputFile.write(str(resp))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
143 if options.tsv or options.csv:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
144 # 'pept2lca','pept2taxa','pept2prot'
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
145 pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
146 pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
147 column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
148 found_keys = set()
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
149 results = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
150 for i,pdict in enumerate(resp):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
151 results.append(pdict)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
152 found_keys |= set(pdict.keys())
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
153 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
154 column_names = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
155 column_keys = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
156 for col in column_order:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
157 if col in found_keys:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
158 column_names.append(col)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
159 column_keys.append(col)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
160 elif col+'_name' in found_keys:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
161 column_names.append(col)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
162 column_keys.append(col+'_name')
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
163 elif col+'_id' in found_keys:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
164 column_names.append(col)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
165 column_keys.append(col+'_id')
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
166 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
167 taxa = []
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
168 for i,pdict in enumerate(results):
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
169 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
170 taxa.append(vals)
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
171 if options.tsv:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
172 with open(options.tsv,'w') as outputFile:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
173 outputFile.write("#%s\n"% '\t'.join(column_names))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
174 for vals in taxa:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
175 outputFile.write("%s\n"% '\t'.join(vals))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
176 if options.csv:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
177 with open(options.csv,'w') as outputFile:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
178 outputFile.write("%s\n"% ','.join(column_names))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
179 for vals in taxa:
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
180 outputFile.write("%s\n"% ','.join([v if v else "" for v in vals]))
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
181
c643e1fe2faa Imported from capsule None
jjohnson
parents:
diff changeset
182 if __name__ == "__main__" : __main__()