Mercurial > repos > bcclaywell > argo_navis
annotate venv/lib/python2.7/site-packages/requests/packages/chardet/sbcharsetprober.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
rev | line source |
---|---|
0
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
1 ######################## BEGIN LICENSE BLOCK ######################## |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
2 # The Original Code is Mozilla Universal charset detector code. |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
3 # |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
4 # The Initial Developer of the Original Code is |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
5 # Netscape Communications Corporation. |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
6 # Portions created by the Initial Developer are Copyright (C) 2001 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
7 # the Initial Developer. All Rights Reserved. |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
8 # |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
9 # Contributor(s): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
10 # Mark Pilgrim - port to Python |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
11 # Shy Shalom - original C code |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
12 # |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
13 # This library is free software; you can redistribute it and/or |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
14 # modify it under the terms of the GNU Lesser General Public |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
15 # License as published by the Free Software Foundation; either |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
16 # version 2.1 of the License, or (at your option) any later version. |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
17 # |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
18 # This library is distributed in the hope that it will be useful, |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
21 # Lesser General Public License for more details. |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
22 # |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
23 # You should have received a copy of the GNU Lesser General Public |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
24 # License along with this library; if not, write to the Free Software |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
26 # 02110-1301 USA |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
27 ######################### END LICENSE BLOCK ######################### |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
28 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
29 import sys |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
30 from . import constants |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
31 from .charsetprober import CharSetProber |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
32 from .compat import wrap_ord |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
33 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
34 SAMPLE_SIZE = 64 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
35 SB_ENOUGH_REL_THRESHOLD = 1024 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
36 POSITIVE_SHORTCUT_THRESHOLD = 0.95 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
37 NEGATIVE_SHORTCUT_THRESHOLD = 0.05 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
38 SYMBOL_CAT_ORDER = 250 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
39 NUMBER_OF_SEQ_CAT = 4 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
40 POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
41 #NEGATIVE_CAT = 0 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
42 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
43 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
44 class SingleByteCharSetProber(CharSetProber): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
45 def __init__(self, model, reversed=False, nameProber=None): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
46 CharSetProber.__init__(self) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
47 self._mModel = model |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
48 # TRUE if we need to reverse every pair in the model lookup |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
49 self._mReversed = reversed |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
50 # Optional auxiliary prober for name decision |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
51 self._mNameProber = nameProber |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
52 self.reset() |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
53 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
54 def reset(self): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
55 CharSetProber.reset(self) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
56 # char order of last character |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
57 self._mLastOrder = 255 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
58 self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
59 self._mTotalSeqs = 0 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
60 self._mTotalChar = 0 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
61 # characters that fall in our sampling range |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
62 self._mFreqChar = 0 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
63 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
64 def get_charset_name(self): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
65 if self._mNameProber: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
66 return self._mNameProber.get_charset_name() |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
67 else: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
68 return self._mModel['charsetName'] |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
69 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
70 def feed(self, aBuf): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
71 if not self._mModel['keepEnglishLetter']: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
72 aBuf = self.filter_without_english_letters(aBuf) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
73 aLen = len(aBuf) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
74 if not aLen: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
75 return self.get_state() |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
76 for c in aBuf: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
77 order = self._mModel['charToOrderMap'][wrap_ord(c)] |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
78 if order < SYMBOL_CAT_ORDER: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
79 self._mTotalChar += 1 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
80 if order < SAMPLE_SIZE: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
81 self._mFreqChar += 1 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
82 if self._mLastOrder < SAMPLE_SIZE: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
83 self._mTotalSeqs += 1 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
84 if not self._mReversed: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
85 i = (self._mLastOrder * SAMPLE_SIZE) + order |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
86 model = self._mModel['precedenceMatrix'][i] |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
87 else: # reverse the order of the letters in the lookup |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
88 i = (order * SAMPLE_SIZE) + self._mLastOrder |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
89 model = self._mModel['precedenceMatrix'][i] |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
90 self._mSeqCounters[model] += 1 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
91 self._mLastOrder = order |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
92 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
93 if self.get_state() == constants.eDetecting: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
94 if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
95 cf = self.get_confidence() |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
96 if cf > POSITIVE_SHORTCUT_THRESHOLD: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
97 if constants._debug: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
98 sys.stderr.write('%s confidence = %s, we have a' |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
99 'winner\n' % |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
100 (self._mModel['charsetName'], cf)) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
101 self._mState = constants.eFoundIt |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
102 elif cf < NEGATIVE_SHORTCUT_THRESHOLD: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
103 if constants._debug: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
104 sys.stderr.write('%s confidence = %s, below negative' |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
105 'shortcut threshhold %s\n' % |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
106 (self._mModel['charsetName'], cf, |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
107 NEGATIVE_SHORTCUT_THRESHOLD)) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
108 self._mState = constants.eNotMe |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
109 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
110 return self.get_state() |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
111 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
112 def get_confidence(self): |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
113 r = 0.01 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
114 if self._mTotalSeqs > 0: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
115 r = ((1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
116 / self._mModel['mTypicalPositiveRatio']) |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
117 r = r * self._mFreqChar / self._mTotalChar |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
118 if r >= 1.0: |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
119 r = 0.99 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
120 return r |