Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/site-packages/docutils/utils/smartquotes.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d67268158946 |
---|---|
1 #!/usr/bin/python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # :Id: $Id: smartquotes.py 7716 2013-08-21 21:54:57Z milde $ | |
5 # :Copyright: © 2010 Günter Milde, | |
6 # original `SmartyPants`_: © 2003 John Gruber | |
7 # smartypants.py: © 2004, 2007 Chad Miller | |
8 # :Maintainer: docutils-develop@lists.sourceforge.net | |
9 # :License: Released under the terms of the `2-Clause BSD license`_, in short: | |
10 # | |
11 # Copying and distribution of this file, with or without modification, | |
12 # are permitted in any medium without royalty provided the copyright | |
13 # notices and this notice are preserved. | |
14 # This file is offered as-is, without any warranty. | |
15 # | |
16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
17 | |
18 | |
19 r""" | |
20 ======================== | |
21 SmartyPants for Docutils | |
22 ======================== | |
23 | |
24 Synopsis | |
25 ======== | |
26 | |
27 Smart-quotes for Docutils. | |
28 | |
29 The original "SmartyPants" is a free web publishing plug-in for Movable Type, | |
30 Blosxom, and BBEdit that easily translates plain ASCII punctuation characters | |
31 into "smart" typographic punctuation characters. | |
32 | |
33 `smartypants.py`, endeavours to be a functional port of | |
34 SmartyPants to Python, for use with Pyblosxom_. | |
35 | |
36 `smartquotes.py` is an adaption of Smartypants to Docutils_. By using Unicode | |
37 characters instead of HTML entities for typographic quotes, it works for any | |
38 output format that supports Unicode. | |
39 | |
40 Authors | |
41 ======= | |
42 | |
43 `John Gruber`_ did all of the hard work of writing this software in Perl for | |
44 `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ | |
45 ported it to Python to use with Pyblosxom_. | |
46 Adapted to Docutils_ by Günter Milde | |
47 | |
48 Additional Credits | |
49 ================== | |
50 | |
51 Portions of the SmartyPants original work are based on Brad Choate's nifty | |
52 MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to | |
53 this plug-in. Brad Choate is a fine hacker indeed. | |
54 | |
55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta | |
56 testing of the original SmartyPants. | |
57 | |
58 `Rael Dornfest`_ ported SmartyPants to Blosxom. | |
59 | |
60 .. _Brad Choate: http://bradchoate.com/ | |
61 .. _Jeremy Hedley: http://antipixel.com/ | |
62 .. _Charles Wiltgen: http://playbacktime.com/ | |
63 .. _Rael Dornfest: http://raelity.org/ | |
64 | |
65 | |
66 Copyright and License | |
67 ===================== | |
68 | |
69 SmartyPants_ license (3-Clause BSD license): | |
70 | |
71 Copyright (c) 2003 John Gruber (http://daringfireball.net/) | |
72 All rights reserved. | |
73 | |
74 Redistribution and use in source and binary forms, with or without | |
75 modification, are permitted provided that the following conditions are | |
76 met: | |
77 | |
78 * Redistributions of source code must retain the above copyright | |
79 notice, this list of conditions and the following disclaimer. | |
80 | |
81 * Redistributions in binary form must reproduce the above copyright | |
82 notice, this list of conditions and the following disclaimer in | |
83 the documentation and/or other materials provided with the | |
84 distribution. | |
85 | |
86 * Neither the name "SmartyPants" nor the names of its contributors | |
87 may be used to endorse or promote products derived from this | |
88 software without specific prior written permission. | |
89 | |
90 This software is provided by the copyright holders and contributors | |
91 "as is" and any express or implied warranties, including, but not | |
92 limited to, the implied warranties of merchantability and fitness for | |
93 a particular purpose are disclaimed. In no event shall the copyright | |
94 owner or contributors be liable for any direct, indirect, incidental, | |
95 special, exemplary, or consequential damages (including, but not | |
96 limited to, procurement of substitute goods or services; loss of use, | |
97 data, or profits; or business interruption) however caused and on any | |
98 theory of liability, whether in contract, strict liability, or tort | |
99 (including negligence or otherwise) arising in any way out of the use | |
100 of this software, even if advised of the possibility of such damage. | |
101 | |
102 smartypants.py license (2-Clause BSD license): | |
103 | |
104 smartypants.py is a derivative work of SmartyPants. | |
105 | |
106 Redistribution and use in source and binary forms, with or without | |
107 modification, are permitted provided that the following conditions are | |
108 met: | |
109 | |
110 * Redistributions of source code must retain the above copyright | |
111 notice, this list of conditions and the following disclaimer. | |
112 | |
113 * Redistributions in binary form must reproduce the above copyright | |
114 notice, this list of conditions and the following disclaimer in | |
115 the documentation and/or other materials provided with the | |
116 distribution. | |
117 | |
118 This software is provided by the copyright holders and contributors | |
119 "as is" and any express or implied warranties, including, but not | |
120 limited to, the implied warranties of merchantability and fitness for | |
121 a particular purpose are disclaimed. In no event shall the copyright | |
122 owner or contributors be liable for any direct, indirect, incidental, | |
123 special, exemplary, or consequential damages (including, but not | |
124 limited to, procurement of substitute goods or services; loss of use, | |
125 data, or profits; or business interruption) however caused and on any | |
126 theory of liability, whether in contract, strict liability, or tort | |
127 (including negligence or otherwise) arising in any way out of the use | |
128 of this software, even if advised of the possibility of such damage. | |
129 | |
130 .. _John Gruber: http://daringfireball.net/ | |
131 .. _Chad Miller: http://web.chad.org/ | |
132 | |
133 .. _Pyblosxom: http://pyblosxom.bluesock.org/ | |
134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/ | |
135 .. _Movable Type: http://www.movabletype.org/ | |
136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
137 .. _Docutils: http://docutils.sf.net/ | |
138 | |
139 Description | |
140 =========== | |
141 | |
142 SmartyPants can perform the following transformations: | |
143 | |
144 - Straight quotes ( " and ' ) into "curly" quote characters | |
145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters | |
146 - Dashes (``--`` and ``---``) into en- and em-dash entities | |
147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity | |
148 | |
149 This means you can write, edit, and save your posts using plain old | |
150 ASCII straight quotes, plain dashes, and plain dots, but your published | |
151 posts (and final HTML output) will appear with smart quotes, em-dashes, | |
152 and proper ellipses. | |
153 | |
154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, | |
155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to | |
156 display text where smart quotes and other "smart punctuation" would not be | |
157 appropriate, such as source code or example markup. | |
158 | |
159 | |
160 Backslash Escapes | |
161 ================= | |
162 | |
163 If you need to use literal straight quotes (or plain hyphens and | |
164 periods), SmartyPants accepts the following backslash escape sequences | |
165 to force non-smart punctuation. It does so by transforming the escape | |
166 sequence into a character: | |
167 | |
168 ======== ===== ========= | |
169 Escape Value Character | |
170 ======== ===== ========= | |
171 ``\\\\`` \ \\ | |
172 \\" " " | |
173 \\' ' ' | |
174 \\. . . | |
175 \\- - \- | |
176 \\` ` \` | |
177 ======== ===== ========= | |
178 | |
179 This is useful, for example, when you want to use straight quotes as | |
180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac. | |
181 | |
182 Options | |
183 ======= | |
184 | |
185 For Pyblosxom users, the ``smartypants_attributes`` attribute is where you | |
186 specify configuration options. | |
187 | |
188 Numeric values are the easiest way to configure SmartyPants' behavior: | |
189 | |
190 "0" | |
191 Suppress all transformations. (Do nothing.) | |
192 "1" | |
193 Performs default SmartyPants transformations: quotes (including | |
194 \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) | |
195 is used to signify an em-dash; there is no support for en-dashes. | |
196 | |
197 "2" | |
198 Same as smarty_pants="1", except that it uses the old-school typewriter | |
199 shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" | |
200 (dash dash dash) | |
201 for em-dashes. | |
202 | |
203 "3" | |
204 Same as smarty_pants="2", but inverts the shorthand for dashes: | |
205 "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for | |
206 en-dashes. | |
207 | |
208 "-1" | |
209 Stupefy mode. Reverses the SmartyPants transformation process, turning | |
210 the characters produced by SmartyPants into their ASCII equivalents. | |
211 E.g. "“" is turned into a simple double-quote (\"), "—" is | |
212 turned into two dashes, etc. | |
213 | |
214 | |
215 The following single-character attribute values can be combined to toggle | |
216 individual transformations from within the smarty_pants attribute. For | |
217 example, to educate normal quotes and em-dashes, but not ellipses or | |
218 \`\`backticks'' -style quotes: | |
219 | |
220 ``py['smartypants_attributes'] = "qd"`` | |
221 | |
222 "q" | |
223 Educates normal quote characters: (") and ('). | |
224 | |
225 "b" | |
226 Educates \`\`backticks'' -style double quotes. | |
227 | |
228 "B" | |
229 Educates \`\`backticks'' -style double quotes and \`single' quotes. | |
230 | |
231 "d" | |
232 Educates em-dashes. | |
233 | |
234 "D" | |
235 Educates em-dashes and en-dashes, using old-school typewriter shorthand: | |
236 (dash dash) for en-dashes, (dash dash dash) for em-dashes. | |
237 | |
238 "i" | |
239 Educates em-dashes and en-dashes, using inverted old-school typewriter | |
240 shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. | |
241 | |
242 "e" | |
243 Educates ellipses. | |
244 | |
245 "w" | |
246 Translates any instance of ``&quot;`` into a normal double-quote character. | |
247 This should be of no interest to most people, but of particular interest | |
248 to anyone who writes their posts using Dreamweaver, as Dreamweaver | |
249 inexplicably uses this entity to represent a literal double-quote | |
250 character. SmartyPants only educates normal quotes, not entities (because | |
251 ordinarily, entities are used for the explicit purpose of representing the | |
252 specific character they represent). The "w" option must be used in | |
253 conjunction with one (or both) of the other quote options ("q" or "b"). | |
254 Thus, if you wish to apply all SmartyPants transformations (quotes, en- | |
255 and em-dashes, and ellipses) and also translate ``&quot;`` entities into | |
256 regular quotes so SmartyPants can educate them, you should pass the | |
257 following to the smarty_pants attribute: | |
258 | |
259 | |
260 Caveats | |
261 ======= | |
262 | |
263 Why You Might Not Want to Use Smart Quotes in Your Weblog | |
264 --------------------------------------------------------- | |
265 | |
266 For one thing, you might not care. | |
267 | |
268 Most normal, mentally stable individuals do not take notice of proper | |
269 typographic punctuation. Many design and typography nerds, however, break | |
270 out in a nasty rash when they encounter, say, a restaurant sign that uses | |
271 a straight apostrophe to spell "Joe's". | |
272 | |
273 If you're the sort of person who just doesn't care, you might well want to | |
274 continue not caring. Using straight quotes -- and sticking to the 7-bit | |
275 ASCII character set in general -- is certainly a simpler way to live. | |
276 | |
277 Even if you *do* care about accurate typography, you still might want to | |
278 think twice before educating the quote characters in your weblog. One side | |
279 effect of publishing curly quote characters is that it makes your | |
280 weblog a bit harder for others to quote from using copy-and-paste. What | |
281 happens is that when someone copies text from your blog, the copied text | |
282 contains the 8-bit curly quote characters (as well as the 8-bit characters | |
283 for em-dashes and ellipses, if you use these options). These characters | |
284 are not standard across different text encoding methods, which is why they | |
285 need to be encoded as characters. | |
286 | |
287 People copying text from your weblog, however, may not notice that you're | |
288 using curly quotes, and they'll go ahead and paste the unencoded 8-bit | |
289 characters copied from their browser into an email message or their own | |
290 weblog. When pasted as raw "smart quotes", these characters are likely to | |
291 get mangled beyond recognition. | |
292 | |
293 That said, my own opinion is that any decent text editor or email client | |
294 makes it easy to stupefy smart quote characters into their 7-bit | |
295 equivalents, and I don't consider it my problem if you're using an | |
296 indecent text editor or email client. | |
297 | |
298 | |
299 Algorithmic Shortcomings | |
300 ------------------------ | |
301 | |
302 One situation in which quotes will get curled the wrong way is when | |
303 apostrophes are used at the start of leading contractions. For example: | |
304 | |
305 ``'Twas the night before Christmas.`` | |
306 | |
307 In the case above, SmartyPants will turn the apostrophe into an opening | |
308 single-quote, when in fact it should be a closing one. I don't think | |
309 this problem can be solved in the general case -- every word processor | |
310 I've tried gets this wrong as well. In such cases, it's best to use the | |
311 proper character for closing single-quotes (``’``) by hand. | |
312 | |
313 | |
314 Version History | |
315 =============== | |
316 | |
317 1.7 2012-11-19 | |
318 - Internationalization: language-dependent quotes. | |
319 | |
320 1.6.1: 2012-11-06 | |
321 - Refactor code, code cleanup, | |
322 - `educate_tokens()` generator as interface for Docutils. | |
323 | |
324 1.6: 2010-08-26 | |
325 - Adaption to Docutils: | |
326 - Use Unicode instead of HTML entities, | |
327 - Remove code special to pyblosxom. | |
328 | |
329 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 | |
330 - Fixed bug where blocks of precious unalterable text was instead | |
331 interpreted. Thanks to Le Roux and Dirk van Oosterbosch. | |
332 | |
333 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 | |
334 - Fix bogus magical quotation when there is no hint that the | |
335 user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. | |
336 - Be smarter about quotes before terminating numbers in an en-dash'ed | |
337 range. | |
338 | |
339 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 | |
340 - Fix a date-processing bug, as reported by jacob childress. | |
341 - Begin a test-suite for ensuring correct output. | |
342 - Removed import of "string", since I didn't really need it. | |
343 (This was my first ever Python program. Sue me!) | |
344 | |
345 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 | |
346 - Abort processing if the flavour is in forbidden-list. Default of | |
347 [ "rss" ] (Idea of Wolfgang SCHNERRING.) | |
348 - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. | |
349 | |
350 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 | |
351 - Some single quotes weren't replaced properly. Diff-tesuji played | |
352 by Benjamin GEIGER. | |
353 | |
354 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 | |
355 - Support upcoming pyblosxom 0.9 plugin verification feature. | |
356 | |
357 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 | |
358 - Initial release | |
359 """ | |
360 | |
# Default attribute string for smartyPants() and educate_tokens().
# "1" switches on quotes, backticks, dashes and ellipses.
default_smartypants_attr = "1"
362 | |
363 | |
364 import re | |
365 | |
class smartchars(object):
    """Smart quotes and dashes.

    Class attributes hold the language-independent characters (dashes,
    ellipsis) and the `quotes` table.  Instances additionally carry the
    four quote strings selected for one language:

    - `opquote`, `cpquote`: opening / closing primary quote,
    - `osquote`, `csquote`: opening / closing secondary quote.
    """

    endash   = u'–' # "&#8211;" EN DASH
    emdash   = u'—' # "&#8212;" EM DASH
    ellipsis = u'…' # "&#8230;" HORIZONTAL ELLIPSIS

    # quote characters (language-specific, set in __init__())
    #
    # English smart quotes (open primary, close primary, open secondary, close
    # secondary) are:
    # opquote = u'“' # "&#8220;" LEFT DOUBLE QUOTATION MARK
    # cpquote = u'”' # "&#8221;" RIGHT DOUBLE QUOTATION MARK
    # osquote = u'‘' # "&#8216;" LEFT SINGLE QUOTATION MARK
    # csquote = u'’' # "&#8217;" RIGHT SINGLE QUOTATION MARK
    # For other languages see:
    # http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
    # http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
    #
    # Each value is a sequence of exactly four strings, in the order
    # (opquote, cpquote, osquote, csquote).  A plain 4-character string
    # works when every quote is a single character.
    quotes = {'af':           u'“”‘’',
              'af-x-altquot': u'„”‚’',
              'ca':           u'«»“”',
              'ca-x-altquot': u'“”‘’',
              'cs':           u'„“‚‘',
              'cs-x-altquot': u'»«›‹',
              'da':           u'»«‘’',
              'da-x-altquot': u'„“‚‘',
              'de':           u'„“‚‘',
              'de-x-altquot': u'»«›‹',
              'de-CH':        u'«»‹›',
              'el':           u'«»“”',
              'en':           u'“”‘’',
              'en-UK':        u'‘’“”',
              'eo':           u'“”‘’',
              'es':           u'«»“”',
              'et':           u'„“‚‘', # no secondary quote listed in
              'et-x-altquot': u'»«›‹', # the sources above (wikipedia.org)
              'eu':           u'«»‹›',
              'es-x-altquot': u'“”‘’',
              'fi':           u'””’’',
              'fi-x-altquot': u'»»’’',
              # French pads the quotes with a narrow no-break space
              # (U+202F); written as an escape so it cannot be lost or
              # confused with an ordinary space.
              'fr':           (u'«\u202f', u'\u202f»', u'‹\u202f', u'\u202f›'),
              'fr-x-altquot': u'«»‹›', # for use with manually set spaces
              # 'fr-x-altquot': (u'“ ', u' ”', u'‘ ', u' ’'), # rarely used
              'fr-CH':        u'«»‹›',
              'gl':           u'«»“”',
              'he':           u'”“»«',
              'he-x-altquot': u'„”‚’',
              'it':           u'«»“”',
              'it-CH':        u'«»‹›',
              'it-x-altquot': u'“”‘’',
              'ja':           u'「」『』',
              'lt':           u'„“‚‘',
              'nl':           u'“”‘’',
              'nl-x-altquot': u'„”‚’',
              'pl':           u'„”«»',
              'pl-x-altquot': u'«»“”',
              'pt':           u'«»“”',
              'pt-BR':        u'“”‘’',
              'ro':           u'„”«»',
              'ro-x-altquot': u'«»„”',
              'ru':           u'«»„“',
              'sk':           u'„“‚‘',
              'sk-x-altquot': u'»«›‹',
              'sv':           u'„“‚‘',
              'sv-x-altquot': u'»«›‹',
              'zh-CN':        u'“”‘’',
              'zh-TW':        u'「」『』',
             }

    def __init__(self, language='en'):
        """Select the quote strings for `language`.

        `language` is looked up verbatim in `quotes`.  A tag that is not
        in the table falls back to plain ASCII quotes (" and ').
        """
        self.language = language
        try:
            (self.opquote, self.cpquote,
             self.osquote, self.csquote) = self.quotes[language]
        except KeyError:
            # Unknown language: use "dumb" ASCII quotes.
            self.opquote, self.cpquote = u'"', u'"'
            self.osquote, self.csquote = u"'", u"'"
444 | |
445 | |
def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Educate `text` in one go and return the result as a single string.

    The text is split with tokenize(), each token is passed through
    educate_tokens() with the given `attr` and `language`, and the
    pieces are concatenated again.
    """
    tokens = tokenize(text)
    educated = educate_tokens(tokens, attr, language)
    return "".join(list(educated))
451 | |
452 | |
def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
    """Return iterator that "educates" the items of `text_tokens`.

    Parameters:
      - text_tokens: iterable of ``(ttype, text)`` pairs.  Tokens of type
        'tag' (and empty texts) are passed through without touching the
        quote context; tokens of type 'literal' are passed through but
        their last character is remembered as context; every other
        token is transformed.
      - attr: attribute string selecting the transformations (see below).
      - language: language tag handed to the quote-educating helpers.

    Yields the (possibly transformed) text of each token, in order.
    """

    # Parse attributes:
    # 0 : do nothing
    # 1 : set all
    # 2 : set all, using old school en- and em- dash shortcuts
    # 3 : set all, using inverted old school en and em- dash shortcuts
    #
    # q : quotes
    # b : backtick quotes (``double'' only)
    # B : backtick quotes (``double'' and `single')
    # d : dashes
    # D : old school dashes
    # i : inverted old school dashes
    # e : ellipses
    # w : convert &quot; entities to " for Dreamweaver users

    convert_quot = False  # translate &quot; entities into normal quotes?
    do_dashes = False
    do_backticks = False
    do_quotes = False
    do_ellipses = False
    do_stupefy = False

    if attr == "0":  # Do nothing.
        # Hand every token back untouched.  (There is no single `text`
        # to yield here; the input is a stream of tokens.)
        for (ttype, text) in text_tokens:
            yield text
        return
    elif attr == "1":  # Do everything, turn all options on.
        do_quotes = True
        do_backticks = True
        do_dashes = 1
        do_ellipses = True
    elif attr == "2":
        # Do everything, turn all options on, use old school dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = 2
        do_ellipses = True
    elif attr == "3":
        # Do everything, use inverted old school dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = 3
        do_ellipses = True
    elif attr == "-1":  # Special "stupefy" mode.
        do_stupefy = True
    else:
        if "q" in attr: do_quotes = True
        if "b" in attr: do_backticks = True
        if "B" in attr: do_backticks = 2
        if "d" in attr: do_dashes = 1
        if "D" in attr: do_dashes = 2
        if "i" in attr: do_dashes = 3
        if "e" in attr: do_ellipses = True
        if "w" in attr: convert_quot = True

    prev_token_last_char = " "
    # Last character of the previous text token. Used as
    # context to curl leading quote characters correctly.

    for (ttype, text) in text_tokens:

        # skip HTML and/or XML tags as well as empty text tokens
        # without updating the last character
        if ttype == 'tag' or not text:
            yield text
            continue

        # skip literal text (math, literal, raw, ...)
        if ttype == 'literal':
            prev_token_last_char = text[-1:]
            yield text
            continue

        last_char = text[-1:]  # Remember last char before processing.

        # Hide backslash-escaped characters from the transformations.
        text = processEscapes(text)

        if convert_quot:
            text = text.replace('&quot;', '"')

        if do_dashes == 1:
            text = educateDashes(text)
        elif do_dashes == 2:
            text = educateDashesOldSchool(text)
        elif do_dashes == 3:
            text = educateDashesOldSchoolInverted(text)

        if do_ellipses:
            text = educateEllipses(text)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            text = educateBackticks(text, language)

        if do_backticks == 2:
            text = educateSingleBackticks(text, language)

        if do_quotes:
            # Prepend the previous token's last character as context,
            # then strip it off again after educating.
            text = educateQuotes(prev_token_last_char+text, language)[1:]

        if do_stupefy:
            text = stupefyEntities(text, language)

        # Remember last char as context for the next token
        prev_token_last_char = last_char

        # Turn the hidden escapes back into their plain characters.
        text = processEscapes(text, restore=True)

        yield text
564 | |
565 | |
566 | |
def educateQuotes(text, language='en'):
    """
    Parameter:  - text string (unicode or bytes).
                - language (`BCP 47` language tag.)
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """

    smart = smartchars(language)

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force:
    # NOTE(review): inside a raw string ``\\B`` is the regex "literal
    # backslash followed by B", not the non-word-boundary assertion, so
    # these two rules only fire in front of a literal ``\B``.  Left
    # unchanged to preserve behaviour -- confirm the intent before
    # switching to a single backslash.
    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    text = re.sub(r"""\b'(?=\d{2}s)""", smart.csquote, text)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""–|—"""  # en dash or em dash character

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or en/em dash characters
                            &\#x201[34];    # or hex
                    )
                    '                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes,), re.VERBOSE)
    text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text)

    # Closing single quotes NOT followed by whitespace, "s" or a digit:
    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (?!\s | s\b | \d)
                    """ % (close_class,), re.VERBOSE)
    text = closing_single_quotes_regex.sub(r'\1'+smart.csquote, text)

    # Closing single quotes followed by whitespace or a word-final "s":
    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (\s | s\b)
                    """ % (close_class,), re.VERBOSE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or en/em dash characters
                            &\#x201[34];    # or hex
                    )
                    "                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes,), re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text)

    # Double closing quotes:
    # a double quote directly followed by whitespace is a closing one ...
    closing_double_quotes_regex = re.compile(r"""
                    "
                    (?=\s)
                    """, re.VERBOSE)
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... and so is one directly preceded by a "closing" character.
    closing_double_quotes_regex = re.compile(r"""
                    (%s)   # character that indicates the quote should be closing
                    "
                    """ % (close_class,), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1'+smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
664 | |
665 | |
def educateBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with ``backticks'' -style double quotes
                replaced by the primary quote characters of `language`.
    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    chars = smartchars(language)
    # Two backticks open, two apostrophes close.
    for ascii_pair, curly in (("``", chars.opquote), ("''", chars.cpquote)):
        text = re.sub(ascii_pair, curly, text)
    return text
679 | |
680 | |
def educateSingleBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with `backticks' -style single quotes
                replaced by the secondary quote characters of `language`.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    chars = smartchars(language)
    # Every backtick opens, every apostrophe closes.
    opened = re.sub("`", chars.osquote, text)
    return re.sub("'", chars.csquote, opened)
695 | |
696 | |
def educateDashes(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character (and each "---" to an en-dash).
    """
    # The triple hyphen has to be consumed first, otherwise its first two
    # hyphens would already be taken by the "--" rule.
    # Note the mapping: "---" -> en dash, "--" -> em dash (yes, backwards).
    table = (("---", smartchars.endash),
             ("--", smartchars.emdash))
    for hyphens, dash in table:
        text = re.sub(hyphens, dash, text)
    return text
707 | |
708 | |
def educateDashesOldSchool(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an en-dash character, and each "---" translated to
                an em-dash character.
    """
    # Longest run first, so "---" is not split into "--" plus "-".
    with_em = re.sub("---", smartchars.emdash, text)
    return re.sub("--", smartchars.endash, with_em)
720 | |
721 | |
def educateDashesOldSchoolInverted(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character. Two reasons why: First, unlike the
                en- and em-dash syntax supported by
                EducateDashesOldSchool(), it's compatible with existing
                entries written before SmartyPants 1.1, back when "--" was
                only used for em-dashes.  Second, em-dashes are more
                common than en-dashes, and so it sort of makes sense that
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
    # Longest run first: "---" becomes the en dash ...
    with_en = re.sub("---", smartchars.endash, text)
    # ... and what is left of "--" becomes the em dash.
    return re.sub("--", smartchars.emdash, with_en)
739 | |
740 | |
741 | |
def educateEllipses(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "..." (or ". . .")
                translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Unspaced dots first, then the spaced variant.
    for dots in (r"\.\.\.", r"\. \. \."):
        text = re.sub(dots, smartchars.ellipsis, text)
    return text
755 | |
756 | |
def stupefyEntities(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    chars = smartchars(language)

    # (smart character, ASCII replacement), applied in this order.
    downgrades = (
        (chars.endash, "-"),       # en-dash
        (chars.emdash, "--"),      # em-dash
        (chars.osquote, "'"),      # open single quote
        (chars.csquote, "'"),      # close single quote
        (chars.opquote, '"'),      # open double quote
        (chars.cpquote, '"'),      # close double quote
        (chars.ellipsis, '...'),   # ellipsis
    )
    for smart_char, plain in downgrades:
        text = re.sub(smart_char, plain, text)

    return text
780 | |
781 | |
def processEscapes(text, restore=False):
    r"""
    Parameter:  - text: string (unicode or bytes).
                - restore: if false (default), hide the escape sequences
                  below behind numeric character references; if true,
                  turn those references into the plain character.
    Returns:    The `text`, after processing the following backslash
                escape sequences. This is useful if you want to force a
                "dumb" quote or other character to appear.

                Escape  Placeholder  Restored
                ------  -----------  --------
                \\      &#92;        \
                \"      &#34;        "
                \'      &#39;        '
                \.      &#46;        .
                \-      &#45;        -
                \`      &#96;        `
    """
    # The placeholders contain none of the characters the educate*()
    # functions look for, so escaped punctuation survives them untouched.
    # The backslash pair must stay first: it has to be consumed before
    # its second backslash can be read as the start of another escape.
    replacements = ((r'\\', r'&#92;'),
                    (r'\"', r'&#34;'),
                    (r"\'", r'&#39;'),
                    (r'\.', r'&#46;'),
                    (r'\-', r'&#45;'),
                    (r'\`', r'&#96;'))
    if restore:
        for (ch, rep) in replacements:
            # ch[1] is the escaped character without its backslash.
            text = text.replace(rep, ch[1])
    else:
        for (ch, rep) in replacements:
            text = text.replace(ch, rep)

    return text
812 | |
813 | |
def tokenize(text):
    """
    Parameter:  String containing HTML markup.
    Returns:    An iterator that yields the tokens comprising the input
                string. Each token is either a tag or a run of text
                between tags. Each yielded element is a two-element
                tuple; the first is either 'tag' or 'text'; the second
                is the actual value.

    A tag is everything from a "<" up to the next ">"; nested angle
    brackets (such as <a href="<MTFoo>">) are not treated specially.
    Empty text runs between adjacent tags are not yielded.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
        <http://www.bradchoate.com/past/mtregex.php>
    """
    # group(1): text run in front of a tag (may be empty), group(2): the tag
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    previous_end = 0
    for token_match in tag_soup.finditer(text):
        if token_match.group(1):
            yield ('text', token_match.group(1))

        yield ('tag', token_match.group(2))

        previous_end = token_match.end()

    # Whatever follows the last tag (or the whole input, if there is no
    # tag at all) is one final text token.
    if previous_end < len(text):
        yield ('text', text[previous_end:])
853 | |
854 | |
855 | |
if __name__ == "__main__":
    # Script mode: print the module documentation rendered as HTML, then
    # run a small self-test.

    import locale

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # Best effort only: keep the default locale if the user's
        # setting is not supported.
        pass

    from docutils.core import publish_string
    docstring_html = publish_string(__doc__, writer_name='html')

    # Call syntax with one argument works as print statement and as
    # print function alike.
    print(docstring_html)


    # Unit test output goes out stderr.
    import unittest
    sp = smartyPants

    class TestSmartypantsAllAttributes(unittest.TestCase):
        # the default attribute is "1", which means "all".

        def test_dates(self):
            self.assertEqual(sp("1440-80's"), u"1440-80’s")
            self.assertEqual(sp("1440-'80s"), u"1440-‘80s")
            self.assertEqual(sp("1440---'80s"), u"1440–‘80s")
            self.assertEqual(sp("1960s"), "1960s")  # no effect.
            self.assertEqual(sp("1960's"), u"1960’s")
            self.assertEqual(sp("one two '60s"), u"one two ‘60s")
            self.assertEqual(sp("'60s"), u"‘60s")

        def test_ordinal_numbers(self):
            self.assertEqual(sp("21st century"), "21st century")  # no effect.
            self.assertEqual(sp("3rd"), "3rd")  # no effect.

        def test_educated_quotes(self):
            self.assertEqual(sp('''"Isn't this fun?"'''), u'“Isn’t this fun?”')

        def test_html_tags(self):
            text = '<a src="foo">more</a>'
            self.assertEqual(sp(text), text)

    unittest.main()
899 | |
900 | |
901 | |
902 | |
# Module metadata.
# NOTE(review): these values name Chad Miller's smartypants.py / pyblosxom
# and version "1.5_1.6", while the "Version History" in the module
# docstring goes up to 1.7 -- confirm whether they should be updated.
__author__ = "Chad Miller <smartypantspy@chad.org>"
__version__ = "1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400"
__url__ = "http://wiki.chad.org/SmartyPantsPy"
__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"