comparison venv/lib/python2.7/site-packages/docutils/utils/smartquotes.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d67268158946
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # :Id: $Id: smartquotes.py 7716 2013-08-21 21:54:57Z milde $
5 # :Copyright: © 2010 Günter Milde,
6 # original `SmartyPants`_: © 2003 John Gruber
7 # smartypants.py: © 2004, 2007 Chad Miller
8 # :Maintainer: docutils-develop@lists.sourceforge.net
9 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
10 #
11 # Copying and distribution of this file, with or without modification,
12 # are permitted in any medium without royalty provided the copyright
13 # notices and this notice are preserved.
14 # This file is offered as-is, without any warranty.
15 #
16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
17
18
19 r"""
20 ========================
21 SmartyPants for Docutils
22 ========================
23
24 Synopsis
25 ========
26
27 Smart-quotes for Docutils.
28
29 The original "SmartyPants" is a free web publishing plug-in for Movable Type,
30 Blosxom, and BBEdit that easily translates plain ASCII punctuation characters
31 into "smart" typographic punctuation characters.
32
33 `smartypants.py`, endeavours to be a functional port of
34 SmartyPants to Python, for use with Pyblosxom_.
35
36 `smartquotes.py` is an adaption of Smartypants to Docutils_. By using Unicode
37 characters instead of HTML entities for typographic quotes, it works for any
38 output format that supports Unicode.
39
40 Authors
41 =======
42
43 `John Gruber`_ did all of the hard work of writing this software in Perl for
44 `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_
45 ported it to Python to use with Pyblosxom_.
46 Adapted to Docutils_ by Günter Milde
47
48 Additional Credits
49 ==================
50
51 Portions of the SmartyPants original work are based on Brad Choate's nifty
52 MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to
53 this plug-in. Brad Choate is a fine hacker indeed.
54
55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta
56 testing of the original SmartyPants.
57
58 `Rael Dornfest`_ ported SmartyPants to Blosxom.
59
60 .. _Brad Choate: http://bradchoate.com/
61 .. _Jeremy Hedley: http://antipixel.com/
62 .. _Charles Wiltgen: http://playbacktime.com/
63 .. _Rael Dornfest: http://raelity.org/
64
65
66 Copyright and License
67 =====================
68
69 SmartyPants_ license (3-Clause BSD license):
70
71 Copyright (c) 2003 John Gruber (http://daringfireball.net/)
72 All rights reserved.
73
74 Redistribution and use in source and binary forms, with or without
75 modification, are permitted provided that the following conditions are
76 met:
77
78 * Redistributions of source code must retain the above copyright
79 notice, this list of conditions and the following disclaimer.
80
81 * Redistributions in binary form must reproduce the above copyright
82 notice, this list of conditions and the following disclaimer in
83 the documentation and/or other materials provided with the
84 distribution.
85
86 * Neither the name "SmartyPants" nor the names of its contributors
87 may be used to endorse or promote products derived from this
88 software without specific prior written permission.
89
90 This software is provided by the copyright holders and contributors
91 "as is" and any express or implied warranties, including, but not
92 limited to, the implied warranties of merchantability and fitness for
93 a particular purpose are disclaimed. In no event shall the copyright
94 owner or contributors be liable for any direct, indirect, incidental,
95 special, exemplary, or consequential damages (including, but not
96 limited to, procurement of substitute goods or services; loss of use,
97 data, or profits; or business interruption) however caused and on any
98 theory of liability, whether in contract, strict liability, or tort
99 (including negligence or otherwise) arising in any way out of the use
100 of this software, even if advised of the possibility of such damage.
101
102 smartypants.py license (2-Clause BSD license):
103
104 smartypants.py is a derivative work of SmartyPants.
105
106 Redistribution and use in source and binary forms, with or without
107 modification, are permitted provided that the following conditions are
108 met:
109
110 * Redistributions of source code must retain the above copyright
111 notice, this list of conditions and the following disclaimer.
112
113 * Redistributions in binary form must reproduce the above copyright
114 notice, this list of conditions and the following disclaimer in
115 the documentation and/or other materials provided with the
116 distribution.
117
118 This software is provided by the copyright holders and contributors
119 "as is" and any express or implied warranties, including, but not
120 limited to, the implied warranties of merchantability and fitness for
121 a particular purpose are disclaimed. In no event shall the copyright
122 owner or contributors be liable for any direct, indirect, incidental,
123 special, exemplary, or consequential damages (including, but not
124 limited to, procurement of substitute goods or services; loss of use,
125 data, or profits; or business interruption) however caused and on any
126 theory of liability, whether in contract, strict liability, or tort
127 (including negligence or otherwise) arising in any way out of the use
128 of this software, even if advised of the possibility of such damage.
129
130 .. _John Gruber: http://daringfireball.net/
131 .. _Chad Miller: http://web.chad.org/
132
133 .. _Pyblosxom: http://pyblosxom.bluesock.org/
134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/
135 .. _Movable Type: http://www.movabletype.org/
136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
137 .. _Docutils: http://docutils.sf.net/
138
139 Description
140 ===========
141
142 SmartyPants can perform the following transformations:
143
144 - Straight quotes ( " and ' ) into "curly" quote characters
145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters
146 - Dashes (``--`` and ``---``) into en- and em-dash entities
147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
148
149 This means you can write, edit, and save your posts using plain old
150 ASCII straight quotes, plain dashes, and plain dots, but your published
151 posts (and final HTML output) will appear with smart quotes, em-dashes,
152 and proper ellipses.
153
154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to
156 display text where smart quotes and other "smart punctuation" would not be
157 appropriate, such as source code or example markup.
158
159
160 Backslash Escapes
161 =================
162
163 If you need to use literal straight quotes (or plain hyphens and
164 periods), SmartyPants accepts the following backslash escape sequences
165 to force non-smart punctuation. It does so by transforming the escape
166 sequence into a character:
167
168 ======== ===== =========
169 Escape Value Character
170 ======== ===== =========
171 ``\\\\`` &#92; \\
172 \\" &#34; "
173 \\' &#39; '
174 \\. &#46; .
175 \\- &#45; \-
176 \\` &#96; \`
177 ======== ===== =========
178
179 This is useful, for example, when you want to use straight quotes as
180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac.
181
182 Options
183 =======
184
185 For Pyblosxom users, the ``smartypants_attributes`` attribute is where you
186 specify configuration options.
187
188 Numeric values are the easiest way to configure SmartyPants' behavior:
189
190 "0"
191 Suppress all transformations. (Do nothing.)
192 "1"
193 Performs default SmartyPants transformations: quotes (including
194 \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
195 is used to signify an em-dash; there is no support for en-dashes.
196
197 "2"
198 Same as smarty_pants="1", except that it uses the old-school typewriter
199 shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``"
200 (dash dash dash)
201 for em-dashes.
202
203 "3"
204 Same as smarty_pants="2", but inverts the shorthand for dashes:
205 "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
206 en-dashes.
207
208 "-1"
209 Stupefy mode. Reverses the SmartyPants transformation process, turning
210 the characters produced by SmartyPants into their ASCII equivalents.
211 E.g. "“" is turned into a simple double-quote (\"), "—" is
212 turned into two dashes, etc.
213
214
215 The following single-character attribute values can be combined to toggle
216 individual transformations from within the smarty_pants attribute. For
217 example, to educate normal quotes and em-dashes, but not ellipses or
218 \`\`backticks'' -style quotes:
219
220 ``py['smartypants_attributes'] = "qd"``
221
222 "q"
223 Educates normal quote characters: (") and (').
224
225 "b"
226 Educates \`\`backticks'' -style double quotes.
227
228 "B"
229 Educates \`\`backticks'' -style double quotes and \`single' quotes.
230
231 "d"
232 Educates em-dashes.
233
234 "D"
235 Educates em-dashes and en-dashes, using old-school typewriter shorthand:
236 (dash dash) for en-dashes, (dash dash dash) for em-dashes.
237
238 "i"
239 Educates em-dashes and en-dashes, using inverted old-school typewriter
240 shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.
241
242 "e"
243 Educates ellipses.
244
245 "w"
246 Translates any instance of ``&quot;`` into a normal double-quote character.
247 This should be of no interest to most people, but of particular interest
248 to anyone who writes their posts using Dreamweaver, as Dreamweaver
249 inexplicably uses this entity to represent a literal double-quote
250 character. SmartyPants only educates normal quotes, not entities (because
251 ordinarily, entities are used for the explicit purpose of representing the
252 specific character they represent). The "w" option must be used in
253 conjunction with one (or both) of the other quote options ("q" or "b").
254 Thus, if you wish to apply all SmartyPants transformations (quotes, en-
255 and em-dashes, and ellipses) and also translate ``&quot;`` entities into
256 regular quotes so SmartyPants can educate them, you should pass the
257 following to the smarty_pants attribute: ``"qDew"``
258
259
260 Caveats
261 =======
262
263 Why You Might Not Want to Use Smart Quotes in Your Weblog
264 ---------------------------------------------------------
265
266 For one thing, you might not care.
267
268 Most normal, mentally stable individuals do not take notice of proper
269 typographic punctuation. Many design and typography nerds, however, break
270 out in a nasty rash when they encounter, say, a restaurant sign that uses
271 a straight apostrophe to spell "Joe's".
272
273 If you're the sort of person who just doesn't care, you might well want to
274 continue not caring. Using straight quotes -- and sticking to the 7-bit
275 ASCII character set in general -- is certainly a simpler way to live.
276
277 Even if you *do* care about accurate typography, you still might want to
278 think twice before educating the quote characters in your weblog. One side
279 effect of publishing curly quote characters is that it makes your
280 weblog a bit harder for others to quote from using copy-and-paste. What
281 happens is that when someone copies text from your blog, the copied text
282 contains the 8-bit curly quote characters (as well as the 8-bit characters
283 for em-dashes and ellipses, if you use these options). These characters
284 are not standard across different text encoding methods, which is why they
285 need to be encoded as characters.
286
287 People copying text from your weblog, however, may not notice that you're
288 using curly quotes, and they'll go ahead and paste the unencoded 8-bit
289 characters copied from their browser into an email message or their own
290 weblog. When pasted as raw "smart quotes", these characters are likely to
291 get mangled beyond recognition.
292
293 That said, my own opinion is that any decent text editor or email client
294 makes it easy to stupefy smart quote characters into their 7-bit
295 equivalents, and I don't consider it my problem if you're using an
296 indecent text editor or email client.
297
298
299 Algorithmic Shortcomings
300 ------------------------
301
302 One situation in which quotes will get curled the wrong way is when
303 apostrophes are used at the start of leading contractions. For example:
304
305 ``'Twas the night before Christmas.``
306
307 In the case above, SmartyPants will turn the apostrophe into an opening
308 single-quote, when in fact it should be a closing one. I don't think
309 this problem can be solved in the general case -- every word processor
310 I've tried gets this wrong as well. In such cases, it's best to use the
311 proper character for closing single-quotes (``’``) by hand.
312
313
314 Version History
315 ===============
316
317 1.7 2012-11-19
318 - Internationalization: language-dependent quotes.
319
320 1.6.1: 2012-11-06
321 - Refactor code, code cleanup,
322 - `educate_tokens()` generator as interface for Docutils.
323
324 1.6: 2010-08-26
325 - Adaption to Docutils:
326 - Use Unicode instead of HTML entities,
327 - Remove code special to pyblosxom.
328
329 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
330 - Fixed bug where blocks of precious unalterable text was instead
331 interpreted. Thanks to Le Roux and Dirk van Oosterbosch.
332
333 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
334 - Fix bogus magical quotation when there is no hint that the
335 user wants it, e.g., in "21st century". Thanks to Nathan Hamblen.
336 - Be smarter about quotes before terminating numbers in an en-dash'ed
337 range.
338
339 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
340 - Fix a date-processing bug, as reported by jacob childress.
341 - Begin a test-suite for ensuring correct output.
342 - Removed import of "string", since I didn't really need it.
343 (This was my first ever Python program. Sue me!)
344
345 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400
346 - Abort processing if the flavour is in forbidden-list. Default of
347 [ "rss" ] (Idea of Wolfgang SCHNERRING.)
348 - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING.
349
350 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400
351 - Some single quotes weren't replaced properly. Diff-tesuji played
352 by Benjamin GEIGER.
353
354 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500
355 - Support upcoming pyblosxom 0.9 plugin verification feature.
356
357 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500
358 - Initial release
359 """
360
# Default value of the `attr` argument of smartyPants() and educate_tokens();
# "1" turns on quotes, backticks, dashes and ellipses.
default_smartypants_attr = "1"
362
363
364 import re
365
class smartchars(object):
    """Smart quotes and dashes.

    The class attributes `endash`, `emdash` and `ellipsis` are
    language-independent.  The four quote attributes (`opquote`, `cpquote`,
    `osquote`, `csquote`: open/close primary, open/close secondary) are
    set per instance in `__init__()` from the `quotes` table.
    """

    endash   = u'–' # "&#8211;" EN DASH
    emdash   = u'—' # "&#8212;" EM DASH
    ellipsis = u'…' # "&#8230;" HORIZONTAL ELLIPSIS

    # quote characters (language-specific, set in __init__())
    #
    # English smart quotes (open primary, close primary, open secondary, close
    # secondary) are:
    # opquote = u'“' # "&#8220;" LEFT DOUBLE QUOTATION MARK
    # cpquote = u'”' # "&#8221;" RIGHT DOUBLE QUOTATION MARK
    # osquote = u'‘' # "&#8216;" LEFT SINGLE QUOTATION MARK
    # csquote = u'’' # "&#8217;" RIGHT SINGLE QUOTATION MARK
    # For other languages see:
    # http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
    # http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
    #
    # Each value is a sequence of exactly four items so that it can be
    # unpacked into the four quote attributes.
    quotes = {'af':           u'“”‘’',
              'af-x-altquot': u'„”‚’',
              'ca':           u'«»“”',
              'ca-x-altquot': u'“”‘’',
              'cs':           u'„“‚‘',
              'cs-x-altquot': u'»«›‹',
              'da':           u'»«‘’',
              'da-x-altquot': u'„“‚‘',
              'de':           u'„“‚‘',
              'de-x-altquot': u'»«›‹',
              'de-CH':        u'«»‹›',
              'el':           u'«»“”',
              'en':           u'“”‘’',
              'en-UK':        u'‘’“”',
              'eo':           u'“”‘’',
              'es':           u'«»“”',
              'es-x-altquot': u'“”‘’',
              'et':           u'„“‚‘', # no secondary quote listed in
              'et-x-altquot': u'»«›‹', # the sources above (wikipedia.org)
              'eu':           u'«»‹›',
              'fi':           u'””’’',
              'fi-x-altquot': u'»»’’',
              # with narrow no-break space (U+202F), spelled as an escape
              # so that the invisible character cannot get lost:
              'fr':           (u'«\u202f', u'\u202f»', u'‹\u202f', u'\u202f›'),
              'fr-x-altquot': u'«»‹›', # for use with manually set spaces
              # 'fr-x-altquot': (u'“ ', u' ”', u'‘ ', u' ’'), # rarely used
              'fr-CH':        u'«»‹›',
              'gl':           u'«»“”',
              'he':           u'”“»«',
              'he-x-altquot': u'„”‚’',
              'it':           u'«»“”',
              'it-CH':        u'«»‹›',
              'it-x-altquot': u'“”‘’',
              'ja':           u'「」『』',
              'lt':           u'„“‚‘',
              'nl':           u'“”‘’',
              'nl-x-altquot': u'„”‚’',
              'pl':           u'„”«»',
              'pl-x-altquot': u'«»“”',
              'pt':           u'«»“”',
              'pt-BR':        u'“”‘’',
              'ro':           u'„”«»',
              'ro-x-altquot': u'«»„”',
              'ru':           u'«»„“',
              'sk':           u'„“‚‘',
              'sk-x-altquot': u'»«›‹',
              'sv':           u'„“‚‘',
              'sv-x-altquot': u'»«›‹',
              'zh-CN':        u'“”‘’',
              'zh-TW':        u'「」『』',
             }

    def __init__(self, language='en'):
        """Select the quote characters for `language`.

        `language` is looked up verbatim in `quotes`; for an unknown key
        the plain ASCII quotes (" and ') are used for all four attributes.
        """
        self.language = language
        try:
            (self.opquote, self.cpquote,
             self.osquote, self.csquote) = self.quotes[language]
        except KeyError:
            # Unknown language: fall back to "dumb" quotes.
            self.opquote, self.cpquote, self.osquote, self.csquote = u'""\'\''
444
445
def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Main function for "traditional" use.

    Splits `text` with tokenize(), runs the tokens through
    educate_tokens() with the given `attr` and `language`, and returns
    the educated pieces joined into one string.
    """
    tokens = tokenize(text)
    educated = educate_tokens(tokens, attr, language)
    return "".join(list(educated))
451
452
def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
    """Return iterator that "educates" the items of `text_tokens`.

    `text_tokens` is an iterable of ``(ttype, text)`` pairs.  Tokens of
    type 'tag' and empty texts are yielded unchanged, tokens of type
    'literal' are yielded unchanged but remembered as context; every other
    token is transformed according to `attr`.

    `attr` selects the transformations:

      0 : do nothing
      1 : set all
      2 : set all, using old school en- and em- dash shortcuts
      3 : set all, using inverted old school en and em- dash shortcuts
      -1: "stupefy" (smart characters back to ASCII)

      q : quotes
      b : backtick quotes (``double'' only)
      B : backtick quotes (``double'' and `single')
      d : dashes
      D : old school dashes
      i : inverted old school dashes
      e : ellipses
      w : convert &quot; entities to " for Dreamweaver users

    `language` is passed on to the quote-related helpers.
    """

    convert_quot = False  # translate &quot; entities into normal quotes?
    do_dashes = False
    do_backticks = False
    do_quotes = False
    do_ellipses = False
    do_stupefy = False

    if attr == "0":  # Do nothing.
        # Pass the text of every token through untouched (not even the
        # backslash escapes are processed) and stop.
        for (ttype, text) in text_tokens:
            yield text
        return
    elif attr == "1":  # Do everything, turn all options on.
        do_quotes = True
        do_backticks = True
        do_dashes = 1
        do_ellipses = True
    elif attr == "2":
        # Do everything, turn all options on, use old school dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = 2
        do_ellipses = True
    elif attr == "3":
        # Do everything, use inverted old school dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = 3
        do_ellipses = True
    elif attr == "-1":  # Special "stupefy" mode.
        do_stupefy = True
    else:
        # Later tests override earlier ones ("B" wins over "b",
        # "i" over "D" over "d").
        if "q" in attr: do_quotes = True
        if "b" in attr: do_backticks = True
        if "B" in attr: do_backticks = 2
        if "d" in attr: do_dashes = 1
        if "D" in attr: do_dashes = 2
        if "i" in attr: do_dashes = 3
        if "e" in attr: do_ellipses = True
        if "w" in attr: convert_quot = True

    prev_token_last_char = " "
    # Last character of the previous text token. Used as
    # context to curl leading quote characters correctly.

    for (ttype, text) in text_tokens:

        # skip HTML and/or XML tags as well as empty text tokens
        # without updating the last character
        if ttype == 'tag' or not text:
            yield text
            continue

        # skip literal text (math, literal, raw, ...)
        if ttype == 'literal':
            prev_token_last_char = text[-1:]
            yield text
            continue

        last_char = text[-1:]  # Remember last char before processing.

        # Hide backslash-escaped characters from the transformations below.
        text = processEscapes(text)

        if convert_quot:
            text = re.sub('&quot;', '"', text)

        if do_dashes == 1:
            text = educateDashes(text)
        elif do_dashes == 2:
            text = educateDashesOldSchool(text)
        elif do_dashes == 3:
            text = educateDashesOldSchoolInverted(text)

        if do_ellipses:
            text = educateEllipses(text)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            text = educateBackticks(text, language)

        if do_backticks == 2:
            text = educateSingleBackticks(text, language)

        if do_quotes:
            # Prepend the context character for the quote heuristics and
            # strip it off again afterwards.
            text = educateQuotes(prev_token_last_char+text, language)[1:]

        if do_stupefy:
            text = stupefyEntities(text, language)

        # Remember last char as context for the next token
        prev_token_last_char = last_char

        # Turn the protected escapes into the plain characters.
        text = processEscapes(text, restore=True)

        yield text
564
565
566
def educateQuotes(text, language='en'):
    """
    Parameter:  - text string (unicode or bytes).
                - language (`BCP 47` language tag.)
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”

    The substitutions below depend on their order: special cases first,
    then opening quotes recognised by their left context, then closing
    quotes, and finally every remaining quote is taken as an opening one.
    """

    smart = smartchars(language)

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force:
    # NOTE(review): inside these raw strings the sequence before "B" is an
    # escaped backslash, so the look-ahead demands a literal backslash and
    # "B" instead of a non-word-boundary.  Presumably unintended; left
    # unchanged here -- confirm before changing.
    text = re.sub(r"""^'(?=%s\\B)""" % (punct_class,), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\\B)""" % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    text = re.sub(r"""\b'(?=\d{2}s)""", smart.csquote, text)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    # The "#" must be escaped: the patterns below are compiled with
    # re.VERBOSE, where an unescaped "#" starts a comment that would
    # swallow the rest of the line (including the "|").
    dec_dashes = r"""&\#8211;|&\#8212;"""
    # Dashes have usually been converted to these characters already
    # when the quotes are educated.
    literal_dashes = u"[\u2013\u2014]"

    # Left context after which a quote followed by a word character is
    # an opening quote (shared by single and double quotes):
    opening_context = r"""
                    (
                        \s          |   # a whitespace char, or
                        &nbsp;      |   # a non-breaking space entity, or
                        %s          |   # literal en- or em-dash, or
                        --          |   # dashes, or
                        &[mn]dash;  |   # named dash entities
                        %s          |   # or decimal entities
                        &\#x201[34];    # or hex
                    )
                    """ % (literal_dashes, dec_dashes)

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(opening_context + r"""
                    '                 # the quote
                    (?=\w)            # followed by a word character
                    """, re.VERBOSE)
    text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text)

    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (?!\s | s\b | \d)
                    """ % (close_class,), re.VERBOSE)
    text = closing_single_quotes_regex.sub(r'\1'+smart.csquote, text)

    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (\s | s\b)
                    """ % (close_class,), re.VERBOSE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(opening_context + r"""
                    "                 # the quote
                    (?=\w)            # followed by a word character
                    """, re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text)

    # Double closing quotes:
    # a quote directly followed by whitespace ...
    closing_double_quotes_regex = re.compile(r'"(?=\s)')
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... or preceded by a character that indicates the quote should be
    # closing.
    closing_double_quotes_regex = re.compile(r"""
                    (%s)   # character that indicates the quote should be closing
                    "
                    """ % (close_class,), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1'+smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
664
665
def educateBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with ``backticks'' -style double quotes
                translated into the primary curly quote characters
                of `language`.
    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    smart = smartchars(language)

    # Two backticks open, two apostrophes close.
    opened = re.sub(r"""``""", smart.opquote, text)
    return re.sub(r"""''""", smart.cpquote, opened)
679
680
def educateSingleBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with `backticks' -style single quotes
                translated into the secondary curly quote characters
                of `language`.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    smart = smartchars(language)

    # Every backtick opens, every apostrophe closes.
    opened = re.sub(r"""`""", smart.osquote, text)
    return re.sub(r"""'""", smart.csquote, opened)
695
696
def educateDashes(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character (and each "---" to an en-dash
                character).
    """
    # The longer run must be replaced first.
    triples_done = re.sub(r"""---""", smartchars.endash, text)   # en (yes, backwards)
    return re.sub(r"""--""", smartchars.emdash, triples_done)    # em (yes, backwards)
707
708
def educateDashesOldSchool(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an en-dash character, and each "---" translated to
                an em-dash character.
    """
    # The longer run must be replaced first.
    triples_done = re.sub(r"""---""", smartchars.emdash, text)
    return re.sub(r"""--""", smartchars.endash, triples_done)
720
721
def educateDashesOldSchoolInverted(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character. Two reasons why: First, unlike the
                en- and em-dash syntax supported by
                EducateDashesOldSchool(), it's compatible with existing
                entries written before SmartyPants 1.1, back when "--" was
                only used for em-dashes.  Second, em-dashes are more
                common than en-dashes, and so it sort of makes sense that
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
    # The longer run must be replaced first.
    triples_done = re.sub(r"""---""", smartchars.endash, text)   # "---" -> en
    return re.sub(r"""--""", smartchars.emdash, triples_done)    # "--" -> em
739
740
741
def educateEllipses(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "..." (or ". . .")
                translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Unspaced dots first, then the spaced variant.
    for dots in (r"""\.\.\.""", r"""\. \. \."""):
        text = re.sub(dots, smartchars.ellipsis, text)
    return text
755
756
def stupefyEntities(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    smart = smartchars(language)

    # (smart character, ASCII replacement), applied in this order.
    ascii_for = (
        (smart.endash, "-"),       # en-dash
        (smart.emdash, "--"),      # em-dash
        (smart.osquote, "'"),      # open single quote
        (smart.csquote, "'"),      # close single quote
        (smart.opquote, '"'),      # open double quote
        (smart.cpquote, '"'),      # close double quote
        (smart.ellipsis, '...'),   # ellipsis
    )
    for smart_char, plain in ascii_for:
        text = re.sub(smart_char, plain, text)

    return text
780
781
def processEscapes(text, restore=False):
    r"""
    Parameter:  String (unicode or bytes).
    Returns:    The `text` after processing the following backslash
                escape sequences. This is useful if you want to force a
                "dumb" quote or other character to appear.

                Escape  Value
                ------  -----
                \\      &#92;
                \"      &#34;
                \'      &#39;
                \.      &#46;
                \-      &#45;
                \`      &#96;

                With `restore` set, the direction is reversed: each value
                is replaced by the plain character (without backslash).
    """
    # (escape sequence, entity); the double backslash must come first.
    escape_table = ((r'\\', r'&#92;'),
                    (r'\"', r'&#34;'),
                    (r"\'", r'&#39;'),
                    (r'\.', r'&#46;'),
                    (r'\-', r'&#45;'),
                    (r'\`', r'&#96;'))
    for sequence, entity in escape_table:
        if restore:
            # sequence[1] is the character behind the backslash
            text = text.replace(entity, sequence[1])
        else:
            text = text.replace(sequence, entity)

    return text
812
813
def tokenize(text):
    """
    Parameter:  String containing HTML markup.
    Returns:    An iterator that yields the tokens comprising the input
                string. Each token is either a tag or a run of text
                between tags. Each yielded element is a two-element
                tuple; the first is either 'tag' or 'text'; the second
                is the actual value.

    A tag is everything from a "<" up to the next ">"; tags nested inside
    attribute values (such as <a href="<MTFoo>">) are not recognised as a
    unit. Empty text runs are not yielded.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
        <http://www.bradchoate.com/past/mtregex.php>
    """

    # group 1: (possibly empty) text before a tag, group 2: the tag
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    previous_end = 0
    for token_match in tag_soup.finditer(text):
        if token_match.group(1):
            yield ('text', token_match.group(1))

        yield ('tag', token_match.group(2))

        previous_end = token_match.end()

    # Text after the last tag (or the whole input if there is no tag).
    if previous_end < len(text):
        yield ('text', text[previous_end:])
853
854
855
if __name__ == "__main__":
    # Run as a script: print the module documentation as HTML, then run
    # the unit tests below.

    import locale

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error:
        # Best effort only: keep the default locale if the user's
        # locale setting is not supported.
        pass

    from docutils.core import publish_string
    docstring_html = publish_string(__doc__, writer_name='html')

    print(docstring_html)


    # Unit test output goes out stderr.
    import unittest
    sp = smartyPants

    class TestSmartypantsAllAttributes(unittest.TestCase):
        # the default attribute is "1", which means "all".

        def test_dates(self):
            self.assertEqual(sp("1440-80's"), u"1440-80’s")
            self.assertEqual(sp("1440-'80s"), u"1440-‘80s")
            self.assertEqual(sp("1440---'80s"), u"1440–‘80s")
            self.assertEqual(sp("1960s"), "1960s")  # no effect.
            self.assertEqual(sp("1960's"), u"1960’s")
            self.assertEqual(sp("one two '60s"), u"one two ‘60s")
            self.assertEqual(sp("'60s"), u"‘60s")

        def test_ordinal_numbers(self):
            self.assertEqual(sp("21st century"), "21st century")  # no effect.
            self.assertEqual(sp("3rd"), "3rd")  # no effect.

        def test_educated_quotes(self):
            self.assertEqual(sp('''"Isn't this fun?"'''), u'“Isn’t this fun?”')

        def test_html_tags(self):
            text = '<a src="foo">more</a>'
            self.assertEqual(sp(text), text)

    unittest.main()
899
900
901
902
# Module metadata.
# NOTE(review): these values look inherited from smartypants.py; the
# version string does not match the newest entry (1.7) of the
# "Version History" in the module docstring -- confirm.
__author__ = "Chad Miller <smartypantspy@chad.org>"
__version__ = "1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400"
__url__ = "http://wiki.chad.org/SmartyPantsPy"
__description__ = "Smart-quotes, smart-ellipses, and smart-dashes for weblog entries in pyblosxom"