Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/sre_parse.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d67268158946 |
---|---|
1 # | |
2 # Secret Labs' Regular Expression Engine | |
3 # | |
4 # convert re-style regular expression to sre pattern | |
5 # | |
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. | |
7 # | |
8 # See the sre.py file for information on usage and redistribution. | |
9 # | |
10 | |
11 """Internal support module for sre""" | |
12 | |
13 # XXX: show string offset and offending character for all errors | |
14 | |
15 import sys | |
16 | |
17 from sre_constants import * | |
18 | |
# Characters that have a special meaning at pattern level, and the subset
# of them that introduces a repeat operator.
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# Character sets used for membership tests while scanning escapes,
# repeat counts and verbose-mode whitespace.
OCTDIGITS = set("01234567")
DIGITS = set("0123456789")
HEXDIGITS = set("0123456789abcdefABCDEF")
WHITESPACE = set(" \t\n\r\v\f")

# Escape token -> (LITERAL, code point) for the simple character escapes.
ESCAPES = dict(
    ("\\" + letter, (LITERAL, ord(char)))
    for letter, char in (
        ("a", "\a"),
        ("b", "\b"),
        ("f", "\f"),
        ("n", "\n"),
        ("r", "\r"),
        ("t", "\t"),
        ("v", "\v"),
        ("\\", "\\"),
    )
)

# Escape token -> parsed node for position assertions (AT) and character
# categories (IN).  Note that r"\b" also appears in ESCAPES; which table is
# consulted first decides its meaning.
CATEGORIES = {
    # position assertions
    r"\A": (AT, AT_BEGINNING_STRING),  # start of string
    r"\Z": (AT, AT_END_STRING),  # end of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    # character categories
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
}

# Inline flag letter, as written in "(?...)", -> flag bit.
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
64 | |
class Pattern:
    """Parser-wide state shared by all subpatterns of one expression.

    Keeps the active flags, the group ids handed out so far (by number and
    by name), the ids of groups that are still open, and the lookbehind
    nesting counter.
    """

    def __init__(self):
        self.flags = 0
        self.open = []        # ids of groups opened but not yet closed
        self.groups = 1       # next group id to hand out
        self.groupdict = {}   # group name -> group id
        self.lookbehind = 0   # incremented while inside a lookbehind

    def opengroup(self, name=None):
        """Allocate a new group id, mark it open and return it.

        If *name* is given it is registered for the new id; reusing a name
        that is already registered raises ``error``.
        """
        new_id = self.groups
        self.groups = new_id + 1
        if name is not None:
            previous_id = self.groupdict.get(name, None)
            if previous_id is not None:
                message = ("redefinition of group name %s as group %d; "
                           "was group %d" % (repr(name), new_id, previous_id))
                raise error(message)
            self.groupdict[name] = new_id
        self.open.append(new_id)
        return new_id

    def closegroup(self, gid):
        """Mark group *gid* as closed."""
        self.open.remove(gid)

    def checkgroup(self, gid):
        """Return true if *gid* has been allocated and is no longer open."""
        return gid < self.groups and gid not in self.open
89 | |
class SubPattern:
    """A subpattern in intermediate form: a list of (opcode, argument) pairs.

    ``pattern`` is the shared parser state object, ``data`` the list of
    parsed items and ``width`` the cached result of getwidth().
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None

    def dump(self, level=0):
        """Write an indented listing of this subpattern to sys.stdout.

        The output mirrors what the equivalent ``print`` statements
        produce (items separated by one space, one entry per line), but is
        written with sys.stdout.write so the method does not depend on the
        print statement syntax.
        """
        write = sys.stdout.write
        indent = level * " "
        for op, av in self.data:
            head = indent + op
            if op == IN:
                # member sublanguage
                write(head + "\n")
                for member_op, member_av in av:
                    write("%s %s\n" % ((level + 1) * " " + member_op,
                                       member_av))
            elif op == BRANCH:
                write(head + "\n")
                for i, alternative in enumerate(av[1]):
                    if i:
                        write(indent + "or\n")
                    alternative.dump(level + 1)
            elif op == GROUPREF_EXISTS:
                condgroup, item_yes, item_no = av
                write("%s %s\n" % (head, condgroup))
                item_yes.dump(level + 1)
                if item_no:
                    write(indent + "else\n")
                    item_no.dump(level + 1)
            elif isinstance(av, (tuple, list)):
                write(head)
                # True while the current output line already has text on
                # it, i.e. the next inline item needs a separating space.
                pending_space = True
                nl = 0
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl:
                            write("\n")
                            pending_space = False
                        a.dump(level + 1)
                        nl = 1
                    else:
                        if pending_space:
                            write(" ")
                        write("%s" % (a,))
                        pending_space = True
                        nl = 0
                if not nl:
                    write("\n")
            else:
                write("%s %s\n" % (head, av))

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # a slice yields a new SubPattern sharing the same state object
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return (min, max) width of this subpattern, caching the result.

        Unit items count as exactly one, repeats multiply the width of
        their body by the repeat bounds, and BRANCH / GROUPREF_EXISTS take
        the smallest minimum and largest maximum over their alternatives.
        A GROUPREF_EXISTS without a "no" alternative has minimum 0.
        Items of any other kind contribute no width.  The result is
        clamped to (MAXREPEAT - 1, MAXREPEAT).
        """
        if self.width:
            return self.width
        lo = hi = 0
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                i = MAXREPEAT - 1
                j = 0
                for alternative in av[1]:
                    l, h = alternative.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is GROUPREF_EXISTS:
                # av is (condgroup, item_yes, item_no); previously this
                # item was ignored and counted as zero width.
                i, j = av[1].getwidth()
                if av[2] is not None:
                    l, h = av[2].getwidth()
                    i = min(i, l)
                    j = max(j, h)
                else:
                    i = 0
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                i, j = av[2].getwidth()
                lo = lo + i * av[0]
                hi = hi + j * av[1]
            elif op in UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
        return self.width
187 | |
class Tokenizer:
    """One-token-lookahead scanner over a pattern string.

    A token is a single character, or a backslash together with the
    character that follows it.  ``next`` always holds the upcoming token,
    or None once the string is exhausted.
    """

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # Load the token starting at self.index into self.next and move
        # self.index past it.
        position = self.index
        text = self.string
        if position >= len(text):
            self.next = None
            return
        token = text[position]
        if token[0] == "\\":
            # a backslash always travels with the following character
            try:
                follower = text[position + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            token = token + follower
        self.index = position + len(token)
        self.next = token

    def match(self, char, skip=1):
        """Return 1 if the upcoming token equals *char*, else 0.

        On a match the token is consumed unless *skip* is false.
        """
        if char != self.next:
            return 0
        if skip:
            self.__next()
        return 1

    def get(self):
        """Consume and return the upcoming token (None at end of input)."""
        token = self.next
        self.__next()
        return token

    def tell(self):
        """Return the scanner position as an (index, next) pair."""
        return self.index, self.next

    def seek(self, index):
        """Restore a position previously obtained from tell()."""
        self.index, self.next = index
220 | |
def isident(char):
    """Return True if *char* is "_" or compares within "a".."z" or "A".."Z"."""
    if char == "_":
        return True
    return "a" <= char <= "z" or "A" <= char <= "Z"
223 | |
def isdigit(char):
    """Return True if *char* compares within the range "0".."9"."""
    below = char < "0"
    above = char > "9"
    return not (below or above)
226 | |
def isname(name):
    """Return True if *name* is a valid group name.

    The first character must satisfy isident(); every following character
    must satisfy isident() or isdigit().  An empty name is not valid and
    yields False.
    """
    # name[:1] is "" for an empty name, which isident() rejects, so an
    # empty name no longer raises IndexError as name[0] did.
    if not isident(name[:1]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True
235 | |
def _class_escape(source, escape):
    """Translate an escape token that occurs inside a character class.

    *source* is the tokenizer (further digits are read from it), *escape*
    the two-character escape token.  Returns a (LITERAL, code) pair or an
    IN category node; raises ``error`` for an escape that is not valid
    here.
    """
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                # Report the escape as written (including the "x"), the
                # same way _escape() does; the old message dropped it.
                raise error("bogus escape: %s" % repr(escape))
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # a non-octal digit is rejected inside a character class
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
267 | |
def _escape(source, escape, state):
    """Translate an escape token that occurs at pattern level.

    *source* is the tokenizer (further digits are read from it), *escape*
    the two-character escape token and *state* the shared parser state,
    consulted for group references.  Returns a parsed node; raises
    ``error`` for an escape that cannot be interpreted.
    """
    # CATEGORIES takes precedence over ESCAPES here.
    for table in (CATEGORIES, ESCAPES):
        known = table.get(escape)
        if known:
            return known
    try:
        kind = escape[1:2]
        if kind == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        if kind == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        if kind in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                three_octal = (escape[1] in OCTDIGITS and
                               escape[2] in OCTDIGITS and
                               source.next in OCTDIGITS)
                if three_octal:
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group >= state.groups:
                raise ValueError
            if not state.checkgroup(group):
                raise error("cannot refer to open group")
            if state.lookbehind:
                import warnings
                warnings.warn('group references in lookbehind '
                              'assertions are not supported',
                              RuntimeWarning)
            return GROUPREF, group
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
316 | |
def _parse_sub(source, state, nested=1):
    """Parse an alternation ``a|b|c`` and return it as a SubPattern.

    With a true *nested*, parsing must stop at end of input or in front
    of a ")"; anything else raises ``error``.  A single alternative is
    returned as is.  Otherwise items common to the start of all
    alternatives are moved in front of the branch, and a branch made only
    of single literals is stored as a character set.
    """
    alternatives = [_parse(source, state)]
    while source.match("|"):
        alternatives.append(_parse(source, state))
    if nested and source.next and not source.match(")", 0):
        raise error("pattern not properly closed")

    if len(alternatives) == 1:
        return alternatives[0]

    result = SubPattern(state)

    # Hoist leading items shared by every alternative out of the branch,
    # one item per round, for as long as all alternatives are non-empty
    # and agree on their first item.
    while 1:
        if not all(alternatives):
            break
        prefix = alternatives[0][0]
        if any(item[0] != prefix for item in alternatives[1:]):
            break
        for item in alternatives:
            del item[0]
        result.append(prefix)

    # If every alternative is exactly one literal, store the branch as a
    # character set instead (the compiler may optimize this even more).
    only_literals = all(len(item) == 1 and item[0][0] == LITERAL
                        for item in alternatives)
    if only_literals:
        result.append((IN, [item[0] for item in alternatives]))
    else:
        result.append((BRANCH, (None, alternatives)))
    return result
375 | |
def _parse_sub_cond(source, state, condgroup):
    """Parse the ``yes|no`` body of a conditional group.

    Returns a SubPattern holding one GROUPREF_EXISTS item with
    (condgroup, item_yes, item_no); item_no is None when there is no
    second alternative.  More than two alternatives, or input that does
    not end at ")" or end of pattern, raises ``error``.
    """
    item_no = None
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    node = (GROUPREF_EXISTS, (condgroup, item_yes, item_no))
    return SubPattern(state, [node])
389 | |
# Lookup sets used by _parse(): tokens that end a subpattern, characters
# that introduce an assertion after "(?", characters allowed after "(?<",
# and the opcodes that mark an item as already repeated.
_PATTERNENDERS = set(["|", ")"])
_ASSERTCHARS = set(["=", "!", "<"])
_LOOKBEHINDASSERTCHARS = set(["=", "!"])
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))
394 | |
def _read_group_name(sourceget, terminator):
    # Collect tokens up to *terminator* (which is consumed but not
    # returned); raise error if the pattern ends first.
    name = ""
    while 1:
        char = sourceget()
        if char is None:
            raise error("unterminated name")
        if char == terminator:
            return name
        name = name + char


def _parse(source, state):
    """Parse one alternative (a sequence of items) into a SubPattern.

    Tokens are read from *source* until end of input or until the next
    token is "|" or ")", which is left unconsumed.  *state* is the shared
    parser state: groups are allocated on it and inline flags are stored
    in state.flags.  Raises ``error`` for malformed patterns and
    OverflowError for repeat counts of MAXREPEAT or more.
    """
    subpattern = SubPattern(state)

    # bind frequently used callables to locals; the loop below runs once
    # per token
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match

    while 1:

        if source.next in _PATTERNENDERS:
            break  # end of subpattern
        this = sourceget()
        if this is None:
            break  # end of pattern

        # state.flags is re-read every time because an inline "(?x)" can
        # switch verbose mode on while parsing
        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            if sourcematch("^"):
                charset.append((NEGATE, None))
            # "]" only closes the set once something has been added
            # after the optional negation marker
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if not sourcematch("-"):
                    # single member; a category is unwrapped from its IN
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charset.append(code1)
                    continue
                # potential range
                this = sourceget()
                if this == "]":
                    # trailing "-" is a literal dash
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charset.append(code1)
                    charset.append((LITERAL, ord("-")))
                    break
                if not this:
                    raise error("unexpected end of regular expression")
                if this[0] == "\\":
                    code2 = _class_escape(source, this)
                else:
                    code2 = LITERAL, ord(this)
                if code1[0] != LITERAL or code2[0] != LITERAL:
                    raise error("bad character range")
                lo = code1[1]
                hi = code2[1]
                if hi < lo:
                    raise error("bad character range")
                charset.append((RANGE, (lo, hi)))

            # XXX: <fl> should move set optimization to compiler!
            if len(charset) == 1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0])  # optimization
            elif (len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1]))  # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                repeat_min, repeat_max = 0, 1
            elif this == "*":
                repeat_min, repeat_max = 0, MAXREPEAT
            elif this == "+":
                repeat_min, repeat_max = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat; keep the brace as a literal
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                repeat_min, repeat_max = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed "{m,n}": treat "{" as a literal
                    # and rescan what followed it
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    repeat_min = int(lo)
                    if repeat_min >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    repeat_max = int(hi)
                    if repeat_max >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if repeat_max < repeat_min:
                        raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in _REPEATCODES:
                raise error("multiple repeat")
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (repeat_min, repeat_max, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (repeat_min, repeat_max, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = an extension
            # that has no group body (flags)
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group
                        name = _read_group_name(sourceget, ">")
                        group = 1
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in group name %r" %
                                        name)
                    elif sourcematch("="):
                        # named backreference
                        name = _read_group_name(sourceget, ")")
                        if not name:
                            raise error("missing group name")
                        if not isname(name):
                            raise error("bad character in backref group name "
                                        "%r" % name)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name: {0!r}".format(name)
                            raise error(msg)
                        if state.lookbehind:
                            import warnings
                            warnings.warn('group references in lookbehind '
                                          'assertions are not supported',
                                          RuntimeWarning)
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in _ASSERTCHARS:
                    # lookahead / lookbehind assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in _LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1  # lookbehind
                        char = sourceget()
                        state.lookbehind += 1
                    p = _parse_sub(source, state)
                    if direction < 0:
                        state.lookbehind -= 1
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = _read_group_name(sourceget, ")")
                    group = 2
                    if not condname:
                        raise error("missing group name")
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name: {0!r}".format(condname)
                            raise error(msg)
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error("bad character in group name")
                        # Group ids handed out by the parser start at 1.
                        # Previously 0 made the condition silently
                        # disappear (plain group) and a negative number
                        # was passed on as a group id.
                        if condgroup <= 0:
                            raise error("bad group number")
                    if state.lookbehind:
                        import warnings
                        warnings.warn('group references in lookbehind '
                                      'assertions are not supported',
                                      RuntimeWarning)
                else:
                    # flags
                    if source.next not in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup is not None:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # after inline flags only the closing ")" is accepted
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error("unexpected end of pattern")
                    if char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern
705 | |
def parse(str, flags=0, pattern=None):
    """Parse the 're' pattern *str* into a SubPattern of (opcode, argument)
    tuples.

    *flags* are the initial flags; *pattern* is an optional state object
    to reuse (a fresh Pattern is created otherwise).  Raises ``error`` if
    input is left over after the top-level alternation.
    """
    tokens = Tokenizer(str)

    state = pattern
    if state is None:
        state = Pattern()
    state.flags = flags
    state.str = str

    parsed = _parse_sub(tokens, state, 0)

    leftover = tokens.get()
    if leftover:
        if leftover == ")":
            raise error("unbalanced parenthesis")
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        parsed.dump()

    final_flags = parsed.pattern.flags
    if final_flags & SRE_FLAG_VERBOSE and not (flags & SRE_FLAG_VERBOSE):
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, final_flags)

    return parsed
733 | |
def parse_template(source, pattern):
    """Parse the 're' replacement string *source*.

    Returns a ``(groups, literals)`` pair: *literals* has one slot per
    template part, holding the literal text or None for a group
    reference; *groups* lists ``(slot index, group)`` for those
    references.  Group names are looked up in ``pattern.groupindex``.
    Raises ``error`` for malformed references and IndexError for an
    unknown group name.
    """
    tokens = Tokenizer(source)
    next_token = tokens.get
    parts = []

    def add_literal(text):
        # merge with a directly preceding literal part, if any
        if parts and parts[-1][0] is LITERAL:
            parts[-1] = LITERAL, parts[-1][1] + text
        else:
            parts.append((LITERAL, text))

    # pick the character constructor matching the type of the template
    if type(source[:0]) is type(""):
        makechar = chr
    else:
        makechar = unichr

    while 1:
        this = next_token()
        if this is None:
            break  # end of replacement string
        if not (this and this[0] == "\\"):
            add_literal(this)
            continue
        # escape: group reference, octal character or simple escape
        c = this[1:2]
        if c == "g":
            name = ""
            if tokens.match("<"):
                while 1:
                    char = next_token()
                    if char is None:
                        raise error("unterminated group name")
                    if char == ">":
                        break
                    name = name + char
            if not name:
                raise error("missing group name")
            try:
                index = int(name)
                if index < 0:
                    raise error("negative group number")
            except ValueError:
                # not a number, so it has to be a known group name
                if not isname(name):
                    raise error("bad character in group name")
                try:
                    index = pattern.groupindex[name]
                except KeyError:
                    msg = "unknown group name: {0!r}".format(name)
                    raise IndexError(msg)
            parts.append((MARK, index))
        elif c == "0":
            # octal escape with up to two further octal digits
            if tokens.next in OCTDIGITS:
                this = this + next_token()
                if tokens.next in OCTDIGITS:
                    this = this + next_token()
            add_literal(makechar(int(this[1:], 8) & 0xff))
        elif c in DIGITS:
            # three octal digits form a character; anything else is a
            # one- or two-digit group reference
            isoctal = False
            if tokens.next in DIGITS:
                this = this + next_token()
                if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        tokens.next in OCTDIGITS):
                    this = this + next_token()
                    isoctal = True
                    add_literal(makechar(int(this[1:], 8) & 0xff))
            if not isoctal:
                parts.append((MARK, int(this[1:])))
        else:
            # known simple escapes are translated, others kept verbatim
            try:
                this = makechar(ESCAPES[this][1])
            except KeyError:
                pass
            add_literal(this)

    # convert template to groups and literals lists
    groups = []
    literals = [None] * len(parts)
    for slot, (kind, value) in enumerate(parts):
        if kind is MARK:
            groups.append((slot, value))
            # literals[slot] is already None
        else:
            literals[slot] = value
    return groups, literals
821 | |
def expand_template(template, match):
    """Fill a parsed template with the groups of *match* and join it.

    *template* is a ``(groups, literals)`` pair; each ``(index, group)``
    entry puts ``match.group(group)`` into slot *index* of a copy of
    *literals*.  Raises ``error`` if a referenced group did not take part
    in the match or if an IndexError occurs while filling the slots.
    """
    fetch = match.group
    empty = match.string[:0]  # empty string of the subject's type
    group_slots, literal_parts = template
    pieces = literal_parts[:]
    try:
        for slot, group in group_slots:
            text = fetch(group)
            pieces[slot] = text
            if text is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return empty.join(pieces)