Mercurial > repos > bcclaywell > argo_navis
comparison venv/lib/python2.7/site-packages/yaml/scanner.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
| author | bcclaywell |
|---|---|
| date | Mon, 12 Oct 2015 17:43:33 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d67268158946 |
|---|---|
| 1 | |
| 2 # Scanner produces tokens of the following types: | |
| 3 # STREAM-START | |
| 4 # STREAM-END | |
| 5 # DIRECTIVE(name, value) | |
| 6 # DOCUMENT-START | |
| 7 # DOCUMENT-END | |
| 8 # BLOCK-SEQUENCE-START | |
| 9 # BLOCK-MAPPING-START | |
| 10 # BLOCK-END | |
| 11 # FLOW-SEQUENCE-START | |
| 12 # FLOW-MAPPING-START | |
| 13 # FLOW-SEQUENCE-END | |
| 14 # FLOW-MAPPING-END | |
| 15 # BLOCK-ENTRY | |
| 16 # FLOW-ENTRY | |
| 17 # KEY | |
| 18 # VALUE | |
| 19 # ALIAS(value) | |
| 20 # ANCHOR(value) | |
| 21 # TAG(value) | |
| 22 # SCALAR(value, plain, style) | |
| 23 # | |
| 24 # Read comments in the Scanner code for more details. | |
| 25 # | |
| 26 | |
| 27 __all__ = ['Scanner', 'ScannerError'] | |
| 28 | |
| 29 from error import MarkedYAMLError | |
| 30 from tokens import * | |
| 31 | |
class ScannerError(MarkedYAMLError):
    """Raised when the scanner meets input it cannot tokenize."""
    pass
| 34 | |
class SimpleKey(object):
    # Bookkeeping record for a potential simple key; see the simple-key
    # treatment notes in Scanner.

    def __init__(self, token_number, required, index, line, column, mark):
        """Remember where a possible simple key starts.

        token_number -- position of the eventual KEY token in the stream
        required     -- whether a ':' MUST follow for valid input
        index/line/column/mark -- location of the key in the input
        """
        self.token_number = token_number
        self.required = required
        self.mark = mark
        self.index = index
        self.line = line
        self.column = column
| 45 | |
| 46 class Scanner(object): | |
| 47 | |
| 48 def __init__(self): | |
| 49 """Initialize the scanner.""" | |
| 50 # It is assumed that Scanner and Reader will have a common descendant. | |
| 51 # Reader do the dirty work of checking for BOM and converting the | |
| 52 # input data to Unicode. It also adds NUL to the end. | |
| 53 # | |
| 54 # Reader supports the following methods | |
| 55 # self.peek(i=0) # peek the next i-th character | |
| 56 # self.prefix(l=1) # peek the next l characters | |
| 57 # self.forward(l=1) # read the next l characters and move the pointer. | |
| 58 | |
| 59 # Had we reached the end of the stream? | |
| 60 self.done = False | |
| 61 | |
| 62 # The number of unclosed '{' and '['. `flow_level == 0` means block | |
| 63 # context. | |
| 64 self.flow_level = 0 | |
| 65 | |
| 66 # List of processed tokens that are not yet emitted. | |
| 67 self.tokens = [] | |
| 68 | |
| 69 # Add the STREAM-START token. | |
| 70 self.fetch_stream_start() | |
| 71 | |
| 72 # Number of tokens that were emitted through the `get_token` method. | |
| 73 self.tokens_taken = 0 | |
| 74 | |
| 75 # The current indentation level. | |
| 76 self.indent = -1 | |
| 77 | |
| 78 # Past indentation levels. | |
| 79 self.indents = [] | |
| 80 | |
| 81 # Variables related to simple keys treatment. | |
| 82 | |
| 83 # A simple key is a key that is not denoted by the '?' indicator. | |
| 84 # Example of simple keys: | |
| 85 # --- | |
| 86 # block simple key: value | |
| 87 # ? not a simple key: | |
| 88 # : { flow simple key: value } | |
| 89 # We emit the KEY token before all keys, so when we find a potential | |
| 90 # simple key, we try to locate the corresponding ':' indicator. | |
| 91 # Simple keys should be limited to a single line and 1024 characters. | |
| 92 | |
| 93 # Can a simple key start at the current position? A simple key may | |
| 94 # start: | |
| 95 # - at the beginning of the line, not counting indentation spaces | |
| 96 # (in block context), | |
| 97 # - after '{', '[', ',' (in the flow context), | |
| 98 # - after '?', ':', '-' (in the block context). | |
| 99 # In the block context, this flag also signifies if a block collection | |
| 100 # may start at the current position. | |
| 101 self.allow_simple_key = True | |
| 102 | |
| 103 # Keep track of possible simple keys. This is a dictionary. The key | |
| 104 # is `flow_level`; there can be no more that one possible simple key | |
| 105 # for each level. The value is a SimpleKey record: | |
| 106 # (token_number, required, index, line, column, mark) | |
| 107 # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow), | |
| 108 # '[', or '{' tokens. | |
| 109 self.possible_simple_keys = {} | |
| 110 | |
| 111 # Public methods. | |
| 112 | |
| 113 def check_token(self, *choices): | |
| 114 # Check if the next token is one of the given types. | |
| 115 while self.need_more_tokens(): | |
| 116 self.fetch_more_tokens() | |
| 117 if self.tokens: | |
| 118 if not choices: | |
| 119 return True | |
| 120 for choice in choices: | |
| 121 if isinstance(self.tokens[0], choice): | |
| 122 return True | |
| 123 return False | |
| 124 | |
| 125 def peek_token(self): | |
| 126 # Return the next token, but do not delete if from the queue. | |
| 127 while self.need_more_tokens(): | |
| 128 self.fetch_more_tokens() | |
| 129 if self.tokens: | |
| 130 return self.tokens[0] | |
| 131 | |
| 132 def get_token(self): | |
| 133 # Return the next token. | |
| 134 while self.need_more_tokens(): | |
| 135 self.fetch_more_tokens() | |
| 136 if self.tokens: | |
| 137 self.tokens_taken += 1 | |
| 138 return self.tokens.pop(0) | |
| 139 | |
| 140 # Private methods. | |
| 141 | |
| 142 def need_more_tokens(self): | |
| 143 if self.done: | |
| 144 return False | |
| 145 if not self.tokens: | |
| 146 return True | |
| 147 # The current token may be a potential simple key, so we | |
| 148 # need to look further. | |
| 149 self.stale_possible_simple_keys() | |
| 150 if self.next_possible_simple_key() == self.tokens_taken: | |
| 151 return True | |
| 152 | |
| 153 def fetch_more_tokens(self): | |
| 154 | |
| 155 # Eat whitespaces and comments until we reach the next token. | |
| 156 self.scan_to_next_token() | |
| 157 | |
| 158 # Remove obsolete possible simple keys. | |
| 159 self.stale_possible_simple_keys() | |
| 160 | |
| 161 # Compare the current indentation and column. It may add some tokens | |
| 162 # and decrease the current indentation level. | |
| 163 self.unwind_indent(self.column) | |
| 164 | |
| 165 # Peek the next character. | |
| 166 ch = self.peek() | |
| 167 | |
| 168 # Is it the end of stream? | |
| 169 if ch == u'\0': | |
| 170 return self.fetch_stream_end() | |
| 171 | |
| 172 # Is it a directive? | |
| 173 if ch == u'%' and self.check_directive(): | |
| 174 return self.fetch_directive() | |
| 175 | |
| 176 # Is it the document start? | |
| 177 if ch == u'-' and self.check_document_start(): | |
| 178 return self.fetch_document_start() | |
| 179 | |
| 180 # Is it the document end? | |
| 181 if ch == u'.' and self.check_document_end(): | |
| 182 return self.fetch_document_end() | |
| 183 | |
| 184 # TODO: support for BOM within a stream. | |
| 185 #if ch == u'\uFEFF': | |
| 186 # return self.fetch_bom() <-- issue BOMToken | |
| 187 | |
| 188 # Note: the order of the following checks is NOT significant. | |
| 189 | |
| 190 # Is it the flow sequence start indicator? | |
| 191 if ch == u'[': | |
| 192 return self.fetch_flow_sequence_start() | |
| 193 | |
| 194 # Is it the flow mapping start indicator? | |
| 195 if ch == u'{': | |
| 196 return self.fetch_flow_mapping_start() | |
| 197 | |
| 198 # Is it the flow sequence end indicator? | |
| 199 if ch == u']': | |
| 200 return self.fetch_flow_sequence_end() | |
| 201 | |
| 202 # Is it the flow mapping end indicator? | |
| 203 if ch == u'}': | |
| 204 return self.fetch_flow_mapping_end() | |
| 205 | |
| 206 # Is it the flow entry indicator? | |
| 207 if ch == u',': | |
| 208 return self.fetch_flow_entry() | |
| 209 | |
| 210 # Is it the block entry indicator? | |
| 211 if ch == u'-' and self.check_block_entry(): | |
| 212 return self.fetch_block_entry() | |
| 213 | |
| 214 # Is it the key indicator? | |
| 215 if ch == u'?' and self.check_key(): | |
| 216 return self.fetch_key() | |
| 217 | |
| 218 # Is it the value indicator? | |
| 219 if ch == u':' and self.check_value(): | |
| 220 return self.fetch_value() | |
| 221 | |
| 222 # Is it an alias? | |
| 223 if ch == u'*': | |
| 224 return self.fetch_alias() | |
| 225 | |
| 226 # Is it an anchor? | |
| 227 if ch == u'&': | |
| 228 return self.fetch_anchor() | |
| 229 | |
| 230 # Is it a tag? | |
| 231 if ch == u'!': | |
| 232 return self.fetch_tag() | |
| 233 | |
| 234 # Is it a literal scalar? | |
| 235 if ch == u'|' and not self.flow_level: | |
| 236 return self.fetch_literal() | |
| 237 | |
| 238 # Is it a folded scalar? | |
| 239 if ch == u'>' and not self.flow_level: | |
| 240 return self.fetch_folded() | |
| 241 | |
| 242 # Is it a single quoted scalar? | |
| 243 if ch == u'\'': | |
| 244 return self.fetch_single() | |
| 245 | |
| 246 # Is it a double quoted scalar? | |
| 247 if ch == u'\"': | |
| 248 return self.fetch_double() | |
| 249 | |
| 250 # It must be a plain scalar then. | |
| 251 if self.check_plain(): | |
| 252 return self.fetch_plain() | |
| 253 | |
| 254 # No? It's an error. Let's produce a nice error message. | |
| 255 raise ScannerError("while scanning for the next token", None, | |
| 256 "found character %r that cannot start any token" | |
| 257 % ch.encode('utf-8'), self.get_mark()) | |
| 258 | |
| 259 # Simple keys treatment. | |
| 260 | |
| 261 def next_possible_simple_key(self): | |
| 262 # Return the number of the nearest possible simple key. Actually we | |
| 263 # don't need to loop through the whole dictionary. We may replace it | |
| 264 # with the following code: | |
| 265 # if not self.possible_simple_keys: | |
| 266 # return None | |
| 267 # return self.possible_simple_keys[ | |
| 268 # min(self.possible_simple_keys.keys())].token_number | |
| 269 min_token_number = None | |
| 270 for level in self.possible_simple_keys: | |
| 271 key = self.possible_simple_keys[level] | |
| 272 if min_token_number is None or key.token_number < min_token_number: | |
| 273 min_token_number = key.token_number | |
| 274 return min_token_number | |
| 275 | |
| 276 def stale_possible_simple_keys(self): | |
| 277 # Remove entries that are no longer possible simple keys. According to | |
| 278 # the YAML specification, simple keys | |
| 279 # - should be limited to a single line, | |
| 280 # - should be no longer than 1024 characters. | |
| 281 # Disabling this procedure will allow simple keys of any length and | |
| 282 # height (may cause problems if indentation is broken though). | |
| 283 for level in self.possible_simple_keys.keys(): | |
| 284 key = self.possible_simple_keys[level] | |
| 285 if key.line != self.line \ | |
| 286 or self.index-key.index > 1024: | |
| 287 if key.required: | |
| 288 raise ScannerError("while scanning a simple key", key.mark, | |
| 289 "could not found expected ':'", self.get_mark()) | |
| 290 del self.possible_simple_keys[level] | |
| 291 | |
| 292 def save_possible_simple_key(self): | |
| 293 # The next token may start a simple key. We check if it's possible | |
| 294 # and save its position. This function is called for | |
| 295 # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. | |
| 296 | |
| 297 # Check if a simple key is required at the current position. | |
| 298 required = not self.flow_level and self.indent == self.column | |
| 299 | |
| 300 # A simple key is required only if it is the first token in the current | |
| 301 # line. Therefore it is always allowed. | |
| 302 assert self.allow_simple_key or not required | |
| 303 | |
| 304 # The next token might be a simple key. Let's save it's number and | |
| 305 # position. | |
| 306 if self.allow_simple_key: | |
| 307 self.remove_possible_simple_key() | |
| 308 token_number = self.tokens_taken+len(self.tokens) | |
| 309 key = SimpleKey(token_number, required, | |
| 310 self.index, self.line, self.column, self.get_mark()) | |
| 311 self.possible_simple_keys[self.flow_level] = key | |
| 312 | |
| 313 def remove_possible_simple_key(self): | |
| 314 # Remove the saved possible key position at the current flow level. | |
| 315 if self.flow_level in self.possible_simple_keys: | |
| 316 key = self.possible_simple_keys[self.flow_level] | |
| 317 | |
| 318 if key.required: | |
| 319 raise ScannerError("while scanning a simple key", key.mark, | |
| 320 "could not found expected ':'", self.get_mark()) | |
| 321 | |
| 322 del self.possible_simple_keys[self.flow_level] | |
| 323 | |
| 324 # Indentation functions. | |
| 325 | |
| 326 def unwind_indent(self, column): | |
| 327 | |
| 328 ## In flow context, tokens should respect indentation. | |
| 329 ## Actually the condition should be `self.indent >= column` according to | |
| 330 ## the spec. But this condition will prohibit intuitively correct | |
| 331 ## constructions such as | |
| 332 ## key : { | |
| 333 ## } | |
| 334 #if self.flow_level and self.indent > column: | |
| 335 # raise ScannerError(None, None, | |
| 336 # "invalid intendation or unclosed '[' or '{'", | |
| 337 # self.get_mark()) | |
| 338 | |
| 339 # In the flow context, indentation is ignored. We make the scanner less | |
| 340 # restrictive then specification requires. | |
| 341 if self.flow_level: | |
| 342 return | |
| 343 | |
| 344 # In block context, we may need to issue the BLOCK-END tokens. | |
| 345 while self.indent > column: | |
| 346 mark = self.get_mark() | |
| 347 self.indent = self.indents.pop() | |
| 348 self.tokens.append(BlockEndToken(mark, mark)) | |
| 349 | |
| 350 def add_indent(self, column): | |
| 351 # Check if we need to increase indentation. | |
| 352 if self.indent < column: | |
| 353 self.indents.append(self.indent) | |
| 354 self.indent = column | |
| 355 return True | |
| 356 return False | |
| 357 | |
| 358 # Fetchers. | |
| 359 | |
| 360 def fetch_stream_start(self): | |
| 361 # We always add STREAM-START as the first token and STREAM-END as the | |
| 362 # last token. | |
| 363 | |
| 364 # Read the token. | |
| 365 mark = self.get_mark() | |
| 366 | |
| 367 # Add STREAM-START. | |
| 368 self.tokens.append(StreamStartToken(mark, mark, | |
| 369 encoding=self.encoding)) | |
| 370 | |
| 371 | |
| 372 def fetch_stream_end(self): | |
| 373 | |
| 374 # Set the current intendation to -1. | |
| 375 self.unwind_indent(-1) | |
| 376 | |
| 377 # Reset simple keys. | |
| 378 self.remove_possible_simple_key() | |
| 379 self.allow_simple_key = False | |
| 380 self.possible_simple_keys = {} | |
| 381 | |
| 382 # Read the token. | |
| 383 mark = self.get_mark() | |
| 384 | |
| 385 # Add STREAM-END. | |
| 386 self.tokens.append(StreamEndToken(mark, mark)) | |
| 387 | |
| 388 # The steam is finished. | |
| 389 self.done = True | |
| 390 | |
| 391 def fetch_directive(self): | |
| 392 | |
| 393 # Set the current intendation to -1. | |
| 394 self.unwind_indent(-1) | |
| 395 | |
| 396 # Reset simple keys. | |
| 397 self.remove_possible_simple_key() | |
| 398 self.allow_simple_key = False | |
| 399 | |
| 400 # Scan and add DIRECTIVE. | |
| 401 self.tokens.append(self.scan_directive()) | |
| 402 | |
| 403 def fetch_document_start(self): | |
| 404 self.fetch_document_indicator(DocumentStartToken) | |
| 405 | |
| 406 def fetch_document_end(self): | |
| 407 self.fetch_document_indicator(DocumentEndToken) | |
| 408 | |
| 409 def fetch_document_indicator(self, TokenClass): | |
| 410 | |
| 411 # Set the current intendation to -1. | |
| 412 self.unwind_indent(-1) | |
| 413 | |
| 414 # Reset simple keys. Note that there could not be a block collection | |
| 415 # after '---'. | |
| 416 self.remove_possible_simple_key() | |
| 417 self.allow_simple_key = False | |
| 418 | |
| 419 # Add DOCUMENT-START or DOCUMENT-END. | |
| 420 start_mark = self.get_mark() | |
| 421 self.forward(3) | |
| 422 end_mark = self.get_mark() | |
| 423 self.tokens.append(TokenClass(start_mark, end_mark)) | |
| 424 | |
| 425 def fetch_flow_sequence_start(self): | |
| 426 self.fetch_flow_collection_start(FlowSequenceStartToken) | |
| 427 | |
| 428 def fetch_flow_mapping_start(self): | |
| 429 self.fetch_flow_collection_start(FlowMappingStartToken) | |
| 430 | |
| 431 def fetch_flow_collection_start(self, TokenClass): | |
| 432 | |
| 433 # '[' and '{' may start a simple key. | |
| 434 self.save_possible_simple_key() | |
| 435 | |
| 436 # Increase the flow level. | |
| 437 self.flow_level += 1 | |
| 438 | |
| 439 # Simple keys are allowed after '[' and '{'. | |
| 440 self.allow_simple_key = True | |
| 441 | |
| 442 # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START. | |
| 443 start_mark = self.get_mark() | |
| 444 self.forward() | |
| 445 end_mark = self.get_mark() | |
| 446 self.tokens.append(TokenClass(start_mark, end_mark)) | |
| 447 | |
| 448 def fetch_flow_sequence_end(self): | |
| 449 self.fetch_flow_collection_end(FlowSequenceEndToken) | |
| 450 | |
| 451 def fetch_flow_mapping_end(self): | |
| 452 self.fetch_flow_collection_end(FlowMappingEndToken) | |
| 453 | |
| 454 def fetch_flow_collection_end(self, TokenClass): | |
| 455 | |
| 456 # Reset possible simple key on the current level. | |
| 457 self.remove_possible_simple_key() | |
| 458 | |
| 459 # Decrease the flow level. | |
| 460 self.flow_level -= 1 | |
| 461 | |
| 462 # No simple keys after ']' or '}'. | |
| 463 self.allow_simple_key = False | |
| 464 | |
| 465 # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END. | |
| 466 start_mark = self.get_mark() | |
| 467 self.forward() | |
| 468 end_mark = self.get_mark() | |
| 469 self.tokens.append(TokenClass(start_mark, end_mark)) | |
| 470 | |
| 471 def fetch_flow_entry(self): | |
| 472 | |
| 473 # Simple keys are allowed after ','. | |
| 474 self.allow_simple_key = True | |
| 475 | |
| 476 # Reset possible simple key on the current level. | |
| 477 self.remove_possible_simple_key() | |
| 478 | |
| 479 # Add FLOW-ENTRY. | |
| 480 start_mark = self.get_mark() | |
| 481 self.forward() | |
| 482 end_mark = self.get_mark() | |
| 483 self.tokens.append(FlowEntryToken(start_mark, end_mark)) | |
| 484 | |
| 485 def fetch_block_entry(self): | |
| 486 | |
| 487 # Block context needs additional checks. | |
| 488 if not self.flow_level: | |
| 489 | |
| 490 # Are we allowed to start a new entry? | |
| 491 if not self.allow_simple_key: | |
| 492 raise ScannerError(None, None, | |
| 493 "sequence entries are not allowed here", | |
| 494 self.get_mark()) | |
| 495 | |
| 496 # We may need to add BLOCK-SEQUENCE-START. | |
| 497 if self.add_indent(self.column): | |
| 498 mark = self.get_mark() | |
| 499 self.tokens.append(BlockSequenceStartToken(mark, mark)) | |
| 500 | |
| 501 # It's an error for the block entry to occur in the flow context, | |
| 502 # but we let the parser detect this. | |
| 503 else: | |
| 504 pass | |
| 505 | |
| 506 # Simple keys are allowed after '-'. | |
| 507 self.allow_simple_key = True | |
| 508 | |
| 509 # Reset possible simple key on the current level. | |
| 510 self.remove_possible_simple_key() | |
| 511 | |
| 512 # Add BLOCK-ENTRY. | |
| 513 start_mark = self.get_mark() | |
| 514 self.forward() | |
| 515 end_mark = self.get_mark() | |
| 516 self.tokens.append(BlockEntryToken(start_mark, end_mark)) | |
| 517 | |
| 518 def fetch_key(self): | |
| 519 | |
| 520 # Block context needs additional checks. | |
| 521 if not self.flow_level: | |
| 522 | |
| 523 # Are we allowed to start a key (not nessesary a simple)? | |
| 524 if not self.allow_simple_key: | |
| 525 raise ScannerError(None, None, | |
| 526 "mapping keys are not allowed here", | |
| 527 self.get_mark()) | |
| 528 | |
| 529 # We may need to add BLOCK-MAPPING-START. | |
| 530 if self.add_indent(self.column): | |
| 531 mark = self.get_mark() | |
| 532 self.tokens.append(BlockMappingStartToken(mark, mark)) | |
| 533 | |
| 534 # Simple keys are allowed after '?' in the block context. | |
| 535 self.allow_simple_key = not self.flow_level | |
| 536 | |
| 537 # Reset possible simple key on the current level. | |
| 538 self.remove_possible_simple_key() | |
| 539 | |
| 540 # Add KEY. | |
| 541 start_mark = self.get_mark() | |
| 542 self.forward() | |
| 543 end_mark = self.get_mark() | |
| 544 self.tokens.append(KeyToken(start_mark, end_mark)) | |
| 545 | |
| 546 def fetch_value(self): | |
| 547 | |
| 548 # Do we determine a simple key? | |
| 549 if self.flow_level in self.possible_simple_keys: | |
| 550 | |
| 551 # Add KEY. | |
| 552 key = self.possible_simple_keys[self.flow_level] | |
| 553 del self.possible_simple_keys[self.flow_level] | |
| 554 self.tokens.insert(key.token_number-self.tokens_taken, | |
| 555 KeyToken(key.mark, key.mark)) | |
| 556 | |
| 557 # If this key starts a new block mapping, we need to add | |
| 558 # BLOCK-MAPPING-START. | |
| 559 if not self.flow_level: | |
| 560 if self.add_indent(key.column): | |
| 561 self.tokens.insert(key.token_number-self.tokens_taken, | |
| 562 BlockMappingStartToken(key.mark, key.mark)) | |
| 563 | |
| 564 # There cannot be two simple keys one after another. | |
| 565 self.allow_simple_key = False | |
| 566 | |
| 567 # It must be a part of a complex key. | |
| 568 else: | |
| 569 | |
| 570 # Block context needs additional checks. | |
| 571 # (Do we really need them? They will be catched by the parser | |
| 572 # anyway.) | |
| 573 if not self.flow_level: | |
| 574 | |
| 575 # We are allowed to start a complex value if and only if | |
| 576 # we can start a simple key. | |
| 577 if not self.allow_simple_key: | |
| 578 raise ScannerError(None, None, | |
| 579 "mapping values are not allowed here", | |
| 580 self.get_mark()) | |
| 581 | |
| 582 # If this value starts a new block mapping, we need to add | |
| 583 # BLOCK-MAPPING-START. It will be detected as an error later by | |
| 584 # the parser. | |
| 585 if not self.flow_level: | |
| 586 if self.add_indent(self.column): | |
| 587 mark = self.get_mark() | |
| 588 self.tokens.append(BlockMappingStartToken(mark, mark)) | |
| 589 | |
| 590 # Simple keys are allowed after ':' in the block context. | |
| 591 self.allow_simple_key = not self.flow_level | |
| 592 | |
| 593 # Reset possible simple key on the current level. | |
| 594 self.remove_possible_simple_key() | |
| 595 | |
| 596 # Add VALUE. | |
| 597 start_mark = self.get_mark() | |
| 598 self.forward() | |
| 599 end_mark = self.get_mark() | |
| 600 self.tokens.append(ValueToken(start_mark, end_mark)) | |
| 601 | |
| 602 def fetch_alias(self): | |
| 603 | |
| 604 # ALIAS could be a simple key. | |
| 605 self.save_possible_simple_key() | |
| 606 | |
| 607 # No simple keys after ALIAS. | |
| 608 self.allow_simple_key = False | |
| 609 | |
| 610 # Scan and add ALIAS. | |
| 611 self.tokens.append(self.scan_anchor(AliasToken)) | |
| 612 | |
| 613 def fetch_anchor(self): | |
| 614 | |
| 615 # ANCHOR could start a simple key. | |
| 616 self.save_possible_simple_key() | |
| 617 | |
| 618 # No simple keys after ANCHOR. | |
| 619 self.allow_simple_key = False | |
| 620 | |
| 621 # Scan and add ANCHOR. | |
| 622 self.tokens.append(self.scan_anchor(AnchorToken)) | |
| 623 | |
| 624 def fetch_tag(self): | |
| 625 | |
| 626 # TAG could start a simple key. | |
| 627 self.save_possible_simple_key() | |
| 628 | |
| 629 # No simple keys after TAG. | |
| 630 self.allow_simple_key = False | |
| 631 | |
| 632 # Scan and add TAG. | |
| 633 self.tokens.append(self.scan_tag()) | |
| 634 | |
| 635 def fetch_literal(self): | |
| 636 self.fetch_block_scalar(style='|') | |
| 637 | |
| 638 def fetch_folded(self): | |
| 639 self.fetch_block_scalar(style='>') | |
| 640 | |
| 641 def fetch_block_scalar(self, style): | |
| 642 | |
| 643 # A simple key may follow a block scalar. | |
| 644 self.allow_simple_key = True | |
| 645 | |
| 646 # Reset possible simple key on the current level. | |
| 647 self.remove_possible_simple_key() | |
| 648 | |
| 649 # Scan and add SCALAR. | |
| 650 self.tokens.append(self.scan_block_scalar(style)) | |
| 651 | |
| 652 def fetch_single(self): | |
| 653 self.fetch_flow_scalar(style='\'') | |
| 654 | |
| 655 def fetch_double(self): | |
| 656 self.fetch_flow_scalar(style='"') | |
| 657 | |
| 658 def fetch_flow_scalar(self, style): | |
| 659 | |
| 660 # A flow scalar could be a simple key. | |
| 661 self.save_possible_simple_key() | |
| 662 | |
| 663 # No simple keys after flow scalars. | |
| 664 self.allow_simple_key = False | |
| 665 | |
| 666 # Scan and add SCALAR. | |
| 667 self.tokens.append(self.scan_flow_scalar(style)) | |
| 668 | |
| 669 def fetch_plain(self): | |
| 670 | |
| 671 # A plain scalar could be a simple key. | |
| 672 self.save_possible_simple_key() | |
| 673 | |
| 674 # No simple keys after plain scalars. But note that `scan_plain` will | |
| 675 # change this flag if the scan is finished at the beginning of the | |
| 676 # line. | |
| 677 self.allow_simple_key = False | |
| 678 | |
| 679 # Scan and add SCALAR. May change `allow_simple_key`. | |
| 680 self.tokens.append(self.scan_plain()) | |
| 681 | |
| 682 # Checkers. | |
| 683 | |
| 684 def check_directive(self): | |
| 685 | |
| 686 # DIRECTIVE: ^ '%' ... | |
| 687 # The '%' indicator is already checked. | |
| 688 if self.column == 0: | |
| 689 return True | |
| 690 | |
| 691 def check_document_start(self): | |
| 692 | |
| 693 # DOCUMENT-START: ^ '---' (' '|'\n') | |
| 694 if self.column == 0: | |
| 695 if self.prefix(3) == u'---' \ | |
| 696 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': | |
| 697 return True | |
| 698 | |
| 699 def check_document_end(self): | |
| 700 | |
| 701 # DOCUMENT-END: ^ '...' (' '|'\n') | |
| 702 if self.column == 0: | |
| 703 if self.prefix(3) == u'...' \ | |
| 704 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': | |
| 705 return True | |
| 706 | |
| 707 def check_block_entry(self): | |
| 708 | |
| 709 # BLOCK-ENTRY: '-' (' '|'\n') | |
| 710 return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029' | |
| 711 | |
| 712 def check_key(self): | |
| 713 | |
| 714 # KEY(flow context): '?' | |
| 715 if self.flow_level: | |
| 716 return True | |
| 717 | |
| 718 # KEY(block context): '?' (' '|'\n') | |
| 719 else: | |
| 720 return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029' | |
| 721 | |
| 722 def check_value(self): | |
| 723 | |
| 724 # VALUE(flow context): ':' | |
| 725 if self.flow_level: | |
| 726 return True | |
| 727 | |
| 728 # VALUE(block context): ':' (' '|'\n') | |
| 729 else: | |
| 730 return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029' | |
| 731 | |
| 732 def check_plain(self): | |
| 733 | |
| 734 # A plain scalar may start with any non-space character except: | |
| 735 # '-', '?', ':', ',', '[', ']', '{', '}', | |
| 736 # '#', '&', '*', '!', '|', '>', '\'', '\"', | |
| 737 # '%', '@', '`'. | |
| 738 # | |
| 739 # It may also start with | |
| 740 # '-', '?', ':' | |
| 741 # if it is followed by a non-space character. | |
| 742 # | |
| 743 # Note that we limit the last rule to the block context (except the | |
| 744 # '-' character) because we want the flow context to be space | |
| 745 # independent. | |
| 746 ch = self.peek() | |
| 747 return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \ | |
| 748 or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029' | |
| 749 and (ch == u'-' or (not self.flow_level and ch in u'?:'))) | |
| 750 | |
| 751 # Scanners. | |
| 752 | |
| 753 def scan_to_next_token(self): | |
| 754 # We ignore spaces, line breaks and comments. | |
| 755 # If we find a line break in the block context, we set the flag | |
| 756 # `allow_simple_key` on. | |
| 757 # The byte order mark is stripped if it's the first character in the | |
| 758 # stream. We do not yet support BOM inside the stream as the | |
| 759 # specification requires. Any such mark will be considered as a part | |
| 760 # of the document. | |
| 761 # | |
| 762 # TODO: We need to make tab handling rules more sane. A good rule is | |
| 763 # Tabs cannot precede tokens | |
| 764 # BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, | |
| 765 # KEY(block), VALUE(block), BLOCK-ENTRY | |
| 766 # So the checking code is | |
| 767 # if <TAB>: | |
| 768 # self.allow_simple_keys = False | |
| 769 # We also need to add the check for `allow_simple_keys == True` to | |
| 770 # `unwind_indent` before issuing BLOCK-END. | |
| 771 # Scanners for block, flow, and plain scalars need to be modified. | |
| 772 | |
| 773 if self.index == 0 and self.peek() == u'\uFEFF': | |
| 774 self.forward() | |
| 775 found = False | |
| 776 while not found: | |
| 777 while self.peek() == u' ': | |
| 778 self.forward() | |
| 779 if self.peek() == u'#': | |
| 780 while self.peek() not in u'\0\r\n\x85\u2028\u2029': | |
| 781 self.forward() | |
| 782 if self.scan_line_break(): | |
| 783 if not self.flow_level: | |
| 784 self.allow_simple_key = True | |
| 785 else: | |
| 786 found = True | |
| 787 | |
| 788 def scan_directive(self): | |
| 789 # See the specification for details. | |
| 790 start_mark = self.get_mark() | |
| 791 self.forward() | |
| 792 name = self.scan_directive_name(start_mark) | |
| 793 value = None | |
| 794 if name == u'YAML': | |
| 795 value = self.scan_yaml_directive_value(start_mark) | |
| 796 end_mark = self.get_mark() | |
| 797 elif name == u'TAG': | |
| 798 value = self.scan_tag_directive_value(start_mark) | |
| 799 end_mark = self.get_mark() | |
| 800 else: | |
| 801 end_mark = self.get_mark() | |
| 802 while self.peek() not in u'\0\r\n\x85\u2028\u2029': | |
| 803 self.forward() | |
| 804 self.scan_directive_ignored_line(start_mark) | |
| 805 return DirectiveToken(name, value, start_mark, end_mark) | |
| 806 | |
| 807 def scan_directive_name(self, start_mark): | |
| 808 # See the specification for details. | |
| 809 length = 0 | |
| 810 ch = self.peek(length) | |
| 811 while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \ | |
| 812 or ch in u'-_': | |
| 813 length += 1 | |
| 814 ch = self.peek(length) | |
| 815 if not length: | |
| 816 raise ScannerError("while scanning a directive", start_mark, | |
| 817 "expected alphabetic or numeric character, but found %r" | |
| 818 % ch.encode('utf-8'), self.get_mark()) | |
| 819 value = self.prefix(length) | |
| 820 self.forward(length) | |
| 821 ch = self.peek() | |
| 822 if ch not in u'\0 \r\n\x85\u2028\u2029': | |
| 823 raise ScannerError("while scanning a directive", start_mark, | |
| 824 "expected alphabetic or numeric character, but found %r" | |
| 825 % ch.encode('utf-8'), self.get_mark()) | |
| 826 return value | |
| 827 | |
| 828 def scan_yaml_directive_value(self, start_mark): | |
| 829 # See the specification for details. | |
| 830 while self.peek() == u' ': | |
| 831 self.forward() | |
| 832 major = self.scan_yaml_directive_number(start_mark) | |
| 833 if self.peek() != '.': | |
| 834 raise ScannerError("while scanning a directive", start_mark, | |
| 835 "expected a digit or '.', but found %r" | |
| 836 % self.peek().encode('utf-8'), | |
| 837 self.get_mark()) | |
| 838 self.forward() | |
| 839 minor = self.scan_yaml_directive_number(start_mark) | |
| 840 if self.peek() not in u'\0 \r\n\x85\u2028\u2029': | |
| 841 raise ScannerError("while scanning a directive", start_mark, | |
| 842 "expected a digit or ' ', but found %r" | |
| 843 % self.peek().encode('utf-8'), | |
| 844 self.get_mark()) | |
| 845 return (major, minor) | |
| 846 | |
| 847 def scan_yaml_directive_number(self, start_mark): | |
| 848 # See the specification for details. | |
| 849 ch = self.peek() | |
| 850 if not (u'0' <= ch <= u'9'): | |
| 851 raise ScannerError("while scanning a directive", start_mark, | |
| 852 "expected a digit, but found %r" % ch.encode('utf-8'), | |
| 853 self.get_mark()) | |
| 854 length = 0 | |
| 855 while u'0' <= self.peek(length) <= u'9': | |
| 856 length += 1 | |
| 857 value = int(self.prefix(length)) | |
| 858 self.forward(length) | |
| 859 return value | |
| 860 | |
| 861 def scan_tag_directive_value(self, start_mark): | |
| 862 # See the specification for details. | |
| 863 while self.peek() == u' ': | |
| 864 self.forward() | |
| 865 handle = self.scan_tag_directive_handle(start_mark) | |
| 866 while self.peek() == u' ': | |
| 867 self.forward() | |
| 868 prefix = self.scan_tag_directive_prefix(start_mark) | |
| 869 return (handle, prefix) | |
| 870 | |
| 871 def scan_tag_directive_handle(self, start_mark): | |
| 872 # See the specification for details. | |
| 873 value = self.scan_tag_handle('directive', start_mark) | |
| 874 ch = self.peek() | |
| 875 if ch != u' ': | |
| 876 raise ScannerError("while scanning a directive", start_mark, | |
| 877 "expected ' ', but found %r" % ch.encode('utf-8'), | |
| 878 self.get_mark()) | |
| 879 return value | |
| 880 | |
| 881 def scan_tag_directive_prefix(self, start_mark): | |
| 882 # See the specification for details. | |
| 883 value = self.scan_tag_uri('directive', start_mark) | |
| 884 ch = self.peek() | |
| 885 if ch not in u'\0 \r\n\x85\u2028\u2029': | |
| 886 raise ScannerError("while scanning a directive", start_mark, | |
| 887 "expected ' ', but found %r" % ch.encode('utf-8'), | |
| 888 self.get_mark()) | |
| 889 return value | |
| 890 | |
| 891 def scan_directive_ignored_line(self, start_mark): | |
| 892 # See the specification for details. | |
| 893 while self.peek() == u' ': | |
| 894 self.forward() | |
| 895 if self.peek() == u'#': | |
| 896 while self.peek() not in u'\0\r\n\x85\u2028\u2029': | |
| 897 self.forward() | |
| 898 ch = self.peek() | |
| 899 if ch not in u'\0\r\n\x85\u2028\u2029': | |
| 900 raise ScannerError("while scanning a directive", start_mark, | |
| 901 "expected a comment or a line break, but found %r" | |
| 902 % ch.encode('utf-8'), self.get_mark()) | |
| 903 self.scan_line_break() | |
| 904 | |
| 905 def scan_anchor(self, TokenClass): | |
| 906 # The specification does not restrict characters for anchors and | |
| 907 # aliases. This may lead to problems, for instance, the document: | |
| 908 # [ *alias, value ] | |
| 909 # can be interpteted in two ways, as | |
| 910 # [ "value" ] | |
| 911 # and | |
| 912 # [ *alias , "value" ] | |
| 913 # Therefore we restrict aliases to numbers and ASCII letters. | |
| 914 start_mark = self.get_mark() | |
| 915 indicator = self.peek() | |
| 916 if indicator == u'*': | |
| 917 name = 'alias' | |
| 918 else: | |
| 919 name = 'anchor' | |
| 920 self.forward() | |
| 921 length = 0 | |
| 922 ch = self.peek(length) | |
| 923 while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \ | |
| 924 or ch in u'-_': | |
| 925 length += 1 | |
| 926 ch = self.peek(length) | |
| 927 if not length: | |
| 928 raise ScannerError("while scanning an %s" % name, start_mark, | |
| 929 "expected alphabetic or numeric character, but found %r" | |
| 930 % ch.encode('utf-8'), self.get_mark()) | |
| 931 value = self.prefix(length) | |
| 932 self.forward(length) | |
| 933 ch = self.peek() | |
| 934 if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`': | |
| 935 raise ScannerError("while scanning an %s" % name, start_mark, | |
| 936 "expected alphabetic or numeric character, but found %r" | |
| 937 % ch.encode('utf-8'), self.get_mark()) | |
| 938 end_mark = self.get_mark() | |
| 939 return TokenClass(value, start_mark, end_mark) | |
| 940 | |
| 941 def scan_tag(self): | |
| 942 # See the specification for details. | |
| 943 start_mark = self.get_mark() | |
| 944 ch = self.peek(1) | |
| 945 if ch == u'<': | |
| 946 handle = None | |
| 947 self.forward(2) | |
| 948 suffix = self.scan_tag_uri('tag', start_mark) | |
| 949 if self.peek() != u'>': | |
| 950 raise ScannerError("while parsing a tag", start_mark, | |
| 951 "expected '>', but found %r" % self.peek().encode('utf-8'), | |
| 952 self.get_mark()) | |
| 953 self.forward() | |
| 954 elif ch in u'\0 \t\r\n\x85\u2028\u2029': | |
| 955 handle = None | |
| 956 suffix = u'!' | |
| 957 self.forward() | |
| 958 else: | |
| 959 length = 1 | |
| 960 use_handle = False | |
| 961 while ch not in u'\0 \r\n\x85\u2028\u2029': | |
| 962 if ch == u'!': | |
| 963 use_handle = True | |
| 964 break | |
| 965 length += 1 | |
| 966 ch = self.peek(length) | |
| 967 handle = u'!' | |
| 968 if use_handle: | |
| 969 handle = self.scan_tag_handle('tag', start_mark) | |
| 970 else: | |
| 971 handle = u'!' | |
| 972 self.forward() | |
| 973 suffix = self.scan_tag_uri('tag', start_mark) | |
| 974 ch = self.peek() | |
| 975 if ch not in u'\0 \r\n\x85\u2028\u2029': | |
| 976 raise ScannerError("while scanning a tag", start_mark, | |
| 977 "expected ' ', but found %r" % ch.encode('utf-8'), | |
| 978 self.get_mark()) | |
| 979 value = (handle, suffix) | |
| 980 end_mark = self.get_mark() | |
| 981 return TagToken(value, start_mark, end_mark) | |
| 982 | |
    def scan_block_scalar(self, style):
        # Scan a literal ('|') or folded ('>') block scalar.
        # See the specification for details.

        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: deduce the indent from
            # the deepest leading-blank column before the first content.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != u'\0':
            chunks.extend(breaks)
            # Folding joins two lines only when both start with a
            # non-space character; remember the state of this line.
            leading_non_space = self.peek() not in u' \t'
            length = 0
            while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != u'\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n' \
                        and leading_non_space and self.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #    else:
                #        chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        # chomping None ('clip'): keep the final line break, drop trailing
        # empty lines; True ('+'): keep everything; False ('-'): drop all.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
| 1058 | |
| 1059 def scan_block_scalar_indicators(self, start_mark): | |
| 1060 # See the specification for details. | |
| 1061 chomping = None | |
| 1062 increment = None | |
| 1063 ch = self.peek() | |
| 1064 if ch in u'+-': | |
| 1065 if ch == '+': | |
| 1066 chomping = True | |
| 1067 else: | |
| 1068 chomping = False | |
| 1069 self.forward() | |
| 1070 ch = self.peek() | |
| 1071 if ch in u'0123456789': | |
| 1072 increment = int(ch) | |
| 1073 if increment == 0: | |
| 1074 raise ScannerError("while scanning a block scalar", start_mark, | |
| 1075 "expected indentation indicator in the range 1-9, but found 0", | |
| 1076 self.get_mark()) | |
| 1077 self.forward() | |
| 1078 elif ch in u'0123456789': | |
| 1079 increment = int(ch) | |
| 1080 if increment == 0: | |
| 1081 raise ScannerError("while scanning a block scalar", start_mark, | |
| 1082 "expected indentation indicator in the range 1-9, but found 0", | |
| 1083 self.get_mark()) | |
| 1084 self.forward() | |
| 1085 ch = self.peek() | |
| 1086 if ch in u'+-': | |
| 1087 if ch == '+': | |
| 1088 chomping = True | |
| 1089 else: | |
| 1090 chomping = False | |
| 1091 self.forward() | |
| 1092 ch = self.peek() | |
| 1093 if ch not in u'\0 \r\n\x85\u2028\u2029': | |
| 1094 raise ScannerError("while scanning a block scalar", start_mark, | |
| 1095 "expected chomping or indentation indicators, but found %r" | |
| 1096 % ch.encode('utf-8'), self.get_mark()) | |
| 1097 return chomping, increment | |
| 1098 | |
| 1099 def scan_block_scalar_ignored_line(self, start_mark): | |
| 1100 # See the specification for details. | |
| 1101 while self.peek() == u' ': | |
| 1102 self.forward() | |
| 1103 if self.peek() == u'#': | |
| 1104 while self.peek() not in u'\0\r\n\x85\u2028\u2029': | |
| 1105 self.forward() | |
| 1106 ch = self.peek() | |
| 1107 if ch not in u'\0\r\n\x85\u2028\u2029': | |
| 1108 raise ScannerError("while scanning a block scalar", start_mark, | |
| 1109 "expected a comment or a line break, but found %r" | |
| 1110 % ch.encode('utf-8'), self.get_mark()) | |
| 1111 self.scan_line_break() | |
| 1112 | |
| 1113 def scan_block_scalar_indentation(self): | |
| 1114 # See the specification for details. | |
| 1115 chunks = [] | |
| 1116 max_indent = 0 | |
| 1117 end_mark = self.get_mark() | |
| 1118 while self.peek() in u' \r\n\x85\u2028\u2029': | |
| 1119 if self.peek() != u' ': | |
| 1120 chunks.append(self.scan_line_break()) | |
| 1121 end_mark = self.get_mark() | |
| 1122 else: | |
| 1123 self.forward() | |
| 1124 if self.column > max_indent: | |
| 1125 max_indent = self.column | |
| 1126 return chunks, max_indent, end_mark | |
| 1127 | |
| 1128 def scan_block_scalar_breaks(self, indent): | |
| 1129 # See the specification for details. | |
| 1130 chunks = [] | |
| 1131 end_mark = self.get_mark() | |
| 1132 while self.column < indent and self.peek() == u' ': | |
| 1133 self.forward() | |
| 1134 while self.peek() in u'\r\n\x85\u2028\u2029': | |
| 1135 chunks.append(self.scan_line_break()) | |
| 1136 end_mark = self.get_mark() | |
| 1137 while self.column < indent and self.peek() == u' ': | |
| 1138 self.forward() | |
| 1139 return chunks, end_mark | |
| 1140 | |
| 1141 def scan_flow_scalar(self, style): | |
| 1142 # See the specification for details. | |
| 1143 # Note that we loose indentation rules for quoted scalars. Quoted | |
| 1144 # scalars don't need to adhere indentation because " and ' clearly | |
| 1145 # mark the beginning and the end of them. Therefore we are less | |
| 1146 # restrictive then the specification requires. We only need to check | |
| 1147 # that document separators are not included in scalars. | |
| 1148 if style == '"': | |
| 1149 double = True | |
| 1150 else: | |
| 1151 double = False | |
| 1152 chunks = [] | |
| 1153 start_mark = self.get_mark() | |
| 1154 quote = self.peek() | |
| 1155 self.forward() | |
| 1156 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) | |
| 1157 while self.peek() != quote: | |
| 1158 chunks.extend(self.scan_flow_scalar_spaces(double, start_mark)) | |
| 1159 chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark)) | |
| 1160 self.forward() | |
| 1161 end_mark = self.get_mark() | |
| 1162 return ScalarToken(u''.join(chunks), False, start_mark, end_mark, | |
| 1163 style) | |
| 1164 | |
    # Map from the character following '\' in a double-quoted scalar to
    # the character the escape sequence denotes.
    ESCAPE_REPLACEMENTS = {
        u'0':   u'\0',
        u'a':   u'\x07',
        u'b':   u'\x08',
        u't':   u'\x09',
        u'\t':  u'\x09',
        u'n':   u'\x0A',
        u'v':   u'\x0B',
        u'f':   u'\x0C',
        u'r':   u'\x0D',
        u'e':   u'\x1B',
        u' ':   u'\x20',
        u'\"':  u'\"',
        u'\\':  u'\\',
        u'N':   u'\x85',
        u'_':   u'\xA0',
        u'L':   u'\u2028',
        u'P':   u'\u2029',
    }

    # Map from a numeric-escape introducer to the number of hexadecimal
    # digits that follow it: \xXX, \uXXXX, \UXXXXXXXX.
    ESCAPE_CODES = {
        u'x':   2,
        u'u':   4,
        u'U':   8,
    }
| 1190 | |
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # Scan a run of non-blank characters inside a quoted scalar,
        # resolving ''-quoting (single-quoted) and \-escapes
        # (double-quoted).  See the specification for details.
        chunks = []
        while True:
            # Take the longest run of ordinary characters in one slice.
            length = 0
            while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == u'\'' and self.peek(1) == u'\'':
                # '' inside a single-quoted scalar is an escaped quote.
                chunks.append(u'\'')
                self.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # A quote/backslash that is literal in this quoting style.
                chunks.append(ch)
                self.forward()
            elif double and ch == u'\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape, e.g. \n or \t.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.peek(k).encode('utf-8')), self.get_mark())
                    code = int(self.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is removed, together with any
                    # following blank-line breaks.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
            else:
                # A blank, a line break, the closing quote, or the end of
                # the stream: let the caller handle it.
                return chunks
| 1233 | |
| 1234 def scan_flow_scalar_spaces(self, double, start_mark): | |
| 1235 # See the specification for details. | |
| 1236 chunks = [] | |
| 1237 length = 0 | |
| 1238 while self.peek(length) in u' \t': | |
| 1239 length += 1 | |
| 1240 whitespaces = self.prefix(length) | |
| 1241 self.forward(length) | |
| 1242 ch = self.peek() | |
| 1243 if ch == u'\0': | |
| 1244 raise ScannerError("while scanning a quoted scalar", start_mark, | |
| 1245 "found unexpected end of stream", self.get_mark()) | |
| 1246 elif ch in u'\r\n\x85\u2028\u2029': | |
| 1247 line_break = self.scan_line_break() | |
| 1248 breaks = self.scan_flow_scalar_breaks(double, start_mark) | |
| 1249 if line_break != u'\n': | |
| 1250 chunks.append(line_break) | |
| 1251 elif not breaks: | |
| 1252 chunks.append(u' ') | |
| 1253 chunks.extend(breaks) | |
| 1254 else: | |
| 1255 chunks.append(whitespaces) | |
| 1256 return chunks | |
| 1257 | |
| 1258 def scan_flow_scalar_breaks(self, double, start_mark): | |
| 1259 # See the specification for details. | |
| 1260 chunks = [] | |
| 1261 while True: | |
| 1262 # Instead of checking indentation, we check for document | |
| 1263 # separators. | |
| 1264 prefix = self.prefix(3) | |
| 1265 if (prefix == u'---' or prefix == u'...') \ | |
| 1266 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': | |
| 1267 raise ScannerError("while scanning a quoted scalar", start_mark, | |
| 1268 "found unexpected document separator", self.get_mark()) | |
| 1269 while self.peek() in u' \t': | |
| 1270 self.forward() | |
| 1271 if self.peek() in u'\r\n\x85\u2028\u2029': | |
| 1272 chunks.append(self.scan_line_break()) | |
| 1273 else: | |
| 1274 return chunks | |
| 1275 | |
    def scan_plain(self):
        # Scan a plain (unquoted) scalar.
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',', ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are loosed for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            if self.peek() == u'#':
                # A comment ends the scalar.
                break
            while True:
                ch = self.peek(length)
                # A chunk ends at a blank/line break, at ': ' in block
                # context, or at a flow indicator in flow context.
                if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                        or (not self.flow_level and ch == u':' and
                            self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in u',:?[]{}'):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (self.flow_level and ch == u':'
                    and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
                self.forward(length)
                raise ScannerError("while scanning a plain scalar", start_mark,
                    "found unexpected ':'", self.get_mark(),
                    "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            # Stop at a document separator (spaces is None), a comment,
            # or a dedent below the scalar's indent in block context.
            if not spaces or self.peek() == u'#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
| 1322 | |
| 1323 def scan_plain_spaces(self, indent, start_mark): | |
| 1324 # See the specification for details. | |
| 1325 # The specification is really confusing about tabs in plain scalars. | |
| 1326 # We just forbid them completely. Do not use tabs in YAML! | |
| 1327 chunks = [] | |
| 1328 length = 0 | |
| 1329 while self.peek(length) in u' ': | |
| 1330 length += 1 | |
| 1331 whitespaces = self.prefix(length) | |
| 1332 self.forward(length) | |
| 1333 ch = self.peek() | |
| 1334 if ch in u'\r\n\x85\u2028\u2029': | |
| 1335 line_break = self.scan_line_break() | |
| 1336 self.allow_simple_key = True | |
| 1337 prefix = self.prefix(3) | |
| 1338 if (prefix == u'---' or prefix == u'...') \ | |
| 1339 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': | |
| 1340 return | |
| 1341 breaks = [] | |
| 1342 while self.peek() in u' \r\n\x85\u2028\u2029': | |
| 1343 if self.peek() == ' ': | |
| 1344 self.forward() | |
| 1345 else: | |
| 1346 breaks.append(self.scan_line_break()) | |
| 1347 prefix = self.prefix(3) | |
| 1348 if (prefix == u'---' or prefix == u'...') \ | |
| 1349 and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': | |
| 1350 return | |
| 1351 if line_break != u'\n': | |
| 1352 chunks.append(line_break) | |
| 1353 elif not breaks: | |
| 1354 chunks.append(u' ') | |
| 1355 chunks.extend(breaks) | |
| 1356 elif whitespaces: | |
| 1357 chunks.append(whitespaces) | |
| 1358 return chunks | |
| 1359 | |
| 1360 def scan_tag_handle(self, name, start_mark): | |
| 1361 # See the specification for details. | |
| 1362 # For some strange reasons, the specification does not allow '_' in | |
| 1363 # tag handles. I have allowed it anyway. | |
| 1364 ch = self.peek() | |
| 1365 if ch != u'!': | |
| 1366 raise ScannerError("while scanning a %s" % name, start_mark, | |
| 1367 "expected '!', but found %r" % ch.encode('utf-8'), | |
| 1368 self.get_mark()) | |
| 1369 length = 1 | |
| 1370 ch = self.peek(length) | |
| 1371 if ch != u' ': | |
| 1372 while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \ | |
| 1373 or ch in u'-_': | |
| 1374 length += 1 | |
| 1375 ch = self.peek(length) | |
| 1376 if ch != u'!': | |
| 1377 self.forward(length) | |
| 1378 raise ScannerError("while scanning a %s" % name, start_mark, | |
| 1379 "expected '!', but found %r" % ch.encode('utf-8'), | |
| 1380 self.get_mark()) | |
| 1381 length += 1 | |
| 1382 value = self.prefix(length) | |
| 1383 self.forward(length) | |
| 1384 return value | |
| 1385 | |
| 1386 def scan_tag_uri(self, name, start_mark): | |
| 1387 # See the specification for details. | |
| 1388 # Note: we do not check if URI is well-formed. | |
| 1389 chunks = [] | |
| 1390 length = 0 | |
| 1391 ch = self.peek(length) | |
| 1392 while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \ | |
| 1393 or ch in u'-;/?:@&=+$,_.!~*\'()[]%': | |
| 1394 if ch == u'%': | |
| 1395 chunks.append(self.prefix(length)) | |
| 1396 self.forward(length) | |
| 1397 length = 0 | |
| 1398 chunks.append(self.scan_uri_escapes(name, start_mark)) | |
| 1399 else: | |
| 1400 length += 1 | |
| 1401 ch = self.peek(length) | |
| 1402 if length: | |
| 1403 chunks.append(self.prefix(length)) | |
| 1404 self.forward(length) | |
| 1405 length = 0 | |
| 1406 if not chunks: | |
| 1407 raise ScannerError("while parsing a %s" % name, start_mark, | |
| 1408 "expected URI, but found %r" % ch.encode('utf-8'), | |
| 1409 self.get_mark()) | |
| 1410 return u''.join(chunks) | |
| 1411 | |
| 1412 def scan_uri_escapes(self, name, start_mark): | |
| 1413 # See the specification for details. | |
| 1414 bytes = [] | |
| 1415 mark = self.get_mark() | |
| 1416 while self.peek() == u'%': | |
| 1417 self.forward() | |
| 1418 for k in range(2): | |
| 1419 if self.peek(k) not in u'0123456789ABCDEFabcdef': | |
| 1420 raise ScannerError("while scanning a %s" % name, start_mark, | |
| 1421 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" % | |
| 1422 (self.peek(k).encode('utf-8')), self.get_mark()) | |
| 1423 bytes.append(chr(int(self.prefix(2), 16))) | |
| 1424 self.forward(2) | |
| 1425 try: | |
| 1426 value = unicode(''.join(bytes), 'utf-8') | |
| 1427 except UnicodeDecodeError, exc: | |
| 1428 raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark) | |
| 1429 return value | |
| 1430 | |
| 1431 def scan_line_break(self): | |
| 1432 # Transforms: | |
| 1433 # '\r\n' : '\n' | |
| 1434 # '\r' : '\n' | |
| 1435 # '\n' : '\n' | |
| 1436 # '\x85' : '\n' | |
| 1437 # '\u2028' : '\u2028' | |
| 1438 # '\u2029 : '\u2029' | |
| 1439 # default : '' | |
| 1440 ch = self.peek() | |
| 1441 if ch in u'\r\n\x85': | |
| 1442 if self.prefix(2) == u'\r\n': | |
| 1443 self.forward(2) | |
| 1444 else: | |
| 1445 self.forward() | |
| 1446 return u'\n' | |
| 1447 elif ch in u'\u2028\u2029': | |
| 1448 self.forward() | |
| 1449 return ch | |
| 1450 return u'' | |
| 1451 | |
| 1452 #try: | |
| 1453 # import psyco | |
| 1454 # psyco.bind(Scanner) | |
| 1455 #except ImportError: | |
| 1456 # pass | |
| 1457 |
