sqlglot.tokenizer_core
from __future__ import annotations

import typing as t
from enum import IntEnum, auto

from sqlglot.errors import TokenError

# dict lookup is faster than .upper(), .isspace(), .isdigit()
_CHAR_UPPER: t.Dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}

_SPACE_CHARS: t.FrozenSet[str] = frozenset(
    "\t\n\r \x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0"
    "\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
    "\u2028\u2029\u202f\u205f\u3000"
)
_DIGIT_CHARS: t.FrozenSet[str] = frozenset("0123456789")


class TokenType(IntEnum):
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinel
    HIVE_TOKEN_STREAM = auto()

    def __str__(self) -> str:
        return f"TokenType.{self.name}"


class Token:
    # mypyc doesn't expose slots
    _attrs: t.ClassVar[t.Tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: t.Optional[t.List[str]] = None,
    ) -> None:
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = [] if comments is None else comments

    def __repr__(self) -> str:
        attributes = ", ".join(
            f"{k}: TokenType.{self.token_type.name}"
            if k == "token_type"
            else f"{k}: {getattr(self, k)}"
            for k in self._attrs
        )
        return f"<Token {attributes}>"


class TokenizerCore:
    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "bit_strings",
        "hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )

    def __init__(
        self,
        single_tokens: t.Dict[str, TokenType],
        keywords: t.Dict[str, TokenType],
        quotes: t.Dict[str, str],
        format_strings: t.Dict[str, t.Tuple[str, TokenType]],
        identifiers: t.Dict[str, str],
        comments: t.Dict[str, t.Optional[str]],
        string_escapes: t.Set[str],
        byte_string_escapes: t.Set[str],
        identifier_escapes: t.Set[str],
        escape_follow_chars: t.Set[str],
        commands: t.Set[TokenType],
        command_prefix_tokens: t.Set[TokenType],
        nested_comments: bool,
        hint_start: str,
        tokens_preceding_hint: t.Set[TokenType],
        bit_strings: t.List[t.Union[str, t.Tuple[str, str]]],
        hex_strings: t.List[t.Union[str, t.Tuple[str, str]]],
        numeric_literals: t.Dict[str, str],
        var_single_tokens: t.Set[str],
        string_escapes_allowed_in_raw_strings: bool,
        heredoc_tag_is_identifier: bool,
        heredoc_string_alternative: TokenType,
        keyword_trie: t.Dict,
        numbers_can_be_underscore_separated: bool,
        identifiers_can_start_with_digit: bool,
        unescaped_sequences: t.Dict[str, str],
    ) -> None:
        self.single_tokens = single_tokens
        self.keywords = keywords
        self.quotes = quotes
        self.format_strings = format_strings
        self.identifiers = identifiers
        self.comments = comments
        self.string_escapes = string_escapes
        self.byte_string_escapes = byte_string_escapes
        self.identifier_escapes = identifier_escapes
        self.escape_follow_chars = escape_follow_chars
        self.commands = commands
        self.command_prefix_tokens = command_prefix_tokens
        self.nested_comments = nested_comments
        self.hint_start = hint_start
        self.tokens_preceding_hint = tokens_preceding_hint
        self.bit_strings = bit_strings
        self.hex_strings = hex_strings
        self.numeric_literals = numeric_literals
        self.var_single_tokens = var_single_tokens
        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
        self.heredoc_string_alternative = heredoc_string_alternative
        self.keyword_trie = keyword_trie
        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
        self.unescaped_sequences = unescaped_sequences
        self.reset()

    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: t.List[str] = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, check_semicolon: bool = False) -> None:
        identifiers = self.identifiers
        space_chars = _SPACE_CHARS
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if self._char not in space_chars:
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line

        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        space_chars = _SPACE_CHARS
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char in space_chars

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()

    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        while True:
            if self._peek in _DIGIT_CHARS:
                self._advance()
            elif self._peek == "." and not decimal:
                if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER:
                    return self._add(TokenType.NUMBER)
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    return self._add(TokenType.NUMBER)
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while (
                    self._peek
                    and self._peek not in _SPACE_CHARS
                    and self._peek not in single_tokens
                ):
                    literal += self._peek
                    self._advance()

                token_type = keywords.get(numeric_literals.get(literal.upper(), ""))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                self._advance(-len(literal))
                return self._add(TokenType.NUMBER, number_text)
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        single_tokens = self.single_tokens

        while True:
            char = self._peek.strip()
            if char and char not in single_tokens:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        self._advance()
        text = self._extract_string(
            identifier_end, escapes=self.identifier_escapes | {identifier_end}
        )
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        var_single_tokens = self.var_single_tokens
        single_tokens = self.single_tokens

        while True:
            peek = self._peek
            if not peek or peek in _SPACE_CHARS:
                break
            if peek not in var_single_tokens and peek in single_tokens:
                break
            self._advance(alnum=True)

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
        )

    def _extract_string(
        self,
        delimiter: str,
        escapes: t.Optional[t.Set[str]] = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes

        while True:
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                current = self._current - 1
                self._advance(alnum=True)
                text += self.sql[current : self._current - 1]

        return text
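The lookup tables at the top of the module (_CHAR_UPPER, _SPACE_CHARS, _DIGIT_CHARS) back the leading comment that a dict or frozenset lookup beats calling .upper(), .isspace(), or .isdigit() per character in the hot scanning loops. A rough, illustrative way to check that claim with the stdlib timeit module (a sketch; absolute numbers vary by interpreter and machine):

import timeit

_CHAR_UPPER = {chr(i): chr(i).upper() for i in range(97, 123)}

# One dict lookup per character vs. one method call per character.
lookup = timeit.timeit("_CHAR_UPPER.get(c, c)", setup="c = 's'",
                       globals={"_CHAR_UPPER": _CHAR_UPPER}, number=1_000_000)
method = timeit.timeit("c.upper()", setup="c = 's'", number=1_000_000)
print(f"dict lookup: {lookup:.3f}s, str.upper(): {method:.3f}s")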
class TokenType(enum.IntEnum):
An enumeration of the token types produced by the tokenizer.
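For illustration, a minimal sketch of how the enum behaves, importing from the module documented here:

from sqlglot.tokenizer_core import TokenType

# IntEnum members hash and compare like ints, so they work in sets and dicts.
assert TokenType.SELECT in {TokenType.SELECT, TokenType.FROM}

# __str__ is overridden to a stable, readable form.
print(str(TokenType.SELECT))  # TokenType.SELECT
print(TokenType.SELECT.name)  # SELECT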
class Token:
Token(token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: Optional[List[str]] = None)
number(number: int) -> Token
Returns a NUMBER token with number as its text.
string(string: str) -> Token
Returns a STRING token with string as its text.
identifier(identifier: str) -> Token

Returns an IDENTIFIER token with identifier as its text.

var(var: str) -> Token

Returns a VAR token with var as its text.
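A small usage sketch of the convenience constructors (positions default to line 1, col 1, start 0, end 0):

from sqlglot.tokenizer_core import Token, TokenType

num = Token.number(25)
ident = Token.identifier("x")

assert num.token_type == TokenType.NUMBER
assert num.text == "25"

# __repr__ lists every slot in _attrs order.
print(repr(ident))
# <Token token_type: TokenType.IDENTIFIER, text: x, line: 1, col: 1, start: 0, end: 0, comments: []>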
class TokenizerCore:
525class TokenizerCore: 526 __slots__ = ( 527 "sql", 528 "size", 529 "tokens", 530 "_start", 531 "_current", 532 "_line", 533 "_col", 534 "_comments", 535 "_char", 536 "_end", 537 "_peek", 538 "_prev_token_line", 539 "single_tokens", 540 "keywords", 541 "quotes", 542 "format_strings", 543 "identifiers", 544 "comments", 545 "string_escapes", 546 "byte_string_escapes", 547 "identifier_escapes", 548 "escape_follow_chars", 549 "commands", 550 "command_prefix_tokens", 551 "nested_comments", 552 "hint_start", 553 "tokens_preceding_hint", 554 "bit_strings", 555 "hex_strings", 556 "numeric_literals", 557 "var_single_tokens", 558 "string_escapes_allowed_in_raw_strings", 559 "heredoc_tag_is_identifier", 560 "heredoc_string_alternative", 561 "keyword_trie", 562 "numbers_can_be_underscore_separated", 563 "identifiers_can_start_with_digit", 564 "unescaped_sequences", 565 ) 566 567 def __init__( 568 self, 569 single_tokens: t.Dict[str, TokenType], 570 keywords: t.Dict[str, TokenType], 571 quotes: t.Dict[str, str], 572 format_strings: t.Dict[str, t.Tuple[str, TokenType]], 573 identifiers: t.Dict[str, str], 574 comments: t.Dict[str, t.Optional[str]], 575 string_escapes: t.Set[str], 576 byte_string_escapes: t.Set[str], 577 identifier_escapes: t.Set[str], 578 escape_follow_chars: t.Set[str], 579 commands: t.Set[TokenType], 580 command_prefix_tokens: t.Set[TokenType], 581 nested_comments: bool, 582 hint_start: str, 583 tokens_preceding_hint: t.Set[TokenType], 584 bit_strings: t.List[t.Union[str, t.Tuple[str, str]]], 585 hex_strings: t.List[t.Union[str, t.Tuple[str, str]]], 586 numeric_literals: t.Dict[str, str], 587 var_single_tokens: t.Set[str], 588 string_escapes_allowed_in_raw_strings: bool, 589 heredoc_tag_is_identifier: bool, 590 heredoc_string_alternative: TokenType, 591 keyword_trie: t.Dict, 592 numbers_can_be_underscore_separated: bool, 593 identifiers_can_start_with_digit: bool, 594 unescaped_sequences: t.Dict[str, str], 595 ) -> None: 596 self.single_tokens = single_tokens 597 self.keywords = keywords 598 self.quotes = quotes 599 self.format_strings = format_strings 600 self.identifiers = identifiers 601 self.comments = comments 602 self.string_escapes = string_escapes 603 self.byte_string_escapes = byte_string_escapes 604 self.identifier_escapes = identifier_escapes 605 self.escape_follow_chars = escape_follow_chars 606 self.commands = commands 607 self.command_prefix_tokens = command_prefix_tokens 608 self.nested_comments = nested_comments 609 self.hint_start = hint_start 610 self.tokens_preceding_hint = tokens_preceding_hint 611 self.bit_strings = bit_strings 612 self.hex_strings = hex_strings 613 self.numeric_literals = numeric_literals 614 self.var_single_tokens = var_single_tokens 615 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 616 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 617 self.heredoc_string_alternative = heredoc_string_alternative 618 self.keyword_trie = keyword_trie 619 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 620 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 621 self.unescaped_sequences = unescaped_sequences 622 self.reset() 623 624 def reset(self) -> None: 625 self.sql = "" 626 self.size = 0 627 self.tokens: t.List[Token] = [] 628 self._start = 0 629 self._current = 0 630 self._line = 1 631 self._col = 0 632 self._comments: t.List[str] = [] 633 self._char = "" 634 self._end = False 635 self._peek = "" 636 self._prev_token_line = -1 637 638 def tokenize(self, sql: 
    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, check_semicolon: bool = False) -> None:
        identifiers = self.identifiers
        space_chars = _SPACE_CHARS
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if self._char not in space_chars:
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]
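
    # _add() emits a Token for the current scan window. If the token is one
    # of the dialect's `commands` and it is either the first token or follows
    # a command prefix token, everything up to the next semicolon is
    # re-scanned and collapsed into a single STRING token.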
    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line

        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        space_chars = _SPACE_CHARS
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char in space_chars

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()
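
    # Comment bodies are buffered in self._comments and attached to a token:
    # a comment on the same line as the previous token trails that token,
    # otherwise it leads the next one.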
    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        while True:
            if self._peek in _DIGIT_CHARS:
                self._advance()
            elif self._peek == "." and not decimal:
                if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER:
                    return self._add(TokenType.NUMBER)
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    return self._add(TokenType.NUMBER)
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while (
                    self._peek
                    and self._peek not in _SPACE_CHARS
                    and self._peek not in single_tokens
                ):
                    literal += self._peek
                    self._advance()

                token_type = keywords.get(numeric_literals.get(literal.upper(), ""))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                self._advance(-len(literal))
                return self._add(TokenType.NUMBER, number_text)
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        single_tokens = self.single_tokens

        while True:
            char = self._peek.strip()
            if char and char not in single_tokens:
                self._advance(alnum=True)
            else:
                break

        return self._text
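
    # _scan_string() handles both regular quoted strings and "format" strings
    # (bit, hex, byte, raw, heredoc, ...); bit and hex string bodies are
    # validated with int(text, base) before the token is emitted.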
    def _scan_string(self, start: str) -> bool:
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        self._advance()
        text = self._extract_string(
            identifier_end, escapes=self.identifier_escapes | {identifier_end}
        )
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        var_single_tokens = self.var_single_tokens
        single_tokens = self.single_tokens

        while True:
            peek = self._peek
            if not peek or peek in _SPACE_CHARS:
                break
            if peek not in var_single_tokens and peek in single_tokens:
                break
            self._advance(alnum=True)

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
        )
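
    # _extract_string() consumes input until `delimiter` is found, applying
    # escape characters and any dialect-level unescaped_sequences along the
    # way; with raise_unmatched=False it returns the text read so far instead
    # of raising on end of input.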
    def _extract_string(
        self,
        delimiter: str,
        escapes: t.Optional[t.Set[str]] = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes

        while True:
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                current = self._current - 1
                self._advance(alnum=True)
                text += self.sql[current : self._current - 1]

        return text
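
A minimal usage sketch follows. In sqlglot these lookup tables are normally assembled by a dialect-aware wrapper, so the tiny hand-built tables below, and the trie layout (inferred from _scan_keywords' use of the key 0 as a terminal marker), are illustrative assumptions rather than a real dialect configuration:

# Illustrative sketch only: these hand-rolled tables are assumptions, not
# the tables a real sqlglot dialect would supply.
from sqlglot.tokenizer_core import TokenType, TokenizerCore


def make_trie(words):
    # Trie layout inferred from _scan_keywords: nested dicts keyed by
    # upper-cased characters, with the key 0 marking a complete word.
    trie: dict = {}
    for word in words:
        node = trie
        for char in word:
            node = node.setdefault(char, {})
        node[0] = True
    return trie


tokenizer = TokenizerCore(
    single_tokens={
        ",": TokenType.COMMA,
        ";": TokenType.SEMICOLON,
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "*": TokenType.STAR,
        "-": TokenType.DASH,
        "/": TokenType.SLASH,
    },
    keywords={},  # empty: bare words fall back to TokenType.VAR in _scan_var
    quotes={"'": "'"},
    format_strings={},
    identifiers={'"': '"'},
    comments={"--": None, "/*": "*/"},  # None means "runs to end of line"
    string_escapes={"'"},
    byte_string_escapes={"'"},
    identifier_escapes={'"'},
    escape_follow_chars=set(),
    commands=set(),
    command_prefix_tokens=set(),
    nested_comments=True,
    hint_start="/*+",
    tokens_preceding_hint=set(),
    bit_strings=[],
    hex_strings=[],
    numeric_literals={},
    var_single_tokens=set(),
    string_escapes_allowed_in_raw_strings=True,
    heredoc_tag_is_identifier=False,
    heredoc_string_alternative=TokenType.VAR,
    # Quote and comment starters must be reachable through the trie, since
    # _scan_keywords is what dispatches to _scan_string/_scan_comment.
    keyword_trie=make_trie(["'", "--", "/*"]),
    numbers_can_be_underscore_separated=False,
    identifiers_can_start_with_digit=False,
    unescaped_sequences={},
)

for token in tokenizer.tokenize("SELECT 42, 'hi' -- trailing"):
    print(token.token_type.name, repr(token.text))
# VAR 'SELECT', NUMBER '42', COMMA ',', STRING 'hi'; the trailing comment is
# attached to the STRING token's comments list.

With keywords empty, "SELECT" is emitted as a plain VAR; a real dialect would map upper-cased keywords to their dedicated token types in the keywords dict.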
TokenizerCore(
    single_tokens: Dict[str, TokenType],
    keywords: Dict[str, TokenType],
    quotes: Dict[str, str],
    format_strings: Dict[str, Tuple[str, TokenType]],
    identifiers: Dict[str, str],
    comments: Dict[str, Optional[str]],
    string_escapes: Set[str],
    byte_string_escapes: Set[str],
    identifier_escapes: Set[str],
    escape_follow_chars: Set[str],
    commands: Set[TokenType],
    command_prefix_tokens: Set[TokenType],
    nested_comments: bool,
    hint_start: str,
    tokens_preceding_hint: Set[TokenType],
    bit_strings: List[Union[str, Tuple[str, str]]],
    hex_strings: List[Union[str, Tuple[str, str]]],
    numeric_literals: Dict[str, str],
    var_single_tokens: Set[str],
    string_escapes_allowed_in_raw_strings: bool,
    heredoc_tag_is_identifier: bool,
    heredoc_string_alternative: TokenType,
    keyword_trie: Dict,
    numbers_can_be_underscore_separated: bool,
    identifiers_can_start_with_digit: bool,
    unescaped_sequences: Dict[str, str],
)
def reset(self) -> None:
def tokenize(self, sql: str) -> t.List[Token]:
Returns a list of tokens corresponding to the SQL string sql.
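
Continuing the sketch above: because tokenize() begins with reset(), the same instance can be reused, and any scanning failure surfaces as a TokenError carrying up to 50 characters of context on either side of the failure point:

from sqlglot.errors import TokenError

tokenizer.tokenize("SELECT 1")  # state is reset on every call

try:
    tokenizer.tokenize("SELECT 'unterminated")  # no closing quote
except TokenError as e:
    print(e)  # "Error tokenizing '...'" with the offending context inlined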