sqlglot.tokenizer_core
from __future__ import annotations

import typing as t
from enum import IntEnum, auto

from sqlglot.errors import TokenError

# dict lookup is faster than .upper() and .isdigit()
_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")


class TokenType(IntEnum):
    """Integer enumeration of every token kind the tokenizer can emit.

    Values are assigned with `auto()`, so they depend on declaration order:
    inserting or reordering members changes the numeric value of every member
    that follows.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    AT_QMARK = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNDROP = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the qualified member name, e.g. ``TokenType.SELECT``."""
        return f"TokenType.{self.name}"


class Token:
    """A single lexical token together with its position in the source text.

    A token whose type is `TokenType.SENTINEL` is falsy; every other token is
    truthy (see `__bool__`).
    """

    # mypyc doesn't expose slots
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        """Create a token.

        Args:
            token_type: The kind of token.
            text: The token's text.
            line: Line number recorded for the token.
            col: Column recorded for the token.
            start: Index into the source string where the token starts.
            end: Index into the source string of the token's last character
                (the tokenizer passes ``current - 1``, i.e. an inclusive end).
            comments: Comments attached to the token. The list is stored as
                given (not copied); a fresh empty list is used when omitted.
        """
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = [] if comments is None else comments

    def __bool__(self) -> bool:
        """Return False only for the SENTINEL token."""
        return self.token_type != TokenType.SENTINEL

    def __repr__(self) -> str:
        """Return ``<Token attr: value, ...>`` listing every slot in `_attrs` order."""
        attributes = ", ".join(
            f"{k}: TokenType.{self.token_type.name}"
            if k == "token_type"
            else f"{k}: {getattr(self, k)}"
            for k in self._attrs
        )
        return f"<Token {attributes}>"


class TokenizerCore:
    """Scanner that turns a SQL string into a list of `Token` objects.

    All configuration is injected through `__init__`; the scan state (position,
    line/column counters, pending comments, produced tokens) is reset at the
    start of every `tokenize` call.
    """

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )

    def __init__(
        self,
        single_tokens: dict[str, TokenType],
        keywords: dict[str, TokenType],
        quotes: dict[str, str],
        format_strings: dict[str, tuple[str, TokenType]],
        identifiers: dict[str, str],
        comments: dict[str, str | None],
        string_escapes: set[str],
        byte_string_escapes: set[str],
        identifier_escapes: set[str],
        escape_follow_chars: set[str],
        commands: set[TokenType],
        command_prefix_tokens: set[TokenType],
        nested_comments: bool,
        hint_start: str,
        tokens_preceding_hint: set[TokenType],
        has_bit_strings: bool,
        has_hex_strings: bool,
        numeric_literals: dict[str, str],
        var_single_tokens: set[str],
        string_escapes_allowed_in_raw_strings: bool,
        heredoc_tag_is_identifier: bool,
        heredoc_string_alternative: TokenType,
        keyword_trie: dict,
        numbers_can_be_underscore_separated: bool,
        numbers_can_have_decimals: bool,
        identifiers_can_start_with_digit: bool,
        unescaped_sequences: dict[str, str],
    ) -> None:
        """Store the tokenizer configuration and initialise the scan state.

        Every argument is stored unchanged on the attribute of the same name.
        Most of them are consumed by scanning helpers defined further down the
        class; the notes below cover only what the entry points near the top
        of the class rely on:

        Args:
            single_tokens: Looked up by character in `_scan_keywords`.
            identifiers: Keyed by the character that opens a quoted
                identifier; `_scan` passes the mapped value to
                `_scan_identifier`.
            commands: Token types after which `_add` may capture the rest of
                the statement as a single STRING token.
            command_prefix_tokens: Token types that, when immediately
                preceding a command token, enable that capture.
            keyword_trie: Nested dict walked character by character in
                `_scan_keywords`.
        """
        self.single_tokens = single_tokens
        self.keywords = keywords
        self.quotes = quotes
        self.format_strings = format_strings
        self.identifiers = identifiers
        self.comments = comments
        self.string_escapes = string_escapes
        self.byte_string_escapes = byte_string_escapes
        self.identifier_escapes = identifier_escapes
        self.escape_follow_chars = escape_follow_chars
        self.commands = commands
        self.command_prefix_tokens = command_prefix_tokens
        self.nested_comments = nested_comments
        self.hint_start = hint_start
        self.tokens_preceding_hint = tokens_preceding_hint
        self.has_bit_strings = has_bit_strings
        self.has_hex_strings = has_hex_strings
        self.numeric_literals = numeric_literals
        self.var_single_tokens = var_single_tokens
        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
        self.heredoc_string_alternative = heredoc_string_alternative
        self.keyword_trie = keyword_trie
        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
        self.numbers_can_have_decimals = numbers_can_have_decimals
        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
        self.unescaped_sequences = unescaped_sequences
        # Scan state; kept in sync with reset(). The annotated assignments
        # here are the only place these attributes' types are declared.
        self.sql = ""
        self.size = 0
        self.tokens: list[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: list[str] = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def reset(self) -> None:
        """Restore the scan state to its initial values (configuration is untouched)."""
        self.sql = ""
        self.size = 0
        self.tokens = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`.

        Raises:
            TokenError: If any exception is raised while scanning. The message
                quotes the input around the current scan position (up to 50
                characters on each side) and the original exception is chained
                as the cause.
        """
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            # Slice ends are exclusive, so clamp to `size` (not `size - 1`);
            # otherwise the last character of the input is always cut off.
            end = min(self._current + 50, self.size)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, check_semicolon: bool = False) -> None:
        """Consume the input from the current position, appending to `self.tokens`.

        Args:
            check_semicolon: When True, stop as soon as the next character is
                ``;`` (used by `_add` to capture the remainder of a command).
        """
        identifiers = self.identifiers
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # Always advance by at least one character so the loop makes progress.
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if not self._char.isspace():
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Comments left over after the last token are attached to that token.
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        """Return `size` characters starting at the current character.

        Returns an empty string when fewer than `size` characters remain
        (for ``size > 1``).
        """
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Move the cursor forward by `i` characters and refresh `_char`/`_peek`.

        Line and column counters are updated from the character being left
        behind.

        Args:
            i: Number of characters to advance.
            alnum: When True and the new current character is alphanumeric,
                keep advancing through the following run of alphanumeric
                characters in a single call.
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]

    @property
    def _text(self) -> str:
        """The source text from the start of the current token to the cursor."""
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Append a token of `token_type` covering the text scanned since `_start`.

        Args:
            token_type: Type of the token to create.
            text: Text for the token; defaults to the source slice from
                `_start` to the cursor.
        """
        self._prev_token_line = self._line

        # Comments seen before a semicolon belong to the preceding token,
        # not to the semicolon itself.
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        # Rebind rather than clear: the new token now owns the previous list.
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            # Discard tokens produced by the nested scan; only the raw text is kept.
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
839 skip = False 840 else: 841 skip = True 842 else: 843 char = "" 844 break 845 846 if word: 847 if self._scan_string(word): 848 return 849 if self._scan_comment(word): 850 return 851 if prev_space or single_token or not char: 852 self._advance(size - 1) 853 word = word.upper() 854 self._add(self.keywords[word], text=word) 855 return 856 857 if self._char in single_tokens: 858 self._add(single_tokens[self._char], text=self._char) 859 return 860 861 self._scan_var() 862 863 def _scan_comment(self, comment_start: str) -> bool: 864 if comment_start not in self.comments: 865 return False 866 867 comment_start_line = self._line 868 comment_start_size = len(comment_start) 869 comment_end = self.comments[comment_start] 870 871 if comment_end: 872 # Skip the comment's start delimiter 873 self._advance(comment_start_size) 874 875 comment_count = 1 876 comment_end_size = len(comment_end) 877 nested_comments = self.nested_comments 878 879 while not self._end: 880 if self._chars(comment_end_size) == comment_end: 881 comment_count -= 1 882 if not comment_count: 883 break 884 885 self._advance(alnum=True) 886 887 # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres 888 if ( 889 nested_comments 890 and not self._end 891 and self._chars(comment_end_size) == comment_start 892 ): 893 self._advance(comment_start_size) 894 comment_count += 1 895 896 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 897 self._advance(comment_end_size - 1) 898 else: 899 _peek = self._peek 900 while not self._end and _peek != "\n" and _peek != "\r": 901 self._advance(alnum=True) 902 _peek = self._peek 903 self._comments.append(self._text[comment_start_size:]) 904 905 if ( 906 comment_start == self.hint_start 907 and self.tokens 908 and self.tokens[-1].token_type in self.tokens_preceding_hint 909 ): 910 self._add(TokenType.HINT) 911 912 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 
913 # Multiple consecutive comments are preserved by appending them to the current comments list. 914 if comment_start_line == self._prev_token_line: 915 self.tokens[-1].comments.extend(self._comments) 916 self._comments = [] 917 self._prev_token_line = self._line 918 919 return True 920 921 def _scan_number(self) -> None: 922 if self._char == "0": 923 peek = _CHAR_UPPER.get(self._peek, self._peek) 924 if peek == "B": 925 return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER) 926 elif peek == "X": 927 return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER) 928 929 decimal = False 930 scientific = 0 931 numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated 932 single_tokens = self.single_tokens 933 keywords = self.keywords 934 numeric_literals = self.numeric_literals 935 identifiers_can_start_with_digit = self.identifiers_can_start_with_digit 936 937 is_underscore_separated: bool = False 938 number_text: str = "" 939 numeric_literal: str = "" 940 numeric_type: TokenType | None = None 941 942 while True: 943 if self._peek in _DIGIT_CHARS: 944 # Batch consecutive digits: scan ahead to find how many 945 sql = self.sql 946 end = self._current + 1 947 size = self.size 948 while end < size and sql[end] in _DIGIT_CHARS: 949 end += 1 950 self._advance(end - self._current) 951 elif self._peek == "." 
and not decimal: 952 if ( 953 self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 954 ) or not self.numbers_can_have_decimals: 955 break 956 decimal = True 957 self._advance() 958 elif self._peek in ("-", "+") and scientific == 1: 959 # Only consume +/- if followed by a digit 960 if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS: 961 scientific += 1 962 self._advance() 963 else: 964 break 965 elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific: 966 scientific += 1 967 self._advance() 968 elif self._peek == "_" and numbers_can_be_underscore_separated: 969 is_underscore_separated = True 970 self._advance() 971 elif self._peek.isidentifier(): 972 number_text = self._text 973 974 while self._peek and not self._peek.isspace() and self._peek not in single_tokens: 975 numeric_literal += self._peek 976 self._advance() 977 978 numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), "")) 979 980 if numeric_type: 981 break 982 elif identifiers_can_start_with_digit: 983 return self._add(TokenType.VAR) 984 985 self._advance(-len(numeric_literal)) 986 break 987 else: 988 break 989 990 number_text = number_text or self.sql[self._start : self._current] 991 992 # Normalize inputs such as 100_000 to 100000 993 if is_underscore_separated: 994 number_text = number_text.replace("_", "") 995 996 self._add(TokenType.NUMBER, number_text) 997 998 # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts 999 if numeric_type: 1000 self._add(TokenType.DCOLON, "::") 1001 self._add(numeric_type, numeric_literal) 1002 1003 def _scan_bits(self) -> None: 1004 self._advance() 1005 value = self._extract_value() 1006 try: 1007 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 1008 int(value, 2) 1009 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1010 except ValueError: 1011 self._add(TokenType.IDENTIFIER) 1012 1013 def _scan_hex(self) -> None: 1014 
self._advance() 1015 value = self._extract_value() 1016 try: 1017 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1018 int(value, 16) 1019 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1020 except ValueError: 1021 self._add(TokenType.IDENTIFIER) 1022 1023 def _extract_value(self) -> str: 1024 single_tokens = self.single_tokens 1025 1026 while True: 1027 char = self._peek.strip() 1028 if char and char not in single_tokens: 1029 self._advance(alnum=True) 1030 else: 1031 break 1032 1033 return self._text 1034 1035 def _scan_string(self, start: str) -> bool: 1036 base = None 1037 token_type = TokenType.STRING 1038 1039 if start in self.quotes: 1040 end = self.quotes[start] 1041 elif start in self.format_strings: 1042 end, token_type = self.format_strings[start] 1043 1044 if token_type == TokenType.HEX_STRING: 1045 base = 16 1046 elif token_type == TokenType.BIT_STRING: 1047 base = 2 1048 elif token_type == TokenType.HEREDOC_STRING: 1049 self._advance() 1050 1051 if self._char == end: 1052 tag = "" 1053 else: 1054 tag = self._extract_string( 1055 end, 1056 raw_string=True, 1057 raise_unmatched=not self.heredoc_tag_is_identifier, 1058 ) 1059 1060 if ( 1061 tag 1062 and self.heredoc_tag_is_identifier 1063 and (self._end or tag.isdigit() or any(c.isspace() for c in tag)) 1064 ): 1065 if not self._end: 1066 self._advance(-1) 1067 1068 self._advance(-len(tag)) 1069 self._add(self.heredoc_string_alternative) 1070 return True 1071 1072 end = f"{start}{tag}{end}" 1073 else: 1074 return False 1075 1076 self._advance(len(start)) 1077 text = self._extract_string( 1078 end, 1079 escapes=( 1080 self.byte_string_escapes 1081 if token_type == TokenType.BYTE_STRING 1082 else self.string_escapes 1083 ), 1084 raw_string=token_type == TokenType.RAW_STRING, 1085 ) 1086 1087 if base and text: 1088 try: 1089 int(text, base) 1090 except Exception: 1091 raise TokenError( 1092 f"Numeric string contains invalid characters from 
{self._line}:{self._start}" 1093 ) 1094 1095 self._add(token_type, text) 1096 return True 1097 1098 def _scan_identifier(self, identifier_end: str) -> None: 1099 self._advance() 1100 text = self._extract_string( 1101 identifier_end, escapes=self.identifier_escapes | {identifier_end} 1102 ) 1103 self._add(TokenType.IDENTIFIER, text) 1104 1105 def _scan_var(self) -> None: 1106 var_single_tokens = self.var_single_tokens 1107 single_tokens = self.single_tokens 1108 1109 while True: 1110 peek = self._peek 1111 if not peek or peek.isspace(): 1112 break 1113 if peek not in var_single_tokens and peek in single_tokens: 1114 break 1115 self._advance(alnum=True) 1116 1117 self._add( 1118 TokenType.VAR 1119 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1120 else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR) 1121 ) 1122 1123 def _extract_string( 1124 self, 1125 delimiter: str, 1126 escapes: set[str] | None = None, 1127 raw_string: bool = False, 1128 raise_unmatched: bool = True, 1129 ) -> str: 1130 text = "" 1131 delim_size = len(delimiter) 1132 escapes = self.string_escapes if escapes is None else escapes 1133 unescaped_sequences = self.unescaped_sequences 1134 escape_follow_chars = self.escape_follow_chars 1135 string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings 1136 quotes = self.quotes 1137 sql = self.sql 1138 1139 # use str.find() when the string is simple... no \ or other escapes 1140 if delim_size == 1: 1141 pos = self._current - 1 1142 end = sql.find(delimiter, pos) 1143 1144 if ( 1145 # the closing delimiter was found 1146 end != -1 1147 # there's no doubled delimiter (e.g. 
'' escape), or the delimiter isn't an escape char 1148 and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes) 1149 # no backslash in the string that would need escape processing 1150 and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1) 1151 ): 1152 newlines = sql.count("\n", pos, end) 1153 if newlines: 1154 self._line += newlines 1155 self._col = end - sql.rfind("\n", pos, end) 1156 else: 1157 self._col += end - pos 1158 1159 self._current = end + 1 1160 self._end = self._current >= self.size 1161 self._char = sql[end] 1162 self._peek = "" if self._end else sql[self._current] 1163 return sql[pos:end] 1164 1165 while True: 1166 if not raw_string and unescaped_sequences and self._peek and self._char in escapes: 1167 unescaped_sequence = unescaped_sequences.get(self._char + self._peek) 1168 if unescaped_sequence: 1169 self._advance(2) 1170 text += unescaped_sequence 1171 continue 1172 1173 is_valid_custom_escape = ( 1174 escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars 1175 ) 1176 1177 if ( 1178 (string_escapes_allowed_in_raw_strings or not raw_string) 1179 and self._char in escapes 1180 and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape) 1181 and (self._char not in quotes or self._char == self._peek) 1182 ): 1183 if self._peek == delimiter: 1184 text += self._peek 1185 elif is_valid_custom_escape and self._char != self._peek: 1186 text += self._peek 1187 else: 1188 text += self._char + self._peek 1189 1190 if self._current + 1 < self.size: 1191 self._advance(2) 1192 else: 1193 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1194 else: 1195 if self._chars(delim_size) == delimiter: 1196 if delim_size > 1: 1197 self._advance(delim_size - 1) 1198 break 1199 1200 if self._end: 1201 if not raise_unmatched: 1202 return text + self._char 1203 1204 raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}") 1205 
1206 current = self._current - 1 1207 self._advance(alnum=True) 1208 text += sql[current : self._current - 1] 1209 1210 return text
class
TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """
    Integer-valued enumeration of the token types referenced by the tokenizer.

    Values are assigned with `auto()` in declaration order, so inserting, removing or
    reordering members changes the numeric value of every member that follows.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    AT_QMARK = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNDROP = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Returns the member formatted as `TokenType.<NAME>`."""
        # Pinned explicitly: str() of an IntEnum member is version-dependent (since
        # Python 3.11 the default returns the integer value)
        return f"TokenType.{self.name}"
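An enumeration of the token types used by the tokenizer. Members are integers numbered from 1 in declaration order.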
L_PAREN =
<TokenType.L_PAREN: 1>
R_PAREN =
<TokenType.R_PAREN: 2>
L_BRACKET =
<TokenType.L_BRACKET: 3>
R_BRACKET =
<TokenType.R_BRACKET: 4>
L_BRACE =
<TokenType.L_BRACE: 5>
R_BRACE =
<TokenType.R_BRACE: 6>
COMMA =
<TokenType.COMMA: 7>
DOT =
<TokenType.DOT: 8>
DASH =
<TokenType.DASH: 9>
PLUS =
<TokenType.PLUS: 10>
COLON =
<TokenType.COLON: 11>
DOTCOLON =
<TokenType.DOTCOLON: 12>
DOTCARET =
<TokenType.DOTCARET: 13>
DCOLON =
<TokenType.DCOLON: 14>
DCOLONDOLLAR =
<TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT =
<TokenType.DCOLONPERCENT: 16>
DCOLONQMARK =
<TokenType.DCOLONQMARK: 17>
DQMARK =
<TokenType.DQMARK: 18>
SEMICOLON =
<TokenType.SEMICOLON: 19>
STAR =
<TokenType.STAR: 20>
BACKSLASH =
<TokenType.BACKSLASH: 21>
SLASH =
<TokenType.SLASH: 22>
LT =
<TokenType.LT: 23>
LTE =
<TokenType.LTE: 24>
GT =
<TokenType.GT: 25>
GTE =
<TokenType.GTE: 26>
NOT =
<TokenType.NOT: 27>
EQ =
<TokenType.EQ: 28>
NEQ =
<TokenType.NEQ: 29>
NULLSAFE_EQ =
<TokenType.NULLSAFE_EQ: 30>
COLON_EQ =
<TokenType.COLON_EQ: 31>
COLON_GT =
<TokenType.COLON_GT: 32>
NCOLON_GT =
<TokenType.NCOLON_GT: 33>
AND =
<TokenType.AND: 34>
OR =
<TokenType.OR: 35>
AMP =
<TokenType.AMP: 36>
DPIPE =
<TokenType.DPIPE: 37>
PIPE_GT =
<TokenType.PIPE_GT: 38>
PIPE =
<TokenType.PIPE: 39>
PIPE_SLASH =
<TokenType.PIPE_SLASH: 40>
DPIPE_SLASH =
<TokenType.DPIPE_SLASH: 41>
CARET =
<TokenType.CARET: 42>
CARET_AT =
<TokenType.CARET_AT: 43>
TILDE =
<TokenType.TILDE: 44>
ARROW =
<TokenType.ARROW: 45>
DARROW =
<TokenType.DARROW: 46>
FARROW =
<TokenType.FARROW: 47>
HASH =
<TokenType.HASH: 48>
HASH_ARROW =
<TokenType.HASH_ARROW: 49>
DHASH_ARROW =
<TokenType.DHASH_ARROW: 50>
LR_ARROW =
<TokenType.LR_ARROW: 51>
LLRR_ARROW =
<TokenType.LLRR_ARROW: 52>
DAT =
<TokenType.DAT: 53>
AT_QMARK =
<TokenType.AT_QMARK: 54>
LT_AT =
<TokenType.LT_AT: 55>
AT_GT =
<TokenType.AT_GT: 56>
DOLLAR =
<TokenType.DOLLAR: 57>
PARAMETER =
<TokenType.PARAMETER: 58>
SESSION =
<TokenType.SESSION: 59>
SESSION_PARAMETER =
<TokenType.SESSION_PARAMETER: 60>
SESSION_USER =
<TokenType.SESSION_USER: 61>
DAMP =
<TokenType.DAMP: 62>
AMP_LT =
<TokenType.AMP_LT: 63>
AMP_GT =
<TokenType.AMP_GT: 64>
ADJACENT =
<TokenType.ADJACENT: 65>
XOR =
<TokenType.XOR: 66>
DSTAR =
<TokenType.DSTAR: 67>
QMARK_AMP =
<TokenType.QMARK_AMP: 68>
QMARK_PIPE =
<TokenType.QMARK_PIPE: 69>
HASH_DASH =
<TokenType.HASH_DASH: 70>
EXCLAMATION =
<TokenType.EXCLAMATION: 71>
URI_START =
<TokenType.URI_START: 72>
BLOCK_START =
<TokenType.BLOCK_START: 73>
BLOCK_END =
<TokenType.BLOCK_END: 74>
SPACE =
<TokenType.SPACE: 75>
BREAK =
<TokenType.BREAK: 76>
STRING =
<TokenType.STRING: 77>
NUMBER =
<TokenType.NUMBER: 78>
IDENTIFIER =
<TokenType.IDENTIFIER: 79>
DATABASE =
<TokenType.DATABASE: 80>
COLUMN =
<TokenType.COLUMN: 81>
COLUMN_DEF =
<TokenType.COLUMN_DEF: 82>
SCHEMA =
<TokenType.SCHEMA: 83>
TABLE =
<TokenType.TABLE: 84>
WAREHOUSE =
<TokenType.WAREHOUSE: 85>
STAGE =
<TokenType.STAGE: 86>
STREAM =
<TokenType.STREAM: 87>
STREAMLIT =
<TokenType.STREAMLIT: 88>
VAR =
<TokenType.VAR: 89>
BIT_STRING =
<TokenType.BIT_STRING: 90>
HEX_STRING =
<TokenType.HEX_STRING: 91>
BYTE_STRING =
<TokenType.BYTE_STRING: 92>
NATIONAL_STRING =
<TokenType.NATIONAL_STRING: 93>
RAW_STRING =
<TokenType.RAW_STRING: 94>
HEREDOC_STRING =
<TokenType.HEREDOC_STRING: 95>
UNICODE_STRING =
<TokenType.UNICODE_STRING: 96>
BIT =
<TokenType.BIT: 97>
BOOLEAN =
<TokenType.BOOLEAN: 98>
TINYINT =
<TokenType.TINYINT: 99>
UTINYINT =
<TokenType.UTINYINT: 100>
SMALLINT =
<TokenType.SMALLINT: 101>
USMALLINT =
<TokenType.USMALLINT: 102>
MEDIUMINT =
<TokenType.MEDIUMINT: 103>
UMEDIUMINT =
<TokenType.UMEDIUMINT: 104>
INT =
<TokenType.INT: 105>
UINT =
<TokenType.UINT: 106>
BIGINT =
<TokenType.BIGINT: 107>
UBIGINT =
<TokenType.UBIGINT: 108>
BIGNUM =
<TokenType.BIGNUM: 109>
INT128 =
<TokenType.INT128: 110>
UINT128 =
<TokenType.UINT128: 111>
INT256 =
<TokenType.INT256: 112>
UINT256 =
<TokenType.UINT256: 113>
FLOAT =
<TokenType.FLOAT: 114>
DOUBLE =
<TokenType.DOUBLE: 115>
UDOUBLE =
<TokenType.UDOUBLE: 116>
DECIMAL =
<TokenType.DECIMAL: 117>
DECIMAL32 =
<TokenType.DECIMAL32: 118>
DECIMAL64 =
<TokenType.DECIMAL64: 119>
DECIMAL128 =
<TokenType.DECIMAL128: 120>
DECIMAL256 =
<TokenType.DECIMAL256: 121>
DECFLOAT =
<TokenType.DECFLOAT: 122>
UDECIMAL =
<TokenType.UDECIMAL: 123>
BIGDECIMAL =
<TokenType.BIGDECIMAL: 124>
CHAR =
<TokenType.CHAR: 125>
NCHAR =
<TokenType.NCHAR: 126>
VARCHAR =
<TokenType.VARCHAR: 127>
NVARCHAR =
<TokenType.NVARCHAR: 128>
BPCHAR =
<TokenType.BPCHAR: 129>
TEXT =
<TokenType.TEXT: 130>
MEDIUMTEXT =
<TokenType.MEDIUMTEXT: 131>
LONGTEXT =
<TokenType.LONGTEXT: 132>
BLOB =
<TokenType.BLOB: 133>
MEDIUMBLOB =
<TokenType.MEDIUMBLOB: 134>
LONGBLOB =
<TokenType.LONGBLOB: 135>
TINYBLOB =
<TokenType.TINYBLOB: 136>
TINYTEXT =
<TokenType.TINYTEXT: 137>
NAME =
<TokenType.NAME: 138>
BINARY =
<TokenType.BINARY: 139>
VARBINARY =
<TokenType.VARBINARY: 140>
JSON =
<TokenType.JSON: 141>
JSONB =
<TokenType.JSONB: 142>
TIME =
<TokenType.TIME: 143>
TIMETZ =
<TokenType.TIMETZ: 144>
TIME_NS =
<TokenType.TIME_NS: 145>
TIMESTAMP =
<TokenType.TIMESTAMP: 146>
TIMESTAMPTZ =
<TokenType.TIMESTAMPTZ: 147>
TIMESTAMPLTZ =
<TokenType.TIMESTAMPLTZ: 148>
TIMESTAMPNTZ =
<TokenType.TIMESTAMPNTZ: 149>
TIMESTAMP_S =
<TokenType.TIMESTAMP_S: 150>
TIMESTAMP_MS =
<TokenType.TIMESTAMP_MS: 151>
TIMESTAMP_NS =
<TokenType.TIMESTAMP_NS: 152>
DATETIME =
<TokenType.DATETIME: 153>
DATETIME2 =
<TokenType.DATETIME2: 154>
DATETIME64 =
<TokenType.DATETIME64: 155>
SMALLDATETIME =
<TokenType.SMALLDATETIME: 156>
DATE =
<TokenType.DATE: 157>
DATE32 =
<TokenType.DATE32: 158>
INT4RANGE =
<TokenType.INT4RANGE: 159>
INT4MULTIRANGE =
<TokenType.INT4MULTIRANGE: 160>
INT8RANGE =
<TokenType.INT8RANGE: 161>
INT8MULTIRANGE =
<TokenType.INT8MULTIRANGE: 162>
NUMRANGE =
<TokenType.NUMRANGE: 163>
NUMMULTIRANGE =
<TokenType.NUMMULTIRANGE: 164>
TSRANGE =
<TokenType.TSRANGE: 165>
TSMULTIRANGE =
<TokenType.TSMULTIRANGE: 166>
TSTZRANGE =
<TokenType.TSTZRANGE: 167>
TSTZMULTIRANGE =
<TokenType.TSTZMULTIRANGE: 168>
DATERANGE =
<TokenType.DATERANGE: 169>
DATEMULTIRANGE =
<TokenType.DATEMULTIRANGE: 170>
UUID =
<TokenType.UUID: 171>
GEOGRAPHY =
<TokenType.GEOGRAPHY: 172>
GEOGRAPHYPOINT =
<TokenType.GEOGRAPHYPOINT: 173>
NULLABLE =
<TokenType.NULLABLE: 174>
GEOMETRY =
<TokenType.GEOMETRY: 175>
POINT =
<TokenType.POINT: 176>
RING =
<TokenType.RING: 177>
LINESTRING =
<TokenType.LINESTRING: 178>
LOCALTIME =
<TokenType.LOCALTIME: 179>
LOCALTIMESTAMP =
<TokenType.LOCALTIMESTAMP: 180>
SYSTIMESTAMP =
<TokenType.SYSTIMESTAMP: 181>
MULTILINESTRING =
<TokenType.MULTILINESTRING: 182>
POLYGON =
<TokenType.POLYGON: 183>
MULTIPOLYGON =
<TokenType.MULTIPOLYGON: 184>
HLLSKETCH =
<TokenType.HLLSKETCH: 185>
HSTORE =
<TokenType.HSTORE: 186>
SUPER =
<TokenType.SUPER: 187>
SERIAL =
<TokenType.SERIAL: 188>
SMALLSERIAL =
<TokenType.SMALLSERIAL: 189>
BIGSERIAL =
<TokenType.BIGSERIAL: 190>
XML =
<TokenType.XML: 191>
YEAR =
<TokenType.YEAR: 192>
USERDEFINED =
<TokenType.USERDEFINED: 193>
MONEY =
<TokenType.MONEY: 194>
SMALLMONEY =
<TokenType.SMALLMONEY: 195>
ROWVERSION =
<TokenType.ROWVERSION: 196>
IMAGE =
<TokenType.IMAGE: 197>
VARIANT =
<TokenType.VARIANT: 198>
OBJECT =
<TokenType.OBJECT: 199>
INET =
<TokenType.INET: 200>
IPADDRESS =
<TokenType.IPADDRESS: 201>
IPPREFIX =
<TokenType.IPPREFIX: 202>
IPV4 =
<TokenType.IPV4: 203>
IPV6 =
<TokenType.IPV6: 204>
ENUM =
<TokenType.ENUM: 205>
ENUM8 =
<TokenType.ENUM8: 206>
ENUM16 =
<TokenType.ENUM16: 207>
FIXEDSTRING =
<TokenType.FIXEDSTRING: 208>
LOWCARDINALITY =
<TokenType.LOWCARDINALITY: 209>
NESTED =
<TokenType.NESTED: 210>
AGGREGATEFUNCTION =
<TokenType.AGGREGATEFUNCTION: 211>
SIMPLEAGGREGATEFUNCTION =
<TokenType.SIMPLEAGGREGATEFUNCTION: 212>
TDIGEST =
<TokenType.TDIGEST: 213>
UNKNOWN =
<TokenType.UNKNOWN: 214>
VECTOR =
<TokenType.VECTOR: 215>
DYNAMIC =
<TokenType.DYNAMIC: 216>
VOID =
<TokenType.VOID: 217>
ALIAS =
<TokenType.ALIAS: 218>
ALTER =
<TokenType.ALTER: 219>
ALL =
<TokenType.ALL: 220>
ANTI =
<TokenType.ANTI: 221>
ANY =
<TokenType.ANY: 222>
APPLY =
<TokenType.APPLY: 223>
ARRAY =
<TokenType.ARRAY: 224>
ASC =
<TokenType.ASC: 225>
ASOF =
<TokenType.ASOF: 226>
ATTACH =
<TokenType.ATTACH: 227>
AUTO_INCREMENT =
<TokenType.AUTO_INCREMENT: 228>
BEGIN =
<TokenType.BEGIN: 229>
BETWEEN =
<TokenType.BETWEEN: 230>
BULK_COLLECT_INTO =
<TokenType.BULK_COLLECT_INTO: 231>
CACHE =
<TokenType.CACHE: 232>
CASE =
<TokenType.CASE: 233>
CHARACTER_SET =
<TokenType.CHARACTER_SET: 234>
CLUSTER_BY =
<TokenType.CLUSTER_BY: 235>
COLLATE =
<TokenType.COLLATE: 236>
COMMAND =
<TokenType.COMMAND: 237>
COMMENT =
<TokenType.COMMENT: 238>
COMMIT =
<TokenType.COMMIT: 239>
CONNECT_BY =
<TokenType.CONNECT_BY: 240>
CONSTRAINT =
<TokenType.CONSTRAINT: 241>
COPY =
<TokenType.COPY: 242>
CREATE =
<TokenType.CREATE: 243>
CROSS =
<TokenType.CROSS: 244>
CUBE =
<TokenType.CUBE: 245>
CURRENT_DATE =
<TokenType.CURRENT_DATE: 246>
CURRENT_DATETIME =
<TokenType.CURRENT_DATETIME: 247>
CURRENT_SCHEMA =
<TokenType.CURRENT_SCHEMA: 248>
CURRENT_TIME =
<TokenType.CURRENT_TIME: 249>
CURRENT_TIMESTAMP =
<TokenType.CURRENT_TIMESTAMP: 250>
CURRENT_USER =
<TokenType.CURRENT_USER: 251>
CURRENT_USER_ID =
<TokenType.CURRENT_USER_ID: 252>
CURRENT_ROLE =
<TokenType.CURRENT_ROLE: 253>
CURRENT_CATALOG =
<TokenType.CURRENT_CATALOG: 254>
DECLARE =
<TokenType.DECLARE: 255>
DEFAULT =
<TokenType.DEFAULT: 256>
DELETE =
<TokenType.DELETE: 257>
DESC =
<TokenType.DESC: 258>
DESCRIBE =
<TokenType.DESCRIBE: 259>
DETACH =
<TokenType.DETACH: 260>
DICTIONARY =
<TokenType.DICTIONARY: 261>
DISTINCT =
<TokenType.DISTINCT: 262>
DISTRIBUTE_BY =
<TokenType.DISTRIBUTE_BY: 263>
DIV =
<TokenType.DIV: 264>
DROP =
<TokenType.DROP: 265>
ELSE =
<TokenType.ELSE: 266>
END =
<TokenType.END: 267>
ESCAPE =
<TokenType.ESCAPE: 268>
EXCEPT =
<TokenType.EXCEPT: 269>
EXECUTE =
<TokenType.EXECUTE: 270>
EXISTS =
<TokenType.EXISTS: 271>
FALSE =
<TokenType.FALSE: 272>
FETCH =
<TokenType.FETCH: 273>
FILE =
<TokenType.FILE: 274>
FILE_FORMAT =
<TokenType.FILE_FORMAT: 275>
FILTER =
<TokenType.FILTER: 276>
FINAL =
<TokenType.FINAL: 277>
FIRST =
<TokenType.FIRST: 278>
FOR =
<TokenType.FOR: 279>
FORCE =
<TokenType.FORCE: 280>
FOREIGN_KEY =
<TokenType.FOREIGN_KEY: 281>
FORMAT =
<TokenType.FORMAT: 282>
FROM =
<TokenType.FROM: 283>
FULL =
<TokenType.FULL: 284>
FUNCTION =
<TokenType.FUNCTION: 285>
GET =
<TokenType.GET: 286>
GLOB =
<TokenType.GLOB: 287>
GLOBAL =
<TokenType.GLOBAL: 288>
GRANT =
<TokenType.GRANT: 289>
GROUP_BY =
<TokenType.GROUP_BY: 290>
GROUPING_SETS =
<TokenType.GROUPING_SETS: 291>
HAVING =
<TokenType.HAVING: 292>
HINT =
<TokenType.HINT: 293>
IGNORE =
<TokenType.IGNORE: 294>
ILIKE =
<TokenType.ILIKE: 295>
IN =
<TokenType.IN: 296>
INDEX =
<TokenType.INDEX: 297>
INDEXED_BY =
<TokenType.INDEXED_BY: 298>
INNER =
<TokenType.INNER: 299>
INSERT =
<TokenType.INSERT: 300>
INSTALL =
<TokenType.INSTALL: 301>
INTEGRATION =
<TokenType.INTEGRATION: 302>
INTERSECT =
<TokenType.INTERSECT: 303>
INTERVAL =
<TokenType.INTERVAL: 304>
INTO =
<TokenType.INTO: 305>
INTRODUCER =
<TokenType.INTRODUCER: 306>
IRLIKE =
<TokenType.IRLIKE: 307>
IS =
<TokenType.IS: 308>
ISNULL =
<TokenType.ISNULL: 309>
JOIN =
<TokenType.JOIN: 310>
JOIN_MARKER =
<TokenType.JOIN_MARKER: 311>
KEEP =
<TokenType.KEEP: 312>
KEY =
<TokenType.KEY: 313>
KILL =
<TokenType.KILL: 314>
LANGUAGE =
<TokenType.LANGUAGE: 315>
LATERAL =
<TokenType.LATERAL: 316>
LEFT =
<TokenType.LEFT: 317>
LIKE =
<TokenType.LIKE: 318>
LIMIT =
<TokenType.LIMIT: 319>
LIST =
<TokenType.LIST: 320>
LOAD =
<TokenType.LOAD: 321>
LOCK =
<TokenType.LOCK: 322>
MAP =
<TokenType.MAP: 323>
MATCH =
<TokenType.MATCH: 324>
MATCH_CONDITION =
<TokenType.MATCH_CONDITION: 325>
MATCH_RECOGNIZE =
<TokenType.MATCH_RECOGNIZE: 326>
MEMBER_OF =
<TokenType.MEMBER_OF: 327>
MERGE =
<TokenType.MERGE: 328>
MOD =
<TokenType.MOD: 329>
MODEL =
<TokenType.MODEL: 330>
NATURAL =
<TokenType.NATURAL: 331>
NEXT =
<TokenType.NEXT: 332>
NOTHING =
<TokenType.NOTHING: 333>
NOTNULL =
<TokenType.NOTNULL: 334>
NULL =
<TokenType.NULL: 335>
OBJECT_IDENTIFIER =
<TokenType.OBJECT_IDENTIFIER: 336>
OFFSET =
<TokenType.OFFSET: 337>
ON =
<TokenType.ON: 338>
ONLY =
<TokenType.ONLY: 339>
OPERATOR =
<TokenType.OPERATOR: 340>
ORDER_BY =
<TokenType.ORDER_BY: 341>
ORDER_SIBLINGS_BY =
<TokenType.ORDER_SIBLINGS_BY: 342>
ORDERED =
<TokenType.ORDERED: 343>
ORDINALITY =
<TokenType.ORDINALITY: 344>
OUT =
<TokenType.OUT: 345>
INOUT =
<TokenType.INOUT: 346>
OUTER =
<TokenType.OUTER: 347>
OVER =
<TokenType.OVER: 348>
OVERLAPS =
<TokenType.OVERLAPS: 349>
OVERWRITE =
<TokenType.OVERWRITE: 350>
PACKAGE =
<TokenType.PACKAGE: 351>
PARTITION =
<TokenType.PARTITION: 352>
PARTITION_BY =
<TokenType.PARTITION_BY: 353>
PERCENT =
<TokenType.PERCENT: 354>
PIVOT =
<TokenType.PIVOT: 355>
PLACEHOLDER =
<TokenType.PLACEHOLDER: 356>
POLICY =
<TokenType.POLICY: 357>
POOL =
<TokenType.POOL: 358>
POSITIONAL =
<TokenType.POSITIONAL: 359>
PRAGMA =
<TokenType.PRAGMA: 360>
PREWHERE =
<TokenType.PREWHERE: 361>
PRIMARY_KEY =
<TokenType.PRIMARY_KEY: 362>
PROCEDURE =
<TokenType.PROCEDURE: 363>
PROPERTIES =
<TokenType.PROPERTIES: 364>
PSEUDO_TYPE =
<TokenType.PSEUDO_TYPE: 365>
PUT =
<TokenType.PUT: 366>
QUALIFY =
<TokenType.QUALIFY: 367>
QUOTE =
<TokenType.QUOTE: 368>
QDCOLON =
<TokenType.QDCOLON: 369>
RANGE =
<TokenType.RANGE: 370>
RECURSIVE =
<TokenType.RECURSIVE: 371>
REFRESH =
<TokenType.REFRESH: 372>
RENAME =
<TokenType.RENAME: 373>
REPLACE =
<TokenType.REPLACE: 374>
RETURNING =
<TokenType.RETURNING: 375>
REVOKE =
<TokenType.REVOKE: 376>
REFERENCES =
<TokenType.REFERENCES: 377>
RIGHT =
<TokenType.RIGHT: 378>
RLIKE =
<TokenType.RLIKE: 379>
ROLE =
<TokenType.ROLE: 380>
ROLLBACK =
<TokenType.ROLLBACK: 381>
ROLLUP =
<TokenType.ROLLUP: 382>
ROW =
<TokenType.ROW: 383>
ROWS =
<TokenType.ROWS: 384>
RULE =
<TokenType.RULE: 385>
SELECT =
<TokenType.SELECT: 386>
SEMI =
<TokenType.SEMI: 387>
SEPARATOR =
<TokenType.SEPARATOR: 388>
SEQUENCE =
<TokenType.SEQUENCE: 389>
SERDE_PROPERTIES =
<TokenType.SERDE_PROPERTIES: 390>
SET =
<TokenType.SET: 391>
SETTINGS =
<TokenType.SETTINGS: 392>
SHOW =
<TokenType.SHOW: 393>
SIMILAR_TO =
<TokenType.SIMILAR_TO: 394>
SOME =
<TokenType.SOME: 395>
SORT_BY =
<TokenType.SORT_BY: 396>
SOUNDS_LIKE =
<TokenType.SOUNDS_LIKE: 397>
SQL_SECURITY =
<TokenType.SQL_SECURITY: 398>
START_WITH =
<TokenType.START_WITH: 399>
STORAGE_INTEGRATION =
<TokenType.STORAGE_INTEGRATION: 400>
STRAIGHT_JOIN =
<TokenType.STRAIGHT_JOIN: 401>
STRUCT =
<TokenType.STRUCT: 402>
SUMMARIZE =
<TokenType.SUMMARIZE: 403>
TABLE_SAMPLE =
<TokenType.TABLE_SAMPLE: 404>
TAG =
<TokenType.TAG: 405>
TEMPORARY =
<TokenType.TEMPORARY: 406>
TOP =
<TokenType.TOP: 407>
THEN =
<TokenType.THEN: 408>
TRUE =
<TokenType.TRUE: 409>
TRUNCATE =
<TokenType.TRUNCATE: 410>
TRIGGER =
<TokenType.TRIGGER: 411>
TYPE =
<TokenType.TYPE: 412>
UNCACHE =
<TokenType.UNCACHE: 413>
UNDROP =
<TokenType.UNDROP: 414>
UNION =
<TokenType.UNION: 415>
UNNEST =
<TokenType.UNNEST: 416>
UNPIVOT =
<TokenType.UNPIVOT: 417>
UPDATE =
<TokenType.UPDATE: 418>
USE =
<TokenType.USE: 419>
USING =
<TokenType.USING: 420>
VALUES =
<TokenType.VALUES: 421>
VARIADIC =
<TokenType.VARIADIC: 422>
VIEW =
<TokenType.VIEW: 423>
SEMANTIC_VIEW =
<TokenType.SEMANTIC_VIEW: 424>
VOLATILE =
<TokenType.VOLATILE: 425>
VOLUME =
<TokenType.VOLUME: 426>
WHEN =
<TokenType.WHEN: 427>
WHERE =
<TokenType.WHERE: 428>
WINDOW =
<TokenType.WINDOW: 429>
WITH =
<TokenType.WITH: 430>
UNIQUE =
<TokenType.UNIQUE: 431>
UTC_DATE =
<TokenType.UTC_DATE: 432>
UTC_TIME =
<TokenType.UTC_TIME: 433>
UTC_TIMESTAMP =
<TokenType.UTC_TIMESTAMP: 434>
VERSION_SNAPSHOT =
<TokenType.VERSION_SNAPSHOT: 435>
TIMESTAMP_SNAPSHOT =
<TokenType.TIMESTAMP_SNAPSHOT: 436>
OPTION =
<TokenType.OPTION: 437>
SINK =
<TokenType.SINK: 438>
SOURCE =
<TokenType.SOURCE: 439>
ANALYZE =
<TokenType.ANALYZE: 440>
NAMESPACE =
<TokenType.NAMESPACE: 441>
EXPORT =
<TokenType.EXPORT: 442>
HIVE_TOKEN_STREAM =
<TokenType.HIVE_TOKEN_STREAM: 443>
SENTINEL =
<TokenType.SENTINEL: 444>
class
Token:
class Token:
    """A lexical token: its type, its text, and where it was found in the input.

    A token is falsy only when its type is `TokenType.SENTINEL`.
    """

    # mypyc doesn't expose slots
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        # Every token gets its own list unless the caller supplies one explicitly.
        if comments is None:
            comments = []

        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = comments

    @classmethod
    def number(cls, number: int) -> Token:
        """Creates a NUMBER token whose text is the string form of `number`."""
        text = str(number)
        return cls(TokenType.NUMBER, text)

    @classmethod
    def string(cls, string: str) -> Token:
        """Creates a STRING token whose text is `string`."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Creates an IDENTIFIER token whose text is `identifier`."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Creates a VAR token whose text is `var`."""
        return cls(TokenType.VAR, var)

    def __bool__(self) -> bool:
        is_sentinel = self.token_type == TokenType.SENTINEL
        return not is_sentinel

    def __repr__(self) -> str:
        # The type is rendered by member name; every other attribute by its str() form.
        parts = []
        for name in self._attrs:
            if name == "token_type":
                parts.append(f"{name}: TokenType.{self.token_type.name}")
            else:
                parts.append(f"{name}: {getattr(self, name)}")

        attributes = ", ".join(parts)
        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
507 def __init__( 508 self, 509 token_type: TokenType, 510 text: str, 511 line: int = 1, 512 col: int = 1, 513 start: int = 0, 514 end: int = 0, 515 comments: list[str] | None = None, 516 ) -> None: 517 self.token_type = token_type 518 self.text = text 519 self.line = line 520 self.col = col 521 self.start = start 522 self.end = end 523 self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
    """Creates a NUMBER token whose text is the string form of `number`."""
    text = str(number)
    return cls(TokenType.NUMBER, text)
Returns a NUMBER token with number as its text.
@classmethod
def string(cls, string: str) -> Token:
    """Creates a STRING token whose text is `string`."""
    token_type = TokenType.STRING
    return cls(token_type, string)
Returns a STRING token with string as its text.
@classmethod
def identifier(cls, identifier: str) -> Token:
    """Creates an IDENTIFIER token whose text is `identifier`."""
    token_type = TokenType.IDENTIFIER
    return cls(token_type, identifier)
Returns an IDENTIFIER token with identifier as its text.
class
TokenizerCore:
class TokenizerCore:
    """Stateful scanner that turns a SQL string into a list of `Token` objects.

    All configuration (token tables, quote/comment delimiters, feature flags) is passed to
    the constructor and stored on the instance. The per-run cursor state is cleared by
    `reset`, which `tokenize` calls before every scan.
    """

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )

    def __init__(
        self,
        single_tokens: dict[str, TokenType],
        keywords: dict[str, TokenType],
        quotes: dict[str, str],
        format_strings: dict[str, tuple[str, TokenType]],
        identifiers: dict[str, str],
        comments: dict[str, str | None],
        string_escapes: set[str],
        byte_string_escapes: set[str],
        identifier_escapes: set[str],
        escape_follow_chars: set[str],
        commands: set[TokenType],
        command_prefix_tokens: set[TokenType],
        nested_comments: bool,
        hint_start: str,
        tokens_preceding_hint: set[TokenType],
        has_bit_strings: bool,
        has_hex_strings: bool,
        numeric_literals: dict[str, str],
        var_single_tokens: set[str],
        string_escapes_allowed_in_raw_strings: bool,
        heredoc_tag_is_identifier: bool,
        heredoc_string_alternative: TokenType,
        keyword_trie: dict,
        numbers_can_be_underscore_separated: bool,
        numbers_can_have_decimals: bool,
        identifiers_can_start_with_digit: bool,
        unescaped_sequences: dict[str, str],
    ) -> None:
        """Stores every configuration argument unchanged and initializes the scan state."""
        self.single_tokens = single_tokens
        self.keywords = keywords
        self.quotes = quotes
        self.format_strings = format_strings
        self.identifiers = identifiers
        self.comments = comments
        self.string_escapes = string_escapes
        self.byte_string_escapes = byte_string_escapes
        self.identifier_escapes = identifier_escapes
        self.escape_follow_chars = escape_follow_chars
        self.commands = commands
        self.command_prefix_tokens = command_prefix_tokens
        self.nested_comments = nested_comments
        self.hint_start = hint_start
        self.tokens_preceding_hint = tokens_preceding_hint
        self.has_bit_strings = has_bit_strings
        self.has_hex_strings = has_hex_strings
        self.numeric_literals = numeric_literals
        self.var_single_tokens = var_single_tokens
        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
        self.heredoc_string_alternative = heredoc_string_alternative
        self.keyword_trie = keyword_trie
        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
        self.numbers_can_have_decimals = numbers_can_have_decimals
        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
        self.unescaped_sequences = unescaped_sequences
        # Scan state; `reset` assigns the same initial values.
        self.sql = ""
        self.size = 0
        self.tokens: list[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: list[str] = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def reset(self) -> None:
        """Clears the input text, the produced tokens and all cursor state."""
        self.sql = ""
        self.size = 0
        self.tokens = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            # Report a window of up to 50 characters on each side of the cursor.
            # NOTE(review): the upper bound is capped at size - 1 and the slice end is
            # exclusive, so the last character of `sql` never appears in the context;
            # confirm this is intended.
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, check_semicolon: bool = False) -> None:
        """Scans tokens until the input is exhausted.

        When `check_semicolon` is true, the loop also stops as soon as the next character
        is a semicolon.
        """
        identifiers = self.identifiers
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # Always move by at least one character so the loop makes progress.
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if not self._char.isspace():
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Comments left over after the last token are attached to that token.
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        """Returns `size` characters starting at the current one, or "" if too few remain."""
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Moves the cursor by `i` characters and refreshes `_char`, `_peek` and `_end`.

        Line and column counters are updated based on the character the cursor is leaving.
        When `alnum` is true and the new current character is alphanumeric, the cursor keeps
        moving forward while the next character is alphanumeric as well.
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]

    @property
    def _text(self) -> str:
        """The input slice from the current token's start up to the cursor."""
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Appends a token of `token_type` at the current position.

        `text` defaults to the input slice between `_start` and the cursor. Pending comments
        are moved onto the new token, except that for a semicolon they are attached to the
        previous token when there is one.
        """
        self._prev_token_line = self._line

        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        # The token now owns the previous list, so start a fresh one.
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            # Discard the tokens produced by the nested scan; only the raw text is kept.
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        """Matches the text at the cursor against the keyword trie.

        A match is tried, in order, as a string start, a comment start and a keyword. If
        none applies, the current character is emitted as a single token when it is one,
        and otherwise scanning falls back to `_scan_var`.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                # presumably the key 0 marks the end of a complete trie entry; verify
                # against the code that builds keyword_trie
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                # Consecutive whitespace characters are matched as a single space.
                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()

    def _scan_comment(self, comment_start: str) -> bool:
        """Consumes a comment that starts with `comment_start`.

        Returns False without consuming anything if `comment_start` is not a configured
        comment delimiter, and True once the comment has been consumed and recorded.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # NOTE(review): for a one-character comment_end the end index below evaluates
            # to 0, which would yield an empty string; confirm such delimiters are not used.
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # No end delimiter: the comment runs up to the next line break or end of input.
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        """Scans a numeric literal that starts at the current digit.

        Handles 0b/0x prefixes, decimals, exponents, underscore separators and type
        suffixes, each subject to the corresponding configuration.
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        # 0: no exponent seen, 1: "E" consumed, 2: exponent sign consumed
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Remember the digits seen so far, then collect the trailing suffix.
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Unknown suffix: rewind so it is scanned as a separate token.
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)

    def _scan_bits(self) -> None:
        """Scans a 0b-prefixed literal as a BIT_STRING, or as an IDENTIFIER if it is not binary."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        """Scans a 0x-prefixed literal as a HEX_STRING, or as an IDENTIFIER if it is not hex."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        """Advances until whitespace, a single token or end of input, then returns `_text`."""
        single_tokens = self.single_tokens

        while True:
            # strip() turns a whitespace character into "", which ends the loop below.
            char = self._peek.strip()
            if char and char not in single_tokens:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        """Scans a string literal opened by `start`.

        Returns False if `start` is neither a quote nor a format-string prefix; otherwise
        adds a token and returns True. Raises `TokenError` when a hex or bit string
        contains characters that are invalid for its base.
        """
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    # Not a usable heredoc tag: rewind and emit the alternative token instead.
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                # The closing delimiter repeats the opening one, tag included.
                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        """Scans a delimited identifier that is closed by `identifier_end`."""
        self._advance()
        text = self._extract_string(
            identifier_end, escapes=self.identifier_escapes | {identifier_end}
        )
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        """Scans a bare word and adds it as a keyword token if it is one, else as VAR.

        A word that directly follows a PARAMETER token is always added as VAR.
        """
        var_single_tokens = self.var_single_tokens
        single_tokens = self.single_tokens

        while True:
            peek = self._peek
            if not peek or peek.isspace():
                break
            # Single tokens end the word unless they are explicitly allowed inside vars.
            if peek not in var_single_tokens and peek in single_tokens:
                break
            self._advance(alnum=True)

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
        )

    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Reads from the current character up to `delimiter` and returns the text read.

        Args:
            delimiter: The closing delimiter; it may be longer than one character.
            escapes: Escape characters to honor; defaults to `self.string_escapes`.
            raw_string: Whether the string is raw; escapes are then only processed if
                `string_escapes_allowed_in_raw_strings` is set.
            raise_unmatched: Whether a missing delimiter raises instead of returning the
                text read so far.

        Raises:
            TokenError: If the input ends before the delimiter and `raise_unmatched` is
                true, or if an escape sequence is cut off by the end of the input.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Update the position counters directly instead of advancing char by char.
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        while True:
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Append everything the (possibly multi-character) advance moved past.
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], has_bit_strings: bool, has_hex_strings: bool, numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
581 def __init__( 582 self, 583 single_tokens: dict[str, TokenType], 584 keywords: dict[str, TokenType], 585 quotes: dict[str, str], 586 format_strings: dict[str, tuple[str, TokenType]], 587 identifiers: dict[str, str], 588 comments: dict[str, str | None], 589 string_escapes: set[str], 590 byte_string_escapes: set[str], 591 identifier_escapes: set[str], 592 escape_follow_chars: set[str], 593 commands: set[TokenType], 594 command_prefix_tokens: set[TokenType], 595 nested_comments: bool, 596 hint_start: str, 597 tokens_preceding_hint: set[TokenType], 598 has_bit_strings: bool, 599 has_hex_strings: bool, 600 numeric_literals: dict[str, str], 601 var_single_tokens: set[str], 602 string_escapes_allowed_in_raw_strings: bool, 603 heredoc_tag_is_identifier: bool, 604 heredoc_string_alternative: TokenType, 605 keyword_trie: dict, 606 numbers_can_be_underscore_separated: bool, 607 numbers_can_have_decimals: bool, 608 identifiers_can_start_with_digit: bool, 609 unescaped_sequences: dict[str, str], 610 ) -> None: 611 self.single_tokens = single_tokens 612 self.keywords = keywords 613 self.quotes = quotes 614 self.format_strings = format_strings 615 self.identifiers = identifiers 616 self.comments = comments 617 self.string_escapes = string_escapes 618 self.byte_string_escapes = byte_string_escapes 619 self.identifier_escapes = identifier_escapes 620 self.escape_follow_chars = escape_follow_chars 621 self.commands = commands 622 self.command_prefix_tokens = command_prefix_tokens 623 self.nested_comments = nested_comments 624 self.hint_start = hint_start 625 self.tokens_preceding_hint = tokens_preceding_hint 626 self.has_bit_strings = has_bit_strings 627 self.has_hex_strings = has_hex_strings 628 self.numeric_literals = numeric_literals 629 self.var_single_tokens = var_single_tokens 630 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 631 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 632 self.heredoc_string_alternative = 
heredoc_string_alternative 633 self.keyword_trie = keyword_trie 634 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 635 self.numbers_can_have_decimals = numbers_can_have_decimals 636 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 637 self.unescaped_sequences = unescaped_sequences 638 self.sql = "" 639 self.size = 0 640 self.tokens: list[Token] = [] 641 self._start = 0 642 self._current = 0 643 self._line = 1 644 self._col = 0 645 self._comments: list[str] = [] 646 self._char = "" 647 self._end = False 648 self._peek = "" 649 self._prev_token_line = -1
tokens: list[Token]
665 def tokenize(self, sql: str) -> list[Token]: 666 """Returns a list of tokens corresponding to the SQL string `sql`.""" 667 self.reset() 668 self.sql = sql 669 self.size = len(sql) 670 671 try: 672 self._scan() 673 except Exception as e: 674 start = max(self._current - 50, 0) 675 end = min(self._current + 50, self.size - 1) 676 context = self.sql[start:end] 677 raise TokenError(f"Error tokenizing '{context}'") from e 678 679 return self.tokens
Returns a list of tokens corresponding to the SQL string sql.