sqlglot.tokenizer_core
"""Core SQL tokenizer: token types, the token record, and the character scanner."""

from __future__ import annotations

import typing as t
from enum import IntEnum, auto

from sqlglot.errors import TokenError

# dict lookup is faster than .upper() and .isdigit()
_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")


class TokenType(IntEnum):
    """Integer-valued enumeration of every kind of token the tokenizer can emit.

    Values are assigned by `auto()` in declaration order, so inserting or
    reordering members changes the integer value of the members that follow.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the member rendered as `TokenType.<NAME>`."""
        return f"TokenType.{self.name}"


class Token:
    """A single token: its type, its text, and where it was found.

    A token is falsy only when its type is `TokenType.SENTINEL`.
    """

    # Attribute meanings, as populated by `TokenizerCore._add`:
    #   token_type - the `TokenType` of the token
    #   text       - the token text (may differ from the raw SQL slice)
    #   line, col  - the tokenizer's line/column counters when the token was added
    #   start      - index in the SQL string where the token starts
    #   end        - index of the last consumed character (inclusive)
    #   comments   - comments attached to this token
    # mypyc doesn't expose slots
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        """Create a token; `comments` defaults to a fresh empty list."""
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = [] if comments is None else comments

    def __bool__(self) -> bool:
        """Return False only for the SENTINEL token type."""
        return self.token_type != TokenType.SENTINEL

    def __repr__(self) -> str:
        """Return `<Token k: v, ...>` listing every attribute in `_attrs` order."""
        attributes = ", ".join(
            f"{k}: TokenType.{self.token_type.name}"
            if k == "token_type"
            else f"{k}: {getattr(self, k)}"
            for k in self._attrs
        )
        return f"<Token {attributes}>"


class TokenizerCore:
    """Configuration-driven scanner that turns a SQL string into `Token`s.

    All dialect-specific behavior is supplied through the constructor; the
    instance then keeps per-run cursor state (`_start`, `_current`, `_line`,
    `_col`, `_char`, `_peek`, `_end`) that `reset()` clears before each run.
    """

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )

    def __init__(
        self,
        single_tokens: dict[str, TokenType],
        keywords: dict[str, TokenType],
        quotes: dict[str, str],
        format_strings: dict[str, tuple[str, TokenType]],
        identifiers: dict[str, str],
        comments: dict[str, str | None],
        string_escapes: set[str],
        byte_string_escapes: set[str],
        identifier_escapes: set[str],
        escape_follow_chars: set[str],
        commands: set[TokenType],
        command_prefix_tokens: set[TokenType],
        nested_comments: bool,
        hint_start: str,
        tokens_preceding_hint: set[TokenType],
        has_bit_strings: bool,
        has_hex_strings: bool,
        numeric_literals: dict[str, str],
        var_single_tokens: set[str],
        string_escapes_allowed_in_raw_strings: bool,
        heredoc_tag_is_identifier: bool,
        heredoc_string_alternative: TokenType,
        keyword_trie: dict,
        numbers_can_be_underscore_separated: bool,
        numbers_can_have_decimals: bool,
        identifiers_can_start_with_digit: bool,
        unescaped_sequences: dict[str, str],
    ) -> None:
        """Store the tokenizer configuration and initialise the scan state.

        Args:
            single_tokens: Single character -> token type.
            keywords: Upper-cased keyword text -> token type.
            quotes: String start delimiter -> end delimiter.
            format_strings: Prefixed-string start -> (end delimiter, token type).
            identifiers: Identifier start character -> end delimiter.
            comments: Comment start -> end delimiter; a falsy end means the
                comment runs to the end of the line.
            string_escapes: Escape characters used inside strings.
            byte_string_escapes: Escape characters used inside BYTE_STRING tokens.
            identifier_escapes: Escape characters used inside quoted identifiers.
            escape_follow_chars: When non-empty, a backslash followed by a
                character outside this set is handled as an escape.
            commands: Token types after which the rest of the statement is
                captured as one STRING token.
            command_prefix_tokens: Token types that may precede a command token.
            nested_comments: Whether block comments may nest.
            hint_start: Comment start delimiter that denotes a hint.
            tokens_preceding_hint: Token types after which a hint comment
                produces a HINT token.
            has_bit_strings: Whether `0b...` literals are scanned as bit strings.
            has_hex_strings: Whether `0x...` literals are scanned as hex strings.
            numeric_literals: Upper-cased numeric suffix -> keyword text.
            var_single_tokens: Single-token characters allowed inside a VAR.
            string_escapes_allowed_in_raw_strings: Whether escapes are still
                processed inside raw strings.
            heredoc_tag_is_identifier: Whether a heredoc tag has to look like
                an identifier.
            heredoc_string_alternative: Token type emitted when a heredoc tag
                is rejected.
            keyword_trie: Nested dict trie of keywords; key `0` marks a
                complete word.
            numbers_can_be_underscore_separated: Whether `_` may appear in numbers.
            numbers_can_have_decimals: Whether `.` may appear in numbers.
            identifiers_can_start_with_digit: Whether digit-led words may be VARs.
            unescaped_sequences: Two-character escape sequence -> replacement text.
        """
        self.single_tokens = single_tokens
        self.keywords = keywords
        self.quotes = quotes
        self.format_strings = format_strings
        self.identifiers = identifiers
        self.comments = comments
        self.string_escapes = string_escapes
        self.byte_string_escapes = byte_string_escapes
        self.identifier_escapes = identifier_escapes
        self.escape_follow_chars = escape_follow_chars
        self.commands = commands
        self.command_prefix_tokens = command_prefix_tokens
        self.nested_comments = nested_comments
        self.hint_start = hint_start
        self.tokens_preceding_hint = tokens_preceding_hint
        self.has_bit_strings = has_bit_strings
        self.has_hex_strings = has_hex_strings
        self.numeric_literals = numeric_literals
        self.var_single_tokens = var_single_tokens
        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
        self.heredoc_string_alternative = heredoc_string_alternative
        self.keyword_trie = keyword_trie
        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
        self.numbers_can_have_decimals = numbers_can_have_decimals
        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
        self.unescaped_sequences = unescaped_sequences
        self.sql = ""
        self.size = 0
        self.tokens: list[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: list[str] = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def reset(self) -> None:
        """Clear the input, the produced tokens and all cursor state."""
        self.sql = ""
        self.size = 0
        self.tokens = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`.

        Raises:
            TokenError: If scanning fails for any reason; the message quotes
                the SQL around the current position and the original exception
                is chained as the cause.
        """
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            # NOTE(review): the upper bound is `size - 1`, so the last character of
            # `sql` is never part of the context snippet - confirm this is intended.
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, check_semicolon: bool = False) -> None:
        """Main loop: consume the input and append tokens until it is exhausted.

        Args:
            check_semicolon: When true, stop as soon as the next character is
                `;` (used while capturing the body of a command).
        """
        identifiers = self.identifiers
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # Always move by at least one character so the loop makes progress
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if not self._char.isspace():
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Comments left over at the end are attached to the last token
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        """Return `size` characters starting at the current one, or "" past the end."""
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Move the cursor by `i` characters and refresh `_char`, `_peek` and `_end`.

        Args:
            i: Number of characters to move; callers pass negative values to rewind.
            alnum: When true and the new current character is alphanumeric,
                keep consuming for as long as the next character is alphanumeric.
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]

    @property
    def _text(self) -> str:
        """The raw SQL consumed so far for the token being scanned."""
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Append a token of `token_type`, attaching any pending comments.

        Args:
            token_type: Type of the token to append.
            text: Token text; defaults to the raw SQL consumed for this token.
        """
        self._prev_token_line = self._line

        # Comments pending before a semicolon belong to the token preceding it
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            # Drop the tokens produced by the nested scan; only the raw text is kept
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        """Match the longest keyword from the trie, else a single token, else a VAR.

        Runs of whitespace inside a candidate keyword are collapsed to a single
        space before the trie lookup. A matched word may also be the start of a
        string or a comment, in which case scanning is delegated accordingly.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                # Key 0 marks the end of a complete keyword in the trie
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    # Consecutive whitespace: consume it without touching the trie
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Only accept the keyword if it is properly delimited
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()

    def _scan_comment(self, comment_start: str) -> bool:
        """Consume a comment that begins with `comment_start`.

        Returns:
            True if `comment_start` is a configured comment delimiter and the
            comment was consumed, False otherwise.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # NOTE(review): for a one-character end delimiter the stop index is 0,
            # which yields an empty comment text - confirm no dialect relies on that.
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # Line comment: consume up to (not including) the line break
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        """Scan a numeric literal starting at the current digit.

        Handles `0b`/`0x` prefixes, decimals, scientific notation, underscore
        separators and typed suffixes. A suffix found in `numeric_literals` is
        emitted as `NUMBER :: <type>`; otherwise the word becomes a VAR when
        identifiers may start with a digit, or the suffix is rewound.
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if (
                    self._current + 1 < self.size
                    and self.sql[self._current + 1] in _DIGIT_CHARS
                ):
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Remember the digits seen so far before consuming the suffix
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Unknown suffix: rewind so it is scanned as its own token
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)

    def _scan_bits(self) -> None:
        """Scan a `0b...` literal into a BIT_STRING, or an IDENTIFIER if invalid."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        """Scan a `0x...` literal into a HEX_STRING, or an IDENTIFIER if invalid."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        """Consume up to the next whitespace, single token or end of input.

        Returns:
            The raw text consumed for the current token.
        """
        single_tokens = self.single_tokens

        while True:
            # strip() turns a whitespace peek into "", which ends the loop
            char = self._peek.strip()
            if char and char not in single_tokens:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        """Scan a string literal whose opening delimiter is `start`.

        Returns:
            True if `start` opens a quoted or prefixed string and a token was
            added, False if `start` is not a string delimiter.

        Raises:
            TokenError: If a hex or bit string contains characters that are
                invalid for its base, or the string is unterminated.
        """
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    # Not a valid heredoc tag: rewind and emit the alternative token
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        if base and text:
            try:
                int(text, base)
            except ValueError as e:
                # int() only raises ValueError for a str argument; keep the cause chained
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                ) from e

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        """Scan a quoted identifier that is closed by `identifier_end`."""
        self._advance()
        text = self._extract_string(
            identifier_end, escapes=self.identifier_escapes | {identifier_end}
        )
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        """Scan a bare word and add it as a keyword token or a VAR.

        The word ends at whitespace, end of input, or a single-token character
        that is not listed in `var_single_tokens`. A word directly after a
        PARAMETER token is always a VAR.
        """
        var_single_tokens = self.var_single_tokens
        single_tokens = self.single_tokens

        while True:
            peek = self._peek
            if not peek or peek.isspace():
                break
            if peek not in var_single_tokens and peek in single_tokens:
                break
            self._advance(alnum=True)

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
        )

    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Consume characters up to the closing `delimiter` and return the contents.

        Args:
            delimiter: The closing delimiter to look for.
            escapes: Escape characters to honor; defaults to `string_escapes`.
            raw_string: Whether the string is raw (escapes are then only
                processed if `string_escapes_allowed_in_raw_strings` is set).
            raise_unmatched: Whether running out of input raises; when false
                the text gathered so far is returned instead.

        Returns:
            The string contents with escape sequences resolved.

        Raises:
            TokenError: If the closing delimiter is missing.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Fast path: jump straight to the delimiter and fix up the position counters
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        while True:
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Copy everything consumed by this advance (alnum runs are batched)
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class
TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """Integer-valued kinds of token.

    Every member is numbered with `auto()`, so a member's value is its
    position in this declaration, starting at 1. Inserting, removing or
    reordering members therefore shifts the value of every member declared
    after the change.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the member rendered as ``TokenType.<NAME>``."""
        # Spelled out explicitly so str() always shows the member name,
        # whatever default the enum base class would otherwise supply.
        return f"TokenType.{self.name}"
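Integer enumeration of the token kinds used by the tokenizer: operators and punctuation, literal and identifier kinds, data types, keywords, and sentinels. Values are assigned with auto() in declaration order, starting at 1.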
L_PAREN =
<TokenType.L_PAREN: 1>
R_PAREN =
<TokenType.R_PAREN: 2>
L_BRACKET =
<TokenType.L_BRACKET: 3>
R_BRACKET =
<TokenType.R_BRACKET: 4>
L_BRACE =
<TokenType.L_BRACE: 5>
R_BRACE =
<TokenType.R_BRACE: 6>
COMMA =
<TokenType.COMMA: 7>
DOT =
<TokenType.DOT: 8>
DASH =
<TokenType.DASH: 9>
PLUS =
<TokenType.PLUS: 10>
COLON =
<TokenType.COLON: 11>
DOTCOLON =
<TokenType.DOTCOLON: 12>
DOTCARET =
<TokenType.DOTCARET: 13>
DCOLON =
<TokenType.DCOLON: 14>
DCOLONDOLLAR =
<TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT =
<TokenType.DCOLONPERCENT: 16>
DCOLONQMARK =
<TokenType.DCOLONQMARK: 17>
DQMARK =
<TokenType.DQMARK: 18>
SEMICOLON =
<TokenType.SEMICOLON: 19>
STAR =
<TokenType.STAR: 20>
BACKSLASH =
<TokenType.BACKSLASH: 21>
SLASH =
<TokenType.SLASH: 22>
LT =
<TokenType.LT: 23>
LTE =
<TokenType.LTE: 24>
GT =
<TokenType.GT: 25>
GTE =
<TokenType.GTE: 26>
NOT =
<TokenType.NOT: 27>
EQ =
<TokenType.EQ: 28>
NEQ =
<TokenType.NEQ: 29>
NULLSAFE_EQ =
<TokenType.NULLSAFE_EQ: 30>
COLON_EQ =
<TokenType.COLON_EQ: 31>
COLON_GT =
<TokenType.COLON_GT: 32>
NCOLON_GT =
<TokenType.NCOLON_GT: 33>
AND =
<TokenType.AND: 34>
OR =
<TokenType.OR: 35>
AMP =
<TokenType.AMP: 36>
DPIPE =
<TokenType.DPIPE: 37>
PIPE_GT =
<TokenType.PIPE_GT: 38>
PIPE =
<TokenType.PIPE: 39>
PIPE_SLASH =
<TokenType.PIPE_SLASH: 40>
DPIPE_SLASH =
<TokenType.DPIPE_SLASH: 41>
CARET =
<TokenType.CARET: 42>
CARET_AT =
<TokenType.CARET_AT: 43>
TILDE =
<TokenType.TILDE: 44>
ARROW =
<TokenType.ARROW: 45>
DARROW =
<TokenType.DARROW: 46>
FARROW =
<TokenType.FARROW: 47>
HASH =
<TokenType.HASH: 48>
HASH_ARROW =
<TokenType.HASH_ARROW: 49>
DHASH_ARROW =
<TokenType.DHASH_ARROW: 50>
LR_ARROW =
<TokenType.LR_ARROW: 51>
DAT =
<TokenType.DAT: 52>
LT_AT =
<TokenType.LT_AT: 53>
AT_GT =
<TokenType.AT_GT: 54>
DOLLAR =
<TokenType.DOLLAR: 55>
PARAMETER =
<TokenType.PARAMETER: 56>
SESSION =
<TokenType.SESSION: 57>
SESSION_PARAMETER =
<TokenType.SESSION_PARAMETER: 58>
SESSION_USER =
<TokenType.SESSION_USER: 59>
DAMP =
<TokenType.DAMP: 60>
AMP_LT =
<TokenType.AMP_LT: 61>
AMP_GT =
<TokenType.AMP_GT: 62>
ADJACENT =
<TokenType.ADJACENT: 63>
XOR =
<TokenType.XOR: 64>
DSTAR =
<TokenType.DSTAR: 65>
QMARK_AMP =
<TokenType.QMARK_AMP: 66>
QMARK_PIPE =
<TokenType.QMARK_PIPE: 67>
HASH_DASH =
<TokenType.HASH_DASH: 68>
EXCLAMATION =
<TokenType.EXCLAMATION: 69>
URI_START =
<TokenType.URI_START: 70>
BLOCK_START =
<TokenType.BLOCK_START: 71>
BLOCK_END =
<TokenType.BLOCK_END: 72>
SPACE =
<TokenType.SPACE: 73>
BREAK =
<TokenType.BREAK: 74>
STRING =
<TokenType.STRING: 75>
NUMBER =
<TokenType.NUMBER: 76>
IDENTIFIER =
<TokenType.IDENTIFIER: 77>
DATABASE =
<TokenType.DATABASE: 78>
COLUMN =
<TokenType.COLUMN: 79>
COLUMN_DEF =
<TokenType.COLUMN_DEF: 80>
SCHEMA =
<TokenType.SCHEMA: 81>
TABLE =
<TokenType.TABLE: 82>
WAREHOUSE =
<TokenType.WAREHOUSE: 83>
STAGE =
<TokenType.STAGE: 84>
STREAM =
<TokenType.STREAM: 85>
STREAMLIT =
<TokenType.STREAMLIT: 86>
VAR =
<TokenType.VAR: 87>
BIT_STRING =
<TokenType.BIT_STRING: 88>
HEX_STRING =
<TokenType.HEX_STRING: 89>
BYTE_STRING =
<TokenType.BYTE_STRING: 90>
NATIONAL_STRING =
<TokenType.NATIONAL_STRING: 91>
RAW_STRING =
<TokenType.RAW_STRING: 92>
HEREDOC_STRING =
<TokenType.HEREDOC_STRING: 93>
UNICODE_STRING =
<TokenType.UNICODE_STRING: 94>
BIT =
<TokenType.BIT: 95>
BOOLEAN =
<TokenType.BOOLEAN: 96>
TINYINT =
<TokenType.TINYINT: 97>
UTINYINT =
<TokenType.UTINYINT: 98>
SMALLINT =
<TokenType.SMALLINT: 99>
USMALLINT =
<TokenType.USMALLINT: 100>
MEDIUMINT =
<TokenType.MEDIUMINT: 101>
UMEDIUMINT =
<TokenType.UMEDIUMINT: 102>
INT =
<TokenType.INT: 103>
UINT =
<TokenType.UINT: 104>
BIGINT =
<TokenType.BIGINT: 105>
UBIGINT =
<TokenType.UBIGINT: 106>
BIGNUM =
<TokenType.BIGNUM: 107>
INT128 =
<TokenType.INT128: 108>
UINT128 =
<TokenType.UINT128: 109>
INT256 =
<TokenType.INT256: 110>
UINT256 =
<TokenType.UINT256: 111>
FLOAT =
<TokenType.FLOAT: 112>
DOUBLE =
<TokenType.DOUBLE: 113>
UDOUBLE =
<TokenType.UDOUBLE: 114>
DECIMAL =
<TokenType.DECIMAL: 115>
DECIMAL32 =
<TokenType.DECIMAL32: 116>
DECIMAL64 =
<TokenType.DECIMAL64: 117>
DECIMAL128 =
<TokenType.DECIMAL128: 118>
DECIMAL256 =
<TokenType.DECIMAL256: 119>
DECFLOAT =
<TokenType.DECFLOAT: 120>
UDECIMAL =
<TokenType.UDECIMAL: 121>
BIGDECIMAL =
<TokenType.BIGDECIMAL: 122>
CHAR =
<TokenType.CHAR: 123>
NCHAR =
<TokenType.NCHAR: 124>
VARCHAR =
<TokenType.VARCHAR: 125>
NVARCHAR =
<TokenType.NVARCHAR: 126>
BPCHAR =
<TokenType.BPCHAR: 127>
TEXT =
<TokenType.TEXT: 128>
MEDIUMTEXT =
<TokenType.MEDIUMTEXT: 129>
LONGTEXT =
<TokenType.LONGTEXT: 130>
BLOB =
<TokenType.BLOB: 131>
MEDIUMBLOB =
<TokenType.MEDIUMBLOB: 132>
LONGBLOB =
<TokenType.LONGBLOB: 133>
TINYBLOB =
<TokenType.TINYBLOB: 134>
TINYTEXT =
<TokenType.TINYTEXT: 135>
NAME =
<TokenType.NAME: 136>
BINARY =
<TokenType.BINARY: 137>
VARBINARY =
<TokenType.VARBINARY: 138>
JSON =
<TokenType.JSON: 139>
JSONB =
<TokenType.JSONB: 140>
TIME =
<TokenType.TIME: 141>
TIMETZ =
<TokenType.TIMETZ: 142>
TIME_NS =
<TokenType.TIME_NS: 143>
TIMESTAMP =
<TokenType.TIMESTAMP: 144>
TIMESTAMPTZ =
<TokenType.TIMESTAMPTZ: 145>
TIMESTAMPLTZ =
<TokenType.TIMESTAMPLTZ: 146>
TIMESTAMPNTZ =
<TokenType.TIMESTAMPNTZ: 147>
TIMESTAMP_S =
<TokenType.TIMESTAMP_S: 148>
TIMESTAMP_MS =
<TokenType.TIMESTAMP_MS: 149>
TIMESTAMP_NS =
<TokenType.TIMESTAMP_NS: 150>
DATETIME =
<TokenType.DATETIME: 151>
DATETIME2 =
<TokenType.DATETIME2: 152>
DATETIME64 =
<TokenType.DATETIME64: 153>
SMALLDATETIME =
<TokenType.SMALLDATETIME: 154>
DATE =
<TokenType.DATE: 155>
DATE32 =
<TokenType.DATE32: 156>
INT4RANGE =
<TokenType.INT4RANGE: 157>
INT4MULTIRANGE =
<TokenType.INT4MULTIRANGE: 158>
INT8RANGE =
<TokenType.INT8RANGE: 159>
INT8MULTIRANGE =
<TokenType.INT8MULTIRANGE: 160>
NUMRANGE =
<TokenType.NUMRANGE: 161>
NUMMULTIRANGE =
<TokenType.NUMMULTIRANGE: 162>
TSRANGE =
<TokenType.TSRANGE: 163>
TSMULTIRANGE =
<TokenType.TSMULTIRANGE: 164>
TSTZRANGE =
<TokenType.TSTZRANGE: 165>
TSTZMULTIRANGE =
<TokenType.TSTZMULTIRANGE: 166>
DATERANGE =
<TokenType.DATERANGE: 167>
DATEMULTIRANGE =
<TokenType.DATEMULTIRANGE: 168>
UUID =
<TokenType.UUID: 169>
GEOGRAPHY =
<TokenType.GEOGRAPHY: 170>
GEOGRAPHYPOINT =
<TokenType.GEOGRAPHYPOINT: 171>
NULLABLE =
<TokenType.NULLABLE: 172>
GEOMETRY =
<TokenType.GEOMETRY: 173>
POINT =
<TokenType.POINT: 174>
RING =
<TokenType.RING: 175>
LINESTRING =
<TokenType.LINESTRING: 176>
LOCALTIME =
<TokenType.LOCALTIME: 177>
LOCALTIMESTAMP =
<TokenType.LOCALTIMESTAMP: 178>
SYSTIMESTAMP =
<TokenType.SYSTIMESTAMP: 179>
MULTILINESTRING =
<TokenType.MULTILINESTRING: 180>
POLYGON =
<TokenType.POLYGON: 181>
MULTIPOLYGON =
<TokenType.MULTIPOLYGON: 182>
HLLSKETCH =
<TokenType.HLLSKETCH: 183>
HSTORE =
<TokenType.HSTORE: 184>
SUPER =
<TokenType.SUPER: 185>
SERIAL =
<TokenType.SERIAL: 186>
SMALLSERIAL =
<TokenType.SMALLSERIAL: 187>
BIGSERIAL =
<TokenType.BIGSERIAL: 188>
XML =
<TokenType.XML: 189>
YEAR =
<TokenType.YEAR: 190>
USERDEFINED =
<TokenType.USERDEFINED: 191>
MONEY =
<TokenType.MONEY: 192>
SMALLMONEY =
<TokenType.SMALLMONEY: 193>
ROWVERSION =
<TokenType.ROWVERSION: 194>
IMAGE =
<TokenType.IMAGE: 195>
VARIANT =
<TokenType.VARIANT: 196>
OBJECT =
<TokenType.OBJECT: 197>
INET =
<TokenType.INET: 198>
IPADDRESS =
<TokenType.IPADDRESS: 199>
IPPREFIX =
<TokenType.IPPREFIX: 200>
IPV4 =
<TokenType.IPV4: 201>
IPV6 =
<TokenType.IPV6: 202>
ENUM =
<TokenType.ENUM: 203>
ENUM8 =
<TokenType.ENUM8: 204>
ENUM16 =
<TokenType.ENUM16: 205>
FIXEDSTRING =
<TokenType.FIXEDSTRING: 206>
LOWCARDINALITY =
<TokenType.LOWCARDINALITY: 207>
NESTED =
<TokenType.NESTED: 208>
AGGREGATEFUNCTION =
<TokenType.AGGREGATEFUNCTION: 209>
SIMPLEAGGREGATEFUNCTION =
<TokenType.SIMPLEAGGREGATEFUNCTION: 210>
TDIGEST =
<TokenType.TDIGEST: 211>
UNKNOWN =
<TokenType.UNKNOWN: 212>
VECTOR =
<TokenType.VECTOR: 213>
DYNAMIC =
<TokenType.DYNAMIC: 214>
VOID =
<TokenType.VOID: 215>
ALIAS =
<TokenType.ALIAS: 216>
ALTER =
<TokenType.ALTER: 217>
ALL =
<TokenType.ALL: 218>
ANTI =
<TokenType.ANTI: 219>
ANY =
<TokenType.ANY: 220>
APPLY =
<TokenType.APPLY: 221>
ARRAY =
<TokenType.ARRAY: 222>
ASC =
<TokenType.ASC: 223>
ASOF =
<TokenType.ASOF: 224>
ATTACH =
<TokenType.ATTACH: 225>
AUTO_INCREMENT =
<TokenType.AUTO_INCREMENT: 226>
BEGIN =
<TokenType.BEGIN: 227>
BETWEEN =
<TokenType.BETWEEN: 228>
BULK_COLLECT_INTO =
<TokenType.BULK_COLLECT_INTO: 229>
CACHE =
<TokenType.CACHE: 230>
CASE =
<TokenType.CASE: 231>
CHARACTER_SET =
<TokenType.CHARACTER_SET: 232>
CLUSTER_BY =
<TokenType.CLUSTER_BY: 233>
COLLATE =
<TokenType.COLLATE: 234>
COMMAND =
<TokenType.COMMAND: 235>
COMMENT =
<TokenType.COMMENT: 236>
COMMIT =
<TokenType.COMMIT: 237>
CONNECT_BY =
<TokenType.CONNECT_BY: 238>
CONSTRAINT =
<TokenType.CONSTRAINT: 239>
COPY =
<TokenType.COPY: 240>
CREATE =
<TokenType.CREATE: 241>
CROSS =
<TokenType.CROSS: 242>
CUBE =
<TokenType.CUBE: 243>
CURRENT_DATE =
<TokenType.CURRENT_DATE: 244>
CURRENT_DATETIME =
<TokenType.CURRENT_DATETIME: 245>
CURRENT_SCHEMA =
<TokenType.CURRENT_SCHEMA: 246>
CURRENT_TIME =
<TokenType.CURRENT_TIME: 247>
CURRENT_TIMESTAMP =
<TokenType.CURRENT_TIMESTAMP: 248>
CURRENT_USER =
<TokenType.CURRENT_USER: 249>
CURRENT_USER_ID =
<TokenType.CURRENT_USER_ID: 250>
CURRENT_ROLE =
<TokenType.CURRENT_ROLE: 251>
CURRENT_CATALOG =
<TokenType.CURRENT_CATALOG: 252>
DECLARE =
<TokenType.DECLARE: 253>
DEFAULT =
<TokenType.DEFAULT: 254>
DELETE =
<TokenType.DELETE: 255>
DESC =
<TokenType.DESC: 256>
DESCRIBE =
<TokenType.DESCRIBE: 257>
DETACH =
<TokenType.DETACH: 258>
DICTIONARY =
<TokenType.DICTIONARY: 259>
DISTINCT =
<TokenType.DISTINCT: 260>
DISTRIBUTE_BY =
<TokenType.DISTRIBUTE_BY: 261>
DIV =
<TokenType.DIV: 262>
DROP =
<TokenType.DROP: 263>
ELSE =
<TokenType.ELSE: 264>
END =
<TokenType.END: 265>
ESCAPE =
<TokenType.ESCAPE: 266>
EXCEPT =
<TokenType.EXCEPT: 267>
EXECUTE =
<TokenType.EXECUTE: 268>
EXISTS =
<TokenType.EXISTS: 269>
FALSE =
<TokenType.FALSE: 270>
FETCH =
<TokenType.FETCH: 271>
FILE =
<TokenType.FILE: 272>
FILE_FORMAT =
<TokenType.FILE_FORMAT: 273>
FILTER =
<TokenType.FILTER: 274>
FINAL =
<TokenType.FINAL: 275>
FIRST =
<TokenType.FIRST: 276>
FOR =
<TokenType.FOR: 277>
FORCE =
<TokenType.FORCE: 278>
FOREIGN_KEY =
<TokenType.FOREIGN_KEY: 279>
FORMAT =
<TokenType.FORMAT: 280>
FROM =
<TokenType.FROM: 281>
FULL =
<TokenType.FULL: 282>
FUNCTION =
<TokenType.FUNCTION: 283>
GET =
<TokenType.GET: 284>
GLOB =
<TokenType.GLOB: 285>
GLOBAL =
<TokenType.GLOBAL: 286>
GRANT =
<TokenType.GRANT: 287>
GROUP_BY =
<TokenType.GROUP_BY: 288>
GROUPING_SETS =
<TokenType.GROUPING_SETS: 289>
HAVING =
<TokenType.HAVING: 290>
HINT =
<TokenType.HINT: 291>
IGNORE =
<TokenType.IGNORE: 292>
ILIKE =
<TokenType.ILIKE: 293>
IN =
<TokenType.IN: 294>
INDEX =
<TokenType.INDEX: 295>
INDEXED_BY =
<TokenType.INDEXED_BY: 296>
INNER =
<TokenType.INNER: 297>
INSERT =
<TokenType.INSERT: 298>
INSTALL =
<TokenType.INSTALL: 299>
INTEGRATION =
<TokenType.INTEGRATION: 300>
INTERSECT =
<TokenType.INTERSECT: 301>
INTERVAL =
<TokenType.INTERVAL: 302>
INTO =
<TokenType.INTO: 303>
INTRODUCER =
<TokenType.INTRODUCER: 304>
IRLIKE =
<TokenType.IRLIKE: 305>
IS =
<TokenType.IS: 306>
ISNULL =
<TokenType.ISNULL: 307>
JOIN =
<TokenType.JOIN: 308>
JOIN_MARKER =
<TokenType.JOIN_MARKER: 309>
KEEP =
<TokenType.KEEP: 310>
KEY =
<TokenType.KEY: 311>
KILL =
<TokenType.KILL: 312>
LANGUAGE =
<TokenType.LANGUAGE: 313>
LATERAL =
<TokenType.LATERAL: 314>
LEFT =
<TokenType.LEFT: 315>
LIKE =
<TokenType.LIKE: 316>
LIMIT =
<TokenType.LIMIT: 317>
LIST =
<TokenType.LIST: 318>
LOAD =
<TokenType.LOAD: 319>
LOCK =
<TokenType.LOCK: 320>
MAP =
<TokenType.MAP: 321>
MATCH =
<TokenType.MATCH: 322>
MATCH_CONDITION =
<TokenType.MATCH_CONDITION: 323>
MATCH_RECOGNIZE =
<TokenType.MATCH_RECOGNIZE: 324>
MEMBER_OF =
<TokenType.MEMBER_OF: 325>
MERGE =
<TokenType.MERGE: 326>
MOD =
<TokenType.MOD: 327>
MODEL =
<TokenType.MODEL: 328>
NATURAL =
<TokenType.NATURAL: 329>
NEXT =
<TokenType.NEXT: 330>
NOTHING =
<TokenType.NOTHING: 331>
NOTNULL =
<TokenType.NOTNULL: 332>
NULL =
<TokenType.NULL: 333>
OBJECT_IDENTIFIER =
<TokenType.OBJECT_IDENTIFIER: 334>
OFFSET =
<TokenType.OFFSET: 335>
ON =
<TokenType.ON: 336>
ONLY =
<TokenType.ONLY: 337>
OPERATOR =
<TokenType.OPERATOR: 338>
ORDER_BY =
<TokenType.ORDER_BY: 339>
ORDER_SIBLINGS_BY =
<TokenType.ORDER_SIBLINGS_BY: 340>
ORDERED =
<TokenType.ORDERED: 341>
ORDINALITY =
<TokenType.ORDINALITY: 342>
OUT =
<TokenType.OUT: 343>
INOUT =
<TokenType.INOUT: 344>
OUTER =
<TokenType.OUTER: 345>
OVER =
<TokenType.OVER: 346>
OVERLAPS =
<TokenType.OVERLAPS: 347>
OVERWRITE =
<TokenType.OVERWRITE: 348>
PACKAGE =
<TokenType.PACKAGE: 349>
PARTITION =
<TokenType.PARTITION: 350>
PARTITION_BY =
<TokenType.PARTITION_BY: 351>
PERCENT =
<TokenType.PERCENT: 352>
PIVOT =
<TokenType.PIVOT: 353>
PLACEHOLDER =
<TokenType.PLACEHOLDER: 354>
POLICY =
<TokenType.POLICY: 355>
POOL =
<TokenType.POOL: 356>
POSITIONAL =
<TokenType.POSITIONAL: 357>
PRAGMA =
<TokenType.PRAGMA: 358>
PREWHERE =
<TokenType.PREWHERE: 359>
PRIMARY_KEY =
<TokenType.PRIMARY_KEY: 360>
PROCEDURE =
<TokenType.PROCEDURE: 361>
PROPERTIES =
<TokenType.PROPERTIES: 362>
PSEUDO_TYPE =
<TokenType.PSEUDO_TYPE: 363>
PUT =
<TokenType.PUT: 364>
QUALIFY =
<TokenType.QUALIFY: 365>
QUOTE =
<TokenType.QUOTE: 366>
QDCOLON =
<TokenType.QDCOLON: 367>
RANGE =
<TokenType.RANGE: 368>
RECURSIVE =
<TokenType.RECURSIVE: 369>
REFRESH =
<TokenType.REFRESH: 370>
RENAME =
<TokenType.RENAME: 371>
REPLACE =
<TokenType.REPLACE: 372>
RETURNING =
<TokenType.RETURNING: 373>
REVOKE =
<TokenType.REVOKE: 374>
REFERENCES =
<TokenType.REFERENCES: 375>
RIGHT =
<TokenType.RIGHT: 376>
RLIKE =
<TokenType.RLIKE: 377>
ROLE =
<TokenType.ROLE: 378>
ROLLBACK =
<TokenType.ROLLBACK: 379>
ROLLUP =
<TokenType.ROLLUP: 380>
ROW =
<TokenType.ROW: 381>
ROWS =
<TokenType.ROWS: 382>
RULE =
<TokenType.RULE: 383>
SELECT =
<TokenType.SELECT: 384>
SEMI =
<TokenType.SEMI: 385>
SEPARATOR =
<TokenType.SEPARATOR: 386>
SEQUENCE =
<TokenType.SEQUENCE: 387>
SERDE_PROPERTIES =
<TokenType.SERDE_PROPERTIES: 388>
SET =
<TokenType.SET: 389>
SETTINGS =
<TokenType.SETTINGS: 390>
SHOW =
<TokenType.SHOW: 391>
SIMILAR_TO =
<TokenType.SIMILAR_TO: 392>
SOME =
<TokenType.SOME: 393>
SORT_BY =
<TokenType.SORT_BY: 394>
SOUNDS_LIKE =
<TokenType.SOUNDS_LIKE: 395>
SQL_SECURITY =
<TokenType.SQL_SECURITY: 396>
START_WITH =
<TokenType.START_WITH: 397>
STORAGE_INTEGRATION =
<TokenType.STORAGE_INTEGRATION: 398>
STRAIGHT_JOIN =
<TokenType.STRAIGHT_JOIN: 399>
STRUCT =
<TokenType.STRUCT: 400>
SUMMARIZE =
<TokenType.SUMMARIZE: 401>
TABLE_SAMPLE =
<TokenType.TABLE_SAMPLE: 402>
TAG =
<TokenType.TAG: 403>
TEMPORARY =
<TokenType.TEMPORARY: 404>
TOP =
<TokenType.TOP: 405>
THEN =
<TokenType.THEN: 406>
TRUE =
<TokenType.TRUE: 407>
TRUNCATE =
<TokenType.TRUNCATE: 408>
TRIGGER =
<TokenType.TRIGGER: 409>
TYPE =
<TokenType.TYPE: 410>
UNCACHE =
<TokenType.UNCACHE: 411>
UNION =
<TokenType.UNION: 412>
UNNEST =
<TokenType.UNNEST: 413>
UNPIVOT =
<TokenType.UNPIVOT: 414>
UPDATE =
<TokenType.UPDATE: 415>
USE =
<TokenType.USE: 416>
USING =
<TokenType.USING: 417>
VALUES =
<TokenType.VALUES: 418>
VARIADIC =
<TokenType.VARIADIC: 419>
VIEW =
<TokenType.VIEW: 420>
SEMANTIC_VIEW =
<TokenType.SEMANTIC_VIEW: 421>
VOLATILE =
<TokenType.VOLATILE: 422>
VOLUME =
<TokenType.VOLUME: 423>
WHEN =
<TokenType.WHEN: 424>
WHERE =
<TokenType.WHERE: 425>
WINDOW =
<TokenType.WINDOW: 426>
WITH =
<TokenType.WITH: 427>
UNIQUE =
<TokenType.UNIQUE: 428>
UTC_DATE =
<TokenType.UTC_DATE: 429>
UTC_TIME =
<TokenType.UTC_TIME: 430>
UTC_TIMESTAMP =
<TokenType.UTC_TIMESTAMP: 431>
VERSION_SNAPSHOT =
<TokenType.VERSION_SNAPSHOT: 432>
TIMESTAMP_SNAPSHOT =
<TokenType.TIMESTAMP_SNAPSHOT: 433>
OPTION =
<TokenType.OPTION: 434>
SINK =
<TokenType.SINK: 435>
SOURCE =
<TokenType.SOURCE: 436>
ANALYZE =
<TokenType.ANALYZE: 437>
NAMESPACE =
<TokenType.NAMESPACE: 438>
EXPORT =
<TokenType.EXPORT: 439>
HIVE_TOKEN_STREAM =
<TokenType.HIVE_TOKEN_STREAM: 440>
SENTINEL =
<TokenType.SENTINEL: 441>
class
Token:
class Token:
    """One lexical token: its kind, its text, and where it was found.

    Instances only carry the attributes named in `_attrs`; the class defines
    `__slots__`, so no per-instance `__dict__` is created.
    """

    # mypyc doesn't expose slots, so the attribute names are also kept in a
    # regular class attribute that `__repr__` can iterate over.
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        """Store the token's kind, text, position and attached comments.

        Args:
            token_type: The kind of token.
            text: The token's text.
            line: Line number recorded for the token.
            col: Column recorded for the token.
            start: Start offset recorded for the token.
            end: End offset recorded for the token.
            comments: Comments attached to the token. The list is stored as
                given (not copied); a new empty list is used when omitted.
        """
        if comments is None:
            comments = []

        self.comments = comments
        self.end = end
        self.start = start
        self.col = col
        self.line = line
        self.text = text
        self.token_type = token_type

    @classmethod
    def number(cls, number: int) -> Token:
        """Build a NUMBER token whose text is `str(number)`."""
        text = str(number)
        return cls(TokenType.NUMBER, text)

    @classmethod
    def string(cls, string: str) -> Token:
        """Build a STRING token whose text is `string`."""
        kind = TokenType.STRING
        return cls(kind, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Build an IDENTIFIER token whose text is `identifier`."""
        kind = TokenType.IDENTIFIER
        return cls(kind, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Build a VAR token whose text is `var`."""
        kind = TokenType.VAR
        return cls(kind, var)

    def __bool__(self) -> bool:
        """Every token is truthy except one whose type is the SENTINEL."""
        kind = self.token_type
        return kind != TokenType.SENTINEL

    def __repr__(self) -> str:
        """Render all attributes as ``<Token name: value, ...>``."""
        parts: list[str] = []

        for attr in self._attrs:
            if attr == "token_type":
                # Show the member name rather than the enum's integer value.
                parts.append(f"{attr}: TokenType.{self.token_type.name}")
            else:
                parts.append(f"{attr}: {getattr(self, attr)}")

        return "<Token " + ", ".join(parts) + ">"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
504 def __init__( 505 self, 506 token_type: TokenType, 507 text: str, 508 line: int = 1, 509 col: int = 1, 510 start: int = 0, 511 end: int = 0, 512 comments: list[str] | None = None, 513 ) -> None: 514 self.token_type = token_type 515 self.text = text 516 self.line = line 517 self.col = col 518 self.start = start 519 self.end = end 520 self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
    """Build a NUMBER token whose text is `str(number)`."""
    text = str(number)
    return cls(TokenType.NUMBER, text)
Returns a NUMBER token with number as its text.
@classmethod
def string(cls, string: str) -> Token:
    """Build a STRING token whose text is `string`."""
    kind = TokenType.STRING
    return cls(kind, string)
Returns a STRING token with string as its text.
@classmethod
def identifier(cls, identifier: str) -> Token:
    """Build an IDENTIFIER token whose text is `identifier`."""
    kind = TokenType.IDENTIFIER
    return cls(kind, identifier)
Returns an IDENTIFIER token with identifier as its text.
class
TokenizerCore:
535class TokenizerCore: 536 __slots__ = ( 537 "sql", 538 "size", 539 "tokens", 540 "_start", 541 "_current", 542 "_line", 543 "_col", 544 "_comments", 545 "_char", 546 "_end", 547 "_peek", 548 "_prev_token_line", 549 "single_tokens", 550 "keywords", 551 "quotes", 552 "format_strings", 553 "identifiers", 554 "comments", 555 "string_escapes", 556 "byte_string_escapes", 557 "identifier_escapes", 558 "escape_follow_chars", 559 "commands", 560 "command_prefix_tokens", 561 "nested_comments", 562 "hint_start", 563 "tokens_preceding_hint", 564 "has_bit_strings", 565 "has_hex_strings", 566 "numeric_literals", 567 "var_single_tokens", 568 "string_escapes_allowed_in_raw_strings", 569 "heredoc_tag_is_identifier", 570 "heredoc_string_alternative", 571 "keyword_trie", 572 "numbers_can_be_underscore_separated", 573 "numbers_can_have_decimals", 574 "identifiers_can_start_with_digit", 575 "unescaped_sequences", 576 ) 577 578 def __init__( 579 self, 580 single_tokens: dict[str, TokenType], 581 keywords: dict[str, TokenType], 582 quotes: dict[str, str], 583 format_strings: dict[str, tuple[str, TokenType]], 584 identifiers: dict[str, str], 585 comments: dict[str, str | None], 586 string_escapes: set[str], 587 byte_string_escapes: set[str], 588 identifier_escapes: set[str], 589 escape_follow_chars: set[str], 590 commands: set[TokenType], 591 command_prefix_tokens: set[TokenType], 592 nested_comments: bool, 593 hint_start: str, 594 tokens_preceding_hint: set[TokenType], 595 has_bit_strings: bool, 596 has_hex_strings: bool, 597 numeric_literals: dict[str, str], 598 var_single_tokens: set[str], 599 string_escapes_allowed_in_raw_strings: bool, 600 heredoc_tag_is_identifier: bool, 601 heredoc_string_alternative: TokenType, 602 keyword_trie: dict, 603 numbers_can_be_underscore_separated: bool, 604 numbers_can_have_decimals: bool, 605 identifiers_can_start_with_digit: bool, 606 unescaped_sequences: dict[str, str], 607 ) -> None: 608 self.single_tokens = single_tokens 609 self.keywords = 
keywords 610 self.quotes = quotes 611 self.format_strings = format_strings 612 self.identifiers = identifiers 613 self.comments = comments 614 self.string_escapes = string_escapes 615 self.byte_string_escapes = byte_string_escapes 616 self.identifier_escapes = identifier_escapes 617 self.escape_follow_chars = escape_follow_chars 618 self.commands = commands 619 self.command_prefix_tokens = command_prefix_tokens 620 self.nested_comments = nested_comments 621 self.hint_start = hint_start 622 self.tokens_preceding_hint = tokens_preceding_hint 623 self.has_bit_strings = has_bit_strings 624 self.has_hex_strings = has_hex_strings 625 self.numeric_literals = numeric_literals 626 self.var_single_tokens = var_single_tokens 627 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 628 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 629 self.heredoc_string_alternative = heredoc_string_alternative 630 self.keyword_trie = keyword_trie 631 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 632 self.numbers_can_have_decimals = numbers_can_have_decimals 633 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 634 self.unescaped_sequences = unescaped_sequences 635 self.sql = "" 636 self.size = 0 637 self.tokens: list[Token] = [] 638 self._start = 0 639 self._current = 0 640 self._line = 1 641 self._col = 0 642 self._comments: list[str] = [] 643 self._char = "" 644 self._end = False 645 self._peek = "" 646 self._prev_token_line = -1 647 648 def reset(self) -> None: 649 self.sql = "" 650 self.size = 0 651 self.tokens = [] 652 self._start = 0 653 self._current = 0 654 self._line = 1 655 self._col = 0 656 self._comments = [] 657 self._char = "" 658 self._end = False 659 self._peek = "" 660 self._prev_token_line = -1 661 662 def tokenize(self, sql: str) -> list[Token]: 663 """Returns a list of tokens corresponding to the SQL string `sql`.""" 664 self.reset() 665 self.sql = sql 666 self.size = 
len(sql) 667 668 try: 669 self._scan() 670 except Exception as e: 671 start = max(self._current - 50, 0) 672 end = min(self._current + 50, self.size - 1) 673 context = self.sql[start:end] 674 raise TokenError(f"Error tokenizing '{context}'") from e 675 676 return self.tokens 677 678 def _scan(self, check_semicolon: bool = False) -> None: 679 identifiers = self.identifiers 680 digit_chars = _DIGIT_CHARS 681 682 while self.size and not self._end: 683 current = self._current 684 685 # Skip spaces here rather than iteratively calling advance() for performance reasons 686 while current < self.size: 687 char = self.sql[current] 688 689 if char == " " or char == "\t": 690 current += 1 691 else: 692 break 693 694 offset = current - self._current if current > self._current else 1 695 696 self._start = current 697 self._advance(offset) 698 699 if not self._char.isspace(): 700 if self._char in digit_chars: 701 self._scan_number() 702 elif self._char in identifiers: 703 self._scan_identifier(identifiers[self._char]) 704 else: 705 self._scan_keywords() 706 707 if check_semicolon and self._peek == ";": 708 break 709 710 if self.tokens and self._comments: 711 self.tokens[-1].comments.extend(self._comments) 712 713 def _chars(self, size: int) -> str: 714 if size == 1: 715 return self._char 716 717 start = self._current - 1 718 end = start + size 719 720 return self.sql[start:end] if end <= self.size else "" 721 722 def _advance(self, i: int = 1, alnum: bool = False) -> None: 723 char = self._char 724 725 if char == "\n" or char == "\r": 726 # Ensures we don't count an extra line if we get a \r\n line break sequence 727 if not (char == "\r" and self._peek == "\n"): 728 self._col = i 729 self._line += 1 730 else: 731 self._col += i 732 733 self._current += i 734 sql = self.sql 735 size = self.size 736 self._end = self._current >= size 737 self._char = sql[self._current - 1] 738 self._peek = "" if self._end else sql[self._current] 739 740 if alnum and self._char.isalnum(): 741 # 
Cache to local variables instead of attributes for better performance 742 _col = self._col 743 _current = self._current 744 _end = self._end 745 _peek = self._peek 746 747 while _peek.isalnum(): 748 _col += 1 749 _current += 1 750 _end = _current >= size 751 _peek = "" if _end else sql[_current] 752 753 self._col = _col 754 self._current = _current 755 self._end = _end 756 self._peek = _peek 757 self._char = sql[_current - 1] 758 759 @property 760 def _text(self) -> str: 761 return self.sql[self._start : self._current] 762 763 def _add(self, token_type: TokenType, text: str | None = None) -> None: 764 self._prev_token_line = self._line 765 766 if self._comments and token_type == TokenType.SEMICOLON and self.tokens: 767 self.tokens[-1].comments.extend(self._comments) 768 self._comments = [] 769 770 if text is None: 771 text = self.sql[self._start : self._current] 772 773 self.tokens.append( 774 Token( 775 token_type, 776 text=text, 777 line=self._line, 778 col=self._col, 779 start=self._start, 780 end=self._current - 1, 781 comments=self._comments, 782 ) 783 ) 784 self._comments = [] 785 786 # If we have either a semicolon or a begin token before the command's token, we'll parse 787 # whatever follows the command's token as a string 788 if ( 789 token_type in self.commands 790 and self._peek != ";" 791 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens) 792 ): 793 start = self._current 794 tokens = len(self.tokens) 795 self._scan(check_semicolon=True) 796 self.tokens = self.tokens[:tokens] 797 text = self.sql[start : self._current].strip() 798 if text: 799 self._add(TokenType.STRING, text) 800 801 def _scan_keywords(self) -> None: 802 sql = self.sql 803 sql_size = self.size 804 single_tokens = self.single_tokens 805 char_upper = _CHAR_UPPER 806 size = 0 807 word = None 808 chars = self._char 809 char = chars 810 prev_space = False 811 skip = False 812 trie = self.keyword_trie 813 single_token = char in single_tokens 814 815 while 
chars: 816 if not skip: 817 sub = trie.get(char_upper.get(char, char)) 818 if sub is None: 819 break 820 trie = sub 821 if 0 in trie: 822 word = chars 823 824 end = self._current + size 825 size += 1 826 827 if end < sql_size: 828 char = sql[end] 829 single_token = single_token or char in single_tokens 830 is_space = char.isspace() 831 832 if not is_space or not prev_space: 833 if is_space: 834 char = " " 835 chars += char 836 prev_space = is_space 837 skip = False 838 else: 839 skip = True 840 else: 841 char = "" 842 break 843 844 if word: 845 if self._scan_string(word): 846 return 847 if self._scan_comment(word): 848 return 849 if prev_space or single_token or not char: 850 self._advance(size - 1) 851 word = word.upper() 852 self._add(self.keywords[word], text=word) 853 return 854 855 if self._char in single_tokens: 856 self._add(single_tokens[self._char], text=self._char) 857 return 858 859 self._scan_var() 860 861 def _scan_comment(self, comment_start: str) -> bool: 862 if comment_start not in self.comments: 863 return False 864 865 comment_start_line = self._line 866 comment_start_size = len(comment_start) 867 comment_end = self.comments[comment_start] 868 869 if comment_end: 870 # Skip the comment's start delimiter 871 self._advance(comment_start_size) 872 873 comment_count = 1 874 comment_end_size = len(comment_end) 875 nested_comments = self.nested_comments 876 877 while not self._end: 878 if self._chars(comment_end_size) == comment_end: 879 comment_count -= 1 880 if not comment_count: 881 break 882 883 self._advance(alnum=True) 884 885 # Nested comments are allowed by some dialects, e.g. 
databricks, duckdb, postgres 886 if ( 887 nested_comments 888 and not self._end 889 and self._chars(comment_end_size) == comment_start 890 ): 891 self._advance(comment_start_size) 892 comment_count += 1 893 894 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 895 self._advance(comment_end_size - 1) 896 else: 897 _peek = self._peek 898 while not self._end and _peek != "\n" and _peek != "\r": 899 self._advance(alnum=True) 900 _peek = self._peek 901 self._comments.append(self._text[comment_start_size:]) 902 903 if ( 904 comment_start == self.hint_start 905 and self.tokens 906 and self.tokens[-1].token_type in self.tokens_preceding_hint 907 ): 908 self._add(TokenType.HINT) 909 910 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 911 # Multiple consecutive comments are preserved by appending them to the current comments list. 912 if comment_start_line == self._prev_token_line: 913 self.tokens[-1].comments.extend(self._comments) 914 self._comments = [] 915 self._prev_token_line = self._line 916 917 return True 918 919 def _scan_number(self) -> None: 920 if self._char == "0": 921 peek = _CHAR_UPPER.get(self._peek, self._peek) 922 if peek == "B": 923 return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER) 924 elif peek == "X": 925 return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER) 926 927 decimal = False 928 scientific = 0 929 numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated 930 single_tokens = self.single_tokens 931 keywords = self.keywords 932 numeric_literals = self.numeric_literals 933 identifiers_can_start_with_digit = self.identifiers_can_start_with_digit 934 935 is_underscore_separated: bool = False 936 number_text: str = "" 937 numeric_literal: str = "" 938 numeric_type: TokenType | None = None 939 940 while True: 941 if self._peek in _DIGIT_CHARS: 942 # Batch consecutive digits: scan ahead to find how 
many 943 sql = self.sql 944 end = self._current + 1 945 size = self.size 946 while end < size and sql[end] in _DIGIT_CHARS: 947 end += 1 948 self._advance(end - self._current) 949 elif self._peek == "." and not decimal: 950 if ( 951 self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 952 ) or not self.numbers_can_have_decimals: 953 break 954 decimal = True 955 self._advance() 956 elif self._peek in ("-", "+") and scientific == 1: 957 # Only consume +/- if followed by a digit 958 if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS: 959 scientific += 1 960 self._advance() 961 else: 962 break 963 elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific: 964 scientific += 1 965 self._advance() 966 elif self._peek == "_" and numbers_can_be_underscore_separated: 967 is_underscore_separated = True 968 self._advance() 969 elif self._peek.isidentifier(): 970 number_text = self._text 971 972 while self._peek and not self._peek.isspace() and self._peek not in single_tokens: 973 numeric_literal += self._peek 974 self._advance() 975 976 numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), "")) 977 978 if numeric_type: 979 break 980 elif identifiers_can_start_with_digit: 981 return self._add(TokenType.VAR) 982 983 self._advance(-len(numeric_literal)) 984 break 985 else: 986 break 987 988 number_text = number_text or self.sql[self._start : self._current] 989 990 # Normalize inputs such as 100_000 to 100000 991 if is_underscore_separated: 992 number_text = number_text.replace("_", "") 993 994 self._add(TokenType.NUMBER, number_text) 995 996 # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts 997 if numeric_type: 998 self._add(TokenType.DCOLON, "::") 999 self._add(numeric_type, numeric_literal) 1000 1001 def _scan_bits(self) -> None: 1002 self._advance() 1003 value = self._extract_value() 1004 try: 1005 # If `value` can't be converted to a binary, fallback to tokenizing it as an 
identifier 1006 int(value, 2) 1007 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1008 except ValueError: 1009 self._add(TokenType.IDENTIFIER) 1010 1011 def _scan_hex(self) -> None: 1012 self._advance() 1013 value = self._extract_value() 1014 try: 1015 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1016 int(value, 16) 1017 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1018 except ValueError: 1019 self._add(TokenType.IDENTIFIER) 1020 1021 def _extract_value(self) -> str: 1022 single_tokens = self.single_tokens 1023 1024 while True: 1025 char = self._peek.strip() 1026 if char and char not in single_tokens: 1027 self._advance(alnum=True) 1028 else: 1029 break 1030 1031 return self._text 1032 1033 def _scan_string(self, start: str) -> bool: 1034 base = None 1035 token_type = TokenType.STRING 1036 1037 if start in self.quotes: 1038 end = self.quotes[start] 1039 elif start in self.format_strings: 1040 end, token_type = self.format_strings[start] 1041 1042 if token_type == TokenType.HEX_STRING: 1043 base = 16 1044 elif token_type == TokenType.BIT_STRING: 1045 base = 2 1046 elif token_type == TokenType.HEREDOC_STRING: 1047 self._advance() 1048 1049 if self._char == end: 1050 tag = "" 1051 else: 1052 tag = self._extract_string( 1053 end, 1054 raw_string=True, 1055 raise_unmatched=not self.heredoc_tag_is_identifier, 1056 ) 1057 1058 if ( 1059 tag 1060 and self.heredoc_tag_is_identifier 1061 and (self._end or tag.isdigit() or any(c.isspace() for c in tag)) 1062 ): 1063 if not self._end: 1064 self._advance(-1) 1065 1066 self._advance(-len(tag)) 1067 self._add(self.heredoc_string_alternative) 1068 return True 1069 1070 end = f"{start}{tag}{end}" 1071 else: 1072 return False 1073 1074 self._advance(len(start)) 1075 text = self._extract_string( 1076 end, 1077 escapes=( 1078 self.byte_string_escapes 1079 if token_type == TokenType.BYTE_STRING 1080 else self.string_escapes 1081 ), 1082 raw_string=token_type == 
TokenType.RAW_STRING, 1083 ) 1084 1085 if base and text: 1086 try: 1087 int(text, base) 1088 except Exception: 1089 raise TokenError( 1090 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1091 ) 1092 1093 self._add(token_type, text) 1094 return True 1095 1096 def _scan_identifier(self, identifier_end: str) -> None: 1097 self._advance() 1098 text = self._extract_string( 1099 identifier_end, escapes=self.identifier_escapes | {identifier_end} 1100 ) 1101 self._add(TokenType.IDENTIFIER, text) 1102 1103 def _scan_var(self) -> None: 1104 var_single_tokens = self.var_single_tokens 1105 single_tokens = self.single_tokens 1106 1107 while True: 1108 peek = self._peek 1109 if not peek or peek.isspace(): 1110 break 1111 if peek not in var_single_tokens and peek in single_tokens: 1112 break 1113 self._advance(alnum=True) 1114 1115 self._add( 1116 TokenType.VAR 1117 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1118 else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR) 1119 ) 1120 1121 def _extract_string( 1122 self, 1123 delimiter: str, 1124 escapes: set[str] | None = None, 1125 raw_string: bool = False, 1126 raise_unmatched: bool = True, 1127 ) -> str: 1128 text = "" 1129 delim_size = len(delimiter) 1130 escapes = self.string_escapes if escapes is None else escapes 1131 unescaped_sequences = self.unescaped_sequences 1132 escape_follow_chars = self.escape_follow_chars 1133 string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings 1134 quotes = self.quotes 1135 sql = self.sql 1136 1137 # use str.find() when the string is simple... no \ or other escapes 1138 if delim_size == 1: 1139 pos = self._current - 1 1140 end = sql.find(delimiter, pos) 1141 1142 if ( 1143 # the closing delimiter was found 1144 end != -1 1145 # there's no doubled delimiter (e.g. 
'' escape), or the delimiter isn't an escape char 1146 and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes) 1147 # no backslash in the string that would need escape processing 1148 and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1) 1149 ): 1150 newlines = sql.count("\n", pos, end) 1151 if newlines: 1152 self._line += newlines 1153 self._col = end - sql.rfind("\n", pos, end) 1154 else: 1155 self._col += end - pos 1156 1157 self._current = end + 1 1158 self._end = self._current >= self.size 1159 self._char = sql[end] 1160 self._peek = "" if self._end else sql[self._current] 1161 return sql[pos:end] 1162 1163 while True: 1164 if not raw_string and unescaped_sequences and self._peek and self._char in escapes: 1165 unescaped_sequence = unescaped_sequences.get(self._char + self._peek) 1166 if unescaped_sequence: 1167 self._advance(2) 1168 text += unescaped_sequence 1169 continue 1170 1171 is_valid_custom_escape = ( 1172 escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars 1173 ) 1174 1175 if ( 1176 (string_escapes_allowed_in_raw_strings or not raw_string) 1177 and self._char in escapes 1178 and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape) 1179 and (self._char not in quotes or self._char == self._peek) 1180 ): 1181 if self._peek == delimiter: 1182 text += self._peek 1183 elif is_valid_custom_escape and self._char != self._peek: 1184 text += self._peek 1185 else: 1186 text += self._char + self._peek 1187 1188 if self._current + 1 < self.size: 1189 self._advance(2) 1190 else: 1191 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1192 else: 1193 if self._chars(delim_size) == delimiter: 1194 if delim_size > 1: 1195 self._advance(delim_size - 1) 1196 break 1197 1198 if self._end: 1199 if not raise_unmatched: 1200 return text + self._char 1201 1202 raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}") 1203 
1204 current = self._current - 1 1205 self._advance(alnum=True) 1206 text += sql[current : self._current - 1] 1207 1208 return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], has_bit_strings: bool, has_hex_strings: bool, numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
578 def __init__( 579 self, 580 single_tokens: dict[str, TokenType], 581 keywords: dict[str, TokenType], 582 quotes: dict[str, str], 583 format_strings: dict[str, tuple[str, TokenType]], 584 identifiers: dict[str, str], 585 comments: dict[str, str | None], 586 string_escapes: set[str], 587 byte_string_escapes: set[str], 588 identifier_escapes: set[str], 589 escape_follow_chars: set[str], 590 commands: set[TokenType], 591 command_prefix_tokens: set[TokenType], 592 nested_comments: bool, 593 hint_start: str, 594 tokens_preceding_hint: set[TokenType], 595 has_bit_strings: bool, 596 has_hex_strings: bool, 597 numeric_literals: dict[str, str], 598 var_single_tokens: set[str], 599 string_escapes_allowed_in_raw_strings: bool, 600 heredoc_tag_is_identifier: bool, 601 heredoc_string_alternative: TokenType, 602 keyword_trie: dict, 603 numbers_can_be_underscore_separated: bool, 604 numbers_can_have_decimals: bool, 605 identifiers_can_start_with_digit: bool, 606 unescaped_sequences: dict[str, str], 607 ) -> None: 608 self.single_tokens = single_tokens 609 self.keywords = keywords 610 self.quotes = quotes 611 self.format_strings = format_strings 612 self.identifiers = identifiers 613 self.comments = comments 614 self.string_escapes = string_escapes 615 self.byte_string_escapes = byte_string_escapes 616 self.identifier_escapes = identifier_escapes 617 self.escape_follow_chars = escape_follow_chars 618 self.commands = commands 619 self.command_prefix_tokens = command_prefix_tokens 620 self.nested_comments = nested_comments 621 self.hint_start = hint_start 622 self.tokens_preceding_hint = tokens_preceding_hint 623 self.has_bit_strings = has_bit_strings 624 self.has_hex_strings = has_hex_strings 625 self.numeric_literals = numeric_literals 626 self.var_single_tokens = var_single_tokens 627 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 628 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 629 self.heredoc_string_alternative = 
heredoc_string_alternative 630 self.keyword_trie = keyword_trie 631 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 632 self.numbers_can_have_decimals = numbers_can_have_decimals 633 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 634 self.unescaped_sequences = unescaped_sequences 635 self.sql = "" 636 self.size = 0 637 self.tokens: list[Token] = [] 638 self._start = 0 639 self._current = 0 640 self._line = 1 641 self._col = 0 642 self._comments: list[str] = [] 643 self._char = "" 644 self._end = False 645 self._peek = "" 646 self._prev_token_line = -1
tokens: list[Token]
def tokenize(self, sql: str) -> list[Token]:
    """Tokenize the SQL string `sql` and return the resulting tokens.

    State from any previous run is cleared before `sql` is loaded and scanned.

    Args:
        sql: The SQL text to tokenize.

    Returns:
        The list stored in `self.tokens` once scanning has finished.

    Raises:
        TokenError: If scanning raises any exception. The message quotes a window of
            the input around the current position, and the original exception is
            chained as the cause.
    """
    self.reset()
    self.sql, self.size = sql, len(sql)

    try:
        self._scan()
    except Exception as exc:
        cursor = self._current
        # Up to 50 characters on either side of the cursor.
        window_start = cursor - 50 if cursor > 50 else 0
        # The upper bound is capped at `size - 1` and then used as an exclusive slice
        # end, so the last character of the input is never part of the snippet.
        window_end = min(self.size - 1, cursor + 50)
        snippet = self.sql[window_start:window_end]
        raise TokenError(f"Error tokenizing '{snippet}'") from exc
    else:
        return self.tokens
Returns a list of tokens corresponding to the SQL string sql.