# sqlglot.tokenizer_core
1from __future__ import annotations 2 3import typing as t 4from enum import IntEnum, auto 5 6from sqlglot.errors import TokenError 7 8# dict lookup is faster than .upper() and .isdigit() 9_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)} 10_DIGIT_CHARS: frozenset[str] = frozenset("0123456789") 11 12 13class TokenType(IntEnum): 14 L_PAREN = auto() 15 R_PAREN = auto() 16 L_BRACKET = auto() 17 R_BRACKET = auto() 18 L_BRACE = auto() 19 R_BRACE = auto() 20 COMMA = auto() 21 DOT = auto() 22 DASH = auto() 23 PLUS = auto() 24 COLON = auto() 25 DOTCOLON = auto() 26 DOTCARET = auto() 27 DCOLON = auto() 28 DCOLONDOLLAR = auto() 29 DCOLONPERCENT = auto() 30 DCOLONQMARK = auto() 31 DQMARK = auto() 32 SEMICOLON = auto() 33 STAR = auto() 34 BACKSLASH = auto() 35 SLASH = auto() 36 LT = auto() 37 LTE = auto() 38 GT = auto() 39 GTE = auto() 40 NOT = auto() 41 EQ = auto() 42 NEQ = auto() 43 NULLSAFE_EQ = auto() 44 COLON_EQ = auto() 45 COLON_GT = auto() 46 NCOLON_GT = auto() 47 AND = auto() 48 OR = auto() 49 AMP = auto() 50 DPIPE = auto() 51 PIPE_GT = auto() 52 PIPE = auto() 53 PIPE_SLASH = auto() 54 DPIPE_SLASH = auto() 55 CARET = auto() 56 CARET_AT = auto() 57 TILDE = auto() 58 ARROW = auto() 59 DARROW = auto() 60 FARROW = auto() 61 HASH = auto() 62 HASH_ARROW = auto() 63 DHASH_ARROW = auto() 64 LR_ARROW = auto() 65 DAT = auto() 66 LT_AT = auto() 67 AT_GT = auto() 68 DOLLAR = auto() 69 PARAMETER = auto() 70 SESSION = auto() 71 SESSION_PARAMETER = auto() 72 SESSION_USER = auto() 73 DAMP = auto() 74 AMP_LT = auto() 75 AMP_GT = auto() 76 ADJACENT = auto() 77 XOR = auto() 78 DSTAR = auto() 79 QMARK_AMP = auto() 80 QMARK_PIPE = auto() 81 HASH_DASH = auto() 82 EXCLAMATION = auto() 83 84 URI_START = auto() 85 86 BLOCK_START = auto() 87 BLOCK_END = auto() 88 89 SPACE = auto() 90 BREAK = auto() 91 92 STRING = auto() 93 NUMBER = auto() 94 IDENTIFIER = auto() 95 DATABASE = auto() 96 COLUMN = auto() 97 COLUMN_DEF = auto() 98 SCHEMA = auto() 99 TABLE = auto() 100 
WAREHOUSE = auto() 101 STAGE = auto() 102 STREAM = auto() 103 STREAMLIT = auto() 104 VAR = auto() 105 BIT_STRING = auto() 106 HEX_STRING = auto() 107 BYTE_STRING = auto() 108 NATIONAL_STRING = auto() 109 RAW_STRING = auto() 110 HEREDOC_STRING = auto() 111 UNICODE_STRING = auto() 112 113 # types 114 BIT = auto() 115 BOOLEAN = auto() 116 TINYINT = auto() 117 UTINYINT = auto() 118 SMALLINT = auto() 119 USMALLINT = auto() 120 MEDIUMINT = auto() 121 UMEDIUMINT = auto() 122 INT = auto() 123 UINT = auto() 124 BIGINT = auto() 125 UBIGINT = auto() 126 BIGNUM = auto() 127 INT128 = auto() 128 UINT128 = auto() 129 INT256 = auto() 130 UINT256 = auto() 131 FLOAT = auto() 132 DOUBLE = auto() 133 UDOUBLE = auto() 134 DECIMAL = auto() 135 DECIMAL32 = auto() 136 DECIMAL64 = auto() 137 DECIMAL128 = auto() 138 DECIMAL256 = auto() 139 DECFLOAT = auto() 140 UDECIMAL = auto() 141 BIGDECIMAL = auto() 142 CHAR = auto() 143 NCHAR = auto() 144 VARCHAR = auto() 145 NVARCHAR = auto() 146 BPCHAR = auto() 147 TEXT = auto() 148 MEDIUMTEXT = auto() 149 LONGTEXT = auto() 150 BLOB = auto() 151 MEDIUMBLOB = auto() 152 LONGBLOB = auto() 153 TINYBLOB = auto() 154 TINYTEXT = auto() 155 NAME = auto() 156 BINARY = auto() 157 VARBINARY = auto() 158 JSON = auto() 159 JSONB = auto() 160 TIME = auto() 161 TIMETZ = auto() 162 TIME_NS = auto() 163 TIMESTAMP = auto() 164 TIMESTAMPTZ = auto() 165 TIMESTAMPLTZ = auto() 166 TIMESTAMPNTZ = auto() 167 TIMESTAMP_S = auto() 168 TIMESTAMP_MS = auto() 169 TIMESTAMP_NS = auto() 170 DATETIME = auto() 171 DATETIME2 = auto() 172 DATETIME64 = auto() 173 SMALLDATETIME = auto() 174 DATE = auto() 175 DATE32 = auto() 176 INT4RANGE = auto() 177 INT4MULTIRANGE = auto() 178 INT8RANGE = auto() 179 INT8MULTIRANGE = auto() 180 NUMRANGE = auto() 181 NUMMULTIRANGE = auto() 182 TSRANGE = auto() 183 TSMULTIRANGE = auto() 184 TSTZRANGE = auto() 185 TSTZMULTIRANGE = auto() 186 DATERANGE = auto() 187 DATEMULTIRANGE = auto() 188 UUID = auto() 189 GEOGRAPHY = auto() 190 GEOGRAPHYPOINT = auto() 
191 NULLABLE = auto() 192 GEOMETRY = auto() 193 POINT = auto() 194 RING = auto() 195 LINESTRING = auto() 196 LOCALTIME = auto() 197 LOCALTIMESTAMP = auto() 198 SYSTIMESTAMP = auto() 199 MULTILINESTRING = auto() 200 POLYGON = auto() 201 MULTIPOLYGON = auto() 202 HLLSKETCH = auto() 203 HSTORE = auto() 204 SUPER = auto() 205 SERIAL = auto() 206 SMALLSERIAL = auto() 207 BIGSERIAL = auto() 208 XML = auto() 209 YEAR = auto() 210 USERDEFINED = auto() 211 MONEY = auto() 212 SMALLMONEY = auto() 213 ROWVERSION = auto() 214 IMAGE = auto() 215 VARIANT = auto() 216 OBJECT = auto() 217 INET = auto() 218 IPADDRESS = auto() 219 IPPREFIX = auto() 220 IPV4 = auto() 221 IPV6 = auto() 222 ENUM = auto() 223 ENUM8 = auto() 224 ENUM16 = auto() 225 FIXEDSTRING = auto() 226 LOWCARDINALITY = auto() 227 NESTED = auto() 228 AGGREGATEFUNCTION = auto() 229 SIMPLEAGGREGATEFUNCTION = auto() 230 TDIGEST = auto() 231 UNKNOWN = auto() 232 VECTOR = auto() 233 DYNAMIC = auto() 234 VOID = auto() 235 236 # keywords 237 ALIAS = auto() 238 ALTER = auto() 239 ALL = auto() 240 ANTI = auto() 241 ANY = auto() 242 APPLY = auto() 243 ARRAY = auto() 244 ASC = auto() 245 ASOF = auto() 246 ATTACH = auto() 247 AUTO_INCREMENT = auto() 248 BEGIN = auto() 249 BETWEEN = auto() 250 BULK_COLLECT_INTO = auto() 251 CACHE = auto() 252 CASE = auto() 253 CHARACTER_SET = auto() 254 CLUSTER_BY = auto() 255 COLLATE = auto() 256 COMMAND = auto() 257 COMMENT = auto() 258 COMMIT = auto() 259 CONNECT_BY = auto() 260 CONSTRAINT = auto() 261 COPY = auto() 262 CREATE = auto() 263 CROSS = auto() 264 CUBE = auto() 265 CURRENT_DATE = auto() 266 CURRENT_DATETIME = auto() 267 CURRENT_SCHEMA = auto() 268 CURRENT_TIME = auto() 269 CURRENT_TIMESTAMP = auto() 270 CURRENT_USER = auto() 271 CURRENT_ROLE = auto() 272 CURRENT_CATALOG = auto() 273 DECLARE = auto() 274 DEFAULT = auto() 275 DELETE = auto() 276 DESC = auto() 277 DESCRIBE = auto() 278 DETACH = auto() 279 DICTIONARY = auto() 280 DISTINCT = auto() 281 DISTRIBUTE_BY = auto() 282 DIV = 
auto() 283 DROP = auto() 284 ELSE = auto() 285 END = auto() 286 ESCAPE = auto() 287 EXCEPT = auto() 288 EXECUTE = auto() 289 EXISTS = auto() 290 FALSE = auto() 291 FETCH = auto() 292 FILE = auto() 293 FILE_FORMAT = auto() 294 FILTER = auto() 295 FINAL = auto() 296 FIRST = auto() 297 FOR = auto() 298 FORCE = auto() 299 FOREIGN_KEY = auto() 300 FORMAT = auto() 301 FROM = auto() 302 FULL = auto() 303 FUNCTION = auto() 304 GET = auto() 305 GLOB = auto() 306 GLOBAL = auto() 307 GRANT = auto() 308 GROUP_BY = auto() 309 GROUPING_SETS = auto() 310 HAVING = auto() 311 HINT = auto() 312 IGNORE = auto() 313 ILIKE = auto() 314 IN = auto() 315 INDEX = auto() 316 INDEXED_BY = auto() 317 INNER = auto() 318 INSERT = auto() 319 INSTALL = auto() 320 INTEGRATION = auto() 321 INTERSECT = auto() 322 INTERVAL = auto() 323 INTO = auto() 324 INTRODUCER = auto() 325 IRLIKE = auto() 326 IS = auto() 327 ISNULL = auto() 328 JOIN = auto() 329 JOIN_MARKER = auto() 330 KEEP = auto() 331 KEY = auto() 332 KILL = auto() 333 LANGUAGE = auto() 334 LATERAL = auto() 335 LEFT = auto() 336 LIKE = auto() 337 LIMIT = auto() 338 LIST = auto() 339 LOAD = auto() 340 LOCK = auto() 341 MAP = auto() 342 MATCH = auto() 343 MATCH_CONDITION = auto() 344 MATCH_RECOGNIZE = auto() 345 MEMBER_OF = auto() 346 MERGE = auto() 347 MOD = auto() 348 MODEL = auto() 349 NATURAL = auto() 350 NEXT = auto() 351 NOTHING = auto() 352 NOTNULL = auto() 353 NULL = auto() 354 OBJECT_IDENTIFIER = auto() 355 OFFSET = auto() 356 ON = auto() 357 ONLY = auto() 358 OPERATOR = auto() 359 ORDER_BY = auto() 360 ORDER_SIBLINGS_BY = auto() 361 ORDERED = auto() 362 ORDINALITY = auto() 363 OUT = auto() 364 INOUT = auto() 365 OUTER = auto() 366 OVER = auto() 367 OVERLAPS = auto() 368 OVERWRITE = auto() 369 PACKAGE = auto() 370 PARTITION = auto() 371 PARTITION_BY = auto() 372 PERCENT = auto() 373 PIVOT = auto() 374 PLACEHOLDER = auto() 375 POLICY = auto() 376 POOL = auto() 377 POSITIONAL = auto() 378 PRAGMA = auto() 379 PREWHERE = auto() 380 
PRIMARY_KEY = auto() 381 PROCEDURE = auto() 382 PROPERTIES = auto() 383 PSEUDO_TYPE = auto() 384 PUT = auto() 385 QUALIFY = auto() 386 QUOTE = auto() 387 QDCOLON = auto() 388 RANGE = auto() 389 RECURSIVE = auto() 390 REFRESH = auto() 391 RENAME = auto() 392 REPLACE = auto() 393 RETURNING = auto() 394 REVOKE = auto() 395 REFERENCES = auto() 396 RIGHT = auto() 397 RLIKE = auto() 398 ROLE = auto() 399 ROLLBACK = auto() 400 ROLLUP = auto() 401 ROW = auto() 402 ROWS = auto() 403 RULE = auto() 404 SELECT = auto() 405 SEMI = auto() 406 SEPARATOR = auto() 407 SEQUENCE = auto() 408 SERDE_PROPERTIES = auto() 409 SET = auto() 410 SETTINGS = auto() 411 SHOW = auto() 412 SIMILAR_TO = auto() 413 SOME = auto() 414 SORT_BY = auto() 415 SOUNDS_LIKE = auto() 416 SQL_SECURITY = auto() 417 START_WITH = auto() 418 STORAGE_INTEGRATION = auto() 419 STRAIGHT_JOIN = auto() 420 STRUCT = auto() 421 SUMMARIZE = auto() 422 TABLE_SAMPLE = auto() 423 TAG = auto() 424 TEMPORARY = auto() 425 TOP = auto() 426 THEN = auto() 427 TRUE = auto() 428 TRUNCATE = auto() 429 TRIGGER = auto() 430 UNCACHE = auto() 431 UNION = auto() 432 UNNEST = auto() 433 UNPIVOT = auto() 434 UPDATE = auto() 435 USE = auto() 436 USING = auto() 437 VALUES = auto() 438 VARIADIC = auto() 439 VIEW = auto() 440 SEMANTIC_VIEW = auto() 441 VOLATILE = auto() 442 VOLUME = auto() 443 WHEN = auto() 444 WHERE = auto() 445 WINDOW = auto() 446 WITH = auto() 447 UNIQUE = auto() 448 UTC_DATE = auto() 449 UTC_TIME = auto() 450 UTC_TIMESTAMP = auto() 451 VERSION_SNAPSHOT = auto() 452 TIMESTAMP_SNAPSHOT = auto() 453 OPTION = auto() 454 SINK = auto() 455 SOURCE = auto() 456 ANALYZE = auto() 457 NAMESPACE = auto() 458 EXPORT = auto() 459 460 # sentinels 461 HIVE_TOKEN_STREAM = auto() 462 SENTINEL = auto() 463 464 def __str__(self) -> str: 465 return f"TokenType.{self.name}" 466 467 468class Token: 469 # mypyc doesn't expose slots 470 _attrs: t.ClassVar[tuple[str, ...]] = ( 471 "token_type", 472 "text", 473 "line", 474 "col", 475 "start", 476 
"end", 477 "comments", 478 ) 479 __slots__ = _attrs 480 481 @classmethod 482 def number(cls, number: int) -> Token: 483 """Returns a NUMBER token with `number` as its text.""" 484 return cls(TokenType.NUMBER, str(number)) 485 486 @classmethod 487 def string(cls, string: str) -> Token: 488 """Returns a STRING token with `string` as its text.""" 489 return cls(TokenType.STRING, string) 490 491 @classmethod 492 def identifier(cls, identifier: str) -> Token: 493 """Returns an IDENTIFIER token with `identifier` as its text.""" 494 return cls(TokenType.IDENTIFIER, identifier) 495 496 @classmethod 497 def var(cls, var: str) -> Token: 498 """Returns an VAR token with `var` as its text.""" 499 return cls(TokenType.VAR, var) 500 501 def __init__( 502 self, 503 token_type: TokenType, 504 text: str, 505 line: int = 1, 506 col: int = 1, 507 start: int = 0, 508 end: int = 0, 509 comments: list[str] | None = None, 510 ) -> None: 511 self.token_type = token_type 512 self.text = text 513 self.line = line 514 self.col = col 515 self.start = start 516 self.end = end 517 self.comments = [] if comments is None else comments 518 519 def __bool__(self) -> bool: 520 return self.token_type != TokenType.SENTINEL 521 522 def __repr__(self) -> str: 523 attributes = ", ".join( 524 f"{k}: TokenType.{self.token_type.name}" 525 if k == "token_type" 526 else f"{k}: {getattr(self, k)}" 527 for k in self._attrs 528 ) 529 return f"<Token {attributes}>" 530 531 532class TokenizerCore: 533 __slots__ = ( 534 "sql", 535 "size", 536 "tokens", 537 "_start", 538 "_current", 539 "_line", 540 "_col", 541 "_comments", 542 "_char", 543 "_end", 544 "_peek", 545 "_prev_token_line", 546 "single_tokens", 547 "keywords", 548 "quotes", 549 "format_strings", 550 "identifiers", 551 "comments", 552 "string_escapes", 553 "byte_string_escapes", 554 "identifier_escapes", 555 "escape_follow_chars", 556 "commands", 557 "command_prefix_tokens", 558 "nested_comments", 559 "hint_start", 560 "tokens_preceding_hint", 561 
"bit_strings", 562 "hex_strings", 563 "numeric_literals", 564 "var_single_tokens", 565 "string_escapes_allowed_in_raw_strings", 566 "heredoc_tag_is_identifier", 567 "heredoc_string_alternative", 568 "keyword_trie", 569 "numbers_can_be_underscore_separated", 570 "numbers_can_have_decimals", 571 "identifiers_can_start_with_digit", 572 "unescaped_sequences", 573 ) 574 575 def __init__( 576 self, 577 single_tokens: dict[str, TokenType], 578 keywords: dict[str, TokenType], 579 quotes: dict[str, str], 580 format_strings: dict[str, tuple[str, TokenType]], 581 identifiers: dict[str, str], 582 comments: dict[str, str | None], 583 string_escapes: set[str], 584 byte_string_escapes: set[str], 585 identifier_escapes: set[str], 586 escape_follow_chars: set[str], 587 commands: set[TokenType], 588 command_prefix_tokens: set[TokenType], 589 nested_comments: bool, 590 hint_start: str, 591 tokens_preceding_hint: set[TokenType], 592 bit_strings: list[str | tuple[str, str]], 593 hex_strings: list[str | tuple[str, str]], 594 numeric_literals: dict[str, str], 595 var_single_tokens: set[str], 596 string_escapes_allowed_in_raw_strings: bool, 597 heredoc_tag_is_identifier: bool, 598 heredoc_string_alternative: TokenType, 599 keyword_trie: dict, 600 numbers_can_be_underscore_separated: bool, 601 numbers_can_have_decimals: bool, 602 identifiers_can_start_with_digit: bool, 603 unescaped_sequences: dict[str, str], 604 ) -> None: 605 self.single_tokens = single_tokens 606 self.keywords = keywords 607 self.quotes = quotes 608 self.format_strings = format_strings 609 self.identifiers = identifiers 610 self.comments = comments 611 self.string_escapes = string_escapes 612 self.byte_string_escapes = byte_string_escapes 613 self.identifier_escapes = identifier_escapes 614 self.escape_follow_chars = escape_follow_chars 615 self.commands = commands 616 self.command_prefix_tokens = command_prefix_tokens 617 self.nested_comments = nested_comments 618 self.hint_start = hint_start 619 
self.tokens_preceding_hint = tokens_preceding_hint 620 self.bit_strings = bit_strings 621 self.hex_strings = hex_strings 622 self.numeric_literals = numeric_literals 623 self.var_single_tokens = var_single_tokens 624 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 625 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 626 self.heredoc_string_alternative = heredoc_string_alternative 627 self.keyword_trie = keyword_trie 628 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 629 self.numbers_can_have_decimals = numbers_can_have_decimals 630 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 631 self.unescaped_sequences = unescaped_sequences 632 self.sql = "" 633 self.size = 0 634 self.tokens: list[Token] = [] 635 self._start = 0 636 self._current = 0 637 self._line = 1 638 self._col = 0 639 self._comments: list[str] = [] 640 self._char = "" 641 self._end = False 642 self._peek = "" 643 self._prev_token_line = -1 644 645 def reset(self) -> None: 646 self.sql = "" 647 self.size = 0 648 self.tokens = [] 649 self._start = 0 650 self._current = 0 651 self._line = 1 652 self._col = 0 653 self._comments = [] 654 self._char = "" 655 self._end = False 656 self._peek = "" 657 self._prev_token_line = -1 658 659 def tokenize(self, sql: str) -> list[Token]: 660 """Returns a list of tokens corresponding to the SQL string `sql`.""" 661 self.reset() 662 self.sql = sql 663 self.size = len(sql) 664 665 try: 666 self._scan() 667 except Exception as e: 668 start = max(self._current - 50, 0) 669 end = min(self._current + 50, self.size - 1) 670 context = self.sql[start:end] 671 raise TokenError(f"Error tokenizing '{context}'") from e 672 673 return self.tokens 674 675 def _scan(self, check_semicolon: bool = False) -> None: 676 identifiers = self.identifiers 677 digit_chars = _DIGIT_CHARS 678 679 while self.size and not self._end: 680 current = self._current 681 682 # Skip spaces here rather than 
iteratively calling advance() for performance reasons 683 while current < self.size: 684 char = self.sql[current] 685 686 if char == " " or char == "\t": 687 current += 1 688 else: 689 break 690 691 offset = current - self._current if current > self._current else 1 692 693 self._start = current 694 self._advance(offset) 695 696 if not self._char.isspace(): 697 if self._char in digit_chars: 698 self._scan_number() 699 elif self._char in identifiers: 700 self._scan_identifier(identifiers[self._char]) 701 else: 702 self._scan_keywords() 703 704 if check_semicolon and self._peek == ";": 705 break 706 707 if self.tokens and self._comments: 708 self.tokens[-1].comments.extend(self._comments) 709 710 def _chars(self, size: int) -> str: 711 if size == 1: 712 return self._char 713 714 start = self._current - 1 715 end = start + size 716 717 return self.sql[start:end] if end <= self.size else "" 718 719 def _advance(self, i: int = 1, alnum: bool = False) -> None: 720 char = self._char 721 722 if char == "\n" or char == "\r": 723 # Ensures we don't count an extra line if we get a \r\n line break sequence 724 if not (char == "\r" and self._peek == "\n"): 725 self._col = i 726 self._line += 1 727 else: 728 self._col += i 729 730 self._current += i 731 sql = self.sql 732 size = self.size 733 self._end = self._current >= size 734 self._char = sql[self._current - 1] 735 self._peek = "" if self._end else sql[self._current] 736 737 if alnum and self._char.isalnum(): 738 # Cache to local variables instead of attributes for better performance 739 _col = self._col 740 _current = self._current 741 _end = self._end 742 _peek = self._peek 743 744 while _peek.isalnum(): 745 _col += 1 746 _current += 1 747 _end = _current >= size 748 _peek = "" if _end else sql[_current] 749 750 self._col = _col 751 self._current = _current 752 self._end = _end 753 self._peek = _peek 754 self._char = sql[_current - 1] 755 756 @property 757 def _text(self) -> str: 758 return self.sql[self._start : 
self._current]  # NOTE(review): tail of a definition truncated by this view — left as-is

    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Append a Token of `token_type` covering sql[self._start:self._current]
        (or the explicit `text`), attaching any pending comments to it."""
        self._prev_token_line = self._line

        # A comment right before a semicolon belongs to the preceding statement's last token.
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,  # inclusive end offset of the token's last character
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            # Discard the tokens produced by the throwaway scan; keep only the raw text.
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        """Greedily match the longest keyword at the cursor via the keyword trie,
        falling back to single-char tokens and finally to `_scan_var`."""
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0
        word = None  # longest complete keyword matched so far (trie hit on sentinel key 0)
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    # Sentinel key 0 marks a complete keyword at this trie node.
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                # Collapse runs of whitespace to a single " " so multi-word
                # keywords like "GROUP  BY" still match the trie.
                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                break

        if word:
            # Keywords that open strings/comments are delegated to the dedicated scanners.
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Only accept the keyword if it ends on a clean boundary.
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()

    def _scan_comment(self, comment_start: str) -> bool:
        """Consume a comment that begins with `comment_start`.

        Returns True if a comment was consumed, False if `comment_start` is not
        a comment delimiter for this dialect.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self.comments[comment_start]  # None/"" for line comments

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # Strip the delimiters from the stored comment text.
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # Line comment: consume until end of line.
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        """Scan a numeric literal: integers, decimals, scientific notation,
        0b/0x prefixes, underscore separators, and trailing type suffixes."""
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0  # 0: no exponent yet, 1: "E" seen, 2: sign after "E" consumed
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""  # trailing suffix such as "L" in 123L
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Possible type suffix (e.g. 123L) — collect it and look it up.
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Not a known suffix: back up and end the number before it.
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)

    def _scan_bits(self) -> None:
        """Scan a 0b-prefixed binary literal, falling back to IDENTIFIER if invalid."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        """Scan a 0x-prefixed hex literal, falling back to IDENTIFIER if invalid."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        """Consume characters until whitespace or a single-char token and return the text."""
        single_tokens = self.single_tokens

        while True:
            char = self._peek.strip()
            if char and char not in single_tokens:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        """Scan a string literal opened by `start` (quote, format string, or heredoc).

        Returns True if a string token was produced, False if `start` does not
        open a string for this dialect.
        """
        base = None  # numeric base for hex/bit string validation, if any
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                # An invalid heredoc tag means this wasn't a heredoc at all:
                # rewind and emit the dialect's alternative token instead.
                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        """Scan a quoted identifier terminated by `identifier_end`."""
        self._advance()
        text = self._extract_string(
            identifier_end, escapes=self.identifier_escapes | {identifier_end}
        )
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        """Scan a bare word, emitting a keyword token if it matches one, else VAR."""
        var_single_tokens = self.var_single_tokens
        single_tokens = self.single_tokens

        while True:
            peek = self._peek
            if not peek or peek.isspace():
                break
            if peek not in var_single_tokens and peek in single_tokens:
                break
            self._advance(alnum=True)

        self._add(
            # A word right after a parameter marker is always a VAR, never a keyword.
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
        )

    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Consume characters up to `delimiter`, handling escape sequences.

        Returns the (unescaped) string contents. Raises TokenError on a missing
        delimiter unless `raise_unmatched` is False.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Fast path: jump the cursor straight past the delimiter,
                # updating line/col bookkeeping in bulk.
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        while True:
            # Translate dialect-specific escape pairs (e.g. \n) to their real characters.
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class
TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """All token types the tokenizer can emit.

    Member ORDER is significant: values come from auto(), so inserting or
    reordering members changes every subsequent value. Append new members
    within the appropriate section (operators, types, keywords, sentinels).
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        # Override IntEnum's default repr-like str so tokens print as "TokenType.NAME".
        return f"TokenType.{self.name}"
An enumeration of all token types the SQLGlot tokenizer can emit; member values are assigned sequentially via auto().
L_PAREN =
<TokenType.L_PAREN: 1>
R_PAREN =
<TokenType.R_PAREN: 2>
L_BRACKET =
<TokenType.L_BRACKET: 3>
R_BRACKET =
<TokenType.R_BRACKET: 4>
L_BRACE =
<TokenType.L_BRACE: 5>
R_BRACE =
<TokenType.R_BRACE: 6>
COMMA =
<TokenType.COMMA: 7>
DOT =
<TokenType.DOT: 8>
DASH =
<TokenType.DASH: 9>
PLUS =
<TokenType.PLUS: 10>
COLON =
<TokenType.COLON: 11>
DOTCOLON =
<TokenType.DOTCOLON: 12>
DOTCARET =
<TokenType.DOTCARET: 13>
DCOLON =
<TokenType.DCOLON: 14>
DCOLONDOLLAR =
<TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT =
<TokenType.DCOLONPERCENT: 16>
DCOLONQMARK =
<TokenType.DCOLONQMARK: 17>
DQMARK =
<TokenType.DQMARK: 18>
SEMICOLON =
<TokenType.SEMICOLON: 19>
STAR =
<TokenType.STAR: 20>
BACKSLASH =
<TokenType.BACKSLASH: 21>
SLASH =
<TokenType.SLASH: 22>
LT =
<TokenType.LT: 23>
LTE =
<TokenType.LTE: 24>
GT =
<TokenType.GT: 25>
GTE =
<TokenType.GTE: 26>
NOT =
<TokenType.NOT: 27>
EQ =
<TokenType.EQ: 28>
NEQ =
<TokenType.NEQ: 29>
NULLSAFE_EQ =
<TokenType.NULLSAFE_EQ: 30>
COLON_EQ =
<TokenType.COLON_EQ: 31>
COLON_GT =
<TokenType.COLON_GT: 32>
NCOLON_GT =
<TokenType.NCOLON_GT: 33>
AND =
<TokenType.AND: 34>
OR =
<TokenType.OR: 35>
AMP =
<TokenType.AMP: 36>
DPIPE =
<TokenType.DPIPE: 37>
PIPE_GT =
<TokenType.PIPE_GT: 38>
PIPE =
<TokenType.PIPE: 39>
PIPE_SLASH =
<TokenType.PIPE_SLASH: 40>
DPIPE_SLASH =
<TokenType.DPIPE_SLASH: 41>
CARET =
<TokenType.CARET: 42>
CARET_AT =
<TokenType.CARET_AT: 43>
TILDE =
<TokenType.TILDE: 44>
ARROW =
<TokenType.ARROW: 45>
DARROW =
<TokenType.DARROW: 46>
FARROW =
<TokenType.FARROW: 47>
HASH =
<TokenType.HASH: 48>
HASH_ARROW =
<TokenType.HASH_ARROW: 49>
DHASH_ARROW =
<TokenType.DHASH_ARROW: 50>
LR_ARROW =
<TokenType.LR_ARROW: 51>
DAT =
<TokenType.DAT: 52>
LT_AT =
<TokenType.LT_AT: 53>
AT_GT =
<TokenType.AT_GT: 54>
DOLLAR =
<TokenType.DOLLAR: 55>
PARAMETER =
<TokenType.PARAMETER: 56>
SESSION =
<TokenType.SESSION: 57>
SESSION_PARAMETER =
<TokenType.SESSION_PARAMETER: 58>
SESSION_USER =
<TokenType.SESSION_USER: 59>
DAMP =
<TokenType.DAMP: 60>
AMP_LT =
<TokenType.AMP_LT: 61>
AMP_GT =
<TokenType.AMP_GT: 62>
ADJACENT =
<TokenType.ADJACENT: 63>
XOR =
<TokenType.XOR: 64>
DSTAR =
<TokenType.DSTAR: 65>
QMARK_AMP =
<TokenType.QMARK_AMP: 66>
QMARK_PIPE =
<TokenType.QMARK_PIPE: 67>
HASH_DASH =
<TokenType.HASH_DASH: 68>
EXCLAMATION =
<TokenType.EXCLAMATION: 69>
URI_START =
<TokenType.URI_START: 70>
BLOCK_START =
<TokenType.BLOCK_START: 71>
BLOCK_END =
<TokenType.BLOCK_END: 72>
SPACE =
<TokenType.SPACE: 73>
BREAK =
<TokenType.BREAK: 74>
STRING =
<TokenType.STRING: 75>
NUMBER =
<TokenType.NUMBER: 76>
IDENTIFIER =
<TokenType.IDENTIFIER: 77>
DATABASE =
<TokenType.DATABASE: 78>
COLUMN =
<TokenType.COLUMN: 79>
COLUMN_DEF =
<TokenType.COLUMN_DEF: 80>
SCHEMA =
<TokenType.SCHEMA: 81>
TABLE =
<TokenType.TABLE: 82>
WAREHOUSE =
<TokenType.WAREHOUSE: 83>
STAGE =
<TokenType.STAGE: 84>
STREAM =
<TokenType.STREAM: 85>
STREAMLIT =
<TokenType.STREAMLIT: 86>
VAR =
<TokenType.VAR: 87>
BIT_STRING =
<TokenType.BIT_STRING: 88>
HEX_STRING =
<TokenType.HEX_STRING: 89>
BYTE_STRING =
<TokenType.BYTE_STRING: 90>
NATIONAL_STRING =
<TokenType.NATIONAL_STRING: 91>
RAW_STRING =
<TokenType.RAW_STRING: 92>
HEREDOC_STRING =
<TokenType.HEREDOC_STRING: 93>
UNICODE_STRING =
<TokenType.UNICODE_STRING: 94>
BIT =
<TokenType.BIT: 95>
BOOLEAN =
<TokenType.BOOLEAN: 96>
TINYINT =
<TokenType.TINYINT: 97>
UTINYINT =
<TokenType.UTINYINT: 98>
SMALLINT =
<TokenType.SMALLINT: 99>
USMALLINT =
<TokenType.USMALLINT: 100>
MEDIUMINT =
<TokenType.MEDIUMINT: 101>
UMEDIUMINT =
<TokenType.UMEDIUMINT: 102>
INT =
<TokenType.INT: 103>
UINT =
<TokenType.UINT: 104>
BIGINT =
<TokenType.BIGINT: 105>
UBIGINT =
<TokenType.UBIGINT: 106>
BIGNUM =
<TokenType.BIGNUM: 107>
INT128 =
<TokenType.INT128: 108>
UINT128 =
<TokenType.UINT128: 109>
INT256 =
<TokenType.INT256: 110>
UINT256 =
<TokenType.UINT256: 111>
FLOAT =
<TokenType.FLOAT: 112>
DOUBLE =
<TokenType.DOUBLE: 113>
UDOUBLE =
<TokenType.UDOUBLE: 114>
DECIMAL =
<TokenType.DECIMAL: 115>
DECIMAL32 =
<TokenType.DECIMAL32: 116>
DECIMAL64 =
<TokenType.DECIMAL64: 117>
DECIMAL128 =
<TokenType.DECIMAL128: 118>
DECIMAL256 =
<TokenType.DECIMAL256: 119>
DECFLOAT =
<TokenType.DECFLOAT: 120>
UDECIMAL =
<TokenType.UDECIMAL: 121>
BIGDECIMAL =
<TokenType.BIGDECIMAL: 122>
CHAR =
<TokenType.CHAR: 123>
NCHAR =
<TokenType.NCHAR: 124>
VARCHAR =
<TokenType.VARCHAR: 125>
NVARCHAR =
<TokenType.NVARCHAR: 126>
BPCHAR =
<TokenType.BPCHAR: 127>
TEXT =
<TokenType.TEXT: 128>
MEDIUMTEXT =
<TokenType.MEDIUMTEXT: 129>
LONGTEXT =
<TokenType.LONGTEXT: 130>
BLOB =
<TokenType.BLOB: 131>
MEDIUMBLOB =
<TokenType.MEDIUMBLOB: 132>
LONGBLOB =
<TokenType.LONGBLOB: 133>
TINYBLOB =
<TokenType.TINYBLOB: 134>
TINYTEXT =
<TokenType.TINYTEXT: 135>
NAME =
<TokenType.NAME: 136>
BINARY =
<TokenType.BINARY: 137>
VARBINARY =
<TokenType.VARBINARY: 138>
JSON =
<TokenType.JSON: 139>
JSONB =
<TokenType.JSONB: 140>
TIME =
<TokenType.TIME: 141>
TIMETZ =
<TokenType.TIMETZ: 142>
TIME_NS =
<TokenType.TIME_NS: 143>
TIMESTAMP =
<TokenType.TIMESTAMP: 144>
TIMESTAMPTZ =
<TokenType.TIMESTAMPTZ: 145>
TIMESTAMPLTZ =
<TokenType.TIMESTAMPLTZ: 146>
TIMESTAMPNTZ =
<TokenType.TIMESTAMPNTZ: 147>
TIMESTAMP_S =
<TokenType.TIMESTAMP_S: 148>
TIMESTAMP_MS =
<TokenType.TIMESTAMP_MS: 149>
TIMESTAMP_NS =
<TokenType.TIMESTAMP_NS: 150>
DATETIME =
<TokenType.DATETIME: 151>
DATETIME2 =
<TokenType.DATETIME2: 152>
DATETIME64 =
<TokenType.DATETIME64: 153>
SMALLDATETIME =
<TokenType.SMALLDATETIME: 154>
DATE =
<TokenType.DATE: 155>
DATE32 =
<TokenType.DATE32: 156>
INT4RANGE =
<TokenType.INT4RANGE: 157>
INT4MULTIRANGE =
<TokenType.INT4MULTIRANGE: 158>
INT8RANGE =
<TokenType.INT8RANGE: 159>
INT8MULTIRANGE =
<TokenType.INT8MULTIRANGE: 160>
NUMRANGE =
<TokenType.NUMRANGE: 161>
NUMMULTIRANGE =
<TokenType.NUMMULTIRANGE: 162>
TSRANGE =
<TokenType.TSRANGE: 163>
TSMULTIRANGE =
<TokenType.TSMULTIRANGE: 164>
TSTZRANGE =
<TokenType.TSTZRANGE: 165>
TSTZMULTIRANGE =
<TokenType.TSTZMULTIRANGE: 166>
DATERANGE =
<TokenType.DATERANGE: 167>
DATEMULTIRANGE =
<TokenType.DATEMULTIRANGE: 168>
UUID =
<TokenType.UUID: 169>
GEOGRAPHY =
<TokenType.GEOGRAPHY: 170>
GEOGRAPHYPOINT =
<TokenType.GEOGRAPHYPOINT: 171>
NULLABLE =
<TokenType.NULLABLE: 172>
GEOMETRY =
<TokenType.GEOMETRY: 173>
POINT =
<TokenType.POINT: 174>
RING =
<TokenType.RING: 175>
LINESTRING =
<TokenType.LINESTRING: 176>
LOCALTIME =
<TokenType.LOCALTIME: 177>
LOCALTIMESTAMP =
<TokenType.LOCALTIMESTAMP: 178>
SYSTIMESTAMP =
<TokenType.SYSTIMESTAMP: 179>
MULTILINESTRING =
<TokenType.MULTILINESTRING: 180>
POLYGON =
<TokenType.POLYGON: 181>
MULTIPOLYGON =
<TokenType.MULTIPOLYGON: 182>
HLLSKETCH =
<TokenType.HLLSKETCH: 183>
HSTORE =
<TokenType.HSTORE: 184>
SUPER =
<TokenType.SUPER: 185>
SERIAL =
<TokenType.SERIAL: 186>
SMALLSERIAL =
<TokenType.SMALLSERIAL: 187>
BIGSERIAL =
<TokenType.BIGSERIAL: 188>
XML =
<TokenType.XML: 189>
YEAR =
<TokenType.YEAR: 190>
USERDEFINED =
<TokenType.USERDEFINED: 191>
MONEY =
<TokenType.MONEY: 192>
SMALLMONEY =
<TokenType.SMALLMONEY: 193>
ROWVERSION =
<TokenType.ROWVERSION: 194>
IMAGE =
<TokenType.IMAGE: 195>
VARIANT =
<TokenType.VARIANT: 196>
OBJECT =
<TokenType.OBJECT: 197>
INET =
<TokenType.INET: 198>
IPADDRESS =
<TokenType.IPADDRESS: 199>
IPPREFIX =
<TokenType.IPPREFIX: 200>
IPV4 =
<TokenType.IPV4: 201>
IPV6 =
<TokenType.IPV6: 202>
ENUM =
<TokenType.ENUM: 203>
ENUM8 =
<TokenType.ENUM8: 204>
ENUM16 =
<TokenType.ENUM16: 205>
FIXEDSTRING =
<TokenType.FIXEDSTRING: 206>
LOWCARDINALITY =
<TokenType.LOWCARDINALITY: 207>
NESTED =
<TokenType.NESTED: 208>
AGGREGATEFUNCTION =
<TokenType.AGGREGATEFUNCTION: 209>
SIMPLEAGGREGATEFUNCTION =
<TokenType.SIMPLEAGGREGATEFUNCTION: 210>
TDIGEST =
<TokenType.TDIGEST: 211>
UNKNOWN =
<TokenType.UNKNOWN: 212>
VECTOR =
<TokenType.VECTOR: 213>
DYNAMIC =
<TokenType.DYNAMIC: 214>
VOID =
<TokenType.VOID: 215>
ALIAS =
<TokenType.ALIAS: 216>
ALTER =
<TokenType.ALTER: 217>
ALL =
<TokenType.ALL: 218>
ANTI =
<TokenType.ANTI: 219>
ANY =
<TokenType.ANY: 220>
APPLY =
<TokenType.APPLY: 221>
ARRAY =
<TokenType.ARRAY: 222>
ASC =
<TokenType.ASC: 223>
ASOF =
<TokenType.ASOF: 224>
ATTACH =
<TokenType.ATTACH: 225>
AUTO_INCREMENT =
<TokenType.AUTO_INCREMENT: 226>
BEGIN =
<TokenType.BEGIN: 227>
BETWEEN =
<TokenType.BETWEEN: 228>
BULK_COLLECT_INTO =
<TokenType.BULK_COLLECT_INTO: 229>
CACHE =
<TokenType.CACHE: 230>
CASE =
<TokenType.CASE: 231>
CHARACTER_SET =
<TokenType.CHARACTER_SET: 232>
CLUSTER_BY =
<TokenType.CLUSTER_BY: 233>
COLLATE =
<TokenType.COLLATE: 234>
COMMAND =
<TokenType.COMMAND: 235>
COMMENT =
<TokenType.COMMENT: 236>
COMMIT =
<TokenType.COMMIT: 237>
CONNECT_BY =
<TokenType.CONNECT_BY: 238>
CONSTRAINT =
<TokenType.CONSTRAINT: 239>
COPY =
<TokenType.COPY: 240>
CREATE =
<TokenType.CREATE: 241>
CROSS =
<TokenType.CROSS: 242>
CUBE =
<TokenType.CUBE: 243>
CURRENT_DATE =
<TokenType.CURRENT_DATE: 244>
CURRENT_DATETIME =
<TokenType.CURRENT_DATETIME: 245>
CURRENT_SCHEMA =
<TokenType.CURRENT_SCHEMA: 246>
CURRENT_TIME =
<TokenType.CURRENT_TIME: 247>
CURRENT_TIMESTAMP =
<TokenType.CURRENT_TIMESTAMP: 248>
CURRENT_USER =
<TokenType.CURRENT_USER: 249>
CURRENT_ROLE =
<TokenType.CURRENT_ROLE: 250>
CURRENT_CATALOG =
<TokenType.CURRENT_CATALOG: 251>
DECLARE =
<TokenType.DECLARE: 252>
DEFAULT =
<TokenType.DEFAULT: 253>
DELETE =
<TokenType.DELETE: 254>
DESC =
<TokenType.DESC: 255>
DESCRIBE =
<TokenType.DESCRIBE: 256>
DETACH =
<TokenType.DETACH: 257>
DICTIONARY =
<TokenType.DICTIONARY: 258>
DISTINCT =
<TokenType.DISTINCT: 259>
DISTRIBUTE_BY =
<TokenType.DISTRIBUTE_BY: 260>
DIV =
<TokenType.DIV: 261>
DROP =
<TokenType.DROP: 262>
ELSE =
<TokenType.ELSE: 263>
END =
<TokenType.END: 264>
ESCAPE =
<TokenType.ESCAPE: 265>
EXCEPT =
<TokenType.EXCEPT: 266>
EXECUTE =
<TokenType.EXECUTE: 267>
EXISTS =
<TokenType.EXISTS: 268>
FALSE =
<TokenType.FALSE: 269>
FETCH =
<TokenType.FETCH: 270>
FILE =
<TokenType.FILE: 271>
FILE_FORMAT =
<TokenType.FILE_FORMAT: 272>
FILTER =
<TokenType.FILTER: 273>
FINAL =
<TokenType.FINAL: 274>
FIRST =
<TokenType.FIRST: 275>
FOR =
<TokenType.FOR: 276>
FORCE =
<TokenType.FORCE: 277>
FOREIGN_KEY =
<TokenType.FOREIGN_KEY: 278>
FORMAT =
<TokenType.FORMAT: 279>
FROM =
<TokenType.FROM: 280>
FULL =
<TokenType.FULL: 281>
FUNCTION =
<TokenType.FUNCTION: 282>
GET =
<TokenType.GET: 283>
GLOB =
<TokenType.GLOB: 284>
GLOBAL =
<TokenType.GLOBAL: 285>
GRANT =
<TokenType.GRANT: 286>
GROUP_BY =
<TokenType.GROUP_BY: 287>
GROUPING_SETS =
<TokenType.GROUPING_SETS: 288>
HAVING =
<TokenType.HAVING: 289>
HINT =
<TokenType.HINT: 290>
IGNORE =
<TokenType.IGNORE: 291>
ILIKE =
<TokenType.ILIKE: 292>
IN =
<TokenType.IN: 293>
INDEX =
<TokenType.INDEX: 294>
INDEXED_BY =
<TokenType.INDEXED_BY: 295>
INNER =
<TokenType.INNER: 296>
INSERT =
<TokenType.INSERT: 297>
INSTALL =
<TokenType.INSTALL: 298>
INTEGRATION =
<TokenType.INTEGRATION: 299>
INTERSECT =
<TokenType.INTERSECT: 300>
INTERVAL =
<TokenType.INTERVAL: 301>
INTO =
<TokenType.INTO: 302>
INTRODUCER =
<TokenType.INTRODUCER: 303>
IRLIKE =
<TokenType.IRLIKE: 304>
IS =
<TokenType.IS: 305>
ISNULL =
<TokenType.ISNULL: 306>
JOIN =
<TokenType.JOIN: 307>
JOIN_MARKER =
<TokenType.JOIN_MARKER: 308>
KEEP =
<TokenType.KEEP: 309>
KEY =
<TokenType.KEY: 310>
KILL =
<TokenType.KILL: 311>
LANGUAGE =
<TokenType.LANGUAGE: 312>
LATERAL =
<TokenType.LATERAL: 313>
LEFT =
<TokenType.LEFT: 314>
LIKE =
<TokenType.LIKE: 315>
LIMIT =
<TokenType.LIMIT: 316>
LIST =
<TokenType.LIST: 317>
LOAD =
<TokenType.LOAD: 318>
LOCK =
<TokenType.LOCK: 319>
MAP =
<TokenType.MAP: 320>
MATCH =
<TokenType.MATCH: 321>
MATCH_CONDITION =
<TokenType.MATCH_CONDITION: 322>
MATCH_RECOGNIZE =
<TokenType.MATCH_RECOGNIZE: 323>
MEMBER_OF =
<TokenType.MEMBER_OF: 324>
MERGE =
<TokenType.MERGE: 325>
MOD =
<TokenType.MOD: 326>
MODEL =
<TokenType.MODEL: 327>
NATURAL =
<TokenType.NATURAL: 328>
NEXT =
<TokenType.NEXT: 329>
NOTHING =
<TokenType.NOTHING: 330>
NOTNULL =
<TokenType.NOTNULL: 331>
NULL =
<TokenType.NULL: 332>
OBJECT_IDENTIFIER =
<TokenType.OBJECT_IDENTIFIER: 333>
OFFSET =
<TokenType.OFFSET: 334>
ON =
<TokenType.ON: 335>
ONLY =
<TokenType.ONLY: 336>
OPERATOR =
<TokenType.OPERATOR: 337>
ORDER_BY =
<TokenType.ORDER_BY: 338>
ORDER_SIBLINGS_BY =
<TokenType.ORDER_SIBLINGS_BY: 339>
ORDERED =
<TokenType.ORDERED: 340>
ORDINALITY =
<TokenType.ORDINALITY: 341>
OUT =
<TokenType.OUT: 342>
INOUT =
<TokenType.INOUT: 343>
OUTER =
<TokenType.OUTER: 344>
OVER =
<TokenType.OVER: 345>
OVERLAPS =
<TokenType.OVERLAPS: 346>
OVERWRITE =
<TokenType.OVERWRITE: 347>
PACKAGE =
<TokenType.PACKAGE: 348>
PARTITION =
<TokenType.PARTITION: 349>
PARTITION_BY =
<TokenType.PARTITION_BY: 350>
PERCENT =
<TokenType.PERCENT: 351>
PIVOT =
<TokenType.PIVOT: 352>
PLACEHOLDER =
<TokenType.PLACEHOLDER: 353>
POLICY =
<TokenType.POLICY: 354>
POOL =
<TokenType.POOL: 355>
POSITIONAL =
<TokenType.POSITIONAL: 356>
PRAGMA =
<TokenType.PRAGMA: 357>
PREWHERE =
<TokenType.PREWHERE: 358>
PRIMARY_KEY =
<TokenType.PRIMARY_KEY: 359>
PROCEDURE =
<TokenType.PROCEDURE: 360>
PROPERTIES =
<TokenType.PROPERTIES: 361>
PSEUDO_TYPE =
<TokenType.PSEUDO_TYPE: 362>
PUT =
<TokenType.PUT: 363>
QUALIFY =
<TokenType.QUALIFY: 364>
QUOTE =
<TokenType.QUOTE: 365>
QDCOLON =
<TokenType.QDCOLON: 366>
RANGE =
<TokenType.RANGE: 367>
RECURSIVE =
<TokenType.RECURSIVE: 368>
REFRESH =
<TokenType.REFRESH: 369>
RENAME =
<TokenType.RENAME: 370>
REPLACE =
<TokenType.REPLACE: 371>
RETURNING =
<TokenType.RETURNING: 372>
REVOKE =
<TokenType.REVOKE: 373>
REFERENCES =
<TokenType.REFERENCES: 374>
RIGHT =
<TokenType.RIGHT: 375>
RLIKE =
<TokenType.RLIKE: 376>
ROLE =
<TokenType.ROLE: 377>
ROLLBACK =
<TokenType.ROLLBACK: 378>
ROLLUP =
<TokenType.ROLLUP: 379>
ROW =
<TokenType.ROW: 380>
ROWS =
<TokenType.ROWS: 381>
RULE =
<TokenType.RULE: 382>
SELECT =
<TokenType.SELECT: 383>
SEMI =
<TokenType.SEMI: 384>
SEPARATOR =
<TokenType.SEPARATOR: 385>
SEQUENCE =
<TokenType.SEQUENCE: 386>
SERDE_PROPERTIES =
<TokenType.SERDE_PROPERTIES: 387>
SET =
<TokenType.SET: 388>
SETTINGS =
<TokenType.SETTINGS: 389>
SHOW =
<TokenType.SHOW: 390>
SIMILAR_TO =
<TokenType.SIMILAR_TO: 391>
SOME =
<TokenType.SOME: 392>
SORT_BY =
<TokenType.SORT_BY: 393>
SOUNDS_LIKE =
<TokenType.SOUNDS_LIKE: 394>
SQL_SECURITY =
<TokenType.SQL_SECURITY: 395>
START_WITH =
<TokenType.START_WITH: 396>
STORAGE_INTEGRATION =
<TokenType.STORAGE_INTEGRATION: 397>
STRAIGHT_JOIN =
<TokenType.STRAIGHT_JOIN: 398>
STRUCT =
<TokenType.STRUCT: 399>
SUMMARIZE =
<TokenType.SUMMARIZE: 400>
TABLE_SAMPLE =
<TokenType.TABLE_SAMPLE: 401>
TAG =
<TokenType.TAG: 402>
TEMPORARY =
<TokenType.TEMPORARY: 403>
TOP =
<TokenType.TOP: 404>
THEN =
<TokenType.THEN: 405>
TRUE =
<TokenType.TRUE: 406>
TRUNCATE =
<TokenType.TRUNCATE: 407>
TRIGGER =
<TokenType.TRIGGER: 408>
UNCACHE =
<TokenType.UNCACHE: 409>
UNION =
<TokenType.UNION: 410>
UNNEST =
<TokenType.UNNEST: 411>
UNPIVOT =
<TokenType.UNPIVOT: 412>
UPDATE =
<TokenType.UPDATE: 413>
USE =
<TokenType.USE: 414>
USING =
<TokenType.USING: 415>
VALUES =
<TokenType.VALUES: 416>
VARIADIC =
<TokenType.VARIADIC: 417>
VIEW =
<TokenType.VIEW: 418>
SEMANTIC_VIEW =
<TokenType.SEMANTIC_VIEW: 419>
VOLATILE =
<TokenType.VOLATILE: 420>
VOLUME =
<TokenType.VOLUME: 421>
WHEN =
<TokenType.WHEN: 422>
WHERE =
<TokenType.WHERE: 423>
WINDOW =
<TokenType.WINDOW: 424>
WITH =
<TokenType.WITH: 425>
UNIQUE =
<TokenType.UNIQUE: 426>
UTC_DATE =
<TokenType.UTC_DATE: 427>
UTC_TIME =
<TokenType.UTC_TIME: 428>
UTC_TIMESTAMP =
<TokenType.UTC_TIMESTAMP: 429>
VERSION_SNAPSHOT =
<TokenType.VERSION_SNAPSHOT: 430>
TIMESTAMP_SNAPSHOT =
<TokenType.TIMESTAMP_SNAPSHOT: 431>
OPTION =
<TokenType.OPTION: 432>
SINK =
<TokenType.SINK: 433>
SOURCE =
<TokenType.SOURCE: 434>
ANALYZE =
<TokenType.ANALYZE: 435>
NAMESPACE =
<TokenType.NAMESPACE: 436>
EXPORT =
<TokenType.EXPORT: 437>
HIVE_TOKEN_STREAM =
<TokenType.HIVE_TOKEN_STREAM: 438>
SENTINEL =
<TokenType.SENTINEL: 439>
class
Token:
469class Token: 470 # mypyc doesn't expose slots 471 _attrs: t.ClassVar[tuple[str, ...]] = ( 472 "token_type", 473 "text", 474 "line", 475 "col", 476 "start", 477 "end", 478 "comments", 479 ) 480 __slots__ = _attrs 481 482 @classmethod 483 def number(cls, number: int) -> Token: 484 """Returns a NUMBER token with `number` as its text.""" 485 return cls(TokenType.NUMBER, str(number)) 486 487 @classmethod 488 def string(cls, string: str) -> Token: 489 """Returns a STRING token with `string` as its text.""" 490 return cls(TokenType.STRING, string) 491 492 @classmethod 493 def identifier(cls, identifier: str) -> Token: 494 """Returns an IDENTIFIER token with `identifier` as its text.""" 495 return cls(TokenType.IDENTIFIER, identifier) 496 497 @classmethod 498 def var(cls, var: str) -> Token: 499 """Returns an VAR token with `var` as its text.""" 500 return cls(TokenType.VAR, var) 501 502 def __init__( 503 self, 504 token_type: TokenType, 505 text: str, 506 line: int = 1, 507 col: int = 1, 508 start: int = 0, 509 end: int = 0, 510 comments: list[str] | None = None, 511 ) -> None: 512 self.token_type = token_type 513 self.text = text 514 self.line = line 515 self.col = col 516 self.start = start 517 self.end = end 518 self.comments = [] if comments is None else comments 519 520 def __bool__(self) -> bool: 521 return self.token_type != TokenType.SENTINEL 522 523 def __repr__(self) -> str: 524 attributes = ", ".join( 525 f"{k}: TokenType.{self.token_type.name}" 526 if k == "token_type" 527 else f"{k}: {getattr(self, k)}" 528 for k in self._attrs 529 ) 530 return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
502 def __init__( 503 self, 504 token_type: TokenType, 505 text: str, 506 line: int = 1, 507 col: int = 1, 508 start: int = 0, 509 end: int = 0, 510 comments: list[str] | None = None, 511 ) -> None: 512 self.token_type = token_type 513 self.text = text 514 self.line = line 515 self.col = col 516 self.start = start 517 self.end = end 518 self.comments = [] if comments is None else comments
482 @classmethod 483 def number(cls, number: int) -> Token: 484 """Returns a NUMBER token with `number` as its text.""" 485 return cls(TokenType.NUMBER, str(number))
Returns a NUMBER token with number as its text.
487 @classmethod 488 def string(cls, string: str) -> Token: 489 """Returns a STRING token with `string` as its text.""" 490 return cls(TokenType.STRING, string)
Returns a STRING token with string as its text.
492 @classmethod 493 def identifier(cls, identifier: str) -> Token: 494 """Returns an IDENTIFIER token with `identifier` as its text.""" 495 return cls(TokenType.IDENTIFIER, identifier)
Returns an IDENTIFIER token with identifier as its text.
class
TokenizerCore:
533class TokenizerCore: 534 __slots__ = ( 535 "sql", 536 "size", 537 "tokens", 538 "_start", 539 "_current", 540 "_line", 541 "_col", 542 "_comments", 543 "_char", 544 "_end", 545 "_peek", 546 "_prev_token_line", 547 "single_tokens", 548 "keywords", 549 "quotes", 550 "format_strings", 551 "identifiers", 552 "comments", 553 "string_escapes", 554 "byte_string_escapes", 555 "identifier_escapes", 556 "escape_follow_chars", 557 "commands", 558 "command_prefix_tokens", 559 "nested_comments", 560 "hint_start", 561 "tokens_preceding_hint", 562 "bit_strings", 563 "hex_strings", 564 "numeric_literals", 565 "var_single_tokens", 566 "string_escapes_allowed_in_raw_strings", 567 "heredoc_tag_is_identifier", 568 "heredoc_string_alternative", 569 "keyword_trie", 570 "numbers_can_be_underscore_separated", 571 "numbers_can_have_decimals", 572 "identifiers_can_start_with_digit", 573 "unescaped_sequences", 574 ) 575 576 def __init__( 577 self, 578 single_tokens: dict[str, TokenType], 579 keywords: dict[str, TokenType], 580 quotes: dict[str, str], 581 format_strings: dict[str, tuple[str, TokenType]], 582 identifiers: dict[str, str], 583 comments: dict[str, str | None], 584 string_escapes: set[str], 585 byte_string_escapes: set[str], 586 identifier_escapes: set[str], 587 escape_follow_chars: set[str], 588 commands: set[TokenType], 589 command_prefix_tokens: set[TokenType], 590 nested_comments: bool, 591 hint_start: str, 592 tokens_preceding_hint: set[TokenType], 593 bit_strings: list[str | tuple[str, str]], 594 hex_strings: list[str | tuple[str, str]], 595 numeric_literals: dict[str, str], 596 var_single_tokens: set[str], 597 string_escapes_allowed_in_raw_strings: bool, 598 heredoc_tag_is_identifier: bool, 599 heredoc_string_alternative: TokenType, 600 keyword_trie: dict, 601 numbers_can_be_underscore_separated: bool, 602 numbers_can_have_decimals: bool, 603 identifiers_can_start_with_digit: bool, 604 unescaped_sequences: dict[str, str], 605 ) -> None: 606 self.single_tokens = 
single_tokens 607 self.keywords = keywords 608 self.quotes = quotes 609 self.format_strings = format_strings 610 self.identifiers = identifiers 611 self.comments = comments 612 self.string_escapes = string_escapes 613 self.byte_string_escapes = byte_string_escapes 614 self.identifier_escapes = identifier_escapes 615 self.escape_follow_chars = escape_follow_chars 616 self.commands = commands 617 self.command_prefix_tokens = command_prefix_tokens 618 self.nested_comments = nested_comments 619 self.hint_start = hint_start 620 self.tokens_preceding_hint = tokens_preceding_hint 621 self.bit_strings = bit_strings 622 self.hex_strings = hex_strings 623 self.numeric_literals = numeric_literals 624 self.var_single_tokens = var_single_tokens 625 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 626 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 627 self.heredoc_string_alternative = heredoc_string_alternative 628 self.keyword_trie = keyword_trie 629 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 630 self.numbers_can_have_decimals = numbers_can_have_decimals 631 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 632 self.unescaped_sequences = unescaped_sequences 633 self.sql = "" 634 self.size = 0 635 self.tokens: list[Token] = [] 636 self._start = 0 637 self._current = 0 638 self._line = 1 639 self._col = 0 640 self._comments: list[str] = [] 641 self._char = "" 642 self._end = False 643 self._peek = "" 644 self._prev_token_line = -1 645 646 def reset(self) -> None: 647 self.sql = "" 648 self.size = 0 649 self.tokens = [] 650 self._start = 0 651 self._current = 0 652 self._line = 1 653 self._col = 0 654 self._comments = [] 655 self._char = "" 656 self._end = False 657 self._peek = "" 658 self._prev_token_line = -1 659 660 def tokenize(self, sql: str) -> list[Token]: 661 """Returns a list of tokens corresponding to the SQL string `sql`.""" 662 self.reset() 663 self.sql = sql 664 
self.size = len(sql) 665 666 try: 667 self._scan() 668 except Exception as e: 669 start = max(self._current - 50, 0) 670 end = min(self._current + 50, self.size - 1) 671 context = self.sql[start:end] 672 raise TokenError(f"Error tokenizing '{context}'") from e 673 674 return self.tokens 675 676 def _scan(self, check_semicolon: bool = False) -> None: 677 identifiers = self.identifiers 678 digit_chars = _DIGIT_CHARS 679 680 while self.size and not self._end: 681 current = self._current 682 683 # Skip spaces here rather than iteratively calling advance() for performance reasons 684 while current < self.size: 685 char = self.sql[current] 686 687 if char == " " or char == "\t": 688 current += 1 689 else: 690 break 691 692 offset = current - self._current if current > self._current else 1 693 694 self._start = current 695 self._advance(offset) 696 697 if not self._char.isspace(): 698 if self._char in digit_chars: 699 self._scan_number() 700 elif self._char in identifiers: 701 self._scan_identifier(identifiers[self._char]) 702 else: 703 self._scan_keywords() 704 705 if check_semicolon and self._peek == ";": 706 break 707 708 if self.tokens and self._comments: 709 self.tokens[-1].comments.extend(self._comments) 710 711 def _chars(self, size: int) -> str: 712 if size == 1: 713 return self._char 714 715 start = self._current - 1 716 end = start + size 717 718 return self.sql[start:end] if end <= self.size else "" 719 720 def _advance(self, i: int = 1, alnum: bool = False) -> None: 721 char = self._char 722 723 if char == "\n" or char == "\r": 724 # Ensures we don't count an extra line if we get a \r\n line break sequence 725 if not (char == "\r" and self._peek == "\n"): 726 self._col = i 727 self._line += 1 728 else: 729 self._col += i 730 731 self._current += i 732 sql = self.sql 733 size = self.size 734 self._end = self._current >= size 735 self._char = sql[self._current - 1] 736 self._peek = "" if self._end else sql[self._current] 737 738 if alnum and 
self._char.isalnum(): 739 # Cache to local variables instead of attributes for better performance 740 _col = self._col 741 _current = self._current 742 _end = self._end 743 _peek = self._peek 744 745 while _peek.isalnum(): 746 _col += 1 747 _current += 1 748 _end = _current >= size 749 _peek = "" if _end else sql[_current] 750 751 self._col = _col 752 self._current = _current 753 self._end = _end 754 self._peek = _peek 755 self._char = sql[_current - 1] 756 757 @property 758 def _text(self) -> str: 759 return self.sql[self._start : self._current] 760 761 def _add(self, token_type: TokenType, text: str | None = None) -> None: 762 self._prev_token_line = self._line 763 764 if self._comments and token_type == TokenType.SEMICOLON and self.tokens: 765 self.tokens[-1].comments.extend(self._comments) 766 self._comments = [] 767 768 if text is None: 769 text = self.sql[self._start : self._current] 770 771 self.tokens.append( 772 Token( 773 token_type, 774 text=text, 775 line=self._line, 776 col=self._col, 777 start=self._start, 778 end=self._current - 1, 779 comments=self._comments, 780 ) 781 ) 782 self._comments = [] 783 784 # If we have either a semicolon or a begin token before the command's token, we'll parse 785 # whatever follows the command's token as a string 786 if ( 787 token_type in self.commands 788 and self._peek != ";" 789 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens) 790 ): 791 start = self._current 792 tokens = len(self.tokens) 793 self._scan(check_semicolon=True) 794 self.tokens = self.tokens[:tokens] 795 text = self.sql[start : self._current].strip() 796 if text: 797 self._add(TokenType.STRING, text) 798 799 def _scan_keywords(self) -> None: 800 sql = self.sql 801 sql_size = self.size 802 single_tokens = self.single_tokens 803 char_upper = _CHAR_UPPER 804 size = 0 805 word = None 806 chars = self._char 807 char = chars 808 prev_space = False 809 skip = False 810 trie = self.keyword_trie 811 single_token = char in 
single_tokens 812 813 while chars: 814 if not skip: 815 sub = trie.get(char_upper.get(char, char)) 816 if sub is None: 817 break 818 trie = sub 819 if 0 in trie: 820 word = chars 821 822 end = self._current + size 823 size += 1 824 825 if end < sql_size: 826 char = sql[end] 827 single_token = single_token or char in single_tokens 828 is_space = char.isspace() 829 830 if not is_space or not prev_space: 831 if is_space: 832 char = " " 833 chars += char 834 prev_space = is_space 835 skip = False 836 else: 837 skip = True 838 else: 839 char = "" 840 break 841 842 if word: 843 if self._scan_string(word): 844 return 845 if self._scan_comment(word): 846 return 847 if prev_space or single_token or not char: 848 self._advance(size - 1) 849 word = word.upper() 850 self._add(self.keywords[word], text=word) 851 return 852 853 if self._char in single_tokens: 854 self._add(single_tokens[self._char], text=self._char) 855 return 856 857 self._scan_var() 858 859 def _scan_comment(self, comment_start: str) -> bool: 860 if comment_start not in self.comments: 861 return False 862 863 comment_start_line = self._line 864 comment_start_size = len(comment_start) 865 comment_end = self.comments[comment_start] 866 867 if comment_end: 868 # Skip the comment's start delimiter 869 self._advance(comment_start_size) 870 871 comment_count = 1 872 comment_end_size = len(comment_end) 873 nested_comments = self.nested_comments 874 875 while not self._end: 876 if self._chars(comment_end_size) == comment_end: 877 comment_count -= 1 878 if not comment_count: 879 break 880 881 self._advance(alnum=True) 882 883 # Nested comments are allowed by some dialects, e.g. 
databricks, duckdb, postgres 884 if ( 885 nested_comments 886 and not self._end 887 and self._chars(comment_end_size) == comment_start 888 ): 889 self._advance(comment_start_size) 890 comment_count += 1 891 892 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 893 self._advance(comment_end_size - 1) 894 else: 895 _peek = self._peek 896 while not self._end and _peek != "\n" and _peek != "\r": 897 self._advance(alnum=True) 898 _peek = self._peek 899 self._comments.append(self._text[comment_start_size:]) 900 901 if ( 902 comment_start == self.hint_start 903 and self.tokens 904 and self.tokens[-1].token_type in self.tokens_preceding_hint 905 ): 906 self._add(TokenType.HINT) 907 908 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 909 # Multiple consecutive comments are preserved by appending them to the current comments list. 910 if comment_start_line == self._prev_token_line: 911 self.tokens[-1].comments.extend(self._comments) 912 self._comments = [] 913 self._prev_token_line = self._line 914 915 return True 916 917 def _scan_number(self) -> None: 918 if self._char == "0": 919 peek = _CHAR_UPPER.get(self._peek, self._peek) 920 if peek == "B": 921 return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER) 922 elif peek == "X": 923 return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER) 924 925 decimal = False 926 scientific = 0 927 numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated 928 single_tokens = self.single_tokens 929 keywords = self.keywords 930 numeric_literals = self.numeric_literals 931 identifiers_can_start_with_digit = self.identifiers_can_start_with_digit 932 933 is_underscore_separated: bool = False 934 number_text: str = "" 935 numeric_literal: str = "" 936 numeric_type: TokenType | None = None 937 938 while True: 939 if self._peek in _DIGIT_CHARS: 940 # Batch consecutive digits: scan ahead to find how many 941 
sql = self.sql 942 end = self._current + 1 943 size = self.size 944 while end < size and sql[end] in _DIGIT_CHARS: 945 end += 1 946 self._advance(end - self._current) 947 elif self._peek == "." and not decimal: 948 if ( 949 self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 950 ) or not self.numbers_can_have_decimals: 951 break 952 decimal = True 953 self._advance() 954 elif self._peek in ("-", "+") and scientific == 1: 955 # Only consume +/- if followed by a digit 956 if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS: 957 scientific += 1 958 self._advance() 959 else: 960 break 961 elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific: 962 scientific += 1 963 self._advance() 964 elif self._peek == "_" and numbers_can_be_underscore_separated: 965 is_underscore_separated = True 966 self._advance() 967 elif self._peek.isidentifier(): 968 number_text = self._text 969 970 while self._peek and not self._peek.isspace() and self._peek not in single_tokens: 971 numeric_literal += self._peek 972 self._advance() 973 974 numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), "")) 975 976 if numeric_type: 977 break 978 elif identifiers_can_start_with_digit: 979 return self._add(TokenType.VAR) 980 981 self._advance(-len(numeric_literal)) 982 break 983 else: 984 break 985 986 number_text = number_text or self.sql[self._start : self._current] 987 988 # Normalize inputs such as 100_000 to 100000 989 if is_underscore_separated: 990 number_text = number_text.replace("_", "") 991 992 self._add(TokenType.NUMBER, number_text) 993 994 # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts 995 if numeric_type: 996 self._add(TokenType.DCOLON, "::") 997 self._add(numeric_type, numeric_literal) 998 999 def _scan_bits(self) -> None: 1000 self._advance() 1001 value = self._extract_value() 1002 try: 1003 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 
1004 int(value, 2) 1005 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1006 except ValueError: 1007 self._add(TokenType.IDENTIFIER) 1008 1009 def _scan_hex(self) -> None: 1010 self._advance() 1011 value = self._extract_value() 1012 try: 1013 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1014 int(value, 16) 1015 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1016 except ValueError: 1017 self._add(TokenType.IDENTIFIER) 1018 1019 def _extract_value(self) -> str: 1020 single_tokens = self.single_tokens 1021 1022 while True: 1023 char = self._peek.strip() 1024 if char and char not in single_tokens: 1025 self._advance(alnum=True) 1026 else: 1027 break 1028 1029 return self._text 1030 1031 def _scan_string(self, start: str) -> bool: 1032 base = None 1033 token_type = TokenType.STRING 1034 1035 if start in self.quotes: 1036 end = self.quotes[start] 1037 elif start in self.format_strings: 1038 end, token_type = self.format_strings[start] 1039 1040 if token_type == TokenType.HEX_STRING: 1041 base = 16 1042 elif token_type == TokenType.BIT_STRING: 1043 base = 2 1044 elif token_type == TokenType.HEREDOC_STRING: 1045 self._advance() 1046 1047 if self._char == end: 1048 tag = "" 1049 else: 1050 tag = self._extract_string( 1051 end, 1052 raw_string=True, 1053 raise_unmatched=not self.heredoc_tag_is_identifier, 1054 ) 1055 1056 if ( 1057 tag 1058 and self.heredoc_tag_is_identifier 1059 and (self._end or tag.isdigit() or any(c.isspace() for c in tag)) 1060 ): 1061 if not self._end: 1062 self._advance(-1) 1063 1064 self._advance(-len(tag)) 1065 self._add(self.heredoc_string_alternative) 1066 return True 1067 1068 end = f"{start}{tag}{end}" 1069 else: 1070 return False 1071 1072 self._advance(len(start)) 1073 text = self._extract_string( 1074 end, 1075 escapes=( 1076 self.byte_string_escapes 1077 if token_type == TokenType.BYTE_STRING 1078 else self.string_escapes 1079 ), 1080 raw_string=token_type == TokenType.RAW_STRING, 
1081 ) 1082 1083 if base and text: 1084 try: 1085 int(text, base) 1086 except Exception: 1087 raise TokenError( 1088 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1089 ) 1090 1091 self._add(token_type, text) 1092 return True 1093 1094 def _scan_identifier(self, identifier_end: str) -> None: 1095 self._advance() 1096 text = self._extract_string( 1097 identifier_end, escapes=self.identifier_escapes | {identifier_end} 1098 ) 1099 self._add(TokenType.IDENTIFIER, text) 1100 1101 def _scan_var(self) -> None: 1102 var_single_tokens = self.var_single_tokens 1103 single_tokens = self.single_tokens 1104 1105 while True: 1106 peek = self._peek 1107 if not peek or peek.isspace(): 1108 break 1109 if peek not in var_single_tokens and peek in single_tokens: 1110 break 1111 self._advance(alnum=True) 1112 1113 self._add( 1114 TokenType.VAR 1115 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1116 else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR) 1117 ) 1118 1119 def _extract_string( 1120 self, 1121 delimiter: str, 1122 escapes: set[str] | None = None, 1123 raw_string: bool = False, 1124 raise_unmatched: bool = True, 1125 ) -> str: 1126 text = "" 1127 delim_size = len(delimiter) 1128 escapes = self.string_escapes if escapes is None else escapes 1129 unescaped_sequences = self.unescaped_sequences 1130 escape_follow_chars = self.escape_follow_chars 1131 string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings 1132 quotes = self.quotes 1133 sql = self.sql 1134 1135 # use str.find() when the string is simple... no \ or other escapes 1136 if delim_size == 1: 1137 pos = self._current - 1 1138 end = sql.find(delimiter, pos) 1139 1140 if ( 1141 # the closing delimiter was found 1142 end != -1 1143 # there's no doubled delimiter (e.g. 
'' escape), or the delimiter isn't an escape char 1144 and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes) 1145 # no backslash in the string that would need escape processing 1146 and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1) 1147 ): 1148 newlines = sql.count("\n", pos, end) 1149 if newlines: 1150 self._line += newlines 1151 self._col = end - sql.rfind("\n", pos, end) 1152 else: 1153 self._col += end - pos 1154 1155 self._current = end + 1 1156 self._end = self._current >= self.size 1157 self._char = sql[end] 1158 self._peek = "" if self._end else sql[self._current] 1159 return sql[pos:end] 1160 1161 while True: 1162 if not raw_string and unescaped_sequences and self._peek and self._char in escapes: 1163 unescaped_sequence = unescaped_sequences.get(self._char + self._peek) 1164 if unescaped_sequence: 1165 self._advance(2) 1166 text += unescaped_sequence 1167 continue 1168 1169 is_valid_custom_escape = ( 1170 escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars 1171 ) 1172 1173 if ( 1174 (string_escapes_allowed_in_raw_strings or not raw_string) 1175 and self._char in escapes 1176 and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape) 1177 and (self._char not in quotes or self._char == self._peek) 1178 ): 1179 if self._peek == delimiter: 1180 text += self._peek 1181 elif is_valid_custom_escape and self._char != self._peek: 1182 text += self._peek 1183 else: 1184 text += self._char + self._peek 1185 1186 if self._current + 1 < self.size: 1187 self._advance(2) 1188 else: 1189 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1190 else: 1191 if self._chars(delim_size) == delimiter: 1192 if delim_size > 1: 1193 self._advance(delim_size - 1) 1194 break 1195 1196 if self._end: 1197 if not raise_unmatched: 1198 return text + self._char 1199 1200 raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}") 1201 
1202 current = self._current - 1 1203 self._advance(alnum=True) 1204 text += sql[current : self._current - 1] 1205 1206 return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], bit_strings: list[str | tuple[str, str]], hex_strings: list[str | tuple[str, str]], numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
576 def __init__( 577 self, 578 single_tokens: dict[str, TokenType], 579 keywords: dict[str, TokenType], 580 quotes: dict[str, str], 581 format_strings: dict[str, tuple[str, TokenType]], 582 identifiers: dict[str, str], 583 comments: dict[str, str | None], 584 string_escapes: set[str], 585 byte_string_escapes: set[str], 586 identifier_escapes: set[str], 587 escape_follow_chars: set[str], 588 commands: set[TokenType], 589 command_prefix_tokens: set[TokenType], 590 nested_comments: bool, 591 hint_start: str, 592 tokens_preceding_hint: set[TokenType], 593 bit_strings: list[str | tuple[str, str]], 594 hex_strings: list[str | tuple[str, str]], 595 numeric_literals: dict[str, str], 596 var_single_tokens: set[str], 597 string_escapes_allowed_in_raw_strings: bool, 598 heredoc_tag_is_identifier: bool, 599 heredoc_string_alternative: TokenType, 600 keyword_trie: dict, 601 numbers_can_be_underscore_separated: bool, 602 numbers_can_have_decimals: bool, 603 identifiers_can_start_with_digit: bool, 604 unescaped_sequences: dict[str, str], 605 ) -> None: 606 self.single_tokens = single_tokens 607 self.keywords = keywords 608 self.quotes = quotes 609 self.format_strings = format_strings 610 self.identifiers = identifiers 611 self.comments = comments 612 self.string_escapes = string_escapes 613 self.byte_string_escapes = byte_string_escapes 614 self.identifier_escapes = identifier_escapes 615 self.escape_follow_chars = escape_follow_chars 616 self.commands = commands 617 self.command_prefix_tokens = command_prefix_tokens 618 self.nested_comments = nested_comments 619 self.hint_start = hint_start 620 self.tokens_preceding_hint = tokens_preceding_hint 621 self.bit_strings = bit_strings 622 self.hex_strings = hex_strings 623 self.numeric_literals = numeric_literals 624 self.var_single_tokens = var_single_tokens 625 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 626 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 627 
self.heredoc_string_alternative = heredoc_string_alternative 628 self.keyword_trie = keyword_trie 629 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 630 self.numbers_can_have_decimals = numbers_can_have_decimals 631 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 632 self.unescaped_sequences = unescaped_sequences 633 self.sql = "" 634 self.size = 0 635 self.tokens: list[Token] = [] 636 self._start = 0 637 self._current = 0 638 self._line = 1 639 self._col = 0 640 self._comments: list[str] = [] 641 self._char = "" 642 self._end = False 643 self._peek = "" 644 self._prev_token_line = -1
tokens: list[Token]
660 def tokenize(self, sql: str) -> list[Token]: 661 """Returns a list of tokens corresponding to the SQL string `sql`.""" 662 self.reset() 663 self.sql = sql 664 self.size = len(sql) 665 666 try: 667 self._scan() 668 except Exception as e: 669 start = max(self._current - 50, 0) 670 end = min(self._current + 50, self.size - 1) 671 context = self.sql[start:end] 672 raise TokenError(f"Error tokenizing '{context}'") from e 673 674 return self.tokens
Returns a list of tokens corresponding to the SQL string sql.