sqlglot.tokenizer_core
from __future__ import annotations

import typing as t
from enum import IntEnum, auto

from sqlglot.errors import TokenError

# dict lookup is faster than .upper() and .isdigit()
# _CHAR_UPPER only covers ASCII a-z; callers fall back to the character itself via .get(c, c).
_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")


class TokenType(IntEnum):
    """Integer-valued kinds of tokens.

    Values are assigned by `auto()` in declaration order, so reordering members changes
    their integer values.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Returns the qualified member name, e.g. `TokenType.SELECT`."""
        return f"TokenType.{self.name}"


class Token:
    """A single lexical token: its type, text, position in the SQL and attached comments.

    A token is falsy only when its type is `TokenType.SENTINEL`.
    """

    # mypyc doesn't expose slots
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        """Creates a token.

        Args:
            token_type: The kind of token.
            text: The token's text.
            line: The line recorded for the token.
            col: The column recorded for the token.
            start: Index of the token's first character in the SQL.
            end: Index of the token's last character in the SQL.
            comments: Comments attached to the token. The list is stored as-is (not copied);
                a fresh list is created when `None` is given.
        """
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = [] if comments is None else comments

    def __bool__(self) -> bool:
        return self.token_type != TokenType.SENTINEL

    def __repr__(self) -> str:
        attributes = ", ".join(
            f"{k}: TokenType.{self.token_type.name}"
            if k == "token_type"
            else f"{k}: {getattr(self, k)}"
            for k in self._attrs
        )
        return f"<Token {attributes}>"


class TokenizerCore:
    """Scans a SQL string into a list of `Token`s.

    All configuration (delimiters, keywords, escapes, feature flags) is supplied to the
    constructor and stored unchanged; `tokenize` resets the scan state and scans one SQL string.

    Scan-state invariant maintained by `_advance`: `_char` is `sql[_current - 1]` and `_peek`
    is `sql[_current]` (or `""` once `_end` is set).
    """

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )

    def __init__(
        self,
        single_tokens: dict[str, TokenType],
        keywords: dict[str, TokenType],
        quotes: dict[str, str],
        format_strings: dict[str, tuple[str, TokenType]],
        identifiers: dict[str, str],
        comments: dict[str, str | None],
        string_escapes: set[str],
        byte_string_escapes: set[str],
        identifier_escapes: set[str],
        escape_follow_chars: set[str],
        commands: set[TokenType],
        command_prefix_tokens: set[TokenType],
        nested_comments: bool,
        hint_start: str,
        tokens_preceding_hint: set[TokenType],
        has_bit_strings: bool,
        has_hex_strings: bool,
        numeric_literals: dict[str, str],
        var_single_tokens: set[str],
        string_escapes_allowed_in_raw_strings: bool,
        heredoc_tag_is_identifier: bool,
        heredoc_string_alternative: TokenType,
        keyword_trie: dict,
        numbers_can_be_underscore_separated: bool,
        numbers_can_have_decimals: bool,
        identifiers_can_start_with_digit: bool,
        unescaped_sequences: dict[str, str],
    ) -> None:
        """Stores the tokenizer configuration and initializes the scan state.

        Args:
            single_tokens: Single characters mapped to their token type.
            keywords: Upper-cased keyword text mapped to its token type.
            quotes: String start delimiter mapped to its end delimiter.
            format_strings: String start delimiter mapped to `(end delimiter, token type)`.
            identifiers: Identifier start character mapped to its end delimiter.
            comments: Comment start delimiter mapped to its end delimiter; a falsy end
                delimiter means the comment runs until the next line break.
            string_escapes: Escape characters used when extracting strings.
            byte_string_escapes: Escape characters used for BYTE_STRING tokens.
            identifier_escapes: Escape characters used inside quoted identifiers.
            escape_follow_chars: Consulted for a backslash followed by a character that is
                not in this set (see `_extract_string`).
            commands: Token types after which the rest of the statement is captured as a
                single STRING token (see `_add`).
            command_prefix_tokens: Token types that may directly precede a command token.
            nested_comments: Whether block comments can nest.
            hint_start: Comment start delimiter that can produce a HINT token.
            tokens_preceding_hint: Token types after which a hint comment yields a HINT token.
            has_bit_strings: Whether `0b...` is scanned as a BIT_STRING.
            has_hex_strings: Whether `0x...` is scanned as a HEX_STRING.
            numeric_literals: Upper-cased numeric suffix mapped to a key of `keywords`.
            var_single_tokens: Single-token characters that are allowed inside a VAR.
            string_escapes_allowed_in_raw_strings: Whether escapes apply to raw strings.
            heredoc_tag_is_identifier: Controls the fallback for heredoc tags in `_scan_string`.
            heredoc_string_alternative: Token type emitted by that fallback.
            keyword_trie: Nested dicts keyed by upper-cased characters; the key `0` marks a
                node at which a complete keyword ends.
            numbers_can_be_underscore_separated: Whether `_` is accepted inside numbers.
            numbers_can_have_decimals: Whether `.` is accepted inside numbers.
            identifiers_can_start_with_digit: Whether digit-led words are emitted as VAR.
            unescaped_sequences: Two-character sequences mapped to their replacement text.
        """
        self.single_tokens = single_tokens
        self.keywords = keywords
        self.quotes = quotes
        self.format_strings = format_strings
        self.identifiers = identifiers
        self.comments = comments
        self.string_escapes = string_escapes
        self.byte_string_escapes = byte_string_escapes
        self.identifier_escapes = identifier_escapes
        self.escape_follow_chars = escape_follow_chars
        self.commands = commands
        self.command_prefix_tokens = command_prefix_tokens
        self.nested_comments = nested_comments
        self.hint_start = hint_start
        self.tokens_preceding_hint = tokens_preceding_hint
        self.has_bit_strings = has_bit_strings
        self.has_hex_strings = has_hex_strings
        self.numeric_literals = numeric_literals
        self.var_single_tokens = var_single_tokens
        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
        self.heredoc_string_alternative = heredoc_string_alternative
        self.keyword_trie = keyword_trie
        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
        self.numbers_can_have_decimals = numbers_can_have_decimals
        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
        self.unescaped_sequences = unescaped_sequences
        self.sql = ""
        self.size = 0
        self.tokens: list[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: list[str] = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def reset(self) -> None:
        """Restores the scan state to its initial values; configuration is left untouched."""
        self.sql = ""
        self.size = 0
        self.tokens = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`.

        Raises:
            TokenError: If any exception is raised while scanning. The message contains the
                SQL surrounding the current position and the original exception is chained.
        """
        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            # NOTE(review): `end` is used as an exclusive slice bound, so with `self.size - 1`
            # the last character of `sql` is never part of the context. Confirm whether that
            # is intended before changing it, since the message text may be relied upon.
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, check_semicolon: bool = False) -> None:
        """Scans tokens until the end of the SQL is reached.

        Args:
            check_semicolon: If set, stop as soon as the next character is `;`.
        """
        identifiers = self.identifiers
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # Always move forward by at least one character
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if not self._char.isspace():
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Comments left over after the last token are attached to that token
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        """Returns `size` characters starting at the current one, or "" if too few remain."""
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Moves the cursor by `i` characters and refreshes `_char`, `_peek`, `_end`, line and col.

        Args:
            i: Number of characters to move; callers also pass negative values to step back.
            alnum: If set and the new current character is alphanumeric, keep consuming
                characters for as long as the next one is alphanumeric.
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            # "".isalnum() is False, so this also stops at the end of the SQL
            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]

    @property
    def _text(self) -> str:
        """The SQL consumed so far for the token being scanned."""
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Appends a token of `token_type`, attaching any pending comments to it.

        Args:
            token_type: The type of the new token.
            text: The token text; defaults to the SQL between `_start` and `_current`.
        """
        self._prev_token_line = self._line

        # Comments right before a semicolon belong to the preceding token
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        # The token now owns the previous list, so start a new one
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(check_semicolon=True)
            # Drop the tokens produced by the nested scan; only the raw text is kept
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        """Scans a keyword, string, comment, single-character token or var at the current char.

        Walks `keyword_trie` with the upper-cased upcoming characters (runs of whitespace are
        collapsed into a single space) and remembers the longest complete keyword in `word`.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    # Consecutive whitespace: consume it without descending into the trie
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()

    def _scan_comment(self, comment_start: str) -> bool:
        """Scans a comment that begins with `comment_start`.

        Returns:
            False if `comment_start` is not a comment delimiter, True once the comment has
            been consumed and recorded.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # No end delimiter: the comment runs until the next line break
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        """Scans a numeric literal starting at the current digit.

        Handles `0b`/`0x` prefixes, decimals, scientific notation, underscore separators and
        type suffixes found in `numeric_literals` (emitted as NUMBER, `::`, type).
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif (
                self._peek in ("-", "+")
                and scientific == 1
                # The sign is part of the number only when it directly follows the exponent
                # marker. Without this check "1e5+3" was scanned as one NUMBER token.
                and _CHAR_UPPER.get(self._char, self._char) == "E"
            ):
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text

                # Collect the trailing word, which may be a numeric type suffix
                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Not a known suffix: rewind so the word is scanned as a separate token
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)

    def _scan_bits(self) -> None:
        """Scans a `0b...` literal as a BIT_STRING, or as an IDENTIFIER if it isn't binary."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        """Scans a `0x...` literal as a HEX_STRING, or as an IDENTIFIER if it isn't hex."""
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        """Consumes characters up to whitespace, a single-token char or the end of the SQL.

        Returns:
            The text consumed for the current token.
        """
        single_tokens = self.single_tokens

        while True:
            # strip() turns a whitespace peek into "", which ends the loop just like the end does
            char = self._peek.strip()
            if char and char not in single_tokens:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        """Scans a string that begins with the delimiter `start`.

        Returns:
            False if `start` is neither a quote nor a format-string delimiter, True once a
            token has been added.

        Raises:
            TokenError: If a hex or bit string contains characters invalid for its base.
        """
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    # Rewind over the tag and emit the alternative token instead
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        if base and text:
            try:
                int(text, base)
            except ValueError as e:
                # int(str, base) signals an invalid literal with ValueError; keep it as the cause
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                ) from e

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        """Scans a quoted identifier that is terminated by `identifier_end`."""
        self._advance()
        text = self._extract_string(
            identifier_end, escapes=self.identifier_escapes | {identifier_end}
        )
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        """Scans a word and adds it as a keyword token if it is one, otherwise as a VAR.

        A word that directly follows a PARAMETER token is always a VAR.
        """
        var_single_tokens = self.var_single_tokens
        single_tokens = self.single_tokens

        while True:
            peek = self._peek
            if not peek or peek.isspace():
                break
            if peek not in var_single_tokens and peek in single_tokens:
                break
            self._advance(alnum=True)

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
        )

    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Consumes text up to and including `delimiter` and returns it without the delimiter.

        Args:
            delimiter: The closing delimiter.
            escapes: Escape characters; defaults to `string_escapes`.
            raw_string: Whether the text is a raw string, which disables unescaped-sequence
                replacement and, unless `string_escapes_allowed_in_raw_strings`, escapes.
            raise_unmatched: If False, return the text consumed so far instead of raising
                when the delimiter is missing.

        Raises:
            TokenError: If the end of the SQL is reached before `delimiter` is found.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                # Jump straight past the delimiter, mirroring what _advance would have set
                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        while True:
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class
TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """Integer-valued enumeration of every kind of token the tokenizer can emit.

    Each member's value is assigned by `auto()` in declaration order, starting
    at 1 for `L_PAREN`. Inserting, removing or reordering members therefore
    changes the integer value of every member that follows.
    """

    # punctuation, operators and related markers
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    # literals, identifiers and named object kinds
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    # NOTE: a Token whose type is SENTINEL evaluates as falsy (see Token.__bool__).
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the qualified member name, e.g. ``TokenType.SELECT``."""
        return f"TokenType.{self.name}"
An enumeration.
L_PAREN =
<TokenType.L_PAREN: 1>
R_PAREN =
<TokenType.R_PAREN: 2>
L_BRACKET =
<TokenType.L_BRACKET: 3>
R_BRACKET =
<TokenType.R_BRACKET: 4>
L_BRACE =
<TokenType.L_BRACE: 5>
R_BRACE =
<TokenType.R_BRACE: 6>
COMMA =
<TokenType.COMMA: 7>
DOT =
<TokenType.DOT: 8>
DASH =
<TokenType.DASH: 9>
PLUS =
<TokenType.PLUS: 10>
COLON =
<TokenType.COLON: 11>
DOTCOLON =
<TokenType.DOTCOLON: 12>
DOTCARET =
<TokenType.DOTCARET: 13>
DCOLON =
<TokenType.DCOLON: 14>
DCOLONDOLLAR =
<TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT =
<TokenType.DCOLONPERCENT: 16>
DCOLONQMARK =
<TokenType.DCOLONQMARK: 17>
DQMARK =
<TokenType.DQMARK: 18>
SEMICOLON =
<TokenType.SEMICOLON: 19>
STAR =
<TokenType.STAR: 20>
BACKSLASH =
<TokenType.BACKSLASH: 21>
SLASH =
<TokenType.SLASH: 22>
LT =
<TokenType.LT: 23>
LTE =
<TokenType.LTE: 24>
GT =
<TokenType.GT: 25>
GTE =
<TokenType.GTE: 26>
NOT =
<TokenType.NOT: 27>
EQ =
<TokenType.EQ: 28>
NEQ =
<TokenType.NEQ: 29>
NULLSAFE_EQ =
<TokenType.NULLSAFE_EQ: 30>
COLON_EQ =
<TokenType.COLON_EQ: 31>
COLON_GT =
<TokenType.COLON_GT: 32>
NCOLON_GT =
<TokenType.NCOLON_GT: 33>
AND =
<TokenType.AND: 34>
OR =
<TokenType.OR: 35>
AMP =
<TokenType.AMP: 36>
DPIPE =
<TokenType.DPIPE: 37>
PIPE_GT =
<TokenType.PIPE_GT: 38>
PIPE =
<TokenType.PIPE: 39>
PIPE_SLASH =
<TokenType.PIPE_SLASH: 40>
DPIPE_SLASH =
<TokenType.DPIPE_SLASH: 41>
CARET =
<TokenType.CARET: 42>
CARET_AT =
<TokenType.CARET_AT: 43>
TILDE =
<TokenType.TILDE: 44>
ARROW =
<TokenType.ARROW: 45>
DARROW =
<TokenType.DARROW: 46>
FARROW =
<TokenType.FARROW: 47>
HASH =
<TokenType.HASH: 48>
HASH_ARROW =
<TokenType.HASH_ARROW: 49>
DHASH_ARROW =
<TokenType.DHASH_ARROW: 50>
LR_ARROW =
<TokenType.LR_ARROW: 51>
LLRR_ARROW =
<TokenType.LLRR_ARROW: 52>
DAT =
<TokenType.DAT: 53>
LT_AT =
<TokenType.LT_AT: 54>
AT_GT =
<TokenType.AT_GT: 55>
DOLLAR =
<TokenType.DOLLAR: 56>
PARAMETER =
<TokenType.PARAMETER: 57>
SESSION =
<TokenType.SESSION: 58>
SESSION_PARAMETER =
<TokenType.SESSION_PARAMETER: 59>
SESSION_USER =
<TokenType.SESSION_USER: 60>
DAMP =
<TokenType.DAMP: 61>
AMP_LT =
<TokenType.AMP_LT: 62>
AMP_GT =
<TokenType.AMP_GT: 63>
ADJACENT =
<TokenType.ADJACENT: 64>
XOR =
<TokenType.XOR: 65>
DSTAR =
<TokenType.DSTAR: 66>
QMARK_AMP =
<TokenType.QMARK_AMP: 67>
QMARK_PIPE =
<TokenType.QMARK_PIPE: 68>
HASH_DASH =
<TokenType.HASH_DASH: 69>
EXCLAMATION =
<TokenType.EXCLAMATION: 70>
URI_START =
<TokenType.URI_START: 71>
BLOCK_START =
<TokenType.BLOCK_START: 72>
BLOCK_END =
<TokenType.BLOCK_END: 73>
SPACE =
<TokenType.SPACE: 74>
BREAK =
<TokenType.BREAK: 75>
STRING =
<TokenType.STRING: 76>
NUMBER =
<TokenType.NUMBER: 77>
IDENTIFIER =
<TokenType.IDENTIFIER: 78>
DATABASE =
<TokenType.DATABASE: 79>
COLUMN =
<TokenType.COLUMN: 80>
COLUMN_DEF =
<TokenType.COLUMN_DEF: 81>
SCHEMA =
<TokenType.SCHEMA: 82>
TABLE =
<TokenType.TABLE: 83>
WAREHOUSE =
<TokenType.WAREHOUSE: 84>
STAGE =
<TokenType.STAGE: 85>
STREAM =
<TokenType.STREAM: 86>
STREAMLIT =
<TokenType.STREAMLIT: 87>
VAR =
<TokenType.VAR: 88>
BIT_STRING =
<TokenType.BIT_STRING: 89>
HEX_STRING =
<TokenType.HEX_STRING: 90>
BYTE_STRING =
<TokenType.BYTE_STRING: 91>
NATIONAL_STRING =
<TokenType.NATIONAL_STRING: 92>
RAW_STRING =
<TokenType.RAW_STRING: 93>
HEREDOC_STRING =
<TokenType.HEREDOC_STRING: 94>
UNICODE_STRING =
<TokenType.UNICODE_STRING: 95>
BIT =
<TokenType.BIT: 96>
BOOLEAN =
<TokenType.BOOLEAN: 97>
TINYINT =
<TokenType.TINYINT: 98>
UTINYINT =
<TokenType.UTINYINT: 99>
SMALLINT =
<TokenType.SMALLINT: 100>
USMALLINT =
<TokenType.USMALLINT: 101>
MEDIUMINT =
<TokenType.MEDIUMINT: 102>
UMEDIUMINT =
<TokenType.UMEDIUMINT: 103>
INT =
<TokenType.INT: 104>
UINT =
<TokenType.UINT: 105>
BIGINT =
<TokenType.BIGINT: 106>
UBIGINT =
<TokenType.UBIGINT: 107>
BIGNUM =
<TokenType.BIGNUM: 108>
INT128 =
<TokenType.INT128: 109>
UINT128 =
<TokenType.UINT128: 110>
INT256 =
<TokenType.INT256: 111>
UINT256 =
<TokenType.UINT256: 112>
FLOAT =
<TokenType.FLOAT: 113>
DOUBLE =
<TokenType.DOUBLE: 114>
UDOUBLE =
<TokenType.UDOUBLE: 115>
DECIMAL =
<TokenType.DECIMAL: 116>
DECIMAL32 =
<TokenType.DECIMAL32: 117>
DECIMAL64 =
<TokenType.DECIMAL64: 118>
DECIMAL128 =
<TokenType.DECIMAL128: 119>
DECIMAL256 =
<TokenType.DECIMAL256: 120>
DECFLOAT =
<TokenType.DECFLOAT: 121>
UDECIMAL =
<TokenType.UDECIMAL: 122>
BIGDECIMAL =
<TokenType.BIGDECIMAL: 123>
CHAR =
<TokenType.CHAR: 124>
NCHAR =
<TokenType.NCHAR: 125>
VARCHAR =
<TokenType.VARCHAR: 126>
NVARCHAR =
<TokenType.NVARCHAR: 127>
BPCHAR =
<TokenType.BPCHAR: 128>
TEXT =
<TokenType.TEXT: 129>
MEDIUMTEXT =
<TokenType.MEDIUMTEXT: 130>
LONGTEXT =
<TokenType.LONGTEXT: 131>
BLOB =
<TokenType.BLOB: 132>
MEDIUMBLOB =
<TokenType.MEDIUMBLOB: 133>
LONGBLOB =
<TokenType.LONGBLOB: 134>
TINYBLOB =
<TokenType.TINYBLOB: 135>
TINYTEXT =
<TokenType.TINYTEXT: 136>
NAME =
<TokenType.NAME: 137>
BINARY =
<TokenType.BINARY: 138>
VARBINARY =
<TokenType.VARBINARY: 139>
JSON =
<TokenType.JSON: 140>
JSONB =
<TokenType.JSONB: 141>
TIME =
<TokenType.TIME: 142>
TIMETZ =
<TokenType.TIMETZ: 143>
TIME_NS =
<TokenType.TIME_NS: 144>
TIMESTAMP =
<TokenType.TIMESTAMP: 145>
TIMESTAMPTZ =
<TokenType.TIMESTAMPTZ: 146>
TIMESTAMPLTZ =
<TokenType.TIMESTAMPLTZ: 147>
TIMESTAMPNTZ =
<TokenType.TIMESTAMPNTZ: 148>
TIMESTAMP_S =
<TokenType.TIMESTAMP_S: 149>
TIMESTAMP_MS =
<TokenType.TIMESTAMP_MS: 150>
TIMESTAMP_NS =
<TokenType.TIMESTAMP_NS: 151>
DATETIME =
<TokenType.DATETIME: 152>
DATETIME2 =
<TokenType.DATETIME2: 153>
DATETIME64 =
<TokenType.DATETIME64: 154>
SMALLDATETIME =
<TokenType.SMALLDATETIME: 155>
DATE =
<TokenType.DATE: 156>
DATE32 =
<TokenType.DATE32: 157>
INT4RANGE =
<TokenType.INT4RANGE: 158>
INT4MULTIRANGE =
<TokenType.INT4MULTIRANGE: 159>
INT8RANGE =
<TokenType.INT8RANGE: 160>
INT8MULTIRANGE =
<TokenType.INT8MULTIRANGE: 161>
NUMRANGE =
<TokenType.NUMRANGE: 162>
NUMMULTIRANGE =
<TokenType.NUMMULTIRANGE: 163>
TSRANGE =
<TokenType.TSRANGE: 164>
TSMULTIRANGE =
<TokenType.TSMULTIRANGE: 165>
TSTZRANGE =
<TokenType.TSTZRANGE: 166>
TSTZMULTIRANGE =
<TokenType.TSTZMULTIRANGE: 167>
DATERANGE =
<TokenType.DATERANGE: 168>
DATEMULTIRANGE =
<TokenType.DATEMULTIRANGE: 169>
UUID =
<TokenType.UUID: 170>
GEOGRAPHY =
<TokenType.GEOGRAPHY: 171>
GEOGRAPHYPOINT =
<TokenType.GEOGRAPHYPOINT: 172>
NULLABLE =
<TokenType.NULLABLE: 173>
GEOMETRY =
<TokenType.GEOMETRY: 174>
POINT =
<TokenType.POINT: 175>
RING =
<TokenType.RING: 176>
LINESTRING =
<TokenType.LINESTRING: 177>
LOCALTIME =
<TokenType.LOCALTIME: 178>
LOCALTIMESTAMP =
<TokenType.LOCALTIMESTAMP: 179>
SYSTIMESTAMP =
<TokenType.SYSTIMESTAMP: 180>
MULTILINESTRING =
<TokenType.MULTILINESTRING: 181>
POLYGON =
<TokenType.POLYGON: 182>
MULTIPOLYGON =
<TokenType.MULTIPOLYGON: 183>
HLLSKETCH =
<TokenType.HLLSKETCH: 184>
HSTORE =
<TokenType.HSTORE: 185>
SUPER =
<TokenType.SUPER: 186>
SERIAL =
<TokenType.SERIAL: 187>
SMALLSERIAL =
<TokenType.SMALLSERIAL: 188>
BIGSERIAL =
<TokenType.BIGSERIAL: 189>
XML =
<TokenType.XML: 190>
YEAR =
<TokenType.YEAR: 191>
USERDEFINED =
<TokenType.USERDEFINED: 192>
MONEY =
<TokenType.MONEY: 193>
SMALLMONEY =
<TokenType.SMALLMONEY: 194>
ROWVERSION =
<TokenType.ROWVERSION: 195>
IMAGE =
<TokenType.IMAGE: 196>
VARIANT =
<TokenType.VARIANT: 197>
OBJECT =
<TokenType.OBJECT: 198>
INET =
<TokenType.INET: 199>
IPADDRESS =
<TokenType.IPADDRESS: 200>
IPPREFIX =
<TokenType.IPPREFIX: 201>
IPV4 =
<TokenType.IPV4: 202>
IPV6 =
<TokenType.IPV6: 203>
ENUM =
<TokenType.ENUM: 204>
ENUM8 =
<TokenType.ENUM8: 205>
ENUM16 =
<TokenType.ENUM16: 206>
FIXEDSTRING =
<TokenType.FIXEDSTRING: 207>
LOWCARDINALITY =
<TokenType.LOWCARDINALITY: 208>
NESTED =
<TokenType.NESTED: 209>
AGGREGATEFUNCTION =
<TokenType.AGGREGATEFUNCTION: 210>
SIMPLEAGGREGATEFUNCTION =
<TokenType.SIMPLEAGGREGATEFUNCTION: 211>
TDIGEST =
<TokenType.TDIGEST: 212>
UNKNOWN =
<TokenType.UNKNOWN: 213>
VECTOR =
<TokenType.VECTOR: 214>
DYNAMIC =
<TokenType.DYNAMIC: 215>
VOID =
<TokenType.VOID: 216>
ALIAS =
<TokenType.ALIAS: 217>
ALTER =
<TokenType.ALTER: 218>
ALL =
<TokenType.ALL: 219>
ANTI =
<TokenType.ANTI: 220>
ANY =
<TokenType.ANY: 221>
APPLY =
<TokenType.APPLY: 222>
ARRAY =
<TokenType.ARRAY: 223>
ASC =
<TokenType.ASC: 224>
ASOF =
<TokenType.ASOF: 225>
ATTACH =
<TokenType.ATTACH: 226>
AUTO_INCREMENT =
<TokenType.AUTO_INCREMENT: 227>
BEGIN =
<TokenType.BEGIN: 228>
BETWEEN =
<TokenType.BETWEEN: 229>
BULK_COLLECT_INTO =
<TokenType.BULK_COLLECT_INTO: 230>
CACHE =
<TokenType.CACHE: 231>
CASE =
<TokenType.CASE: 232>
CHARACTER_SET =
<TokenType.CHARACTER_SET: 233>
CLUSTER_BY =
<TokenType.CLUSTER_BY: 234>
COLLATE =
<TokenType.COLLATE: 235>
COMMAND =
<TokenType.COMMAND: 236>
COMMENT =
<TokenType.COMMENT: 237>
COMMIT =
<TokenType.COMMIT: 238>
CONNECT_BY =
<TokenType.CONNECT_BY: 239>
CONSTRAINT =
<TokenType.CONSTRAINT: 240>
COPY =
<TokenType.COPY: 241>
CREATE =
<TokenType.CREATE: 242>
CROSS =
<TokenType.CROSS: 243>
CUBE =
<TokenType.CUBE: 244>
CURRENT_DATE =
<TokenType.CURRENT_DATE: 245>
CURRENT_DATETIME =
<TokenType.CURRENT_DATETIME: 246>
CURRENT_SCHEMA =
<TokenType.CURRENT_SCHEMA: 247>
CURRENT_TIME =
<TokenType.CURRENT_TIME: 248>
CURRENT_TIMESTAMP =
<TokenType.CURRENT_TIMESTAMP: 249>
CURRENT_USER =
<TokenType.CURRENT_USER: 250>
CURRENT_USER_ID =
<TokenType.CURRENT_USER_ID: 251>
CURRENT_ROLE =
<TokenType.CURRENT_ROLE: 252>
CURRENT_CATALOG =
<TokenType.CURRENT_CATALOG: 253>
DECLARE =
<TokenType.DECLARE: 254>
DEFAULT =
<TokenType.DEFAULT: 255>
DELETE =
<TokenType.DELETE: 256>
DESC =
<TokenType.DESC: 257>
DESCRIBE =
<TokenType.DESCRIBE: 258>
DETACH =
<TokenType.DETACH: 259>
DICTIONARY =
<TokenType.DICTIONARY: 260>
DISTINCT =
<TokenType.DISTINCT: 261>
DISTRIBUTE_BY =
<TokenType.DISTRIBUTE_BY: 262>
DIV =
<TokenType.DIV: 263>
DROP =
<TokenType.DROP: 264>
ELSE =
<TokenType.ELSE: 265>
END =
<TokenType.END: 266>
ESCAPE =
<TokenType.ESCAPE: 267>
EXCEPT =
<TokenType.EXCEPT: 268>
EXECUTE =
<TokenType.EXECUTE: 269>
EXISTS =
<TokenType.EXISTS: 270>
FALSE =
<TokenType.FALSE: 271>
FETCH =
<TokenType.FETCH: 272>
FILE =
<TokenType.FILE: 273>
FILE_FORMAT =
<TokenType.FILE_FORMAT: 274>
FILTER =
<TokenType.FILTER: 275>
FINAL =
<TokenType.FINAL: 276>
FIRST =
<TokenType.FIRST: 277>
FOR =
<TokenType.FOR: 278>
FORCE =
<TokenType.FORCE: 279>
FOREIGN_KEY =
<TokenType.FOREIGN_KEY: 280>
FORMAT =
<TokenType.FORMAT: 281>
FROM =
<TokenType.FROM: 282>
FULL =
<TokenType.FULL: 283>
FUNCTION =
<TokenType.FUNCTION: 284>
GET =
<TokenType.GET: 285>
GLOB =
<TokenType.GLOB: 286>
GLOBAL =
<TokenType.GLOBAL: 287>
GRANT =
<TokenType.GRANT: 288>
GROUP_BY =
<TokenType.GROUP_BY: 289>
GROUPING_SETS =
<TokenType.GROUPING_SETS: 290>
HAVING =
<TokenType.HAVING: 291>
HINT =
<TokenType.HINT: 292>
IGNORE =
<TokenType.IGNORE: 293>
ILIKE =
<TokenType.ILIKE: 294>
IN =
<TokenType.IN: 295>
INDEX =
<TokenType.INDEX: 296>
INDEXED_BY =
<TokenType.INDEXED_BY: 297>
INNER =
<TokenType.INNER: 298>
INSERT =
<TokenType.INSERT: 299>
INSTALL =
<TokenType.INSTALL: 300>
INTEGRATION =
<TokenType.INTEGRATION: 301>
INTERSECT =
<TokenType.INTERSECT: 302>
INTERVAL =
<TokenType.INTERVAL: 303>
INTO =
<TokenType.INTO: 304>
INTRODUCER =
<TokenType.INTRODUCER: 305>
IRLIKE =
<TokenType.IRLIKE: 306>
IS =
<TokenType.IS: 307>
ISNULL =
<TokenType.ISNULL: 308>
JOIN =
<TokenType.JOIN: 309>
JOIN_MARKER =
<TokenType.JOIN_MARKER: 310>
KEEP =
<TokenType.KEEP: 311>
KEY =
<TokenType.KEY: 312>
KILL =
<TokenType.KILL: 313>
LANGUAGE =
<TokenType.LANGUAGE: 314>
LATERAL =
<TokenType.LATERAL: 315>
LEFT =
<TokenType.LEFT: 316>
LIKE =
<TokenType.LIKE: 317>
LIMIT =
<TokenType.LIMIT: 318>
LIST =
<TokenType.LIST: 319>
LOAD =
<TokenType.LOAD: 320>
LOCK =
<TokenType.LOCK: 321>
MAP =
<TokenType.MAP: 322>
MATCH =
<TokenType.MATCH: 323>
MATCH_CONDITION =
<TokenType.MATCH_CONDITION: 324>
MATCH_RECOGNIZE =
<TokenType.MATCH_RECOGNIZE: 325>
MEMBER_OF =
<TokenType.MEMBER_OF: 326>
MERGE =
<TokenType.MERGE: 327>
MOD =
<TokenType.MOD: 328>
MODEL =
<TokenType.MODEL: 329>
NATURAL =
<TokenType.NATURAL: 330>
NEXT =
<TokenType.NEXT: 331>
NOTHING =
<TokenType.NOTHING: 332>
NOTNULL =
<TokenType.NOTNULL: 333>
NULL =
<TokenType.NULL: 334>
OBJECT_IDENTIFIER =
<TokenType.OBJECT_IDENTIFIER: 335>
OFFSET =
<TokenType.OFFSET: 336>
ON =
<TokenType.ON: 337>
ONLY =
<TokenType.ONLY: 338>
OPERATOR =
<TokenType.OPERATOR: 339>
ORDER_BY =
<TokenType.ORDER_BY: 340>
ORDER_SIBLINGS_BY =
<TokenType.ORDER_SIBLINGS_BY: 341>
ORDERED =
<TokenType.ORDERED: 342>
ORDINALITY =
<TokenType.ORDINALITY: 343>
OUT =
<TokenType.OUT: 344>
INOUT =
<TokenType.INOUT: 345>
OUTER =
<TokenType.OUTER: 346>
OVER =
<TokenType.OVER: 347>
OVERLAPS =
<TokenType.OVERLAPS: 348>
OVERWRITE =
<TokenType.OVERWRITE: 349>
PACKAGE =
<TokenType.PACKAGE: 350>
PARTITION =
<TokenType.PARTITION: 351>
PARTITION_BY =
<TokenType.PARTITION_BY: 352>
PERCENT =
<TokenType.PERCENT: 353>
PIVOT =
<TokenType.PIVOT: 354>
PLACEHOLDER =
<TokenType.PLACEHOLDER: 355>
POLICY =
<TokenType.POLICY: 356>
POOL =
<TokenType.POOL: 357>
POSITIONAL =
<TokenType.POSITIONAL: 358>
PRAGMA =
<TokenType.PRAGMA: 359>
PREWHERE =
<TokenType.PREWHERE: 360>
PRIMARY_KEY =
<TokenType.PRIMARY_KEY: 361>
PROCEDURE =
<TokenType.PROCEDURE: 362>
PROPERTIES =
<TokenType.PROPERTIES: 363>
PSEUDO_TYPE =
<TokenType.PSEUDO_TYPE: 364>
PUT =
<TokenType.PUT: 365>
QUALIFY =
<TokenType.QUALIFY: 366>
QUOTE =
<TokenType.QUOTE: 367>
QDCOLON =
<TokenType.QDCOLON: 368>
RANGE =
<TokenType.RANGE: 369>
RECURSIVE =
<TokenType.RECURSIVE: 370>
REFRESH =
<TokenType.REFRESH: 371>
RENAME =
<TokenType.RENAME: 372>
REPLACE =
<TokenType.REPLACE: 373>
RETURNING =
<TokenType.RETURNING: 374>
REVOKE =
<TokenType.REVOKE: 375>
REFERENCES =
<TokenType.REFERENCES: 376>
RIGHT =
<TokenType.RIGHT: 377>
RLIKE =
<TokenType.RLIKE: 378>
ROLE =
<TokenType.ROLE: 379>
ROLLBACK =
<TokenType.ROLLBACK: 380>
ROLLUP =
<TokenType.ROLLUP: 381>
ROW =
<TokenType.ROW: 382>
ROWS =
<TokenType.ROWS: 383>
RULE =
<TokenType.RULE: 384>
SELECT =
<TokenType.SELECT: 385>
SEMI =
<TokenType.SEMI: 386>
SEPARATOR =
<TokenType.SEPARATOR: 387>
SEQUENCE =
<TokenType.SEQUENCE: 388>
SERDE_PROPERTIES =
<TokenType.SERDE_PROPERTIES: 389>
SET =
<TokenType.SET: 390>
SETTINGS =
<TokenType.SETTINGS: 391>
SHOW =
<TokenType.SHOW: 392>
SIMILAR_TO =
<TokenType.SIMILAR_TO: 393>
SOME =
<TokenType.SOME: 394>
SORT_BY =
<TokenType.SORT_BY: 395>
SOUNDS_LIKE =
<TokenType.SOUNDS_LIKE: 396>
SQL_SECURITY =
<TokenType.SQL_SECURITY: 397>
START_WITH =
<TokenType.START_WITH: 398>
STORAGE_INTEGRATION =
<TokenType.STORAGE_INTEGRATION: 399>
STRAIGHT_JOIN =
<TokenType.STRAIGHT_JOIN: 400>
STRUCT =
<TokenType.STRUCT: 401>
SUMMARIZE =
<TokenType.SUMMARIZE: 402>
TABLE_SAMPLE =
<TokenType.TABLE_SAMPLE: 403>
TAG =
<TokenType.TAG: 404>
TEMPORARY =
<TokenType.TEMPORARY: 405>
TOP =
<TokenType.TOP: 406>
THEN =
<TokenType.THEN: 407>
TRUE =
<TokenType.TRUE: 408>
TRUNCATE =
<TokenType.TRUNCATE: 409>
TRIGGER =
<TokenType.TRIGGER: 410>
TYPE =
<TokenType.TYPE: 411>
UNCACHE =
<TokenType.UNCACHE: 412>
UNION =
<TokenType.UNION: 413>
UNNEST =
<TokenType.UNNEST: 414>
UNPIVOT =
<TokenType.UNPIVOT: 415>
UPDATE =
<TokenType.UPDATE: 416>
USE =
<TokenType.USE: 417>
USING =
<TokenType.USING: 418>
VALUES =
<TokenType.VALUES: 419>
VARIADIC =
<TokenType.VARIADIC: 420>
VIEW =
<TokenType.VIEW: 421>
SEMANTIC_VIEW =
<TokenType.SEMANTIC_VIEW: 422>
VOLATILE =
<TokenType.VOLATILE: 423>
VOLUME =
<TokenType.VOLUME: 424>
WHEN =
<TokenType.WHEN: 425>
WHERE =
<TokenType.WHERE: 426>
WINDOW =
<TokenType.WINDOW: 427>
WITH =
<TokenType.WITH: 428>
UNIQUE =
<TokenType.UNIQUE: 429>
UTC_DATE =
<TokenType.UTC_DATE: 430>
UTC_TIME =
<TokenType.UTC_TIME: 431>
UTC_TIMESTAMP =
<TokenType.UTC_TIMESTAMP: 432>
VERSION_SNAPSHOT =
<TokenType.VERSION_SNAPSHOT: 433>
TIMESTAMP_SNAPSHOT =
<TokenType.TIMESTAMP_SNAPSHOT: 434>
OPTION =
<TokenType.OPTION: 435>
SINK =
<TokenType.SINK: 436>
SOURCE =
<TokenType.SOURCE: 437>
ANALYZE =
<TokenType.ANALYZE: 438>
NAMESPACE =
<TokenType.NAMESPACE: 439>
EXPORT =
<TokenType.EXPORT: 440>
HIVE_TOKEN_STREAM =
<TokenType.HIVE_TOKEN_STREAM: 441>
SENTINEL =
<TokenType.SENTINEL: 442>
class
Token:
class Token:
    """A single token: its type, its text, where it sits in the input, and its comments.

    Instances use `__slots__`, so only the attributes named in `_attrs` exist.
    A token is truthy unless its type is `TokenType.SENTINEL`.
    """

    # mypyc doesn't expose slots, so the names are also kept in a class variable
    # that `__repr__` can iterate over.
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        """Store the given fields on the token.

        Args:
            token_type: The kind of token.
            text: The token's text.
            line: Line number associated with the token (defaults to 1).
            col: Column associated with the token (defaults to 1).
            start: Start offset of the token (defaults to 0).
            end: End offset of the token (defaults to 0).
            comments: Comments attached to the token. The list is stored as-is
                (not copied); when None, a new empty list is created.
        """
        if comments is None:
            comments = []

        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = comments

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        text = str(number)
        return cls(TokenType.NUMBER, text)

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __bool__(self) -> bool:
        """Return False only for the sentinel token type."""
        return self.token_type != TokenType.SENTINEL

    def __repr__(self) -> str:
        """Render every attribute in `_attrs` order as ``<Token name: value, ...>``."""
        rendered = []
        for name in self._attrs:
            if name == "token_type":
                # Spell the type out by member name rather than relying on its own repr.
                rendered.append(f"{name}: TokenType.{self.token_type.name}")
            else:
                rendered.append(f"{name}: {getattr(self, name)}")

        attributes = ", ".join(rendered)
        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
def __init__(
    self,
    token_type: TokenType,
    text: str,
    line: int = 1,
    col: int = 1,
    start: int = 0,
    end: int = 0,
    comments: list[str] | None = None,
) -> None:
    """Store the token's type, text, position fields and comments.

    When `comments` is None a new empty list is created for this token;
    otherwise the given list object is stored as-is (not copied).
    """
    if comments is None:
        comments = []

    self.token_type, self.text = token_type, text
    self.line, self.col = line, col
    self.start, self.end = start, end
    self.comments = comments
@classmethod
def number(cls, number: int) -> Token:
    """Build a NUMBER token whose text is `str(number)`."""
    text = str(number)
    return cls(TokenType.NUMBER, text)
Returns a NUMBER token with number as its text.
@classmethod
def string(cls, string: str) -> Token:
    """Build a STRING token that carries `string` as its text."""
    token_type = TokenType.STRING
    return cls(token_type, string)
Returns a STRING token with string as its text.
@classmethod
def identifier(cls, identifier: str) -> Token:
    """Build an IDENTIFIER token that carries `identifier` as its text."""
    token_type = TokenType.IDENTIFIER
    return cls(token_type, identifier)
Returns an IDENTIFIER token with identifier as its text.
class
TokenizerCore:
536class TokenizerCore: 537 __slots__ = ( 538 "sql", 539 "size", 540 "tokens", 541 "_start", 542 "_current", 543 "_line", 544 "_col", 545 "_comments", 546 "_char", 547 "_end", 548 "_peek", 549 "_prev_token_line", 550 "single_tokens", 551 "keywords", 552 "quotes", 553 "format_strings", 554 "identifiers", 555 "comments", 556 "string_escapes", 557 "byte_string_escapes", 558 "identifier_escapes", 559 "escape_follow_chars", 560 "commands", 561 "command_prefix_tokens", 562 "nested_comments", 563 "hint_start", 564 "tokens_preceding_hint", 565 "has_bit_strings", 566 "has_hex_strings", 567 "numeric_literals", 568 "var_single_tokens", 569 "string_escapes_allowed_in_raw_strings", 570 "heredoc_tag_is_identifier", 571 "heredoc_string_alternative", 572 "keyword_trie", 573 "numbers_can_be_underscore_separated", 574 "numbers_can_have_decimals", 575 "identifiers_can_start_with_digit", 576 "unescaped_sequences", 577 ) 578 579 def __init__( 580 self, 581 single_tokens: dict[str, TokenType], 582 keywords: dict[str, TokenType], 583 quotes: dict[str, str], 584 format_strings: dict[str, tuple[str, TokenType]], 585 identifiers: dict[str, str], 586 comments: dict[str, str | None], 587 string_escapes: set[str], 588 byte_string_escapes: set[str], 589 identifier_escapes: set[str], 590 escape_follow_chars: set[str], 591 commands: set[TokenType], 592 command_prefix_tokens: set[TokenType], 593 nested_comments: bool, 594 hint_start: str, 595 tokens_preceding_hint: set[TokenType], 596 has_bit_strings: bool, 597 has_hex_strings: bool, 598 numeric_literals: dict[str, str], 599 var_single_tokens: set[str], 600 string_escapes_allowed_in_raw_strings: bool, 601 heredoc_tag_is_identifier: bool, 602 heredoc_string_alternative: TokenType, 603 keyword_trie: dict, 604 numbers_can_be_underscore_separated: bool, 605 numbers_can_have_decimals: bool, 606 identifiers_can_start_with_digit: bool, 607 unescaped_sequences: dict[str, str], 608 ) -> None: 609 self.single_tokens = single_tokens 610 self.keywords = 
keywords 611 self.quotes = quotes 612 self.format_strings = format_strings 613 self.identifiers = identifiers 614 self.comments = comments 615 self.string_escapes = string_escapes 616 self.byte_string_escapes = byte_string_escapes 617 self.identifier_escapes = identifier_escapes 618 self.escape_follow_chars = escape_follow_chars 619 self.commands = commands 620 self.command_prefix_tokens = command_prefix_tokens 621 self.nested_comments = nested_comments 622 self.hint_start = hint_start 623 self.tokens_preceding_hint = tokens_preceding_hint 624 self.has_bit_strings = has_bit_strings 625 self.has_hex_strings = has_hex_strings 626 self.numeric_literals = numeric_literals 627 self.var_single_tokens = var_single_tokens 628 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 629 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 630 self.heredoc_string_alternative = heredoc_string_alternative 631 self.keyword_trie = keyword_trie 632 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 633 self.numbers_can_have_decimals = numbers_can_have_decimals 634 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 635 self.unescaped_sequences = unescaped_sequences 636 self.sql = "" 637 self.size = 0 638 self.tokens: list[Token] = [] 639 self._start = 0 640 self._current = 0 641 self._line = 1 642 self._col = 0 643 self._comments: list[str] = [] 644 self._char = "" 645 self._end = False 646 self._peek = "" 647 self._prev_token_line = -1 648 649 def reset(self) -> None: 650 self.sql = "" 651 self.size = 0 652 self.tokens = [] 653 self._start = 0 654 self._current = 0 655 self._line = 1 656 self._col = 0 657 self._comments = [] 658 self._char = "" 659 self._end = False 660 self._peek = "" 661 self._prev_token_line = -1 662 663 def tokenize(self, sql: str) -> list[Token]: 664 """Returns a list of tokens corresponding to the SQL string `sql`.""" 665 self.reset() 666 self.sql = sql 667 self.size = 
len(sql) 668 669 try: 670 self._scan() 671 except Exception as e: 672 start = max(self._current - 50, 0) 673 end = min(self._current + 50, self.size - 1) 674 context = self.sql[start:end] 675 raise TokenError(f"Error tokenizing '{context}'") from e 676 677 return self.tokens 678 679 def _scan(self, check_semicolon: bool = False) -> None: 680 identifiers = self.identifiers 681 digit_chars = _DIGIT_CHARS 682 683 while self.size and not self._end: 684 current = self._current 685 686 # Skip spaces here rather than iteratively calling advance() for performance reasons 687 while current < self.size: 688 char = self.sql[current] 689 690 if char == " " or char == "\t": 691 current += 1 692 else: 693 break 694 695 offset = current - self._current if current > self._current else 1 696 697 self._start = current 698 self._advance(offset) 699 700 if not self._char.isspace(): 701 if self._char in digit_chars: 702 self._scan_number() 703 elif self._char in identifiers: 704 self._scan_identifier(identifiers[self._char]) 705 else: 706 self._scan_keywords() 707 708 if check_semicolon and self._peek == ";": 709 break 710 711 if self.tokens and self._comments: 712 self.tokens[-1].comments.extend(self._comments) 713 714 def _chars(self, size: int) -> str: 715 if size == 1: 716 return self._char 717 718 start = self._current - 1 719 end = start + size 720 721 return self.sql[start:end] if end <= self.size else "" 722 723 def _advance(self, i: int = 1, alnum: bool = False) -> None: 724 char = self._char 725 726 if char == "\n" or char == "\r": 727 # Ensures we don't count an extra line if we get a \r\n line break sequence 728 if not (char == "\r" and self._peek == "\n"): 729 self._col = i 730 self._line += 1 731 else: 732 self._col += i 733 734 self._current += i 735 sql = self.sql 736 size = self.size 737 self._end = self._current >= size 738 self._char = sql[self._current - 1] 739 self._peek = "" if self._end else sql[self._current] 740 741 if alnum and self._char.isalnum(): 742 # 
Cache to local variables instead of attributes for better performance 743 _col = self._col 744 _current = self._current 745 _end = self._end 746 _peek = self._peek 747 748 while _peek.isalnum(): 749 _col += 1 750 _current += 1 751 _end = _current >= size 752 _peek = "" if _end else sql[_current] 753 754 self._col = _col 755 self._current = _current 756 self._end = _end 757 self._peek = _peek 758 self._char = sql[_current - 1] 759 760 @property 761 def _text(self) -> str: 762 return self.sql[self._start : self._current] 763 764 def _add(self, token_type: TokenType, text: str | None = None) -> None: 765 self._prev_token_line = self._line 766 767 if self._comments and token_type == TokenType.SEMICOLON and self.tokens: 768 self.tokens[-1].comments.extend(self._comments) 769 self._comments = [] 770 771 if text is None: 772 text = self.sql[self._start : self._current] 773 774 self.tokens.append( 775 Token( 776 token_type, 777 text=text, 778 line=self._line, 779 col=self._col, 780 start=self._start, 781 end=self._current - 1, 782 comments=self._comments, 783 ) 784 ) 785 self._comments = [] 786 787 # If we have either a semicolon or a begin token before the command's token, we'll parse 788 # whatever follows the command's token as a string 789 if ( 790 token_type in self.commands 791 and self._peek != ";" 792 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens) 793 ): 794 start = self._current 795 tokens = len(self.tokens) 796 self._scan(check_semicolon=True) 797 self.tokens = self.tokens[:tokens] 798 text = self.sql[start : self._current].strip() 799 if text: 800 self._add(TokenType.STRING, text) 801 802 def _scan_keywords(self) -> None: 803 sql = self.sql 804 sql_size = self.size 805 single_tokens = self.single_tokens 806 char_upper = _CHAR_UPPER 807 size = 0 808 word = None 809 chars = self._char 810 char = chars 811 prev_space = False 812 skip = False 813 trie = self.keyword_trie 814 single_token = char in single_tokens 815 816 while 
chars: 817 if not skip: 818 sub = trie.get(char_upper.get(char, char)) 819 if sub is None: 820 break 821 trie = sub 822 if 0 in trie: 823 word = chars 824 825 end = self._current + size 826 size += 1 827 828 if end < sql_size: 829 char = sql[end] 830 single_token = single_token or char in single_tokens 831 is_space = char.isspace() 832 833 if not is_space or not prev_space: 834 if is_space: 835 char = " " 836 chars += char 837 prev_space = is_space 838 skip = False 839 else: 840 skip = True 841 else: 842 char = "" 843 break 844 845 if word: 846 if self._scan_string(word): 847 return 848 if self._scan_comment(word): 849 return 850 if prev_space or single_token or not char: 851 self._advance(size - 1) 852 word = word.upper() 853 self._add(self.keywords[word], text=word) 854 return 855 856 if self._char in single_tokens: 857 self._add(single_tokens[self._char], text=self._char) 858 return 859 860 self._scan_var() 861 862 def _scan_comment(self, comment_start: str) -> bool: 863 if comment_start not in self.comments: 864 return False 865 866 comment_start_line = self._line 867 comment_start_size = len(comment_start) 868 comment_end = self.comments[comment_start] 869 870 if comment_end: 871 # Skip the comment's start delimiter 872 self._advance(comment_start_size) 873 874 comment_count = 1 875 comment_end_size = len(comment_end) 876 nested_comments = self.nested_comments 877 878 while not self._end: 879 if self._chars(comment_end_size) == comment_end: 880 comment_count -= 1 881 if not comment_count: 882 break 883 884 self._advance(alnum=True) 885 886 # Nested comments are allowed by some dialects, e.g. 
databricks, duckdb, postgres 887 if ( 888 nested_comments 889 and not self._end 890 and self._chars(comment_end_size) == comment_start 891 ): 892 self._advance(comment_start_size) 893 comment_count += 1 894 895 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 896 self._advance(comment_end_size - 1) 897 else: 898 _peek = self._peek 899 while not self._end and _peek != "\n" and _peek != "\r": 900 self._advance(alnum=True) 901 _peek = self._peek 902 self._comments.append(self._text[comment_start_size:]) 903 904 if ( 905 comment_start == self.hint_start 906 and self.tokens 907 and self.tokens[-1].token_type in self.tokens_preceding_hint 908 ): 909 self._add(TokenType.HINT) 910 911 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 912 # Multiple consecutive comments are preserved by appending them to the current comments list. 913 if comment_start_line == self._prev_token_line: 914 self.tokens[-1].comments.extend(self._comments) 915 self._comments = [] 916 self._prev_token_line = self._line 917 918 return True 919 920 def _scan_number(self) -> None: 921 if self._char == "0": 922 peek = _CHAR_UPPER.get(self._peek, self._peek) 923 if peek == "B": 924 return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER) 925 elif peek == "X": 926 return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER) 927 928 decimal = False 929 scientific = 0 930 numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated 931 single_tokens = self.single_tokens 932 keywords = self.keywords 933 numeric_literals = self.numeric_literals 934 identifiers_can_start_with_digit = self.identifiers_can_start_with_digit 935 936 is_underscore_separated: bool = False 937 number_text: str = "" 938 numeric_literal: str = "" 939 numeric_type: TokenType | None = None 940 941 while True: 942 if self._peek in _DIGIT_CHARS: 943 # Batch consecutive digits: scan ahead to find how 
many 944 sql = self.sql 945 end = self._current + 1 946 size = self.size 947 while end < size and sql[end] in _DIGIT_CHARS: 948 end += 1 949 self._advance(end - self._current) 950 elif self._peek == "." and not decimal: 951 if ( 952 self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 953 ) or not self.numbers_can_have_decimals: 954 break 955 decimal = True 956 self._advance() 957 elif self._peek in ("-", "+") and scientific == 1: 958 # Only consume +/- if followed by a digit 959 if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS: 960 scientific += 1 961 self._advance() 962 else: 963 break 964 elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific: 965 scientific += 1 966 self._advance() 967 elif self._peek == "_" and numbers_can_be_underscore_separated: 968 is_underscore_separated = True 969 self._advance() 970 elif self._peek.isidentifier(): 971 number_text = self._text 972 973 while self._peek and not self._peek.isspace() and self._peek not in single_tokens: 974 numeric_literal += self._peek 975 self._advance() 976 977 numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), "")) 978 979 if numeric_type: 980 break 981 elif identifiers_can_start_with_digit: 982 return self._add(TokenType.VAR) 983 984 self._advance(-len(numeric_literal)) 985 break 986 else: 987 break 988 989 number_text = number_text or self.sql[self._start : self._current] 990 991 # Normalize inputs such as 100_000 to 100000 992 if is_underscore_separated: 993 number_text = number_text.replace("_", "") 994 995 self._add(TokenType.NUMBER, number_text) 996 997 # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts 998 if numeric_type: 999 self._add(TokenType.DCOLON, "::") 1000 self._add(numeric_type, numeric_literal) 1001 1002 def _scan_bits(self) -> None: 1003 self._advance() 1004 value = self._extract_value() 1005 try: 1006 # If `value` can't be converted to a binary, fallback to tokenizing it as 
an identifier 1007 int(value, 2) 1008 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1009 except ValueError: 1010 self._add(TokenType.IDENTIFIER) 1011 1012 def _scan_hex(self) -> None: 1013 self._advance() 1014 value = self._extract_value() 1015 try: 1016 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1017 int(value, 16) 1018 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1019 except ValueError: 1020 self._add(TokenType.IDENTIFIER) 1021 1022 def _extract_value(self) -> str: 1023 single_tokens = self.single_tokens 1024 1025 while True: 1026 char = self._peek.strip() 1027 if char and char not in single_tokens: 1028 self._advance(alnum=True) 1029 else: 1030 break 1031 1032 return self._text 1033 1034 def _scan_string(self, start: str) -> bool: 1035 base = None 1036 token_type = TokenType.STRING 1037 1038 if start in self.quotes: 1039 end = self.quotes[start] 1040 elif start in self.format_strings: 1041 end, token_type = self.format_strings[start] 1042 1043 if token_type == TokenType.HEX_STRING: 1044 base = 16 1045 elif token_type == TokenType.BIT_STRING: 1046 base = 2 1047 elif token_type == TokenType.HEREDOC_STRING: 1048 self._advance() 1049 1050 if self._char == end: 1051 tag = "" 1052 else: 1053 tag = self._extract_string( 1054 end, 1055 raw_string=True, 1056 raise_unmatched=not self.heredoc_tag_is_identifier, 1057 ) 1058 1059 if ( 1060 tag 1061 and self.heredoc_tag_is_identifier 1062 and (self._end or tag.isdigit() or any(c.isspace() for c in tag)) 1063 ): 1064 if not self._end: 1065 self._advance(-1) 1066 1067 self._advance(-len(tag)) 1068 self._add(self.heredoc_string_alternative) 1069 return True 1070 1071 end = f"{start}{tag}{end}" 1072 else: 1073 return False 1074 1075 self._advance(len(start)) 1076 text = self._extract_string( 1077 end, 1078 escapes=( 1079 self.byte_string_escapes 1080 if token_type == TokenType.BYTE_STRING 1081 else self.string_escapes 1082 ), 1083 raw_string=token_type == 
TokenType.RAW_STRING, 1084 ) 1085 1086 if base and text: 1087 try: 1088 int(text, base) 1089 except Exception: 1090 raise TokenError( 1091 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1092 ) 1093 1094 self._add(token_type, text) 1095 return True 1096 1097 def _scan_identifier(self, identifier_end: str) -> None: 1098 self._advance() 1099 text = self._extract_string( 1100 identifier_end, escapes=self.identifier_escapes | {identifier_end} 1101 ) 1102 self._add(TokenType.IDENTIFIER, text) 1103 1104 def _scan_var(self) -> None: 1105 var_single_tokens = self.var_single_tokens 1106 single_tokens = self.single_tokens 1107 1108 while True: 1109 peek = self._peek 1110 if not peek or peek.isspace(): 1111 break 1112 if peek not in var_single_tokens and peek in single_tokens: 1113 break 1114 self._advance(alnum=True) 1115 1116 self._add( 1117 TokenType.VAR 1118 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1119 else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR) 1120 ) 1121 1122 def _extract_string( 1123 self, 1124 delimiter: str, 1125 escapes: set[str] | None = None, 1126 raw_string: bool = False, 1127 raise_unmatched: bool = True, 1128 ) -> str: 1129 text = "" 1130 delim_size = len(delimiter) 1131 escapes = self.string_escapes if escapes is None else escapes 1132 unescaped_sequences = self.unescaped_sequences 1133 escape_follow_chars = self.escape_follow_chars 1134 string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings 1135 quotes = self.quotes 1136 sql = self.sql 1137 1138 # use str.find() when the string is simple... no \ or other escapes 1139 if delim_size == 1: 1140 pos = self._current - 1 1141 end = sql.find(delimiter, pos) 1142 1143 if ( 1144 # the closing delimiter was found 1145 end != -1 1146 # there's no doubled delimiter (e.g. 
'' escape), or the delimiter isn't an escape char 1147 and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes) 1148 # no backslash in the string that would need escape processing 1149 and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1) 1150 ): 1151 newlines = sql.count("\n", pos, end) 1152 if newlines: 1153 self._line += newlines 1154 self._col = end - sql.rfind("\n", pos, end) 1155 else: 1156 self._col += end - pos 1157 1158 self._current = end + 1 1159 self._end = self._current >= self.size 1160 self._char = sql[end] 1161 self._peek = "" if self._end else sql[self._current] 1162 return sql[pos:end] 1163 1164 while True: 1165 if not raw_string and unescaped_sequences and self._peek and self._char in escapes: 1166 unescaped_sequence = unescaped_sequences.get(self._char + self._peek) 1167 if unescaped_sequence: 1168 self._advance(2) 1169 text += unescaped_sequence 1170 continue 1171 1172 is_valid_custom_escape = ( 1173 escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars 1174 ) 1175 1176 if ( 1177 (string_escapes_allowed_in_raw_strings or not raw_string) 1178 and self._char in escapes 1179 and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape) 1180 and (self._char not in quotes or self._char == self._peek) 1181 ): 1182 if self._peek == delimiter: 1183 text += self._peek 1184 elif is_valid_custom_escape and self._char != self._peek: 1185 text += self._peek 1186 else: 1187 text += self._char + self._peek 1188 1189 if self._current + 1 < self.size: 1190 self._advance(2) 1191 else: 1192 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1193 else: 1194 if self._chars(delim_size) == delimiter: 1195 if delim_size > 1: 1196 self._advance(delim_size - 1) 1197 break 1198 1199 if self._end: 1200 if not raise_unmatched: 1201 return text + self._char 1202 1203 raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}") 1204 
1205 current = self._current - 1 1206 self._advance(alnum=True) 1207 text += sql[current : self._current - 1] 1208 1209 return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], has_bit_strings: bool, has_hex_strings: bool, numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
579 def __init__( 580 self, 581 single_tokens: dict[str, TokenType], 582 keywords: dict[str, TokenType], 583 quotes: dict[str, str], 584 format_strings: dict[str, tuple[str, TokenType]], 585 identifiers: dict[str, str], 586 comments: dict[str, str | None], 587 string_escapes: set[str], 588 byte_string_escapes: set[str], 589 identifier_escapes: set[str], 590 escape_follow_chars: set[str], 591 commands: set[TokenType], 592 command_prefix_tokens: set[TokenType], 593 nested_comments: bool, 594 hint_start: str, 595 tokens_preceding_hint: set[TokenType], 596 has_bit_strings: bool, 597 has_hex_strings: bool, 598 numeric_literals: dict[str, str], 599 var_single_tokens: set[str], 600 string_escapes_allowed_in_raw_strings: bool, 601 heredoc_tag_is_identifier: bool, 602 heredoc_string_alternative: TokenType, 603 keyword_trie: dict, 604 numbers_can_be_underscore_separated: bool, 605 numbers_can_have_decimals: bool, 606 identifiers_can_start_with_digit: bool, 607 unescaped_sequences: dict[str, str], 608 ) -> None: 609 self.single_tokens = single_tokens 610 self.keywords = keywords 611 self.quotes = quotes 612 self.format_strings = format_strings 613 self.identifiers = identifiers 614 self.comments = comments 615 self.string_escapes = string_escapes 616 self.byte_string_escapes = byte_string_escapes 617 self.identifier_escapes = identifier_escapes 618 self.escape_follow_chars = escape_follow_chars 619 self.commands = commands 620 self.command_prefix_tokens = command_prefix_tokens 621 self.nested_comments = nested_comments 622 self.hint_start = hint_start 623 self.tokens_preceding_hint = tokens_preceding_hint 624 self.has_bit_strings = has_bit_strings 625 self.has_hex_strings = has_hex_strings 626 self.numeric_literals = numeric_literals 627 self.var_single_tokens = var_single_tokens 628 self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings 629 self.heredoc_tag_is_identifier = heredoc_tag_is_identifier 630 self.heredoc_string_alternative = 
heredoc_string_alternative 631 self.keyword_trie = keyword_trie 632 self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated 633 self.numbers_can_have_decimals = numbers_can_have_decimals 634 self.identifiers_can_start_with_digit = identifiers_can_start_with_digit 635 self.unescaped_sequences = unescaped_sequences 636 self.sql = "" 637 self.size = 0 638 self.tokens: list[Token] = [] 639 self._start = 0 640 self._current = 0 641 self._line = 1 642 self._col = 0 643 self._comments: list[str] = [] 644 self._char = "" 645 self._end = False 646 self._peek = "" 647 self._prev_token_line = -1
tokens: list[Token]
def tokenize(self, sql: str) -> list[Token]:
    """Returns a list of tokens corresponding to the SQL string `sql`.

    Any state left over from a previous call is discarded via `reset()` before scanning, so the
    same instance can be used for several inputs.

    Args:
        sql: The SQL text to tokenize.

    Returns:
        The instance's `tokens` list, as populated by the scan.

    Raises:
        TokenError: If scanning fails for any reason. The message quotes up to 50 characters of
            SQL on either side of the current scan position, and the original exception is
            chained as the cause.
    """
    self.reset()
    self.sql = sql
    self.size = len(sql)

    try:
        self._scan()
    except Exception as e:
        start = max(self._current - 50, 0)
        # Slice ends are exclusive, so clamp to `size` rather than `size - 1`; otherwise the
        # final character is dropped from the context, which is exactly where unterminated
        # strings and comments fail.
        end = min(self._current + 50, self.size)
        context = self.sql[start:end]
        raise TokenError(f"Error tokenizing '{context}'") from e

    return self.tokens
Returns a list of tokens corresponding to the SQL string `sql`.