Edit on GitHub

sqlglot.tokenizer_core

   1from __future__ import annotations
   2
   3import typing as t
   4from enum import IntEnum, auto
   5
   6from sqlglot.errors import TokenError
   7
   8# dict lookup is faster than .upper() and .isdigit()
   9_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
  10_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")
  11
  12
class TokenType(IntEnum):
    """Integer enumeration of every kind of token the tokenizer can emit.

    Values are assigned with `auto()` in declaration order, so inserting,
    removing or reordering members changes the integer value of every member
    declared after that point.
    """

    # punctuation and operators
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    AT_QMARK = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNDROP = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels (a Token whose type is SENTINEL evaluates as falsy, see Token.__bool__)
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the qualified member name, e.g. ``TokenType.SELECT``, instead of the integer value."""
        return f"TokenType.{self.name}"
 471
 472
 473class Token:
 474    # mypyc doesn't expose slots
 475    _attrs: t.ClassVar[tuple[str, ...]] = (
 476        "token_type",
 477        "text",
 478        "line",
 479        "col",
 480        "start",
 481        "end",
 482        "comments",
 483    )
 484    __slots__ = _attrs
 485
 486    @classmethod
 487    def number(cls, number: int) -> Token:
 488        """Returns a NUMBER token with `number` as its text."""
 489        return cls(TokenType.NUMBER, str(number))
 490
 491    @classmethod
 492    def string(cls, string: str) -> Token:
 493        """Returns a STRING token with `string` as its text."""
 494        return cls(TokenType.STRING, string)
 495
 496    @classmethod
 497    def identifier(cls, identifier: str) -> Token:
 498        """Returns an IDENTIFIER token with `identifier` as its text."""
 499        return cls(TokenType.IDENTIFIER, identifier)
 500
 501    @classmethod
 502    def var(cls, var: str) -> Token:
 503        """Returns an VAR token with `var` as its text."""
 504        return cls(TokenType.VAR, var)
 505
 506    def __init__(
 507        self,
 508        token_type: TokenType,
 509        text: str,
 510        line: int = 1,
 511        col: int = 1,
 512        start: int = 0,
 513        end: int = 0,
 514        comments: list[str] | None = None,
 515    ) -> None:
 516        self.token_type = token_type
 517        self.text = text
 518        self.line = line
 519        self.col = col
 520        self.start = start
 521        self.end = end
 522        self.comments = [] if comments is None else comments
 523
 524    def __bool__(self) -> bool:
 525        return self.token_type != TokenType.SENTINEL
 526
 527    def __repr__(self) -> str:
 528        attributes = ", ".join(
 529            f"{k}: TokenType.{self.token_type.name}"
 530            if k == "token_type"
 531            else f"{k}: {getattr(self, k)}"
 532            for k in self._attrs
 533        )
 534        return f"<Token {attributes}>"
 535
 536
 537class TokenizerCore:
    # Fixed attribute layout. The first twelve names (`sql` through
    # `_prev_token_line`) are the mutable scan state that `reset()` restores;
    # the remaining names hold the configuration values passed to `__init__`.
    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )
 579
 580    def __init__(
 581        self,
 582        single_tokens: dict[str, TokenType],
 583        keywords: dict[str, TokenType],
 584        quotes: dict[str, str],
 585        format_strings: dict[str, tuple[str, TokenType]],
 586        identifiers: dict[str, str],
 587        comments: dict[str, str | None],
 588        string_escapes: set[str],
 589        byte_string_escapes: set[str],
 590        identifier_escapes: set[str],
 591        escape_follow_chars: set[str],
 592        commands: set[TokenType],
 593        command_prefix_tokens: set[TokenType],
 594        nested_comments: bool,
 595        hint_start: str,
 596        tokens_preceding_hint: set[TokenType],
 597        has_bit_strings: bool,
 598        has_hex_strings: bool,
 599        numeric_literals: dict[str, str],
 600        var_single_tokens: set[str],
 601        string_escapes_allowed_in_raw_strings: bool,
 602        heredoc_tag_is_identifier: bool,
 603        heredoc_string_alternative: TokenType,
 604        keyword_trie: dict,
 605        numbers_can_be_underscore_separated: bool,
 606        numbers_can_have_decimals: bool,
 607        identifiers_can_start_with_digit: bool,
 608        unescaped_sequences: dict[str, str],
 609    ) -> None:
 610        self.single_tokens = single_tokens
 611        self.keywords = keywords
 612        self.quotes = quotes
 613        self.format_strings = format_strings
 614        self.identifiers = identifiers
 615        self.comments = comments
 616        self.string_escapes = string_escapes
 617        self.byte_string_escapes = byte_string_escapes
 618        self.identifier_escapes = identifier_escapes
 619        self.escape_follow_chars = escape_follow_chars
 620        self.commands = commands
 621        self.command_prefix_tokens = command_prefix_tokens
 622        self.nested_comments = nested_comments
 623        self.hint_start = hint_start
 624        self.tokens_preceding_hint = tokens_preceding_hint
 625        self.has_bit_strings = has_bit_strings
 626        self.has_hex_strings = has_hex_strings
 627        self.numeric_literals = numeric_literals
 628        self.var_single_tokens = var_single_tokens
 629        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 630        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 631        self.heredoc_string_alternative = heredoc_string_alternative
 632        self.keyword_trie = keyword_trie
 633        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 634        self.numbers_can_have_decimals = numbers_can_have_decimals
 635        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 636        self.unescaped_sequences = unescaped_sequences
 637        self.sql = ""
 638        self.size = 0
 639        self.tokens: list[Token] = []
 640        self._start = 0
 641        self._current = 0
 642        self._line = 1
 643        self._col = 0
 644        self._comments: list[str] = []
 645        self._char = ""
 646        self._end = False
 647        self._peek = ""
 648        self._prev_token_line = -1
 649
 650    def reset(self) -> None:
 651        self.sql = ""
 652        self.size = 0
 653        self.tokens = []
 654        self._start = 0
 655        self._current = 0
 656        self._line = 1
 657        self._col = 0
 658        self._comments = []
 659        self._char = ""
 660        self._end = False
 661        self._peek = ""
 662        self._prev_token_line = -1
 663
 664    def tokenize(self, sql: str) -> list[Token]:
 665        """Returns a list of tokens corresponding to the SQL string `sql`."""
 666        self.reset()
 667        self.sql = sql
 668        self.size = len(sql)
 669
 670        try:
 671            self._scan()
 672        except Exception as e:
 673            start = max(self._current - 50, 0)
 674            end = min(self._current + 50, self.size - 1)
 675            context = self.sql[start:end]
 676            raise TokenError(f"Error tokenizing '{context}'") from e
 677
 678        return self.tokens
 679
 680    def _scan(self, check_semicolon: bool = False) -> None:
 681        identifiers = self.identifiers
 682        digit_chars = _DIGIT_CHARS
 683
 684        while self.size and not self._end:
 685            current = self._current
 686
 687            # Skip spaces here rather than iteratively calling advance() for performance reasons
 688            while current < self.size:
 689                char = self.sql[current]
 690
 691                if char == " " or char == "\t":
 692                    current += 1
 693                else:
 694                    break
 695
 696            offset = current - self._current if current > self._current else 1
 697
 698            self._start = current
 699            self._advance(offset)
 700
 701            if not self._char.isspace():
 702                if self._char in digit_chars:
 703                    self._scan_number()
 704                elif self._char in identifiers:
 705                    self._scan_identifier(identifiers[self._char])
 706                else:
 707                    self._scan_keywords()
 708
 709            if check_semicolon and self._peek == ";":
 710                break
 711
 712        if self.tokens and self._comments:
 713            self.tokens[-1].comments.extend(self._comments)
 714
 715    def _chars(self, size: int) -> str:
 716        if size == 1:
 717            return self._char
 718
 719        start = self._current - 1
 720        end = start + size
 721
 722        return self.sql[start:end] if end <= self.size else ""
 723
 724    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 725        char = self._char
 726
 727        if char == "\n" or char == "\r":
 728            # Ensures we don't count an extra line if we get a \r\n line break sequence
 729            if not (char == "\r" and self._peek == "\n"):
 730                self._col = i
 731                self._line += 1
 732        else:
 733            self._col += i
 734
 735        self._current += i
 736        sql = self.sql
 737        size = self.size
 738        self._end = self._current >= size
 739        self._char = sql[self._current - 1]
 740        self._peek = "" if self._end else sql[self._current]
 741
 742        if alnum and self._char.isalnum():
 743            # Cache to local variables instead of attributes for better performance
 744            _col = self._col
 745            _current = self._current
 746            _end = self._end
 747            _peek = self._peek
 748
 749            while _peek.isalnum():
 750                _col += 1
 751                _current += 1
 752                _end = _current >= size
 753                _peek = "" if _end else sql[_current]
 754
 755            self._col = _col
 756            self._current = _current
 757            self._end = _end
 758            self._peek = _peek
 759            self._char = sql[_current - 1]
 760
 761    @property
 762    def _text(self) -> str:
 763        return self.sql[self._start : self._current]
 764
 765    def _add(self, token_type: TokenType, text: str | None = None) -> None:
 766        self._prev_token_line = self._line
 767
 768        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 769            self.tokens[-1].comments.extend(self._comments)
 770            self._comments = []
 771
 772        if text is None:
 773            text = self.sql[self._start : self._current]
 774
 775        self.tokens.append(
 776            Token(
 777                token_type,
 778                text=text,
 779                line=self._line,
 780                col=self._col,
 781                start=self._start,
 782                end=self._current - 1,
 783                comments=self._comments,
 784            )
 785        )
 786        self._comments = []
 787
 788        # If we have either a semicolon or a begin token before the command's token, we'll parse
 789        # whatever follows the command's token as a string
 790        if (
 791            token_type in self.commands
 792            and self._peek != ";"
 793            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 794        ):
 795            start = self._current
 796            tokens = len(self.tokens)
 797            self._scan(check_semicolon=True)
 798            self.tokens = self.tokens[:tokens]
 799            text = self.sql[start : self._current].strip()
 800            if text:
 801                self._add(TokenType.STRING, text)
 802
    def _scan_keywords(self) -> None:
        """Try to match a keyword, string/comment opener or single-char token at the cursor.

        Walks `keyword_trie` starting from the current character, looking ahead
        in the SQL without moving the cursor, and remembers the longest prefix
        that the trie marks as complete. Depending on what was matched, the
        work is handed to `_scan_string`, `_scan_comment`, `_add` or, when
        nothing applies, `_scan_var`.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        # Number of lookahead characters read from `sql` so far
        size = 0
        # Longest candidate seen so far whose trie node contains the key 0
        word = None
        # Candidate text; runs of whitespace are collapsed into a single " "
        chars = self._char
        char = chars
        prev_space = False
        # True when the last lookahead char was a repeated whitespace, in which
        # case the trie is not advanced for it
        skip = False
        trie = self.keyword_trie
        # Whether any character examined so far is a key of `single_tokens`
        single_token = char in single_tokens

        while chars:
            if not skip:
                # _CHAR_UPPER only maps a-z; any other character is looked up unchanged
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    if is_space:
                        # Any whitespace character is treated as a plain space
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                # Ran out of input
                char = ""
                break

        if word:
            # The match may be the opening delimiter of a string or a comment
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Accept the keyword only if the last lookahead char was whitespace,
            # a single-token character was seen, or the input ended
            # (presumably so that a keyword that is merely a prefix of a longer
            # word is not matched -- confirm against callers/tests)
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()
 862
 863    def _scan_comment(self, comment_start: str) -> bool:
 864        if comment_start not in self.comments:
 865            return False
 866
 867        comment_start_line = self._line
 868        comment_start_size = len(comment_start)
 869        comment_end = self.comments[comment_start]
 870
 871        if comment_end:
 872            # Skip the comment's start delimiter
 873            self._advance(comment_start_size)
 874
 875            comment_count = 1
 876            comment_end_size = len(comment_end)
 877            nested_comments = self.nested_comments
 878
 879            while not self._end:
 880                if self._chars(comment_end_size) == comment_end:
 881                    comment_count -= 1
 882                    if not comment_count:
 883                        break
 884
 885                self._advance(alnum=True)
 886
 887                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
 888                if (
 889                    nested_comments
 890                    and not self._end
 891                    and self._chars(comment_end_size) == comment_start
 892                ):
 893                    self._advance(comment_start_size)
 894                    comment_count += 1
 895
 896            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 897            self._advance(comment_end_size - 1)
 898        else:
 899            _peek = self._peek
 900            while not self._end and _peek != "\n" and _peek != "\r":
 901                self._advance(alnum=True)
 902                _peek = self._peek
 903            self._comments.append(self._text[comment_start_size:])
 904
 905        if (
 906            comment_start == self.hint_start
 907            and self.tokens
 908            and self.tokens[-1].token_type in self.tokens_preceding_hint
 909        ):
 910            self._add(TokenType.HINT)
 911
 912        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 913        # Multiple consecutive comments are preserved by appending them to the current comments list.
 914        if comment_start_line == self._prev_token_line:
 915            self.tokens[-1].comments.extend(self._comments)
 916            self._comments = []
 917            self._prev_token_line = self._line
 918
 919        return True
 920
    def _scan_number(self) -> None:
        """Scan a numeric literal starting at the current (digit) character.

        Handles `0b` / `0x` prefixed literals, decimals, scientific notation,
        underscore separators and typed suffixes listed in `numeric_literals`.
        Adds a NUMBER token, optionally followed by DCOLON and a type token, or
        a VAR token when digits are followed by identifier characters and
        `identifiers_can_start_with_digit` is set.
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            # A leading 0b / 0x is a bit / hex literal only if enabled; otherwise
            # just the "0" is emitted as a number
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        # 0: no exponent seen, 1: exponent marker consumed, 2: exponent sign consumed
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                # The dot is not consumed right after a PARAMETER token or when
                # decimals are disabled
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Remember the digits scanned so far, then collect the trailing suffix
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                # Map the suffix to a type keyword, if it is a known numeric literal suffix
                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Unknown suffix: step back so that it is scanned separately
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)
1002
1003    def _scan_bits(self) -> None:
1004        self._advance()
1005        value = self._extract_value()
1006        try:
1007            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1008            int(value, 2)
1009            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1010        except ValueError:
1011            self._add(TokenType.IDENTIFIER)
1012
1013    def _scan_hex(self) -> None:
1014        self._advance()
1015        value = self._extract_value()
1016        try:
1017            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1018            int(value, 16)
1019            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1020        except ValueError:
1021            self._add(TokenType.IDENTIFIER)
1022
1023    def _extract_value(self) -> str:
1024        single_tokens = self.single_tokens
1025
1026        while True:
1027            char = self._peek.strip()
1028            if char and char not in single_tokens:
1029                self._advance(alnum=True)
1030            else:
1031                break
1032
1033        return self._text
1034
    def _scan_string(self, start: str) -> bool:
        """Try to scan a string-like literal whose opening delimiter is `start`.

        Args:
            start: The opening delimiter. It is looked up first in `self.quotes`
                and then in `self.format_strings`.

        Returns:
            False if `start` is in neither mapping (this method consumes nothing in
            that case); True once a token has been added.

        Raises:
            TokenError: If a HEX_STRING / BIT_STRING body cannot be parsed by `int()`
                in the corresponding base. `_extract_string` may also raise it.
        """
        # Radix used to validate the body of hex / bit strings; None means "no validation"
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                # An immediately repeated delimiter means the heredoc has an empty tag
                if self._char == end:
                    tag = ""
                else:
                    # Read the tag up to the next `end`; when tags must be identifiers,
                    # a missing `end` is tolerated here and handled by the check below
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                # The candidate tag is rejected if input ran out, or if it is all digits
                # or contains whitespace: rewind and emit the alternative token instead
                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    # Step back over the tag's closing delimiter, if one was consumed
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                # The closing delimiter of a heredoc is the full opening sequence: start + tag + end
                end = f"{start}{tag}{end}"
        else:
            return False

        # Skip past the opening delimiter, then read the body up to `end`
        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        # Empty bodies are not validated; non-empty ones must parse in the given base
        if base and text:
            try:
                int(text, base)
            # NOTE(review): broad catch; int(str, int) presumably only raises ValueError here
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True
1097
1098    def _scan_identifier(self, identifier_end: str) -> None:
1099        self._advance()
1100        text = self._extract_string(
1101            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1102        )
1103        self._add(TokenType.IDENTIFIER, text)
1104
1105    def _scan_var(self) -> None:
1106        var_single_tokens = self.var_single_tokens
1107        single_tokens = self.single_tokens
1108
1109        while True:
1110            peek = self._peek
1111            if not peek or peek.isspace():
1112                break
1113            if peek not in var_single_tokens and peek in single_tokens:
1114                break
1115            self._advance(alnum=True)
1116
1117        self._add(
1118            TokenType.VAR
1119            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1120            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1121        )
1122
    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Read text from the current position up to the closing `delimiter`.

        Args:
            delimiter: The closing delimiter; may be longer than one character.
            escapes: Characters treated as escape characters. Falls back to
                `self.string_escapes` when None.
            raw_string: When True, `self.unescaped_sequences` substitution is skipped,
                and escape handling only applies if
                `self.string_escapes_allowed_in_raw_strings` is set.
            raise_unmatched: When False, running out of input before the delimiter
                returns the text gathered so far plus the current character
                instead of raising.

        Returns:
            The extracted text, without the closing delimiter.

        Raises:
            TokenError: If the delimiter is missing (subject to `raise_unmatched`), or
                if an escape sequence sits at the very end of the input.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        # Bind frequently used attributes to locals for the loop below
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            # Index of the current character (`self._char`), i.e. the first char of the body
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Keep line / column bookkeeping in sync with the characters skipped in bulk
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    # Column becomes the distance from the last newline to the closing delimiter
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                # Place the cursor just past the closing delimiter and refresh the cached state
                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        # Slow path: walk the input, resolving escapes as they are met
        while True:
            # Replace a known two-character sequence (escape char + next char) with its mapped value
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            # Truthy when follow-chars are configured and a backslash is followed by
            # a character that is NOT one of them
            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                # a quote character only acts as an escape when it is doubled
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    # Escaped delimiter: keep the delimiter, drop the escape character
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    # Custom escape: keep only the escaped character
                    text += self._peek
                else:
                    # Otherwise keep both the escape character and the character after it
                    text += self._char + self._peek

                # Both characters of the escape pair are consumed; there must be input left after them
                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    # Consume the remaining characters of a multi-character delimiter
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Append everything consumed by this advance, starting at the current character
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """Integer-valued enumeration of every token kind.

    Values are assigned by `auto()` in declaration order, starting at 1, so
    inserting or reordering members changes the numeric value of every member
    declared after that point.
    """

    # symbols, punctuation and operators (grouping inferred from member names)
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    AT_QMARK = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    # literals, identifiers and object kinds (grouping inferred from member names)
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNDROP = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the member formatted as ``TokenType.<NAME>``."""
        return f"TokenType.{self.name}"

An enumeration.

L_PAREN = <TokenType.L_PAREN: 1>
R_PAREN = <TokenType.R_PAREN: 2>
L_BRACKET = <TokenType.L_BRACKET: 3>
R_BRACKET = <TokenType.R_BRACKET: 4>
L_BRACE = <TokenType.L_BRACE: 5>
R_BRACE = <TokenType.R_BRACE: 6>
COMMA = <TokenType.COMMA: 7>
DOT = <TokenType.DOT: 8>
DASH = <TokenType.DASH: 9>
PLUS = <TokenType.PLUS: 10>
COLON = <TokenType.COLON: 11>
DOTCOLON = <TokenType.DOTCOLON: 12>
DOTCARET = <TokenType.DOTCARET: 13>
DCOLON = <TokenType.DCOLON: 14>
DCOLONDOLLAR = <TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT = <TokenType.DCOLONPERCENT: 16>
DCOLONQMARK = <TokenType.DCOLONQMARK: 17>
DQMARK = <TokenType.DQMARK: 18>
SEMICOLON = <TokenType.SEMICOLON: 19>
STAR = <TokenType.STAR: 20>
BACKSLASH = <TokenType.BACKSLASH: 21>
SLASH = <TokenType.SLASH: 22>
LT = <TokenType.LT: 23>
LTE = <TokenType.LTE: 24>
GT = <TokenType.GT: 25>
GTE = <TokenType.GTE: 26>
NOT = <TokenType.NOT: 27>
EQ = <TokenType.EQ: 28>
NEQ = <TokenType.NEQ: 29>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 30>
COLON_EQ = <TokenType.COLON_EQ: 31>
COLON_GT = <TokenType.COLON_GT: 32>
NCOLON_GT = <TokenType.NCOLON_GT: 33>
AND = <TokenType.AND: 34>
OR = <TokenType.OR: 35>
AMP = <TokenType.AMP: 36>
DPIPE = <TokenType.DPIPE: 37>
PIPE_GT = <TokenType.PIPE_GT: 38>
PIPE = <TokenType.PIPE: 39>
PIPE_SLASH = <TokenType.PIPE_SLASH: 40>
DPIPE_SLASH = <TokenType.DPIPE_SLASH: 41>
CARET = <TokenType.CARET: 42>
CARET_AT = <TokenType.CARET_AT: 43>
TILDE = <TokenType.TILDE: 44>
ARROW = <TokenType.ARROW: 45>
DARROW = <TokenType.DARROW: 46>
FARROW = <TokenType.FARROW: 47>
HASH = <TokenType.HASH: 48>
HASH_ARROW = <TokenType.HASH_ARROW: 49>
DHASH_ARROW = <TokenType.DHASH_ARROW: 50>
LR_ARROW = <TokenType.LR_ARROW: 51>
LLRR_ARROW = <TokenType.LLRR_ARROW: 52>
DAT = <TokenType.DAT: 53>
AT_QMARK = <TokenType.AT_QMARK: 54>
LT_AT = <TokenType.LT_AT: 55>
AT_GT = <TokenType.AT_GT: 56>
DOLLAR = <TokenType.DOLLAR: 57>
PARAMETER = <TokenType.PARAMETER: 58>
SESSION = <TokenType.SESSION: 59>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 60>
SESSION_USER = <TokenType.SESSION_USER: 61>
DAMP = <TokenType.DAMP: 62>
AMP_LT = <TokenType.AMP_LT: 63>
AMP_GT = <TokenType.AMP_GT: 64>
ADJACENT = <TokenType.ADJACENT: 65>
XOR = <TokenType.XOR: 66>
DSTAR = <TokenType.DSTAR: 67>
QMARK_AMP = <TokenType.QMARK_AMP: 68>
QMARK_PIPE = <TokenType.QMARK_PIPE: 69>
HASH_DASH = <TokenType.HASH_DASH: 70>
EXCLAMATION = <TokenType.EXCLAMATION: 71>
URI_START = <TokenType.URI_START: 72>
BLOCK_START = <TokenType.BLOCK_START: 73>
BLOCK_END = <TokenType.BLOCK_END: 74>
SPACE = <TokenType.SPACE: 75>
BREAK = <TokenType.BREAK: 76>
STRING = <TokenType.STRING: 77>
NUMBER = <TokenType.NUMBER: 78>
IDENTIFIER = <TokenType.IDENTIFIER: 79>
DATABASE = <TokenType.DATABASE: 80>
COLUMN = <TokenType.COLUMN: 81>
COLUMN_DEF = <TokenType.COLUMN_DEF: 82>
SCHEMA = <TokenType.SCHEMA: 83>
TABLE = <TokenType.TABLE: 84>
WAREHOUSE = <TokenType.WAREHOUSE: 85>
STAGE = <TokenType.STAGE: 86>
STREAM = <TokenType.STREAM: 87>
STREAMLIT = <TokenType.STREAMLIT: 88>
VAR = <TokenType.VAR: 89>
BIT_STRING = <TokenType.BIT_STRING: 90>
HEX_STRING = <TokenType.HEX_STRING: 91>
BYTE_STRING = <TokenType.BYTE_STRING: 92>
NATIONAL_STRING = <TokenType.NATIONAL_STRING: 93>
RAW_STRING = <TokenType.RAW_STRING: 94>
HEREDOC_STRING = <TokenType.HEREDOC_STRING: 95>
UNICODE_STRING = <TokenType.UNICODE_STRING: 96>
BIT = <TokenType.BIT: 97>
BOOLEAN = <TokenType.BOOLEAN: 98>
TINYINT = <TokenType.TINYINT: 99>
UTINYINT = <TokenType.UTINYINT: 100>
SMALLINT = <TokenType.SMALLINT: 101>
USMALLINT = <TokenType.USMALLINT: 102>
MEDIUMINT = <TokenType.MEDIUMINT: 103>
UMEDIUMINT = <TokenType.UMEDIUMINT: 104>
INT = <TokenType.INT: 105>
UINT = <TokenType.UINT: 106>
BIGINT = <TokenType.BIGINT: 107>
UBIGINT = <TokenType.UBIGINT: 108>
BIGNUM = <TokenType.BIGNUM: 109>
INT128 = <TokenType.INT128: 110>
UINT128 = <TokenType.UINT128: 111>
INT256 = <TokenType.INT256: 112>
UINT256 = <TokenType.UINT256: 113>
FLOAT = <TokenType.FLOAT: 114>
DOUBLE = <TokenType.DOUBLE: 115>
UDOUBLE = <TokenType.UDOUBLE: 116>
DECIMAL = <TokenType.DECIMAL: 117>
DECIMAL32 = <TokenType.DECIMAL32: 118>
DECIMAL64 = <TokenType.DECIMAL64: 119>
DECIMAL128 = <TokenType.DECIMAL128: 120>
DECIMAL256 = <TokenType.DECIMAL256: 121>
DECFLOAT = <TokenType.DECFLOAT: 122>
UDECIMAL = <TokenType.UDECIMAL: 123>
BIGDECIMAL = <TokenType.BIGDECIMAL: 124>
CHAR = <TokenType.CHAR: 125>
NCHAR = <TokenType.NCHAR: 126>
VARCHAR = <TokenType.VARCHAR: 127>
NVARCHAR = <TokenType.NVARCHAR: 128>
BPCHAR = <TokenType.BPCHAR: 129>
TEXT = <TokenType.TEXT: 130>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 131>
LONGTEXT = <TokenType.LONGTEXT: 132>
BLOB = <TokenType.BLOB: 133>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 134>
LONGBLOB = <TokenType.LONGBLOB: 135>
TINYBLOB = <TokenType.TINYBLOB: 136>
TINYTEXT = <TokenType.TINYTEXT: 137>
NAME = <TokenType.NAME: 138>
BINARY = <TokenType.BINARY: 139>
VARBINARY = <TokenType.VARBINARY: 140>
JSON = <TokenType.JSON: 141>
JSONB = <TokenType.JSONB: 142>
TIME = <TokenType.TIME: 143>
TIMETZ = <TokenType.TIMETZ: 144>
TIME_NS = <TokenType.TIME_NS: 145>
TIMESTAMP = <TokenType.TIMESTAMP: 146>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 147>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 148>
TIMESTAMPNTZ = <TokenType.TIMESTAMPNTZ: 149>
TIMESTAMP_S = <TokenType.TIMESTAMP_S: 150>
TIMESTAMP_MS = <TokenType.TIMESTAMP_MS: 151>
TIMESTAMP_NS = <TokenType.TIMESTAMP_NS: 152>
DATETIME = <TokenType.DATETIME: 153>
DATETIME2 = <TokenType.DATETIME2: 154>
DATETIME64 = <TokenType.DATETIME64: 155>
SMALLDATETIME = <TokenType.SMALLDATETIME: 156>
DATE = <TokenType.DATE: 157>
DATE32 = <TokenType.DATE32: 158>
INT4RANGE = <TokenType.INT4RANGE: 159>
INT4MULTIRANGE = <TokenType.INT4MULTIRANGE: 160>
INT8RANGE = <TokenType.INT8RANGE: 161>
INT8MULTIRANGE = <TokenType.INT8MULTIRANGE: 162>
NUMRANGE = <TokenType.NUMRANGE: 163>
NUMMULTIRANGE = <TokenType.NUMMULTIRANGE: 164>
TSRANGE = <TokenType.TSRANGE: 165>
TSMULTIRANGE = <TokenType.TSMULTIRANGE: 166>
TSTZRANGE = <TokenType.TSTZRANGE: 167>
TSTZMULTIRANGE = <TokenType.TSTZMULTIRANGE: 168>
DATERANGE = <TokenType.DATERANGE: 169>
DATEMULTIRANGE = <TokenType.DATEMULTIRANGE: 170>
UUID = <TokenType.UUID: 171>
GEOGRAPHY = <TokenType.GEOGRAPHY: 172>
GEOGRAPHYPOINT = <TokenType.GEOGRAPHYPOINT: 173>
NULLABLE = <TokenType.NULLABLE: 174>
GEOMETRY = <TokenType.GEOMETRY: 175>
POINT = <TokenType.POINT: 176>
RING = <TokenType.RING: 177>
LINESTRING = <TokenType.LINESTRING: 178>
LOCALTIME = <TokenType.LOCALTIME: 179>
LOCALTIMESTAMP = <TokenType.LOCALTIMESTAMP: 180>
SYSTIMESTAMP = <TokenType.SYSTIMESTAMP: 181>
MULTILINESTRING = <TokenType.MULTILINESTRING: 182>
POLYGON = <TokenType.POLYGON: 183>
MULTIPOLYGON = <TokenType.MULTIPOLYGON: 184>
HLLSKETCH = <TokenType.HLLSKETCH: 185>
HSTORE = <TokenType.HSTORE: 186>
SUPER = <TokenType.SUPER: 187>
SERIAL = <TokenType.SERIAL: 188>
SMALLSERIAL = <TokenType.SMALLSERIAL: 189>
BIGSERIAL = <TokenType.BIGSERIAL: 190>
XML = <TokenType.XML: 191>
YEAR = <TokenType.YEAR: 192>
USERDEFINED = <TokenType.USERDEFINED: 193>
MONEY = <TokenType.MONEY: 194>
SMALLMONEY = <TokenType.SMALLMONEY: 195>
ROWVERSION = <TokenType.ROWVERSION: 196>
IMAGE = <TokenType.IMAGE: 197>
VARIANT = <TokenType.VARIANT: 198>
OBJECT = <TokenType.OBJECT: 199>
INET = <TokenType.INET: 200>
IPADDRESS = <TokenType.IPADDRESS: 201>
IPPREFIX = <TokenType.IPPREFIX: 202>
IPV4 = <TokenType.IPV4: 203>
IPV6 = <TokenType.IPV6: 204>
ENUM = <TokenType.ENUM: 205>
ENUM8 = <TokenType.ENUM8: 206>
ENUM16 = <TokenType.ENUM16: 207>
FIXEDSTRING = <TokenType.FIXEDSTRING: 208>
LOWCARDINALITY = <TokenType.LOWCARDINALITY: 209>
NESTED = <TokenType.NESTED: 210>
AGGREGATEFUNCTION = <TokenType.AGGREGATEFUNCTION: 211>
SIMPLEAGGREGATEFUNCTION = <TokenType.SIMPLEAGGREGATEFUNCTION: 212>
TDIGEST = <TokenType.TDIGEST: 213>
UNKNOWN = <TokenType.UNKNOWN: 214>
VECTOR = <TokenType.VECTOR: 215>
DYNAMIC = <TokenType.DYNAMIC: 216>
VOID = <TokenType.VOID: 217>
ALIAS = <TokenType.ALIAS: 218>
ALTER = <TokenType.ALTER: 219>
ALL = <TokenType.ALL: 220>
ANTI = <TokenType.ANTI: 221>
ANY = <TokenType.ANY: 222>
APPLY = <TokenType.APPLY: 223>
ARRAY = <TokenType.ARRAY: 224>
ASC = <TokenType.ASC: 225>
ASOF = <TokenType.ASOF: 226>
ATTACH = <TokenType.ATTACH: 227>
AUTO_INCREMENT = <TokenType.AUTO_INCREMENT: 228>
BEGIN = <TokenType.BEGIN: 229>
BETWEEN = <TokenType.BETWEEN: 230>
BULK_COLLECT_INTO = <TokenType.BULK_COLLECT_INTO: 231>
CACHE = <TokenType.CACHE: 232>
CASE = <TokenType.CASE: 233>
CHARACTER_SET = <TokenType.CHARACTER_SET: 234>
CLUSTER_BY = <TokenType.CLUSTER_BY: 235>
COLLATE = <TokenType.COLLATE: 236>
COMMAND = <TokenType.COMMAND: 237>
COMMENT = <TokenType.COMMENT: 238>
COMMIT = <TokenType.COMMIT: 239>
CONNECT_BY = <TokenType.CONNECT_BY: 240>
CONSTRAINT = <TokenType.CONSTRAINT: 241>
COPY = <TokenType.COPY: 242>
CREATE = <TokenType.CREATE: 243>
CROSS = <TokenType.CROSS: 244>
CUBE = <TokenType.CUBE: 245>
CURRENT_DATE = <TokenType.CURRENT_DATE: 246>
CURRENT_DATETIME = <TokenType.CURRENT_DATETIME: 247>
CURRENT_SCHEMA = <TokenType.CURRENT_SCHEMA: 248>
CURRENT_TIME = <TokenType.CURRENT_TIME: 249>
CURRENT_TIMESTAMP = <TokenType.CURRENT_TIMESTAMP: 250>
CURRENT_USER = <TokenType.CURRENT_USER: 251>
CURRENT_USER_ID = <TokenType.CURRENT_USER_ID: 252>
CURRENT_ROLE = <TokenType.CURRENT_ROLE: 253>
CURRENT_CATALOG = <TokenType.CURRENT_CATALOG: 254>
DECLARE = <TokenType.DECLARE: 255>
DEFAULT = <TokenType.DEFAULT: 256>
DELETE = <TokenType.DELETE: 257>
DESC = <TokenType.DESC: 258>
DESCRIBE = <TokenType.DESCRIBE: 259>
DETACH = <TokenType.DETACH: 260>
DICTIONARY = <TokenType.DICTIONARY: 261>
DISTINCT = <TokenType.DISTINCT: 262>
DISTRIBUTE_BY = <TokenType.DISTRIBUTE_BY: 263>
DIV = <TokenType.DIV: 264>
DROP = <TokenType.DROP: 265>
ELSE = <TokenType.ELSE: 266>
END = <TokenType.END: 267>
ESCAPE = <TokenType.ESCAPE: 268>
EXCEPT = <TokenType.EXCEPT: 269>
EXECUTE = <TokenType.EXECUTE: 270>
EXISTS = <TokenType.EXISTS: 271>
FALSE = <TokenType.FALSE: 272>
FETCH = <TokenType.FETCH: 273>
FILE = <TokenType.FILE: 274>
FILE_FORMAT = <TokenType.FILE_FORMAT: 275>
FILTER = <TokenType.FILTER: 276>
FINAL = <TokenType.FINAL: 277>
FIRST = <TokenType.FIRST: 278>
FOR = <TokenType.FOR: 279>
FORCE = <TokenType.FORCE: 280>
FOREIGN_KEY = <TokenType.FOREIGN_KEY: 281>
FORMAT = <TokenType.FORMAT: 282>
FROM = <TokenType.FROM: 283>
FULL = <TokenType.FULL: 284>
FUNCTION = <TokenType.FUNCTION: 285>
GET = <TokenType.GET: 286>
GLOB = <TokenType.GLOB: 287>
GLOBAL = <TokenType.GLOBAL: 288>
GRANT = <TokenType.GRANT: 289>
GROUP_BY = <TokenType.GROUP_BY: 290>
GROUPING_SETS = <TokenType.GROUPING_SETS: 291>
HAVING = <TokenType.HAVING: 292>
HINT = <TokenType.HINT: 293>
IGNORE = <TokenType.IGNORE: 294>
ILIKE = <TokenType.ILIKE: 295>
IN = <TokenType.IN: 296>
INDEX = <TokenType.INDEX: 297>
INDEXED_BY = <TokenType.INDEXED_BY: 298>
INNER = <TokenType.INNER: 299>
INSERT = <TokenType.INSERT: 300>
INSTALL = <TokenType.INSTALL: 301>
INTEGRATION = <TokenType.INTEGRATION: 302>
INTERSECT = <TokenType.INTERSECT: 303>
INTERVAL = <TokenType.INTERVAL: 304>
INTO = <TokenType.INTO: 305>
INTRODUCER = <TokenType.INTRODUCER: 306>
IRLIKE = <TokenType.IRLIKE: 307>
IS = <TokenType.IS: 308>
ISNULL = <TokenType.ISNULL: 309>
JOIN = <TokenType.JOIN: 310>
JOIN_MARKER = <TokenType.JOIN_MARKER: 311>
KEEP = <TokenType.KEEP: 312>
KEY = <TokenType.KEY: 313>
KILL = <TokenType.KILL: 314>
LANGUAGE = <TokenType.LANGUAGE: 315>
LATERAL = <TokenType.LATERAL: 316>
LEFT = <TokenType.LEFT: 317>
LIKE = <TokenType.LIKE: 318>
LIMIT = <TokenType.LIMIT: 319>
LIST = <TokenType.LIST: 320>
LOAD = <TokenType.LOAD: 321>
LOCK = <TokenType.LOCK: 322>
MAP = <TokenType.MAP: 323>
MATCH = <TokenType.MATCH: 324>
MATCH_CONDITION = <TokenType.MATCH_CONDITION: 325>
MATCH_RECOGNIZE = <TokenType.MATCH_RECOGNIZE: 326>
MEMBER_OF = <TokenType.MEMBER_OF: 327>
MERGE = <TokenType.MERGE: 328>
MOD = <TokenType.MOD: 329>
MODEL = <TokenType.MODEL: 330>
NATURAL = <TokenType.NATURAL: 331>
NEXT = <TokenType.NEXT: 332>
NOTHING = <TokenType.NOTHING: 333>
NOTNULL = <TokenType.NOTNULL: 334>
NULL = <TokenType.NULL: 335>
OBJECT_IDENTIFIER = <TokenType.OBJECT_IDENTIFIER: 336>
OFFSET = <TokenType.OFFSET: 337>
ON = <TokenType.ON: 338>
ONLY = <TokenType.ONLY: 339>
OPERATOR = <TokenType.OPERATOR: 340>
ORDER_BY = <TokenType.ORDER_BY: 341>
ORDER_SIBLINGS_BY = <TokenType.ORDER_SIBLINGS_BY: 342>
ORDERED = <TokenType.ORDERED: 343>
ORDINALITY = <TokenType.ORDINALITY: 344>
OUT = <TokenType.OUT: 345>
INOUT = <TokenType.INOUT: 346>
OUTER = <TokenType.OUTER: 347>
OVER = <TokenType.OVER: 348>
OVERLAPS = <TokenType.OVERLAPS: 349>
OVERWRITE = <TokenType.OVERWRITE: 350>
PACKAGE = <TokenType.PACKAGE: 351>
PARTITION = <TokenType.PARTITION: 352>
PARTITION_BY = <TokenType.PARTITION_BY: 353>
PERCENT = <TokenType.PERCENT: 354>
PIVOT = <TokenType.PIVOT: 355>
PLACEHOLDER = <TokenType.PLACEHOLDER: 356>
POLICY = <TokenType.POLICY: 357>
POOL = <TokenType.POOL: 358>
POSITIONAL = <TokenType.POSITIONAL: 359>
PRAGMA = <TokenType.PRAGMA: 360>
PREWHERE = <TokenType.PREWHERE: 361>
PRIMARY_KEY = <TokenType.PRIMARY_KEY: 362>
PROCEDURE = <TokenType.PROCEDURE: 363>
PROPERTIES = <TokenType.PROPERTIES: 364>
PSEUDO_TYPE = <TokenType.PSEUDO_TYPE: 365>
PUT = <TokenType.PUT: 366>
QUALIFY = <TokenType.QUALIFY: 367>
QUOTE = <TokenType.QUOTE: 368>
QDCOLON = <TokenType.QDCOLON: 369>
RANGE = <TokenType.RANGE: 370>
RECURSIVE = <TokenType.RECURSIVE: 371>
REFRESH = <TokenType.REFRESH: 372>
RENAME = <TokenType.RENAME: 373>
REPLACE = <TokenType.REPLACE: 374>
RETURNING = <TokenType.RETURNING: 375>
REVOKE = <TokenType.REVOKE: 376>
REFERENCES = <TokenType.REFERENCES: 377>
RIGHT = <TokenType.RIGHT: 378>
RLIKE = <TokenType.RLIKE: 379>
ROLE = <TokenType.ROLE: 380>
ROLLBACK = <TokenType.ROLLBACK: 381>
ROLLUP = <TokenType.ROLLUP: 382>
ROW = <TokenType.ROW: 383>
ROWS = <TokenType.ROWS: 384>
RULE = <TokenType.RULE: 385>
SELECT = <TokenType.SELECT: 386>
SEMI = <TokenType.SEMI: 387>
SEPARATOR = <TokenType.SEPARATOR: 388>
SEQUENCE = <TokenType.SEQUENCE: 389>
SERDE_PROPERTIES = <TokenType.SERDE_PROPERTIES: 390>
SET = <TokenType.SET: 391>
SETTINGS = <TokenType.SETTINGS: 392>
SHOW = <TokenType.SHOW: 393>
SIMILAR_TO = <TokenType.SIMILAR_TO: 394>
SOME = <TokenType.SOME: 395>
SORT_BY = <TokenType.SORT_BY: 396>
SOUNDS_LIKE = <TokenType.SOUNDS_LIKE: 397>
SQL_SECURITY = <TokenType.SQL_SECURITY: 398>
START_WITH = <TokenType.START_WITH: 399>
STORAGE_INTEGRATION = <TokenType.STORAGE_INTEGRATION: 400>
STRAIGHT_JOIN = <TokenType.STRAIGHT_JOIN: 401>
STRUCT = <TokenType.STRUCT: 402>
SUMMARIZE = <TokenType.SUMMARIZE: 403>
TABLE_SAMPLE = <TokenType.TABLE_SAMPLE: 404>
TAG = <TokenType.TAG: 405>
TEMPORARY = <TokenType.TEMPORARY: 406>
TOP = <TokenType.TOP: 407>
THEN = <TokenType.THEN: 408>
TRUE = <TokenType.TRUE: 409>
TRUNCATE = <TokenType.TRUNCATE: 410>
TRIGGER = <TokenType.TRIGGER: 411>
TYPE = <TokenType.TYPE: 412>
UNCACHE = <TokenType.UNCACHE: 413>
UNDROP = <TokenType.UNDROP: 414>
UNION = <TokenType.UNION: 415>
UNNEST = <TokenType.UNNEST: 416>
UNPIVOT = <TokenType.UNPIVOT: 417>
UPDATE = <TokenType.UPDATE: 418>
USE = <TokenType.USE: 419>
USING = <TokenType.USING: 420>
VALUES = <TokenType.VALUES: 421>
VARIADIC = <TokenType.VARIADIC: 422>
VIEW = <TokenType.VIEW: 423>
SEMANTIC_VIEW = <TokenType.SEMANTIC_VIEW: 424>
VOLATILE = <TokenType.VOLATILE: 425>
VOLUME = <TokenType.VOLUME: 426>
WHEN = <TokenType.WHEN: 427>
WHERE = <TokenType.WHERE: 428>
WINDOW = <TokenType.WINDOW: 429>
WITH = <TokenType.WITH: 430>
UNIQUE = <TokenType.UNIQUE: 431>
UTC_DATE = <TokenType.UTC_DATE: 432>
UTC_TIME = <TokenType.UTC_TIME: 433>
UTC_TIMESTAMP = <TokenType.UTC_TIMESTAMP: 434>
VERSION_SNAPSHOT = <TokenType.VERSION_SNAPSHOT: 435>
TIMESTAMP_SNAPSHOT = <TokenType.TIMESTAMP_SNAPSHOT: 436>
OPTION = <TokenType.OPTION: 437>
SINK = <TokenType.SINK: 438>
SOURCE = <TokenType.SOURCE: 439>
ANALYZE = <TokenType.ANALYZE: 440>
NAMESPACE = <TokenType.NAMESPACE: 441>
EXPORT = <TokenType.EXPORT: 442>
HIVE_TOKEN_STREAM = <TokenType.HIVE_TOKEN_STREAM: 443>
SENTINEL = <TokenType.SENTINEL: 444>
class Token:
474class Token:
475    # mypyc doesn't expose slots
476    _attrs: t.ClassVar[tuple[str, ...]] = (
477        "token_type",
478        "text",
479        "line",
480        "col",
481        "start",
482        "end",
483        "comments",
484    )
485    __slots__ = _attrs
486
487    @classmethod
488    def number(cls, number: int) -> Token:
489        """Returns a NUMBER token with `number` as its text."""
490        return cls(TokenType.NUMBER, str(number))
491
492    @classmethod
493    def string(cls, string: str) -> Token:
494        """Returns a STRING token with `string` as its text."""
495        return cls(TokenType.STRING, string)
496
497    @classmethod
498    def identifier(cls, identifier: str) -> Token:
499        """Returns an IDENTIFIER token with `identifier` as its text."""
500        return cls(TokenType.IDENTIFIER, identifier)
501
502    @classmethod
503    def var(cls, var: str) -> Token:
504        """Returns a VAR token with `var` as its text."""
505        return cls(TokenType.VAR, var)
506
507    def __init__(
508        self,
509        token_type: TokenType,
510        text: str,
511        line: int = 1,
512        col: int = 1,
513        start: int = 0,
514        end: int = 0,
515        comments: list[str] | None = None,
516    ) -> None:
517        self.token_type = token_type
518        self.text = text
519        self.line = line
520        self.col = col
521        self.start = start
522        self.end = end
523        self.comments = [] if comments is None else comments
524
525    def __bool__(self) -> bool:
526        return self.token_type != TokenType.SENTINEL
527
528    def __repr__(self) -> str:
529        attributes = ", ".join(
530            f"{k}: TokenType.{self.token_type.name}"
531            if k == "token_type"
532            else f"{k}: {getattr(self, k)}"
533            for k in self._attrs
534        )
535        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
507    def __init__(
508        self,
509        token_type: TokenType,
510        text: str,
511        line: int = 1,
512        col: int = 1,
513        start: int = 0,
514        end: int = 0,
515        comments: list[str] | None = None,
516    ) -> None:
517        self.token_type = token_type
518        self.text = text
519        self.line = line
520        self.col = col
521        self.start = start
522        self.end = end
523        self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
487    @classmethod
488    def number(cls, number: int) -> Token:
489        """Returns a NUMBER token with `number` as its text."""
490        return cls(TokenType.NUMBER, str(number))

Returns a NUMBER token with `number` as its text.

@classmethod
def string(cls, string: str) -> Token:
492    @classmethod
493    def string(cls, string: str) -> Token:
494        """Returns a STRING token with `string` as its text."""
495        return cls(TokenType.STRING, string)

Returns a STRING token with `string` as its text.

@classmethod
def identifier(cls, identifier: str) -> Token:
497    @classmethod
498    def identifier(cls, identifier: str) -> Token:
499        """Returns an IDENTIFIER token with `identifier` as its text."""
500        return cls(TokenType.IDENTIFIER, identifier)

Returns an IDENTIFIER token with `identifier` as its text.

@classmethod
def var(cls, var: str) -> Token:
502    @classmethod
503    def var(cls, var: str) -> Token:
504        """Returns a VAR token with `var` as its text."""
505        return cls(TokenType.VAR, var)

Returns a VAR token with `var` as its text.

token_type
text
line
col
start
end
comments
class TokenizerCore:
 538class TokenizerCore:
 539    __slots__ = (
 540        "sql",
 541        "size",
 542        "tokens",
 543        "_start",
 544        "_current",
 545        "_line",
 546        "_col",
 547        "_comments",
 548        "_char",
 549        "_end",
 550        "_peek",
 551        "_prev_token_line",
 552        "single_tokens",
 553        "keywords",
 554        "quotes",
 555        "format_strings",
 556        "identifiers",
 557        "comments",
 558        "string_escapes",
 559        "byte_string_escapes",
 560        "identifier_escapes",
 561        "escape_follow_chars",
 562        "commands",
 563        "command_prefix_tokens",
 564        "nested_comments",
 565        "hint_start",
 566        "tokens_preceding_hint",
 567        "has_bit_strings",
 568        "has_hex_strings",
 569        "numeric_literals",
 570        "var_single_tokens",
 571        "string_escapes_allowed_in_raw_strings",
 572        "heredoc_tag_is_identifier",
 573        "heredoc_string_alternative",
 574        "keyword_trie",
 575        "numbers_can_be_underscore_separated",
 576        "numbers_can_have_decimals",
 577        "identifiers_can_start_with_digit",
 578        "unescaped_sequences",
 579    )
 580
 581    def __init__(
 582        self,
 583        single_tokens: dict[str, TokenType],
 584        keywords: dict[str, TokenType],
 585        quotes: dict[str, str],
 586        format_strings: dict[str, tuple[str, TokenType]],
 587        identifiers: dict[str, str],
 588        comments: dict[str, str | None],
 589        string_escapes: set[str],
 590        byte_string_escapes: set[str],
 591        identifier_escapes: set[str],
 592        escape_follow_chars: set[str],
 593        commands: set[TokenType],
 594        command_prefix_tokens: set[TokenType],
 595        nested_comments: bool,
 596        hint_start: str,
 597        tokens_preceding_hint: set[TokenType],
 598        has_bit_strings: bool,
 599        has_hex_strings: bool,
 600        numeric_literals: dict[str, str],
 601        var_single_tokens: set[str],
 602        string_escapes_allowed_in_raw_strings: bool,
 603        heredoc_tag_is_identifier: bool,
 604        heredoc_string_alternative: TokenType,
 605        keyword_trie: dict,
 606        numbers_can_be_underscore_separated: bool,
 607        numbers_can_have_decimals: bool,
 608        identifiers_can_start_with_digit: bool,
 609        unescaped_sequences: dict[str, str],
 610    ) -> None:
 611        self.single_tokens = single_tokens
 612        self.keywords = keywords
 613        self.quotes = quotes
 614        self.format_strings = format_strings
 615        self.identifiers = identifiers
 616        self.comments = comments
 617        self.string_escapes = string_escapes
 618        self.byte_string_escapes = byte_string_escapes
 619        self.identifier_escapes = identifier_escapes
 620        self.escape_follow_chars = escape_follow_chars
 621        self.commands = commands
 622        self.command_prefix_tokens = command_prefix_tokens
 623        self.nested_comments = nested_comments
 624        self.hint_start = hint_start
 625        self.tokens_preceding_hint = tokens_preceding_hint
 626        self.has_bit_strings = has_bit_strings
 627        self.has_hex_strings = has_hex_strings
 628        self.numeric_literals = numeric_literals
 629        self.var_single_tokens = var_single_tokens
 630        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 631        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 632        self.heredoc_string_alternative = heredoc_string_alternative
 633        self.keyword_trie = keyword_trie
 634        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 635        self.numbers_can_have_decimals = numbers_can_have_decimals
 636        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 637        self.unescaped_sequences = unescaped_sequences
 638        self.sql = ""
 639        self.size = 0
 640        self.tokens: list[Token] = []
 641        self._start = 0
 642        self._current = 0
 643        self._line = 1
 644        self._col = 0
 645        self._comments: list[str] = []
 646        self._char = ""
 647        self._end = False
 648        self._peek = ""
 649        self._prev_token_line = -1
 650
 651    def reset(self) -> None:
 652        self.sql = ""
 653        self.size = 0
 654        self.tokens = []
 655        self._start = 0
 656        self._current = 0
 657        self._line = 1
 658        self._col = 0
 659        self._comments = []
 660        self._char = ""
 661        self._end = False
 662        self._peek = ""
 663        self._prev_token_line = -1
 664
 665    def tokenize(self, sql: str) -> list[Token]:
 666        """Returns a list of tokens corresponding to the SQL string `sql`."""
 667        self.reset()
 668        self.sql = sql
 669        self.size = len(sql)
 670
 671        try:
 672            self._scan()
 673        except Exception as e:
 674            start = max(self._current - 50, 0)
 675            end = min(self._current + 50, self.size - 1)
 676            context = self.sql[start:end]
 677            raise TokenError(f"Error tokenizing '{context}'") from e
 678
 679        return self.tokens
 680
 681    def _scan(self, check_semicolon: bool = False) -> None:
 682        identifiers = self.identifiers
 683        digit_chars = _DIGIT_CHARS
 684
 685        while self.size and not self._end:
 686            current = self._current
 687
 688            # Skip spaces here rather than iteratively calling advance() for performance reasons
 689            while current < self.size:
 690                char = self.sql[current]
 691
 692                if char == " " or char == "\t":
 693                    current += 1
 694                else:
 695                    break
 696
 697            offset = current - self._current if current > self._current else 1
 698
 699            self._start = current
 700            self._advance(offset)
 701
 702            if not self._char.isspace():
 703                if self._char in digit_chars:
 704                    self._scan_number()
 705                elif self._char in identifiers:
 706                    self._scan_identifier(identifiers[self._char])
 707                else:
 708                    self._scan_keywords()
 709
 710            if check_semicolon and self._peek == ";":
 711                break
 712
 713        if self.tokens and self._comments:
 714            self.tokens[-1].comments.extend(self._comments)
 715
 716    def _chars(self, size: int) -> str:
 717        if size == 1:
 718            return self._char
 719
 720        start = self._current - 1
 721        end = start + size
 722
 723        return self.sql[start:end] if end <= self.size else ""
 724
 725    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 726        char = self._char
 727
 728        if char == "\n" or char == "\r":
 729            # Ensures we don't count an extra line if we get a \r\n line break sequence
 730            if not (char == "\r" and self._peek == "\n"):
 731                self._col = i
 732                self._line += 1
 733        else:
 734            self._col += i
 735
 736        self._current += i
 737        sql = self.sql
 738        size = self.size
 739        self._end = self._current >= size
 740        self._char = sql[self._current - 1]
 741        self._peek = "" if self._end else sql[self._current]
 742
 743        if alnum and self._char.isalnum():
 744            # Cache to local variables instead of attributes for better performance
 745            _col = self._col
 746            _current = self._current
 747            _end = self._end
 748            _peek = self._peek
 749
 750            while _peek.isalnum():
 751                _col += 1
 752                _current += 1
 753                _end = _current >= size
 754                _peek = "" if _end else sql[_current]
 755
 756            self._col = _col
 757            self._current = _current
 758            self._end = _end
 759            self._peek = _peek
 760            self._char = sql[_current - 1]
 761
 762    @property
 763    def _text(self) -> str:
 764        return self.sql[self._start : self._current]
 765
 766    def _add(self, token_type: TokenType, text: str | None = None) -> None:
 767        self._prev_token_line = self._line
 768
 769        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 770            self.tokens[-1].comments.extend(self._comments)
 771            self._comments = []
 772
 773        if text is None:
 774            text = self.sql[self._start : self._current]
 775
 776        self.tokens.append(
 777            Token(
 778                token_type,
 779                text=text,
 780                line=self._line,
 781                col=self._col,
 782                start=self._start,
 783                end=self._current - 1,
 784                comments=self._comments,
 785            )
 786        )
 787        self._comments = []
 788
 789        # If we have either a semicolon or a begin token before the command's token, we'll parse
 790        # whatever follows the command's token as a string
 791        if (
 792            token_type in self.commands
 793            and self._peek != ";"
 794            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 795        ):
 796            start = self._current
 797            tokens = len(self.tokens)
 798            self._scan(check_semicolon=True)
 799            self.tokens = self.tokens[:tokens]
 800            text = self.sql[start : self._current].strip()
 801            if text:
 802                self._add(TokenType.STRING, text)
 803
 804    def _scan_keywords(self) -> None:
 805        sql = self.sql
 806        sql_size = self.size
 807        single_tokens = self.single_tokens
 808        char_upper = _CHAR_UPPER
 809        size = 0
 810        word = None
 811        chars = self._char
 812        char = chars
 813        prev_space = False
 814        skip = False
 815        trie = self.keyword_trie
 816        single_token = char in single_tokens
 817
 818        while chars:
 819            if not skip:
 820                sub = trie.get(char_upper.get(char, char))
 821                if sub is None:
 822                    break
 823                trie = sub
 824                if 0 in trie:
 825                    word = chars
 826
 827            end = self._current + size
 828            size += 1
 829
 830            if end < sql_size:
 831                char = sql[end]
 832                single_token = single_token or char in single_tokens
 833                is_space = char.isspace()
 834
 835                if not is_space or not prev_space:
 836                    if is_space:
 837                        char = " "
 838                    chars += char
 839                    prev_space = is_space
 840                    skip = False
 841                else:
 842                    skip = True
 843            else:
 844                char = ""
 845                break
 846
 847        if word:
 848            if self._scan_string(word):
 849                return
 850            if self._scan_comment(word):
 851                return
 852            if prev_space or single_token or not char:
 853                self._advance(size - 1)
 854                word = word.upper()
 855                self._add(self.keywords[word], text=word)
 856                return
 857
 858        if self._char in single_tokens:
 859            self._add(single_tokens[self._char], text=self._char)
 860            return
 861
 862        self._scan_var()
 863
 864    def _scan_comment(self, comment_start: str) -> bool:
 865        if comment_start not in self.comments:
 866            return False
 867
 868        comment_start_line = self._line
 869        comment_start_size = len(comment_start)
 870        comment_end = self.comments[comment_start]
 871
 872        if comment_end:
 873            # Skip the comment's start delimiter
 874            self._advance(comment_start_size)
 875
 876            comment_count = 1
 877            comment_end_size = len(comment_end)
 878            nested_comments = self.nested_comments
 879
 880            while not self._end:
 881                if self._chars(comment_end_size) == comment_end:
 882                    comment_count -= 1
 883                    if not comment_count:
 884                        break
 885
 886                self._advance(alnum=True)
 887
 888                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
 889                if (
 890                    nested_comments
 891                    and not self._end
 892                    and self._chars(comment_end_size) == comment_start
 893                ):
 894                    self._advance(comment_start_size)
 895                    comment_count += 1
 896
 897            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 898            self._advance(comment_end_size - 1)
 899        else:
 900            _peek = self._peek
 901            while not self._end and _peek != "\n" and _peek != "\r":
 902                self._advance(alnum=True)
 903                _peek = self._peek
 904            self._comments.append(self._text[comment_start_size:])
 905
 906        if (
 907            comment_start == self.hint_start
 908            and self.tokens
 909            and self.tokens[-1].token_type in self.tokens_preceding_hint
 910        ):
 911            self._add(TokenType.HINT)
 912
 913        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 914        # Multiple consecutive comments are preserved by appending them to the current comments list.
 915        if comment_start_line == self._prev_token_line:
 916            self.tokens[-1].comments.extend(self._comments)
 917            self._comments = []
 918            self._prev_token_line = self._line
 919
 920        return True
 921
 922    def _scan_number(self) -> None:
 923        if self._char == "0":
 924            peek = _CHAR_UPPER.get(self._peek, self._peek)
 925            if peek == "B":
 926                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
 927            elif peek == "X":
 928                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)
 929
 930        decimal = False
 931        scientific = 0
 932        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
 933        single_tokens = self.single_tokens
 934        keywords = self.keywords
 935        numeric_literals = self.numeric_literals
 936        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit
 937
 938        is_underscore_separated: bool = False
 939        number_text: str = ""
 940        numeric_literal: str = ""
 941        numeric_type: TokenType | None = None
 942
 943        while True:
 944            if self._peek in _DIGIT_CHARS:
 945                # Batch consecutive digits: scan ahead to find how many
 946                sql = self.sql
 947                end = self._current + 1
 948                size = self.size
 949                while end < size and sql[end] in _DIGIT_CHARS:
 950                    end += 1
 951                self._advance(end - self._current)
 952            elif self._peek == "." and not decimal:
 953                if (
 954                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
 955                ) or not self.numbers_can_have_decimals:
 956                    break
 957                decimal = True
 958                self._advance()
 959            elif self._peek in ("-", "+") and scientific == 1:
 960                # Only consume +/- if followed by a digit
 961                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
 962                    scientific += 1
 963                    self._advance()
 964                else:
 965                    break
 966            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
 967                scientific += 1
 968                self._advance()
 969            elif self._peek == "_" and numbers_can_be_underscore_separated:
 970                is_underscore_separated = True
 971                self._advance()
 972            elif self._peek.isidentifier():
 973                number_text = self._text
 974
 975                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
 976                    numeric_literal += self._peek
 977                    self._advance()
 978
 979                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))
 980
 981                if numeric_type:
 982                    break
 983                elif identifiers_can_start_with_digit:
 984                    return self._add(TokenType.VAR)
 985
 986                self._advance(-len(numeric_literal))
 987                break
 988            else:
 989                break
 990
 991        number_text = number_text or self.sql[self._start : self._current]
 992
 993        # Normalize inputs such as 100_000 to 100000
 994        if is_underscore_separated:
 995            number_text = number_text.replace("_", "")
 996
 997        self._add(TokenType.NUMBER, number_text)
 998
 999        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
1000        if numeric_type:
1001            self._add(TokenType.DCOLON, "::")
1002            self._add(numeric_type, numeric_literal)
1003
1004    def _scan_bits(self) -> None:
1005        self._advance()
1006        value = self._extract_value()
1007        try:
1008            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1009            int(value, 2)
1010            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1011        except ValueError:
1012            self._add(TokenType.IDENTIFIER)
1013
1014    def _scan_hex(self) -> None:
1015        self._advance()
1016        value = self._extract_value()
1017        try:
1018            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1019            int(value, 16)
1020            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1021        except ValueError:
1022            self._add(TokenType.IDENTIFIER)
1023
1024    def _extract_value(self) -> str:
1025        single_tokens = self.single_tokens
1026
1027        while True:
1028            char = self._peek.strip()
1029            if char and char not in single_tokens:
1030                self._advance(alnum=True)
1031            else:
1032                break
1033
1034        return self._text
1035
1036    def _scan_string(self, start: str) -> bool:
1037        base = None
1038        token_type = TokenType.STRING
1039
1040        if start in self.quotes:
1041            end = self.quotes[start]
1042        elif start in self.format_strings:
1043            end, token_type = self.format_strings[start]
1044
1045            if token_type == TokenType.HEX_STRING:
1046                base = 16
1047            elif token_type == TokenType.BIT_STRING:
1048                base = 2
1049            elif token_type == TokenType.HEREDOC_STRING:
1050                self._advance()
1051
1052                if self._char == end:
1053                    tag = ""
1054                else:
1055                    tag = self._extract_string(
1056                        end,
1057                        raw_string=True,
1058                        raise_unmatched=not self.heredoc_tag_is_identifier,
1059                    )
1060
1061                if (
1062                    tag
1063                    and self.heredoc_tag_is_identifier
1064                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1065                ):
1066                    if not self._end:
1067                        self._advance(-1)
1068
1069                    self._advance(-len(tag))
1070                    self._add(self.heredoc_string_alternative)
1071                    return True
1072
1073                end = f"{start}{tag}{end}"
1074        else:
1075            return False
1076
1077        self._advance(len(start))
1078        text = self._extract_string(
1079            end,
1080            escapes=(
1081                self.byte_string_escapes
1082                if token_type == TokenType.BYTE_STRING
1083                else self.string_escapes
1084            ),
1085            raw_string=token_type == TokenType.RAW_STRING,
1086        )
1087
1088        if base and text:
1089            try:
1090                int(text, base)
1091            except Exception:
1092                raise TokenError(
1093                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1094                )
1095
1096        self._add(token_type, text)
1097        return True
1098
1099    def _scan_identifier(self, identifier_end: str) -> None:
1100        self._advance()
1101        text = self._extract_string(
1102            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1103        )
1104        self._add(TokenType.IDENTIFIER, text)
1105
1106    def _scan_var(self) -> None:
1107        var_single_tokens = self.var_single_tokens
1108        single_tokens = self.single_tokens
1109
1110        while True:
1111            peek = self._peek
1112            if not peek or peek.isspace():
1113                break
1114            if peek not in var_single_tokens and peek in single_tokens:
1115                break
1116            self._advance(alnum=True)
1117
1118        self._add(
1119            TokenType.VAR
1120            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1121            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1122        )
1123
1124    def _extract_string(
1125        self,
1126        delimiter: str,
1127        escapes: set[str] | None = None,
1128        raw_string: bool = False,
1129        raise_unmatched: bool = True,
1130    ) -> str:
1131        text = ""
1132        delim_size = len(delimiter)
1133        escapes = self.string_escapes if escapes is None else escapes
1134        unescaped_sequences = self.unescaped_sequences
1135        escape_follow_chars = self.escape_follow_chars
1136        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
1137        quotes = self.quotes
1138        sql = self.sql
1139
1140        # use str.find() when the string is simple... no \ or other escapes
1141        if delim_size == 1:
1142            pos = self._current - 1
1143            end = sql.find(delimiter, pos)
1144
1145            if (
1146                # the closing delimiter was found
1147                end != -1
1148                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
1149                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
1150                # no backslash in the string that would need escape processing
1151                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
1152            ):
1153                newlines = sql.count("\n", pos, end)
1154                if newlines:
1155                    self._line += newlines
1156                    self._col = end - sql.rfind("\n", pos, end)
1157                else:
1158                    self._col += end - pos
1159
1160                self._current = end + 1
1161                self._end = self._current >= self.size
1162                self._char = sql[end]
1163                self._peek = "" if self._end else sql[self._current]
1164                return sql[pos:end]
1165
1166        while True:
1167            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
1168                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
1169                if unescaped_sequence:
1170                    self._advance(2)
1171                    text += unescaped_sequence
1172                    continue
1173
1174            is_valid_custom_escape = (
1175                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
1176            )
1177
1178            if (
1179                (string_escapes_allowed_in_raw_strings or not raw_string)
1180                and self._char in escapes
1181                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
1182                and (self._char not in quotes or self._char == self._peek)
1183            ):
1184                if self._peek == delimiter:
1185                    text += self._peek
1186                elif is_valid_custom_escape and self._char != self._peek:
1187                    text += self._peek
1188                else:
1189                    text += self._char + self._peek
1190
1191                if self._current + 1 < self.size:
1192                    self._advance(2)
1193                else:
1194                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
1195            else:
1196                if self._chars(delim_size) == delimiter:
1197                    if delim_size > 1:
1198                        self._advance(delim_size - 1)
1199                    break
1200
1201                if self._end:
1202                    if not raise_unmatched:
1203                        return text + self._char
1204
1205                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
1206
1207                current = self._current - 1
1208                self._advance(alnum=True)
1209                text += sql[current : self._current - 1]
1210
1211        return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], has_bit_strings: bool, has_hex_strings: bool, numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
581    def __init__(
582        self,
583        single_tokens: dict[str, TokenType],
584        keywords: dict[str, TokenType],
585        quotes: dict[str, str],
586        format_strings: dict[str, tuple[str, TokenType]],
587        identifiers: dict[str, str],
588        comments: dict[str, str | None],
589        string_escapes: set[str],
590        byte_string_escapes: set[str],
591        identifier_escapes: set[str],
592        escape_follow_chars: set[str],
593        commands: set[TokenType],
594        command_prefix_tokens: set[TokenType],
595        nested_comments: bool,
596        hint_start: str,
597        tokens_preceding_hint: set[TokenType],
598        has_bit_strings: bool,
599        has_hex_strings: bool,
600        numeric_literals: dict[str, str],
601        var_single_tokens: set[str],
602        string_escapes_allowed_in_raw_strings: bool,
603        heredoc_tag_is_identifier: bool,
604        heredoc_string_alternative: TokenType,
605        keyword_trie: dict,
606        numbers_can_be_underscore_separated: bool,
607        numbers_can_have_decimals: bool,
608        identifiers_can_start_with_digit: bool,
609        unescaped_sequences: dict[str, str],
610    ) -> None:
611        self.single_tokens = single_tokens
612        self.keywords = keywords
613        self.quotes = quotes
614        self.format_strings = format_strings
615        self.identifiers = identifiers
616        self.comments = comments
617        self.string_escapes = string_escapes
618        self.byte_string_escapes = byte_string_escapes
619        self.identifier_escapes = identifier_escapes
620        self.escape_follow_chars = escape_follow_chars
621        self.commands = commands
622        self.command_prefix_tokens = command_prefix_tokens
623        self.nested_comments = nested_comments
624        self.hint_start = hint_start
625        self.tokens_preceding_hint = tokens_preceding_hint
626        self.has_bit_strings = has_bit_strings
627        self.has_hex_strings = has_hex_strings
628        self.numeric_literals = numeric_literals
629        self.var_single_tokens = var_single_tokens
630        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
631        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
632        self.heredoc_string_alternative = heredoc_string_alternative
633        self.keyword_trie = keyword_trie
634        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
635        self.numbers_can_have_decimals = numbers_can_have_decimals
636        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
637        self.unescaped_sequences = unescaped_sequences
638        self.sql = ""
639        self.size = 0
640        self.tokens: list[Token] = []
641        self._start = 0
642        self._current = 0
643        self._line = 1
644        self._col = 0
645        self._comments: list[str] = []
646        self._char = ""
647        self._end = False
648        self._peek = ""
649        self._prev_token_line = -1
single_tokens
keywords
quotes
format_strings
identifiers
comments
string_escapes
byte_string_escapes
identifier_escapes
escape_follow_chars
commands
command_prefix_tokens
nested_comments
hint_start
tokens_preceding_hint
has_bit_strings
has_hex_strings
numeric_literals
var_single_tokens
string_escapes_allowed_in_raw_strings
heredoc_tag_is_identifier
heredoc_string_alternative
keyword_trie
numbers_can_be_underscore_separated
numbers_can_have_decimals
identifiers_can_start_with_digit
unescaped_sequences
sql
size
tokens: list[Token]
def reset(self) -> None:
651    def reset(self) -> None:
652        self.sql = ""
653        self.size = 0
654        self.tokens = []
655        self._start = 0
656        self._current = 0
657        self._line = 1
658        self._col = 0
659        self._comments = []
660        self._char = ""
661        self._end = False
662        self._peek = ""
663        self._prev_token_line = -1
def tokenize(self, sql: str) -> list[Token]:
665    def tokenize(self, sql: str) -> list[Token]:
666        """Returns a list of tokens corresponding to the SQL string `sql`."""
667        self.reset()
668        self.sql = sql
669        self.size = len(sql)
670
671        try:
672            self._scan()
673        except Exception as e:
674            start = max(self._current - 50, 0)
675            end = min(self._current + 50, self.size - 1)
676            context = self.sql[start:end]
677            raise TokenError(f"Error tokenizing '{context}'") from e
678
679        return self.tokens

Returns a list of tokens corresponding to the SQL string `sql`.