Edit on GitHub

sqlglot.tokenizer_core

   1from __future__ import annotations
   2
   3import typing as t
   4from enum import IntEnum, auto
   5
   6from sqlglot.errors import TokenError
   7
# Precomputed lookup tables: a dict/frozenset lookup is faster than calling
# .upper() and .isdigit() on every character.
# Maps each ASCII lowercase letter (code points 97-122) to its uppercase form.
_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
# The ten ASCII decimal digit characters.
_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")
  11
  12
class TokenType(IntEnum):
    """Enumeration of every token kind used by the tokenizer.

    Members are assigned consecutive integer values by `auto()`, so a member's
    numeric value depends on its position in this declaration. Do not reorder or
    insert members if the integer values are relied upon anywhere.
    """

    # symbols and operators (the group also holds a few parameter/session markers)
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    # literals, identifiers and object kinds
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the qualified member name, e.g. ``TokenType.SELECT``."""
        return f"TokenType.{self.name}"
 469
 470
 471class Token:
 472    # mypyc doesn't expose slots
 473    _attrs: t.ClassVar[tuple[str, ...]] = (
 474        "token_type",
 475        "text",
 476        "line",
 477        "col",
 478        "start",
 479        "end",
 480        "comments",
 481    )
 482    __slots__ = _attrs
 483
 484    @classmethod
 485    def number(cls, number: int) -> Token:
 486        """Returns a NUMBER token with `number` as its text."""
 487        return cls(TokenType.NUMBER, str(number))
 488
 489    @classmethod
 490    def string(cls, string: str) -> Token:
 491        """Returns a STRING token with `string` as its text."""
 492        return cls(TokenType.STRING, string)
 493
 494    @classmethod
 495    def identifier(cls, identifier: str) -> Token:
 496        """Returns an IDENTIFIER token with `identifier` as its text."""
 497        return cls(TokenType.IDENTIFIER, identifier)
 498
 499    @classmethod
 500    def var(cls, var: str) -> Token:
 501        """Returns an VAR token with `var` as its text."""
 502        return cls(TokenType.VAR, var)
 503
 504    def __init__(
 505        self,
 506        token_type: TokenType,
 507        text: str,
 508        line: int = 1,
 509        col: int = 1,
 510        start: int = 0,
 511        end: int = 0,
 512        comments: list[str] | None = None,
 513    ) -> None:
 514        self.token_type = token_type
 515        self.text = text
 516        self.line = line
 517        self.col = col
 518        self.start = start
 519        self.end = end
 520        self.comments = [] if comments is None else comments
 521
 522    def __bool__(self) -> bool:
 523        return self.token_type != TokenType.SENTINEL
 524
 525    def __repr__(self) -> str:
 526        attributes = ", ".join(
 527            f"{k}: TokenType.{self.token_type.name}"
 528            if k == "token_type"
 529            else f"{k}: {getattr(self, k)}"
 530            for k in self._attrs
 531        )
 532        return f"<Token {attributes}>"
 533
 534
 535class TokenizerCore:
    # Every instance attribute must be listed here; the names mirror what
    # __init__ and reset() assign.
    __slots__ = (
        # per-run scanning state (re-initialized by reset())
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        # configuration passed to __init__
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )
 577
 578    def __init__(
 579        self,
 580        single_tokens: dict[str, TokenType],
 581        keywords: dict[str, TokenType],
 582        quotes: dict[str, str],
 583        format_strings: dict[str, tuple[str, TokenType]],
 584        identifiers: dict[str, str],
 585        comments: dict[str, str | None],
 586        string_escapes: set[str],
 587        byte_string_escapes: set[str],
 588        identifier_escapes: set[str],
 589        escape_follow_chars: set[str],
 590        commands: set[TokenType],
 591        command_prefix_tokens: set[TokenType],
 592        nested_comments: bool,
 593        hint_start: str,
 594        tokens_preceding_hint: set[TokenType],
 595        has_bit_strings: bool,
 596        has_hex_strings: bool,
 597        numeric_literals: dict[str, str],
 598        var_single_tokens: set[str],
 599        string_escapes_allowed_in_raw_strings: bool,
 600        heredoc_tag_is_identifier: bool,
 601        heredoc_string_alternative: TokenType,
 602        keyword_trie: dict,
 603        numbers_can_be_underscore_separated: bool,
 604        numbers_can_have_decimals: bool,
 605        identifiers_can_start_with_digit: bool,
 606        unescaped_sequences: dict[str, str],
 607    ) -> None:
 608        self.single_tokens = single_tokens
 609        self.keywords = keywords
 610        self.quotes = quotes
 611        self.format_strings = format_strings
 612        self.identifiers = identifiers
 613        self.comments = comments
 614        self.string_escapes = string_escapes
 615        self.byte_string_escapes = byte_string_escapes
 616        self.identifier_escapes = identifier_escapes
 617        self.escape_follow_chars = escape_follow_chars
 618        self.commands = commands
 619        self.command_prefix_tokens = command_prefix_tokens
 620        self.nested_comments = nested_comments
 621        self.hint_start = hint_start
 622        self.tokens_preceding_hint = tokens_preceding_hint
 623        self.has_bit_strings = has_bit_strings
 624        self.has_hex_strings = has_hex_strings
 625        self.numeric_literals = numeric_literals
 626        self.var_single_tokens = var_single_tokens
 627        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 628        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 629        self.heredoc_string_alternative = heredoc_string_alternative
 630        self.keyword_trie = keyword_trie
 631        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 632        self.numbers_can_have_decimals = numbers_can_have_decimals
 633        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 634        self.unescaped_sequences = unescaped_sequences
 635        self.sql = ""
 636        self.size = 0
 637        self.tokens: list[Token] = []
 638        self._start = 0
 639        self._current = 0
 640        self._line = 1
 641        self._col = 0
 642        self._comments: list[str] = []
 643        self._char = ""
 644        self._end = False
 645        self._peek = ""
 646        self._prev_token_line = -1
 647
 648    def reset(self) -> None:
 649        self.sql = ""
 650        self.size = 0
 651        self.tokens = []
 652        self._start = 0
 653        self._current = 0
 654        self._line = 1
 655        self._col = 0
 656        self._comments = []
 657        self._char = ""
 658        self._end = False
 659        self._peek = ""
 660        self._prev_token_line = -1
 661
 662    def tokenize(self, sql: str) -> list[Token]:
 663        """Returns a list of tokens corresponding to the SQL string `sql`."""
 664        self.reset()
 665        self.sql = sql
 666        self.size = len(sql)
 667
 668        try:
 669            self._scan()
 670        except Exception as e:
 671            start = max(self._current - 50, 0)
 672            end = min(self._current + 50, self.size - 1)
 673            context = self.sql[start:end]
 674            raise TokenError(f"Error tokenizing '{context}'") from e
 675
 676        return self.tokens
 677
 678    def _scan(self, check_semicolon: bool = False) -> None:
 679        identifiers = self.identifiers
 680        digit_chars = _DIGIT_CHARS
 681
 682        while self.size and not self._end:
 683            current = self._current
 684
 685            # Skip spaces here rather than iteratively calling advance() for performance reasons
 686            while current < self.size:
 687                char = self.sql[current]
 688
 689                if char == " " or char == "\t":
 690                    current += 1
 691                else:
 692                    break
 693
 694            offset = current - self._current if current > self._current else 1
 695
 696            self._start = current
 697            self._advance(offset)
 698
 699            if not self._char.isspace():
 700                if self._char in digit_chars:
 701                    self._scan_number()
 702                elif self._char in identifiers:
 703                    self._scan_identifier(identifiers[self._char])
 704                else:
 705                    self._scan_keywords()
 706
 707            if check_semicolon and self._peek == ";":
 708                break
 709
 710        if self.tokens and self._comments:
 711            self.tokens[-1].comments.extend(self._comments)
 712
 713    def _chars(self, size: int) -> str:
 714        if size == 1:
 715            return self._char
 716
 717        start = self._current - 1
 718        end = start + size
 719
 720        return self.sql[start:end] if end <= self.size else ""
 721
 722    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 723        char = self._char
 724
 725        if char == "\n" or char == "\r":
 726            # Ensures we don't count an extra line if we get a \r\n line break sequence
 727            if not (char == "\r" and self._peek == "\n"):
 728                self._col = i
 729                self._line += 1
 730        else:
 731            self._col += i
 732
 733        self._current += i
 734        sql = self.sql
 735        size = self.size
 736        self._end = self._current >= size
 737        self._char = sql[self._current - 1]
 738        self._peek = "" if self._end else sql[self._current]
 739
 740        if alnum and self._char.isalnum():
 741            # Cache to local variables instead of attributes for better performance
 742            _col = self._col
 743            _current = self._current
 744            _end = self._end
 745            _peek = self._peek
 746
 747            while _peek.isalnum():
 748                _col += 1
 749                _current += 1
 750                _end = _current >= size
 751                _peek = "" if _end else sql[_current]
 752
 753            self._col = _col
 754            self._current = _current
 755            self._end = _end
 756            self._peek = _peek
 757            self._char = sql[_current - 1]
 758
 759    @property
 760    def _text(self) -> str:
 761        return self.sql[self._start : self._current]
 762
 763    def _add(self, token_type: TokenType, text: str | None = None) -> None:
 764        self._prev_token_line = self._line
 765
 766        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 767            self.tokens[-1].comments.extend(self._comments)
 768            self._comments = []
 769
 770        if text is None:
 771            text = self.sql[self._start : self._current]
 772
 773        self.tokens.append(
 774            Token(
 775                token_type,
 776                text=text,
 777                line=self._line,
 778                col=self._col,
 779                start=self._start,
 780                end=self._current - 1,
 781                comments=self._comments,
 782            )
 783        )
 784        self._comments = []
 785
 786        # If we have either a semicolon or a begin token before the command's token, we'll parse
 787        # whatever follows the command's token as a string
 788        if (
 789            token_type in self.commands
 790            and self._peek != ";"
 791            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 792        ):
 793            start = self._current
 794            tokens = len(self.tokens)
 795            self._scan(check_semicolon=True)
 796            self.tokens = self.tokens[:tokens]
 797            text = self.sql[start : self._current].strip()
 798            if text:
 799                self._add(TokenType.STRING, text)
 800
    def _scan_keywords(self) -> None:
        """Scan a keyword, string, comment, single-character token or variable.

        Walks `self.keyword_trie` starting at the current character to find the
        longest matching entry, then dispatches in this order: string, comment,
        keyword, single-character token and finally `_scan_var`.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0  # number of input characters looked at beyond the current one
        word = None  # longest complete trie entry matched so far
        chars = self._char  # candidate text, with whitespace runs collapsed
        char = chars
        prev_space = False
        skip = False  # True while inside a run of consecutive whitespace
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                # The trie is walked with ASCII letters uppercased
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                # A node containing the key 0 marks the end of a complete entry
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    # Any whitespace character is treated as a single space
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    # Further whitespace in the same run is consumed without extending `chars`
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Only accept the keyword if the scan stopped after whitespace, saw a
            # single-character token, or reached the end of the input
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()
 860
 861    def _scan_comment(self, comment_start: str) -> bool:
 862        if comment_start not in self.comments:
 863            return False
 864
 865        comment_start_line = self._line
 866        comment_start_size = len(comment_start)
 867        comment_end = self.comments[comment_start]
 868
 869        if comment_end:
 870            # Skip the comment's start delimiter
 871            self._advance(comment_start_size)
 872
 873            comment_count = 1
 874            comment_end_size = len(comment_end)
 875            nested_comments = self.nested_comments
 876
 877            while not self._end:
 878                if self._chars(comment_end_size) == comment_end:
 879                    comment_count -= 1
 880                    if not comment_count:
 881                        break
 882
 883                self._advance(alnum=True)
 884
 885                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
 886                if (
 887                    nested_comments
 888                    and not self._end
 889                    and self._chars(comment_end_size) == comment_start
 890                ):
 891                    self._advance(comment_start_size)
 892                    comment_count += 1
 893
 894            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 895            self._advance(comment_end_size - 1)
 896        else:
 897            _peek = self._peek
 898            while not self._end and _peek != "\n" and _peek != "\r":
 899                self._advance(alnum=True)
 900                _peek = self._peek
 901            self._comments.append(self._text[comment_start_size:])
 902
 903        if (
 904            comment_start == self.hint_start
 905            and self.tokens
 906            and self.tokens[-1].token_type in self.tokens_preceding_hint
 907        ):
 908            self._add(TokenType.HINT)
 909
 910        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 911        # Multiple consecutive comments are preserved by appending them to the current comments list.
 912        if comment_start_line == self._prev_token_line:
 913            self.tokens[-1].comments.extend(self._comments)
 914            self._comments = []
 915            self._prev_token_line = self._line
 916
 917        return True
 918
    def _scan_number(self) -> None:
        """Scan a numeric literal starting at the current digit and add its token(s).

        Handles `0b` / `0x` prefixes, a decimal point, an exponent with an optional
        sign, underscore separators and trailing literal suffixes. A suffix found
        in `numeric_literals` is emitted as NUMBER, DCOLON and a type token.
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            # Without bit/hex string support only the leading "0" becomes a NUMBER token
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False  # whether a decimal point has been consumed
        scientific = 0  # 0: no exponent yet, 1: "E" consumed, 2: exponent sign consumed
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                # The dot is left unconsumed right after a PARAMETER token or when
                # decimals are disabled
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Remember the numeric part before consuming the trailing suffix
                number_text = self._text

                # Consume up to the next whitespace or single-character token
                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Not a known suffix: rewind so the suffix is scanned separately
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)
1000
1001    def _scan_bits(self) -> None:
1002        self._advance()
1003        value = self._extract_value()
1004        try:
1005            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1006            int(value, 2)
1007            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1008        except ValueError:
1009            self._add(TokenType.IDENTIFIER)
1010
1011    def _scan_hex(self) -> None:
1012        self._advance()
1013        value = self._extract_value()
1014        try:
1015            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1016            int(value, 16)
1017            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1018        except ValueError:
1019            self._add(TokenType.IDENTIFIER)
1020
1021    def _extract_value(self) -> str:
1022        single_tokens = self.single_tokens
1023
1024        while True:
1025            char = self._peek.strip()
1026            if char and char not in single_tokens:
1027                self._advance(alnum=True)
1028            else:
1029                break
1030
1031        return self._text
1032
1033    def _scan_string(self, start: str) -> bool:
1034        base = None
1035        token_type = TokenType.STRING
1036
1037        if start in self.quotes:
1038            end = self.quotes[start]
1039        elif start in self.format_strings:
1040            end, token_type = self.format_strings[start]
1041
1042            if token_type == TokenType.HEX_STRING:
1043                base = 16
1044            elif token_type == TokenType.BIT_STRING:
1045                base = 2
1046            elif token_type == TokenType.HEREDOC_STRING:
1047                self._advance()
1048
1049                if self._char == end:
1050                    tag = ""
1051                else:
1052                    tag = self._extract_string(
1053                        end,
1054                        raw_string=True,
1055                        raise_unmatched=not self.heredoc_tag_is_identifier,
1056                    )
1057
1058                if (
1059                    tag
1060                    and self.heredoc_tag_is_identifier
1061                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1062                ):
1063                    if not self._end:
1064                        self._advance(-1)
1065
1066                    self._advance(-len(tag))
1067                    self._add(self.heredoc_string_alternative)
1068                    return True
1069
1070                end = f"{start}{tag}{end}"
1071        else:
1072            return False
1073
1074        self._advance(len(start))
1075        text = self._extract_string(
1076            end,
1077            escapes=(
1078                self.byte_string_escapes
1079                if token_type == TokenType.BYTE_STRING
1080                else self.string_escapes
1081            ),
1082            raw_string=token_type == TokenType.RAW_STRING,
1083        )
1084
1085        if base and text:
1086            try:
1087                int(text, base)
1088            except Exception:
1089                raise TokenError(
1090                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1091                )
1092
1093        self._add(token_type, text)
1094        return True
1095
1096    def _scan_identifier(self, identifier_end: str) -> None:
1097        self._advance()
1098        text = self._extract_string(
1099            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1100        )
1101        self._add(TokenType.IDENTIFIER, text)
1102
1103    def _scan_var(self) -> None:
1104        var_single_tokens = self.var_single_tokens
1105        single_tokens = self.single_tokens
1106
1107        while True:
1108            peek = self._peek
1109            if not peek or peek.isspace():
1110                break
1111            if peek not in var_single_tokens and peek in single_tokens:
1112                break
1113            self._advance(alnum=True)
1114
1115        self._add(
1116            TokenType.VAR
1117            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1118            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1119        )
1120
    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Read text up to the closing `delimiter` and return it without the delimiter.

        Args:
            delimiter: The closing delimiter; it may be longer than one character.
            escapes: Characters treated as escape characters. Defaults to
                `self.string_escapes` when None.
            raw_string: When True, `unescaped_sequences` are not substituted, and
                escape handling only applies if
                `self.string_escapes_allowed_in_raw_strings` is set.
            raise_unmatched: When False, reaching the end of the input without
                finding the delimiter returns the text collected so far plus the
                current character instead of raising.

        Returns:
            The extracted text with escapes resolved.

        Raises:
            TokenError: If the input ends before the delimiter is found (subject to
                `raise_unmatched`), or directly after an escape sequence.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        # instance attributes are copied into locals before the scanning loop
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            # NOTE(review): presumably `_current - 1` is the index of `_char`,
            # i.e. the first character of the string body - confirm against _advance
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # update the line/column counters for the span that is being skipped
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    # the column restarts from the last newline inside the string
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                # move the cursor onto the closing delimiter and refresh the lookahead state
                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        # slow path: walk the input and resolve escapes as they are encountered
        while True:
            # replace a known two-character sequence (escape char + next char) with its mapped value
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            # when `escape_follow_chars` is configured, a backslash followed by a character
            # outside that collection also qualifies as an escape
            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                # a quote character only acts as an escape when it is doubled
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    # escaped delimiter: keep only the delimiter
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    # custom escape: drop the backslash and keep the escaped character
                    text += self._peek
                else:
                    # otherwise both characters are kept verbatim
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    # the input ends right after the escape, so the string is never closed
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                # NOTE(review): `_chars(n)` presumably returns the n characters starting
                # at the current one - confirm against its definition
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        # step over the remaining characters of a multi-character delimiter
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # _advance(alnum=True) may move more than one position, so the consumed
                # span is sliced out of `sql` rather than appending a single character
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """Integer-valued enumeration of the token kinds used by the tokenizer.

    Every member is assigned with `auto()`, so its numeric value is determined by
    its position in this class body, starting at 1 for the first member. Inserting,
    removing or reordering members therefore shifts the value of every member
    declared after that point.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LLRR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the member as `TokenType.<NAME>`."""
        return f"TokenType.{self.name}"

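An enumeration of the token kinds used by the tokenizer. Each member's integer value follows its declaration order, starting at 1.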

L_PAREN = <TokenType.L_PAREN: 1>
R_PAREN = <TokenType.R_PAREN: 2>
L_BRACKET = <TokenType.L_BRACKET: 3>
R_BRACKET = <TokenType.R_BRACKET: 4>
L_BRACE = <TokenType.L_BRACE: 5>
R_BRACE = <TokenType.R_BRACE: 6>
COMMA = <TokenType.COMMA: 7>
DOT = <TokenType.DOT: 8>
DASH = <TokenType.DASH: 9>
PLUS = <TokenType.PLUS: 10>
COLON = <TokenType.COLON: 11>
DOTCOLON = <TokenType.DOTCOLON: 12>
DOTCARET = <TokenType.DOTCARET: 13>
DCOLON = <TokenType.DCOLON: 14>
DCOLONDOLLAR = <TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT = <TokenType.DCOLONPERCENT: 16>
DCOLONQMARK = <TokenType.DCOLONQMARK: 17>
DQMARK = <TokenType.DQMARK: 18>
SEMICOLON = <TokenType.SEMICOLON: 19>
STAR = <TokenType.STAR: 20>
BACKSLASH = <TokenType.BACKSLASH: 21>
SLASH = <TokenType.SLASH: 22>
LT = <TokenType.LT: 23>
LTE = <TokenType.LTE: 24>
GT = <TokenType.GT: 25>
GTE = <TokenType.GTE: 26>
NOT = <TokenType.NOT: 27>
EQ = <TokenType.EQ: 28>
NEQ = <TokenType.NEQ: 29>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 30>
COLON_EQ = <TokenType.COLON_EQ: 31>
COLON_GT = <TokenType.COLON_GT: 32>
NCOLON_GT = <TokenType.NCOLON_GT: 33>
AND = <TokenType.AND: 34>
OR = <TokenType.OR: 35>
AMP = <TokenType.AMP: 36>
DPIPE = <TokenType.DPIPE: 37>
PIPE_GT = <TokenType.PIPE_GT: 38>
PIPE = <TokenType.PIPE: 39>
PIPE_SLASH = <TokenType.PIPE_SLASH: 40>
DPIPE_SLASH = <TokenType.DPIPE_SLASH: 41>
CARET = <TokenType.CARET: 42>
CARET_AT = <TokenType.CARET_AT: 43>
TILDE = <TokenType.TILDE: 44>
ARROW = <TokenType.ARROW: 45>
DARROW = <TokenType.DARROW: 46>
FARROW = <TokenType.FARROW: 47>
HASH = <TokenType.HASH: 48>
HASH_ARROW = <TokenType.HASH_ARROW: 49>
DHASH_ARROW = <TokenType.DHASH_ARROW: 50>
LR_ARROW = <TokenType.LR_ARROW: 51>
LLRR_ARROW = <TokenType.LLRR_ARROW: 52>
DAT = <TokenType.DAT: 53>
LT_AT = <TokenType.LT_AT: 54>
AT_GT = <TokenType.AT_GT: 55>
DOLLAR = <TokenType.DOLLAR: 56>
PARAMETER = <TokenType.PARAMETER: 57>
SESSION = <TokenType.SESSION: 58>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 59>
SESSION_USER = <TokenType.SESSION_USER: 60>
DAMP = <TokenType.DAMP: 61>
AMP_LT = <TokenType.AMP_LT: 62>
AMP_GT = <TokenType.AMP_GT: 63>
ADJACENT = <TokenType.ADJACENT: 64>
XOR = <TokenType.XOR: 65>
DSTAR = <TokenType.DSTAR: 66>
QMARK_AMP = <TokenType.QMARK_AMP: 67>
QMARK_PIPE = <TokenType.QMARK_PIPE: 68>
HASH_DASH = <TokenType.HASH_DASH: 69>
EXCLAMATION = <TokenType.EXCLAMATION: 70>
URI_START = <TokenType.URI_START: 71>
BLOCK_START = <TokenType.BLOCK_START: 72>
BLOCK_END = <TokenType.BLOCK_END: 73>
SPACE = <TokenType.SPACE: 74>
BREAK = <TokenType.BREAK: 75>
STRING = <TokenType.STRING: 76>
NUMBER = <TokenType.NUMBER: 77>
IDENTIFIER = <TokenType.IDENTIFIER: 78>
DATABASE = <TokenType.DATABASE: 79>
COLUMN = <TokenType.COLUMN: 80>
COLUMN_DEF = <TokenType.COLUMN_DEF: 81>
SCHEMA = <TokenType.SCHEMA: 82>
TABLE = <TokenType.TABLE: 83>
WAREHOUSE = <TokenType.WAREHOUSE: 84>
STAGE = <TokenType.STAGE: 85>
STREAM = <TokenType.STREAM: 86>
STREAMLIT = <TokenType.STREAMLIT: 87>
VAR = <TokenType.VAR: 88>
BIT_STRING = <TokenType.BIT_STRING: 89>
HEX_STRING = <TokenType.HEX_STRING: 90>
BYTE_STRING = <TokenType.BYTE_STRING: 91>
NATIONAL_STRING = <TokenType.NATIONAL_STRING: 92>
RAW_STRING = <TokenType.RAW_STRING: 93>
HEREDOC_STRING = <TokenType.HEREDOC_STRING: 94>
UNICODE_STRING = <TokenType.UNICODE_STRING: 95>
BIT = <TokenType.BIT: 96>
BOOLEAN = <TokenType.BOOLEAN: 97>
TINYINT = <TokenType.TINYINT: 98>
UTINYINT = <TokenType.UTINYINT: 99>
SMALLINT = <TokenType.SMALLINT: 100>
USMALLINT = <TokenType.USMALLINT: 101>
MEDIUMINT = <TokenType.MEDIUMINT: 102>
UMEDIUMINT = <TokenType.UMEDIUMINT: 103>
INT = <TokenType.INT: 104>
UINT = <TokenType.UINT: 105>
BIGINT = <TokenType.BIGINT: 106>
UBIGINT = <TokenType.UBIGINT: 107>
BIGNUM = <TokenType.BIGNUM: 108>
INT128 = <TokenType.INT128: 109>
UINT128 = <TokenType.UINT128: 110>
INT256 = <TokenType.INT256: 111>
UINT256 = <TokenType.UINT256: 112>
FLOAT = <TokenType.FLOAT: 113>
DOUBLE = <TokenType.DOUBLE: 114>
UDOUBLE = <TokenType.UDOUBLE: 115>
DECIMAL = <TokenType.DECIMAL: 116>
DECIMAL32 = <TokenType.DECIMAL32: 117>
DECIMAL64 = <TokenType.DECIMAL64: 118>
DECIMAL128 = <TokenType.DECIMAL128: 119>
DECIMAL256 = <TokenType.DECIMAL256: 120>
DECFLOAT = <TokenType.DECFLOAT: 121>
UDECIMAL = <TokenType.UDECIMAL: 122>
BIGDECIMAL = <TokenType.BIGDECIMAL: 123>
CHAR = <TokenType.CHAR: 124>
NCHAR = <TokenType.NCHAR: 125>
VARCHAR = <TokenType.VARCHAR: 126>
NVARCHAR = <TokenType.NVARCHAR: 127>
BPCHAR = <TokenType.BPCHAR: 128>
TEXT = <TokenType.TEXT: 129>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 130>
LONGTEXT = <TokenType.LONGTEXT: 131>
BLOB = <TokenType.BLOB: 132>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 133>
LONGBLOB = <TokenType.LONGBLOB: 134>
TINYBLOB = <TokenType.TINYBLOB: 135>
TINYTEXT = <TokenType.TINYTEXT: 136>
NAME = <TokenType.NAME: 137>
BINARY = <TokenType.BINARY: 138>
VARBINARY = <TokenType.VARBINARY: 139>
JSON = <TokenType.JSON: 140>
JSONB = <TokenType.JSONB: 141>
TIME = <TokenType.TIME: 142>
TIMETZ = <TokenType.TIMETZ: 143>
TIME_NS = <TokenType.TIME_NS: 144>
TIMESTAMP = <TokenType.TIMESTAMP: 145>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 146>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 147>
TIMESTAMPNTZ = <TokenType.TIMESTAMPNTZ: 148>
TIMESTAMP_S = <TokenType.TIMESTAMP_S: 149>
TIMESTAMP_MS = <TokenType.TIMESTAMP_MS: 150>
TIMESTAMP_NS = <TokenType.TIMESTAMP_NS: 151>
DATETIME = <TokenType.DATETIME: 152>
DATETIME2 = <TokenType.DATETIME2: 153>
DATETIME64 = <TokenType.DATETIME64: 154>
SMALLDATETIME = <TokenType.SMALLDATETIME: 155>
DATE = <TokenType.DATE: 156>
DATE32 = <TokenType.DATE32: 157>
INT4RANGE = <TokenType.INT4RANGE: 158>
INT4MULTIRANGE = <TokenType.INT4MULTIRANGE: 159>
INT8RANGE = <TokenType.INT8RANGE: 160>
INT8MULTIRANGE = <TokenType.INT8MULTIRANGE: 161>
NUMRANGE = <TokenType.NUMRANGE: 162>
NUMMULTIRANGE = <TokenType.NUMMULTIRANGE: 163>
TSRANGE = <TokenType.TSRANGE: 164>
TSMULTIRANGE = <TokenType.TSMULTIRANGE: 165>
TSTZRANGE = <TokenType.TSTZRANGE: 166>
TSTZMULTIRANGE = <TokenType.TSTZMULTIRANGE: 167>
DATERANGE = <TokenType.DATERANGE: 168>
DATEMULTIRANGE = <TokenType.DATEMULTIRANGE: 169>
UUID = <TokenType.UUID: 170>
GEOGRAPHY = <TokenType.GEOGRAPHY: 171>
GEOGRAPHYPOINT = <TokenType.GEOGRAPHYPOINT: 172>
NULLABLE = <TokenType.NULLABLE: 173>
GEOMETRY = <TokenType.GEOMETRY: 174>
POINT = <TokenType.POINT: 175>
RING = <TokenType.RING: 176>
LINESTRING = <TokenType.LINESTRING: 177>
LOCALTIME = <TokenType.LOCALTIME: 178>
LOCALTIMESTAMP = <TokenType.LOCALTIMESTAMP: 179>
SYSTIMESTAMP = <TokenType.SYSTIMESTAMP: 180>
MULTILINESTRING = <TokenType.MULTILINESTRING: 181>
POLYGON = <TokenType.POLYGON: 182>
MULTIPOLYGON = <TokenType.MULTIPOLYGON: 183>
HLLSKETCH = <TokenType.HLLSKETCH: 184>
HSTORE = <TokenType.HSTORE: 185>
SUPER = <TokenType.SUPER: 186>
SERIAL = <TokenType.SERIAL: 187>
SMALLSERIAL = <TokenType.SMALLSERIAL: 188>
BIGSERIAL = <TokenType.BIGSERIAL: 189>
XML = <TokenType.XML: 190>
YEAR = <TokenType.YEAR: 191>
USERDEFINED = <TokenType.USERDEFINED: 192>
MONEY = <TokenType.MONEY: 193>
SMALLMONEY = <TokenType.SMALLMONEY: 194>
ROWVERSION = <TokenType.ROWVERSION: 195>
IMAGE = <TokenType.IMAGE: 196>
VARIANT = <TokenType.VARIANT: 197>
OBJECT = <TokenType.OBJECT: 198>
INET = <TokenType.INET: 199>
IPADDRESS = <TokenType.IPADDRESS: 200>
IPPREFIX = <TokenType.IPPREFIX: 201>
IPV4 = <TokenType.IPV4: 202>
IPV6 = <TokenType.IPV6: 203>
ENUM = <TokenType.ENUM: 204>
ENUM8 = <TokenType.ENUM8: 205>
ENUM16 = <TokenType.ENUM16: 206>
FIXEDSTRING = <TokenType.FIXEDSTRING: 207>
LOWCARDINALITY = <TokenType.LOWCARDINALITY: 208>
NESTED = <TokenType.NESTED: 209>
AGGREGATEFUNCTION = <TokenType.AGGREGATEFUNCTION: 210>
SIMPLEAGGREGATEFUNCTION = <TokenType.SIMPLEAGGREGATEFUNCTION: 211>
TDIGEST = <TokenType.TDIGEST: 212>
UNKNOWN = <TokenType.UNKNOWN: 213>
VECTOR = <TokenType.VECTOR: 214>
DYNAMIC = <TokenType.DYNAMIC: 215>
VOID = <TokenType.VOID: 216>
ALIAS = <TokenType.ALIAS: 217>
ALTER = <TokenType.ALTER: 218>
ALL = <TokenType.ALL: 219>
ANTI = <TokenType.ANTI: 220>
ANY = <TokenType.ANY: 221>
APPLY = <TokenType.APPLY: 222>
ARRAY = <TokenType.ARRAY: 223>
ASC = <TokenType.ASC: 224>
ASOF = <TokenType.ASOF: 225>
ATTACH = <TokenType.ATTACH: 226>
AUTO_INCREMENT = <TokenType.AUTO_INCREMENT: 227>
BEGIN = <TokenType.BEGIN: 228>
BETWEEN = <TokenType.BETWEEN: 229>
BULK_COLLECT_INTO = <TokenType.BULK_COLLECT_INTO: 230>
CACHE = <TokenType.CACHE: 231>
CASE = <TokenType.CASE: 232>
CHARACTER_SET = <TokenType.CHARACTER_SET: 233>
CLUSTER_BY = <TokenType.CLUSTER_BY: 234>
COLLATE = <TokenType.COLLATE: 235>
COMMAND = <TokenType.COMMAND: 236>
COMMENT = <TokenType.COMMENT: 237>
COMMIT = <TokenType.COMMIT: 238>
CONNECT_BY = <TokenType.CONNECT_BY: 239>
CONSTRAINT = <TokenType.CONSTRAINT: 240>
COPY = <TokenType.COPY: 241>
CREATE = <TokenType.CREATE: 242>
CROSS = <TokenType.CROSS: 243>
CUBE = <TokenType.CUBE: 244>
CURRENT_DATE = <TokenType.CURRENT_DATE: 245>
CURRENT_DATETIME = <TokenType.CURRENT_DATETIME: 246>
CURRENT_SCHEMA = <TokenType.CURRENT_SCHEMA: 247>
CURRENT_TIME = <TokenType.CURRENT_TIME: 248>
CURRENT_TIMESTAMP = <TokenType.CURRENT_TIMESTAMP: 249>
CURRENT_USER = <TokenType.CURRENT_USER: 250>
CURRENT_USER_ID = <TokenType.CURRENT_USER_ID: 251>
CURRENT_ROLE = <TokenType.CURRENT_ROLE: 252>
CURRENT_CATALOG = <TokenType.CURRENT_CATALOG: 253>
DECLARE = <TokenType.DECLARE: 254>
DEFAULT = <TokenType.DEFAULT: 255>
DELETE = <TokenType.DELETE: 256>
DESC = <TokenType.DESC: 257>
DESCRIBE = <TokenType.DESCRIBE: 258>
DETACH = <TokenType.DETACH: 259>
DICTIONARY = <TokenType.DICTIONARY: 260>
DISTINCT = <TokenType.DISTINCT: 261>
DISTRIBUTE_BY = <TokenType.DISTRIBUTE_BY: 262>
DIV = <TokenType.DIV: 263>
DROP = <TokenType.DROP: 264>
ELSE = <TokenType.ELSE: 265>
END = <TokenType.END: 266>
ESCAPE = <TokenType.ESCAPE: 267>
EXCEPT = <TokenType.EXCEPT: 268>
EXECUTE = <TokenType.EXECUTE: 269>
EXISTS = <TokenType.EXISTS: 270>
FALSE = <TokenType.FALSE: 271>
FETCH = <TokenType.FETCH: 272>
FILE = <TokenType.FILE: 273>
FILE_FORMAT = <TokenType.FILE_FORMAT: 274>
FILTER = <TokenType.FILTER: 275>
FINAL = <TokenType.FINAL: 276>
FIRST = <TokenType.FIRST: 277>
FOR = <TokenType.FOR: 278>
FORCE = <TokenType.FORCE: 279>
FOREIGN_KEY = <TokenType.FOREIGN_KEY: 280>
FORMAT = <TokenType.FORMAT: 281>
FROM = <TokenType.FROM: 282>
FULL = <TokenType.FULL: 283>
FUNCTION = <TokenType.FUNCTION: 284>
GET = <TokenType.GET: 285>
GLOB = <TokenType.GLOB: 286>
GLOBAL = <TokenType.GLOBAL: 287>
GRANT = <TokenType.GRANT: 288>
GROUP_BY = <TokenType.GROUP_BY: 289>
GROUPING_SETS = <TokenType.GROUPING_SETS: 290>
HAVING = <TokenType.HAVING: 291>
HINT = <TokenType.HINT: 292>
IGNORE = <TokenType.IGNORE: 293>
ILIKE = <TokenType.ILIKE: 294>
IN = <TokenType.IN: 295>
INDEX = <TokenType.INDEX: 296>
INDEXED_BY = <TokenType.INDEXED_BY: 297>
INNER = <TokenType.INNER: 298>
INSERT = <TokenType.INSERT: 299>
INSTALL = <TokenType.INSTALL: 300>
INTEGRATION = <TokenType.INTEGRATION: 301>
INTERSECT = <TokenType.INTERSECT: 302>
INTERVAL = <TokenType.INTERVAL: 303>
INTO = <TokenType.INTO: 304>
INTRODUCER = <TokenType.INTRODUCER: 305>
IRLIKE = <TokenType.IRLIKE: 306>
IS = <TokenType.IS: 307>
ISNULL = <TokenType.ISNULL: 308>
JOIN = <TokenType.JOIN: 309>
JOIN_MARKER = <TokenType.JOIN_MARKER: 310>
KEEP = <TokenType.KEEP: 311>
KEY = <TokenType.KEY: 312>
KILL = <TokenType.KILL: 313>
LANGUAGE = <TokenType.LANGUAGE: 314>
LATERAL = <TokenType.LATERAL: 315>
LEFT = <TokenType.LEFT: 316>
LIKE = <TokenType.LIKE: 317>
LIMIT = <TokenType.LIMIT: 318>
LIST = <TokenType.LIST: 319>
LOAD = <TokenType.LOAD: 320>
LOCK = <TokenType.LOCK: 321>
MAP = <TokenType.MAP: 322>
MATCH = <TokenType.MATCH: 323>
MATCH_CONDITION = <TokenType.MATCH_CONDITION: 324>
MATCH_RECOGNIZE = <TokenType.MATCH_RECOGNIZE: 325>
MEMBER_OF = <TokenType.MEMBER_OF: 326>
MERGE = <TokenType.MERGE: 327>
MOD = <TokenType.MOD: 328>
MODEL = <TokenType.MODEL: 329>
NATURAL = <TokenType.NATURAL: 330>
NEXT = <TokenType.NEXT: 331>
NOTHING = <TokenType.NOTHING: 332>
NOTNULL = <TokenType.NOTNULL: 333>
NULL = <TokenType.NULL: 334>
OBJECT_IDENTIFIER = <TokenType.OBJECT_IDENTIFIER: 335>
OFFSET = <TokenType.OFFSET: 336>
ON = <TokenType.ON: 337>
ONLY = <TokenType.ONLY: 338>
OPERATOR = <TokenType.OPERATOR: 339>
ORDER_BY = <TokenType.ORDER_BY: 340>
ORDER_SIBLINGS_BY = <TokenType.ORDER_SIBLINGS_BY: 341>
ORDERED = <TokenType.ORDERED: 342>
ORDINALITY = <TokenType.ORDINALITY: 343>
OUT = <TokenType.OUT: 344>
INOUT = <TokenType.INOUT: 345>
OUTER = <TokenType.OUTER: 346>
OVER = <TokenType.OVER: 347>
OVERLAPS = <TokenType.OVERLAPS: 348>
OVERWRITE = <TokenType.OVERWRITE: 349>
PACKAGE = <TokenType.PACKAGE: 350>
PARTITION = <TokenType.PARTITION: 351>
PARTITION_BY = <TokenType.PARTITION_BY: 352>
PERCENT = <TokenType.PERCENT: 353>
PIVOT = <TokenType.PIVOT: 354>
PLACEHOLDER = <TokenType.PLACEHOLDER: 355>
POLICY = <TokenType.POLICY: 356>
POOL = <TokenType.POOL: 357>
POSITIONAL = <TokenType.POSITIONAL: 358>
PRAGMA = <TokenType.PRAGMA: 359>
PREWHERE = <TokenType.PREWHERE: 360>
PRIMARY_KEY = <TokenType.PRIMARY_KEY: 361>
PROCEDURE = <TokenType.PROCEDURE: 362>
PROPERTIES = <TokenType.PROPERTIES: 363>
PSEUDO_TYPE = <TokenType.PSEUDO_TYPE: 364>
PUT = <TokenType.PUT: 365>
QUALIFY = <TokenType.QUALIFY: 366>
QUOTE = <TokenType.QUOTE: 367>
QDCOLON = <TokenType.QDCOLON: 368>
RANGE = <TokenType.RANGE: 369>
RECURSIVE = <TokenType.RECURSIVE: 370>
REFRESH = <TokenType.REFRESH: 371>
RENAME = <TokenType.RENAME: 372>
REPLACE = <TokenType.REPLACE: 373>
RETURNING = <TokenType.RETURNING: 374>
REVOKE = <TokenType.REVOKE: 375>
REFERENCES = <TokenType.REFERENCES: 376>
RIGHT = <TokenType.RIGHT: 377>
RLIKE = <TokenType.RLIKE: 378>
ROLE = <TokenType.ROLE: 379>
ROLLBACK = <TokenType.ROLLBACK: 380>
ROLLUP = <TokenType.ROLLUP: 381>
ROW = <TokenType.ROW: 382>
ROWS = <TokenType.ROWS: 383>
RULE = <TokenType.RULE: 384>
SELECT = <TokenType.SELECT: 385>
SEMI = <TokenType.SEMI: 386>
SEPARATOR = <TokenType.SEPARATOR: 387>
SEQUENCE = <TokenType.SEQUENCE: 388>
SERDE_PROPERTIES = <TokenType.SERDE_PROPERTIES: 389>
SET = <TokenType.SET: 390>
SETTINGS = <TokenType.SETTINGS: 391>
SHOW = <TokenType.SHOW: 392>
SIMILAR_TO = <TokenType.SIMILAR_TO: 393>
SOME = <TokenType.SOME: 394>
SORT_BY = <TokenType.SORT_BY: 395>
SOUNDS_LIKE = <TokenType.SOUNDS_LIKE: 396>
SQL_SECURITY = <TokenType.SQL_SECURITY: 397>
START_WITH = <TokenType.START_WITH: 398>
STORAGE_INTEGRATION = <TokenType.STORAGE_INTEGRATION: 399>
STRAIGHT_JOIN = <TokenType.STRAIGHT_JOIN: 400>
STRUCT = <TokenType.STRUCT: 401>
SUMMARIZE = <TokenType.SUMMARIZE: 402>
TABLE_SAMPLE = <TokenType.TABLE_SAMPLE: 403>
TAG = <TokenType.TAG: 404>
TEMPORARY = <TokenType.TEMPORARY: 405>
TOP = <TokenType.TOP: 406>
THEN = <TokenType.THEN: 407>
TRUE = <TokenType.TRUE: 408>
TRUNCATE = <TokenType.TRUNCATE: 409>
TRIGGER = <TokenType.TRIGGER: 410>
TYPE = <TokenType.TYPE: 411>
UNCACHE = <TokenType.UNCACHE: 412>
UNION = <TokenType.UNION: 413>
UNNEST = <TokenType.UNNEST: 414>
UNPIVOT = <TokenType.UNPIVOT: 415>
UPDATE = <TokenType.UPDATE: 416>
USE = <TokenType.USE: 417>
USING = <TokenType.USING: 418>
VALUES = <TokenType.VALUES: 419>
VARIADIC = <TokenType.VARIADIC: 420>
VIEW = <TokenType.VIEW: 421>
SEMANTIC_VIEW = <TokenType.SEMANTIC_VIEW: 422>
VOLATILE = <TokenType.VOLATILE: 423>
VOLUME = <TokenType.VOLUME: 424>
WHEN = <TokenType.WHEN: 425>
WHERE = <TokenType.WHERE: 426>
WINDOW = <TokenType.WINDOW: 427>
WITH = <TokenType.WITH: 428>
UNIQUE = <TokenType.UNIQUE: 429>
UTC_DATE = <TokenType.UTC_DATE: 430>
UTC_TIME = <TokenType.UTC_TIME: 431>
UTC_TIMESTAMP = <TokenType.UTC_TIMESTAMP: 432>
VERSION_SNAPSHOT = <TokenType.VERSION_SNAPSHOT: 433>
TIMESTAMP_SNAPSHOT = <TokenType.TIMESTAMP_SNAPSHOT: 434>
OPTION = <TokenType.OPTION: 435>
SINK = <TokenType.SINK: 436>
SOURCE = <TokenType.SOURCE: 437>
ANALYZE = <TokenType.ANALYZE: 438>
NAMESPACE = <TokenType.NAMESPACE: 439>
EXPORT = <TokenType.EXPORT: 440>
HIVE_TOKEN_STREAM = <TokenType.HIVE_TOKEN_STREAM: 441>
SENTINEL = <TokenType.SENTINEL: 442>
class Token:
472class Token:
473    # mypyc doesn't expose slots
474    _attrs: t.ClassVar[tuple[str, ...]] = (
475        "token_type",
476        "text",
477        "line",
478        "col",
479        "start",
480        "end",
481        "comments",
482    )
483    __slots__ = _attrs
484
485    @classmethod
486    def number(cls, number: int) -> Token:
487        """Returns a NUMBER token with `number` as its text."""
488        return cls(TokenType.NUMBER, str(number))
489
490    @classmethod
491    def string(cls, string: str) -> Token:
492        """Returns a STRING token with `string` as its text."""
493        return cls(TokenType.STRING, string)
494
495    @classmethod
496    def identifier(cls, identifier: str) -> Token:
497        """Returns an IDENTIFIER token with `identifier` as its text."""
498        return cls(TokenType.IDENTIFIER, identifier)
499
500    @classmethod
501    def var(cls, var: str) -> Token:
502        """Returns a VAR token with `var` as its text."""
503        return cls(TokenType.VAR, var)
504
505    def __init__(
506        self,
507        token_type: TokenType,
508        text: str,
509        line: int = 1,
510        col: int = 1,
511        start: int = 0,
512        end: int = 0,
513        comments: list[str] | None = None,
514    ) -> None:
515        self.token_type = token_type
516        self.text = text
517        self.line = line
518        self.col = col
519        self.start = start
520        self.end = end
521        self.comments = [] if comments is None else comments
522
523    def __bool__(self) -> bool:
524        return self.token_type != TokenType.SENTINEL
525
526    def __repr__(self) -> str:
527        attributes = ", ".join(
528            f"{k}: TokenType.{self.token_type.name}"
529            if k == "token_type"
530            else f"{k}: {getattr(self, k)}"
531            for k in self._attrs
532        )
533        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
505    def __init__(
506        self,
507        token_type: TokenType,
508        text: str,
509        line: int = 1,
510        col: int = 1,
511        start: int = 0,
512        end: int = 0,
513        comments: list[str] | None = None,
514    ) -> None:
515        self.token_type = token_type
516        self.text = text
517        self.line = line
518        self.col = col
519        self.start = start
520        self.end = end
521        self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
485    @classmethod
486    def number(cls, number: int) -> Token:
487        """Returns a NUMBER token with `number` as its text."""
488        return cls(TokenType.NUMBER, str(number))

Returns a NUMBER token with `number` as its text.

@classmethod
def string(cls, string: str) -> Token:
490    @classmethod
491    def string(cls, string: str) -> Token:
492        """Returns a STRING token with `string` as its text."""
493        return cls(TokenType.STRING, string)

Returns a STRING token with `string` as its text.

@classmethod
def identifier(cls, identifier: str) -> Token:
495    @classmethod
496    def identifier(cls, identifier: str) -> Token:
497        """Returns an IDENTIFIER token with `identifier` as its text."""
498        return cls(TokenType.IDENTIFIER, identifier)

Returns an IDENTIFIER token with `identifier` as its text.

@classmethod
def var(cls, var: str) -> Token:
500    @classmethod
501    def var(cls, var: str) -> Token:
502        """Returns a VAR token with `var` as its text."""
503        return cls(TokenType.VAR, var)

Returns a VAR token with `var` as its text.

token_type
text
line
col
start
end
comments
class TokenizerCore:
 536class TokenizerCore:
 537    __slots__ = (
 538        "sql",
 539        "size",
 540        "tokens",
 541        "_start",
 542        "_current",
 543        "_line",
 544        "_col",
 545        "_comments",
 546        "_char",
 547        "_end",
 548        "_peek",
 549        "_prev_token_line",
 550        "single_tokens",
 551        "keywords",
 552        "quotes",
 553        "format_strings",
 554        "identifiers",
 555        "comments",
 556        "string_escapes",
 557        "byte_string_escapes",
 558        "identifier_escapes",
 559        "escape_follow_chars",
 560        "commands",
 561        "command_prefix_tokens",
 562        "nested_comments",
 563        "hint_start",
 564        "tokens_preceding_hint",
 565        "has_bit_strings",
 566        "has_hex_strings",
 567        "numeric_literals",
 568        "var_single_tokens",
 569        "string_escapes_allowed_in_raw_strings",
 570        "heredoc_tag_is_identifier",
 571        "heredoc_string_alternative",
 572        "keyword_trie",
 573        "numbers_can_be_underscore_separated",
 574        "numbers_can_have_decimals",
 575        "identifiers_can_start_with_digit",
 576        "unescaped_sequences",
 577    )
 578
 579    def __init__(
 580        self,
 581        single_tokens: dict[str, TokenType],
 582        keywords: dict[str, TokenType],
 583        quotes: dict[str, str],
 584        format_strings: dict[str, tuple[str, TokenType]],
 585        identifiers: dict[str, str],
 586        comments: dict[str, str | None],
 587        string_escapes: set[str],
 588        byte_string_escapes: set[str],
 589        identifier_escapes: set[str],
 590        escape_follow_chars: set[str],
 591        commands: set[TokenType],
 592        command_prefix_tokens: set[TokenType],
 593        nested_comments: bool,
 594        hint_start: str,
 595        tokens_preceding_hint: set[TokenType],
 596        has_bit_strings: bool,
 597        has_hex_strings: bool,
 598        numeric_literals: dict[str, str],
 599        var_single_tokens: set[str],
 600        string_escapes_allowed_in_raw_strings: bool,
 601        heredoc_tag_is_identifier: bool,
 602        heredoc_string_alternative: TokenType,
 603        keyword_trie: dict,
 604        numbers_can_be_underscore_separated: bool,
 605        numbers_can_have_decimals: bool,
 606        identifiers_can_start_with_digit: bool,
 607        unescaped_sequences: dict[str, str],
 608    ) -> None:
 609        self.single_tokens = single_tokens
 610        self.keywords = keywords
 611        self.quotes = quotes
 612        self.format_strings = format_strings
 613        self.identifiers = identifiers
 614        self.comments = comments
 615        self.string_escapes = string_escapes
 616        self.byte_string_escapes = byte_string_escapes
 617        self.identifier_escapes = identifier_escapes
 618        self.escape_follow_chars = escape_follow_chars
 619        self.commands = commands
 620        self.command_prefix_tokens = command_prefix_tokens
 621        self.nested_comments = nested_comments
 622        self.hint_start = hint_start
 623        self.tokens_preceding_hint = tokens_preceding_hint
 624        self.has_bit_strings = has_bit_strings
 625        self.has_hex_strings = has_hex_strings
 626        self.numeric_literals = numeric_literals
 627        self.var_single_tokens = var_single_tokens
 628        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 629        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 630        self.heredoc_string_alternative = heredoc_string_alternative
 631        self.keyword_trie = keyword_trie
 632        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 633        self.numbers_can_have_decimals = numbers_can_have_decimals
 634        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 635        self.unescaped_sequences = unescaped_sequences
 636        self.sql = ""
 637        self.size = 0
 638        self.tokens: list[Token] = []
 639        self._start = 0
 640        self._current = 0
 641        self._line = 1
 642        self._col = 0
 643        self._comments: list[str] = []
 644        self._char = ""
 645        self._end = False
 646        self._peek = ""
 647        self._prev_token_line = -1
 648
 649    def reset(self) -> None:
 650        self.sql = ""
 651        self.size = 0
 652        self.tokens = []
 653        self._start = 0
 654        self._current = 0
 655        self._line = 1
 656        self._col = 0
 657        self._comments = []
 658        self._char = ""
 659        self._end = False
 660        self._peek = ""
 661        self._prev_token_line = -1
 662
 663    def tokenize(self, sql: str) -> list[Token]:
 664        """Returns a list of tokens corresponding to the SQL string `sql`."""
 665        self.reset()
 666        self.sql = sql
 667        self.size = len(sql)
 668
 669        try:
 670            self._scan()
 671        except Exception as e:
 672            start = max(self._current - 50, 0)
 673            end = min(self._current + 50, self.size - 1)
 674            context = self.sql[start:end]
 675            raise TokenError(f"Error tokenizing '{context}'") from e
 676
 677        return self.tokens
 678
 679    def _scan(self, check_semicolon: bool = False) -> None:
 680        identifiers = self.identifiers
 681        digit_chars = _DIGIT_CHARS
 682
 683        while self.size and not self._end:
 684            current = self._current
 685
 686            # Skip spaces here rather than iteratively calling advance() for performance reasons
 687            while current < self.size:
 688                char = self.sql[current]
 689
 690                if char == " " or char == "\t":
 691                    current += 1
 692                else:
 693                    break
 694
 695            offset = current - self._current if current > self._current else 1
 696
 697            self._start = current
 698            self._advance(offset)
 699
 700            if not self._char.isspace():
 701                if self._char in digit_chars:
 702                    self._scan_number()
 703                elif self._char in identifiers:
 704                    self._scan_identifier(identifiers[self._char])
 705                else:
 706                    self._scan_keywords()
 707
 708            if check_semicolon and self._peek == ";":
 709                break
 710
 711        if self.tokens and self._comments:
 712            self.tokens[-1].comments.extend(self._comments)
 713
 714    def _chars(self, size: int) -> str:
 715        if size == 1:
 716            return self._char
 717
 718        start = self._current - 1
 719        end = start + size
 720
 721        return self.sql[start:end] if end <= self.size else ""
 722
 723    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 724        char = self._char
 725
 726        if char == "\n" or char == "\r":
 727            # Ensures we don't count an extra line if we get a \r\n line break sequence
 728            if not (char == "\r" and self._peek == "\n"):
 729                self._col = i
 730                self._line += 1
 731        else:
 732            self._col += i
 733
 734        self._current += i
 735        sql = self.sql
 736        size = self.size
 737        self._end = self._current >= size
 738        self._char = sql[self._current - 1]
 739        self._peek = "" if self._end else sql[self._current]
 740
 741        if alnum and self._char.isalnum():
 742            # Cache to local variables instead of attributes for better performance
 743            _col = self._col
 744            _current = self._current
 745            _end = self._end
 746            _peek = self._peek
 747
 748            while _peek.isalnum():
 749                _col += 1
 750                _current += 1
 751                _end = _current >= size
 752                _peek = "" if _end else sql[_current]
 753
 754            self._col = _col
 755            self._current = _current
 756            self._end = _end
 757            self._peek = _peek
 758            self._char = sql[_current - 1]
 759
 760    @property
 761    def _text(self) -> str:
 762        return self.sql[self._start : self._current]
 763
 764    def _add(self, token_type: TokenType, text: str | None = None) -> None:
 765        self._prev_token_line = self._line
 766
 767        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 768            self.tokens[-1].comments.extend(self._comments)
 769            self._comments = []
 770
 771        if text is None:
 772            text = self.sql[self._start : self._current]
 773
 774        self.tokens.append(
 775            Token(
 776                token_type,
 777                text=text,
 778                line=self._line,
 779                col=self._col,
 780                start=self._start,
 781                end=self._current - 1,
 782                comments=self._comments,
 783            )
 784        )
 785        self._comments = []
 786
 787        # If we have either a semicolon or a begin token before the command's token, we'll parse
 788        # whatever follows the command's token as a string
 789        if (
 790            token_type in self.commands
 791            and self._peek != ";"
 792            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 793        ):
 794            start = self._current
 795            tokens = len(self.tokens)
 796            self._scan(check_semicolon=True)
 797            self.tokens = self.tokens[:tokens]
 798            text = self.sql[start : self._current].strip()
 799            if text:
 800                self._add(TokenType.STRING, text)
 801
 802    def _scan_keywords(self) -> None:
 803        sql = self.sql
 804        sql_size = self.size
 805        single_tokens = self.single_tokens
 806        char_upper = _CHAR_UPPER
 807        size = 0
 808        word = None
 809        chars = self._char
 810        char = chars
 811        prev_space = False
 812        skip = False
 813        trie = self.keyword_trie
 814        single_token = char in single_tokens
 815
 816        while chars:
 817            if not skip:
 818                sub = trie.get(char_upper.get(char, char))
 819                if sub is None:
 820                    break
 821                trie = sub
 822                if 0 in trie:
 823                    word = chars
 824
 825            end = self._current + size
 826            size += 1
 827
 828            if end < sql_size:
 829                char = sql[end]
 830                single_token = single_token or char in single_tokens
 831                is_space = char.isspace()
 832
 833                if not is_space or not prev_space:
 834                    if is_space:
 835                        char = " "
 836                    chars += char
 837                    prev_space = is_space
 838                    skip = False
 839                else:
 840                    skip = True
 841            else:
 842                char = ""
 843                break
 844
 845        if word:
 846            if self._scan_string(word):
 847                return
 848            if self._scan_comment(word):
 849                return
 850            if prev_space or single_token or not char:
 851                self._advance(size - 1)
 852                word = word.upper()
 853                self._add(self.keywords[word], text=word)
 854                return
 855
 856        if self._char in single_tokens:
 857            self._add(single_tokens[self._char], text=self._char)
 858            return
 859
 860        self._scan_var()
 861
 862    def _scan_comment(self, comment_start: str) -> bool:
 863        if comment_start not in self.comments:
 864            return False
 865
 866        comment_start_line = self._line
 867        comment_start_size = len(comment_start)
 868        comment_end = self.comments[comment_start]
 869
 870        if comment_end:
 871            # Skip the comment's start delimiter
 872            self._advance(comment_start_size)
 873
 874            comment_count = 1
 875            comment_end_size = len(comment_end)
 876            nested_comments = self.nested_comments
 877
 878            while not self._end:
 879                if self._chars(comment_end_size) == comment_end:
 880                    comment_count -= 1
 881                    if not comment_count:
 882                        break
 883
 884                self._advance(alnum=True)
 885
 886                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
 887                if (
 888                    nested_comments
 889                    and not self._end
 890                    and self._chars(comment_end_size) == comment_start
 891                ):
 892                    self._advance(comment_start_size)
 893                    comment_count += 1
 894
 895            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 896            self._advance(comment_end_size - 1)
 897        else:
 898            _peek = self._peek
 899            while not self._end and _peek != "\n" and _peek != "\r":
 900                self._advance(alnum=True)
 901                _peek = self._peek
 902            self._comments.append(self._text[comment_start_size:])
 903
 904        if (
 905            comment_start == self.hint_start
 906            and self.tokens
 907            and self.tokens[-1].token_type in self.tokens_preceding_hint
 908        ):
 909            self._add(TokenType.HINT)
 910
 911        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 912        # Multiple consecutive comments are preserved by appending them to the current comments list.
 913        if comment_start_line == self._prev_token_line:
 914            self.tokens[-1].comments.extend(self._comments)
 915            self._comments = []
 916            self._prev_token_line = self._line
 917
 918        return True
 919
 920    def _scan_number(self) -> None:
 921        if self._char == "0":
 922            peek = _CHAR_UPPER.get(self._peek, self._peek)
 923            if peek == "B":
 924                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
 925            elif peek == "X":
 926                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)
 927
 928        decimal = False
 929        scientific = 0
 930        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
 931        single_tokens = self.single_tokens
 932        keywords = self.keywords
 933        numeric_literals = self.numeric_literals
 934        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit
 935
 936        is_underscore_separated: bool = False
 937        number_text: str = ""
 938        numeric_literal: str = ""
 939        numeric_type: TokenType | None = None
 940
 941        while True:
 942            if self._peek in _DIGIT_CHARS:
 943                # Batch consecutive digits: scan ahead to find how many
 944                sql = self.sql
 945                end = self._current + 1
 946                size = self.size
 947                while end < size and sql[end] in _DIGIT_CHARS:
 948                    end += 1
 949                self._advance(end - self._current)
 950            elif self._peek == "." and not decimal:
 951                if (
 952                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
 953                ) or not self.numbers_can_have_decimals:
 954                    break
 955                decimal = True
 956                self._advance()
 957            elif self._peek in ("-", "+") and scientific == 1:
 958                # Only consume +/- if followed by a digit
 959                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
 960                    scientific += 1
 961                    self._advance()
 962                else:
 963                    break
 964            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
 965                scientific += 1
 966                self._advance()
 967            elif self._peek == "_" and numbers_can_be_underscore_separated:
 968                is_underscore_separated = True
 969                self._advance()
 970            elif self._peek.isidentifier():
 971                number_text = self._text
 972
 973                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
 974                    numeric_literal += self._peek
 975                    self._advance()
 976
 977                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))
 978
 979                if numeric_type:
 980                    break
 981                elif identifiers_can_start_with_digit:
 982                    return self._add(TokenType.VAR)
 983
 984                self._advance(-len(numeric_literal))
 985                break
 986            else:
 987                break
 988
 989        number_text = number_text or self.sql[self._start : self._current]
 990
 991        # Normalize inputs such as 100_000 to 100000
 992        if is_underscore_separated:
 993            number_text = number_text.replace("_", "")
 994
 995        self._add(TokenType.NUMBER, number_text)
 996
 997        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
 998        if numeric_type:
 999            self._add(TokenType.DCOLON, "::")
1000            self._add(numeric_type, numeric_literal)
1001
1002    def _scan_bits(self) -> None:
1003        self._advance()
1004        value = self._extract_value()
1005        try:
1006            # If `value` can't be converted to a binary, fall back to tokenizing it as an identifier
1007            int(value, 2)
1008            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1009        except ValueError:
1010            self._add(TokenType.IDENTIFIER)
1011
1012    def _scan_hex(self) -> None:
1013        self._advance()
1014        value = self._extract_value()
1015        try:
1016            # If `value` can't be converted to a hex, fall back to tokenizing it as an identifier
1017            int(value, 16)
1018            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1019        except ValueError:
1020            self._add(TokenType.IDENTIFIER)
1021
1022    def _extract_value(self) -> str:
1023        single_tokens = self.single_tokens
1024
1025        while True:
1026            char = self._peek.strip()
1027            if char and char not in single_tokens:
1028                self._advance(alnum=True)
1029            else:
1030                break
1031
1032        return self._text
1033
1034    def _scan_string(self, start: str) -> bool:
1035        base = None
1036        token_type = TokenType.STRING
1037
1038        if start in self.quotes:
1039            end = self.quotes[start]
1040        elif start in self.format_strings:
1041            end, token_type = self.format_strings[start]
1042
1043            if token_type == TokenType.HEX_STRING:
1044                base = 16
1045            elif token_type == TokenType.BIT_STRING:
1046                base = 2
1047            elif token_type == TokenType.HEREDOC_STRING:
1048                self._advance()
1049
1050                if self._char == end:
1051                    tag = ""
1052                else:
1053                    tag = self._extract_string(
1054                        end,
1055                        raw_string=True,
1056                        raise_unmatched=not self.heredoc_tag_is_identifier,
1057                    )
1058
1059                if (
1060                    tag
1061                    and self.heredoc_tag_is_identifier
1062                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1063                ):
1064                    if not self._end:
1065                        self._advance(-1)
1066
1067                    self._advance(-len(tag))
1068                    self._add(self.heredoc_string_alternative)
1069                    return True
1070
1071                end = f"{start}{tag}{end}"
1072        else:
1073            return False
1074
1075        self._advance(len(start))
1076        text = self._extract_string(
1077            end,
1078            escapes=(
1079                self.byte_string_escapes
1080                if token_type == TokenType.BYTE_STRING
1081                else self.string_escapes
1082            ),
1083            raw_string=token_type == TokenType.RAW_STRING,
1084        )
1085
1086        if base and text:
1087            try:
1088                int(text, base)
1089            except Exception:
1090                raise TokenError(
1091                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1092                )
1093
1094        self._add(token_type, text)
1095        return True
1096
1097    def _scan_identifier(self, identifier_end: str) -> None:
1098        self._advance()
1099        text = self._extract_string(
1100            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1101        )
1102        self._add(TokenType.IDENTIFIER, text)
1103
1104    def _scan_var(self) -> None:
1105        var_single_tokens = self.var_single_tokens
1106        single_tokens = self.single_tokens
1107
1108        while True:
1109            peek = self._peek
1110            if not peek or peek.isspace():
1111                break
1112            if peek not in var_single_tokens and peek in single_tokens:
1113                break
1114            self._advance(alnum=True)
1115
1116        self._add(
1117            TokenType.VAR
1118            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1119            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1120        )
1121
1122    def _extract_string(
1123        self,
1124        delimiter: str,
1125        escapes: set[str] | None = None,
1126        raw_string: bool = False,
1127        raise_unmatched: bool = True,
1128    ) -> str:
1129        text = ""
1130        delim_size = len(delimiter)
1131        escapes = self.string_escapes if escapes is None else escapes
1132        unescaped_sequences = self.unescaped_sequences
1133        escape_follow_chars = self.escape_follow_chars
1134        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
1135        quotes = self.quotes
1136        sql = self.sql
1137
1138        # use str.find() when the string is simple... no \ or other escapes
1139        if delim_size == 1:
1140            pos = self._current - 1
1141            end = sql.find(delimiter, pos)
1142
1143            if (
1144                # the closing delimiter was found
1145                end != -1
1146                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
1147                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
1148                # no backslash in the string that would need escape processing
1149                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
1150            ):
1151                newlines = sql.count("\n", pos, end)
1152                if newlines:
1153                    self._line += newlines
1154                    self._col = end - sql.rfind("\n", pos, end)
1155                else:
1156                    self._col += end - pos
1157
1158                self._current = end + 1
1159                self._end = self._current >= self.size
1160                self._char = sql[end]
1161                self._peek = "" if self._end else sql[self._current]
1162                return sql[pos:end]
1163
1164        while True:
1165            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
1166                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
1167                if unescaped_sequence:
1168                    self._advance(2)
1169                    text += unescaped_sequence
1170                    continue
1171
1172            is_valid_custom_escape = (
1173                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
1174            )
1175
1176            if (
1177                (string_escapes_allowed_in_raw_strings or not raw_string)
1178                and self._char in escapes
1179                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
1180                and (self._char not in quotes or self._char == self._peek)
1181            ):
1182                if self._peek == delimiter:
1183                    text += self._peek
1184                elif is_valid_custom_escape and self._char != self._peek:
1185                    text += self._peek
1186                else:
1187                    text += self._char + self._peek
1188
1189                if self._current + 1 < self.size:
1190                    self._advance(2)
1191                else:
1192                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
1193            else:
1194                if self._chars(delim_size) == delimiter:
1195                    if delim_size > 1:
1196                        self._advance(delim_size - 1)
1197                    break
1198
1199                if self._end:
1200                    if not raise_unmatched:
1201                        return text + self._char
1202
1203                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
1204
1205                current = self._current - 1
1206                self._advance(alnum=True)
1207                text += sql[current : self._current - 1]
1208
1209        return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], has_bit_strings: bool, has_hex_strings: bool, numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
579    def __init__(
580        self,
581        single_tokens: dict[str, TokenType],
582        keywords: dict[str, TokenType],
583        quotes: dict[str, str],
584        format_strings: dict[str, tuple[str, TokenType]],
585        identifiers: dict[str, str],
586        comments: dict[str, str | None],
587        string_escapes: set[str],
588        byte_string_escapes: set[str],
589        identifier_escapes: set[str],
590        escape_follow_chars: set[str],
591        commands: set[TokenType],
592        command_prefix_tokens: set[TokenType],
593        nested_comments: bool,
594        hint_start: str,
595        tokens_preceding_hint: set[TokenType],
596        has_bit_strings: bool,
597        has_hex_strings: bool,
598        numeric_literals: dict[str, str],
599        var_single_tokens: set[str],
600        string_escapes_allowed_in_raw_strings: bool,
601        heredoc_tag_is_identifier: bool,
602        heredoc_string_alternative: TokenType,
603        keyword_trie: dict,
604        numbers_can_be_underscore_separated: bool,
605        numbers_can_have_decimals: bool,
606        identifiers_can_start_with_digit: bool,
607        unescaped_sequences: dict[str, str],
608    ) -> None:
609        self.single_tokens = single_tokens
610        self.keywords = keywords
611        self.quotes = quotes
612        self.format_strings = format_strings
613        self.identifiers = identifiers
614        self.comments = comments
615        self.string_escapes = string_escapes
616        self.byte_string_escapes = byte_string_escapes
617        self.identifier_escapes = identifier_escapes
618        self.escape_follow_chars = escape_follow_chars
619        self.commands = commands
620        self.command_prefix_tokens = command_prefix_tokens
621        self.nested_comments = nested_comments
622        self.hint_start = hint_start
623        self.tokens_preceding_hint = tokens_preceding_hint
624        self.has_bit_strings = has_bit_strings
625        self.has_hex_strings = has_hex_strings
626        self.numeric_literals = numeric_literals
627        self.var_single_tokens = var_single_tokens
628        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
629        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
630        self.heredoc_string_alternative = heredoc_string_alternative
631        self.keyword_trie = keyword_trie
632        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
633        self.numbers_can_have_decimals = numbers_can_have_decimals
634        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
635        self.unescaped_sequences = unescaped_sequences
636        self.sql = ""
637        self.size = 0
638        self.tokens: list[Token] = []
639        self._start = 0
640        self._current = 0
641        self._line = 1
642        self._col = 0
643        self._comments: list[str] = []
644        self._char = ""
645        self._end = False
646        self._peek = ""
647        self._prev_token_line = -1
single_tokens
keywords
quotes
format_strings
identifiers
comments
string_escapes
byte_string_escapes
identifier_escapes
escape_follow_chars
commands
command_prefix_tokens
nested_comments
hint_start
tokens_preceding_hint
has_bit_strings
has_hex_strings
numeric_literals
var_single_tokens
string_escapes_allowed_in_raw_strings
heredoc_tag_is_identifier
heredoc_string_alternative
keyword_trie
numbers_can_be_underscore_separated
numbers_can_have_decimals
identifiers_can_start_with_digit
unescaped_sequences
sql
size
tokens: list[Token]
def reset(self) -> None:
649    def reset(self) -> None:
650        self.sql = ""
651        self.size = 0
652        self.tokens = []
653        self._start = 0
654        self._current = 0
655        self._line = 1
656        self._col = 0
657        self._comments = []
658        self._char = ""
659        self._end = False
660        self._peek = ""
661        self._prev_token_line = -1
def tokenize(self, sql: str) -> list[Token]:
663    def tokenize(self, sql: str) -> list[Token]:
664        """Returns a list of tokens corresponding to the SQL string `sql`."""
665        self.reset()
666        self.sql = sql
667        self.size = len(sql)
668
669        try:
670            self._scan()
671        except Exception as e:
672            start = max(self._current - 50, 0)
673            end = min(self._current + 50, self.size - 1)
674            context = self.sql[start:end]
675            raise TokenError(f"Error tokenizing '{context}'") from e
676
677        return self.tokens

Returns a list of tokens corresponding to the SQL string `sql`.