Edit on GitHub

sqlglot.tokenizer_core

   1from __future__ import annotations
   2
   3import typing as t
   4from enum import IntEnum, auto
   5
   6from sqlglot.errors import TokenError
   7
# Precomputed lookup tables: a dict/frozenset lookup is faster than calling
# .upper() and .isdigit() on individual characters.
# _CHAR_UPPER maps each ASCII lowercase letter (code points 97-122) to its uppercase form;
# characters that are not keys are handled by callers via `.get(char, char)`.
_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
# _DIGIT_CHARS contains the ten ASCII decimal digits.
_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")
  11
  12
class TokenType(IntEnum):
    """Integer enumeration of the token kinds used by the tokenizer.

    Every member value is produced by `auto()`, so values follow declaration
    order: inserting, removing or reordering members changes the integer
    value of every member declared after that point.
    """

    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the member rendered as `TokenType.<NAME>`."""
        return f"TokenType.{self.name}"
 468
 469
 470class Token:
 471    # mypyc doesn't expose slots
 472    _attrs: t.ClassVar[tuple[str, ...]] = (
 473        "token_type",
 474        "text",
 475        "line",
 476        "col",
 477        "start",
 478        "end",
 479        "comments",
 480    )
 481    __slots__ = _attrs
 482
 483    @classmethod
 484    def number(cls, number: int) -> Token:
 485        """Returns a NUMBER token with `number` as its text."""
 486        return cls(TokenType.NUMBER, str(number))
 487
 488    @classmethod
 489    def string(cls, string: str) -> Token:
 490        """Returns a STRING token with `string` as its text."""
 491        return cls(TokenType.STRING, string)
 492
 493    @classmethod
 494    def identifier(cls, identifier: str) -> Token:
 495        """Returns an IDENTIFIER token with `identifier` as its text."""
 496        return cls(TokenType.IDENTIFIER, identifier)
 497
 498    @classmethod
 499    def var(cls, var: str) -> Token:
 500        """Returns an VAR token with `var` as its text."""
 501        return cls(TokenType.VAR, var)
 502
 503    def __init__(
 504        self,
 505        token_type: TokenType,
 506        text: str,
 507        line: int = 1,
 508        col: int = 1,
 509        start: int = 0,
 510        end: int = 0,
 511        comments: list[str] | None = None,
 512    ) -> None:
 513        self.token_type = token_type
 514        self.text = text
 515        self.line = line
 516        self.col = col
 517        self.start = start
 518        self.end = end
 519        self.comments = [] if comments is None else comments
 520
 521    def __bool__(self) -> bool:
 522        return self.token_type != TokenType.SENTINEL
 523
 524    def __repr__(self) -> str:
 525        attributes = ", ".join(
 526            f"{k}: TokenType.{self.token_type.name}"
 527            if k == "token_type"
 528            else f"{k}: {getattr(self, k)}"
 529            for k in self._attrs
 530        )
 531        return f"<Token {attributes}>"
 532
 533
 534class TokenizerCore:
    # Instance attributes are limited to the names below.
    __slots__ = (
        # Per-run scanning state; these are the attributes (re)initialised by reset().
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        # Configuration; one slot per __init__ parameter, stored under the same name.
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )
 576
 577    def __init__(
 578        self,
 579        single_tokens: dict[str, TokenType],
 580        keywords: dict[str, TokenType],
 581        quotes: dict[str, str],
 582        format_strings: dict[str, tuple[str, TokenType]],
 583        identifiers: dict[str, str],
 584        comments: dict[str, str | None],
 585        string_escapes: set[str],
 586        byte_string_escapes: set[str],
 587        identifier_escapes: set[str],
 588        escape_follow_chars: set[str],
 589        commands: set[TokenType],
 590        command_prefix_tokens: set[TokenType],
 591        nested_comments: bool,
 592        hint_start: str,
 593        tokens_preceding_hint: set[TokenType],
 594        has_bit_strings: bool,
 595        has_hex_strings: bool,
 596        numeric_literals: dict[str, str],
 597        var_single_tokens: set[str],
 598        string_escapes_allowed_in_raw_strings: bool,
 599        heredoc_tag_is_identifier: bool,
 600        heredoc_string_alternative: TokenType,
 601        keyword_trie: dict,
 602        numbers_can_be_underscore_separated: bool,
 603        numbers_can_have_decimals: bool,
 604        identifiers_can_start_with_digit: bool,
 605        unescaped_sequences: dict[str, str],
 606    ) -> None:
 607        self.single_tokens = single_tokens
 608        self.keywords = keywords
 609        self.quotes = quotes
 610        self.format_strings = format_strings
 611        self.identifiers = identifiers
 612        self.comments = comments
 613        self.string_escapes = string_escapes
 614        self.byte_string_escapes = byte_string_escapes
 615        self.identifier_escapes = identifier_escapes
 616        self.escape_follow_chars = escape_follow_chars
 617        self.commands = commands
 618        self.command_prefix_tokens = command_prefix_tokens
 619        self.nested_comments = nested_comments
 620        self.hint_start = hint_start
 621        self.tokens_preceding_hint = tokens_preceding_hint
 622        self.has_bit_strings = has_bit_strings
 623        self.has_hex_strings = has_hex_strings
 624        self.numeric_literals = numeric_literals
 625        self.var_single_tokens = var_single_tokens
 626        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 627        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 628        self.heredoc_string_alternative = heredoc_string_alternative
 629        self.keyword_trie = keyword_trie
 630        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 631        self.numbers_can_have_decimals = numbers_can_have_decimals
 632        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 633        self.unescaped_sequences = unescaped_sequences
 634        self.sql = ""
 635        self.size = 0
 636        self.tokens: list[Token] = []
 637        self._start = 0
 638        self._current = 0
 639        self._line = 1
 640        self._col = 0
 641        self._comments: list[str] = []
 642        self._char = ""
 643        self._end = False
 644        self._peek = ""
 645        self._prev_token_line = -1
 646
 647    def reset(self) -> None:
 648        self.sql = ""
 649        self.size = 0
 650        self.tokens = []
 651        self._start = 0
 652        self._current = 0
 653        self._line = 1
 654        self._col = 0
 655        self._comments = []
 656        self._char = ""
 657        self._end = False
 658        self._peek = ""
 659        self._prev_token_line = -1
 660
 661    def tokenize(self, sql: str) -> list[Token]:
 662        """Returns a list of tokens corresponding to the SQL string `sql`."""
 663        self.reset()
 664        self.sql = sql
 665        self.size = len(sql)
 666
 667        try:
 668            self._scan()
 669        except Exception as e:
 670            start = max(self._current - 50, 0)
 671            end = min(self._current + 50, self.size - 1)
 672            context = self.sql[start:end]
 673            raise TokenError(f"Error tokenizing '{context}'") from e
 674
 675        return self.tokens
 676
 677    def _scan(self, check_semicolon: bool = False) -> None:
 678        identifiers = self.identifiers
 679        digit_chars = _DIGIT_CHARS
 680
 681        while self.size and not self._end:
 682            current = self._current
 683
 684            # Skip spaces here rather than iteratively calling advance() for performance reasons
 685            while current < self.size:
 686                char = self.sql[current]
 687
 688                if char == " " or char == "\t":
 689                    current += 1
 690                else:
 691                    break
 692
 693            offset = current - self._current if current > self._current else 1
 694
 695            self._start = current
 696            self._advance(offset)
 697
 698            if not self._char.isspace():
 699                if self._char in digit_chars:
 700                    self._scan_number()
 701                elif self._char in identifiers:
 702                    self._scan_identifier(identifiers[self._char])
 703                else:
 704                    self._scan_keywords()
 705
 706            if check_semicolon and self._peek == ";":
 707                break
 708
 709        if self.tokens and self._comments:
 710            self.tokens[-1].comments.extend(self._comments)
 711
 712    def _chars(self, size: int) -> str:
 713        if size == 1:
 714            return self._char
 715
 716        start = self._current - 1
 717        end = start + size
 718
 719        return self.sql[start:end] if end <= self.size else ""
 720
    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Move the cursor by `i` characters and refresh the derived scanning state.

        Updates `_line`/`_col`, `_current`, `_end`, `_char` and `_peek`.

        Args:
            i: How far to move the cursor. `_scan_number` also passes a negative
                value to step back.
            alnum: If True and the character landed on is alphanumeric, keep
                consuming for as long as the next character is alphanumeric too.
        """
        # The character the cursor is on *before* moving.
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        # `_current` is one past the current character; `_peek` is the character after it
        # ("" once the end of the input has been reached).
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            # "".isalnum() is False, so the loop also stops at the end of the input.
            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            # Write the cached state back in one go.
            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]
 757
 758    @property
 759    def _text(self) -> str:
 760        return self.sql[self._start : self._current]
 761
 762    def _add(self, token_type: TokenType, text: str | None = None) -> None:
 763        self._prev_token_line = self._line
 764
 765        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 766            self.tokens[-1].comments.extend(self._comments)
 767            self._comments = []
 768
 769        if text is None:
 770            text = self.sql[self._start : self._current]
 771
 772        self.tokens.append(
 773            Token(
 774                token_type,
 775                text=text,
 776                line=self._line,
 777                col=self._col,
 778                start=self._start,
 779                end=self._current - 1,
 780                comments=self._comments,
 781            )
 782        )
 783        self._comments = []
 784
 785        # If we have either a semicolon or a begin token before the command's token, we'll parse
 786        # whatever follows the command's token as a string
 787        if (
 788            token_type in self.commands
 789            and self._peek != ";"
 790            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 791        ):
 792            start = self._current
 793            tokens = len(self.tokens)
 794            self._scan(check_semicolon=True)
 795            self.tokens = self.tokens[:tokens]
 796            text = self.sql[start : self._current].strip()
 797            if text:
 798                self._add(TokenType.STRING, text)
 799
    def _scan_keywords(self) -> None:
        """Scan a keyword, string/comment opener, single-character token or variable.

        Walks `keyword_trie` starting from the current character, remembering the
        longest candidate seen so far. A match is handed to `_scan_string` and
        `_scan_comment` first; if neither consumes it, it is added as a keyword
        token when a boundary follows it. Otherwise the method falls back to a
        single-character token, and finally to `_scan_var`.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        # Number of characters looked at beyond the current one.
        size = 0
        # Longest prefix of `chars` whose trie node contains the key 0
        # (presumably the trie's end-of-entry marker — confirm against the trie builder).
        word = None
        # Candidate text accumulated so far, with whitespace runs collapsed to one space.
        chars = self._char
        char = chars
        prev_space = False
        # True while inside a whitespace run: the trie is not advanced for those characters.
        skip = False
        trie = self.keyword_trie
        # Becomes True once any character looked at is a single-character token.
        single_token = char in single_tokens

        while chars:
            if not skip:
                # Trie lookups use the uppercased form of ASCII letters.
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    # Any whitespace character is treated as a single space for matching.
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                # Ran off the end of the input.
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Only emit the keyword if it is delimited: whitespace or a single-character
            # token was seen, or the input ended.
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()
 859
    def _scan_comment(self, comment_start: str) -> bool:
        """Consume a comment that begins with `comment_start`, if it is a comment opener.

        Args:
            comment_start: The candidate opening delimiter.

        Returns:
            False if `comment_start` is not a key of `self.comments` (nothing is
            consumed); True once the comment has been consumed and recorded.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        # A truthy end delimiter means a delimited comment; otherwise the comment
        # runs to the end of the line.
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            # Nesting depth; the comment ends when it drops back to zero.
            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # Strip the start delimiter and the part of the end delimiter consumed so far.
            # NOTE(review): for a one-character end delimiter the upper bound is 0, which
            # yields an empty string — confirm no dialect configures one.
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # Stop before the line break so it is left for the caller to process.
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        # A comment opened with the hint delimiter becomes a HINT token when the previous
        # token is one of `tokens_preceding_hint`.
        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True
 917
    def _scan_number(self) -> None:
        """Scan a numeric literal that starts at the current (digit) character.

        Handles `0b`/`0x` prefixes, a single decimal point, a scientific-notation
        exponent with optional sign, underscore separators (when enabled) and
        trailing type suffixes listed in `numeric_literals`. Depending on what
        follows the digits, this adds a NUMBER token, a NUMBER followed by a
        `::` and a type token, or a VAR token.
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            # Without bit/hex string support, only the leading "0" is added as a NUMBER.
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        # 0: no exponent seen, 1: exponent marker consumed, 2: exponent sign consumed.
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                # The dot is left unconsumed right after a PARAMETER token, or when
                # decimals are disabled.
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Remember the numeric part before consuming the trailing word.
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                # The suffix maps to a keyword name via `numeric_literals`, which in turn
                # maps to a token type via `keywords`.
                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Not a known suffix: step back so the trailing word is scanned separately.
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)
 999
1000    def _scan_bits(self) -> None:
1001        self._advance()
1002        value = self._extract_value()
1003        try:
1004            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1005            int(value, 2)
1006            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1007        except ValueError:
1008            self._add(TokenType.IDENTIFIER)
1009
1010    def _scan_hex(self) -> None:
1011        self._advance()
1012        value = self._extract_value()
1013        try:
1014            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1015            int(value, 16)
1016            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1017        except ValueError:
1018            self._add(TokenType.IDENTIFIER)
1019
1020    def _extract_value(self) -> str:
1021        single_tokens = self.single_tokens
1022
1023        while True:
1024            char = self._peek.strip()
1025            if char and char not in single_tokens:
1026                self._advance(alnum=True)
1027            else:
1028                break
1029
1030        return self._text
1031
1032    def _scan_string(self, start: str) -> bool:
1033        base = None
1034        token_type = TokenType.STRING
1035
1036        if start in self.quotes:
1037            end = self.quotes[start]
1038        elif start in self.format_strings:
1039            end, token_type = self.format_strings[start]
1040
1041            if token_type == TokenType.HEX_STRING:
1042                base = 16
1043            elif token_type == TokenType.BIT_STRING:
1044                base = 2
1045            elif token_type == TokenType.HEREDOC_STRING:
1046                self._advance()
1047
1048                if self._char == end:
1049                    tag = ""
1050                else:
1051                    tag = self._extract_string(
1052                        end,
1053                        raw_string=True,
1054                        raise_unmatched=not self.heredoc_tag_is_identifier,
1055                    )
1056
1057                if (
1058                    tag
1059                    and self.heredoc_tag_is_identifier
1060                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1061                ):
1062                    if not self._end:
1063                        self._advance(-1)
1064
1065                    self._advance(-len(tag))
1066                    self._add(self.heredoc_string_alternative)
1067                    return True
1068
1069                end = f"{start}{tag}{end}"
1070        else:
1071            return False
1072
1073        self._advance(len(start))
1074        text = self._extract_string(
1075            end,
1076            escapes=(
1077                self.byte_string_escapes
1078                if token_type == TokenType.BYTE_STRING
1079                else self.string_escapes
1080            ),
1081            raw_string=token_type == TokenType.RAW_STRING,
1082        )
1083
1084        if base and text:
1085            try:
1086                int(text, base)
1087            except Exception:
1088                raise TokenError(
1089                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1090                )
1091
1092        self._add(token_type, text)
1093        return True
1094
1095    def _scan_identifier(self, identifier_end: str) -> None:
1096        self._advance()
1097        text = self._extract_string(
1098            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1099        )
1100        self._add(TokenType.IDENTIFIER, text)
1101
1102    def _scan_var(self) -> None:
1103        var_single_tokens = self.var_single_tokens
1104        single_tokens = self.single_tokens
1105
1106        while True:
1107            peek = self._peek
1108            if not peek or peek.isspace():
1109                break
1110            if peek not in var_single_tokens and peek in single_tokens:
1111                break
1112            self._advance(alnum=True)
1113
1114        self._add(
1115            TokenType.VAR
1116            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1117            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1118        )
1119
    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Consume input up to the closing `delimiter` and return the text before it.

        Args:
            delimiter: Closing delimiter; may be longer than one character.
            escapes: Escape characters to honour. Defaults to
                `self.string_escapes` when None.
            raw_string: When True, `self.unescaped_sequences` substitutions are
                skipped, and escape pairs are only processed if
                `self.string_escapes_allowed_in_raw_strings` is set.
            raise_unmatched: When False, reaching the end of the input without
                finding the delimiter returns the accumulated text plus the
                current character instead of raising.

        Returns:
            The extracted text. On the slow path escape pairs have been
            collapsed; on the fast path it is a plain slice of the input.

        Raises:
            TokenError: If the delimiter is missing (subject to
                `raise_unmatched`) or the input ends right after an escape pair.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            # The search starts one position before `_current`; the state written below keeps
            # `_char == sql[_current - 1]`, so this is presumably the current character's index.
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Update line/column bookkeeping for the span that is skipped in one jump.
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                # Leave the cursor on the closing delimiter, with `_peek` on the character after it.
                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        # Slow path: walk the input, handling escapes and multi-character delimiters.
        while True:
            # Replace a two-character sequence (current + next char) that has an entry in
            # `unescaped_sequences` with its mapped value.
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            # A backslash followed by a character that is NOT listed in `escape_follow_chars`.
            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                # a quote character only acts as an escape when it is doubled
                and (self._char not in quotes or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    # escaped delimiter: keep the delimiter, drop the escape character
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    # custom backslash escape: keep only the escaped character
                    text += self._peek
                else:
                    # otherwise both characters of the pair are kept verbatim
                    text += self._char + self._peek

                # There must be input left after the pair, otherwise the string is unterminated.
                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                # NOTE(review): `_chars(n)` presumably returns the next `n` characters starting at
                # the current one - confirm against its definition.
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Append everything `_advance` stepped over, from the index before the call up to
                # (but excluding) the new current character.
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class TokenType(enum.IntEnum):
class TokenType(IntEnum):
    """Integer-valued enumeration of the token types used by the tokenizer.

    Every member takes its value from `auto()`, so values follow declaration
    order starting at 1. Inserting, removing or reordering members therefore
    renumbers every member declared after the change.

    Members are grouped below into punctuation and operators, a few structural
    markers, literal/identifier kinds, data types, keywords, and sentinels.
    """

    # punctuation and operators
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    # literals, identifiers and object kinds
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_USER_ID = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    TYPE = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Return the qualified member name, e.g. `TokenType.SELECT`, rather than the integer."""
        return f"TokenType.{self.name}"

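An integer-valued enumeration of the token types used by the tokenizer: punctuation and operators, literal and identifier kinds, data types, keywords, and sentinels.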

L_PAREN = <TokenType.L_PAREN: 1>
R_PAREN = <TokenType.R_PAREN: 2>
L_BRACKET = <TokenType.L_BRACKET: 3>
R_BRACKET = <TokenType.R_BRACKET: 4>
L_BRACE = <TokenType.L_BRACE: 5>
R_BRACE = <TokenType.R_BRACE: 6>
COMMA = <TokenType.COMMA: 7>
DOT = <TokenType.DOT: 8>
DASH = <TokenType.DASH: 9>
PLUS = <TokenType.PLUS: 10>
COLON = <TokenType.COLON: 11>
DOTCOLON = <TokenType.DOTCOLON: 12>
DOTCARET = <TokenType.DOTCARET: 13>
DCOLON = <TokenType.DCOLON: 14>
DCOLONDOLLAR = <TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT = <TokenType.DCOLONPERCENT: 16>
DCOLONQMARK = <TokenType.DCOLONQMARK: 17>
DQMARK = <TokenType.DQMARK: 18>
SEMICOLON = <TokenType.SEMICOLON: 19>
STAR = <TokenType.STAR: 20>
BACKSLASH = <TokenType.BACKSLASH: 21>
SLASH = <TokenType.SLASH: 22>
LT = <TokenType.LT: 23>
LTE = <TokenType.LTE: 24>
GT = <TokenType.GT: 25>
GTE = <TokenType.GTE: 26>
NOT = <TokenType.NOT: 27>
EQ = <TokenType.EQ: 28>
NEQ = <TokenType.NEQ: 29>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 30>
COLON_EQ = <TokenType.COLON_EQ: 31>
COLON_GT = <TokenType.COLON_GT: 32>
NCOLON_GT = <TokenType.NCOLON_GT: 33>
AND = <TokenType.AND: 34>
OR = <TokenType.OR: 35>
AMP = <TokenType.AMP: 36>
DPIPE = <TokenType.DPIPE: 37>
PIPE_GT = <TokenType.PIPE_GT: 38>
PIPE = <TokenType.PIPE: 39>
PIPE_SLASH = <TokenType.PIPE_SLASH: 40>
DPIPE_SLASH = <TokenType.DPIPE_SLASH: 41>
CARET = <TokenType.CARET: 42>
CARET_AT = <TokenType.CARET_AT: 43>
TILDE = <TokenType.TILDE: 44>
ARROW = <TokenType.ARROW: 45>
DARROW = <TokenType.DARROW: 46>
FARROW = <TokenType.FARROW: 47>
HASH = <TokenType.HASH: 48>
HASH_ARROW = <TokenType.HASH_ARROW: 49>
DHASH_ARROW = <TokenType.DHASH_ARROW: 50>
LR_ARROW = <TokenType.LR_ARROW: 51>
DAT = <TokenType.DAT: 52>
LT_AT = <TokenType.LT_AT: 53>
AT_GT = <TokenType.AT_GT: 54>
DOLLAR = <TokenType.DOLLAR: 55>
PARAMETER = <TokenType.PARAMETER: 56>
SESSION = <TokenType.SESSION: 57>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 58>
SESSION_USER = <TokenType.SESSION_USER: 59>
DAMP = <TokenType.DAMP: 60>
AMP_LT = <TokenType.AMP_LT: 61>
AMP_GT = <TokenType.AMP_GT: 62>
ADJACENT = <TokenType.ADJACENT: 63>
XOR = <TokenType.XOR: 64>
DSTAR = <TokenType.DSTAR: 65>
QMARK_AMP = <TokenType.QMARK_AMP: 66>
QMARK_PIPE = <TokenType.QMARK_PIPE: 67>
HASH_DASH = <TokenType.HASH_DASH: 68>
EXCLAMATION = <TokenType.EXCLAMATION: 69>
URI_START = <TokenType.URI_START: 70>
BLOCK_START = <TokenType.BLOCK_START: 71>
BLOCK_END = <TokenType.BLOCK_END: 72>
SPACE = <TokenType.SPACE: 73>
BREAK = <TokenType.BREAK: 74>
STRING = <TokenType.STRING: 75>
NUMBER = <TokenType.NUMBER: 76>
IDENTIFIER = <TokenType.IDENTIFIER: 77>
DATABASE = <TokenType.DATABASE: 78>
COLUMN = <TokenType.COLUMN: 79>
COLUMN_DEF = <TokenType.COLUMN_DEF: 80>
SCHEMA = <TokenType.SCHEMA: 81>
TABLE = <TokenType.TABLE: 82>
WAREHOUSE = <TokenType.WAREHOUSE: 83>
STAGE = <TokenType.STAGE: 84>
STREAM = <TokenType.STREAM: 85>
STREAMLIT = <TokenType.STREAMLIT: 86>
VAR = <TokenType.VAR: 87>
BIT_STRING = <TokenType.BIT_STRING: 88>
HEX_STRING = <TokenType.HEX_STRING: 89>
BYTE_STRING = <TokenType.BYTE_STRING: 90>
NATIONAL_STRING = <TokenType.NATIONAL_STRING: 91>
RAW_STRING = <TokenType.RAW_STRING: 92>
HEREDOC_STRING = <TokenType.HEREDOC_STRING: 93>
UNICODE_STRING = <TokenType.UNICODE_STRING: 94>
BIT = <TokenType.BIT: 95>
BOOLEAN = <TokenType.BOOLEAN: 96>
TINYINT = <TokenType.TINYINT: 97>
UTINYINT = <TokenType.UTINYINT: 98>
SMALLINT = <TokenType.SMALLINT: 99>
USMALLINT = <TokenType.USMALLINT: 100>
MEDIUMINT = <TokenType.MEDIUMINT: 101>
UMEDIUMINT = <TokenType.UMEDIUMINT: 102>
INT = <TokenType.INT: 103>
UINT = <TokenType.UINT: 104>
BIGINT = <TokenType.BIGINT: 105>
UBIGINT = <TokenType.UBIGINT: 106>
BIGNUM = <TokenType.BIGNUM: 107>
INT128 = <TokenType.INT128: 108>
UINT128 = <TokenType.UINT128: 109>
INT256 = <TokenType.INT256: 110>
UINT256 = <TokenType.UINT256: 111>
FLOAT = <TokenType.FLOAT: 112>
DOUBLE = <TokenType.DOUBLE: 113>
UDOUBLE = <TokenType.UDOUBLE: 114>
DECIMAL = <TokenType.DECIMAL: 115>
DECIMAL32 = <TokenType.DECIMAL32: 116>
DECIMAL64 = <TokenType.DECIMAL64: 117>
DECIMAL128 = <TokenType.DECIMAL128: 118>
DECIMAL256 = <TokenType.DECIMAL256: 119>
DECFLOAT = <TokenType.DECFLOAT: 120>
UDECIMAL = <TokenType.UDECIMAL: 121>
BIGDECIMAL = <TokenType.BIGDECIMAL: 122>
CHAR = <TokenType.CHAR: 123>
NCHAR = <TokenType.NCHAR: 124>
VARCHAR = <TokenType.VARCHAR: 125>
NVARCHAR = <TokenType.NVARCHAR: 126>
BPCHAR = <TokenType.BPCHAR: 127>
TEXT = <TokenType.TEXT: 128>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 129>
LONGTEXT = <TokenType.LONGTEXT: 130>
BLOB = <TokenType.BLOB: 131>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 132>
LONGBLOB = <TokenType.LONGBLOB: 133>
TINYBLOB = <TokenType.TINYBLOB: 134>
TINYTEXT = <TokenType.TINYTEXT: 135>
NAME = <TokenType.NAME: 136>
BINARY = <TokenType.BINARY: 137>
VARBINARY = <TokenType.VARBINARY: 138>
JSON = <TokenType.JSON: 139>
JSONB = <TokenType.JSONB: 140>
TIME = <TokenType.TIME: 141>
TIMETZ = <TokenType.TIMETZ: 142>
TIME_NS = <TokenType.TIME_NS: 143>
TIMESTAMP = <TokenType.TIMESTAMP: 144>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 145>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 146>
TIMESTAMPNTZ = <TokenType.TIMESTAMPNTZ: 147>
TIMESTAMP_S = <TokenType.TIMESTAMP_S: 148>
TIMESTAMP_MS = <TokenType.TIMESTAMP_MS: 149>
TIMESTAMP_NS = <TokenType.TIMESTAMP_NS: 150>
DATETIME = <TokenType.DATETIME: 151>
DATETIME2 = <TokenType.DATETIME2: 152>
DATETIME64 = <TokenType.DATETIME64: 153>
SMALLDATETIME = <TokenType.SMALLDATETIME: 154>
DATE = <TokenType.DATE: 155>
DATE32 = <TokenType.DATE32: 156>
INT4RANGE = <TokenType.INT4RANGE: 157>
INT4MULTIRANGE = <TokenType.INT4MULTIRANGE: 158>
INT8RANGE = <TokenType.INT8RANGE: 159>
INT8MULTIRANGE = <TokenType.INT8MULTIRANGE: 160>
NUMRANGE = <TokenType.NUMRANGE: 161>
NUMMULTIRANGE = <TokenType.NUMMULTIRANGE: 162>
TSRANGE = <TokenType.TSRANGE: 163>
TSMULTIRANGE = <TokenType.TSMULTIRANGE: 164>
TSTZRANGE = <TokenType.TSTZRANGE: 165>
TSTZMULTIRANGE = <TokenType.TSTZMULTIRANGE: 166>
DATERANGE = <TokenType.DATERANGE: 167>
DATEMULTIRANGE = <TokenType.DATEMULTIRANGE: 168>
UUID = <TokenType.UUID: 169>
GEOGRAPHY = <TokenType.GEOGRAPHY: 170>
GEOGRAPHYPOINT = <TokenType.GEOGRAPHYPOINT: 171>
NULLABLE = <TokenType.NULLABLE: 172>
GEOMETRY = <TokenType.GEOMETRY: 173>
POINT = <TokenType.POINT: 174>
RING = <TokenType.RING: 175>
LINESTRING = <TokenType.LINESTRING: 176>
LOCALTIME = <TokenType.LOCALTIME: 177>
LOCALTIMESTAMP = <TokenType.LOCALTIMESTAMP: 178>
SYSTIMESTAMP = <TokenType.SYSTIMESTAMP: 179>
MULTILINESTRING = <TokenType.MULTILINESTRING: 180>
POLYGON = <TokenType.POLYGON: 181>
MULTIPOLYGON = <TokenType.MULTIPOLYGON: 182>
HLLSKETCH = <TokenType.HLLSKETCH: 183>
HSTORE = <TokenType.HSTORE: 184>
SUPER = <TokenType.SUPER: 185>
SERIAL = <TokenType.SERIAL: 186>
SMALLSERIAL = <TokenType.SMALLSERIAL: 187>
BIGSERIAL = <TokenType.BIGSERIAL: 188>
XML = <TokenType.XML: 189>
YEAR = <TokenType.YEAR: 190>
USERDEFINED = <TokenType.USERDEFINED: 191>
MONEY = <TokenType.MONEY: 192>
SMALLMONEY = <TokenType.SMALLMONEY: 193>
ROWVERSION = <TokenType.ROWVERSION: 194>
IMAGE = <TokenType.IMAGE: 195>
VARIANT = <TokenType.VARIANT: 196>
OBJECT = <TokenType.OBJECT: 197>
INET = <TokenType.INET: 198>
IPADDRESS = <TokenType.IPADDRESS: 199>
IPPREFIX = <TokenType.IPPREFIX: 200>
IPV4 = <TokenType.IPV4: 201>
IPV6 = <TokenType.IPV6: 202>
ENUM = <TokenType.ENUM: 203>
ENUM8 = <TokenType.ENUM8: 204>
ENUM16 = <TokenType.ENUM16: 205>
FIXEDSTRING = <TokenType.FIXEDSTRING: 206>
LOWCARDINALITY = <TokenType.LOWCARDINALITY: 207>
NESTED = <TokenType.NESTED: 208>
AGGREGATEFUNCTION = <TokenType.AGGREGATEFUNCTION: 209>
SIMPLEAGGREGATEFUNCTION = <TokenType.SIMPLEAGGREGATEFUNCTION: 210>
TDIGEST = <TokenType.TDIGEST: 211>
UNKNOWN = <TokenType.UNKNOWN: 212>
VECTOR = <TokenType.VECTOR: 213>
DYNAMIC = <TokenType.DYNAMIC: 214>
VOID = <TokenType.VOID: 215>
ALIAS = <TokenType.ALIAS: 216>
ALTER = <TokenType.ALTER: 217>
ALL = <TokenType.ALL: 218>
ANTI = <TokenType.ANTI: 219>
ANY = <TokenType.ANY: 220>
APPLY = <TokenType.APPLY: 221>
ARRAY = <TokenType.ARRAY: 222>
ASC = <TokenType.ASC: 223>
ASOF = <TokenType.ASOF: 224>
ATTACH = <TokenType.ATTACH: 225>
AUTO_INCREMENT = <TokenType.AUTO_INCREMENT: 226>
BEGIN = <TokenType.BEGIN: 227>
BETWEEN = <TokenType.BETWEEN: 228>
BULK_COLLECT_INTO = <TokenType.BULK_COLLECT_INTO: 229>
CACHE = <TokenType.CACHE: 230>
CASE = <TokenType.CASE: 231>
CHARACTER_SET = <TokenType.CHARACTER_SET: 232>
CLUSTER_BY = <TokenType.CLUSTER_BY: 233>
COLLATE = <TokenType.COLLATE: 234>
COMMAND = <TokenType.COMMAND: 235>
COMMENT = <TokenType.COMMENT: 236>
COMMIT = <TokenType.COMMIT: 237>
CONNECT_BY = <TokenType.CONNECT_BY: 238>
CONSTRAINT = <TokenType.CONSTRAINT: 239>
COPY = <TokenType.COPY: 240>
CREATE = <TokenType.CREATE: 241>
CROSS = <TokenType.CROSS: 242>
CUBE = <TokenType.CUBE: 243>
CURRENT_DATE = <TokenType.CURRENT_DATE: 244>
CURRENT_DATETIME = <TokenType.CURRENT_DATETIME: 245>
CURRENT_SCHEMA = <TokenType.CURRENT_SCHEMA: 246>
CURRENT_TIME = <TokenType.CURRENT_TIME: 247>
CURRENT_TIMESTAMP = <TokenType.CURRENT_TIMESTAMP: 248>
CURRENT_USER = <TokenType.CURRENT_USER: 249>
CURRENT_USER_ID = <TokenType.CURRENT_USER_ID: 250>
CURRENT_ROLE = <TokenType.CURRENT_ROLE: 251>
CURRENT_CATALOG = <TokenType.CURRENT_CATALOG: 252>
DECLARE = <TokenType.DECLARE: 253>
DEFAULT = <TokenType.DEFAULT: 254>
DELETE = <TokenType.DELETE: 255>
DESC = <TokenType.DESC: 256>
DESCRIBE = <TokenType.DESCRIBE: 257>
DETACH = <TokenType.DETACH: 258>
DICTIONARY = <TokenType.DICTIONARY: 259>
DISTINCT = <TokenType.DISTINCT: 260>
DISTRIBUTE_BY = <TokenType.DISTRIBUTE_BY: 261>
DIV = <TokenType.DIV: 262>
DROP = <TokenType.DROP: 263>
ELSE = <TokenType.ELSE: 264>
END = <TokenType.END: 265>
ESCAPE = <TokenType.ESCAPE: 266>
EXCEPT = <TokenType.EXCEPT: 267>
EXECUTE = <TokenType.EXECUTE: 268>
EXISTS = <TokenType.EXISTS: 269>
FALSE = <TokenType.FALSE: 270>
FETCH = <TokenType.FETCH: 271>
FILE = <TokenType.FILE: 272>
FILE_FORMAT = <TokenType.FILE_FORMAT: 273>
FILTER = <TokenType.FILTER: 274>
FINAL = <TokenType.FINAL: 275>
FIRST = <TokenType.FIRST: 276>
FOR = <TokenType.FOR: 277>
FORCE = <TokenType.FORCE: 278>
FOREIGN_KEY = <TokenType.FOREIGN_KEY: 279>
FORMAT = <TokenType.FORMAT: 280>
FROM = <TokenType.FROM: 281>
FULL = <TokenType.FULL: 282>
FUNCTION = <TokenType.FUNCTION: 283>
GET = <TokenType.GET: 284>
GLOB = <TokenType.GLOB: 285>
GLOBAL = <TokenType.GLOBAL: 286>
GRANT = <TokenType.GRANT: 287>
GROUP_BY = <TokenType.GROUP_BY: 288>
GROUPING_SETS = <TokenType.GROUPING_SETS: 289>
HAVING = <TokenType.HAVING: 290>
HINT = <TokenType.HINT: 291>
IGNORE = <TokenType.IGNORE: 292>
ILIKE = <TokenType.ILIKE: 293>
IN = <TokenType.IN: 294>
INDEX = <TokenType.INDEX: 295>
INDEXED_BY = <TokenType.INDEXED_BY: 296>
INNER = <TokenType.INNER: 297>
INSERT = <TokenType.INSERT: 298>
INSTALL = <TokenType.INSTALL: 299>
INTEGRATION = <TokenType.INTEGRATION: 300>
INTERSECT = <TokenType.INTERSECT: 301>
INTERVAL = <TokenType.INTERVAL: 302>
INTO = <TokenType.INTO: 303>
INTRODUCER = <TokenType.INTRODUCER: 304>
IRLIKE = <TokenType.IRLIKE: 305>
IS = <TokenType.IS: 306>
ISNULL = <TokenType.ISNULL: 307>
JOIN = <TokenType.JOIN: 308>
JOIN_MARKER = <TokenType.JOIN_MARKER: 309>
KEEP = <TokenType.KEEP: 310>
KEY = <TokenType.KEY: 311>
KILL = <TokenType.KILL: 312>
LANGUAGE = <TokenType.LANGUAGE: 313>
LATERAL = <TokenType.LATERAL: 314>
LEFT = <TokenType.LEFT: 315>
LIKE = <TokenType.LIKE: 316>
LIMIT = <TokenType.LIMIT: 317>
LIST = <TokenType.LIST: 318>
LOAD = <TokenType.LOAD: 319>
LOCK = <TokenType.LOCK: 320>
MAP = <TokenType.MAP: 321>
MATCH = <TokenType.MATCH: 322>
MATCH_CONDITION = <TokenType.MATCH_CONDITION: 323>
MATCH_RECOGNIZE = <TokenType.MATCH_RECOGNIZE: 324>
MEMBER_OF = <TokenType.MEMBER_OF: 325>
MERGE = <TokenType.MERGE: 326>
MOD = <TokenType.MOD: 327>
MODEL = <TokenType.MODEL: 328>
NATURAL = <TokenType.NATURAL: 329>
NEXT = <TokenType.NEXT: 330>
NOTHING = <TokenType.NOTHING: 331>
NOTNULL = <TokenType.NOTNULL: 332>
NULL = <TokenType.NULL: 333>
OBJECT_IDENTIFIER = <TokenType.OBJECT_IDENTIFIER: 334>
OFFSET = <TokenType.OFFSET: 335>
ON = <TokenType.ON: 336>
ONLY = <TokenType.ONLY: 337>
OPERATOR = <TokenType.OPERATOR: 338>
ORDER_BY = <TokenType.ORDER_BY: 339>
ORDER_SIBLINGS_BY = <TokenType.ORDER_SIBLINGS_BY: 340>
ORDERED = <TokenType.ORDERED: 341>
ORDINALITY = <TokenType.ORDINALITY: 342>
OUT = <TokenType.OUT: 343>
INOUT = <TokenType.INOUT: 344>
OUTER = <TokenType.OUTER: 345>
OVER = <TokenType.OVER: 346>
OVERLAPS = <TokenType.OVERLAPS: 347>
OVERWRITE = <TokenType.OVERWRITE: 348>
PACKAGE = <TokenType.PACKAGE: 349>
PARTITION = <TokenType.PARTITION: 350>
PARTITION_BY = <TokenType.PARTITION_BY: 351>
PERCENT = <TokenType.PERCENT: 352>
PIVOT = <TokenType.PIVOT: 353>
PLACEHOLDER = <TokenType.PLACEHOLDER: 354>
POLICY = <TokenType.POLICY: 355>
POOL = <TokenType.POOL: 356>
POSITIONAL = <TokenType.POSITIONAL: 357>
PRAGMA = <TokenType.PRAGMA: 358>
PREWHERE = <TokenType.PREWHERE: 359>
PRIMARY_KEY = <TokenType.PRIMARY_KEY: 360>
PROCEDURE = <TokenType.PROCEDURE: 361>
PROPERTIES = <TokenType.PROPERTIES: 362>
PSEUDO_TYPE = <TokenType.PSEUDO_TYPE: 363>
PUT = <TokenType.PUT: 364>
QUALIFY = <TokenType.QUALIFY: 365>
QUOTE = <TokenType.QUOTE: 366>
QDCOLON = <TokenType.QDCOLON: 367>
RANGE = <TokenType.RANGE: 368>
RECURSIVE = <TokenType.RECURSIVE: 369>
REFRESH = <TokenType.REFRESH: 370>
RENAME = <TokenType.RENAME: 371>
REPLACE = <TokenType.REPLACE: 372>
RETURNING = <TokenType.RETURNING: 373>
REVOKE = <TokenType.REVOKE: 374>
REFERENCES = <TokenType.REFERENCES: 375>
RIGHT = <TokenType.RIGHT: 376>
RLIKE = <TokenType.RLIKE: 377>
ROLE = <TokenType.ROLE: 378>
ROLLBACK = <TokenType.ROLLBACK: 379>
ROLLUP = <TokenType.ROLLUP: 380>
ROW = <TokenType.ROW: 381>
ROWS = <TokenType.ROWS: 382>
RULE = <TokenType.RULE: 383>
SELECT = <TokenType.SELECT: 384>
SEMI = <TokenType.SEMI: 385>
SEPARATOR = <TokenType.SEPARATOR: 386>
SEQUENCE = <TokenType.SEQUENCE: 387>
SERDE_PROPERTIES = <TokenType.SERDE_PROPERTIES: 388>
SET = <TokenType.SET: 389>
SETTINGS = <TokenType.SETTINGS: 390>
SHOW = <TokenType.SHOW: 391>
SIMILAR_TO = <TokenType.SIMILAR_TO: 392>
SOME = <TokenType.SOME: 393>
SORT_BY = <TokenType.SORT_BY: 394>
SOUNDS_LIKE = <TokenType.SOUNDS_LIKE: 395>
SQL_SECURITY = <TokenType.SQL_SECURITY: 396>
START_WITH = <TokenType.START_WITH: 397>
STORAGE_INTEGRATION = <TokenType.STORAGE_INTEGRATION: 398>
STRAIGHT_JOIN = <TokenType.STRAIGHT_JOIN: 399>
STRUCT = <TokenType.STRUCT: 400>
SUMMARIZE = <TokenType.SUMMARIZE: 401>
TABLE_SAMPLE = <TokenType.TABLE_SAMPLE: 402>
TAG = <TokenType.TAG: 403>
TEMPORARY = <TokenType.TEMPORARY: 404>
TOP = <TokenType.TOP: 405>
THEN = <TokenType.THEN: 406>
TRUE = <TokenType.TRUE: 407>
TRUNCATE = <TokenType.TRUNCATE: 408>
TRIGGER = <TokenType.TRIGGER: 409>
TYPE = <TokenType.TYPE: 410>
UNCACHE = <TokenType.UNCACHE: 411>
UNION = <TokenType.UNION: 412>
UNNEST = <TokenType.UNNEST: 413>
UNPIVOT = <TokenType.UNPIVOT: 414>
UPDATE = <TokenType.UPDATE: 415>
USE = <TokenType.USE: 416>
USING = <TokenType.USING: 417>
VALUES = <TokenType.VALUES: 418>
VARIADIC = <TokenType.VARIADIC: 419>
VIEW = <TokenType.VIEW: 420>
SEMANTIC_VIEW = <TokenType.SEMANTIC_VIEW: 421>
VOLATILE = <TokenType.VOLATILE: 422>
VOLUME = <TokenType.VOLUME: 423>
WHEN = <TokenType.WHEN: 424>
WHERE = <TokenType.WHERE: 425>
WINDOW = <TokenType.WINDOW: 426>
WITH = <TokenType.WITH: 427>
UNIQUE = <TokenType.UNIQUE: 428>
UTC_DATE = <TokenType.UTC_DATE: 429>
UTC_TIME = <TokenType.UTC_TIME: 430>
UTC_TIMESTAMP = <TokenType.UTC_TIMESTAMP: 431>
VERSION_SNAPSHOT = <TokenType.VERSION_SNAPSHOT: 432>
TIMESTAMP_SNAPSHOT = <TokenType.TIMESTAMP_SNAPSHOT: 433>
OPTION = <TokenType.OPTION: 434>
SINK = <TokenType.SINK: 435>
SOURCE = <TokenType.SOURCE: 436>
ANALYZE = <TokenType.ANALYZE: 437>
NAMESPACE = <TokenType.NAMESPACE: 438>
EXPORT = <TokenType.EXPORT: 439>
HIVE_TOKEN_STREAM = <TokenType.HIVE_TOKEN_STREAM: 440>
SENTINEL = <TokenType.SENTINEL: 441>
class Token:
class Token:
    """A single token: its type, its text, its position fields and any attached comments."""

    # mypyc doesn't expose slots, hence the separate `_attrs` tuple (iterated by `__repr__`)
    _attrs: t.ClassVar[tuple[str, ...]] = (
        "token_type",
        "text",
        "line",
        "col",
        "start",
        "end",
        "comments",
    )
    __slots__ = _attrs

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        text = str(number)
        return cls(TokenType.NUMBER, text)

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        token = cls(TokenType.STRING, string)
        return token

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        token = cls(TokenType.IDENTIFIER, identifier)
        return token

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        token = cls(TokenType.VAR, var)
        return token

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: list[str] | None = None,
    ) -> None:
        """Stores the given fields; `comments` becomes a new empty list when omitted."""
        self.token_type = token_type
        self.text = text
        self.line, self.col = line, col
        self.start, self.end = start, end
        # A fresh list per token, so tokens never share a default comments list
        self.comments = comments if comments is not None else []

    def __bool__(self) -> bool:
        """Returns False only for the SENTINEL token."""
        return self.token_type != TokenType.SENTINEL

    def __repr__(self) -> str:
        """Returns `<Token name: value, ...>` listing every attribute in `_attrs` order."""
        parts = []
        for name in self._attrs:
            if name == "token_type":
                # Show the enum member's name rather than its integer value
                parts.append(f"{name}: TokenType.{self.token_type.name}")
            else:
                parts.append(f"{name}: {getattr(self, name)}")

        attributes = ", ".join(parts)
        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
504    def __init__(
505        self,
506        token_type: TokenType,
507        text: str,
508        line: int = 1,
509        col: int = 1,
510        start: int = 0,
511        end: int = 0,
512        comments: list[str] | None = None,
513    ) -> None:
514        self.token_type = token_type
515        self.text = text
516        self.line = line
517        self.col = col
518        self.start = start
519        self.end = end
520        self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
484    @classmethod
485    def number(cls, number: int) -> Token:
486        """Returns a NUMBER token with `number` as its text."""
487        return cls(TokenType.NUMBER, str(number))

Returns a NUMBER token with number as its text.

@classmethod
def string(cls, string: str) -> Token:
489    @classmethod
490    def string(cls, string: str) -> Token:
491        """Returns a STRING token with `string` as its text."""
492        return cls(TokenType.STRING, string)

Returns a STRING token with string as its text.

@classmethod
def identifier(cls, identifier: str) -> Token:
494    @classmethod
495    def identifier(cls, identifier: str) -> Token:
496        """Returns an IDENTIFIER token with `identifier` as its text."""
497        return cls(TokenType.IDENTIFIER, identifier)

Returns an IDENTIFIER token with identifier as its text.

@classmethod
def var(cls, var: str) -> Token:
499    @classmethod
500    def var(cls, var: str) -> Token:
501        """Returns an VAR token with `var` as its text."""
502        return cls(TokenType.VAR, var)

Returns a VAR token with var as its text.

token_type
text
line
col
start
end
comments
class TokenizerCore:
class TokenizerCore:
    # Scanner that turns a SQL string into a list of `Token` objects (see `tokenize`).
    # Every instance attribute is declared in `__slots__` below.
    __slots__ = (
        # Scanning state: initialised in `__init__` and restored by `reset`
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        # Configuration: each of these is assigned from the `__init__` argument of the same name
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "has_bit_strings",
        "has_hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )
 577
 578    def __init__(
 579        self,
 580        single_tokens: dict[str, TokenType],
 581        keywords: dict[str, TokenType],
 582        quotes: dict[str, str],
 583        format_strings: dict[str, tuple[str, TokenType]],
 584        identifiers: dict[str, str],
 585        comments: dict[str, str | None],
 586        string_escapes: set[str],
 587        byte_string_escapes: set[str],
 588        identifier_escapes: set[str],
 589        escape_follow_chars: set[str],
 590        commands: set[TokenType],
 591        command_prefix_tokens: set[TokenType],
 592        nested_comments: bool,
 593        hint_start: str,
 594        tokens_preceding_hint: set[TokenType],
 595        has_bit_strings: bool,
 596        has_hex_strings: bool,
 597        numeric_literals: dict[str, str],
 598        var_single_tokens: set[str],
 599        string_escapes_allowed_in_raw_strings: bool,
 600        heredoc_tag_is_identifier: bool,
 601        heredoc_string_alternative: TokenType,
 602        keyword_trie: dict,
 603        numbers_can_be_underscore_separated: bool,
 604        numbers_can_have_decimals: bool,
 605        identifiers_can_start_with_digit: bool,
 606        unescaped_sequences: dict[str, str],
 607    ) -> None:
 608        self.single_tokens = single_tokens
 609        self.keywords = keywords
 610        self.quotes = quotes
 611        self.format_strings = format_strings
 612        self.identifiers = identifiers
 613        self.comments = comments
 614        self.string_escapes = string_escapes
 615        self.byte_string_escapes = byte_string_escapes
 616        self.identifier_escapes = identifier_escapes
 617        self.escape_follow_chars = escape_follow_chars
 618        self.commands = commands
 619        self.command_prefix_tokens = command_prefix_tokens
 620        self.nested_comments = nested_comments
 621        self.hint_start = hint_start
 622        self.tokens_preceding_hint = tokens_preceding_hint
 623        self.has_bit_strings = has_bit_strings
 624        self.has_hex_strings = has_hex_strings
 625        self.numeric_literals = numeric_literals
 626        self.var_single_tokens = var_single_tokens
 627        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 628        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 629        self.heredoc_string_alternative = heredoc_string_alternative
 630        self.keyword_trie = keyword_trie
 631        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 632        self.numbers_can_have_decimals = numbers_can_have_decimals
 633        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 634        self.unescaped_sequences = unescaped_sequences
 635        self.sql = ""
 636        self.size = 0
 637        self.tokens: list[Token] = []
 638        self._start = 0
 639        self._current = 0
 640        self._line = 1
 641        self._col = 0
 642        self._comments: list[str] = []
 643        self._char = ""
 644        self._end = False
 645        self._peek = ""
 646        self._prev_token_line = -1
 647
 648    def reset(self) -> None:
 649        self.sql = ""
 650        self.size = 0
 651        self.tokens = []
 652        self._start = 0
 653        self._current = 0
 654        self._line = 1
 655        self._col = 0
 656        self._comments = []
 657        self._char = ""
 658        self._end = False
 659        self._peek = ""
 660        self._prev_token_line = -1
 661
 662    def tokenize(self, sql: str) -> list[Token]:
 663        """Returns a list of tokens corresponding to the SQL string `sql`."""
 664        self.reset()
 665        self.sql = sql
 666        self.size = len(sql)
 667
 668        try:
 669            self._scan()
 670        except Exception as e:
 671            start = max(self._current - 50, 0)
 672            end = min(self._current + 50, self.size - 1)
 673            context = self.sql[start:end]
 674            raise TokenError(f"Error tokenizing '{context}'") from e
 675
 676        return self.tokens
 677
    def _scan(self, check_semicolon: bool = False) -> None:
        """Scans `self.sql` from the current position, appending tokens to `self.tokens`.

        Args:
            check_semicolon: If set, stop as soon as the next character is a semicolon.
        """
        identifiers = self.identifiers
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # Jump over the whole skipped run at once, or move by one character if nothing was skipped
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            # Dispatch on the current character; whitespace produces no token
            if not self._char.isspace():
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Comments still pending once scanning stops are attached to the last token
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)
 712
 713    def _chars(self, size: int) -> str:
 714        if size == 1:
 715            return self._char
 716
 717        start = self._current - 1
 718        end = start + size
 719
 720        return self.sql[start:end] if end <= self.size else ""
 721
    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Moves the cursor by `i` characters and refreshes `_char`, `_peek` and `_end`.

        The line and column counters are updated based on the character that was
        current before the move.

        Args:
            i: Number of characters to move by; some callers pass a negative value to step back.
            alnum: If set and the new current character is alphanumeric, keep consuming
                characters for as long as the next one is alphanumeric as well.
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        # `_current` is one past the index of the current character
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            # At the end of the input `_peek` is "", for which isalnum() is False, so the loop stops
            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            # Write the cached values back to the instance
            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]
 758
 759    @property
 760    def _text(self) -> str:
 761        return self.sql[self._start : self._current]
 762
    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Appends a new token to `self.tokens`, attaching any pending comments to it.

        Args:
            token_type: The type of the token to add.
            text: The token's text; defaults to the source text between `_start` and `_current`.
        """
        self._prev_token_line = self._line

        # Comments pending before a semicolon are attached to the preceding token instead
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        # The new token now owns the pending comments list, so start a fresh one
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            # Scan ahead up to the next semicolon only to move the cursor; the tokens
            # produced by that scan are discarded and replaced by a single STRING token
            self._scan(check_semicolon=True)
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)
 800
    def _scan_keywords(self) -> None:
        """Scans a keyword, string, comment, single-character token or variable.

        Walks `keyword_trie` from the current character to find the longest matching
        entry, which may open a string or a comment or be a keyword. If nothing matches,
        falls back to a single-character token and finally to `_scan_var`.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0
        word = None
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                # The trie is walked with the upper-cased character
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                # A `0` key presumably marks the end of a complete entry; remember the longest match so far
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                # A run of whitespace is collapsed into a single " " for matching purposes
                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                # Ran off the end of the input
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Accept the keyword only if the last examined character was whitespace, one of the
            # examined characters is a single-character token, or the input ended
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()
 860
    def _scan_comment(self, comment_start: str) -> bool:
        """Consumes a comment opened by `comment_start` and records its text in `_comments`.

        Args:
            comment_start: The candidate opening delimiter.

        Returns:
            False if `comment_start` is not a key of `self.comments`, True once the
            comment has been consumed.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        # A falsy end delimiter means the comment runs until the end of the line
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            # Number of comments currently open; only greater than 1 with nested comments
            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # Store the text between the delimiters, then move past the rest of the end delimiter
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # Consume up to, but not including, the next line break
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        # A comment opened by the hint delimiter right after certain tokens is emitted as a HINT token
        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True
 918
    def _scan_number(self) -> None:
        """Scans a numeric literal that starts at the current digit and adds its token(s).

        A `0b`/`0x` prefix is handed to `_scan_bits`/`_scan_hex` when the corresponding
        flag is set. Otherwise digits, one decimal point, an exponent and, if enabled,
        underscore separators are consumed. A trailing suffix listed in
        `numeric_literals` is emitted as a cast (NUMBER, `::`, type). An unrecognised
        suffix either turns the whole text into a VAR token or is left unconsumed.
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.has_bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.has_hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        # 0: no exponent seen, 1: exponent marker consumed, 2: exponent sign consumed
        scientific = 0
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                # The dot is not consumed right after a PARAMETER token or when decimals are disabled
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Remember the number scanned so far, then collect the suffix that follows it
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Not a known suffix: step back so that it is scanned separately
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)
1000
1001    def _scan_bits(self) -> None:
1002        self._advance()
1003        value = self._extract_value()
1004        try:
1005            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1006            int(value, 2)
1007            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1008        except ValueError:
1009            self._add(TokenType.IDENTIFIER)
1010
1011    def _scan_hex(self) -> None:
1012        self._advance()
1013        value = self._extract_value()
1014        try:
1015            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1016            int(value, 16)
1017            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1018        except ValueError:
1019            self._add(TokenType.IDENTIFIER)
1020
1021    def _extract_value(self) -> str:
1022        single_tokens = self.single_tokens
1023
1024        while True:
1025            char = self._peek.strip()
1026            if char and char not in single_tokens:
1027                self._advance(alnum=True)
1028            else:
1029                break
1030
1031        return self._text
1032
    def _scan_string(self, start: str) -> bool:
        """Scans a string literal opened by `start`, if `start` opens one.

        Args:
            start: The candidate opening delimiter: a key of `self.quotes` or of
                `self.format_strings`.

        Returns:
            False if `start` is neither a quote nor a format-string prefix, True once a
            token has been added.

        Raises:
            TokenError: If a hex or bit string contains characters that are invalid for its base.
        """
        # Numeric base used to validate hex/bit strings; None for any other kind of string
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                # Read the tag between the opening delimiters; it may be empty
                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                # The tag isn't acceptable as an identifier: rewind past it and emit the alternative token
                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                # The closing delimiter repeats the opening one, tag included
                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        # Validate hex/bit string contents by parsing them in the corresponding base
        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True
1095
1096    def _scan_identifier(self, identifier_end: str) -> None:
1097        self._advance()
1098        text = self._extract_string(
1099            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1100        )
1101        self._add(TokenType.IDENTIFIER, text)
1102
1103    def _scan_var(self) -> None:
1104        var_single_tokens = self.var_single_tokens
1105        single_tokens = self.single_tokens
1106
1107        while True:
1108            peek = self._peek
1109            if not peek or peek.isspace():
1110                break
1111            if peek not in var_single_tokens and peek in single_tokens:
1112                break
1113            self._advance(alnum=True)
1114
1115        self._add(
1116            TokenType.VAR
1117            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1118            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1119        )
1120
    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Reads a delimited string starting at the current character and returns its text.

        On success the cursor is left on the last character of the closing delimiter.

        Args:
            delimiter: The closing delimiter to look for.
            escapes: Characters that act as escapes; defaults to `self.string_escapes`.
            raw_string: Whether the string is raw. Raw strings skip the replacement of
                `unescaped_sequences`, and skip escape handling too unless
                `string_escapes_allowed_in_raw_strings` is set.
            raise_unmatched: If False, return the text read so far instead of raising
                when the input ends before the delimiter is found.

        Returns:
            The string's text, without the closing delimiter.

        Raises:
            TokenError: If the closing delimiter is missing.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Update the line/column counters in bulk, since `_advance` is bypassed here
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                # Place the cursor on the closing delimiter
                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        # Slow path: walk the string character by character, processing escapes
        while True:
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                # Escape sequence: decide whether the escape character itself is kept in the text
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    # Move onto the last character of a multi-character delimiter
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Append everything consumed by this step, which may be a whole alphanumeric run
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], has_bit_strings: bool, has_hex_strings: bool, numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
578    def __init__(
579        self,
580        single_tokens: dict[str, TokenType],
581        keywords: dict[str, TokenType],
582        quotes: dict[str, str],
583        format_strings: dict[str, tuple[str, TokenType]],
584        identifiers: dict[str, str],
585        comments: dict[str, str | None],
586        string_escapes: set[str],
587        byte_string_escapes: set[str],
588        identifier_escapes: set[str],
589        escape_follow_chars: set[str],
590        commands: set[TokenType],
591        command_prefix_tokens: set[TokenType],
592        nested_comments: bool,
593        hint_start: str,
594        tokens_preceding_hint: set[TokenType],
595        has_bit_strings: bool,
596        has_hex_strings: bool,
597        numeric_literals: dict[str, str],
598        var_single_tokens: set[str],
599        string_escapes_allowed_in_raw_strings: bool,
600        heredoc_tag_is_identifier: bool,
601        heredoc_string_alternative: TokenType,
602        keyword_trie: dict,
603        numbers_can_be_underscore_separated: bool,
604        numbers_can_have_decimals: bool,
605        identifiers_can_start_with_digit: bool,
606        unescaped_sequences: dict[str, str],
607    ) -> None:
608        self.single_tokens = single_tokens
609        self.keywords = keywords
610        self.quotes = quotes
611        self.format_strings = format_strings
612        self.identifiers = identifiers
613        self.comments = comments
614        self.string_escapes = string_escapes
615        self.byte_string_escapes = byte_string_escapes
616        self.identifier_escapes = identifier_escapes
617        self.escape_follow_chars = escape_follow_chars
618        self.commands = commands
619        self.command_prefix_tokens = command_prefix_tokens
620        self.nested_comments = nested_comments
621        self.hint_start = hint_start
622        self.tokens_preceding_hint = tokens_preceding_hint
623        self.has_bit_strings = has_bit_strings
624        self.has_hex_strings = has_hex_strings
625        self.numeric_literals = numeric_literals
626        self.var_single_tokens = var_single_tokens
627        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
628        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
629        self.heredoc_string_alternative = heredoc_string_alternative
630        self.keyword_trie = keyword_trie
631        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
632        self.numbers_can_have_decimals = numbers_can_have_decimals
633        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
634        self.unescaped_sequences = unescaped_sequences
635        self.sql = ""
636        self.size = 0
637        self.tokens: list[Token] = []
638        self._start = 0
639        self._current = 0
640        self._line = 1
641        self._col = 0
642        self._comments: list[str] = []
643        self._char = ""
644        self._end = False
645        self._peek = ""
646        self._prev_token_line = -1
single_tokens
keywords
quotes
format_strings
identifiers
comments
string_escapes
byte_string_escapes
identifier_escapes
escape_follow_chars
commands
command_prefix_tokens
nested_comments
hint_start
tokens_preceding_hint
has_bit_strings
has_hex_strings
numeric_literals
var_single_tokens
string_escapes_allowed_in_raw_strings
heredoc_tag_is_identifier
heredoc_string_alternative
keyword_trie
numbers_can_be_underscore_separated
numbers_can_have_decimals
identifiers_can_start_with_digit
unescaped_sequences
sql
size
tokens: list[Token]
def reset(self) -> None:
648    def reset(self) -> None:
649        self.sql = ""
650        self.size = 0
651        self.tokens = []
652        self._start = 0
653        self._current = 0
654        self._line = 1
655        self._col = 0
656        self._comments = []
657        self._char = ""
658        self._end = False
659        self._peek = ""
660        self._prev_token_line = -1
def tokenize(self, sql: str) -> list[Token]:
662    def tokenize(self, sql: str) -> list[Token]:
663        """Returns a list of tokens corresponding to the SQL string `sql`."""
664        self.reset()
665        self.sql = sql
666        self.size = len(sql)
667
668        try:
669            self._scan()
670        except Exception as e:
671            start = max(self._current - 50, 0)
672            end = min(self._current + 50, self.size - 1)
673            context = self.sql[start:end]
674            raise TokenError(f"Error tokenizing '{context}'") from e
675
676        return self.tokens

Returns a list of tokens corresponding to the SQL string sql.