Edit on GitHub

sqlglot.tokenizer_core

   1from __future__ import annotations
   2
   3import typing as t
   4from enum import IntEnum, auto
   5
   6from sqlglot.errors import TokenError
   7
   8# dict lookup is faster than .upper() and .isdigit()
   9_CHAR_UPPER: dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
  10_DIGIT_CHARS: frozenset[str] = frozenset("0123456789")
  11
  12
class TokenType(IntEnum):
    """Enumerates every kind of token the tokenizer can emit.

    NOTE: values are assigned by `auto()`, so each member's ordinal is
    determined purely by declaration order — do not reorder or insert
    members mid-list if ordinals are persisted anywhere.
    """

    # punctuation and operators
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    # templating block delimiters (e.g. Jinja-style)
    BLOCK_START = auto()
    BLOCK_END = auto()

    # whitespace
    SPACE = auto()
    BREAK = auto()

    # literals and identifiers
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        # Stable textual form regardless of Python version's IntEnum str/format
        # behavior (pre-3.11 IntEnum would otherwise render as a bare int).
        return f"TokenType.{self.name}"
 466
 467
 468class Token:
 469    # mypyc doesn't expose slots
 470    _attrs: t.ClassVar[tuple[str, ...]] = (
 471        "token_type",
 472        "text",
 473        "line",
 474        "col",
 475        "start",
 476        "end",
 477        "comments",
 478    )
 479    __slots__ = _attrs
 480
 481    @classmethod
 482    def number(cls, number: int) -> Token:
 483        """Returns a NUMBER token with `number` as its text."""
 484        return cls(TokenType.NUMBER, str(number))
 485
 486    @classmethod
 487    def string(cls, string: str) -> Token:
 488        """Returns a STRING token with `string` as its text."""
 489        return cls(TokenType.STRING, string)
 490
 491    @classmethod
 492    def identifier(cls, identifier: str) -> Token:
 493        """Returns an IDENTIFIER token with `identifier` as its text."""
 494        return cls(TokenType.IDENTIFIER, identifier)
 495
 496    @classmethod
 497    def var(cls, var: str) -> Token:
 498        """Returns an VAR token with `var` as its text."""
 499        return cls(TokenType.VAR, var)
 500
 501    def __init__(
 502        self,
 503        token_type: TokenType,
 504        text: str,
 505        line: int = 1,
 506        col: int = 1,
 507        start: int = 0,
 508        end: int = 0,
 509        comments: list[str] | None = None,
 510    ) -> None:
 511        self.token_type = token_type
 512        self.text = text
 513        self.line = line
 514        self.col = col
 515        self.start = start
 516        self.end = end
 517        self.comments = [] if comments is None else comments
 518
 519    def __bool__(self) -> bool:
 520        return self.token_type != TokenType.SENTINEL
 521
 522    def __repr__(self) -> str:
 523        attributes = ", ".join(
 524            f"{k}: TokenType.{self.token_type.name}"
 525            if k == "token_type"
 526            else f"{k}: {getattr(self, k)}"
 527            for k in self._attrs
 528        )
 529        return f"<Token {attributes}>"
 530
 531
class TokenizerCore:
    """Dialect-agnostic scanning engine.

    All dialect-specific behavior (keyword tables, quote/comment delimiters,
    escape rules, numeric-literal suffixes, ...) is injected through
    __init__; the scanner state (`sql`, `_current`, `_char`, ...) is reset
    per call to `tokenize`.
    """

    __slots__ = (
        # per-run scanner state
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        # dialect configuration (set once in __init__)
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "bit_strings",
        "hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "numbers_can_have_decimals",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )
 574
    def __init__(
        self,
        single_tokens: dict[str, TokenType],
        keywords: dict[str, TokenType],
        quotes: dict[str, str],
        format_strings: dict[str, tuple[str, TokenType]],
        identifiers: dict[str, str],
        comments: dict[str, str | None],
        string_escapes: set[str],
        byte_string_escapes: set[str],
        identifier_escapes: set[str],
        escape_follow_chars: set[str],
        commands: set[TokenType],
        command_prefix_tokens: set[TokenType],
        nested_comments: bool,
        hint_start: str,
        tokens_preceding_hint: set[TokenType],
        bit_strings: list[str | tuple[str, str]],
        hex_strings: list[str | tuple[str, str]],
        numeric_literals: dict[str, str],
        var_single_tokens: set[str],
        string_escapes_allowed_in_raw_strings: bool,
        heredoc_tag_is_identifier: bool,
        heredoc_string_alternative: TokenType,
        keyword_trie: dict,
        numbers_can_be_underscore_separated: bool,
        numbers_can_have_decimals: bool,
        identifiers_can_start_with_digit: bool,
        unescaped_sequences: dict[str, str],
    ) -> None:
        """Store the dialect configuration and initialize empty scanner state.

        Each parameter is stored verbatim on an identically named attribute;
        presumably the caller (a dialect's Tokenizer) builds these tables —
        TODO confirm against the calling module.
        """
        # Dialect configuration: stored 1:1 under the parameter's name.
        self.single_tokens = single_tokens
        self.keywords = keywords
        self.quotes = quotes
        self.format_strings = format_strings
        self.identifiers = identifiers
        self.comments = comments
        self.string_escapes = string_escapes
        self.byte_string_escapes = byte_string_escapes
        self.identifier_escapes = identifier_escapes
        self.escape_follow_chars = escape_follow_chars
        self.commands = commands
        self.command_prefix_tokens = command_prefix_tokens
        self.nested_comments = nested_comments
        self.hint_start = hint_start
        self.tokens_preceding_hint = tokens_preceding_hint
        self.bit_strings = bit_strings
        self.hex_strings = hex_strings
        self.numeric_literals = numeric_literals
        self.var_single_tokens = var_single_tokens
        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
        self.heredoc_string_alternative = heredoc_string_alternative
        self.keyword_trie = keyword_trie
        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
        self.numbers_can_have_decimals = numbers_can_have_decimals
        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
        self.unescaped_sequences = unescaped_sequences
        # Mutable per-run scanner state; mirrors reset().
        self.sql = ""
        self.size = 0
        self.tokens: list[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: list[str] = []
        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1
 644
 645    def reset(self) -> None:
 646        self.sql = ""
 647        self.size = 0
 648        self.tokens = []
 649        self._start = 0
 650        self._current = 0
 651        self._line = 1
 652        self._col = 0
 653        self._comments = []
 654        self._char = ""
 655        self._end = False
 656        self._peek = ""
 657        self._prev_token_line = -1
 658
 659    def tokenize(self, sql: str) -> list[Token]:
 660        """Returns a list of tokens corresponding to the SQL string `sql`."""
 661        self.reset()
 662        self.sql = sql
 663        self.size = len(sql)
 664
 665        try:
 666            self._scan()
 667        except Exception as e:
 668            start = max(self._current - 50, 0)
 669            end = min(self._current + 50, self.size - 1)
 670            context = self.sql[start:end]
 671            raise TokenError(f"Error tokenizing '{context}'") from e
 672
 673        return self.tokens
 674
    def _scan(self, check_semicolon: bool = False) -> None:
        """Main scanning loop: dispatch on the current char until EOF.

        With `check_semicolon=True` (used while swallowing a command's
        trailing text) the loop stops as soon as the next char is `;`.
        """
        # Bind hot lookups to locals once per call.
        identifiers = self.identifiers
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # Jump over the whole run of spaces; otherwise advance by one.
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if not self._char.isspace():
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    # Quoted identifier: `identifiers` maps start -> end delimiter.
                    # NOTE(review): _scan_identifier is defined outside this view.
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Trailing comments with no following token attach to the last token.
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)
 709
 710    def _chars(self, size: int) -> str:
 711        if size == 1:
 712            return self._char
 713
 714        start = self._current - 1
 715        end = start + size
 716
 717        return self.sql[start:end] if end <= self.size else ""
 718
    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Move the cursor forward by `i` chars, maintaining line/col state.

        With `alnum=True`, additionally gobble the following run of
        alphanumeric characters in a tight local-variable loop (fast path
        for identifiers/numbers inside strings and comments).
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        # _char is the character *behind* the cursor; _peek the one at it.
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]
 755
 756    @property
 757    def _text(self) -> str:
 758        return self.sql[self._start : self._current]
 759
    def _add(self, token_type: TokenType, text: str | None = None) -> None:
        """Append a token of `token_type` (text defaults to the current scan
        window) and attach any pending comments to it.

        For command tokens this may recursively consume everything up to the
        next semicolon as a single STRING token.
        """
        self._prev_token_line = self._line

        # A comment right before `;` belongs to the statement, i.e. the
        # previous token, not to the semicolon itself.
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,  # inclusive end offset
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            # Re-entrant scan up to the next `;`; its tokens are discarded and
            # replaced by the raw text they covered.
            self._scan(check_semicolon=True)
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)
 797
    def _scan_keywords(self) -> None:
        """Greedily match the longest keyword starting at the current char via
        the keyword trie, falling back to single-char tokens and then vars.

        While walking, runs of whitespace in the input are folded to a single
        space so multi-word keywords (e.g. "GROUP BY") match regardless of
        spacing; `skip` suppresses trie steps while inside such a run.
        """
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        size = 0          # how many source chars we have looked ahead
        word = None       # longest keyword candidate matched so far
        chars = self._char
        char = chars
        prev_space = False
        skip = False
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                # Case-insensitive trie step; 0 marks a complete keyword.
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char.isspace()

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                break

        if word:
            # A matched word may actually open a string or a comment
            # (e.g. a quote or comment delimiter is in the keyword table).
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Only accept the keyword if it ends at a boundary.
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        # NOTE(review): _scan_var is defined outside this view.
        self._scan_var()
 857
    def _scan_comment(self, comment_start: str) -> bool:
        """Consume a comment starting with `comment_start`.

        Returns False when `comment_start` is not a comment delimiter for
        this dialect. Block comments (those with an end delimiter) support
        nesting when the dialect allows it; line comments run to EOL.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        # End delimiter, or None/"" for line comments.
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # Strip the delimiters from the stored comment text.
            # NOTE(review): the `-comment_end_size + 1` slice assumes a
            # multi-char end delimiter; a 1-char one would yield "".
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # Line comment: consume up to (not including) the line break.
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True
 915
    def _scan_number(self) -> None:
        """Scan a numeric literal starting at the current (digit) char.

        Handles 0b/0x prefixes, decimals, scientific notation, underscore
        separators, and dialect numeric-literal suffixes (e.g. 123L which is
        re-emitted as 123 :: BIGINT).
        """
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0  # 0 = none, 1 = saw E, 2 = saw E and a sign
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        is_underscore_separated: bool = False
        number_text: str = ""
        numeric_literal: str = ""       # trailing suffix like "L" or "BD"
        numeric_type: TokenType | None = None

        while True:
            if self._peek in _DIGIT_CHARS:
                # Batch consecutive digits: scan ahead to find how many
                sql = self.sql
                end = self._current + 1
                size = self.size
                while end < size and sql[end] in _DIGIT_CHARS:
                    end += 1
                self._advance(end - self._current)
            elif self._peek == "." and not decimal:
                # A dot after a parameter token (e.g. @1.foo) is not a decimal point.
                if (
                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
                ) or not self.numbers_can_have_decimals:
                    break
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    break
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                is_underscore_separated = True
                self._advance()
            elif self._peek.isidentifier():
                # Possible dialect suffix (123L) or a digit-leading identifier.
                number_text = self._text

                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
                    numeric_literal += self._peek
                    self._advance()

                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))

                if numeric_type:
                    break
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Not a known suffix: rewind past the consumed chars.
                self._advance(-len(numeric_literal))
                break
            else:
                break

        number_text = number_text or self.sql[self._start : self._current]

        # Normalize inputs such as 100_000 to 100000
        if is_underscore_separated:
            number_text = number_text.replace("_", "")

        self._add(TokenType.NUMBER, number_text)

        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
        if numeric_type:
            self._add(TokenType.DCOLON, "::")
            self._add(numeric_type, numeric_literal)
 997
 998    def _scan_bits(self) -> None:
 999        self._advance()
1000        value = self._extract_value()
1001        try:
1002            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1003            int(value, 2)
1004            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1005        except ValueError:
1006            self._add(TokenType.IDENTIFIER)
1007
1008    def _scan_hex(self) -> None:
1009        self._advance()
1010        value = self._extract_value()
1011        try:
1012            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1013            int(value, 16)
1014            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1015        except ValueError:
1016            self._add(TokenType.IDENTIFIER)
1017
1018    def _extract_value(self) -> str:
1019        single_tokens = self.single_tokens
1020
1021        while True:
1022            char = self._peek.strip()
1023            if char and char not in single_tokens:
1024                self._advance(alnum=True)
1025            else:
1026                break
1027
1028        return self._text
1029
    def _scan_string(self, start: str) -> bool:
        """Try to tokenize a string literal introduced by `start`.

        Returns True if a token was emitted, False if `start` is neither a quote
        nor a format-string prefix for this dialect. Raises TokenError when a
        hex/bit format string contains characters invalid for its base.
        """
        # `base` is set only for hex/bit strings so their contents can be validated below
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                # An empty tag, e.g. $$...$$, means the heredoc delimiter is just `end`
                if self._char == end:
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                # If the tag cannot be a heredoc tag in this dialect (EOF reached,
                # all-digit, or contains whitespace), rewind and re-tokenize the
                # start char as the dialect's alternative token instead.
                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                # Full heredoc delimiter, e.g. $tag$
                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        # Validate hex/bit string contents by attempting the base conversion
        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True
1092
1093    def _scan_identifier(self, identifier_end: str) -> None:
1094        self._advance()
1095        text = self._extract_string(
1096            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1097        )
1098        self._add(TokenType.IDENTIFIER, text)
1099
1100    def _scan_var(self) -> None:
1101        var_single_tokens = self.var_single_tokens
1102        single_tokens = self.single_tokens
1103
1104        while True:
1105            peek = self._peek
1106            if not peek or peek.isspace():
1107                break
1108            if peek not in var_single_tokens and peek in single_tokens:
1109                break
1110            self._advance(alnum=True)
1111
1112        self._add(
1113            TokenType.VAR
1114            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1115            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1116        )
1117
    def _extract_string(
        self,
        delimiter: str,
        escapes: set[str] | None = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Consume and return the text of a string up to (and past) `delimiter`.

        `escapes` defaults to the dialect's string escapes; `raw_string`
        disables most escape processing; `raise_unmatched=False` returns the
        remaining text at EOF instead of raising TokenError.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        # Hoist instance attributes to locals for the hot loop below
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes
        sql = self.sql

        # use str.find() when the string is simple... no \ or other escapes
        if delim_size == 1:
            pos = self._current - 1
            end = sql.find(delimiter, pos)

            if (
                # the closing delimiter was found
                end != -1
                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
                # no backslash in the string that would need escape processing
                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
            ):
                # Fast path: update position/line/col state in bulk and return the slice
                newlines = sql.count("\n", pos, end)
                if newlines:
                    self._line += newlines
                    self._col = end - sql.rfind("\n", pos, end)
                else:
                    self._col += end - pos

                self._current = end + 1
                self._end = self._current >= self.size
                self._char = sql[end]
                self._peek = "" if self._end else sql[self._current]
                return sql[pos:end]

        # Slow path: walk character by character, handling escapes as we go
        while True:
            # Replace dialect-specific escape sequences (e.g. \n) with their expansion
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            # A backslash followed by a char outside escape_follow_chars counts
            # as a custom escape in dialects that define escape_follow_chars
            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                # Escaped delimiter or escaped escape: keep the escaped char(s)
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                # Check for the (possibly multi-char) closing delimiter
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Consume a run of alphanumeric chars in one step where possible
                current = self._current - 1
                self._advance(alnum=True)
                text += sql[current : self._current - 1]

        return text
class TokenType(IntEnum):
class TokenType(IntEnum):
    """Every token type the tokenizer can emit.

    Members are assigned integer values by `auto()` in declaration order, so
    refer to them by name; inserting a member shifts the values of all later
    ones.
    """

    # operators and punctuation
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DOTCARET = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    # templating / jinja-style blocks
    BLOCK_START = auto()
    BLOCK_END = auto()

    # whitespace
    SPACE = auto()
    BREAK = auto()

    # literals and identifiers
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAM = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTEGRATION = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PACKAGE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POLICY = auto()
    POOL = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    RULE = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    SQL_SECURITY = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    VOLUME = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinels
    HIVE_TOKEN_STREAM = auto()
    SENTINEL = auto()

    def __str__(self) -> str:
        """Render as e.g. "TokenType.SELECT"."""
        return f"TokenType.{self.name}"

An enumeration of every token type the tokenizer can emit; member values are auto-assigned integers.

L_PAREN = <TokenType.L_PAREN: 1>
R_PAREN = <TokenType.R_PAREN: 2>
L_BRACKET = <TokenType.L_BRACKET: 3>
R_BRACKET = <TokenType.R_BRACKET: 4>
L_BRACE = <TokenType.L_BRACE: 5>
R_BRACE = <TokenType.R_BRACE: 6>
COMMA = <TokenType.COMMA: 7>
DOT = <TokenType.DOT: 8>
DASH = <TokenType.DASH: 9>
PLUS = <TokenType.PLUS: 10>
COLON = <TokenType.COLON: 11>
DOTCOLON = <TokenType.DOTCOLON: 12>
DOTCARET = <TokenType.DOTCARET: 13>
DCOLON = <TokenType.DCOLON: 14>
DCOLONDOLLAR = <TokenType.DCOLONDOLLAR: 15>
DCOLONPERCENT = <TokenType.DCOLONPERCENT: 16>
DCOLONQMARK = <TokenType.DCOLONQMARK: 17>
DQMARK = <TokenType.DQMARK: 18>
SEMICOLON = <TokenType.SEMICOLON: 19>
STAR = <TokenType.STAR: 20>
BACKSLASH = <TokenType.BACKSLASH: 21>
SLASH = <TokenType.SLASH: 22>
LT = <TokenType.LT: 23>
LTE = <TokenType.LTE: 24>
GT = <TokenType.GT: 25>
GTE = <TokenType.GTE: 26>
NOT = <TokenType.NOT: 27>
EQ = <TokenType.EQ: 28>
NEQ = <TokenType.NEQ: 29>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 30>
COLON_EQ = <TokenType.COLON_EQ: 31>
COLON_GT = <TokenType.COLON_GT: 32>
NCOLON_GT = <TokenType.NCOLON_GT: 33>
AND = <TokenType.AND: 34>
OR = <TokenType.OR: 35>
AMP = <TokenType.AMP: 36>
DPIPE = <TokenType.DPIPE: 37>
PIPE_GT = <TokenType.PIPE_GT: 38>
PIPE = <TokenType.PIPE: 39>
PIPE_SLASH = <TokenType.PIPE_SLASH: 40>
DPIPE_SLASH = <TokenType.DPIPE_SLASH: 41>
CARET = <TokenType.CARET: 42>
CARET_AT = <TokenType.CARET_AT: 43>
TILDE = <TokenType.TILDE: 44>
ARROW = <TokenType.ARROW: 45>
DARROW = <TokenType.DARROW: 46>
FARROW = <TokenType.FARROW: 47>
HASH = <TokenType.HASH: 48>
HASH_ARROW = <TokenType.HASH_ARROW: 49>
DHASH_ARROW = <TokenType.DHASH_ARROW: 50>
LR_ARROW = <TokenType.LR_ARROW: 51>
DAT = <TokenType.DAT: 52>
LT_AT = <TokenType.LT_AT: 53>
AT_GT = <TokenType.AT_GT: 54>
DOLLAR = <TokenType.DOLLAR: 55>
PARAMETER = <TokenType.PARAMETER: 56>
SESSION = <TokenType.SESSION: 57>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 58>
SESSION_USER = <TokenType.SESSION_USER: 59>
DAMP = <TokenType.DAMP: 60>
AMP_LT = <TokenType.AMP_LT: 61>
AMP_GT = <TokenType.AMP_GT: 62>
ADJACENT = <TokenType.ADJACENT: 63>
XOR = <TokenType.XOR: 64>
DSTAR = <TokenType.DSTAR: 65>
QMARK_AMP = <TokenType.QMARK_AMP: 66>
QMARK_PIPE = <TokenType.QMARK_PIPE: 67>
HASH_DASH = <TokenType.HASH_DASH: 68>
EXCLAMATION = <TokenType.EXCLAMATION: 69>
URI_START = <TokenType.URI_START: 70>
BLOCK_START = <TokenType.BLOCK_START: 71>
BLOCK_END = <TokenType.BLOCK_END: 72>
SPACE = <TokenType.SPACE: 73>
BREAK = <TokenType.BREAK: 74>
STRING = <TokenType.STRING: 75>
NUMBER = <TokenType.NUMBER: 76>
IDENTIFIER = <TokenType.IDENTIFIER: 77>
DATABASE = <TokenType.DATABASE: 78>
COLUMN = <TokenType.COLUMN: 79>
COLUMN_DEF = <TokenType.COLUMN_DEF: 80>
SCHEMA = <TokenType.SCHEMA: 81>
TABLE = <TokenType.TABLE: 82>
WAREHOUSE = <TokenType.WAREHOUSE: 83>
STAGE = <TokenType.STAGE: 84>
STREAM = <TokenType.STREAM: 85>
STREAMLIT = <TokenType.STREAMLIT: 86>
VAR = <TokenType.VAR: 87>
BIT_STRING = <TokenType.BIT_STRING: 88>
HEX_STRING = <TokenType.HEX_STRING: 89>
BYTE_STRING = <TokenType.BYTE_STRING: 90>
NATIONAL_STRING = <TokenType.NATIONAL_STRING: 91>
RAW_STRING = <TokenType.RAW_STRING: 92>
HEREDOC_STRING = <TokenType.HEREDOC_STRING: 93>
UNICODE_STRING = <TokenType.UNICODE_STRING: 94>
BIT = <TokenType.BIT: 95>
BOOLEAN = <TokenType.BOOLEAN: 96>
TINYINT = <TokenType.TINYINT: 97>
UTINYINT = <TokenType.UTINYINT: 98>
SMALLINT = <TokenType.SMALLINT: 99>
USMALLINT = <TokenType.USMALLINT: 100>
MEDIUMINT = <TokenType.MEDIUMINT: 101>
UMEDIUMINT = <TokenType.UMEDIUMINT: 102>
INT = <TokenType.INT: 103>
UINT = <TokenType.UINT: 104>
BIGINT = <TokenType.BIGINT: 105>
UBIGINT = <TokenType.UBIGINT: 106>
BIGNUM = <TokenType.BIGNUM: 107>
INT128 = <TokenType.INT128: 108>
UINT128 = <TokenType.UINT128: 109>
INT256 = <TokenType.INT256: 110>
UINT256 = <TokenType.UINT256: 111>
FLOAT = <TokenType.FLOAT: 112>
DOUBLE = <TokenType.DOUBLE: 113>
UDOUBLE = <TokenType.UDOUBLE: 114>
DECIMAL = <TokenType.DECIMAL: 115>
DECIMAL32 = <TokenType.DECIMAL32: 116>
DECIMAL64 = <TokenType.DECIMAL64: 117>
DECIMAL128 = <TokenType.DECIMAL128: 118>
DECIMAL256 = <TokenType.DECIMAL256: 119>
DECFLOAT = <TokenType.DECFLOAT: 120>
UDECIMAL = <TokenType.UDECIMAL: 121>
BIGDECIMAL = <TokenType.BIGDECIMAL: 122>
CHAR = <TokenType.CHAR: 123>
NCHAR = <TokenType.NCHAR: 124>
VARCHAR = <TokenType.VARCHAR: 125>
NVARCHAR = <TokenType.NVARCHAR: 126>
BPCHAR = <TokenType.BPCHAR: 127>
TEXT = <TokenType.TEXT: 128>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 129>
LONGTEXT = <TokenType.LONGTEXT: 130>
BLOB = <TokenType.BLOB: 131>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 132>
LONGBLOB = <TokenType.LONGBLOB: 133>
TINYBLOB = <TokenType.TINYBLOB: 134>
TINYTEXT = <TokenType.TINYTEXT: 135>
NAME = <TokenType.NAME: 136>
BINARY = <TokenType.BINARY: 137>
VARBINARY = <TokenType.VARBINARY: 138>
JSON = <TokenType.JSON: 139>
JSONB = <TokenType.JSONB: 140>
TIME = <TokenType.TIME: 141>
TIMETZ = <TokenType.TIMETZ: 142>
TIME_NS = <TokenType.TIME_NS: 143>
TIMESTAMP = <TokenType.TIMESTAMP: 144>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 145>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 146>
TIMESTAMPNTZ = <TokenType.TIMESTAMPNTZ: 147>
TIMESTAMP_S = <TokenType.TIMESTAMP_S: 148>
TIMESTAMP_MS = <TokenType.TIMESTAMP_MS: 149>
TIMESTAMP_NS = <TokenType.TIMESTAMP_NS: 150>
DATETIME = <TokenType.DATETIME: 151>
DATETIME2 = <TokenType.DATETIME2: 152>
DATETIME64 = <TokenType.DATETIME64: 153>
SMALLDATETIME = <TokenType.SMALLDATETIME: 154>
DATE = <TokenType.DATE: 155>
DATE32 = <TokenType.DATE32: 156>
INT4RANGE = <TokenType.INT4RANGE: 157>
INT4MULTIRANGE = <TokenType.INT4MULTIRANGE: 158>
INT8RANGE = <TokenType.INT8RANGE: 159>
INT8MULTIRANGE = <TokenType.INT8MULTIRANGE: 160>
NUMRANGE = <TokenType.NUMRANGE: 161>
NUMMULTIRANGE = <TokenType.NUMMULTIRANGE: 162>
TSRANGE = <TokenType.TSRANGE: 163>
TSMULTIRANGE = <TokenType.TSMULTIRANGE: 164>
TSTZRANGE = <TokenType.TSTZRANGE: 165>
TSTZMULTIRANGE = <TokenType.TSTZMULTIRANGE: 166>
DATERANGE = <TokenType.DATERANGE: 167>
DATEMULTIRANGE = <TokenType.DATEMULTIRANGE: 168>
UUID = <TokenType.UUID: 169>
GEOGRAPHY = <TokenType.GEOGRAPHY: 170>
GEOGRAPHYPOINT = <TokenType.GEOGRAPHYPOINT: 171>
NULLABLE = <TokenType.NULLABLE: 172>
GEOMETRY = <TokenType.GEOMETRY: 173>
POINT = <TokenType.POINT: 174>
RING = <TokenType.RING: 175>
LINESTRING = <TokenType.LINESTRING: 176>
LOCALTIME = <TokenType.LOCALTIME: 177>
LOCALTIMESTAMP = <TokenType.LOCALTIMESTAMP: 178>
SYSTIMESTAMP = <TokenType.SYSTIMESTAMP: 179>
MULTILINESTRING = <TokenType.MULTILINESTRING: 180>
POLYGON = <TokenType.POLYGON: 181>
MULTIPOLYGON = <TokenType.MULTIPOLYGON: 182>
HLLSKETCH = <TokenType.HLLSKETCH: 183>
HSTORE = <TokenType.HSTORE: 184>
SUPER = <TokenType.SUPER: 185>
SERIAL = <TokenType.SERIAL: 186>
SMALLSERIAL = <TokenType.SMALLSERIAL: 187>
BIGSERIAL = <TokenType.BIGSERIAL: 188>
XML = <TokenType.XML: 189>
YEAR = <TokenType.YEAR: 190>
USERDEFINED = <TokenType.USERDEFINED: 191>
MONEY = <TokenType.MONEY: 192>
SMALLMONEY = <TokenType.SMALLMONEY: 193>
ROWVERSION = <TokenType.ROWVERSION: 194>
IMAGE = <TokenType.IMAGE: 195>
VARIANT = <TokenType.VARIANT: 196>
OBJECT = <TokenType.OBJECT: 197>
INET = <TokenType.INET: 198>
IPADDRESS = <TokenType.IPADDRESS: 199>
IPPREFIX = <TokenType.IPPREFIX: 200>
IPV4 = <TokenType.IPV4: 201>
IPV6 = <TokenType.IPV6: 202>
ENUM = <TokenType.ENUM: 203>
ENUM8 = <TokenType.ENUM8: 204>
ENUM16 = <TokenType.ENUM16: 205>
FIXEDSTRING = <TokenType.FIXEDSTRING: 206>
LOWCARDINALITY = <TokenType.LOWCARDINALITY: 207>
NESTED = <TokenType.NESTED: 208>
AGGREGATEFUNCTION = <TokenType.AGGREGATEFUNCTION: 209>
SIMPLEAGGREGATEFUNCTION = <TokenType.SIMPLEAGGREGATEFUNCTION: 210>
TDIGEST = <TokenType.TDIGEST: 211>
UNKNOWN = <TokenType.UNKNOWN: 212>
VECTOR = <TokenType.VECTOR: 213>
DYNAMIC = <TokenType.DYNAMIC: 214>
VOID = <TokenType.VOID: 215>
ALIAS = <TokenType.ALIAS: 216>
ALTER = <TokenType.ALTER: 217>
ALL = <TokenType.ALL: 218>
ANTI = <TokenType.ANTI: 219>
ANY = <TokenType.ANY: 220>
APPLY = <TokenType.APPLY: 221>
ARRAY = <TokenType.ARRAY: 222>
ASC = <TokenType.ASC: 223>
ASOF = <TokenType.ASOF: 224>
ATTACH = <TokenType.ATTACH: 225>
AUTO_INCREMENT = <TokenType.AUTO_INCREMENT: 226>
BEGIN = <TokenType.BEGIN: 227>
BETWEEN = <TokenType.BETWEEN: 228>
BULK_COLLECT_INTO = <TokenType.BULK_COLLECT_INTO: 229>
CACHE = <TokenType.CACHE: 230>
CASE = <TokenType.CASE: 231>
CHARACTER_SET = <TokenType.CHARACTER_SET: 232>
CLUSTER_BY = <TokenType.CLUSTER_BY: 233>
COLLATE = <TokenType.COLLATE: 234>
COMMAND = <TokenType.COMMAND: 235>
COMMENT = <TokenType.COMMENT: 236>
COMMIT = <TokenType.COMMIT: 237>
CONNECT_BY = <TokenType.CONNECT_BY: 238>
CONSTRAINT = <TokenType.CONSTRAINT: 239>
COPY = <TokenType.COPY: 240>
CREATE = <TokenType.CREATE: 241>
CROSS = <TokenType.CROSS: 242>
CUBE = <TokenType.CUBE: 243>
CURRENT_DATE = <TokenType.CURRENT_DATE: 244>
CURRENT_DATETIME = <TokenType.CURRENT_DATETIME: 245>
CURRENT_SCHEMA = <TokenType.CURRENT_SCHEMA: 246>
CURRENT_TIME = <TokenType.CURRENT_TIME: 247>
CURRENT_TIMESTAMP = <TokenType.CURRENT_TIMESTAMP: 248>
CURRENT_USER = <TokenType.CURRENT_USER: 249>
CURRENT_ROLE = <TokenType.CURRENT_ROLE: 250>
CURRENT_CATALOG = <TokenType.CURRENT_CATALOG: 251>
DECLARE = <TokenType.DECLARE: 252>
DEFAULT = <TokenType.DEFAULT: 253>
DELETE = <TokenType.DELETE: 254>
DESC = <TokenType.DESC: 255>
DESCRIBE = <TokenType.DESCRIBE: 256>
DETACH = <TokenType.DETACH: 257>
DICTIONARY = <TokenType.DICTIONARY: 258>
DISTINCT = <TokenType.DISTINCT: 259>
DISTRIBUTE_BY = <TokenType.DISTRIBUTE_BY: 260>
DIV = <TokenType.DIV: 261>
DROP = <TokenType.DROP: 262>
ELSE = <TokenType.ELSE: 263>
END = <TokenType.END: 264>
ESCAPE = <TokenType.ESCAPE: 265>
EXCEPT = <TokenType.EXCEPT: 266>
EXECUTE = <TokenType.EXECUTE: 267>
EXISTS = <TokenType.EXISTS: 268>
FALSE = <TokenType.FALSE: 269>
FETCH = <TokenType.FETCH: 270>
FILE = <TokenType.FILE: 271>
FILE_FORMAT = <TokenType.FILE_FORMAT: 272>
FILTER = <TokenType.FILTER: 273>
FINAL = <TokenType.FINAL: 274>
FIRST = <TokenType.FIRST: 275>
FOR = <TokenType.FOR: 276>
FORCE = <TokenType.FORCE: 277>
FOREIGN_KEY = <TokenType.FOREIGN_KEY: 278>
FORMAT = <TokenType.FORMAT: 279>
FROM = <TokenType.FROM: 280>
FULL = <TokenType.FULL: 281>
FUNCTION = <TokenType.FUNCTION: 282>
GET = <TokenType.GET: 283>
GLOB = <TokenType.GLOB: 284>
GLOBAL = <TokenType.GLOBAL: 285>
GRANT = <TokenType.GRANT: 286>
GROUP_BY = <TokenType.GROUP_BY: 287>
GROUPING_SETS = <TokenType.GROUPING_SETS: 288>
HAVING = <TokenType.HAVING: 289>
HINT = <TokenType.HINT: 290>
IGNORE = <TokenType.IGNORE: 291>
ILIKE = <TokenType.ILIKE: 292>
IN = <TokenType.IN: 293>
INDEX = <TokenType.INDEX: 294>
INDEXED_BY = <TokenType.INDEXED_BY: 295>
INNER = <TokenType.INNER: 296>
INSERT = <TokenType.INSERT: 297>
INSTALL = <TokenType.INSTALL: 298>
INTEGRATION = <TokenType.INTEGRATION: 299>
INTERSECT = <TokenType.INTERSECT: 300>
INTERVAL = <TokenType.INTERVAL: 301>
INTO = <TokenType.INTO: 302>
INTRODUCER = <TokenType.INTRODUCER: 303>
IRLIKE = <TokenType.IRLIKE: 304>
IS = <TokenType.IS: 305>
ISNULL = <TokenType.ISNULL: 306>
JOIN = <TokenType.JOIN: 307>
JOIN_MARKER = <TokenType.JOIN_MARKER: 308>
KEEP = <TokenType.KEEP: 309>
KEY = <TokenType.KEY: 310>
KILL = <TokenType.KILL: 311>
LANGUAGE = <TokenType.LANGUAGE: 312>
LATERAL = <TokenType.LATERAL: 313>
LEFT = <TokenType.LEFT: 314>
LIKE = <TokenType.LIKE: 315>
LIMIT = <TokenType.LIMIT: 316>
LIST = <TokenType.LIST: 317>
LOAD = <TokenType.LOAD: 318>
LOCK = <TokenType.LOCK: 319>
MAP = <TokenType.MAP: 320>
MATCH = <TokenType.MATCH: 321>
MATCH_CONDITION = <TokenType.MATCH_CONDITION: 322>
MATCH_RECOGNIZE = <TokenType.MATCH_RECOGNIZE: 323>
MEMBER_OF = <TokenType.MEMBER_OF: 324>
MERGE = <TokenType.MERGE: 325>
MOD = <TokenType.MOD: 326>
MODEL = <TokenType.MODEL: 327>
NATURAL = <TokenType.NATURAL: 328>
NEXT = <TokenType.NEXT: 329>
NOTHING = <TokenType.NOTHING: 330>
NOTNULL = <TokenType.NOTNULL: 331>
NULL = <TokenType.NULL: 332>
OBJECT_IDENTIFIER = <TokenType.OBJECT_IDENTIFIER: 333>
OFFSET = <TokenType.OFFSET: 334>
ON = <TokenType.ON: 335>
ONLY = <TokenType.ONLY: 336>
OPERATOR = <TokenType.OPERATOR: 337>
ORDER_BY = <TokenType.ORDER_BY: 338>
ORDER_SIBLINGS_BY = <TokenType.ORDER_SIBLINGS_BY: 339>
ORDERED = <TokenType.ORDERED: 340>
ORDINALITY = <TokenType.ORDINALITY: 341>
OUT = <TokenType.OUT: 342>
INOUT = <TokenType.INOUT: 343>
OUTER = <TokenType.OUTER: 344>
OVER = <TokenType.OVER: 345>
OVERLAPS = <TokenType.OVERLAPS: 346>
OVERWRITE = <TokenType.OVERWRITE: 347>
PACKAGE = <TokenType.PACKAGE: 348>
PARTITION = <TokenType.PARTITION: 349>
PARTITION_BY = <TokenType.PARTITION_BY: 350>
PERCENT = <TokenType.PERCENT: 351>
PIVOT = <TokenType.PIVOT: 352>
PLACEHOLDER = <TokenType.PLACEHOLDER: 353>
POLICY = <TokenType.POLICY: 354>
POOL = <TokenType.POOL: 355>
POSITIONAL = <TokenType.POSITIONAL: 356>
PRAGMA = <TokenType.PRAGMA: 357>
PREWHERE = <TokenType.PREWHERE: 358>
PRIMARY_KEY = <TokenType.PRIMARY_KEY: 359>
PROCEDURE = <TokenType.PROCEDURE: 360>
PROPERTIES = <TokenType.PROPERTIES: 361>
PSEUDO_TYPE = <TokenType.PSEUDO_TYPE: 362>
PUT = <TokenType.PUT: 363>
QUALIFY = <TokenType.QUALIFY: 364>
QUOTE = <TokenType.QUOTE: 365>
QDCOLON = <TokenType.QDCOLON: 366>
RANGE = <TokenType.RANGE: 367>
RECURSIVE = <TokenType.RECURSIVE: 368>
REFRESH = <TokenType.REFRESH: 369>
RENAME = <TokenType.RENAME: 370>
REPLACE = <TokenType.REPLACE: 371>
RETURNING = <TokenType.RETURNING: 372>
REVOKE = <TokenType.REVOKE: 373>
REFERENCES = <TokenType.REFERENCES: 374>
RIGHT = <TokenType.RIGHT: 375>
RLIKE = <TokenType.RLIKE: 376>
ROLE = <TokenType.ROLE: 377>
ROLLBACK = <TokenType.ROLLBACK: 378>
ROLLUP = <TokenType.ROLLUP: 379>
ROW = <TokenType.ROW: 380>
ROWS = <TokenType.ROWS: 381>
RULE = <TokenType.RULE: 382>
SELECT = <TokenType.SELECT: 383>
SEMI = <TokenType.SEMI: 384>
SEPARATOR = <TokenType.SEPARATOR: 385>
SEQUENCE = <TokenType.SEQUENCE: 386>
SERDE_PROPERTIES = <TokenType.SERDE_PROPERTIES: 387>
SET = <TokenType.SET: 388>
SETTINGS = <TokenType.SETTINGS: 389>
SHOW = <TokenType.SHOW: 390>
SIMILAR_TO = <TokenType.SIMILAR_TO: 391>
SOME = <TokenType.SOME: 392>
SORT_BY = <TokenType.SORT_BY: 393>
SOUNDS_LIKE = <TokenType.SOUNDS_LIKE: 394>
SQL_SECURITY = <TokenType.SQL_SECURITY: 395>
START_WITH = <TokenType.START_WITH: 396>
STORAGE_INTEGRATION = <TokenType.STORAGE_INTEGRATION: 397>
STRAIGHT_JOIN = <TokenType.STRAIGHT_JOIN: 398>
STRUCT = <TokenType.STRUCT: 399>
SUMMARIZE = <TokenType.SUMMARIZE: 400>
TABLE_SAMPLE = <TokenType.TABLE_SAMPLE: 401>
TAG = <TokenType.TAG: 402>
TEMPORARY = <TokenType.TEMPORARY: 403>
TOP = <TokenType.TOP: 404>
THEN = <TokenType.THEN: 405>
TRUE = <TokenType.TRUE: 406>
TRUNCATE = <TokenType.TRUNCATE: 407>
TRIGGER = <TokenType.TRIGGER: 408>
UNCACHE = <TokenType.UNCACHE: 409>
UNION = <TokenType.UNION: 410>
UNNEST = <TokenType.UNNEST: 411>
UNPIVOT = <TokenType.UNPIVOT: 412>
UPDATE = <TokenType.UPDATE: 413>
USE = <TokenType.USE: 414>
USING = <TokenType.USING: 415>
VALUES = <TokenType.VALUES: 416>
VARIADIC = <TokenType.VARIADIC: 417>
VIEW = <TokenType.VIEW: 418>
SEMANTIC_VIEW = <TokenType.SEMANTIC_VIEW: 419>
VOLATILE = <TokenType.VOLATILE: 420>
VOLUME = <TokenType.VOLUME: 421>
WHEN = <TokenType.WHEN: 422>
WHERE = <TokenType.WHERE: 423>
WINDOW = <TokenType.WINDOW: 424>
WITH = <TokenType.WITH: 425>
UNIQUE = <TokenType.UNIQUE: 426>
UTC_DATE = <TokenType.UTC_DATE: 427>
UTC_TIME = <TokenType.UTC_TIME: 428>
UTC_TIMESTAMP = <TokenType.UTC_TIMESTAMP: 429>
VERSION_SNAPSHOT = <TokenType.VERSION_SNAPSHOT: 430>
TIMESTAMP_SNAPSHOT = <TokenType.TIMESTAMP_SNAPSHOT: 431>
OPTION = <TokenType.OPTION: 432>
SINK = <TokenType.SINK: 433>
SOURCE = <TokenType.SOURCE: 434>
ANALYZE = <TokenType.ANALYZE: 435>
NAMESPACE = <TokenType.NAMESPACE: 436>
EXPORT = <TokenType.EXPORT: 437>
HIVE_TOKEN_STREAM = <TokenType.HIVE_TOKEN_STREAM: 438>
SENTINEL = <TokenType.SENTINEL: 439>
class Token:
469class Token:
470    # mypyc doesn't expose slots
471    _attrs: t.ClassVar[tuple[str, ...]] = (
472        "token_type",
473        "text",
474        "line",
475        "col",
476        "start",
477        "end",
478        "comments",
479    )
480    __slots__ = _attrs
481
482    @classmethod
483    def number(cls, number: int) -> Token:
484        """Returns a NUMBER token with `number` as its text."""
485        return cls(TokenType.NUMBER, str(number))
486
487    @classmethod
488    def string(cls, string: str) -> Token:
489        """Returns a STRING token with `string` as its text."""
490        return cls(TokenType.STRING, string)
491
492    @classmethod
493    def identifier(cls, identifier: str) -> Token:
494        """Returns an IDENTIFIER token with `identifier` as its text."""
495        return cls(TokenType.IDENTIFIER, identifier)
496
497    @classmethod
498    def var(cls, var: str) -> Token:
499        """Returns a VAR token with `var` as its text."""
500        return cls(TokenType.VAR, var)
501
502    def __init__(
503        self,
504        token_type: TokenType,
505        text: str,
506        line: int = 1,
507        col: int = 1,
508        start: int = 0,
509        end: int = 0,
510        comments: list[str] | None = None,
511    ) -> None:
512        self.token_type = token_type
513        self.text = text
514        self.line = line
515        self.col = col
516        self.start = start
517        self.end = end
518        self.comments = [] if comments is None else comments
519
520    def __bool__(self) -> bool:
521        return self.token_type != TokenType.SENTINEL
522
523    def __repr__(self) -> str:
524        attributes = ", ".join(
525            f"{k}: TokenType.{self.token_type.name}"
526            if k == "token_type"
527            else f"{k}: {getattr(self, k)}"
528            for k in self._attrs
529        )
530        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: list[str] | None = None)
502    def __init__(
503        self,
504        token_type: TokenType,
505        text: str,
506        line: int = 1,
507        col: int = 1,
508        start: int = 0,
509        end: int = 0,
510        comments: list[str] | None = None,
511    ) -> None:
512        self.token_type = token_type
513        self.text = text
514        self.line = line
515        self.col = col
516        self.start = start
517        self.end = end
518        self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
482    @classmethod
483    def number(cls, number: int) -> Token:
484        """Returns a NUMBER token with `number` as its text."""
485        return cls(TokenType.NUMBER, str(number))

Returns a NUMBER token with number as its text.

@classmethod
def string(cls, string: str) -> Token:
487    @classmethod
488    def string(cls, string: str) -> Token:
489        """Returns a STRING token with `string` as its text."""
490        return cls(TokenType.STRING, string)

Returns a STRING token with string as its text.

@classmethod
def identifier(cls, identifier: str) -> Token:
492    @classmethod
493    def identifier(cls, identifier: str) -> Token:
494        """Returns an IDENTIFIER token with `identifier` as its text."""
495        return cls(TokenType.IDENTIFIER, identifier)

Returns an IDENTIFIER token with identifier as its text.

@classmethod
def var(cls, var: str) -> Token:
497    @classmethod
498    def var(cls, var: str) -> Token:
499        """Returns a VAR token with `var` as its text."""
500        return cls(TokenType.VAR, var)

Returns a VAR token with var as its text.

token_type
text
line
col
start
end
comments
class TokenizerCore:
 533class TokenizerCore:
 534    __slots__ = (
 535        "sql",
 536        "size",
 537        "tokens",
 538        "_start",
 539        "_current",
 540        "_line",
 541        "_col",
 542        "_comments",
 543        "_char",
 544        "_end",
 545        "_peek",
 546        "_prev_token_line",
 547        "single_tokens",
 548        "keywords",
 549        "quotes",
 550        "format_strings",
 551        "identifiers",
 552        "comments",
 553        "string_escapes",
 554        "byte_string_escapes",
 555        "identifier_escapes",
 556        "escape_follow_chars",
 557        "commands",
 558        "command_prefix_tokens",
 559        "nested_comments",
 560        "hint_start",
 561        "tokens_preceding_hint",
 562        "bit_strings",
 563        "hex_strings",
 564        "numeric_literals",
 565        "var_single_tokens",
 566        "string_escapes_allowed_in_raw_strings",
 567        "heredoc_tag_is_identifier",
 568        "heredoc_string_alternative",
 569        "keyword_trie",
 570        "numbers_can_be_underscore_separated",
 571        "numbers_can_have_decimals",
 572        "identifiers_can_start_with_digit",
 573        "unescaped_sequences",
 574    )
 575
 576    def __init__(
 577        self,
 578        single_tokens: dict[str, TokenType],
 579        keywords: dict[str, TokenType],
 580        quotes: dict[str, str],
 581        format_strings: dict[str, tuple[str, TokenType]],
 582        identifiers: dict[str, str],
 583        comments: dict[str, str | None],
 584        string_escapes: set[str],
 585        byte_string_escapes: set[str],
 586        identifier_escapes: set[str],
 587        escape_follow_chars: set[str],
 588        commands: set[TokenType],
 589        command_prefix_tokens: set[TokenType],
 590        nested_comments: bool,
 591        hint_start: str,
 592        tokens_preceding_hint: set[TokenType],
 593        bit_strings: list[str | tuple[str, str]],
 594        hex_strings: list[str | tuple[str, str]],
 595        numeric_literals: dict[str, str],
 596        var_single_tokens: set[str],
 597        string_escapes_allowed_in_raw_strings: bool,
 598        heredoc_tag_is_identifier: bool,
 599        heredoc_string_alternative: TokenType,
 600        keyword_trie: dict,
 601        numbers_can_be_underscore_separated: bool,
 602        numbers_can_have_decimals: bool,
 603        identifiers_can_start_with_digit: bool,
 604        unescaped_sequences: dict[str, str],
 605    ) -> None:
 606        self.single_tokens = single_tokens
 607        self.keywords = keywords
 608        self.quotes = quotes
 609        self.format_strings = format_strings
 610        self.identifiers = identifiers
 611        self.comments = comments
 612        self.string_escapes = string_escapes
 613        self.byte_string_escapes = byte_string_escapes
 614        self.identifier_escapes = identifier_escapes
 615        self.escape_follow_chars = escape_follow_chars
 616        self.commands = commands
 617        self.command_prefix_tokens = command_prefix_tokens
 618        self.nested_comments = nested_comments
 619        self.hint_start = hint_start
 620        self.tokens_preceding_hint = tokens_preceding_hint
 621        self.bit_strings = bit_strings
 622        self.hex_strings = hex_strings
 623        self.numeric_literals = numeric_literals
 624        self.var_single_tokens = var_single_tokens
 625        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 626        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 627        self.heredoc_string_alternative = heredoc_string_alternative
 628        self.keyword_trie = keyword_trie
 629        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 630        self.numbers_can_have_decimals = numbers_can_have_decimals
 631        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 632        self.unescaped_sequences = unescaped_sequences
 633        self.sql = ""
 634        self.size = 0
 635        self.tokens: list[Token] = []
 636        self._start = 0
 637        self._current = 0
 638        self._line = 1
 639        self._col = 0
 640        self._comments: list[str] = []
 641        self._char = ""
 642        self._end = False
 643        self._peek = ""
 644        self._prev_token_line = -1
 645
 646    def reset(self) -> None:
 647        self.sql = ""
 648        self.size = 0
 649        self.tokens = []
 650        self._start = 0
 651        self._current = 0
 652        self._line = 1
 653        self._col = 0
 654        self._comments = []
 655        self._char = ""
 656        self._end = False
 657        self._peek = ""
 658        self._prev_token_line = -1
 659
 660    def tokenize(self, sql: str) -> list[Token]:
 661        """Returns a list of tokens corresponding to the SQL string `sql`."""
 662        self.reset()
 663        self.sql = sql
 664        self.size = len(sql)
 665
 666        try:
 667            self._scan()
 668        except Exception as e:
 669            start = max(self._current - 50, 0)
 670            end = min(self._current + 50, self.size - 1)
 671            context = self.sql[start:end]
 672            raise TokenError(f"Error tokenizing '{context}'") from e
 673
 674        return self.tokens
 675
 676    def _scan(self, check_semicolon: bool = False) -> None:
 677        identifiers = self.identifiers
 678        digit_chars = _DIGIT_CHARS
 679
 680        while self.size and not self._end:
 681            current = self._current
 682
 683            # Skip spaces here rather than iteratively calling advance() for performance reasons
 684            while current < self.size:
 685                char = self.sql[current]
 686
 687                if char == " " or char == "\t":
 688                    current += 1
 689                else:
 690                    break
 691
 692            offset = current - self._current if current > self._current else 1
 693
 694            self._start = current
 695            self._advance(offset)
 696
 697            if not self._char.isspace():
 698                if self._char in digit_chars:
 699                    self._scan_number()
 700                elif self._char in identifiers:
 701                    self._scan_identifier(identifiers[self._char])
 702                else:
 703                    self._scan_keywords()
 704
 705            if check_semicolon and self._peek == ";":
 706                break
 707
 708        if self.tokens and self._comments:
 709            self.tokens[-1].comments.extend(self._comments)
 710
 711    def _chars(self, size: int) -> str:
 712        if size == 1:
 713            return self._char
 714
 715        start = self._current - 1
 716        end = start + size
 717
 718        return self.sql[start:end] if end <= self.size else ""
 719
 720    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 721        char = self._char
 722
 723        if char == "\n" or char == "\r":
 724            # Ensures we don't count an extra line if we get a \r\n line break sequence
 725            if not (char == "\r" and self._peek == "\n"):
 726                self._col = i
 727                self._line += 1
 728        else:
 729            self._col += i
 730
 731        self._current += i
 732        sql = self.sql
 733        size = self.size
 734        self._end = self._current >= size
 735        self._char = sql[self._current - 1]
 736        self._peek = "" if self._end else sql[self._current]
 737
 738        if alnum and self._char.isalnum():
 739            # Cache to local variables instead of attributes for better performance
 740            _col = self._col
 741            _current = self._current
 742            _end = self._end
 743            _peek = self._peek
 744
 745            while _peek.isalnum():
 746                _col += 1
 747                _current += 1
 748                _end = _current >= size
 749                _peek = "" if _end else sql[_current]
 750
 751            self._col = _col
 752            self._current = _current
 753            self._end = _end
 754            self._peek = _peek
 755            self._char = sql[_current - 1]
 756
 757    @property
 758    def _text(self) -> str:
 759        return self.sql[self._start : self._current]
 760
 761    def _add(self, token_type: TokenType, text: str | None = None) -> None:
 762        self._prev_token_line = self._line
 763
 764        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 765            self.tokens[-1].comments.extend(self._comments)
 766            self._comments = []
 767
 768        if text is None:
 769            text = self.sql[self._start : self._current]
 770
 771        self.tokens.append(
 772            Token(
 773                token_type,
 774                text=text,
 775                line=self._line,
 776                col=self._col,
 777                start=self._start,
 778                end=self._current - 1,
 779                comments=self._comments,
 780            )
 781        )
 782        self._comments = []
 783
 784        # If we have either a semicolon or a begin token before the command's token, we'll parse
 785        # whatever follows the command's token as a string
 786        if (
 787            token_type in self.commands
 788            and self._peek != ";"
 789            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 790        ):
 791            start = self._current
 792            tokens = len(self.tokens)
 793            self._scan(check_semicolon=True)
 794            self.tokens = self.tokens[:tokens]
 795            text = self.sql[start : self._current].strip()
 796            if text:
 797                self._add(TokenType.STRING, text)
 798
 799    def _scan_keywords(self) -> None:
 800        sql = self.sql
 801        sql_size = self.size
 802        single_tokens = self.single_tokens
 803        char_upper = _CHAR_UPPER
 804        size = 0
 805        word = None
 806        chars = self._char
 807        char = chars
 808        prev_space = False
 809        skip = False
 810        trie = self.keyword_trie
 811        single_token = char in single_tokens
 812
 813        while chars:
 814            if not skip:
 815                sub = trie.get(char_upper.get(char, char))
 816                if sub is None:
 817                    break
 818                trie = sub
 819                if 0 in trie:
 820                    word = chars
 821
 822            end = self._current + size
 823            size += 1
 824
 825            if end < sql_size:
 826                char = sql[end]
 827                single_token = single_token or char in single_tokens
 828                is_space = char.isspace()
 829
 830                if not is_space or not prev_space:
 831                    if is_space:
 832                        char = " "
 833                    chars += char
 834                    prev_space = is_space
 835                    skip = False
 836                else:
 837                    skip = True
 838            else:
 839                char = ""
 840                break
 841
 842        if word:
 843            if self._scan_string(word):
 844                return
 845            if self._scan_comment(word):
 846                return
 847            if prev_space or single_token or not char:
 848                self._advance(size - 1)
 849                word = word.upper()
 850                self._add(self.keywords[word], text=word)
 851                return
 852
 853        if self._char in single_tokens:
 854            self._add(single_tokens[self._char], text=self._char)
 855            return
 856
 857        self._scan_var()
 858
 859    def _scan_comment(self, comment_start: str) -> bool:
 860        if comment_start not in self.comments:
 861            return False
 862
 863        comment_start_line = self._line
 864        comment_start_size = len(comment_start)
 865        comment_end = self.comments[comment_start]
 866
 867        if comment_end:
 868            # Skip the comment's start delimiter
 869            self._advance(comment_start_size)
 870
 871            comment_count = 1
 872            comment_end_size = len(comment_end)
 873            nested_comments = self.nested_comments
 874
 875            while not self._end:
 876                if self._chars(comment_end_size) == comment_end:
 877                    comment_count -= 1
 878                    if not comment_count:
 879                        break
 880
 881                self._advance(alnum=True)
 882
 883                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
 884                if (
 885                    nested_comments
 886                    and not self._end
 887                    and self._chars(comment_end_size) == comment_start
 888                ):
 889                    self._advance(comment_start_size)
 890                    comment_count += 1
 891
 892            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 893            self._advance(comment_end_size - 1)
 894        else:
 895            _peek = self._peek
 896            while not self._end and _peek != "\n" and _peek != "\r":
 897                self._advance(alnum=True)
 898                _peek = self._peek
 899            self._comments.append(self._text[comment_start_size:])
 900
 901        if (
 902            comment_start == self.hint_start
 903            and self.tokens
 904            and self.tokens[-1].token_type in self.tokens_preceding_hint
 905        ):
 906            self._add(TokenType.HINT)
 907
 908        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 909        # Multiple consecutive comments are preserved by appending them to the current comments list.
 910        if comment_start_line == self._prev_token_line:
 911            self.tokens[-1].comments.extend(self._comments)
 912            self._comments = []
 913            self._prev_token_line = self._line
 914
 915        return True
 916
 917    def _scan_number(self) -> None:
 918        if self._char == "0":
 919            peek = _CHAR_UPPER.get(self._peek, self._peek)
 920            if peek == "B":
 921                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
 922            elif peek == "X":
 923                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)
 924
 925        decimal = False
 926        scientific = 0
 927        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
 928        single_tokens = self.single_tokens
 929        keywords = self.keywords
 930        numeric_literals = self.numeric_literals
 931        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit
 932
 933        is_underscore_separated: bool = False
 934        number_text: str = ""
 935        numeric_literal: str = ""
 936        numeric_type: TokenType | None = None
 937
 938        while True:
 939            if self._peek in _DIGIT_CHARS:
 940                # Batch consecutive digits: scan ahead to find how many
 941                sql = self.sql
 942                end = self._current + 1
 943                size = self.size
 944                while end < size and sql[end] in _DIGIT_CHARS:
 945                    end += 1
 946                self._advance(end - self._current)
 947            elif self._peek == "." and not decimal:
 948                if (
 949                    self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
 950                ) or not self.numbers_can_have_decimals:
 951                    break
 952                decimal = True
 953                self._advance()
 954            elif self._peek in ("-", "+") and scientific == 1:
 955                # Only consume +/- if followed by a digit
 956                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
 957                    scientific += 1
 958                    self._advance()
 959                else:
 960                    break
 961            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
 962                scientific += 1
 963                self._advance()
 964            elif self._peek == "_" and numbers_can_be_underscore_separated:
 965                is_underscore_separated = True
 966                self._advance()
 967            elif self._peek.isidentifier():
 968                number_text = self._text
 969
 970                while self._peek and not self._peek.isspace() and self._peek not in single_tokens:
 971                    numeric_literal += self._peek
 972                    self._advance()
 973
 974                numeric_type = keywords.get(numeric_literals.get(numeric_literal.upper(), ""))
 975
 976                if numeric_type:
 977                    break
 978                elif identifiers_can_start_with_digit:
 979                    return self._add(TokenType.VAR)
 980
 981                self._advance(-len(numeric_literal))
 982                break
 983            else:
 984                break
 985
 986        number_text = number_text or self.sql[self._start : self._current]
 987
 988        # Normalize inputs such as 100_000 to 100000
 989        if is_underscore_separated:
 990            number_text = number_text.replace("_", "")
 991
 992        self._add(TokenType.NUMBER, number_text)
 993
 994        # Normalize inputs such as 123L to 123::BIGINT so that they're parsed as casts
 995        if numeric_type:
 996            self._add(TokenType.DCOLON, "::")
 997            self._add(numeric_type, numeric_literal)
 998
 999    def _scan_bits(self) -> None:
1000        self._advance()
1001        value = self._extract_value()
1002        try:
1003            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
1004            int(value, 2)
1005            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
1006        except ValueError:
1007            self._add(TokenType.IDENTIFIER)
1008
1009    def _scan_hex(self) -> None:
1010        self._advance()
1011        value = self._extract_value()
1012        try:
1013            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
1014            int(value, 16)
1015            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
1016        except ValueError:
1017            self._add(TokenType.IDENTIFIER)
1018
1019    def _extract_value(self) -> str:
1020        single_tokens = self.single_tokens
1021
1022        while True:
1023            char = self._peek.strip()
1024            if char and char not in single_tokens:
1025                self._advance(alnum=True)
1026            else:
1027                break
1028
1029        return self._text
1030
1031    def _scan_string(self, start: str) -> bool:
1032        base = None
1033        token_type = TokenType.STRING
1034
1035        if start in self.quotes:
1036            end = self.quotes[start]
1037        elif start in self.format_strings:
1038            end, token_type = self.format_strings[start]
1039
1040            if token_type == TokenType.HEX_STRING:
1041                base = 16
1042            elif token_type == TokenType.BIT_STRING:
1043                base = 2
1044            elif token_type == TokenType.HEREDOC_STRING:
1045                self._advance()
1046
1047                if self._char == end:
1048                    tag = ""
1049                else:
1050                    tag = self._extract_string(
1051                        end,
1052                        raw_string=True,
1053                        raise_unmatched=not self.heredoc_tag_is_identifier,
1054                    )
1055
1056                if (
1057                    tag
1058                    and self.heredoc_tag_is_identifier
1059                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1060                ):
1061                    if not self._end:
1062                        self._advance(-1)
1063
1064                    self._advance(-len(tag))
1065                    self._add(self.heredoc_string_alternative)
1066                    return True
1067
1068                end = f"{start}{tag}{end}"
1069        else:
1070            return False
1071
1072        self._advance(len(start))
1073        text = self._extract_string(
1074            end,
1075            escapes=(
1076                self.byte_string_escapes
1077                if token_type == TokenType.BYTE_STRING
1078                else self.string_escapes
1079            ),
1080            raw_string=token_type == TokenType.RAW_STRING,
1081        )
1082
1083        if base and text:
1084            try:
1085                int(text, base)
1086            except Exception:
1087                raise TokenError(
1088                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1089                )
1090
1091        self._add(token_type, text)
1092        return True
1093
1094    def _scan_identifier(self, identifier_end: str) -> None:
1095        self._advance()
1096        text = self._extract_string(
1097            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1098        )
1099        self._add(TokenType.IDENTIFIER, text)
1100
1101    def _scan_var(self) -> None:
1102        var_single_tokens = self.var_single_tokens
1103        single_tokens = self.single_tokens
1104
1105        while True:
1106            peek = self._peek
1107            if not peek or peek.isspace():
1108                break
1109            if peek not in var_single_tokens and peek in single_tokens:
1110                break
1111            self._advance(alnum=True)
1112
1113        self._add(
1114            TokenType.VAR
1115            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1116            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1117        )
1118
1119    def _extract_string(
1120        self,
1121        delimiter: str,
1122        escapes: set[str] | None = None,
1123        raw_string: bool = False,
1124        raise_unmatched: bool = True,
1125    ) -> str:
1126        text = ""
1127        delim_size = len(delimiter)
1128        escapes = self.string_escapes if escapes is None else escapes
1129        unescaped_sequences = self.unescaped_sequences
1130        escape_follow_chars = self.escape_follow_chars
1131        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
1132        quotes = self.quotes
1133        sql = self.sql
1134
1135        # use str.find() when the string is simple... no \ or other escapes
1136        if delim_size == 1:
1137            pos = self._current - 1
1138            end = sql.find(delimiter, pos)
1139
1140            if (
1141                # the closing delimiter was found
1142                end != -1
1143                # there's no doubled delimiter (e.g. '' escape), or the delimiter isn't an escape char
1144                and (end + 1 >= self.size or sql[end + 1] != delimiter or delimiter not in escapes)
1145                # no backslash in the string that would need escape processing
1146                and (not (unescaped_sequences or "\\" in escapes) or sql.find("\\", pos, end) == -1)
1147            ):
1148                newlines = sql.count("\n", pos, end)
1149                if newlines:
1150                    self._line += newlines
1151                    self._col = end - sql.rfind("\n", pos, end)
1152                else:
1153                    self._col += end - pos
1154
1155                self._current = end + 1
1156                self._end = self._current >= self.size
1157                self._char = sql[end]
1158                self._peek = "" if self._end else sql[self._current]
1159                return sql[pos:end]
1160
1161        while True:
1162            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
1163                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
1164                if unescaped_sequence:
1165                    self._advance(2)
1166                    text += unescaped_sequence
1167                    continue
1168
1169            is_valid_custom_escape = (
1170                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
1171            )
1172
1173            if (
1174                (string_escapes_allowed_in_raw_strings or not raw_string)
1175                and self._char in escapes
1176                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
1177                and (self._char not in quotes or self._char == self._peek)
1178            ):
1179                if self._peek == delimiter:
1180                    text += self._peek
1181                elif is_valid_custom_escape and self._char != self._peek:
1182                    text += self._peek
1183                else:
1184                    text += self._char + self._peek
1185
1186                if self._current + 1 < self.size:
1187                    self._advance(2)
1188                else:
1189                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
1190            else:
1191                if self._chars(delim_size) == delimiter:
1192                    if delim_size > 1:
1193                        self._advance(delim_size - 1)
1194                    break
1195
1196                if self._end:
1197                    if not raise_unmatched:
1198                        return text + self._char
1199
1200                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
1201
1202                current = self._current - 1
1203                self._advance(alnum=True)
1204                text += sql[current : self._current - 1]
1205
1206        return text
TokenizerCore( single_tokens: dict[str, TokenType], keywords: dict[str, TokenType], quotes: dict[str, str], format_strings: dict[str, tuple[str, TokenType]], identifiers: dict[str, str], comments: dict[str, str | None], string_escapes: set[str], byte_string_escapes: set[str], identifier_escapes: set[str], escape_follow_chars: set[str], commands: set[TokenType], command_prefix_tokens: set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: set[TokenType], bit_strings: list[str | tuple[str, str]], hex_strings: list[str | tuple[str, str]], numeric_literals: dict[str, str], var_single_tokens: set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: dict, numbers_can_be_underscore_separated: bool, numbers_can_have_decimals: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: dict[str, str])
576    def __init__(
577        self,
578        single_tokens: dict[str, TokenType],
579        keywords: dict[str, TokenType],
580        quotes: dict[str, str],
581        format_strings: dict[str, tuple[str, TokenType]],
582        identifiers: dict[str, str],
583        comments: dict[str, str | None],
584        string_escapes: set[str],
585        byte_string_escapes: set[str],
586        identifier_escapes: set[str],
587        escape_follow_chars: set[str],
588        commands: set[TokenType],
589        command_prefix_tokens: set[TokenType],
590        nested_comments: bool,
591        hint_start: str,
592        tokens_preceding_hint: set[TokenType],
593        bit_strings: list[str | tuple[str, str]],
594        hex_strings: list[str | tuple[str, str]],
595        numeric_literals: dict[str, str],
596        var_single_tokens: set[str],
597        string_escapes_allowed_in_raw_strings: bool,
598        heredoc_tag_is_identifier: bool,
599        heredoc_string_alternative: TokenType,
600        keyword_trie: dict,
601        numbers_can_be_underscore_separated: bool,
602        numbers_can_have_decimals: bool,
603        identifiers_can_start_with_digit: bool,
604        unescaped_sequences: dict[str, str],
605    ) -> None:
606        self.single_tokens = single_tokens
607        self.keywords = keywords
608        self.quotes = quotes
609        self.format_strings = format_strings
610        self.identifiers = identifiers
611        self.comments = comments
612        self.string_escapes = string_escapes
613        self.byte_string_escapes = byte_string_escapes
614        self.identifier_escapes = identifier_escapes
615        self.escape_follow_chars = escape_follow_chars
616        self.commands = commands
617        self.command_prefix_tokens = command_prefix_tokens
618        self.nested_comments = nested_comments
619        self.hint_start = hint_start
620        self.tokens_preceding_hint = tokens_preceding_hint
621        self.bit_strings = bit_strings
622        self.hex_strings = hex_strings
623        self.numeric_literals = numeric_literals
624        self.var_single_tokens = var_single_tokens
625        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
626        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
627        self.heredoc_string_alternative = heredoc_string_alternative
628        self.keyword_trie = keyword_trie
629        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
630        self.numbers_can_have_decimals = numbers_can_have_decimals
631        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
632        self.unescaped_sequences = unescaped_sequences
633        self.sql = ""
634        self.size = 0
635        self.tokens: list[Token] = []
636        self._start = 0
637        self._current = 0
638        self._line = 1
639        self._col = 0
640        self._comments: list[str] = []
641        self._char = ""
642        self._end = False
643        self._peek = ""
644        self._prev_token_line = -1
single_tokens
keywords
quotes
format_strings
identifiers
comments
string_escapes
byte_string_escapes
identifier_escapes
escape_follow_chars
commands
command_prefix_tokens
nested_comments
hint_start
tokens_preceding_hint
bit_strings
hex_strings
numeric_literals
var_single_tokens
string_escapes_allowed_in_raw_strings
heredoc_tag_is_identifier
heredoc_string_alternative
keyword_trie
numbers_can_be_underscore_separated
numbers_can_have_decimals
identifiers_can_start_with_digit
unescaped_sequences
sql
size
tokens: list[Token]
def reset(self) -> None:
646    def reset(self) -> None:
647        self.sql = ""
648        self.size = 0
649        self.tokens = []
650        self._start = 0
651        self._current = 0
652        self._line = 1
653        self._col = 0
654        self._comments = []
655        self._char = ""
656        self._end = False
657        self._peek = ""
658        self._prev_token_line = -1
def tokenize(self, sql: str) -> list[Token]:
660    def tokenize(self, sql: str) -> list[Token]:
661        """Returns a list of tokens corresponding to the SQL string `sql`."""
662        self.reset()
663        self.sql = sql
664        self.size = len(sql)
665
666        try:
667            self._scan()
668        except Exception as e:
669            start = max(self._current - 50, 0)
670            end = min(self._current + 50, self.size - 1)
671            context = self.sql[start:end]
672            raise TokenError(f"Error tokenizing '{context}'") from e
673
674        return self.tokens

Returns a list of tokens corresponding to the SQL string sql.