# sqlglot.tokenizer_core

   1from __future__ import annotations
   2
   3import typing as t
   4from enum import IntEnum, auto
   5
   6from sqlglot.errors import TokenError
   7
   8# dict lookup is faster than .upper(), .isspace(), .isdigit()
   9_CHAR_UPPER: t.Dict[str, str] = {chr(i): chr(i).upper() for i in range(97, 123)}
  10
  11_SPACE_CHARS: t.FrozenSet[str] = frozenset(
  12    "\t\n\r \x0b\x0c\x1c\x1d\x1e\x1f\x85\xa0"
  13    "\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
  14    "\u2028\u2029\u202f\u205f\u3000"
  15)
  16_DIGIT_CHARS: t.FrozenSet[str] = frozenset("0123456789")
  17
  18
class TokenType(IntEnum):
    """Enumeration of every token category the tokenizer can emit.

    NOTE: member order matters — values are assigned by `auto()`, so
    members must not be reordered or removed.
    """

    # Punctuation and operators.
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    # Templating (e.g. Jinja-style) block delimiters.
    BLOCK_START = auto()
    BLOCK_END = auto()

    # Whitespace.
    SPACE = auto()
    BREAK = auto()

    # Literals and identifiers.
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinel
    HIVE_TOKEN_STREAM = auto()

    def __str__(self) -> str:
        # e.g. "TokenType.SELECT" — stable textual form used in reprs/messages.
        return f"TokenType.{self.name}"
 461
 462
 463class Token:
 464    # mypyc doesn't expose slots
 465    _attrs: t.ClassVar[t.Tuple[str, ...]] = (
 466        "token_type",
 467        "text",
 468        "line",
 469        "col",
 470        "start",
 471        "end",
 472        "comments",
 473    )
 474    __slots__ = _attrs
 475
 476    @classmethod
 477    def number(cls, number: int) -> Token:
 478        """Returns a NUMBER token with `number` as its text."""
 479        return cls(TokenType.NUMBER, str(number))
 480
 481    @classmethod
 482    def string(cls, string: str) -> Token:
 483        """Returns a STRING token with `string` as its text."""
 484        return cls(TokenType.STRING, string)
 485
 486    @classmethod
 487    def identifier(cls, identifier: str) -> Token:
 488        """Returns an IDENTIFIER token with `identifier` as its text."""
 489        return cls(TokenType.IDENTIFIER, identifier)
 490
 491    @classmethod
 492    def var(cls, var: str) -> Token:
 493        """Returns an VAR token with `var` as its text."""
 494        return cls(TokenType.VAR, var)
 495
 496    def __init__(
 497        self,
 498        token_type: TokenType,
 499        text: str,
 500        line: int = 1,
 501        col: int = 1,
 502        start: int = 0,
 503        end: int = 0,
 504        comments: t.Optional[t.List[str]] = None,
 505    ) -> None:
 506        self.token_type = token_type
 507        self.text = text
 508        self.line = line
 509        self.col = col
 510        self.start = start
 511        self.end = end
 512        self.comments = [] if comments is None else comments
 513
 514    def __repr__(self) -> str:
 515        attributes = ", ".join(
 516            f"{k}: TokenType.{self.token_type.name}"
 517            if k == "token_type"
 518            else f"{k}: {getattr(self, k)}"
 519            for k in self._attrs
 520        )
 521        return f"<Token {attributes}>"
 522
 523
 524class TokenizerCore:
    # Explicit slots: mutable tokenizer state (sql/size/tokens and the
    # underscore-prefixed cursor fields reset() initializes) followed by the
    # dialect configuration stored by __init__.
    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "single_tokens",
        "keywords",
        "quotes",
        "format_strings",
        "identifiers",
        "comments",
        "string_escapes",
        "byte_string_escapes",
        "identifier_escapes",
        "escape_follow_chars",
        "commands",
        "command_prefix_tokens",
        "nested_comments",
        "hint_start",
        "tokens_preceding_hint",
        "bit_strings",
        "hex_strings",
        "numeric_literals",
        "var_single_tokens",
        "string_escapes_allowed_in_raw_strings",
        "heredoc_tag_is_identifier",
        "heredoc_string_alternative",
        "keyword_trie",
        "numbers_can_be_underscore_separated",
        "identifiers_can_start_with_digit",
        "unescaped_sequences",
    )
 565
 566    def __init__(
 567        self,
 568        single_tokens: t.Dict[str, TokenType],
 569        keywords: t.Dict[str, TokenType],
 570        quotes: t.Dict[str, str],
 571        format_strings: t.Dict[str, t.Tuple[str, TokenType]],
 572        identifiers: t.Dict[str, str],
 573        comments: t.Dict[str, t.Optional[str]],
 574        string_escapes: t.Set[str],
 575        byte_string_escapes: t.Set[str],
 576        identifier_escapes: t.Set[str],
 577        escape_follow_chars: t.Set[str],
 578        commands: t.Set[TokenType],
 579        command_prefix_tokens: t.Set[TokenType],
 580        nested_comments: bool,
 581        hint_start: str,
 582        tokens_preceding_hint: t.Set[TokenType],
 583        bit_strings: t.List[t.Union[str, t.Tuple[str, str]]],
 584        hex_strings: t.List[t.Union[str, t.Tuple[str, str]]],
 585        numeric_literals: t.Dict[str, str],
 586        var_single_tokens: t.Set[str],
 587        string_escapes_allowed_in_raw_strings: bool,
 588        heredoc_tag_is_identifier: bool,
 589        heredoc_string_alternative: TokenType,
 590        keyword_trie: t.Dict,
 591        numbers_can_be_underscore_separated: bool,
 592        identifiers_can_start_with_digit: bool,
 593        unescaped_sequences: t.Dict[str, str],
 594    ) -> None:
 595        self.single_tokens = single_tokens
 596        self.keywords = keywords
 597        self.quotes = quotes
 598        self.format_strings = format_strings
 599        self.identifiers = identifiers
 600        self.comments = comments
 601        self.string_escapes = string_escapes
 602        self.byte_string_escapes = byte_string_escapes
 603        self.identifier_escapes = identifier_escapes
 604        self.escape_follow_chars = escape_follow_chars
 605        self.commands = commands
 606        self.command_prefix_tokens = command_prefix_tokens
 607        self.nested_comments = nested_comments
 608        self.hint_start = hint_start
 609        self.tokens_preceding_hint = tokens_preceding_hint
 610        self.bit_strings = bit_strings
 611        self.hex_strings = hex_strings
 612        self.numeric_literals = numeric_literals
 613        self.var_single_tokens = var_single_tokens
 614        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 615        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 616        self.heredoc_string_alternative = heredoc_string_alternative
 617        self.keyword_trie = keyword_trie
 618        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 619        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 620        self.unescaped_sequences = unescaped_sequences
 621        self.reset()
 622
 623    def reset(self) -> None:
 624        self.sql = ""
 625        self.size = 0
 626        self.tokens: t.List[Token] = []
 627        self._start = 0
 628        self._current = 0
 629        self._line = 1
 630        self._col = 0
 631        self._comments: t.List[str] = []
 632        self._char = ""
 633        self._end = False
 634        self._peek = ""
 635        self._prev_token_line = -1
 636
 637    def tokenize(self, sql: str) -> t.List[Token]:
 638        """Returns a list of tokens corresponding to the SQL string `sql`."""
 639        self.reset()
 640        self.sql = sql
 641        self.size = len(sql)
 642
 643        try:
 644            self._scan()
 645        except Exception as e:
 646            start = max(self._current - 50, 0)
 647            end = min(self._current + 50, self.size - 1)
 648            context = self.sql[start:end]
 649            raise TokenError(f"Error tokenizing '{context}'") from e
 650
 651        return self.tokens
 652
    def _scan(self, check_semicolon: bool = False) -> None:
        """Main scan loop: repeatedly skip whitespace and dispatch to the
        number / identifier / keyword scanners until end of input.

        When `check_semicolon` is True, stop as soon as the next character is
        a semicolon (used when re-scanning a command's trailing text in _add).
        """
        identifiers = self.identifiers
        space_chars = _SPACE_CHARS
        digit_chars = _DIGIT_CHARS

        while self.size and not self._end:
            current = self._current

            # Skip spaces here rather than iteratively calling advance() for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char == " " or char == "\t":
                    current += 1
                else:
                    break

            # If no spaces were skipped, still advance by one character.
            offset = current - self._current if current > self._current else 1

            self._start = current
            self._advance(offset)

            if self._char not in space_chars:
                # Dispatch on the first character of the token.
                if self._char in digit_chars:
                    self._scan_number()
                elif self._char in identifiers:
                    self._scan_identifier(identifiers[self._char])
                else:
                    self._scan_keywords()

            if check_semicolon and self._peek == ";":
                break

        # Trailing comments with no following token attach to the last token.
        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)
 688
 689    def _chars(self, size: int) -> str:
 690        if size == 1:
 691            return self._char
 692
 693        start = self._current - 1
 694        end = start + size
 695
 696        return self.sql[start:end] if end <= self.size else ""
 697
    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        """Move the cursor forward `i` characters, updating line/col,
        `_char`, `_peek` and `_end`.

        When `alnum` is True, additionally fast-forward through any run of
        alphanumeric characters (used to skip identifier-like spans cheaply).
        Note: callers also pass negative `i` to back up (e.g. _scan_number).
        """
        char = self._char

        if char == "\n" or char == "\r":
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if not (char == "\r" and self._peek == "\n"):
                self._col = i
                self._line += 1
        else:
            self._col += i

        self._current += i
        sql = self.sql
        size = self.size
        self._end = self._current >= size
        # _char is the character just consumed; _peek is the one after it.
        self._char = sql[self._current - 1]
        self._peek = "" if self._end else sql[self._current]

        if alnum and self._char.isalnum():
            # Cache to local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= size
                _peek = "" if _end else sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = sql[_current - 1]
 734
 735    @property
 736    def _text(self) -> str:
 737        return self.sql[self._start : self._current]
 738
    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        """Append a token of `token_type` to `self.tokens`.

        `text` defaults to the raw SQL between `_start` and `_current`.
        Pending comments are attached to the new token (or, for a semicolon,
        to the preceding token). If the token is a command keyword, the rest
        of the statement is re-scanned and collapsed into a single STRING.
        """
        self._prev_token_line = self._line

        # Comments before a ';' belong to the statement, i.e. the prior token.
        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        if text is None:
            text = self.sql[self._start : self._current]

        self.tokens.append(
            Token(
                token_type,
                text=text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.commands
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
        ):
            start = self._current
            tokens = len(self.tokens)
            # Re-scan up to the next ';', then discard those tokens and keep
            # the raw text as one STRING token instead.
            self._scan(check_semicolon=True)
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)
 776
    def _scan_keywords(self) -> None:
        """Scan the longest keyword match at the cursor via the keyword trie,
        falling back to a single-character token and finally to a VAR."""
        sql = self.sql
        sql_size = self.size
        single_tokens = self.single_tokens
        char_upper = _CHAR_UPPER
        space_chars = _SPACE_CHARS
        size = 0          # how many chars past the cursor we've looked at
        word = None       # longest complete keyword found so far
        chars = self._char
        char = chars
        prev_space = False
        skip = False      # True while collapsing a run of whitespace
        trie = self.keyword_trie
        single_token = char in single_tokens

        while chars:
            if not skip:
                # Case-insensitive trie walk (keys are uppercased).
                sub = trie.get(char_upper.get(char, char))
                if sub is None:
                    break
                trie = sub
                # A key of 0 marks a node where a complete keyword ends.
                if 0 in trie:
                    word = chars

            end = self._current + size
            size += 1

            if end < sql_size:
                char = sql[end]
                single_token = single_token or char in single_tokens
                is_space = char in space_chars

                if not is_space or not prev_space:
                    if is_space:
                        # Normalize all whitespace to a single space so
                        # multi-word keywords match any separator.
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    # Consecutive whitespace: keep scanning forward without
                    # extending `chars` or walking the trie.
                    skip = True
            else:
                char = ""
                break

        if word:
            # String and comment openers take precedence over keywords.
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            # Only accept the keyword if it ends at a token boundary
            # (whitespace, a single-char token, or end of input).
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.keywords[word], text=word)
                return

        if self._char in single_tokens:
            self._add(single_tokens[self._char], text=self._char)
            return

        self._scan_var()
 837
    def _scan_comment(self, comment_start: str) -> bool:
        """Try to consume a comment opened by `comment_start`.

        Returns True if a comment was consumed (its text is buffered in
        `self._comments`), False if `comment_start` does not open a comment.
        """
        if comment_start not in self.comments:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        # A falsy end delimiter means the comment runs to end of line.
        comment_end = self.comments[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_count = 1
            comment_end_size = len(comment_end)
            nested_comments = self.nested_comments

            while not self._end:
                if self._chars(comment_end_size) == comment_end:
                    comment_count -= 1
                    if not comment_count:
                        break

                self._advance(alnum=True)

                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
                # NOTE(review): the window here is comment_end_size chars wide —
                # this assumes start/end delimiters have the same length.
                if (
                    nested_comments
                    and not self._end
                    and self._chars(comment_end_size) == comment_start
                ):
                    self._advance(comment_start_size)
                    comment_count += 1

            # Strip the delimiters from the buffered comment text.
            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            # Single-line comment: consume until the next line break.
            _peek = self._peek
            while not self._end and _peek != "\n" and _peek != "\r":
                self._advance(alnum=True)
                _peek = self._peek
            self._comments.append(self._text[comment_start_size:])

        # An optimizer-hint comment after an eligible token becomes a HINT token.
        if (
            comment_start == self.hint_start
            and self.tokens
            and self.tokens[-1].token_type in self.tokens_preceding_hint
        ):
            self._add(TokenType.HINT)

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True
 895
    def _scan_number(self) -> None:
        """Scan a numeric literal: integers, decimals, scientific notation,
        0b/0x prefixed forms, underscore separators, and dialect-specific
        trailing type suffixes (via `numeric_literals`)."""
        if self._char == "0":
            peek = _CHAR_UPPER.get(self._peek, self._peek)
            if peek == "B":
                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)

        decimal = False    # True once a '.' has been consumed
        scientific = 0     # 1 after 'E', 2 after an 'E' followed by +/-
        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
        single_tokens = self.single_tokens
        keywords = self.keywords
        numeric_literals = self.numeric_literals
        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit

        while True:
            if self._peek in _DIGIT_CHARS:
                self._advance()
            elif self._peek == "." and not decimal:
                # `@param.1` style parameter access: the dot is not a decimal point.
                if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER:
                    return self._add(TokenType.NUMBER)
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                # Only consume +/- if followed by a digit
                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
                    scientific += 1
                    self._advance()
                else:
                    return self._add(TokenType.NUMBER)
            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek == "_" and numbers_can_be_underscore_separated:
                self._advance()
            elif self._peek.isidentifier():
                # Letters directly after a number: either a numeric-literal
                # type suffix (e.g. `1d` -> 1::DOUBLE) or, in some dialects,
                # an identifier that starts with a digit.
                number_text = self._text
                literal = ""

                while (
                    self._peek
                    and self._peek not in _SPACE_CHARS
                    and self._peek not in single_tokens
                ):
                    literal += self._peek
                    self._advance()

                token_type = keywords.get(numeric_literals.get(literal.upper(), ""))

                if token_type:
                    # Emit `<number> :: <type>` for a recognized suffix.
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif identifiers_can_start_with_digit:
                    return self._add(TokenType.VAR)

                # Unknown suffix: back up and emit just the number.
                self._advance(-len(literal))
                return self._add(TokenType.NUMBER, number_text)
            else:
                return self._add(TokenType.NUMBER)
 957
 958    def _scan_bits(self) -> None:
 959        self._advance()
 960        value = self._extract_value()
 961        try:
 962            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
 963            int(value, 2)
 964            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
 965        except ValueError:
 966            self._add(TokenType.IDENTIFIER)
 967
 968    def _scan_hex(self) -> None:
 969        self._advance()
 970        value = self._extract_value()
 971        try:
 972            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
 973            int(value, 16)
 974            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
 975        except ValueError:
 976            self._add(TokenType.IDENTIFIER)
 977
 978    def _extract_value(self) -> str:
 979        single_tokens = self.single_tokens
 980
 981        while True:
 982            char = self._peek.strip()
 983            if char and char not in single_tokens:
 984                self._advance(alnum=True)
 985            else:
 986                break
 987
 988        return self._text
 989
    def _scan_string(self, start: str) -> bool:
        """Try to consume a string literal opened by `start`.

        Handles plain quotes, format strings (hex/bit/heredoc/etc. from
        `format_strings`), and validates hex/bit string contents. Returns
        True if a string token was added, False if `start` doesn't open one.

        Raises:
            TokenError: if a hex/bit string contains invalid characters.
        """
        base = None
        token_type = TokenType.STRING

        if start in self.quotes:
            end = self.quotes[start]
        elif start in self.format_strings:
            end, token_type = self.format_strings[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()

                if self._char == end:
                    # e.g. `$$...$$`: heredoc with an empty tag.
                    tag = ""
                else:
                    tag = self._extract_string(
                        end,
                        raw_string=True,
                        raise_unmatched=not self.heredoc_tag_is_identifier,
                    )

                # If the "tag" can't be a valid identifier (digits-only,
                # contains spaces, or input ended), back up and emit the
                # dialect's alternative token instead of a heredoc.
                if (
                    tag
                    and self.heredoc_tag_is_identifier
                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
                ):
                    if not self._end:
                        self._advance(-1)

                    self._advance(-len(tag))
                    self._add(self.heredoc_string_alternative)
                    return True

                # The closing delimiter must repeat the full `$tag$` form.
                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(
            end,
            escapes=(
                self.byte_string_escapes
                if token_type == TokenType.BYTE_STRING
                else self.string_escapes
            ),
            raw_string=token_type == TokenType.RAW_STRING,
        )

        # Validate hex/bit string contents by attempting the int conversion.
        if base and text:
            try:
                int(text, base)
            except Exception:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True
1052
1053    def _scan_identifier(self, identifier_end: str) -> None:
1054        self._advance()
1055        text = self._extract_string(
1056            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1057        )
1058        self._add(TokenType.IDENTIFIER, text)
1059
1060    def _scan_var(self) -> None:
1061        var_single_tokens = self.var_single_tokens
1062        single_tokens = self.single_tokens
1063
1064        while True:
1065            peek = self._peek
1066            if not peek or peek in _SPACE_CHARS:
1067                break
1068            if peek not in var_single_tokens and peek in single_tokens:
1069                break
1070            self._advance(alnum=True)
1071
1072        self._add(
1073            TokenType.VAR
1074            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1075            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1076        )
1077
    def _extract_string(
        self,
        delimiter: str,
        escapes: t.Optional[t.Set[str]] = None,
        raw_string: bool = False,
        raise_unmatched: bool = True,
    ) -> str:
        """Consume input up to (and including) `delimiter`, returning the string text.

        Args:
            delimiter: the closing delimiter; may be multi-character.
            escapes: characters that can escape the delimiter or another escape;
                defaults to the tokenizer's configured string escapes.
            raw_string: when True, escape handling is skipped unless the dialect
                explicitly allows escapes in raw strings.
            raise_unmatched: when False, hitting end-of-input returns the text
                consumed so far (plus the final character) instead of raising.

        Raises:
            TokenError: if the delimiter is never found and `raise_unmatched`,
                or if an escape pair would run past the end of input.
        """
        text = ""
        delim_size = len(delimiter)
        escapes = self.string_escapes if escapes is None else escapes
        # Hoist instance attribute lookups out of the scanning loop.
        unescaped_sequences = self.unescaped_sequences
        escape_follow_chars = self.escape_follow_chars
        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
        quotes = self.quotes

        while True:
            # Dialect-defined two-character sequences (escape char + next char)
            # are substituted by their replacement before generic escape handling.
            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
                if unescaped_sequence:
                    self._advance(2)
                    text += unescaped_sequence
                    continue

            # A backslash is a "custom" escape when the following character is
            # NOT one of the dialect's designated escape-follow characters.
            is_valid_custom_escape = (
                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
            )

            if (
                (string_escapes_allowed_in_raw_strings or not raw_string)
                and self._char in escapes
                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
                and (self._char not in quotes or self._char == self._peek)
            ):
                # Decide what the escape pair contributes to the output text.
                if self._peek == delimiter:
                    text += self._peek
                elif is_valid_custom_escape and self._char != self._peek:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)  # consume both the escape and the escaped char
                else:
                    # The escape pair would run past end-of-input.
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                # Unescaped position: check whether the closing delimiter starts here.
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    if not raise_unmatched:
                        # Best-effort mode: return everything consumed so far.
                        return text + self._char

                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                # Append the raw slice covered by this advance; alnum=True lets
                # `_advance` move over alphanumeric runs (presumably as a fast
                # path — see `_advance`).
                current = self._current - 1
                self._advance(alnum=True)
                text += self.sql[current : self._current - 1]

        return text
class TokenType(IntEnum):
    """Every kind of token the SQLGlot tokenizer can emit.

    Members are grouped into operators/punctuation, marker tokens, literal
    kinds, SQL data types, keywords, and a sentinel. Values are assigned
    sequentially by ``auto()``, so member order is significant.
    """

    # operators and punctuation
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DOTCOLON = auto()
    DCOLON = auto()
    DCOLONDOLLAR = auto()
    DCOLONPERCENT = auto()
    DCOLONQMARK = auto()
    DQMARK = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    COLON_EQ = auto()
    COLON_GT = auto()
    NCOLON_GT = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE_GT = auto()
    PIPE = auto()
    PIPE_SLASH = auto()
    DPIPE_SLASH = auto()
    CARET = auto()
    CARET_AT = auto()
    TILDE = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    DAT = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION = auto()
    SESSION_PARAMETER = auto()
    SESSION_USER = auto()
    DAMP = auto()
    AMP_LT = auto()
    AMP_GT = auto()
    ADJACENT = auto()
    XOR = auto()
    DSTAR = auto()
    QMARK_AMP = auto()
    QMARK_PIPE = auto()
    HASH_DASH = auto()
    EXCLAMATION = auto()

    URI_START = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    # literal and identifier kinds
    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    WAREHOUSE = auto()
    STAGE = auto()
    STREAMLIT = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()
    NATIONAL_STRING = auto()
    RAW_STRING = auto()
    HEREDOC_STRING = auto()
    UNICODE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    MEDIUMINT = auto()
    UMEDIUMINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    BIGNUM = auto()
    INT128 = auto()
    UINT128 = auto()
    INT256 = auto()
    UINT256 = auto()
    FLOAT = auto()
    DOUBLE = auto()
    UDOUBLE = auto()
    DECIMAL = auto()
    DECIMAL32 = auto()
    DECIMAL64 = auto()
    DECIMAL128 = auto()
    DECIMAL256 = auto()
    DECFLOAT = auto()
    UDECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    BPCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    BLOB = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    TINYBLOB = auto()
    TINYTEXT = auto()
    NAME = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMETZ = auto()
    TIME_NS = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    TIMESTAMPNTZ = auto()
    TIMESTAMP_S = auto()
    TIMESTAMP_MS = auto()
    TIMESTAMP_NS = auto()
    DATETIME = auto()
    DATETIME2 = auto()
    DATETIME64 = auto()
    SMALLDATETIME = auto()
    DATE = auto()
    DATE32 = auto()
    INT4RANGE = auto()
    INT4MULTIRANGE = auto()
    INT8RANGE = auto()
    INT8MULTIRANGE = auto()
    NUMRANGE = auto()
    NUMMULTIRANGE = auto()
    TSRANGE = auto()
    TSMULTIRANGE = auto()
    TSTZRANGE = auto()
    TSTZMULTIRANGE = auto()
    DATERANGE = auto()
    DATEMULTIRANGE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    GEOGRAPHYPOINT = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    POINT = auto()
    RING = auto()
    LINESTRING = auto()
    LOCALTIME = auto()
    LOCALTIMESTAMP = auto()
    SYSTIMESTAMP = auto()
    MULTILINESTRING = auto()
    POLYGON = auto()
    MULTIPOLYGON = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    YEAR = auto()
    USERDEFINED = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()
    IPADDRESS = auto()
    IPPREFIX = auto()
    IPV4 = auto()
    IPV6 = auto()
    ENUM = auto()
    ENUM8 = auto()
    ENUM16 = auto()
    FIXEDSTRING = auto()
    LOWCARDINALITY = auto()
    NESTED = auto()
    AGGREGATEFUNCTION = auto()
    SIMPLEAGGREGATEFUNCTION = auto()
    TDIGEST = auto()
    UNKNOWN = auto()
    VECTOR = auto()
    DYNAMIC = auto()
    VOID = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    ATTACH = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BULK_COLLECT_INTO = auto()
    CACHE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    CONNECT_BY = auto()
    CONSTRAINT = auto()
    COPY = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_SCHEMA = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    CURRENT_ROLE = auto()
    CURRENT_CATALOG = auto()
    DECLARE = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DETACH = auto()
    DICTIONARY = auto()
    DISTINCT = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILE = auto()
    FILE_FORMAT = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOR = auto()
    FORCE = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GET = auto()
    GLOB = auto()
    GLOBAL = auto()
    GRANT = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IGNORE = auto()
    ILIKE = auto()
    IN = auto()
    INDEX = auto()
    INDEXED_BY = auto()
    INNER = auto()
    INSERT = auto()
    INSTALL = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    KEY = auto()
    KILL = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LIST = auto()
    LOAD = auto()
    LOCK = auto()
    MAP = auto()
    MATCH = auto()
    MATCH_CONDITION = auto()
    MATCH_RECOGNIZE = auto()
    MEMBER_OF = auto()
    MERGE = auto()
    MOD = auto()
    MODEL = auto()
    NATURAL = auto()
    NEXT = auto()
    NOTHING = auto()
    NOTNULL = auto()
    NULL = auto()
    OBJECT_IDENTIFIER = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPERATOR = auto()
    ORDER_BY = auto()
    ORDER_SIBLINGS_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUT = auto()
    INOUT = auto()
    OUTER = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    POSITIONAL = auto()
    PRAGMA = auto()
    PREWHERE = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    PUT = auto()
    QUALIFY = auto()
    QUOTE = auto()
    QDCOLON = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REFRESH = auto()
    RENAME = auto()
    REPLACE = auto()
    RETURNING = auto()
    REVOKE = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SEQUENCE = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SETTINGS = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    SOUNDS_LIKE = auto()
    START_WITH = auto()
    STORAGE_INTEGRATION = auto()
    STRAIGHT_JOIN = auto()
    STRUCT = auto()
    SUMMARIZE = auto()
    TABLE_SAMPLE = auto()
    TAG = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRUNCATE = auto()
    TRIGGER = auto()
    UNCACHE = auto()
    UNION = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VARIADIC = auto()
    VIEW = auto()
    SEMANTIC_VIEW = auto()
    VOLATILE = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    UNIQUE = auto()
    UTC_DATE = auto()
    UTC_TIME = auto()
    UTC_TIMESTAMP = auto()
    VERSION_SNAPSHOT = auto()
    TIMESTAMP_SNAPSHOT = auto()
    OPTION = auto()
    SINK = auto()
    SOURCE = auto()
    ANALYZE = auto()
    NAMESPACE = auto()
    EXPORT = auto()

    # sentinel
    HIVE_TOKEN_STREAM = auto()

    def __str__(self) -> str:
        """Render the member as ``TokenType.NAME``."""
        return f"TokenType.{self.name}"

An enumeration of every token type the SQLGlot tokenizer can emit: operators and punctuation, literal kinds, SQL data types, keywords, and internal sentinels.

L_PAREN = <TokenType.L_PAREN: 1>
R_PAREN = <TokenType.R_PAREN: 2>
L_BRACKET = <TokenType.L_BRACKET: 3>
R_BRACKET = <TokenType.R_BRACKET: 4>
L_BRACE = <TokenType.L_BRACE: 5>
R_BRACE = <TokenType.R_BRACE: 6>
COMMA = <TokenType.COMMA: 7>
DOT = <TokenType.DOT: 8>
DASH = <TokenType.DASH: 9>
PLUS = <TokenType.PLUS: 10>
COLON = <TokenType.COLON: 11>
DOTCOLON = <TokenType.DOTCOLON: 12>
DCOLON = <TokenType.DCOLON: 13>
DCOLONDOLLAR = <TokenType.DCOLONDOLLAR: 14>
DCOLONPERCENT = <TokenType.DCOLONPERCENT: 15>
DCOLONQMARK = <TokenType.DCOLONQMARK: 16>
DQMARK = <TokenType.DQMARK: 17>
SEMICOLON = <TokenType.SEMICOLON: 18>
STAR = <TokenType.STAR: 19>
BACKSLASH = <TokenType.BACKSLASH: 20>
SLASH = <TokenType.SLASH: 21>
LT = <TokenType.LT: 22>
LTE = <TokenType.LTE: 23>
GT = <TokenType.GT: 24>
GTE = <TokenType.GTE: 25>
NOT = <TokenType.NOT: 26>
EQ = <TokenType.EQ: 27>
NEQ = <TokenType.NEQ: 28>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 29>
COLON_EQ = <TokenType.COLON_EQ: 30>
COLON_GT = <TokenType.COLON_GT: 31>
NCOLON_GT = <TokenType.NCOLON_GT: 32>
AND = <TokenType.AND: 33>
OR = <TokenType.OR: 34>
AMP = <TokenType.AMP: 35>
DPIPE = <TokenType.DPIPE: 36>
PIPE_GT = <TokenType.PIPE_GT: 37>
PIPE = <TokenType.PIPE: 38>
PIPE_SLASH = <TokenType.PIPE_SLASH: 39>
DPIPE_SLASH = <TokenType.DPIPE_SLASH: 40>
CARET = <TokenType.CARET: 41>
CARET_AT = <TokenType.CARET_AT: 42>
TILDE = <TokenType.TILDE: 43>
ARROW = <TokenType.ARROW: 44>
DARROW = <TokenType.DARROW: 45>
FARROW = <TokenType.FARROW: 46>
HASH = <TokenType.HASH: 47>
HASH_ARROW = <TokenType.HASH_ARROW: 48>
DHASH_ARROW = <TokenType.DHASH_ARROW: 49>
LR_ARROW = <TokenType.LR_ARROW: 50>
DAT = <TokenType.DAT: 51>
LT_AT = <TokenType.LT_AT: 52>
AT_GT = <TokenType.AT_GT: 53>
DOLLAR = <TokenType.DOLLAR: 54>
PARAMETER = <TokenType.PARAMETER: 55>
SESSION = <TokenType.SESSION: 56>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 57>
SESSION_USER = <TokenType.SESSION_USER: 58>
DAMP = <TokenType.DAMP: 59>
AMP_LT = <TokenType.AMP_LT: 60>
AMP_GT = <TokenType.AMP_GT: 61>
ADJACENT = <TokenType.ADJACENT: 62>
XOR = <TokenType.XOR: 63>
DSTAR = <TokenType.DSTAR: 64>
QMARK_AMP = <TokenType.QMARK_AMP: 65>
QMARK_PIPE = <TokenType.QMARK_PIPE: 66>
HASH_DASH = <TokenType.HASH_DASH: 67>
EXCLAMATION = <TokenType.EXCLAMATION: 68>
URI_START = <TokenType.URI_START: 69>
BLOCK_START = <TokenType.BLOCK_START: 70>
BLOCK_END = <TokenType.BLOCK_END: 71>
SPACE = <TokenType.SPACE: 72>
BREAK = <TokenType.BREAK: 73>
STRING = <TokenType.STRING: 74>
NUMBER = <TokenType.NUMBER: 75>
IDENTIFIER = <TokenType.IDENTIFIER: 76>
DATABASE = <TokenType.DATABASE: 77>
COLUMN = <TokenType.COLUMN: 78>
COLUMN_DEF = <TokenType.COLUMN_DEF: 79>
SCHEMA = <TokenType.SCHEMA: 80>
TABLE = <TokenType.TABLE: 81>
WAREHOUSE = <TokenType.WAREHOUSE: 82>
STAGE = <TokenType.STAGE: 83>
STREAMLIT = <TokenType.STREAMLIT: 84>
VAR = <TokenType.VAR: 85>
BIT_STRING = <TokenType.BIT_STRING: 86>
HEX_STRING = <TokenType.HEX_STRING: 87>
BYTE_STRING = <TokenType.BYTE_STRING: 88>
NATIONAL_STRING = <TokenType.NATIONAL_STRING: 89>
RAW_STRING = <TokenType.RAW_STRING: 90>
HEREDOC_STRING = <TokenType.HEREDOC_STRING: 91>
UNICODE_STRING = <TokenType.UNICODE_STRING: 92>
BIT = <TokenType.BIT: 93>
BOOLEAN = <TokenType.BOOLEAN: 94>
TINYINT = <TokenType.TINYINT: 95>
UTINYINT = <TokenType.UTINYINT: 96>
SMALLINT = <TokenType.SMALLINT: 97>
USMALLINT = <TokenType.USMALLINT: 98>
MEDIUMINT = <TokenType.MEDIUMINT: 99>
UMEDIUMINT = <TokenType.UMEDIUMINT: 100>
INT = <TokenType.INT: 101>
UINT = <TokenType.UINT: 102>
BIGINT = <TokenType.BIGINT: 103>
UBIGINT = <TokenType.UBIGINT: 104>
BIGNUM = <TokenType.BIGNUM: 105>
INT128 = <TokenType.INT128: 106>
UINT128 = <TokenType.UINT128: 107>
INT256 = <TokenType.INT256: 108>
UINT256 = <TokenType.UINT256: 109>
FLOAT = <TokenType.FLOAT: 110>
DOUBLE = <TokenType.DOUBLE: 111>
UDOUBLE = <TokenType.UDOUBLE: 112>
DECIMAL = <TokenType.DECIMAL: 113>
DECIMAL32 = <TokenType.DECIMAL32: 114>
DECIMAL64 = <TokenType.DECIMAL64: 115>
DECIMAL128 = <TokenType.DECIMAL128: 116>
DECIMAL256 = <TokenType.DECIMAL256: 117>
DECFLOAT = <TokenType.DECFLOAT: 118>
UDECIMAL = <TokenType.UDECIMAL: 119>
BIGDECIMAL = <TokenType.BIGDECIMAL: 120>
CHAR = <TokenType.CHAR: 121>
NCHAR = <TokenType.NCHAR: 122>
VARCHAR = <TokenType.VARCHAR: 123>
NVARCHAR = <TokenType.NVARCHAR: 124>
BPCHAR = <TokenType.BPCHAR: 125>
TEXT = <TokenType.TEXT: 126>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 127>
LONGTEXT = <TokenType.LONGTEXT: 128>
BLOB = <TokenType.BLOB: 129>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 130>
LONGBLOB = <TokenType.LONGBLOB: 131>
TINYBLOB = <TokenType.TINYBLOB: 132>
TINYTEXT = <TokenType.TINYTEXT: 133>
NAME = <TokenType.NAME: 134>
BINARY = <TokenType.BINARY: 135>
VARBINARY = <TokenType.VARBINARY: 136>
JSON = <TokenType.JSON: 137>
JSONB = <TokenType.JSONB: 138>
TIME = <TokenType.TIME: 139>
TIMETZ = <TokenType.TIMETZ: 140>
TIME_NS = <TokenType.TIME_NS: 141>
TIMESTAMP = <TokenType.TIMESTAMP: 142>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 143>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 144>
TIMESTAMPNTZ = <TokenType.TIMESTAMPNTZ: 145>
TIMESTAMP_S = <TokenType.TIMESTAMP_S: 146>
TIMESTAMP_MS = <TokenType.TIMESTAMP_MS: 147>
TIMESTAMP_NS = <TokenType.TIMESTAMP_NS: 148>
DATETIME = <TokenType.DATETIME: 149>
DATETIME2 = <TokenType.DATETIME2: 150>
DATETIME64 = <TokenType.DATETIME64: 151>
SMALLDATETIME = <TokenType.SMALLDATETIME: 152>
DATE = <TokenType.DATE: 153>
DATE32 = <TokenType.DATE32: 154>
INT4RANGE = <TokenType.INT4RANGE: 155>
INT4MULTIRANGE = <TokenType.INT4MULTIRANGE: 156>
INT8RANGE = <TokenType.INT8RANGE: 157>
INT8MULTIRANGE = <TokenType.INT8MULTIRANGE: 158>
NUMRANGE = <TokenType.NUMRANGE: 159>
NUMMULTIRANGE = <TokenType.NUMMULTIRANGE: 160>
TSRANGE = <TokenType.TSRANGE: 161>
TSMULTIRANGE = <TokenType.TSMULTIRANGE: 162>
TSTZRANGE = <TokenType.TSTZRANGE: 163>
TSTZMULTIRANGE = <TokenType.TSTZMULTIRANGE: 164>
DATERANGE = <TokenType.DATERANGE: 165>
DATEMULTIRANGE = <TokenType.DATEMULTIRANGE: 166>
UUID = <TokenType.UUID: 167>
GEOGRAPHY = <TokenType.GEOGRAPHY: 168>
GEOGRAPHYPOINT = <TokenType.GEOGRAPHYPOINT: 169>
NULLABLE = <TokenType.NULLABLE: 170>
GEOMETRY = <TokenType.GEOMETRY: 171>
POINT = <TokenType.POINT: 172>
RING = <TokenType.RING: 173>
LINESTRING = <TokenType.LINESTRING: 174>
LOCALTIME = <TokenType.LOCALTIME: 175>
LOCALTIMESTAMP = <TokenType.LOCALTIMESTAMP: 176>
SYSTIMESTAMP = <TokenType.SYSTIMESTAMP: 177>
MULTILINESTRING = <TokenType.MULTILINESTRING: 178>
POLYGON = <TokenType.POLYGON: 179>
MULTIPOLYGON = <TokenType.MULTIPOLYGON: 180>
HLLSKETCH = <TokenType.HLLSKETCH: 181>
HSTORE = <TokenType.HSTORE: 182>
SUPER = <TokenType.SUPER: 183>
SERIAL = <TokenType.SERIAL: 184>
SMALLSERIAL = <TokenType.SMALLSERIAL: 185>
BIGSERIAL = <TokenType.BIGSERIAL: 186>
XML = <TokenType.XML: 187>
YEAR = <TokenType.YEAR: 188>
USERDEFINED = <TokenType.USERDEFINED: 189>
MONEY = <TokenType.MONEY: 190>
SMALLMONEY = <TokenType.SMALLMONEY: 191>
ROWVERSION = <TokenType.ROWVERSION: 192>
IMAGE = <TokenType.IMAGE: 193>
VARIANT = <TokenType.VARIANT: 194>
OBJECT = <TokenType.OBJECT: 195>
INET = <TokenType.INET: 196>
IPADDRESS = <TokenType.IPADDRESS: 197>
IPPREFIX = <TokenType.IPPREFIX: 198>
IPV4 = <TokenType.IPV4: 199>
IPV6 = <TokenType.IPV6: 200>
ENUM = <TokenType.ENUM: 201>
ENUM8 = <TokenType.ENUM8: 202>
ENUM16 = <TokenType.ENUM16: 203>
FIXEDSTRING = <TokenType.FIXEDSTRING: 204>
LOWCARDINALITY = <TokenType.LOWCARDINALITY: 205>
NESTED = <TokenType.NESTED: 206>
AGGREGATEFUNCTION = <TokenType.AGGREGATEFUNCTION: 207>
SIMPLEAGGREGATEFUNCTION = <TokenType.SIMPLEAGGREGATEFUNCTION: 208>
TDIGEST = <TokenType.TDIGEST: 209>
UNKNOWN = <TokenType.UNKNOWN: 210>
VECTOR = <TokenType.VECTOR: 211>
DYNAMIC = <TokenType.DYNAMIC: 212>
VOID = <TokenType.VOID: 213>
ALIAS = <TokenType.ALIAS: 214>
ALTER = <TokenType.ALTER: 215>
ALL = <TokenType.ALL: 216>
ANTI = <TokenType.ANTI: 217>
ANY = <TokenType.ANY: 218>
APPLY = <TokenType.APPLY: 219>
ARRAY = <TokenType.ARRAY: 220>
ASC = <TokenType.ASC: 221>
ASOF = <TokenType.ASOF: 222>
ATTACH = <TokenType.ATTACH: 223>
AUTO_INCREMENT = <TokenType.AUTO_INCREMENT: 224>
BEGIN = <TokenType.BEGIN: 225>
BETWEEN = <TokenType.BETWEEN: 226>
BULK_COLLECT_INTO = <TokenType.BULK_COLLECT_INTO: 227>
CACHE = <TokenType.CACHE: 228>
CASE = <TokenType.CASE: 229>
CHARACTER_SET = <TokenType.CHARACTER_SET: 230>
CLUSTER_BY = <TokenType.CLUSTER_BY: 231>
COLLATE = <TokenType.COLLATE: 232>
COMMAND = <TokenType.COMMAND: 233>
COMMENT = <TokenType.COMMENT: 234>
COMMIT = <TokenType.COMMIT: 235>
CONNECT_BY = <TokenType.CONNECT_BY: 236>
CONSTRAINT = <TokenType.CONSTRAINT: 237>
COPY = <TokenType.COPY: 238>
CREATE = <TokenType.CREATE: 239>
CROSS = <TokenType.CROSS: 240>
CUBE = <TokenType.CUBE: 241>
CURRENT_DATE = <TokenType.CURRENT_DATE: 242>
CURRENT_DATETIME = <TokenType.CURRENT_DATETIME: 243>
CURRENT_SCHEMA = <TokenType.CURRENT_SCHEMA: 244>
CURRENT_TIME = <TokenType.CURRENT_TIME: 245>
CURRENT_TIMESTAMP = <TokenType.CURRENT_TIMESTAMP: 246>
CURRENT_USER = <TokenType.CURRENT_USER: 247>
CURRENT_ROLE = <TokenType.CURRENT_ROLE: 248>
CURRENT_CATALOG = <TokenType.CURRENT_CATALOG: 249>
DECLARE = <TokenType.DECLARE: 250>
DEFAULT = <TokenType.DEFAULT: 251>
DELETE = <TokenType.DELETE: 252>
DESC = <TokenType.DESC: 253>
DESCRIBE = <TokenType.DESCRIBE: 254>
DETACH = <TokenType.DETACH: 255>
DICTIONARY = <TokenType.DICTIONARY: 256>
DISTINCT = <TokenType.DISTINCT: 257>
DISTRIBUTE_BY = <TokenType.DISTRIBUTE_BY: 258>
DIV = <TokenType.DIV: 259>
DROP = <TokenType.DROP: 260>
ELSE = <TokenType.ELSE: 261>
END = <TokenType.END: 262>
ESCAPE = <TokenType.ESCAPE: 263>
EXCEPT = <TokenType.EXCEPT: 264>
EXECUTE = <TokenType.EXECUTE: 265>
EXISTS = <TokenType.EXISTS: 266>
FALSE = <TokenType.FALSE: 267>
FETCH = <TokenType.FETCH: 268>
FILE = <TokenType.FILE: 269>
FILE_FORMAT = <TokenType.FILE_FORMAT: 270>
FILTER = <TokenType.FILTER: 271>
FINAL = <TokenType.FINAL: 272>
FIRST = <TokenType.FIRST: 273>
FOR = <TokenType.FOR: 274>
FORCE = <TokenType.FORCE: 275>
FOREIGN_KEY = <TokenType.FOREIGN_KEY: 276>
FORMAT = <TokenType.FORMAT: 277>
FROM = <TokenType.FROM: 278>
FULL = <TokenType.FULL: 279>
FUNCTION = <TokenType.FUNCTION: 280>
GET = <TokenType.GET: 281>
GLOB = <TokenType.GLOB: 282>
GLOBAL = <TokenType.GLOBAL: 283>
GRANT = <TokenType.GRANT: 284>
GROUP_BY = <TokenType.GROUP_BY: 285>
GROUPING_SETS = <TokenType.GROUPING_SETS: 286>
HAVING = <TokenType.HAVING: 287>
HINT = <TokenType.HINT: 288>
IGNORE = <TokenType.IGNORE: 289>
ILIKE = <TokenType.ILIKE: 290>
IN = <TokenType.IN: 291>
INDEX = <TokenType.INDEX: 292>
INDEXED_BY = <TokenType.INDEXED_BY: 293>
INNER = <TokenType.INNER: 294>
INSERT = <TokenType.INSERT: 295>
INSTALL = <TokenType.INSTALL: 296>
INTERSECT = <TokenType.INTERSECT: 297>
INTERVAL = <TokenType.INTERVAL: 298>
INTO = <TokenType.INTO: 299>
INTRODUCER = <TokenType.INTRODUCER: 300>
IRLIKE = <TokenType.IRLIKE: 301>
IS = <TokenType.IS: 302>
ISNULL = <TokenType.ISNULL: 303>
JOIN = <TokenType.JOIN: 304>
JOIN_MARKER = <TokenType.JOIN_MARKER: 305>
KEEP = <TokenType.KEEP: 306>
KEY = <TokenType.KEY: 307>
KILL = <TokenType.KILL: 308>
LANGUAGE = <TokenType.LANGUAGE: 309>
LATERAL = <TokenType.LATERAL: 310>
LEFT = <TokenType.LEFT: 311>
LIKE = <TokenType.LIKE: 312>
LIMIT = <TokenType.LIMIT: 313>
LIST = <TokenType.LIST: 314>
LOAD = <TokenType.LOAD: 315>
LOCK = <TokenType.LOCK: 316>
MAP = <TokenType.MAP: 317>
MATCH = <TokenType.MATCH: 318>
MATCH_CONDITION = <TokenType.MATCH_CONDITION: 319>
MATCH_RECOGNIZE = <TokenType.MATCH_RECOGNIZE: 320>
MEMBER_OF = <TokenType.MEMBER_OF: 321>
MERGE = <TokenType.MERGE: 322>
MOD = <TokenType.MOD: 323>
MODEL = <TokenType.MODEL: 324>
NATURAL = <TokenType.NATURAL: 325>
NEXT = <TokenType.NEXT: 326>
NOTHING = <TokenType.NOTHING: 327>
NOTNULL = <TokenType.NOTNULL: 328>
NULL = <TokenType.NULL: 329>
OBJECT_IDENTIFIER = <TokenType.OBJECT_IDENTIFIER: 330>
OFFSET = <TokenType.OFFSET: 331>
ON = <TokenType.ON: 332>
ONLY = <TokenType.ONLY: 333>
OPERATOR = <TokenType.OPERATOR: 334>
ORDER_BY = <TokenType.ORDER_BY: 335>
ORDER_SIBLINGS_BY = <TokenType.ORDER_SIBLINGS_BY: 336>
ORDERED = <TokenType.ORDERED: 337>
ORDINALITY = <TokenType.ORDINALITY: 338>
OUT = <TokenType.OUT: 339>
INOUT = <TokenType.INOUT: 340>
OUTER = <TokenType.OUTER: 341>
OVER = <TokenType.OVER: 342>
OVERLAPS = <TokenType.OVERLAPS: 343>
OVERWRITE = <TokenType.OVERWRITE: 344>
PARTITION = <TokenType.PARTITION: 345>
PARTITION_BY = <TokenType.PARTITION_BY: 346>
PERCENT = <TokenType.PERCENT: 347>
PIVOT = <TokenType.PIVOT: 348>
PLACEHOLDER = <TokenType.PLACEHOLDER: 349>
POSITIONAL = <TokenType.POSITIONAL: 350>
PRAGMA = <TokenType.PRAGMA: 351>
PREWHERE = <TokenType.PREWHERE: 352>
PRIMARY_KEY = <TokenType.PRIMARY_KEY: 353>
PROCEDURE = <TokenType.PROCEDURE: 354>
PROPERTIES = <TokenType.PROPERTIES: 355>
PSEUDO_TYPE = <TokenType.PSEUDO_TYPE: 356>
PUT = <TokenType.PUT: 357>
QUALIFY = <TokenType.QUALIFY: 358>
QUOTE = <TokenType.QUOTE: 359>
QDCOLON = <TokenType.QDCOLON: 360>
RANGE = <TokenType.RANGE: 361>
RECURSIVE = <TokenType.RECURSIVE: 362>
REFRESH = <TokenType.REFRESH: 363>
RENAME = <TokenType.RENAME: 364>
REPLACE = <TokenType.REPLACE: 365>
RETURNING = <TokenType.RETURNING: 366>
REVOKE = <TokenType.REVOKE: 367>
REFERENCES = <TokenType.REFERENCES: 368>
RIGHT = <TokenType.RIGHT: 369>
RLIKE = <TokenType.RLIKE: 370>
ROLLBACK = <TokenType.ROLLBACK: 371>
ROLLUP = <TokenType.ROLLUP: 372>
ROW = <TokenType.ROW: 373>
ROWS = <TokenType.ROWS: 374>
SELECT = <TokenType.SELECT: 375>
SEMI = <TokenType.SEMI: 376>
SEPARATOR = <TokenType.SEPARATOR: 377>
SEQUENCE = <TokenType.SEQUENCE: 378>
SERDE_PROPERTIES = <TokenType.SERDE_PROPERTIES: 379>
SET = <TokenType.SET: 380>
SETTINGS = <TokenType.SETTINGS: 381>
SHOW = <TokenType.SHOW: 382>
SIMILAR_TO = <TokenType.SIMILAR_TO: 383>
SOME = <TokenType.SOME: 384>
SORT_BY = <TokenType.SORT_BY: 385>
SOUNDS_LIKE = <TokenType.SOUNDS_LIKE: 386>
START_WITH = <TokenType.START_WITH: 387>
STORAGE_INTEGRATION = <TokenType.STORAGE_INTEGRATION: 388>
STRAIGHT_JOIN = <TokenType.STRAIGHT_JOIN: 389>
STRUCT = <TokenType.STRUCT: 390>
SUMMARIZE = <TokenType.SUMMARIZE: 391>
TABLE_SAMPLE = <TokenType.TABLE_SAMPLE: 392>
TAG = <TokenType.TAG: 393>
TEMPORARY = <TokenType.TEMPORARY: 394>
TOP = <TokenType.TOP: 395>
THEN = <TokenType.THEN: 396>
TRUE = <TokenType.TRUE: 397>
TRUNCATE = <TokenType.TRUNCATE: 398>
TRIGGER = <TokenType.TRIGGER: 399>
UNCACHE = <TokenType.UNCACHE: 400>
UNION = <TokenType.UNION: 401>
UNNEST = <TokenType.UNNEST: 402>
UNPIVOT = <TokenType.UNPIVOT: 403>
UPDATE = <TokenType.UPDATE: 404>
USE = <TokenType.USE: 405>
USING = <TokenType.USING: 406>
VALUES = <TokenType.VALUES: 407>
VARIADIC = <TokenType.VARIADIC: 408>
VIEW = <TokenType.VIEW: 409>
SEMANTIC_VIEW = <TokenType.SEMANTIC_VIEW: 410>
VOLATILE = <TokenType.VOLATILE: 411>
WHEN = <TokenType.WHEN: 412>
WHERE = <TokenType.WHERE: 413>
WINDOW = <TokenType.WINDOW: 414>
WITH = <TokenType.WITH: 415>
UNIQUE = <TokenType.UNIQUE: 416>
UTC_DATE = <TokenType.UTC_DATE: 417>
UTC_TIME = <TokenType.UTC_TIME: 418>
UTC_TIMESTAMP = <TokenType.UTC_TIMESTAMP: 419>
VERSION_SNAPSHOT = <TokenType.VERSION_SNAPSHOT: 420>
TIMESTAMP_SNAPSHOT = <TokenType.TIMESTAMP_SNAPSHOT: 421>
OPTION = <TokenType.OPTION: 422>
SINK = <TokenType.SINK: 423>
SOURCE = <TokenType.SOURCE: 424>
ANALYZE = <TokenType.ANALYZE: 425>
NAMESPACE = <TokenType.NAMESPACE: 426>
EXPORT = <TokenType.EXPORT: 427>
HIVE_TOKEN_STREAM = <TokenType.HIVE_TOKEN_STREAM: 428>
class Token:
464class Token:
465    # mypyc doesn't expose slots
466    _attrs: t.ClassVar[t.Tuple[str, ...]] = (
467        "token_type",
468        "text",
469        "line",
470        "col",
471        "start",
472        "end",
473        "comments",
474    )
475    __slots__ = _attrs
476
477    @classmethod
478    def number(cls, number: int) -> Token:
479        """Returns a NUMBER token with `number` as its text."""
480        return cls(TokenType.NUMBER, str(number))
481
482    @classmethod
483    def string(cls, string: str) -> Token:
484        """Returns a STRING token with `string` as its text."""
485        return cls(TokenType.STRING, string)
486
487    @classmethod
488    def identifier(cls, identifier: str) -> Token:
489        """Returns an IDENTIFIER token with `identifier` as its text."""
490        return cls(TokenType.IDENTIFIER, identifier)
491
492    @classmethod
493    def var(cls, var: str) -> Token:
494        """Returns a VAR token with `var` as its text."""
495        return cls(TokenType.VAR, var)
496
497    def __init__(
498        self,
499        token_type: TokenType,
500        text: str,
501        line: int = 1,
502        col: int = 1,
503        start: int = 0,
504        end: int = 0,
505        comments: t.Optional[t.List[str]] = None,
506    ) -> None:
507        self.token_type = token_type
508        self.text = text
509        self.line = line
510        self.col = col
511        self.start = start
512        self.end = end
513        self.comments = [] if comments is None else comments
514
515    def __repr__(self) -> str:
516        attributes = ", ".join(
517            f"{k}: TokenType.{self.token_type.name}"
518            if k == "token_type"
519            else f"{k}: {getattr(self, k)}"
520            for k in self._attrs
521        )
522        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: Optional[List[str]] = None)
497    def __init__(
498        self,
499        token_type: TokenType,
500        text: str,
501        line: int = 1,
502        col: int = 1,
503        start: int = 0,
504        end: int = 0,
505        comments: t.Optional[t.List[str]] = None,
506    ) -> None:
507        self.token_type = token_type
508        self.text = text
509        self.line = line
510        self.col = col
511        self.start = start
512        self.end = end
513        self.comments = [] if comments is None else comments
@classmethod
def number(cls, number: int) -> Token:
477    @classmethod
478    def number(cls, number: int) -> Token:
479        """Returns a NUMBER token with `number` as its text."""
480        return cls(TokenType.NUMBER, str(number))

Returns a NUMBER token with number as its text.

@classmethod
def string(cls, string: str) -> Token:
482    @classmethod
483    def string(cls, string: str) -> Token:
484        """Returns a STRING token with `string` as its text."""
485        return cls(TokenType.STRING, string)

Returns a STRING token with string as its text.

@classmethod
def identifier(cls, identifier: str) -> Token:
487    @classmethod
488    def identifier(cls, identifier: str) -> Token:
489        """Returns an IDENTIFIER token with `identifier` as its text."""
490        return cls(TokenType.IDENTIFIER, identifier)

Returns an IDENTIFIER token with identifier as its text.

@classmethod
def var(cls, var: str) -> Token:
492    @classmethod
493    def var(cls, var: str) -> Token:
494        """Returns a VAR token with `var` as its text."""
495        return cls(TokenType.VAR, var)

Returns a VAR token with var as its text.

token_type
text
line
col
start
end
comments
class TokenizerCore:
 525class TokenizerCore:
 526    __slots__ = (
 527        "sql",
 528        "size",
 529        "tokens",
 530        "_start",
 531        "_current",
 532        "_line",
 533        "_col",
 534        "_comments",
 535        "_char",
 536        "_end",
 537        "_peek",
 538        "_prev_token_line",
 539        "single_tokens",
 540        "keywords",
 541        "quotes",
 542        "format_strings",
 543        "identifiers",
 544        "comments",
 545        "string_escapes",
 546        "byte_string_escapes",
 547        "identifier_escapes",
 548        "escape_follow_chars",
 549        "commands",
 550        "command_prefix_tokens",
 551        "nested_comments",
 552        "hint_start",
 553        "tokens_preceding_hint",
 554        "bit_strings",
 555        "hex_strings",
 556        "numeric_literals",
 557        "var_single_tokens",
 558        "string_escapes_allowed_in_raw_strings",
 559        "heredoc_tag_is_identifier",
 560        "heredoc_string_alternative",
 561        "keyword_trie",
 562        "numbers_can_be_underscore_separated",
 563        "identifiers_can_start_with_digit",
 564        "unescaped_sequences",
 565    )
 566
 567    def __init__(
 568        self,
 569        single_tokens: t.Dict[str, TokenType],
 570        keywords: t.Dict[str, TokenType],
 571        quotes: t.Dict[str, str],
 572        format_strings: t.Dict[str, t.Tuple[str, TokenType]],
 573        identifiers: t.Dict[str, str],
 574        comments: t.Dict[str, t.Optional[str]],
 575        string_escapes: t.Set[str],
 576        byte_string_escapes: t.Set[str],
 577        identifier_escapes: t.Set[str],
 578        escape_follow_chars: t.Set[str],
 579        commands: t.Set[TokenType],
 580        command_prefix_tokens: t.Set[TokenType],
 581        nested_comments: bool,
 582        hint_start: str,
 583        tokens_preceding_hint: t.Set[TokenType],
 584        bit_strings: t.List[t.Union[str, t.Tuple[str, str]]],
 585        hex_strings: t.List[t.Union[str, t.Tuple[str, str]]],
 586        numeric_literals: t.Dict[str, str],
 587        var_single_tokens: t.Set[str],
 588        string_escapes_allowed_in_raw_strings: bool,
 589        heredoc_tag_is_identifier: bool,
 590        heredoc_string_alternative: TokenType,
 591        keyword_trie: t.Dict,
 592        numbers_can_be_underscore_separated: bool,
 593        identifiers_can_start_with_digit: bool,
 594        unescaped_sequences: t.Dict[str, str],
 595    ) -> None:
 596        self.single_tokens = single_tokens
 597        self.keywords = keywords
 598        self.quotes = quotes
 599        self.format_strings = format_strings
 600        self.identifiers = identifiers
 601        self.comments = comments
 602        self.string_escapes = string_escapes
 603        self.byte_string_escapes = byte_string_escapes
 604        self.identifier_escapes = identifier_escapes
 605        self.escape_follow_chars = escape_follow_chars
 606        self.commands = commands
 607        self.command_prefix_tokens = command_prefix_tokens
 608        self.nested_comments = nested_comments
 609        self.hint_start = hint_start
 610        self.tokens_preceding_hint = tokens_preceding_hint
 611        self.bit_strings = bit_strings
 612        self.hex_strings = hex_strings
 613        self.numeric_literals = numeric_literals
 614        self.var_single_tokens = var_single_tokens
 615        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
 616        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
 617        self.heredoc_string_alternative = heredoc_string_alternative
 618        self.keyword_trie = keyword_trie
 619        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
 620        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
 621        self.unescaped_sequences = unescaped_sequences
 622        self.reset()
 623
 624    def reset(self) -> None:
 625        self.sql = ""
 626        self.size = 0
 627        self.tokens: t.List[Token] = []
 628        self._start = 0
 629        self._current = 0
 630        self._line = 1
 631        self._col = 0
 632        self._comments: t.List[str] = []
 633        self._char = ""
 634        self._end = False
 635        self._peek = ""
 636        self._prev_token_line = -1
 637
 638    def tokenize(self, sql: str) -> t.List[Token]:
 639        """Returns a list of tokens corresponding to the SQL string `sql`."""
 640        self.reset()
 641        self.sql = sql
 642        self.size = len(sql)
 643
 644        try:
 645            self._scan()
 646        except Exception as e:
 647            start = max(self._current - 50, 0)
 648            end = min(self._current + 50, self.size - 1)
 649            context = self.sql[start:end]
 650            raise TokenError(f"Error tokenizing '{context}'") from e
 651
 652        return self.tokens
 653
 654    def _scan(self, check_semicolon: bool = False) -> None:
 655        identifiers = self.identifiers
 656        space_chars = _SPACE_CHARS
 657        digit_chars = _DIGIT_CHARS
 658
 659        while self.size and not self._end:
 660            current = self._current
 661
 662            # Skip spaces here rather than iteratively calling advance() for performance reasons
 663            while current < self.size:
 664                char = self.sql[current]
 665
 666                if char == " " or char == "\t":
 667                    current += 1
 668                else:
 669                    break
 670
 671            offset = current - self._current if current > self._current else 1
 672
 673            self._start = current
 674            self._advance(offset)
 675
 676            if self._char not in space_chars:
 677                if self._char in digit_chars:
 678                    self._scan_number()
 679                elif self._char in identifiers:
 680                    self._scan_identifier(identifiers[self._char])
 681                else:
 682                    self._scan_keywords()
 683
 684            if check_semicolon and self._peek == ";":
 685                break
 686
 687        if self.tokens and self._comments:
 688            self.tokens[-1].comments.extend(self._comments)
 689
 690    def _chars(self, size: int) -> str:
 691        if size == 1:
 692            return self._char
 693
 694        start = self._current - 1
 695        end = start + size
 696
 697        return self.sql[start:end] if end <= self.size else ""
 698
 699    def _advance(self, i: int = 1, alnum: bool = False) -> None:
 700        char = self._char
 701
 702        if char == "\n" or char == "\r":
 703            # Ensures we don't count an extra line if we get a \r\n line break sequence
 704            if not (char == "\r" and self._peek == "\n"):
 705                self._col = i
 706                self._line += 1
 707        else:
 708            self._col += i
 709
 710        self._current += i
 711        sql = self.sql
 712        size = self.size
 713        self._end = self._current >= size
 714        self._char = sql[self._current - 1]
 715        self._peek = "" if self._end else sql[self._current]
 716
 717        if alnum and self._char.isalnum():
 718            # Cache to local variables instead of attributes for better performance
 719            _col = self._col
 720            _current = self._current
 721            _end = self._end
 722            _peek = self._peek
 723
 724            while _peek.isalnum():
 725                _col += 1
 726                _current += 1
 727                _end = _current >= size
 728                _peek = "" if _end else sql[_current]
 729
 730            self._col = _col
 731            self._current = _current
 732            self._end = _end
 733            self._peek = _peek
 734            self._char = sql[_current - 1]
 735
 736    @property
 737    def _text(self) -> str:
 738        return self.sql[self._start : self._current]
 739
 740    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
 741        self._prev_token_line = self._line
 742
 743        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
 744            self.tokens[-1].comments.extend(self._comments)
 745            self._comments = []
 746
 747        if text is None:
 748            text = self.sql[self._start : self._current]
 749
 750        self.tokens.append(
 751            Token(
 752                token_type,
 753                text=text,
 754                line=self._line,
 755                col=self._col,
 756                start=self._start,
 757                end=self._current - 1,
 758                comments=self._comments,
 759            )
 760        )
 761        self._comments = []
 762
 763        # If we have either a semicolon or a begin token before the command's token, we'll parse
 764        # whatever follows the command's token as a string
 765        if (
 766            token_type in self.commands
 767            and self._peek != ";"
 768            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.command_prefix_tokens)
 769        ):
 770            start = self._current
 771            tokens = len(self.tokens)
 772            self._scan(check_semicolon=True)
 773            self.tokens = self.tokens[:tokens]
 774            text = self.sql[start : self._current].strip()
 775            if text:
 776                self._add(TokenType.STRING, text)
 777
 778    def _scan_keywords(self) -> None:
 779        sql = self.sql
 780        sql_size = self.size
 781        single_tokens = self.single_tokens
 782        char_upper = _CHAR_UPPER
 783        space_chars = _SPACE_CHARS
 784        size = 0
 785        word = None
 786        chars = self._char
 787        char = chars
 788        prev_space = False
 789        skip = False
 790        trie = self.keyword_trie
 791        single_token = char in single_tokens
 792
 793        while chars:
 794            if not skip:
 795                sub = trie.get(char_upper.get(char, char))
 796                if sub is None:
 797                    break
 798                trie = sub
 799                if 0 in trie:
 800                    word = chars
 801
 802            end = self._current + size
 803            size += 1
 804
 805            if end < sql_size:
 806                char = sql[end]
 807                single_token = single_token or char in single_tokens
 808                is_space = char in space_chars
 809
 810                if not is_space or not prev_space:
 811                    if is_space:
 812                        char = " "
 813                    chars += char
 814                    prev_space = is_space
 815                    skip = False
 816                else:
 817                    skip = True
 818            else:
 819                char = ""
 820                break
 821
 822        if word:
 823            if self._scan_string(word):
 824                return
 825            if self._scan_comment(word):
 826                return
 827            if prev_space or single_token or not char:
 828                self._advance(size - 1)
 829                word = word.upper()
 830                self._add(self.keywords[word], text=word)
 831                return
 832
 833        if self._char in single_tokens:
 834            self._add(single_tokens[self._char], text=self._char)
 835            return
 836
 837        self._scan_var()
 838
 839    def _scan_comment(self, comment_start: str) -> bool:
 840        if comment_start not in self.comments:
 841            return False
 842
 843        comment_start_line = self._line
 844        comment_start_size = len(comment_start)
 845        comment_end = self.comments[comment_start]
 846
 847        if comment_end:
 848            # Skip the comment's start delimiter
 849            self._advance(comment_start_size)
 850
 851            comment_count = 1
 852            comment_end_size = len(comment_end)
 853            nested_comments = self.nested_comments
 854
 855            while not self._end:
 856                if self._chars(comment_end_size) == comment_end:
 857                    comment_count -= 1
 858                    if not comment_count:
 859                        break
 860
 861                self._advance(alnum=True)
 862
 863                # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres
 864                if (
 865                    nested_comments
 866                    and not self._end
 867                    and self._chars(comment_end_size) == comment_start
 868                ):
 869                    self._advance(comment_start_size)
 870                    comment_count += 1
 871
 872            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
 873            self._advance(comment_end_size - 1)
 874        else:
 875            _peek = self._peek
 876            while not self._end and _peek != "\n" and _peek != "\r":
 877                self._advance(alnum=True)
 878                _peek = self._peek
 879            self._comments.append(self._text[comment_start_size:])
 880
 881        if (
 882            comment_start == self.hint_start
 883            and self.tokens
 884            and self.tokens[-1].token_type in self.tokens_preceding_hint
 885        ):
 886            self._add(TokenType.HINT)
 887
 888        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
 889        # Multiple consecutive comments are preserved by appending them to the current comments list.
 890        if comment_start_line == self._prev_token_line:
 891            self.tokens[-1].comments.extend(self._comments)
 892            self._comments = []
 893            self._prev_token_line = self._line
 894
 895        return True
 896
 897    def _scan_number(self) -> None:
 898        if self._char == "0":
 899            peek = _CHAR_UPPER.get(self._peek, self._peek)
 900            if peek == "B":
 901                return self._scan_bits() if self.bit_strings else self._add(TokenType.NUMBER)
 902            elif peek == "X":
 903                return self._scan_hex() if self.hex_strings else self._add(TokenType.NUMBER)
 904
 905        decimal = False
 906        scientific = 0
 907        numbers_can_be_underscore_separated = self.numbers_can_be_underscore_separated
 908        single_tokens = self.single_tokens
 909        keywords = self.keywords
 910        numeric_literals = self.numeric_literals
 911        identifiers_can_start_with_digit = self.identifiers_can_start_with_digit
 912
 913        while True:
 914            if self._peek in _DIGIT_CHARS:
 915                self._advance()
 916            elif self._peek == "." and not decimal:
 917                if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER:
 918                    return self._add(TokenType.NUMBER)
 919                decimal = True
 920                self._advance()
 921            elif self._peek in ("-", "+") and scientific == 1:
 922                # Only consume +/- if followed by a digit
 923                if self._current + 1 < self.size and self.sql[self._current + 1] in _DIGIT_CHARS:
 924                    scientific += 1
 925                    self._advance()
 926                else:
 927                    return self._add(TokenType.NUMBER)
 928            elif _CHAR_UPPER.get(self._peek, self._peek) == "E" and not scientific:
 929                scientific += 1
 930                self._advance()
 931            elif self._peek == "_" and numbers_can_be_underscore_separated:
 932                self._advance()
 933            elif self._peek.isidentifier():
 934                number_text = self._text
 935                literal = ""
 936
 937                while (
 938                    self._peek
 939                    and self._peek not in _SPACE_CHARS
 940                    and self._peek not in single_tokens
 941                ):
 942                    literal += self._peek
 943                    self._advance()
 944
 945                token_type = keywords.get(numeric_literals.get(literal.upper(), ""))
 946
 947                if token_type:
 948                    self._add(TokenType.NUMBER, number_text)
 949                    self._add(TokenType.DCOLON, "::")
 950                    return self._add(token_type, literal)
 951                elif identifiers_can_start_with_digit:
 952                    return self._add(TokenType.VAR)
 953
 954                self._advance(-len(literal))
 955                return self._add(TokenType.NUMBER, number_text)
 956            else:
 957                return self._add(TokenType.NUMBER)
 958
 959    def _scan_bits(self) -> None:
 960        self._advance()
 961        value = self._extract_value()
 962        try:
 963            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
 964            int(value, 2)
 965            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
 966        except ValueError:
 967            self._add(TokenType.IDENTIFIER)
 968
 969    def _scan_hex(self) -> None:
 970        self._advance()
 971        value = self._extract_value()
 972        try:
 973            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
 974            int(value, 16)
 975            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
 976        except ValueError:
 977            self._add(TokenType.IDENTIFIER)
 978
 979    def _extract_value(self) -> str:
 980        single_tokens = self.single_tokens
 981
 982        while True:
 983            char = self._peek.strip()
 984            if char and char not in single_tokens:
 985                self._advance(alnum=True)
 986            else:
 987                break
 988
 989        return self._text
 990
 991    def _scan_string(self, start: str) -> bool:
 992        base = None
 993        token_type = TokenType.STRING
 994
 995        if start in self.quotes:
 996            end = self.quotes[start]
 997        elif start in self.format_strings:
 998            end, token_type = self.format_strings[start]
 999
1000            if token_type == TokenType.HEX_STRING:
1001                base = 16
1002            elif token_type == TokenType.BIT_STRING:
1003                base = 2
1004            elif token_type == TokenType.HEREDOC_STRING:
1005                self._advance()
1006
1007                if self._char == end:
1008                    tag = ""
1009                else:
1010                    tag = self._extract_string(
1011                        end,
1012                        raw_string=True,
1013                        raise_unmatched=not self.heredoc_tag_is_identifier,
1014                    )
1015
1016                if (
1017                    tag
1018                    and self.heredoc_tag_is_identifier
1019                    and (self._end or tag.isdigit() or any(c.isspace() for c in tag))
1020                ):
1021                    if not self._end:
1022                        self._advance(-1)
1023
1024                    self._advance(-len(tag))
1025                    self._add(self.heredoc_string_alternative)
1026                    return True
1027
1028                end = f"{start}{tag}{end}"
1029        else:
1030            return False
1031
1032        self._advance(len(start))
1033        text = self._extract_string(
1034            end,
1035            escapes=(
1036                self.byte_string_escapes
1037                if token_type == TokenType.BYTE_STRING
1038                else self.string_escapes
1039            ),
1040            raw_string=token_type == TokenType.RAW_STRING,
1041        )
1042
1043        if base and text:
1044            try:
1045                int(text, base)
1046            except Exception:
1047                raise TokenError(
1048                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
1049                )
1050
1051        self._add(token_type, text)
1052        return True
1053
1054    def _scan_identifier(self, identifier_end: str) -> None:
1055        self._advance()
1056        text = self._extract_string(
1057            identifier_end, escapes=self.identifier_escapes | {identifier_end}
1058        )
1059        self._add(TokenType.IDENTIFIER, text)
1060
1061    def _scan_var(self) -> None:
1062        var_single_tokens = self.var_single_tokens
1063        single_tokens = self.single_tokens
1064
1065        while True:
1066            peek = self._peek
1067            if not peek or peek in _SPACE_CHARS:
1068                break
1069            if peek not in var_single_tokens and peek in single_tokens:
1070                break
1071            self._advance(alnum=True)
1072
1073        self._add(
1074            TokenType.VAR
1075            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
1076            else self.keywords.get(self.sql[self._start : self._current].upper(), TokenType.VAR)
1077        )
1078
1079    def _extract_string(
1080        self,
1081        delimiter: str,
1082        escapes: t.Optional[t.Set[str]] = None,
1083        raw_string: bool = False,
1084        raise_unmatched: bool = True,
1085    ) -> str:
1086        text = ""
1087        delim_size = len(delimiter)
1088        escapes = self.string_escapes if escapes is None else escapes
1089        unescaped_sequences = self.unescaped_sequences
1090        escape_follow_chars = self.escape_follow_chars
1091        string_escapes_allowed_in_raw_strings = self.string_escapes_allowed_in_raw_strings
1092        quotes = self.quotes
1093
1094        while True:
1095            if not raw_string and unescaped_sequences and self._peek and self._char in escapes:
1096                unescaped_sequence = unescaped_sequences.get(self._char + self._peek)
1097                if unescaped_sequence:
1098                    self._advance(2)
1099                    text += unescaped_sequence
1100                    continue
1101
1102            is_valid_custom_escape = (
1103                escape_follow_chars and self._char == "\\" and self._peek not in escape_follow_chars
1104            )
1105
1106            if (
1107                (string_escapes_allowed_in_raw_strings or not raw_string)
1108                and self._char in escapes
1109                and (self._peek == delimiter or self._peek in escapes or is_valid_custom_escape)
1110                and (self._char not in quotes or self._char == self._peek)
1111            ):
1112                if self._peek == delimiter:
1113                    text += self._peek
1114                elif is_valid_custom_escape and self._char != self._peek:
1115                    text += self._peek
1116                else:
1117                    text += self._char + self._peek
1118
1119                if self._current + 1 < self.size:
1120                    self._advance(2)
1121                else:
1122                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
1123            else:
1124                if self._chars(delim_size) == delimiter:
1125                    if delim_size > 1:
1126                        self._advance(delim_size - 1)
1127                    break
1128
1129                if self._end:
1130                    if not raise_unmatched:
1131                        return text + self._char
1132
1133                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
1134
1135                current = self._current - 1
1136                self._advance(alnum=True)
1137                text += self.sql[current : self._current - 1]
1138
1139        return text
TokenizerCore( single_tokens: Dict[str, TokenType], keywords: Dict[str, TokenType], quotes: Dict[str, str], format_strings: Dict[str, Tuple[str, TokenType]], identifiers: Dict[str, str], comments: Dict[str, Optional[str]], string_escapes: Set[str], byte_string_escapes: Set[str], identifier_escapes: Set[str], escape_follow_chars: Set[str], commands: Set[TokenType], command_prefix_tokens: Set[TokenType], nested_comments: bool, hint_start: str, tokens_preceding_hint: Set[TokenType], bit_strings: List[Union[str, Tuple[str, str]]], hex_strings: List[Union[str, Tuple[str, str]]], numeric_literals: Dict[str, str], var_single_tokens: Set[str], string_escapes_allowed_in_raw_strings: bool, heredoc_tag_is_identifier: bool, heredoc_string_alternative: TokenType, keyword_trie: Dict, numbers_can_be_underscore_separated: bool, identifiers_can_start_with_digit: bool, unescaped_sequences: Dict[str, str])
567    def __init__(
568        self,
569        single_tokens: t.Dict[str, TokenType],
570        keywords: t.Dict[str, TokenType],
571        quotes: t.Dict[str, str],
572        format_strings: t.Dict[str, t.Tuple[str, TokenType]],
573        identifiers: t.Dict[str, str],
574        comments: t.Dict[str, t.Optional[str]],
575        string_escapes: t.Set[str],
576        byte_string_escapes: t.Set[str],
577        identifier_escapes: t.Set[str],
578        escape_follow_chars: t.Set[str],
579        commands: t.Set[TokenType],
580        command_prefix_tokens: t.Set[TokenType],
581        nested_comments: bool,
582        hint_start: str,
583        tokens_preceding_hint: t.Set[TokenType],
584        bit_strings: t.List[t.Union[str, t.Tuple[str, str]]],
585        hex_strings: t.List[t.Union[str, t.Tuple[str, str]]],
586        numeric_literals: t.Dict[str, str],
587        var_single_tokens: t.Set[str],
588        string_escapes_allowed_in_raw_strings: bool,
589        heredoc_tag_is_identifier: bool,
590        heredoc_string_alternative: TokenType,
591        keyword_trie: t.Dict,
592        numbers_can_be_underscore_separated: bool,
593        identifiers_can_start_with_digit: bool,
594        unescaped_sequences: t.Dict[str, str],
595    ) -> None:
596        self.single_tokens = single_tokens
597        self.keywords = keywords
598        self.quotes = quotes
599        self.format_strings = format_strings
600        self.identifiers = identifiers
601        self.comments = comments
602        self.string_escapes = string_escapes
603        self.byte_string_escapes = byte_string_escapes
604        self.identifier_escapes = identifier_escapes
605        self.escape_follow_chars = escape_follow_chars
606        self.commands = commands
607        self.command_prefix_tokens = command_prefix_tokens
608        self.nested_comments = nested_comments
609        self.hint_start = hint_start
610        self.tokens_preceding_hint = tokens_preceding_hint
611        self.bit_strings = bit_strings
612        self.hex_strings = hex_strings
613        self.numeric_literals = numeric_literals
614        self.var_single_tokens = var_single_tokens
615        self.string_escapes_allowed_in_raw_strings = string_escapes_allowed_in_raw_strings
616        self.heredoc_tag_is_identifier = heredoc_tag_is_identifier
617        self.heredoc_string_alternative = heredoc_string_alternative
618        self.keyword_trie = keyword_trie
619        self.numbers_can_be_underscore_separated = numbers_can_be_underscore_separated
620        self.identifiers_can_start_with_digit = identifiers_can_start_with_digit
621        self.unescaped_sequences = unescaped_sequences
622        self.reset()
single_tokens
keywords
quotes
format_strings
identifiers
comments
string_escapes
byte_string_escapes
identifier_escapes
escape_follow_chars
commands
command_prefix_tokens
nested_comments
hint_start
tokens_preceding_hint
bit_strings
hex_strings
numeric_literals
var_single_tokens
string_escapes_allowed_in_raw_strings
heredoc_tag_is_identifier
heredoc_string_alternative
keyword_trie
numbers_can_be_underscore_separated
identifiers_can_start_with_digit
unescaped_sequences
def reset(self) -> None:
624    def reset(self) -> None:
625        self.sql = ""
626        self.size = 0
627        self.tokens: t.List[Token] = []
628        self._start = 0
629        self._current = 0
630        self._line = 1
631        self._col = 0
632        self._comments: t.List[str] = []
633        self._char = ""
634        self._end = False
635        self._peek = ""
636        self._prev_token_line = -1
def tokenize(self, sql: str) -> List[Token]:
638    def tokenize(self, sql: str) -> t.List[Token]:
639        """Returns a list of tokens corresponding to the SQL string `sql`."""
640        self.reset()
641        self.sql = sql
642        self.size = len(sql)
643
644        try:
645            self._scan()
646        except Exception as e:
647            start = max(self._current - 50, 0)
648            end = min(self._current + 50, self.size - 1)
649            context = self.sql[start:end]
650            raise TokenError(f"Error tokenizing '{context}'") from e
651
652        return self.tokens

Returns a list of tokens corresponding to the SQL string sql.

size
sql
tokens