sqlglot.tokens

  1from __future__ import annotations
  2
  3import typing as t
  4
  5from sqlglot.trie import new_trie
  6
  7# Import Token and TokenType from tokenizer_core (compiled with mypyc)
  8from sqlglot.tokenizer_core import Token, TokenType
  9
# Best-effort load of the optional compiled accelerator; absence is fine.
try:
    import sqlglotc  # noqa: F401
except ImportError:
    pass

# The legacy Rust accelerator (sqlglotrs) is no longer supported: if it is
# installed but the current accelerator (sqlglotc) is not, emit a deprecation
# warning pointing users at the replacement package.
try:
    import sqlglotrs  # type: ignore # noqa: F401
    import warnings

    if "sqlglotc" not in globals():
        warnings.warn(
            "sqlglot[rs] is deprecated and no longer compatible with sqlglot. "
            "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]",
        )
except ImportError:
    pass
 26
 27if t.TYPE_CHECKING:
 28    from sqlglot.dialects.dialect import DialectType
 29
 30
 31def _convert_quotes(arr: list[str | tuple[str, str]]) -> dict[str, str]:
 32    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)
 33
 34
 35def _quotes_to_format(
 36    token_type: TokenType, arr: list[str | tuple[str, str]]
 37) -> dict[str, tuple[str, TokenType]]:
 38    return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
 39
 40
class _TokenizerBase:
    """Base class holding tokenizer configuration.

    Subclasses declare the public, human-readable settings (QUOTES, KEYWORDS,
    COMMENTS, ...); ``__init_subclass__`` derives the underscore-prefixed
    lookup structures (dicts/sets/trie) that the tokenizer core consumes.
    """

    # --- Declarative settings supplied by subclasses -----------------------
    QUOTES: t.ClassVar[list[tuple[str, str] | str]]
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]]
    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    STRING_ESCAPES: t.ClassVar[list[str]]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]]
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]]
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]]
    HINT_START: t.ClassVar[str]
    KEYWORDS: t.ClassVar[dict[str, TokenType]]
    SINGLE_TOKENS: t.ClassVar[dict[str, TokenType]]
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]]
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]]
    COMMANDS: t.ClassVar[set[TokenType]]
    COMMAND_PREFIX_TOKENS: t.ClassVar[set[TokenType]]
    HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool]
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool]
    NESTED_COMMENTS: t.ClassVar[bool]
    TOKENS_PRECEDING_HINT: t.ClassVar[set[TokenType]]
    HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType]
    COMMENTS: t.ClassVar[list[str | tuple[str, str]]]

    # --- Derived lookup structures (autofilled in __init_subclass__) -------
    _QUOTES: t.ClassVar[dict[str, str]]
    _IDENTIFIERS: t.ClassVar[dict[str, str]]
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]]
    _STRING_ESCAPES: t.ClassVar[set[str]]
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]]
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]]
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]]
    _COMMENTS: t.ClassVar[dict[str, str | None]]
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]]

    # NOTE(review): @classmethod is redundant here — __init_subclass__ is
    # implicitly a classmethod per the Python data model.
    @classmethod
    def __init_subclass__(cls, **kwargs: t.Any) -> None:
        """Derive the `_`-prefixed lookup structures for each new subclass."""
        super().__init_subclass__(**kwargs)
        cls._QUOTES = _convert_quotes(cls.QUOTES)
        cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS)
        # Maps every typed-string prefix (n'...', b'...', x'...', etc.) to
        # its closing delimiter and the token type it should produce.
        cls._FORMAT_STRINGS = {
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in cls._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS),
            **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS),
            **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS),
        }
        # Unless the subclass overrides BYTE_STRING_ESCAPES itself, byte
        # strings use the same escapes as regular strings.
        if "BYTE_STRING_ESCAPES" not in cls.__dict__:
            cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy()
        cls._STRING_ESCAPES = set(cls.STRING_ESCAPES)
        cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES)
        cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS)
        cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES)
        # Maps a comment opener to its closer, or None for line comments.
        cls._COMMENTS = {
            **{c: None for c in cls.COMMENTS if isinstance(c, str)},
            **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)},
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }
        # Hints (/*+ ... */) are tokenized like block comments when enabled.
        if cls.HINT_START in cls.KEYWORDS:
            cls._COMMENTS[cls.HINT_START] = "*/"
        # The trie only needs keys the single-char scanner can't resolve:
        # those containing a space or any single-token character.
        cls._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *cls.KEYWORDS,
                *cls._COMMENTS,
                *cls._QUOTES,
                *cls._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in cls.SINGLE_TOKENS)
        )
118
119
class Tokenizer(_TokenizerBase):
    """Default SQL tokenizer.

    Declares the baseline token tables (single-character tokens, keywords,
    comment markers, quote/escape settings) that dialects override, and wraps
    the compiled ``TokenizerCore`` which performs the actual scanning.
    """

    # Characters that always form a token on their own.
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDE,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        "#": TokenType.HASH,
        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
        "'": TokenType.UNKNOWN,
        "`": TokenType.UNKNOWN,
        '"': TokenType.UNKNOWN,
    }

    # Dialect-overridable quote/string settings (empty in the base dialect).
    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []

    # The strings in this list can always be used as escapes, regardless of the surrounding
    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []

    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
    HEREDOC_TAG_IS_IDENTIFIER = False

    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR

    # Whether string escape characters function as such when placed within raw strings
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True

    NESTED_COMMENTS = True

    HINT_START = "/*+"

    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}

    # Autofilled
    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
    _QUOTES: t.ClassVar[dict[str, str]] = {}
    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()

    # Multi-character operators, SQL keywords, and type names. The leading
    # dict-comprehension entries register Jinja block markers ({% %}, {{+ }})
    # so templated SQL tokenizes cleanly.
    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        HINT_START: TokenType.HINT,
        "&<": TokenType.AMP_LT,
        "&>": TokenType.AMP_GT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "?::": TokenType.QDCOLON,
        "||": TokenType.DPIPE,
        "|>": TokenType.PIPE_GT,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "~~~": TokenType.GLOB,
        "~~": TokenType.LIKE,
        "~~*": TokenType.ILIKE,
        "~*": TokenType.IRLIKE,
        "-|-": TokenType.ADJACENT,
        "ALL": TokenType.ALL,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "COPY": TokenType.COPY,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ENUM": TokenType.ENUM,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FILE": TokenType.FILE,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCALTIME": TokenType.LOCALTIME,
        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NAMESPACE": TokenType.NAMESPACE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUT": TokenType.OUT,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "OPERATOR": TokenType.OPERATOR,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "RENAME": TokenType.RENAME,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SESSION": TokenType.SESSION,
        "SESSION_USER": TokenType.SESSION_USER,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "SQL SECURITY": TokenType.SQL_SECURITY,
        "START WITH": TokenType.START_WITH,
        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRIGGER": TokenType.TRIGGER,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "HUGEINT": TokenType.INT128,
        "UHUGEINT": TokenType.UINT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "INT128": TokenType.INT128,
        "INT256": TokenType.INT256,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "UINT": TokenType.UINT,
        "UINT128": TokenType.UINT128,
        "UINT256": TokenType.UINT256,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "DECIMAL32": TokenType.DECIMAL32,
        "DECIMAL64": TokenType.DECIMAL64,
        "DECIMAL128": TokenType.DECIMAL128,
        "DECIMAL256": TokenType.DECIMAL256,
        "DECFLOAT": TokenType.DECFLOAT,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "BIGNUM": TokenType.BIGNUM,
        "LIST": TokenType.LIST,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "JSONB": TokenType.JSONB,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "CHAR VARYING": TokenType.VARCHAR,
        "CHARACTER VARYING": TokenType.VARCHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "BPCHAR": TokenType.BPCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIME_NS": TokenType.TIME_NS,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "VECTOR": TokenType.VECTOR,
        "STRUCT": TokenType.STRUCT,
        "SEQUENCE": TokenType.SEQUENCE,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.ANALYZE,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.GRANT,
        "REVOKE": TokenType.REVOKE,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    # Tokens after which the rest of the statement is consumed verbatim.
    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}

    # In tokenizers like JSONPath, dots are always key separators, never decimal points
    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True

    COMMENTS = ["--", ("/*", "*/")]

    __slots__ = (
        "dialect",
        "_core",
    )

    def __init__(self, dialect: DialectType = None) -> None:
        """Resolve `dialect` and build the compiled tokenizer core from the
        class-level settings plus the dialect's number/identifier flags."""
        from sqlglot.dialects.dialect import Dialect
        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore

        self.dialect = Dialect.get_or_raise(dialect)

        self._core = _TokenizerCore(
            single_tokens=self.SINGLE_TOKENS,
            keywords=self.KEYWORDS,
            quotes=self._QUOTES,
            format_strings=self._FORMAT_STRINGS,
            identifiers=self._IDENTIFIERS,
            comments=self._COMMENTS,
            string_escapes=self._STRING_ESCAPES,
            byte_string_escapes=self._BYTE_STRING_ESCAPES,
            identifier_escapes=self._IDENTIFIER_ESCAPES,
            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
            commands=self.COMMANDS,
            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
            nested_comments=self.NESTED_COMMENTS,
            hint_start=self.HINT_START,
            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
            # Parameterized generic aliases are callable; these copy the class
            # lists into fresh lists for the core.
            bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS),
            hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS),
            numeric_literals=self.NUMERIC_LITERALS,
            var_single_tokens=self.VAR_SINGLE_TOKENS,
            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
            keyword_trie=self._KEYWORD_TRIE,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
        )

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        return self._core.tokenize(sql)  # type: ignore

    @property
    def sql(self) -> str:
        """The SQL string being tokenized."""
        return self._core.sql

    @property
    def size(self) -> int:
        """Length of the SQL string."""
        return self._core.size

    @property
    def tokens(self) -> list[Token]:
        """The list of tokens produced by tokenization."""
        return self._core.tokens
class Tokenizer(_TokenizerBase):
121class Tokenizer(_TokenizerBase):
122    SINGLE_TOKENS = {
123        "(": TokenType.L_PAREN,
124        ")": TokenType.R_PAREN,
125        "[": TokenType.L_BRACKET,
126        "]": TokenType.R_BRACKET,
127        "{": TokenType.L_BRACE,
128        "}": TokenType.R_BRACE,
129        "&": TokenType.AMP,
130        "^": TokenType.CARET,
131        ":": TokenType.COLON,
132        ",": TokenType.COMMA,
133        ".": TokenType.DOT,
134        "-": TokenType.DASH,
135        "=": TokenType.EQ,
136        ">": TokenType.GT,
137        "<": TokenType.LT,
138        "%": TokenType.MOD,
139        "!": TokenType.NOT,
140        "|": TokenType.PIPE,
141        "+": TokenType.PLUS,
142        ";": TokenType.SEMICOLON,
143        "/": TokenType.SLASH,
144        "\\": TokenType.BACKSLASH,
145        "*": TokenType.STAR,
146        "~": TokenType.TILDE,
147        "?": TokenType.PLACEHOLDER,
148        "@": TokenType.PARAMETER,
149        "#": TokenType.HASH,
150        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
151        "'": TokenType.UNKNOWN,
152        "`": TokenType.UNKNOWN,
153        '"': TokenType.UNKNOWN,
154    }
155
156    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
157    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
158    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
159    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
160    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
161    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
162    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
163    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
164    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
165    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
166    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
167    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []
168
169    # The strings in this list can always be used as escapes, regardless of the surrounding
170    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
171    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
172    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []
173
174    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
175    HEREDOC_TAG_IS_IDENTIFIER = False
176
177    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
178    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR
179
180    # Whether string escape characters function as such when placed within raw strings
181    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
182
183    NESTED_COMMENTS = True
184
185    HINT_START = "/*+"
186
187    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
188
189    # Autofilled
190    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
191    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
192    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
193    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
194    _QUOTES: t.ClassVar[dict[str, str]] = {}
195    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
196    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
197    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
198    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()
199
200    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
201        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
202        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
203        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
204        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
205        HINT_START: TokenType.HINT,
206        "&<": TokenType.AMP_LT,
207        "&>": TokenType.AMP_GT,
208        "==": TokenType.EQ,
209        "::": TokenType.DCOLON,
210        "?::": TokenType.QDCOLON,
211        "||": TokenType.DPIPE,
212        "|>": TokenType.PIPE_GT,
213        ">=": TokenType.GTE,
214        "<=": TokenType.LTE,
215        "<>": TokenType.NEQ,
216        "!=": TokenType.NEQ,
217        ":=": TokenType.COLON_EQ,
218        "<=>": TokenType.NULLSAFE_EQ,
219        "->": TokenType.ARROW,
220        "->>": TokenType.DARROW,
221        "=>": TokenType.FARROW,
222        "#>": TokenType.HASH_ARROW,
223        "#>>": TokenType.DHASH_ARROW,
224        "<->": TokenType.LR_ARROW,
225        "&&": TokenType.DAMP,
226        "??": TokenType.DQMARK,
227        "~~~": TokenType.GLOB,
228        "~~": TokenType.LIKE,
229        "~~*": TokenType.ILIKE,
230        "~*": TokenType.IRLIKE,
231        "-|-": TokenType.ADJACENT,
232        "ALL": TokenType.ALL,
233        "AND": TokenType.AND,
234        "ANTI": TokenType.ANTI,
235        "ANY": TokenType.ANY,
236        "ASC": TokenType.ASC,
237        "AS": TokenType.ALIAS,
238        "ASOF": TokenType.ASOF,
239        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
240        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
241        "BEGIN": TokenType.BEGIN,
242        "BETWEEN": TokenType.BETWEEN,
243        "CACHE": TokenType.CACHE,
244        "UNCACHE": TokenType.UNCACHE,
245        "CASE": TokenType.CASE,
246        "CHARACTER SET": TokenType.CHARACTER_SET,
247        "CLUSTER BY": TokenType.CLUSTER_BY,
248        "COLLATE": TokenType.COLLATE,
249        "COLUMN": TokenType.COLUMN,
250        "COMMIT": TokenType.COMMIT,
251        "CONNECT BY": TokenType.CONNECT_BY,
252        "CONSTRAINT": TokenType.CONSTRAINT,
253        "COPY": TokenType.COPY,
254        "CREATE": TokenType.CREATE,
255        "CROSS": TokenType.CROSS,
256        "CUBE": TokenType.CUBE,
257        "CURRENT_DATE": TokenType.CURRENT_DATE,
258        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
259        "CURRENT_TIME": TokenType.CURRENT_TIME,
260        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
261        "CURRENT_USER": TokenType.CURRENT_USER,
262        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
263        "DATABASE": TokenType.DATABASE,
264        "DEFAULT": TokenType.DEFAULT,
265        "DELETE": TokenType.DELETE,
266        "DESC": TokenType.DESC,
267        "DESCRIBE": TokenType.DESCRIBE,
268        "DISTINCT": TokenType.DISTINCT,
269        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
270        "DIV": TokenType.DIV,
271        "DROP": TokenType.DROP,
272        "ELSE": TokenType.ELSE,
273        "END": TokenType.END,
274        "ENUM": TokenType.ENUM,
275        "ESCAPE": TokenType.ESCAPE,
276        "EXCEPT": TokenType.EXCEPT,
277        "EXECUTE": TokenType.EXECUTE,
278        "EXISTS": TokenType.EXISTS,
279        "FALSE": TokenType.FALSE,
280        "FETCH": TokenType.FETCH,
281        "FILTER": TokenType.FILTER,
282        "FILE": TokenType.FILE,
283        "FIRST": TokenType.FIRST,
284        "FULL": TokenType.FULL,
285        "FUNCTION": TokenType.FUNCTION,
286        "FOR": TokenType.FOR,
287        "FOREIGN KEY": TokenType.FOREIGN_KEY,
288        "FORMAT": TokenType.FORMAT,
289        "FROM": TokenType.FROM,
290        "GEOGRAPHY": TokenType.GEOGRAPHY,
291        "GEOMETRY": TokenType.GEOMETRY,
292        "GLOB": TokenType.GLOB,
293        "GROUP BY": TokenType.GROUP_BY,
294        "GROUPING SETS": TokenType.GROUPING_SETS,
295        "HAVING": TokenType.HAVING,
296        "ILIKE": TokenType.ILIKE,
297        "IN": TokenType.IN,
298        "INDEX": TokenType.INDEX,
299        "INET": TokenType.INET,
300        "INNER": TokenType.INNER,
301        "INSERT": TokenType.INSERT,
302        "INTERVAL": TokenType.INTERVAL,
303        "INTERSECT": TokenType.INTERSECT,
304        "INTO": TokenType.INTO,
305        "IS": TokenType.IS,
306        "ISNULL": TokenType.ISNULL,
307        "JOIN": TokenType.JOIN,
308        "KEEP": TokenType.KEEP,
309        "KILL": TokenType.KILL,
310        "LATERAL": TokenType.LATERAL,
311        "LEFT": TokenType.LEFT,
312        "LIKE": TokenType.LIKE,
313        "LIMIT": TokenType.LIMIT,
314        "LOAD": TokenType.LOAD,
315        "LOCALTIME": TokenType.LOCALTIME,
316        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
317        "LOCK": TokenType.LOCK,
318        "MERGE": TokenType.MERGE,
319        "NAMESPACE": TokenType.NAMESPACE,
320        "NATURAL": TokenType.NATURAL,
321        "NEXT": TokenType.NEXT,
322        "NOT": TokenType.NOT,
323        "NOTNULL": TokenType.NOTNULL,
324        "NULL": TokenType.NULL,
325        "OBJECT": TokenType.OBJECT,
326        "OFFSET": TokenType.OFFSET,
327        "ON": TokenType.ON,
328        "OR": TokenType.OR,
329        "XOR": TokenType.XOR,
330        "ORDER BY": TokenType.ORDER_BY,
331        "ORDINALITY": TokenType.ORDINALITY,
332        "OUT": TokenType.OUT,
333        "OUTER": TokenType.OUTER,
334        "OVER": TokenType.OVER,
335        "OVERLAPS": TokenType.OVERLAPS,
336        "OVERWRITE": TokenType.OVERWRITE,
337        "PARTITION": TokenType.PARTITION,
338        "PARTITION BY": TokenType.PARTITION_BY,
339        "PARTITIONED BY": TokenType.PARTITION_BY,
340        "PARTITIONED_BY": TokenType.PARTITION_BY,
341        "PERCENT": TokenType.PERCENT,
342        "PIVOT": TokenType.PIVOT,
343        "PRAGMA": TokenType.PRAGMA,
344        "PRIMARY KEY": TokenType.PRIMARY_KEY,
345        "PROCEDURE": TokenType.PROCEDURE,
346        "OPERATOR": TokenType.OPERATOR,
347        "QUALIFY": TokenType.QUALIFY,
348        "RANGE": TokenType.RANGE,
349        "RECURSIVE": TokenType.RECURSIVE,
350        "REGEXP": TokenType.RLIKE,
351        "RENAME": TokenType.RENAME,
352        "REPLACE": TokenType.REPLACE,
353        "RETURNING": TokenType.RETURNING,
354        "REFERENCES": TokenType.REFERENCES,
355        "RIGHT": TokenType.RIGHT,
356        "RLIKE": TokenType.RLIKE,
357        "ROLLBACK": TokenType.ROLLBACK,
358        "ROLLUP": TokenType.ROLLUP,
359        "ROW": TokenType.ROW,
360        "ROWS": TokenType.ROWS,
361        "SCHEMA": TokenType.SCHEMA,
362        "SELECT": TokenType.SELECT,
363        "SEMI": TokenType.SEMI,
364        "SESSION": TokenType.SESSION,
365        "SESSION_USER": TokenType.SESSION_USER,
366        "SET": TokenType.SET,
367        "SETTINGS": TokenType.SETTINGS,
368        "SHOW": TokenType.SHOW,
369        "SIMILAR TO": TokenType.SIMILAR_TO,
370        "SOME": TokenType.SOME,
371        "SORT BY": TokenType.SORT_BY,
372        "SQL SECURITY": TokenType.SQL_SECURITY,
373        "START WITH": TokenType.START_WITH,
374        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
375        "TABLE": TokenType.TABLE,
376        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
377        "TEMP": TokenType.TEMPORARY,
378        "TEMPORARY": TokenType.TEMPORARY,
379        "THEN": TokenType.THEN,
380        "TRUE": TokenType.TRUE,
381        "TRUNCATE": TokenType.TRUNCATE,
382        "TRIGGER": TokenType.TRIGGER,
383        "UNION": TokenType.UNION,
384        "UNKNOWN": TokenType.UNKNOWN,
385        "UNNEST": TokenType.UNNEST,
386        "UNPIVOT": TokenType.UNPIVOT,
387        "UPDATE": TokenType.UPDATE,
388        "USE": TokenType.USE,
389        "USING": TokenType.USING,
390        "UUID": TokenType.UUID,
391        "VALUES": TokenType.VALUES,
392        "VIEW": TokenType.VIEW,
393        "VOLATILE": TokenType.VOLATILE,
394        "WHEN": TokenType.WHEN,
395        "WHERE": TokenType.WHERE,
396        "WINDOW": TokenType.WINDOW,
397        "WITH": TokenType.WITH,
398        "APPLY": TokenType.APPLY,
399        "ARRAY": TokenType.ARRAY,
400        "BIT": TokenType.BIT,
401        "BOOL": TokenType.BOOLEAN,
402        "BOOLEAN": TokenType.BOOLEAN,
403        "BYTE": TokenType.TINYINT,
404        "MEDIUMINT": TokenType.MEDIUMINT,
405        "INT1": TokenType.TINYINT,
406        "TINYINT": TokenType.TINYINT,
407        "INT16": TokenType.SMALLINT,
408        "SHORT": TokenType.SMALLINT,
409        "SMALLINT": TokenType.SMALLINT,
410        "HUGEINT": TokenType.INT128,
411        "UHUGEINT": TokenType.UINT128,
412        "INT2": TokenType.SMALLINT,
413        "INTEGER": TokenType.INT,
414        "INT": TokenType.INT,
415        "INT4": TokenType.INT,
416        "INT32": TokenType.INT,
417        "INT64": TokenType.BIGINT,
418        "INT128": TokenType.INT128,
419        "INT256": TokenType.INT256,
420        "LONG": TokenType.BIGINT,
421        "BIGINT": TokenType.BIGINT,
422        "INT8": TokenType.TINYINT,
423        "UINT": TokenType.UINT,
424        "UINT128": TokenType.UINT128,
425        "UINT256": TokenType.UINT256,
426        "DEC": TokenType.DECIMAL,
427        "DECIMAL": TokenType.DECIMAL,
428        "DECIMAL32": TokenType.DECIMAL32,
429        "DECIMAL64": TokenType.DECIMAL64,
430        "DECIMAL128": TokenType.DECIMAL128,
431        "DECIMAL256": TokenType.DECIMAL256,
432        "DECFLOAT": TokenType.DECFLOAT,
433        "BIGDECIMAL": TokenType.BIGDECIMAL,
434        "BIGNUMERIC": TokenType.BIGDECIMAL,
435        "BIGNUM": TokenType.BIGNUM,
436        "LIST": TokenType.LIST,
437        "MAP": TokenType.MAP,
438        "NULLABLE": TokenType.NULLABLE,
439        "NUMBER": TokenType.DECIMAL,
440        "NUMERIC": TokenType.DECIMAL,
441        "FIXED": TokenType.DECIMAL,
442        "REAL": TokenType.FLOAT,
443        "FLOAT": TokenType.FLOAT,
444        "FLOAT4": TokenType.FLOAT,
445        "FLOAT8": TokenType.DOUBLE,
446        "DOUBLE": TokenType.DOUBLE,
447        "DOUBLE PRECISION": TokenType.DOUBLE,
448        "JSON": TokenType.JSON,
449        "JSONB": TokenType.JSONB,
450        "CHAR": TokenType.CHAR,
451        "CHARACTER": TokenType.CHAR,
452        "CHAR VARYING": TokenType.VARCHAR,
453        "CHARACTER VARYING": TokenType.VARCHAR,
454        "NCHAR": TokenType.NCHAR,
455        "VARCHAR": TokenType.VARCHAR,
456        "VARCHAR2": TokenType.VARCHAR,
457        "NVARCHAR": TokenType.NVARCHAR,
458        "NVARCHAR2": TokenType.NVARCHAR,
459        "BPCHAR": TokenType.BPCHAR,
460        "STR": TokenType.TEXT,
461        "STRING": TokenType.TEXT,
462        "TEXT": TokenType.TEXT,
463        "LONGTEXT": TokenType.LONGTEXT,
464        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
465        "TINYTEXT": TokenType.TINYTEXT,
466        "CLOB": TokenType.TEXT,
467        "LONGVARCHAR": TokenType.TEXT,
468        "BINARY": TokenType.BINARY,
469        "BLOB": TokenType.VARBINARY,
470        "LONGBLOB": TokenType.LONGBLOB,
471        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
472        "TINYBLOB": TokenType.TINYBLOB,
473        "BYTEA": TokenType.VARBINARY,
474        "VARBINARY": TokenType.VARBINARY,
475        "TIME": TokenType.TIME,
476        "TIMETZ": TokenType.TIMETZ,
477        "TIME_NS": TokenType.TIME_NS,
478        "TIMESTAMP": TokenType.TIMESTAMP,
479        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
480        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
481        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
482        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
483        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
484        "DATE": TokenType.DATE,
485        "DATETIME": TokenType.DATETIME,
486        "INT4RANGE": TokenType.INT4RANGE,
487        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
488        "INT8RANGE": TokenType.INT8RANGE,
489        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
490        "NUMRANGE": TokenType.NUMRANGE,
491        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
492        "TSRANGE": TokenType.TSRANGE,
493        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
494        "TSTZRANGE": TokenType.TSTZRANGE,
495        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
496        "DATERANGE": TokenType.DATERANGE,
497        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
498        "UNIQUE": TokenType.UNIQUE,
499        "VECTOR": TokenType.VECTOR,
500        "STRUCT": TokenType.STRUCT,
501        "SEQUENCE": TokenType.SEQUENCE,
502        "VARIANT": TokenType.VARIANT,
503        "ALTER": TokenType.ALTER,
504        "ANALYZE": TokenType.ANALYZE,
505        "CALL": TokenType.COMMAND,
506        "COMMENT": TokenType.COMMENT,
507        "EXPLAIN": TokenType.COMMAND,
508        "GRANT": TokenType.GRANT,
509        "REVOKE": TokenType.REVOKE,
510        "OPTIMIZE": TokenType.COMMAND,
511        "PREPARE": TokenType.COMMAND,
512        "VACUUM": TokenType.COMMAND,
513        "USER-DEFINED": TokenType.USERDEFINED,
514        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
515        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
516    }
517
518    COMMANDS = {
519        TokenType.COMMAND,
520        TokenType.EXECUTE,
521        TokenType.FETCH,
522        TokenType.SHOW,
523        TokenType.RENAME,
524    }
525
526    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
527
528    # Handle numeric literals like in hive (3L = BIGINT)
529    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}
530
531    # In tokenizers like JSONPath, dots are always key separators, never decimal points
532    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True
533
534    COMMENTS = ["--", ("/*", "*/")]
535
536    __slots__ = (
537        "dialect",
538        "_core",
539    )
540
541    def __init__(self, dialect: DialectType = None) -> None:
542        from sqlglot.dialects.dialect import Dialect
543        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore
544
545        self.dialect = Dialect.get_or_raise(dialect)
546
547        self._core = _TokenizerCore(
548            single_tokens=self.SINGLE_TOKENS,
549            keywords=self.KEYWORDS,
550            quotes=self._QUOTES,
551            format_strings=self._FORMAT_STRINGS,
552            identifiers=self._IDENTIFIERS,
553            comments=self._COMMENTS,
554            string_escapes=self._STRING_ESCAPES,
555            byte_string_escapes=self._BYTE_STRING_ESCAPES,
556            identifier_escapes=self._IDENTIFIER_ESCAPES,
557            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
558            commands=self.COMMANDS,
559            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
560            nested_comments=self.NESTED_COMMENTS,
561            hint_start=self.HINT_START,
562            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
563            bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS),
564            hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS),
565            numeric_literals=self.NUMERIC_LITERALS,
566            var_single_tokens=self.VAR_SINGLE_TOKENS,
567            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
568            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
569            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
570            keyword_trie=self._KEYWORD_TRIE,
571            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
572            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
573            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
574            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
575        )
576
577    def tokenize(self, sql: str) -> list[Token]:
578        """Returns a list of tokens corresponding to the SQL string `sql`."""
579        return self._core.tokenize(sql)  # type: ignore
580
581    @property
582    def sql(self) -> str:
583        """The SQL string being tokenized."""
584        return self._core.sql
585
586    @property
587    def size(self) -> int:
588        """Length of the SQL string."""
589        return self._core.size
590
591    @property
592    def tokens(self) -> list[Token]:
593        """The list of tokens produced by tokenization."""
594        return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
541    def __init__(self, dialect: DialectType = None) -> None:
542        from sqlglot.dialects.dialect import Dialect
543        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore
544
545        self.dialect = Dialect.get_or_raise(dialect)
546
547        self._core = _TokenizerCore(
548            single_tokens=self.SINGLE_TOKENS,
549            keywords=self.KEYWORDS,
550            quotes=self._QUOTES,
551            format_strings=self._FORMAT_STRINGS,
552            identifiers=self._IDENTIFIERS,
553            comments=self._COMMENTS,
554            string_escapes=self._STRING_ESCAPES,
555            byte_string_escapes=self._BYTE_STRING_ESCAPES,
556            identifier_escapes=self._IDENTIFIER_ESCAPES,
557            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
558            commands=self.COMMANDS,
559            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
560            nested_comments=self.NESTED_COMMENTS,
561            hint_start=self.HINT_START,
562            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
563            bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS),
564            hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS),
565            numeric_literals=self.NUMERIC_LITERALS,
566            var_single_tokens=self.VAR_SINGLE_TOKENS,
567            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
568            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
569            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
570            keyword_trie=self._KEYWORD_TRIE,
571            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
572            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
573            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
574            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
575        )
SINGLE_TOKENS = {'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 36>, '^': <TokenType.CARET: 42>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 28>, '>': <TokenType.GT: 25>, '<': <TokenType.LT: 23>, '%': <TokenType.MOD: 326>, '!': <TokenType.NOT: 27>, '|': <TokenType.PIPE: 39>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 19>, '/': <TokenType.SLASH: 22>, '\\': <TokenType.BACKSLASH: 21>, '*': <TokenType.STAR: 20>, '~': <TokenType.TILDE: 44>, '?': <TokenType.PLACEHOLDER: 353>, '@': <TokenType.PARAMETER: 56>, '#': <TokenType.HASH: 48>, "'": <TokenType.UNKNOWN: 212>, '`': <TokenType.UNKNOWN: 212>, '"': <TokenType.UNKNOWN: 212>}
BIT_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
BYTE_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
HEX_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
RAW_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
HEREDOC_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
UNICODE_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
IDENTIFIERS: ClassVar[list[tuple[str, str] | str]] = ['"']
QUOTES: ClassVar[list[tuple[str, str] | str]] = ["'"]
STRING_ESCAPES: ClassVar[list[str]] = ["'"]
BYTE_STRING_ESCAPES: ClassVar[list[str]] = []
VAR_SINGLE_TOKENS: ClassVar[set[str]] = set()
ESCAPE_FOLLOW_CHARS: ClassVar[list[str]] = []
IDENTIFIER_ESCAPES: ClassVar[list[str]] = []
HEREDOC_TAG_IS_IDENTIFIER = False
HEREDOC_STRING_ALTERNATIVE = <TokenType.VAR: 87>
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
NESTED_COMMENTS = True
HINT_START = '/*+'
TOKENS_PRECEDING_HINT = {<TokenType.INSERT: 297>, <TokenType.UPDATE: 413>, <TokenType.DELETE: 254>, <TokenType.SELECT: 383>}
KEYWORDS: ClassVar[dict[str, sqlglot.tokenizer_core.TokenType]] = {'{%': <TokenType.BLOCK_START: 71>, '{%+': <TokenType.BLOCK_START: 71>, '{%-': <TokenType.BLOCK_START: 71>, '%}': <TokenType.BLOCK_END: 72>, '+%}': <TokenType.BLOCK_END: 72>, '-%}': <TokenType.BLOCK_END: 72>, '{{+': <TokenType.BLOCK_START: 71>, '{{-': <TokenType.BLOCK_START: 71>, '+}}': <TokenType.BLOCK_END: 72>, '-}}': <TokenType.BLOCK_END: 72>, '/*+': <TokenType.HINT: 290>, '&<': <TokenType.AMP_LT: 61>, '&>': <TokenType.AMP_GT: 62>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 366>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '&&': <TokenType.DAMP: 60>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 284>, '~~': <TokenType.LIKE: 315>, '~~*': <TokenType.ILIKE: 292>, '~*': <TokenType.IRLIKE: 304>, '-|-': <TokenType.ADJACENT: 63>, 'ALL': <TokenType.ALL: 218>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 219>, 'ANY': <TokenType.ANY: 220>, 'ASC': <TokenType.ASC: 223>, 'AS': <TokenType.ALIAS: 216>, 'ASOF': <TokenType.ASOF: 224>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'BEGIN': <TokenType.BEGIN: 227>, 'BETWEEN': <TokenType.BETWEEN: 228>, 'CACHE': <TokenType.CACHE: 230>, 'UNCACHE': <TokenType.UNCACHE: 409>, 'CASE': <TokenType.CASE: 231>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 232>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 233>, 'COLLATE': <TokenType.COLLATE: 234>, 'COLUMN': <TokenType.COLUMN: 79>, 'COMMIT': <TokenType.COMMIT: 237>, 'CONNECT BY': <TokenType.CONNECT_BY: 238>, 'CONSTRAINT': <TokenType.CONSTRAINT: 239>, 
'COPY': <TokenType.COPY: 240>, 'CREATE': <TokenType.CREATE: 241>, 'CROSS': <TokenType.CROSS: 242>, 'CUBE': <TokenType.CUBE: 243>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 244>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 246>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 247>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 248>, 'CURRENT_USER': <TokenType.CURRENT_USER: 249>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 251>, 'DATABASE': <TokenType.DATABASE: 78>, 'DEFAULT': <TokenType.DEFAULT: 253>, 'DELETE': <TokenType.DELETE: 254>, 'DESC': <TokenType.DESC: 255>, 'DESCRIBE': <TokenType.DESCRIBE: 256>, 'DISTINCT': <TokenType.DISTINCT: 259>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 260>, 'DIV': <TokenType.DIV: 261>, 'DROP': <TokenType.DROP: 262>, 'ELSE': <TokenType.ELSE: 263>, 'END': <TokenType.END: 264>, 'ENUM': <TokenType.ENUM: 203>, 'ESCAPE': <TokenType.ESCAPE: 265>, 'EXCEPT': <TokenType.EXCEPT: 266>, 'EXECUTE': <TokenType.EXECUTE: 267>, 'EXISTS': <TokenType.EXISTS: 268>, 'FALSE': <TokenType.FALSE: 269>, 'FETCH': <TokenType.FETCH: 270>, 'FILTER': <TokenType.FILTER: 273>, 'FILE': <TokenType.FILE: 271>, 'FIRST': <TokenType.FIRST: 275>, 'FULL': <TokenType.FULL: 281>, 'FUNCTION': <TokenType.FUNCTION: 282>, 'FOR': <TokenType.FOR: 276>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 278>, 'FORMAT': <TokenType.FORMAT: 279>, 'FROM': <TokenType.FROM: 280>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 170>, 'GEOMETRY': <TokenType.GEOMETRY: 173>, 'GLOB': <TokenType.GLOB: 284>, 'GROUP BY': <TokenType.GROUP_BY: 287>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 288>, 'HAVING': <TokenType.HAVING: 289>, 'ILIKE': <TokenType.ILIKE: 292>, 'IN': <TokenType.IN: 293>, 'INDEX': <TokenType.INDEX: 294>, 'INET': <TokenType.INET: 198>, 'INNER': <TokenType.INNER: 296>, 'INSERT': <TokenType.INSERT: 297>, 'INTERVAL': <TokenType.INTERVAL: 301>, 'INTERSECT': <TokenType.INTERSECT: 300>, 'INTO': <TokenType.INTO: 302>, 'IS': <TokenType.IS: 305>, 'ISNULL': <TokenType.ISNULL: 306>, 'JOIN': <TokenType.JOIN: 307>, 
'KEEP': <TokenType.KEEP: 309>, 'KILL': <TokenType.KILL: 311>, 'LATERAL': <TokenType.LATERAL: 313>, 'LEFT': <TokenType.LEFT: 314>, 'LIKE': <TokenType.LIKE: 315>, 'LIMIT': <TokenType.LIMIT: 316>, 'LOAD': <TokenType.LOAD: 318>, 'LOCALTIME': <TokenType.LOCALTIME: 177>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 178>, 'LOCK': <TokenType.LOCK: 319>, 'MERGE': <TokenType.MERGE: 325>, 'NAMESPACE': <TokenType.NAMESPACE: 436>, 'NATURAL': <TokenType.NATURAL: 328>, 'NEXT': <TokenType.NEXT: 329>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 331>, 'NULL': <TokenType.NULL: 332>, 'OBJECT': <TokenType.OBJECT: 197>, 'OFFSET': <TokenType.OFFSET: 334>, 'ON': <TokenType.ON: 335>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 64>, 'ORDER BY': <TokenType.ORDER_BY: 338>, 'ORDINALITY': <TokenType.ORDINALITY: 341>, 'OUT': <TokenType.OUT: 342>, 'OUTER': <TokenType.OUTER: 344>, 'OVER': <TokenType.OVER: 345>, 'OVERLAPS': <TokenType.OVERLAPS: 346>, 'OVERWRITE': <TokenType.OVERWRITE: 347>, 'PARTITION': <TokenType.PARTITION: 349>, 'PARTITION BY': <TokenType.PARTITION_BY: 350>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 350>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 350>, 'PERCENT': <TokenType.PERCENT: 351>, 'PIVOT': <TokenType.PIVOT: 352>, 'PRAGMA': <TokenType.PRAGMA: 357>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 359>, 'PROCEDURE': <TokenType.PROCEDURE: 360>, 'OPERATOR': <TokenType.OPERATOR: 337>, 'QUALIFY': <TokenType.QUALIFY: 364>, 'RANGE': <TokenType.RANGE: 367>, 'RECURSIVE': <TokenType.RECURSIVE: 368>, 'REGEXP': <TokenType.RLIKE: 376>, 'RENAME': <TokenType.RENAME: 370>, 'REPLACE': <TokenType.REPLACE: 371>, 'RETURNING': <TokenType.RETURNING: 372>, 'REFERENCES': <TokenType.REFERENCES: 374>, 'RIGHT': <TokenType.RIGHT: 375>, 'RLIKE': <TokenType.RLIKE: 376>, 'ROLLBACK': <TokenType.ROLLBACK: 378>, 'ROLLUP': <TokenType.ROLLUP: 379>, 'ROW': <TokenType.ROW: 380>, 'ROWS': <TokenType.ROWS: 381>, 'SCHEMA': <TokenType.SCHEMA: 81>, 'SELECT': <TokenType.SELECT: 383>, 'SEMI': 
<TokenType.SEMI: 384>, 'SESSION': <TokenType.SESSION: 57>, 'SESSION_USER': <TokenType.SESSION_USER: 59>, 'SET': <TokenType.SET: 388>, 'SETTINGS': <TokenType.SETTINGS: 389>, 'SHOW': <TokenType.SHOW: 390>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 391>, 'SOME': <TokenType.SOME: 392>, 'SORT BY': <TokenType.SORT_BY: 393>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 395>, 'START WITH': <TokenType.START_WITH: 396>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 398>, 'TABLE': <TokenType.TABLE: 82>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 401>, 'TEMP': <TokenType.TEMPORARY: 403>, 'TEMPORARY': <TokenType.TEMPORARY: 403>, 'THEN': <TokenType.THEN: 405>, 'TRUE': <TokenType.TRUE: 406>, 'TRUNCATE': <TokenType.TRUNCATE: 407>, 'TRIGGER': <TokenType.TRIGGER: 408>, 'UNION': <TokenType.UNION: 410>, 'UNKNOWN': <TokenType.UNKNOWN: 212>, 'UNNEST': <TokenType.UNNEST: 411>, 'UNPIVOT': <TokenType.UNPIVOT: 412>, 'UPDATE': <TokenType.UPDATE: 413>, 'USE': <TokenType.USE: 414>, 'USING': <TokenType.USING: 415>, 'UUID': <TokenType.UUID: 169>, 'VALUES': <TokenType.VALUES: 416>, 'VIEW': <TokenType.VIEW: 418>, 'VOLATILE': <TokenType.VOLATILE: 420>, 'WHEN': <TokenType.WHEN: 422>, 'WHERE': <TokenType.WHERE: 423>, 'WINDOW': <TokenType.WINDOW: 424>, 'WITH': <TokenType.WITH: 425>, 'APPLY': <TokenType.APPLY: 221>, 'ARRAY': <TokenType.ARRAY: 222>, 'BIT': <TokenType.BIT: 95>, 'BOOL': <TokenType.BOOLEAN: 96>, 'BOOLEAN': <TokenType.BOOLEAN: 96>, 'BYTE': <TokenType.TINYINT: 97>, 'MEDIUMINT': <TokenType.MEDIUMINT: 101>, 'INT1': <TokenType.TINYINT: 97>, 'TINYINT': <TokenType.TINYINT: 97>, 'INT16': <TokenType.SMALLINT: 99>, 'SHORT': <TokenType.SMALLINT: 99>, 'SMALLINT': <TokenType.SMALLINT: 99>, 'HUGEINT': <TokenType.INT128: 108>, 'UHUGEINT': <TokenType.UINT128: 109>, 'INT2': <TokenType.SMALLINT: 99>, 'INTEGER': <TokenType.INT: 103>, 'INT': <TokenType.INT: 103>, 'INT4': <TokenType.INT: 103>, 'INT32': <TokenType.INT: 103>, 'INT64': <TokenType.BIGINT: 105>, 'INT128': <TokenType.INT128: 108>, 'INT256': <TokenType.INT256: 
110>, 'LONG': <TokenType.BIGINT: 105>, 'BIGINT': <TokenType.BIGINT: 105>, 'INT8': <TokenType.TINYINT: 97>, 'UINT': <TokenType.UINT: 104>, 'UINT128': <TokenType.UINT128: 109>, 'UINT256': <TokenType.UINT256: 111>, 'DEC': <TokenType.DECIMAL: 115>, 'DECIMAL': <TokenType.DECIMAL: 115>, 'DECIMAL32': <TokenType.DECIMAL32: 116>, 'DECIMAL64': <TokenType.DECIMAL64: 117>, 'DECIMAL128': <TokenType.DECIMAL128: 118>, 'DECIMAL256': <TokenType.DECIMAL256: 119>, 'DECFLOAT': <TokenType.DECFLOAT: 120>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 122>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 122>, 'BIGNUM': <TokenType.BIGNUM: 107>, 'LIST': <TokenType.LIST: 317>, 'MAP': <TokenType.MAP: 320>, 'NULLABLE': <TokenType.NULLABLE: 172>, 'NUMBER': <TokenType.DECIMAL: 115>, 'NUMERIC': <TokenType.DECIMAL: 115>, 'FIXED': <TokenType.DECIMAL: 115>, 'REAL': <TokenType.FLOAT: 112>, 'FLOAT': <TokenType.FLOAT: 112>, 'FLOAT4': <TokenType.FLOAT: 112>, 'FLOAT8': <TokenType.DOUBLE: 113>, 'DOUBLE': <TokenType.DOUBLE: 113>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 113>, 'JSON': <TokenType.JSON: 139>, 'JSONB': <TokenType.JSONB: 140>, 'CHAR': <TokenType.CHAR: 123>, 'CHARACTER': <TokenType.CHAR: 123>, 'CHAR VARYING': <TokenType.VARCHAR: 125>, 'CHARACTER VARYING': <TokenType.VARCHAR: 125>, 'NCHAR': <TokenType.NCHAR: 124>, 'VARCHAR': <TokenType.VARCHAR: 125>, 'VARCHAR2': <TokenType.VARCHAR: 125>, 'NVARCHAR': <TokenType.NVARCHAR: 126>, 'NVARCHAR2': <TokenType.NVARCHAR: 126>, 'BPCHAR': <TokenType.BPCHAR: 127>, 'STR': <TokenType.TEXT: 128>, 'STRING': <TokenType.TEXT: 128>, 'TEXT': <TokenType.TEXT: 128>, 'LONGTEXT': <TokenType.LONGTEXT: 130>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 129>, 'TINYTEXT': <TokenType.TINYTEXT: 135>, 'CLOB': <TokenType.TEXT: 128>, 'LONGVARCHAR': <TokenType.TEXT: 128>, 'BINARY': <TokenType.BINARY: 137>, 'BLOB': <TokenType.VARBINARY: 138>, 'LONGBLOB': <TokenType.LONGBLOB: 133>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 132>, 'TINYBLOB': <TokenType.TINYBLOB: 134>, 'BYTEA': <TokenType.VARBINARY: 138>, 'VARBINARY': 
<TokenType.VARBINARY: 138>, 'TIME': <TokenType.TIME: 141>, 'TIMETZ': <TokenType.TIMETZ: 142>, 'TIME_NS': <TokenType.TIME_NS: 143>, 'TIMESTAMP': <TokenType.TIMESTAMP: 144>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 145>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 147>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 147>, 'DATE': <TokenType.DATE: 155>, 'DATETIME': <TokenType.DATETIME: 151>, 'INT4RANGE': <TokenType.INT4RANGE: 157>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 158>, 'INT8RANGE': <TokenType.INT8RANGE: 159>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 160>, 'NUMRANGE': <TokenType.NUMRANGE: 161>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 162>, 'TSRANGE': <TokenType.TSRANGE: 163>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 164>, 'TSTZRANGE': <TokenType.TSTZRANGE: 165>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 166>, 'DATERANGE': <TokenType.DATERANGE: 167>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 168>, 'UNIQUE': <TokenType.UNIQUE: 426>, 'VECTOR': <TokenType.VECTOR: 213>, 'STRUCT': <TokenType.STRUCT: 399>, 'SEQUENCE': <TokenType.SEQUENCE: 386>, 'VARIANT': <TokenType.VARIANT: 196>, 'ALTER': <TokenType.ALTER: 217>, 'ANALYZE': <TokenType.ANALYZE: 435>, 'CALL': <TokenType.COMMAND: 235>, 'COMMENT': <TokenType.COMMENT: 236>, 'EXPLAIN': <TokenType.COMMAND: 235>, 'GRANT': <TokenType.GRANT: 286>, 'REVOKE': <TokenType.REVOKE: 373>, 'OPTIMIZE': <TokenType.COMMAND: 235>, 'PREPARE': <TokenType.COMMAND: 235>, 'VACUUM': <TokenType.COMMAND: 235>, 'USER-DEFINED': <TokenType.USERDEFINED: 191>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 430>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 431>}
COMMANDS = {<TokenType.SHOW: 390>, <TokenType.EXECUTE: 267>, <TokenType.COMMAND: 235>, <TokenType.FETCH: 270>, <TokenType.RENAME: 370>}
COMMAND_PREFIX_TOKENS = {<TokenType.SEMICOLON: 19>, <TokenType.BEGIN: 227>}
NUMERIC_LITERALS: ClassVar[dict[str, str]] = {}
NUMBERS_CAN_HAVE_DECIMALS: ClassVar[bool] = True
COMMENTS = ['--', ('/*', '*/')]
dialect
def tokenize(self, sql: str) -> list[sqlglot.tokenizer_core.Token]:
577    def tokenize(self, sql: str) -> list[Token]:
578        """Returns a list of tokens corresponding to the SQL string `sql`."""
579        return self._core.tokenize(sql)  # type: ignore

Returns a list of tokens corresponding to the SQL string `sql`.

sql: str
581    @property
582    def sql(self) -> str:
583        """The SQL string being tokenized."""
584        return self._core.sql

The SQL string being tokenized.

size: int
586    @property
587    def size(self) -> int:
588        """Length of the SQL string."""
589        return self._core.size

Length of the SQL string.

tokens: list[sqlglot.tokenizer_core.Token]
591    @property
592    def tokens(self) -> list[Token]:
593        """The list of tokens produced by tokenization."""
594        return self._core.tokens

The list of tokens produced by tokenization.