Edit on GitHub

sqlglot.tokens

  1from __future__ import annotations
  2
  3import threading
  4import typing as t
  5
  6from sqlglot.trie import new_trie
  7
  8from sqlglot.tokenizer_core import Token, TokenizerCore, TokenType
  9
 10T = t.TypeVar("T")
 11
 12
 13class ThreadLocalCache(threading.local):
 14    """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""
 15
 16    def __init__(self) -> None:
 17        self.cache: dict[type, t.Any] = {}
 18
 19    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
 20        if not (obj := self.cache.get(key)):
 21            self.cache[key] = obj = build()
 22        return obj
 23
 24
 25try:
 26    import sqlglotc  # noqa: F401
 27except ImportError:
 28    pass
 29
 30try:
 31    import sqlglotrs  # type: ignore # noqa: F401
 32    import warnings
 33
 34    if "sqlglotc" not in globals():
 35        warnings.warn(
 36            "sqlglot[rs] is deprecated and no longer compatible with sqlglot. "
 37            "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]",
 38        )
 39except ImportError:
 40    pass
 41
 42if t.TYPE_CHECKING:
 43    from sqlglot.dialects.dialect import DialectType
 44
 45
 46def _convert_quotes(arr: list[str | tuple[str, str]]) -> dict[str, str]:
 47    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)
 48
 49
 50def _quotes_to_format(
 51    token_type: TokenType, arr: list[str | tuple[str, str]]
 52) -> dict[str, tuple[str, TokenType]]:
 53    return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
 54
 55
 56class _TokenizerBase:
 57    QUOTES: t.ClassVar[list[tuple[str, str] | str]]
 58    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]]
 59    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
 60    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
 61    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
 62    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
 63    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
 64    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
 65    STRING_ESCAPES: t.ClassVar[list[str]]
 66    BYTE_STRING_ESCAPES: t.ClassVar[list[str]]
 67    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]]
 68    IDENTIFIER_ESCAPES: t.ClassVar[list[str]]
 69    HINT_START: t.ClassVar[str]
 70    KEYWORDS: t.ClassVar[dict[str, TokenType]]
 71    SINGLE_TOKENS: t.ClassVar[dict[str, TokenType]]
 72    NUMERIC_LITERALS: t.ClassVar[dict[str, str]]
 73    VAR_SINGLE_TOKENS: t.ClassVar[set[str]]
 74    COMMANDS: t.ClassVar[set[TokenType]]
 75    COMMAND_PREFIX_TOKENS: t.ClassVar[set[TokenType]]
 76    HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool]
 77    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool]
 78    NESTED_COMMENTS: t.ClassVar[bool]
 79    TOKENS_PRECEDING_HINT: t.ClassVar[set[TokenType]]
 80    HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType]
 81    COMMENTS: t.ClassVar[list[str | tuple[str, str]]]
 82    _QUOTES: t.ClassVar[dict[str, str]]
 83    _IDENTIFIERS: t.ClassVar[dict[str, str]]
 84    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]]
 85    _STRING_ESCAPES: t.ClassVar[set[str]]
 86    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]]
 87    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]]
 88    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]]
 89    _COMMENTS: t.ClassVar[dict[str, str | None]]
 90    _KEYWORD_TRIE: t.ClassVar[dict[str, object]]
 91
 92    @classmethod
 93    def __init_subclass__(cls, **kwargs: t.Any) -> None:
 94        super().__init_subclass__(**kwargs)
 95        cls._QUOTES = _convert_quotes(cls.QUOTES)
 96        cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS)
 97        cls._FORMAT_STRINGS = {
 98            **{
 99                p + s: (e, TokenType.NATIONAL_STRING)
100                for s, e in cls._QUOTES.items()
101                for p in ("n", "N")
102            },
103            **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS),
104            **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS),
105            **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS),
106            **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS),
107            **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS),
108            **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS),
109        }
110        if "BYTE_STRING_ESCAPES" not in cls.__dict__:
111            cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy()
112        cls._STRING_ESCAPES = set(cls.STRING_ESCAPES)
113        cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES)
114        cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS)
115        cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES)
116        cls._COMMENTS = {
117            **{c: None for c in cls.COMMENTS if isinstance(c, str)},
118            **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)},
119            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
120        }
121        if cls.HINT_START in cls.KEYWORDS:
122            cls._COMMENTS[cls.HINT_START] = "*/"
123        cls._KEYWORD_TRIE = new_trie(
124            key.upper()
125            for key in (
126                *cls.KEYWORDS,
127                *cls._COMMENTS,
128                *cls._QUOTES,
129                *cls._FORMAT_STRINGS,
130            )
131            if " " in key or any(single in key for single in cls.SINGLE_TOKENS)
132        )
133
134
class Tokenizer(_TokenizerBase):
    """Configurable tokenizer front-end.

    The class attributes describe the lexical rules (single-character tokens,
    keywords, string/identifier delimiters, comment markers, ...). The scanning
    itself is delegated to a `TokenizerCore` that `_init_core` builds from those
    attributes and from settings of the resolved dialect.
    """

    # Single-character tokens and the token type each one produces
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDE,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        "#": TokenType.HASH,
        # Only used to break up a var like x'y'; the token type itself doesn't matter
        "'": TokenType.UNKNOWN,
        "`": TokenType.UNKNOWN,
        '"': TokenType.UNKNOWN,
    }

    # Delimiters for the special string kinds are empty by default. Identifiers
    # default to double quotes, strings to single quotes, and the only default
    # string escape is the single quote.
    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []

    # The strings in this list can always be used as escapes, regardless of the surrounding
    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []

    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
    HEREDOC_TAG_IS_IDENTIFIER = False

    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR

    # Whether string escape characters function as such when placed within raw strings
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True

    NESTED_COMMENTS = True

    # Opening marker of a hint; it is also registered as a keyword in KEYWORDS below
    HINT_START = "/*+"

    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}

    # Autofilled
    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
    _QUOTES: t.ClassVar[dict[str, str]] = {}
    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()

    # Maps keyword / operator text to its token type. The first four entries generate
    # the block delimiters "{%", "{%+", "{%-", "%}", "+%}", "-%}", "{{+", "{{-", "+}}"
    # and "-}}".
    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        HINT_START: TokenType.HINT,
        "&<": TokenType.AMP_LT,
        "&>": TokenType.AMP_GT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "?::": TokenType.QDCOLON,
        "||": TokenType.DPIPE,
        "|>": TokenType.PIPE_GT,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "<<->>": TokenType.LLRR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "~~~": TokenType.GLOB,
        "~~": TokenType.LIKE,
        "~~*": TokenType.ILIKE,
        "~*": TokenType.IRLIKE,
        "-|-": TokenType.ADJACENT,
        "ALL": TokenType.ALL,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "COPY": TokenType.COPY,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ENUM": TokenType.ENUM,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FILE": TokenType.FILE,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCALTIME": TokenType.LOCALTIME,
        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NAMESPACE": TokenType.NAMESPACE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUT": TokenType.OUT,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "OPERATOR": TokenType.OPERATOR,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "RENAME": TokenType.RENAME,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SESSION": TokenType.SESSION,
        "SESSION_USER": TokenType.SESSION_USER,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "SQL SECURITY": TokenType.SQL_SECURITY,
        "START WITH": TokenType.START_WITH,
        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRIGGER": TokenType.TRIGGER,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "HUGEINT": TokenType.INT128,
        "UHUGEINT": TokenType.UINT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "INT128": TokenType.INT128,
        "INT256": TokenType.INT256,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "UINT": TokenType.UINT,
        "UINT128": TokenType.UINT128,
        "UINT256": TokenType.UINT256,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "DECIMAL32": TokenType.DECIMAL32,
        "DECIMAL64": TokenType.DECIMAL64,
        "DECIMAL128": TokenType.DECIMAL128,
        "DECIMAL256": TokenType.DECIMAL256,
        "DECFLOAT": TokenType.DECFLOAT,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "BIGNUM": TokenType.BIGNUM,
        "LIST": TokenType.LIST,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "JSONB": TokenType.JSONB,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "CHAR VARYING": TokenType.VARCHAR,
        "CHARACTER VARYING": TokenType.VARCHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "BPCHAR": TokenType.BPCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIME_NS": TokenType.TIME_NS,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "VECTOR": TokenType.VECTOR,
        "STRUCT": TokenType.STRUCT,
        "SEQUENCE": TokenType.SEQUENCE,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.ANALYZE,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.GRANT,
        "REVOKE": TokenType.REVOKE,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    # Token types passed to the core as `commands`
    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    # Token types passed to the core as `command_prefix_tokens`
    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}

    # In tokenizers like JSONPath, dots are always key separators, never decimal points
    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True

    # A plain string is a marker with no closing delimiter; a tuple is a (start, end) pair
    COMMENTS = ["--", ("/*", "*/")]

    # Per-thread cache of built cores, keyed by tokenizer class (see __init__)
    _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache()

    __slots__ = (
        "dialect",
        "_core",
    )

    def __init__(self, dialect: DialectType = None) -> None:
        """Resolve `dialect` and attach the core cached for this class in the current thread.

        Args:
            dialect: Passed to `Dialect.get_or_raise`; the result is stored as `self.dialect`.
        """
        # Imported locally; at module level this import only happens under
        # TYPE_CHECKING — presumably to avoid a circular import (TODO confirm).
        from sqlglot.dialects.dialect import Dialect

        self.dialect = Dialect.get_or_raise(dialect)
        # NOTE(review): the cache key is the tokenizer class only, while _init_core also
        # reads settings from self.dialect — presumably each tokenizer class is always
        # used with the same dialect settings; confirm against callers.
        self._core = self._core_cache.get_or_build(type(self), self._init_core)

    def _init_core(self) -> TokenizerCore:
        """Build a `TokenizerCore` from the class-level tables and the dialect's settings."""
        return TokenizerCore(
            single_tokens=self.SINGLE_TOKENS,
            keywords=self.KEYWORDS,
            quotes=self._QUOTES,
            format_strings=self._FORMAT_STRINGS,
            identifiers=self._IDENTIFIERS,
            comments=self._COMMENTS,
            string_escapes=self._STRING_ESCAPES,
            byte_string_escapes=self._BYTE_STRING_ESCAPES,
            identifier_escapes=self._IDENTIFIER_ESCAPES,
            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
            commands=self.COMMANDS,
            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
            nested_comments=self.NESTED_COMMENTS,
            hint_start=self.HINT_START,
            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
            has_bit_strings=bool(self.BIT_STRINGS),
            has_hex_strings=bool(self.HEX_STRINGS),
            numeric_literals=self.NUMERIC_LITERALS,
            var_single_tokens=self.VAR_SINGLE_TOKENS,
            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
            keyword_trie=self._KEYWORD_TRIE,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
        )

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        return self._core.tokenize(sql)  # type: ignore

    @property
    def sql(self) -> str:
        """The SQL string being tokenized."""
        return self._core.sql

    @property
    def size(self) -> int:
        """Length of the SQL string."""
        return self._core.size

    @property
    def tokens(self) -> list[Token]:
        """The list of tokens produced by tokenization."""
        return self._core.tokens
class ThreadLocalCache(_thread._local):
class ThreadLocalCache(threading.local):
    """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""

    def __init__(self) -> None:
        # Maps a key (a type) to the object that was built for it in the current thread.
        self.cache: dict[type, t.Any] = {}

    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
        """Return the object cached under `key`, building and storing it on first use.

        Args:
            key: The type the cached object is stored under.
            build: Zero-argument factory, called only when `key` has no entry yet.

        Returns:
            The cached object, or the freshly built one on a cache miss.
        """
        # Look the key up by presence rather than truthiness: a falsy cached
        # object (0, an empty container, None) must not be rebuilt on every call.
        try:
            return self.cache[key]
        except KeyError:
            obj = self.cache[key] = build()
            return obj

Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.

cache: dict[type, typing.Any]
def get_or_build(self, key: type, build: Callable[[], ~T]) -> ~T:
20    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
21        if not (obj := self.cache.get(key)):
22            self.cache[key] = obj = build()
23        return obj
class Tokenizer(_TokenizerBase):
136class Tokenizer(_TokenizerBase):
137    SINGLE_TOKENS = {
138        "(": TokenType.L_PAREN,
139        ")": TokenType.R_PAREN,
140        "[": TokenType.L_BRACKET,
141        "]": TokenType.R_BRACKET,
142        "{": TokenType.L_BRACE,
143        "}": TokenType.R_BRACE,
144        "&": TokenType.AMP,
145        "^": TokenType.CARET,
146        ":": TokenType.COLON,
147        ",": TokenType.COMMA,
148        ".": TokenType.DOT,
149        "-": TokenType.DASH,
150        "=": TokenType.EQ,
151        ">": TokenType.GT,
152        "<": TokenType.LT,
153        "%": TokenType.MOD,
154        "!": TokenType.NOT,
155        "|": TokenType.PIPE,
156        "+": TokenType.PLUS,
157        ";": TokenType.SEMICOLON,
158        "/": TokenType.SLASH,
159        "\\": TokenType.BACKSLASH,
160        "*": TokenType.STAR,
161        "~": TokenType.TILDE,
162        "?": TokenType.PLACEHOLDER,
163        "@": TokenType.PARAMETER,
164        "#": TokenType.HASH,
165        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
166        "'": TokenType.UNKNOWN,
167        "`": TokenType.UNKNOWN,
168        '"': TokenType.UNKNOWN,
169    }
170
171    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
172    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
173    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
174    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
175    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
176    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
177    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
178    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
179    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
180    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
181    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
182    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []
183
184    # The strings in this list can always be used as escapes, regardless of the surrounding
185    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
186    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
187    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []
188
189    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
190    HEREDOC_TAG_IS_IDENTIFIER = False
191
192    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
193    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR
194
195    # Whether string escape characters function as such when placed within raw strings
196    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
197
198    NESTED_COMMENTS = True
199
200    HINT_START = "/*+"
201
202    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
203
204    # Autofilled
205    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
206    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
207    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
208    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
209    _QUOTES: t.ClassVar[dict[str, str]] = {}
210    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
211    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
212    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
213    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()
214
215    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
216        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
217        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
218        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
219        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
220        HINT_START: TokenType.HINT,
221        "&<": TokenType.AMP_LT,
222        "&>": TokenType.AMP_GT,
223        "==": TokenType.EQ,
224        "::": TokenType.DCOLON,
225        "?::": TokenType.QDCOLON,
226        "||": TokenType.DPIPE,
227        "|>": TokenType.PIPE_GT,
228        ">=": TokenType.GTE,
229        "<=": TokenType.LTE,
230        "<>": TokenType.NEQ,
231        "!=": TokenType.NEQ,
232        ":=": TokenType.COLON_EQ,
233        "<=>": TokenType.NULLSAFE_EQ,
234        "->": TokenType.ARROW,
235        "->>": TokenType.DARROW,
236        "=>": TokenType.FARROW,
237        "#>": TokenType.HASH_ARROW,
238        "#>>": TokenType.DHASH_ARROW,
239        "<->": TokenType.LR_ARROW,
240        "<<->>": TokenType.LLRR_ARROW,
241        "&&": TokenType.DAMP,
242        "??": TokenType.DQMARK,
243        "~~~": TokenType.GLOB,
244        "~~": TokenType.LIKE,
245        "~~*": TokenType.ILIKE,
246        "~*": TokenType.IRLIKE,
247        "-|-": TokenType.ADJACENT,
248        "ALL": TokenType.ALL,
249        "AND": TokenType.AND,
250        "ANTI": TokenType.ANTI,
251        "ANY": TokenType.ANY,
252        "ASC": TokenType.ASC,
253        "AS": TokenType.ALIAS,
254        "ASOF": TokenType.ASOF,
255        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
256        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
257        "BEGIN": TokenType.BEGIN,
258        "BETWEEN": TokenType.BETWEEN,
259        "CACHE": TokenType.CACHE,
260        "UNCACHE": TokenType.UNCACHE,
261        "CASE": TokenType.CASE,
262        "CHARACTER SET": TokenType.CHARACTER_SET,
263        "CLUSTER BY": TokenType.CLUSTER_BY,
264        "COLLATE": TokenType.COLLATE,
265        "COLUMN": TokenType.COLUMN,
266        "COMMIT": TokenType.COMMIT,
267        "CONNECT BY": TokenType.CONNECT_BY,
268        "CONSTRAINT": TokenType.CONSTRAINT,
269        "COPY": TokenType.COPY,
270        "CREATE": TokenType.CREATE,
271        "CROSS": TokenType.CROSS,
272        "CUBE": TokenType.CUBE,
273        "CURRENT_DATE": TokenType.CURRENT_DATE,
274        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
275        "CURRENT_TIME": TokenType.CURRENT_TIME,
276        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
277        "CURRENT_USER": TokenType.CURRENT_USER,
278        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
279        "DATABASE": TokenType.DATABASE,
280        "DEFAULT": TokenType.DEFAULT,
281        "DELETE": TokenType.DELETE,
282        "DESC": TokenType.DESC,
283        "DESCRIBE": TokenType.DESCRIBE,
284        "DISTINCT": TokenType.DISTINCT,
285        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
286        "DIV": TokenType.DIV,
287        "DROP": TokenType.DROP,
288        "ELSE": TokenType.ELSE,
289        "END": TokenType.END,
290        "ENUM": TokenType.ENUM,
291        "ESCAPE": TokenType.ESCAPE,
292        "EXCEPT": TokenType.EXCEPT,
293        "EXECUTE": TokenType.EXECUTE,
294        "EXISTS": TokenType.EXISTS,
295        "FALSE": TokenType.FALSE,
296        "FETCH": TokenType.FETCH,
297        "FILTER": TokenType.FILTER,
298        "FILE": TokenType.FILE,
299        "FIRST": TokenType.FIRST,
300        "FULL": TokenType.FULL,
301        "FUNCTION": TokenType.FUNCTION,
302        "FOR": TokenType.FOR,
303        "FOREIGN KEY": TokenType.FOREIGN_KEY,
304        "FORMAT": TokenType.FORMAT,
305        "FROM": TokenType.FROM,
306        "GEOGRAPHY": TokenType.GEOGRAPHY,
307        "GEOMETRY": TokenType.GEOMETRY,
308        "GLOB": TokenType.GLOB,
309        "GROUP BY": TokenType.GROUP_BY,
310        "GROUPING SETS": TokenType.GROUPING_SETS,
311        "HAVING": TokenType.HAVING,
312        "ILIKE": TokenType.ILIKE,
313        "IN": TokenType.IN,
314        "INDEX": TokenType.INDEX,
315        "INET": TokenType.INET,
316        "INNER": TokenType.INNER,
317        "INSERT": TokenType.INSERT,
318        "INTERVAL": TokenType.INTERVAL,
319        "INTERSECT": TokenType.INTERSECT,
320        "INTO": TokenType.INTO,
321        "IS": TokenType.IS,
322        "ISNULL": TokenType.ISNULL,
323        "JOIN": TokenType.JOIN,
324        "KEEP": TokenType.KEEP,
325        "KILL": TokenType.KILL,
326        "LATERAL": TokenType.LATERAL,
327        "LEFT": TokenType.LEFT,
328        "LIKE": TokenType.LIKE,
329        "LIMIT": TokenType.LIMIT,
330        "LOAD": TokenType.LOAD,
331        "LOCALTIME": TokenType.LOCALTIME,
332        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
333        "LOCK": TokenType.LOCK,
334        "MERGE": TokenType.MERGE,
335        "NAMESPACE": TokenType.NAMESPACE,
336        "NATURAL": TokenType.NATURAL,
337        "NEXT": TokenType.NEXT,
338        "NOT": TokenType.NOT,
339        "NOTNULL": TokenType.NOTNULL,
340        "NULL": TokenType.NULL,
341        "OBJECT": TokenType.OBJECT,
342        "OFFSET": TokenType.OFFSET,
343        "ON": TokenType.ON,
344        "OR": TokenType.OR,
345        "XOR": TokenType.XOR,
346        "ORDER BY": TokenType.ORDER_BY,
347        "ORDINALITY": TokenType.ORDINALITY,
348        "OUT": TokenType.OUT,
349        "OUTER": TokenType.OUTER,
350        "OVER": TokenType.OVER,
351        "OVERLAPS": TokenType.OVERLAPS,
352        "OVERWRITE": TokenType.OVERWRITE,
353        "PARTITION": TokenType.PARTITION,
354        "PARTITION BY": TokenType.PARTITION_BY,
355        "PARTITIONED BY": TokenType.PARTITION_BY,
356        "PARTITIONED_BY": TokenType.PARTITION_BY,
357        "PERCENT": TokenType.PERCENT,
358        "PIVOT": TokenType.PIVOT,
359        "PRAGMA": TokenType.PRAGMA,
360        "PRIMARY KEY": TokenType.PRIMARY_KEY,
361        "PROCEDURE": TokenType.PROCEDURE,
362        "OPERATOR": TokenType.OPERATOR,
363        "QUALIFY": TokenType.QUALIFY,
364        "RANGE": TokenType.RANGE,
365        "RECURSIVE": TokenType.RECURSIVE,
366        "REGEXP": TokenType.RLIKE,
367        "RENAME": TokenType.RENAME,
368        "REPLACE": TokenType.REPLACE,
369        "RETURNING": TokenType.RETURNING,
370        "REFERENCES": TokenType.REFERENCES,
371        "RIGHT": TokenType.RIGHT,
372        "RLIKE": TokenType.RLIKE,
373        "ROLLBACK": TokenType.ROLLBACK,
374        "ROLLUP": TokenType.ROLLUP,
375        "ROW": TokenType.ROW,
376        "ROWS": TokenType.ROWS,
377        "SCHEMA": TokenType.SCHEMA,
378        "SELECT": TokenType.SELECT,
379        "SEMI": TokenType.SEMI,
380        "SESSION": TokenType.SESSION,
381        "SESSION_USER": TokenType.SESSION_USER,
382        "SET": TokenType.SET,
383        "SETTINGS": TokenType.SETTINGS,
384        "SHOW": TokenType.SHOW,
385        "SIMILAR TO": TokenType.SIMILAR_TO,
386        "SOME": TokenType.SOME,
387        "SORT BY": TokenType.SORT_BY,
388        "SQL SECURITY": TokenType.SQL_SECURITY,
389        "START WITH": TokenType.START_WITH,
390        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
391        "TABLE": TokenType.TABLE,
392        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
393        "TEMP": TokenType.TEMPORARY,
394        "TEMPORARY": TokenType.TEMPORARY,
395        "THEN": TokenType.THEN,
396        "TRUE": TokenType.TRUE,
397        "TRUNCATE": TokenType.TRUNCATE,
398        "TRIGGER": TokenType.TRIGGER,
399        "UNION": TokenType.UNION,
400        "UNKNOWN": TokenType.UNKNOWN,
401        "UNNEST": TokenType.UNNEST,
402        "UNPIVOT": TokenType.UNPIVOT,
403        "UPDATE": TokenType.UPDATE,
404        "USE": TokenType.USE,
405        "USING": TokenType.USING,
406        "UUID": TokenType.UUID,
407        "VALUES": TokenType.VALUES,
408        "VIEW": TokenType.VIEW,
409        "VOLATILE": TokenType.VOLATILE,
410        "WHEN": TokenType.WHEN,
411        "WHERE": TokenType.WHERE,
412        "WINDOW": TokenType.WINDOW,
413        "WITH": TokenType.WITH,
414        "APPLY": TokenType.APPLY,
415        "ARRAY": TokenType.ARRAY,
416        "BIT": TokenType.BIT,
417        "BOOL": TokenType.BOOLEAN,
418        "BOOLEAN": TokenType.BOOLEAN,
419        "BYTE": TokenType.TINYINT,
420        "MEDIUMINT": TokenType.MEDIUMINT,
421        "INT1": TokenType.TINYINT,
422        "TINYINT": TokenType.TINYINT,
423        "INT16": TokenType.SMALLINT,
424        "SHORT": TokenType.SMALLINT,
425        "SMALLINT": TokenType.SMALLINT,
426        "HUGEINT": TokenType.INT128,
427        "UHUGEINT": TokenType.UINT128,
428        "INT2": TokenType.SMALLINT,
429        "INTEGER": TokenType.INT,
430        "INT": TokenType.INT,
431        "INT4": TokenType.INT,
432        "INT32": TokenType.INT,
433        "INT64": TokenType.BIGINT,
434        "INT128": TokenType.INT128,
435        "INT256": TokenType.INT256,
436        "LONG": TokenType.BIGINT,
437        "BIGINT": TokenType.BIGINT,
438        "INT8": TokenType.TINYINT,
439        "UINT": TokenType.UINT,
440        "UINT128": TokenType.UINT128,
441        "UINT256": TokenType.UINT256,
442        "DEC": TokenType.DECIMAL,
443        "DECIMAL": TokenType.DECIMAL,
444        "DECIMAL32": TokenType.DECIMAL32,
445        "DECIMAL64": TokenType.DECIMAL64,
446        "DECIMAL128": TokenType.DECIMAL128,
447        "DECIMAL256": TokenType.DECIMAL256,
448        "DECFLOAT": TokenType.DECFLOAT,
449        "BIGDECIMAL": TokenType.BIGDECIMAL,
450        "BIGNUMERIC": TokenType.BIGDECIMAL,
451        "BIGNUM": TokenType.BIGNUM,
452        "LIST": TokenType.LIST,
453        "MAP": TokenType.MAP,
454        "NULLABLE": TokenType.NULLABLE,
455        "NUMBER": TokenType.DECIMAL,
456        "NUMERIC": TokenType.DECIMAL,
457        "FIXED": TokenType.DECIMAL,
458        "REAL": TokenType.FLOAT,
459        "FLOAT": TokenType.FLOAT,
460        "FLOAT4": TokenType.FLOAT,
461        "FLOAT8": TokenType.DOUBLE,
462        "DOUBLE": TokenType.DOUBLE,
463        "DOUBLE PRECISION": TokenType.DOUBLE,
464        "JSON": TokenType.JSON,
465        "JSONB": TokenType.JSONB,
466        "CHAR": TokenType.CHAR,
467        "CHARACTER": TokenType.CHAR,
468        "CHAR VARYING": TokenType.VARCHAR,
469        "CHARACTER VARYING": TokenType.VARCHAR,
470        "NCHAR": TokenType.NCHAR,
471        "VARCHAR": TokenType.VARCHAR,
472        "VARCHAR2": TokenType.VARCHAR,
473        "NVARCHAR": TokenType.NVARCHAR,
474        "NVARCHAR2": TokenType.NVARCHAR,
475        "BPCHAR": TokenType.BPCHAR,
476        "STR": TokenType.TEXT,
477        "STRING": TokenType.TEXT,
478        "TEXT": TokenType.TEXT,
479        "LONGTEXT": TokenType.LONGTEXT,
480        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
481        "TINYTEXT": TokenType.TINYTEXT,
482        "CLOB": TokenType.TEXT,
483        "LONGVARCHAR": TokenType.TEXT,
484        "BINARY": TokenType.BINARY,
485        "BLOB": TokenType.VARBINARY,
486        "LONGBLOB": TokenType.LONGBLOB,
487        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
488        "TINYBLOB": TokenType.TINYBLOB,
489        "BYTEA": TokenType.VARBINARY,
490        "VARBINARY": TokenType.VARBINARY,
491        "TIME": TokenType.TIME,
492        "TIMETZ": TokenType.TIMETZ,
493        "TIME_NS": TokenType.TIME_NS,
494        "TIMESTAMP": TokenType.TIMESTAMP,
495        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
496        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
497        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
498        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
499        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
500        "DATE": TokenType.DATE,
501        "DATETIME": TokenType.DATETIME,
502        "INT4RANGE": TokenType.INT4RANGE,
503        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
504        "INT8RANGE": TokenType.INT8RANGE,
505        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
506        "NUMRANGE": TokenType.NUMRANGE,
507        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
508        "TSRANGE": TokenType.TSRANGE,
509        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
510        "TSTZRANGE": TokenType.TSTZRANGE,
511        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
512        "DATERANGE": TokenType.DATERANGE,
513        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
514        "UNIQUE": TokenType.UNIQUE,
515        "VECTOR": TokenType.VECTOR,
516        "STRUCT": TokenType.STRUCT,
517        "SEQUENCE": TokenType.SEQUENCE,
518        "VARIANT": TokenType.VARIANT,
519        "ALTER": TokenType.ALTER,
520        "ANALYZE": TokenType.ANALYZE,
521        "CALL": TokenType.COMMAND,
522        "COMMENT": TokenType.COMMENT,
523        "EXPLAIN": TokenType.COMMAND,
524        "GRANT": TokenType.GRANT,
525        "REVOKE": TokenType.REVOKE,
526        "OPTIMIZE": TokenType.COMMAND,
527        "PREPARE": TokenType.COMMAND,
528        "VACUUM": TokenType.COMMAND,
529        "USER-DEFINED": TokenType.USERDEFINED,
530        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
531        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
532    }
533
534    COMMANDS = {
535        TokenType.COMMAND,
536        TokenType.EXECUTE,
537        TokenType.FETCH,
538        TokenType.SHOW,
539        TokenType.RENAME,
540    }
541
542    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
543
544    # Handle numeric literals like in hive (3L = BIGINT)
545    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}
546
547    # In tokenizers like JSONPath, dots are always key separators, never decimal points
548    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True
549
550    COMMENTS = ["--", ("/*", "*/")]
551
552    _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache()
553
554    __slots__ = (
555        "dialect",
556        "_core",
557    )
558
559    def __init__(self, dialect: DialectType = None) -> None:
560        from sqlglot.dialects.dialect import Dialect
561
562        self.dialect = Dialect.get_or_raise(dialect)
563        self._core = self._core_cache.get_or_build(type(self), self._init_core)
564
565    def _init_core(self) -> TokenizerCore:
566        return TokenizerCore(
567            single_tokens=self.SINGLE_TOKENS,
568            keywords=self.KEYWORDS,
569            quotes=self._QUOTES,
570            format_strings=self._FORMAT_STRINGS,
571            identifiers=self._IDENTIFIERS,
572            comments=self._COMMENTS,
573            string_escapes=self._STRING_ESCAPES,
574            byte_string_escapes=self._BYTE_STRING_ESCAPES,
575            identifier_escapes=self._IDENTIFIER_ESCAPES,
576            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
577            commands=self.COMMANDS,
578            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
579            nested_comments=self.NESTED_COMMENTS,
580            hint_start=self.HINT_START,
581            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
582            has_bit_strings=bool(self.BIT_STRINGS),
583            has_hex_strings=bool(self.HEX_STRINGS),
584            numeric_literals=self.NUMERIC_LITERALS,
585            var_single_tokens=self.VAR_SINGLE_TOKENS,
586            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
587            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
588            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
589            keyword_trie=self._KEYWORD_TRIE,
590            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
591            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
592            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
593            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
594        )
595
596    def tokenize(self, sql: str) -> list[Token]:
597        """Returns a list of tokens corresponding to the SQL string `sql`."""
598        return self._core.tokenize(sql)  # type: ignore
599
600    @property
601    def sql(self) -> str:
602        """The SQL string being tokenized."""
603        return self._core.sql
604
605    @property
606    def size(self) -> int:
607        """Length of the SQL string."""
608        return self._core.size
609
610    @property
611    def tokens(self) -> list[Token]:
612        """The list of tokens produced by tokenization."""
613        return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
559    def __init__(self, dialect: DialectType = None) -> None:
560        from sqlglot.dialects.dialect import Dialect
561
562        self.dialect = Dialect.get_or_raise(dialect)
563        self._core = self._core_cache.get_or_build(type(self), self._init_core)
SINGLE_TOKENS = {'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 36>, '^': <TokenType.CARET: 42>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 28>, '>': <TokenType.GT: 25>, '<': <TokenType.LT: 23>, '%': <TokenType.MOD: 328>, '!': <TokenType.NOT: 27>, '|': <TokenType.PIPE: 39>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 19>, '/': <TokenType.SLASH: 22>, '\\': <TokenType.BACKSLASH: 21>, '*': <TokenType.STAR: 20>, '~': <TokenType.TILDE: 44>, '?': <TokenType.PLACEHOLDER: 355>, '@': <TokenType.PARAMETER: 57>, '#': <TokenType.HASH: 48>, "'": <TokenType.UNKNOWN: 213>, '`': <TokenType.UNKNOWN: 213>, '"': <TokenType.UNKNOWN: 213>}
BIT_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
BYTE_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
HEX_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
RAW_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
HEREDOC_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
UNICODE_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
IDENTIFIERS: ClassVar[list[tuple[str, str] | str]] = ['"']
QUOTES: ClassVar[list[tuple[str, str] | str]] = ["'"]
STRING_ESCAPES: ClassVar[list[str]] = ["'"]
BYTE_STRING_ESCAPES: ClassVar[list[str]] = []
VAR_SINGLE_TOKENS: ClassVar[set[str]] = set()
ESCAPE_FOLLOW_CHARS: ClassVar[list[str]] = []
IDENTIFIER_ESCAPES: ClassVar[list[str]] = []
HEREDOC_TAG_IS_IDENTIFIER = False
HEREDOC_STRING_ALTERNATIVE = <TokenType.VAR: 88>
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
NESTED_COMMENTS = True
HINT_START = '/*+'
TOKENS_PRECEDING_HINT = {<TokenType.UPDATE: 416>, <TokenType.SELECT: 385>, <TokenType.INSERT: 299>, <TokenType.DELETE: 256>}
KEYWORDS: ClassVar[dict[str, sqlglot.tokenizer_core.TokenType]] = {'{%': <TokenType.BLOCK_START: 72>, '{%+': <TokenType.BLOCK_START: 72>, '{%-': <TokenType.BLOCK_START: 72>, '%}': <TokenType.BLOCK_END: 73>, '+%}': <TokenType.BLOCK_END: 73>, '-%}': <TokenType.BLOCK_END: 73>, '{{+': <TokenType.BLOCK_START: 72>, '{{-': <TokenType.BLOCK_START: 72>, '+}}': <TokenType.BLOCK_END: 73>, '-}}': <TokenType.BLOCK_END: 73>, '/*+': <TokenType.HINT: 292>, '&<': <TokenType.AMP_LT: 62>, '&>': <TokenType.AMP_GT: 63>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 368>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '<<->>': <TokenType.LLRR_ARROW: 52>, '&&': <TokenType.DAMP: 61>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 286>, '~~': <TokenType.LIKE: 317>, '~~*': <TokenType.ILIKE: 294>, '~*': <TokenType.IRLIKE: 306>, '-|-': <TokenType.ADJACENT: 64>, 'ALL': <TokenType.ALL: 219>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 220>, 'ANY': <TokenType.ANY: 221>, 'ASC': <TokenType.ASC: 224>, 'AS': <TokenType.ALIAS: 217>, 'ASOF': <TokenType.ASOF: 225>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 227>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 227>, 'BEGIN': <TokenType.BEGIN: 228>, 'BETWEEN': <TokenType.BETWEEN: 229>, 'CACHE': <TokenType.CACHE: 231>, 'UNCACHE': <TokenType.UNCACHE: 412>, 'CASE': <TokenType.CASE: 232>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 233>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 234>, 'COLLATE': <TokenType.COLLATE: 235>, 'COLUMN': <TokenType.COLUMN: 80>, 'COMMIT': <TokenType.COMMIT: 238>, 'CONNECT BY': <TokenType.CONNECT_BY: 239>, 
'CONSTRAINT': <TokenType.CONSTRAINT: 240>, 'COPY': <TokenType.COPY: 241>, 'CREATE': <TokenType.CREATE: 242>, 'CROSS': <TokenType.CROSS: 243>, 'CUBE': <TokenType.CUBE: 244>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 245>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 247>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 248>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 249>, 'CURRENT_USER': <TokenType.CURRENT_USER: 250>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 253>, 'DATABASE': <TokenType.DATABASE: 79>, 'DEFAULT': <TokenType.DEFAULT: 255>, 'DELETE': <TokenType.DELETE: 256>, 'DESC': <TokenType.DESC: 257>, 'DESCRIBE': <TokenType.DESCRIBE: 258>, 'DISTINCT': <TokenType.DISTINCT: 261>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 262>, 'DIV': <TokenType.DIV: 263>, 'DROP': <TokenType.DROP: 264>, 'ELSE': <TokenType.ELSE: 265>, 'END': <TokenType.END: 266>, 'ENUM': <TokenType.ENUM: 204>, 'ESCAPE': <TokenType.ESCAPE: 267>, 'EXCEPT': <TokenType.EXCEPT: 268>, 'EXECUTE': <TokenType.EXECUTE: 269>, 'EXISTS': <TokenType.EXISTS: 270>, 'FALSE': <TokenType.FALSE: 271>, 'FETCH': <TokenType.FETCH: 272>, 'FILTER': <TokenType.FILTER: 275>, 'FILE': <TokenType.FILE: 273>, 'FIRST': <TokenType.FIRST: 277>, 'FULL': <TokenType.FULL: 283>, 'FUNCTION': <TokenType.FUNCTION: 284>, 'FOR': <TokenType.FOR: 278>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 280>, 'FORMAT': <TokenType.FORMAT: 281>, 'FROM': <TokenType.FROM: 282>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 171>, 'GEOMETRY': <TokenType.GEOMETRY: 174>, 'GLOB': <TokenType.GLOB: 286>, 'GROUP BY': <TokenType.GROUP_BY: 289>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 290>, 'HAVING': <TokenType.HAVING: 291>, 'ILIKE': <TokenType.ILIKE: 294>, 'IN': <TokenType.IN: 295>, 'INDEX': <TokenType.INDEX: 296>, 'INET': <TokenType.INET: 199>, 'INNER': <TokenType.INNER: 298>, 'INSERT': <TokenType.INSERT: 299>, 'INTERVAL': <TokenType.INTERVAL: 303>, 'INTERSECT': <TokenType.INTERSECT: 302>, 'INTO': <TokenType.INTO: 304>, 'IS': <TokenType.IS: 307>, 'ISNULL': 
<TokenType.ISNULL: 308>, 'JOIN': <TokenType.JOIN: 309>, 'KEEP': <TokenType.KEEP: 311>, 'KILL': <TokenType.KILL: 313>, 'LATERAL': <TokenType.LATERAL: 315>, 'LEFT': <TokenType.LEFT: 316>, 'LIKE': <TokenType.LIKE: 317>, 'LIMIT': <TokenType.LIMIT: 318>, 'LOAD': <TokenType.LOAD: 320>, 'LOCALTIME': <TokenType.LOCALTIME: 178>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 179>, 'LOCK': <TokenType.LOCK: 321>, 'MERGE': <TokenType.MERGE: 327>, 'NAMESPACE': <TokenType.NAMESPACE: 439>, 'NATURAL': <TokenType.NATURAL: 330>, 'NEXT': <TokenType.NEXT: 331>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 333>, 'NULL': <TokenType.NULL: 334>, 'OBJECT': <TokenType.OBJECT: 198>, 'OFFSET': <TokenType.OFFSET: 336>, 'ON': <TokenType.ON: 337>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 65>, 'ORDER BY': <TokenType.ORDER_BY: 340>, 'ORDINALITY': <TokenType.ORDINALITY: 343>, 'OUT': <TokenType.OUT: 344>, 'OUTER': <TokenType.OUTER: 346>, 'OVER': <TokenType.OVER: 347>, 'OVERLAPS': <TokenType.OVERLAPS: 348>, 'OVERWRITE': <TokenType.OVERWRITE: 349>, 'PARTITION': <TokenType.PARTITION: 351>, 'PARTITION BY': <TokenType.PARTITION_BY: 352>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 352>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 352>, 'PERCENT': <TokenType.PERCENT: 353>, 'PIVOT': <TokenType.PIVOT: 354>, 'PRAGMA': <TokenType.PRAGMA: 359>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 361>, 'PROCEDURE': <TokenType.PROCEDURE: 362>, 'OPERATOR': <TokenType.OPERATOR: 339>, 'QUALIFY': <TokenType.QUALIFY: 366>, 'RANGE': <TokenType.RANGE: 369>, 'RECURSIVE': <TokenType.RECURSIVE: 370>, 'REGEXP': <TokenType.RLIKE: 378>, 'RENAME': <TokenType.RENAME: 372>, 'REPLACE': <TokenType.REPLACE: 373>, 'RETURNING': <TokenType.RETURNING: 374>, 'REFERENCES': <TokenType.REFERENCES: 376>, 'RIGHT': <TokenType.RIGHT: 377>, 'RLIKE': <TokenType.RLIKE: 378>, 'ROLLBACK': <TokenType.ROLLBACK: 380>, 'ROLLUP': <TokenType.ROLLUP: 381>, 'ROW': <TokenType.ROW: 382>, 'ROWS': <TokenType.ROWS: 383>, 'SCHEMA': <TokenType.SCHEMA: 82>, 
'SELECT': <TokenType.SELECT: 385>, 'SEMI': <TokenType.SEMI: 386>, 'SESSION': <TokenType.SESSION: 58>, 'SESSION_USER': <TokenType.SESSION_USER: 60>, 'SET': <TokenType.SET: 390>, 'SETTINGS': <TokenType.SETTINGS: 391>, 'SHOW': <TokenType.SHOW: 392>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 393>, 'SOME': <TokenType.SOME: 394>, 'SORT BY': <TokenType.SORT_BY: 395>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 397>, 'START WITH': <TokenType.START_WITH: 398>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 400>, 'TABLE': <TokenType.TABLE: 83>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 403>, 'TEMP': <TokenType.TEMPORARY: 405>, 'TEMPORARY': <TokenType.TEMPORARY: 405>, 'THEN': <TokenType.THEN: 407>, 'TRUE': <TokenType.TRUE: 408>, 'TRUNCATE': <TokenType.TRUNCATE: 409>, 'TRIGGER': <TokenType.TRIGGER: 410>, 'UNION': <TokenType.UNION: 413>, 'UNKNOWN': <TokenType.UNKNOWN: 213>, 'UNNEST': <TokenType.UNNEST: 414>, 'UNPIVOT': <TokenType.UNPIVOT: 415>, 'UPDATE': <TokenType.UPDATE: 416>, 'USE': <TokenType.USE: 417>, 'USING': <TokenType.USING: 418>, 'UUID': <TokenType.UUID: 170>, 'VALUES': <TokenType.VALUES: 419>, 'VIEW': <TokenType.VIEW: 421>, 'VOLATILE': <TokenType.VOLATILE: 423>, 'WHEN': <TokenType.WHEN: 425>, 'WHERE': <TokenType.WHERE: 426>, 'WINDOW': <TokenType.WINDOW: 427>, 'WITH': <TokenType.WITH: 428>, 'APPLY': <TokenType.APPLY: 222>, 'ARRAY': <TokenType.ARRAY: 223>, 'BIT': <TokenType.BIT: 96>, 'BOOL': <TokenType.BOOLEAN: 97>, 'BOOLEAN': <TokenType.BOOLEAN: 97>, 'BYTE': <TokenType.TINYINT: 98>, 'MEDIUMINT': <TokenType.MEDIUMINT: 102>, 'INT1': <TokenType.TINYINT: 98>, 'TINYINT': <TokenType.TINYINT: 98>, 'INT16': <TokenType.SMALLINT: 100>, 'SHORT': <TokenType.SMALLINT: 100>, 'SMALLINT': <TokenType.SMALLINT: 100>, 'HUGEINT': <TokenType.INT128: 109>, 'UHUGEINT': <TokenType.UINT128: 110>, 'INT2': <TokenType.SMALLINT: 100>, 'INTEGER': <TokenType.INT: 104>, 'INT': <TokenType.INT: 104>, 'INT4': <TokenType.INT: 104>, 'INT32': <TokenType.INT: 104>, 'INT64': <TokenType.BIGINT: 106>, 'INT128': 
<TokenType.INT128: 109>, 'INT256': <TokenType.INT256: 111>, 'LONG': <TokenType.BIGINT: 106>, 'BIGINT': <TokenType.BIGINT: 106>, 'INT8': <TokenType.TINYINT: 98>, 'UINT': <TokenType.UINT: 105>, 'UINT128': <TokenType.UINT128: 110>, 'UINT256': <TokenType.UINT256: 112>, 'DEC': <TokenType.DECIMAL: 116>, 'DECIMAL': <TokenType.DECIMAL: 116>, 'DECIMAL32': <TokenType.DECIMAL32: 117>, 'DECIMAL64': <TokenType.DECIMAL64: 118>, 'DECIMAL128': <TokenType.DECIMAL128: 119>, 'DECIMAL256': <TokenType.DECIMAL256: 120>, 'DECFLOAT': <TokenType.DECFLOAT: 121>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 123>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 123>, 'BIGNUM': <TokenType.BIGNUM: 108>, 'LIST': <TokenType.LIST: 319>, 'MAP': <TokenType.MAP: 322>, 'NULLABLE': <TokenType.NULLABLE: 173>, 'NUMBER': <TokenType.DECIMAL: 116>, 'NUMERIC': <TokenType.DECIMAL: 116>, 'FIXED': <TokenType.DECIMAL: 116>, 'REAL': <TokenType.FLOAT: 113>, 'FLOAT': <TokenType.FLOAT: 113>, 'FLOAT4': <TokenType.FLOAT: 113>, 'FLOAT8': <TokenType.DOUBLE: 114>, 'DOUBLE': <TokenType.DOUBLE: 114>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 114>, 'JSON': <TokenType.JSON: 140>, 'JSONB': <TokenType.JSONB: 141>, 'CHAR': <TokenType.CHAR: 124>, 'CHARACTER': <TokenType.CHAR: 124>, 'CHAR VARYING': <TokenType.VARCHAR: 126>, 'CHARACTER VARYING': <TokenType.VARCHAR: 126>, 'NCHAR': <TokenType.NCHAR: 125>, 'VARCHAR': <TokenType.VARCHAR: 126>, 'VARCHAR2': <TokenType.VARCHAR: 126>, 'NVARCHAR': <TokenType.NVARCHAR: 127>, 'NVARCHAR2': <TokenType.NVARCHAR: 127>, 'BPCHAR': <TokenType.BPCHAR: 128>, 'STR': <TokenType.TEXT: 129>, 'STRING': <TokenType.TEXT: 129>, 'TEXT': <TokenType.TEXT: 129>, 'LONGTEXT': <TokenType.LONGTEXT: 131>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 130>, 'TINYTEXT': <TokenType.TINYTEXT: 136>, 'CLOB': <TokenType.TEXT: 129>, 'LONGVARCHAR': <TokenType.TEXT: 129>, 'BINARY': <TokenType.BINARY: 138>, 'BLOB': <TokenType.VARBINARY: 139>, 'LONGBLOB': <TokenType.LONGBLOB: 134>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 133>, 'TINYBLOB': <TokenType.TINYBLOB: 
135>, 'BYTEA': <TokenType.VARBINARY: 139>, 'VARBINARY': <TokenType.VARBINARY: 139>, 'TIME': <TokenType.TIME: 142>, 'TIMETZ': <TokenType.TIMETZ: 143>, 'TIME_NS': <TokenType.TIME_NS: 144>, 'TIMESTAMP': <TokenType.TIMESTAMP: 145>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 146>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 147>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 147>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 148>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 148>, 'DATE': <TokenType.DATE: 156>, 'DATETIME': <TokenType.DATETIME: 152>, 'INT4RANGE': <TokenType.INT4RANGE: 158>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 159>, 'INT8RANGE': <TokenType.INT8RANGE: 160>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 161>, 'NUMRANGE': <TokenType.NUMRANGE: 162>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 163>, 'TSRANGE': <TokenType.TSRANGE: 164>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 165>, 'TSTZRANGE': <TokenType.TSTZRANGE: 166>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 167>, 'DATERANGE': <TokenType.DATERANGE: 168>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 169>, 'UNIQUE': <TokenType.UNIQUE: 429>, 'VECTOR': <TokenType.VECTOR: 214>, 'STRUCT': <TokenType.STRUCT: 401>, 'SEQUENCE': <TokenType.SEQUENCE: 388>, 'VARIANT': <TokenType.VARIANT: 197>, 'ALTER': <TokenType.ALTER: 218>, 'ANALYZE': <TokenType.ANALYZE: 438>, 'CALL': <TokenType.COMMAND: 236>, 'COMMENT': <TokenType.COMMENT: 237>, 'EXPLAIN': <TokenType.COMMAND: 236>, 'GRANT': <TokenType.GRANT: 288>, 'REVOKE': <TokenType.REVOKE: 375>, 'OPTIMIZE': <TokenType.COMMAND: 236>, 'PREPARE': <TokenType.COMMAND: 236>, 'VACUUM': <TokenType.COMMAND: 236>, 'USER-DEFINED': <TokenType.USERDEFINED: 192>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 433>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 434>}
COMMANDS = {<TokenType.SHOW: 392>, <TokenType.COMMAND: 236>, <TokenType.EXECUTE: 269>, <TokenType.FETCH: 272>, <TokenType.RENAME: 372>}
COMMAND_PREFIX_TOKENS = {<TokenType.SEMICOLON: 19>, <TokenType.BEGIN: 228>}
NUMERIC_LITERALS: ClassVar[dict[str, str]] = {}
NUMBERS_CAN_HAVE_DECIMALS: ClassVar[bool] = True
COMMENTS = ['--', ('/*', '*/')]
dialect
def tokenize(self, sql: str) -> list[sqlglot.tokenizer_core.Token]:
596    def tokenize(self, sql: str) -> list[Token]:
597        """Returns a list of tokens corresponding to the SQL string `sql`."""
598        return self._core.tokenize(sql)  # type: ignore

Returns a list of tokens corresponding to the SQL string `sql`.

sql: str
600    @property
601    def sql(self) -> str:
602        """The SQL string being tokenized."""
603        return self._core.sql

The SQL string being tokenized.

size: int
605    @property
606    def size(self) -> int:
607        """Length of the SQL string."""
608        return self._core.size

Length of the SQL string.

tokens: list[sqlglot.tokenizer_core.Token]
610    @property
611    def tokens(self) -> list[Token]:
612        """The list of tokens produced by tokenization."""
613        return self._core.tokens

The list of tokens produced by tokenization.