Edit on GitHub

sqlglot.tokens

  1from __future__ import annotations
  2
  3import threading
  4import typing as t
  5
  6from sqlglot.trie import new_trie
  7
  8from sqlglot.tokenizer_core import Token, TokenizerCore, TokenType
  9
 10T = t.TypeVar("T")
 11
 12
 13class ThreadLocalCache(threading.local):
 14    """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""
 15
 16    def __init__(self) -> None:
 17        self.cache: dict[type, t.Any] = {}
 18
 19    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
 20        if not (obj := self.cache.get(key)):
 21            self.cache[key] = obj = build()
 22        return obj
 23
 24
try:
    # Optional C accelerator: importing it is enough — presumably the import
    # registers a faster tokenizer implementation (NOTE(review): confirm the
    # import side effect against the sqlglotc package).
    import sqlglotc  # noqa: F401
except ImportError:
    pass

try:
    # Legacy Rust accelerator: still importable, but deprecated in favor of
    # sqlglotc. Only warn when the C accelerator is NOT installed.
    import sqlglotrs  # type: ignore # noqa: F401
    import warnings

    if "sqlglotc" not in globals():
        warnings.warn(
            "sqlglot[rs] is deprecated and no longer compatible with sqlglot. "
            "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]",
        )
except ImportError:
    pass
 41
 42if t.TYPE_CHECKING:
 43    from sqlglot.dialects.dialect import DialectType
 44
 45
 46def _convert_quotes(arr: list[str | tuple[str, str]]) -> dict[str, str]:
 47    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)
 48
 49
 50def _quotes_to_format(
 51    token_type: TokenType, arr: list[str | tuple[str, str]]
 52) -> dict[str, tuple[str, TokenType]]:
 53    return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
 54
 55
class _TokenizerBase:
    """Class-building machinery shared by all tokenizers.

    Subclasses declare the public ClassVars (QUOTES, KEYWORDS, COMMENTS, ...);
    ``__init_subclass__`` then derives the private precomputed lookup
    structures (``_QUOTES``, ``_FORMAT_STRINGS``, ``_KEYWORD_TRIE``, ...) once
    at class-creation time, so instances never pay the conversion cost.
    """

    # --- Declared by subclasses -------------------------------------------
    QUOTES: t.ClassVar[list[tuple[str, str] | str]]
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]]
    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    STRING_ESCAPES: t.ClassVar[list[str]]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]]
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]]
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]]
    HINT_START: t.ClassVar[str]
    KEYWORDS: t.ClassVar[dict[str, TokenType]]
    SINGLE_TOKENS: t.ClassVar[dict[str, TokenType]]
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]]
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]]
    COMMANDS: t.ClassVar[set[TokenType]]
    COMMAND_PREFIX_TOKENS: t.ClassVar[set[TokenType]]
    HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool]
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool]
    NESTED_COMMENTS: t.ClassVar[bool]
    TOKENS_PRECEDING_HINT: t.ClassVar[set[TokenType]]
    HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType]
    COMMENTS: t.ClassVar[list[str | tuple[str, str]]]
    # --- Derived below in __init_subclass__ -------------------------------
    _QUOTES: t.ClassVar[dict[str, str]]
    _IDENTIFIERS: t.ClassVar[dict[str, str]]
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]]
    _STRING_ESCAPES: t.ClassVar[set[str]]
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]]
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]]
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]]
    _COMMENTS: t.ClassVar[dict[str, str | None]]
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]]

    @classmethod
    def __init_subclass__(cls, **kwargs: t.Any) -> None:
        super().__init_subclass__(**kwargs)
        cls._QUOTES = _convert_quotes(cls.QUOTES)
        cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS)
        # Prefixed/formatted string literals: N'...' national strings are
        # synthesized from the plain quotes; the rest come from the dedicated
        # per-kind lists declared on the subclass.
        cls._FORMAT_STRINGS = {
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in cls._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS),
            **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS),
            **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS),
        }
        # Unless the subclass itself overrides BYTE_STRING_ESCAPES (checked via
        # cls.__dict__ so inherited values don't count), default to a copy of
        # the regular string escapes.
        if "BYTE_STRING_ESCAPES" not in cls.__dict__:
            cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy()
        cls._STRING_ESCAPES = set(cls.STRING_ESCAPES)
        cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES)
        cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS)
        cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES)
        # Comment markers: bare strings are line comments (no terminator),
        # pairs are block comments mapped opener -> closer.
        cls._COMMENTS = {
            **{c: None for c in cls.COMMENTS if isinstance(c, str)},
            **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)},
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }
        # If the hint opener is a keyword (e.g. "/*+"), also register it as a
        # block comment terminated by "*/".
        if cls.HINT_START in cls.KEYWORDS:
            cls._COMMENTS[cls.HINT_START] = "*/"
        # Trie over multi-word keywords and keywords containing a single-token
        # character; single-word keywords are resolved by plain dict lookup,
        # so only these ambiguous keys need the trie.
        cls._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *cls.KEYWORDS,
                *cls._COMMENTS,
                *cls._QUOTES,
                *cls._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in cls.SINGLE_TOKENS)
        )
133
134
class Tokenizer(_TokenizerBase):
    """Base SQL tokenizer.

    Dialect-specific tokenizers subclass this and override the ClassVars
    (QUOTES, KEYWORDS, COMMENTS, ...); `_TokenizerBase.__init_subclass__`
    precomputes the derived lookup tables. The actual scanning is delegated
    to a `TokenizerCore` instance cached per thread and per tokenizer class.
    """

    # Single-character tokens resolved by direct dict lookup.
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDE,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        "#": TokenType.HASH,
        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
        "'": TokenType.UNKNOWN,
        "`": TokenType.UNKNOWN,
        '"': TokenType.UNKNOWN,
    }

    # String-literal delimiter specs; empty by default, overridden per dialect.
    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []

    # The strings in this list can always be used as escapes, regardless of the surrounding
    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []

    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
    HEREDOC_TAG_IS_IDENTIFIER = False

    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR

    # Whether string escape characters function as such when placed within raw strings
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True

    NESTED_COMMENTS = True

    HINT_START = "/*+"

    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}

    # Autofilled
    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
    _QUOTES: t.ClassVar[dict[str, str]] = {}
    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()

    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
        # Jinja/templating block delimiters ({% %}, {{+ -}}, ...).
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        HINT_START: TokenType.HINT,
        "&<": TokenType.AMP_LT,
        "&>": TokenType.AMP_GT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "?::": TokenType.QDCOLON,
        "||": TokenType.DPIPE,
        "|>": TokenType.PIPE_GT,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "~~~": TokenType.GLOB,
        "~~": TokenType.LIKE,
        "~~*": TokenType.ILIKE,
        "~*": TokenType.IRLIKE,
        "-|-": TokenType.ADJACENT,
        "ALL": TokenType.ALL,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "COPY": TokenType.COPY,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ENUM": TokenType.ENUM,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FILE": TokenType.FILE,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCALTIME": TokenType.LOCALTIME,
        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NAMESPACE": TokenType.NAMESPACE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUT": TokenType.OUT,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "OPERATOR": TokenType.OPERATOR,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "RENAME": TokenType.RENAME,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SESSION": TokenType.SESSION,
        "SESSION_USER": TokenType.SESSION_USER,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "SQL SECURITY": TokenType.SQL_SECURITY,
        "START WITH": TokenType.START_WITH,
        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRIGGER": TokenType.TRIGGER,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        # Type keywords follow; many dialect-specific names map onto a
        # canonical TokenType (e.g. INT1/TINYINT/INT8 -> TINYINT).
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "HUGEINT": TokenType.INT128,
        "UHUGEINT": TokenType.UINT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "INT128": TokenType.INT128,
        "INT256": TokenType.INT256,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "UINT": TokenType.UINT,
        "UINT128": TokenType.UINT128,
        "UINT256": TokenType.UINT256,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "DECIMAL32": TokenType.DECIMAL32,
        "DECIMAL64": TokenType.DECIMAL64,
        "DECIMAL128": TokenType.DECIMAL128,
        "DECIMAL256": TokenType.DECIMAL256,
        "DECFLOAT": TokenType.DECFLOAT,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "BIGNUM": TokenType.BIGNUM,
        "LIST": TokenType.LIST,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "JSONB": TokenType.JSONB,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "CHAR VARYING": TokenType.VARCHAR,
        "CHARACTER VARYING": TokenType.VARCHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "BPCHAR": TokenType.BPCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIME_NS": TokenType.TIME_NS,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "VECTOR": TokenType.VECTOR,
        "STRUCT": TokenType.STRUCT,
        "SEQUENCE": TokenType.SEQUENCE,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.ANALYZE,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.GRANT,
        "REVOKE": TokenType.REVOKE,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    # Tokens after which the rest of the statement is swallowed verbatim.
    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}

    # In tokenizers like JSONPath, dots are always key separators, never decimal points
    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True

    COMMENTS = ["--", ("/*", "*/")]

    # Per-thread cache of TokenizerCore instances, keyed by tokenizer class.
    # NOTE(review): the key is type(self) only, while _init_core also reads
    # dialect-level flags — this assumes those flags are uniform for all
    # instances of a given tokenizer class; confirm against Dialect usage.
    _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache()

    __slots__ = (
        "dialect",
        "_core",
    )

    def __init__(self, dialect: DialectType = None) -> None:
        """Resolve `dialect` and attach a (per-thread, per-class) TokenizerCore."""
        # Imported locally to avoid a circular import with the dialect module.
        from sqlglot.dialects.dialect import Dialect

        self.dialect = Dialect.get_or_raise(dialect)
        self._core = self._core_cache.get_or_build(type(self), self._init_core)

    def _init_core(self) -> TokenizerCore:
        """Build the TokenizerCore from this class's precomputed tables and dialect flags."""
        return TokenizerCore(
            single_tokens=self.SINGLE_TOKENS,
            keywords=self.KEYWORDS,
            quotes=self._QUOTES,
            format_strings=self._FORMAT_STRINGS,
            identifiers=self._IDENTIFIERS,
            comments=self._COMMENTS,
            string_escapes=self._STRING_ESCAPES,
            byte_string_escapes=self._BYTE_STRING_ESCAPES,
            identifier_escapes=self._IDENTIFIER_ESCAPES,
            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
            commands=self.COMMANDS,
            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
            nested_comments=self.NESTED_COMMENTS,
            hint_start=self.HINT_START,
            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
            has_bit_strings=bool(self.BIT_STRINGS),
            has_hex_strings=bool(self.HEX_STRINGS),
            numeric_literals=self.NUMERIC_LITERALS,
            var_single_tokens=self.VAR_SINGLE_TOKENS,
            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
            keyword_trie=self._KEYWORD_TRIE,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
        )

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        return self._core.tokenize(sql)  # type: ignore

    @property
    def sql(self) -> str:
        """The SQL string being tokenized."""
        return self._core.sql

    @property
    def size(self) -> int:
        """Length of the SQL string."""
        return self._core.size

    @property
    def tokens(self) -> list[Token]:
        """The list of tokens produced by tokenization."""
        return self._core.tokens
class ThreadLocalCache(threading.local):
14class ThreadLocalCache(threading.local):
15    """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""
16
17    def __init__(self) -> None:
18        self.cache: dict[type, t.Any] = {}
19
20    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
21        if not (obj := self.cache.get(key)):
22            self.cache[key] = obj = build()
23        return obj

Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.

cache: dict[type, typing.Any]
def get_or_build(self, key: type, build: Callable[[], ~T]) -> ~T:
20    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
21        if not (obj := self.cache.get(key)):
22            self.cache[key] = obj = build()
23        return obj
class Tokenizer(_TokenizerBase):
136class Tokenizer(_TokenizerBase):
137    SINGLE_TOKENS = {
138        "(": TokenType.L_PAREN,
139        ")": TokenType.R_PAREN,
140        "[": TokenType.L_BRACKET,
141        "]": TokenType.R_BRACKET,
142        "{": TokenType.L_BRACE,
143        "}": TokenType.R_BRACE,
144        "&": TokenType.AMP,
145        "^": TokenType.CARET,
146        ":": TokenType.COLON,
147        ",": TokenType.COMMA,
148        ".": TokenType.DOT,
149        "-": TokenType.DASH,
150        "=": TokenType.EQ,
151        ">": TokenType.GT,
152        "<": TokenType.LT,
153        "%": TokenType.MOD,
154        "!": TokenType.NOT,
155        "|": TokenType.PIPE,
156        "+": TokenType.PLUS,
157        ";": TokenType.SEMICOLON,
158        "/": TokenType.SLASH,
159        "\\": TokenType.BACKSLASH,
160        "*": TokenType.STAR,
161        "~": TokenType.TILDE,
162        "?": TokenType.PLACEHOLDER,
163        "@": TokenType.PARAMETER,
164        "#": TokenType.HASH,
165        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
166        "'": TokenType.UNKNOWN,
167        "`": TokenType.UNKNOWN,
168        '"': TokenType.UNKNOWN,
169    }
170
171    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
172    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
173    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
174    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
175    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
176    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
177    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
178    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
179    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
180    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
181    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
182    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []
183
184    # The strings in this list can always be used as escapes, regardless of the surrounding
185    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
186    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
187    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []
188
189    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
190    HEREDOC_TAG_IS_IDENTIFIER = False
191
192    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
193    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR
194
195    # Whether string escape characters function as such when placed within raw strings
196    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
197
198    NESTED_COMMENTS = True
199
200    HINT_START = "/*+"
201
202    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
203
204    # Autofilled
205    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
206    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
207    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
208    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
209    _QUOTES: t.ClassVar[dict[str, str]] = {}
210    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
211    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
212    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
213    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()
214
215    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
216        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
217        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
218        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
219        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
220        HINT_START: TokenType.HINT,
221        "&<": TokenType.AMP_LT,
222        "&>": TokenType.AMP_GT,
223        "==": TokenType.EQ,
224        "::": TokenType.DCOLON,
225        "?::": TokenType.QDCOLON,
226        "||": TokenType.DPIPE,
227        "|>": TokenType.PIPE_GT,
228        ">=": TokenType.GTE,
229        "<=": TokenType.LTE,
230        "<>": TokenType.NEQ,
231        "!=": TokenType.NEQ,
232        ":=": TokenType.COLON_EQ,
233        "<=>": TokenType.NULLSAFE_EQ,
234        "->": TokenType.ARROW,
235        "->>": TokenType.DARROW,
236        "=>": TokenType.FARROW,
237        "#>": TokenType.HASH_ARROW,
238        "#>>": TokenType.DHASH_ARROW,
239        "<->": TokenType.LR_ARROW,
240        "&&": TokenType.DAMP,
241        "??": TokenType.DQMARK,
242        "~~~": TokenType.GLOB,
243        "~~": TokenType.LIKE,
244        "~~*": TokenType.ILIKE,
245        "~*": TokenType.IRLIKE,
246        "-|-": TokenType.ADJACENT,
247        "ALL": TokenType.ALL,
248        "AND": TokenType.AND,
249        "ANTI": TokenType.ANTI,
250        "ANY": TokenType.ANY,
251        "ASC": TokenType.ASC,
252        "AS": TokenType.ALIAS,
253        "ASOF": TokenType.ASOF,
254        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
255        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
256        "BEGIN": TokenType.BEGIN,
257        "BETWEEN": TokenType.BETWEEN,
258        "CACHE": TokenType.CACHE,
259        "UNCACHE": TokenType.UNCACHE,
260        "CASE": TokenType.CASE,
261        "CHARACTER SET": TokenType.CHARACTER_SET,
262        "CLUSTER BY": TokenType.CLUSTER_BY,
263        "COLLATE": TokenType.COLLATE,
264        "COLUMN": TokenType.COLUMN,
265        "COMMIT": TokenType.COMMIT,
266        "CONNECT BY": TokenType.CONNECT_BY,
267        "CONSTRAINT": TokenType.CONSTRAINT,
268        "COPY": TokenType.COPY,
269        "CREATE": TokenType.CREATE,
270        "CROSS": TokenType.CROSS,
271        "CUBE": TokenType.CUBE,
272        "CURRENT_DATE": TokenType.CURRENT_DATE,
273        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
274        "CURRENT_TIME": TokenType.CURRENT_TIME,
275        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
276        "CURRENT_USER": TokenType.CURRENT_USER,
277        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
278        "DATABASE": TokenType.DATABASE,
279        "DEFAULT": TokenType.DEFAULT,
280        "DELETE": TokenType.DELETE,
281        "DESC": TokenType.DESC,
282        "DESCRIBE": TokenType.DESCRIBE,
283        "DISTINCT": TokenType.DISTINCT,
284        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
285        "DIV": TokenType.DIV,
286        "DROP": TokenType.DROP,
287        "ELSE": TokenType.ELSE,
288        "END": TokenType.END,
289        "ENUM": TokenType.ENUM,
290        "ESCAPE": TokenType.ESCAPE,
291        "EXCEPT": TokenType.EXCEPT,
292        "EXECUTE": TokenType.EXECUTE,
293        "EXISTS": TokenType.EXISTS,
294        "FALSE": TokenType.FALSE,
295        "FETCH": TokenType.FETCH,
296        "FILTER": TokenType.FILTER,
297        "FILE": TokenType.FILE,
298        "FIRST": TokenType.FIRST,
299        "FULL": TokenType.FULL,
300        "FUNCTION": TokenType.FUNCTION,
301        "FOR": TokenType.FOR,
302        "FOREIGN KEY": TokenType.FOREIGN_KEY,
303        "FORMAT": TokenType.FORMAT,
304        "FROM": TokenType.FROM,
305        "GEOGRAPHY": TokenType.GEOGRAPHY,
306        "GEOMETRY": TokenType.GEOMETRY,
307        "GLOB": TokenType.GLOB,
308        "GROUP BY": TokenType.GROUP_BY,
309        "GROUPING SETS": TokenType.GROUPING_SETS,
310        "HAVING": TokenType.HAVING,
311        "ILIKE": TokenType.ILIKE,
312        "IN": TokenType.IN,
313        "INDEX": TokenType.INDEX,
314        "INET": TokenType.INET,
315        "INNER": TokenType.INNER,
316        "INSERT": TokenType.INSERT,
317        "INTERVAL": TokenType.INTERVAL,
318        "INTERSECT": TokenType.INTERSECT,
319        "INTO": TokenType.INTO,
320        "IS": TokenType.IS,
321        "ISNULL": TokenType.ISNULL,
322        "JOIN": TokenType.JOIN,
323        "KEEP": TokenType.KEEP,
324        "KILL": TokenType.KILL,
325        "LATERAL": TokenType.LATERAL,
326        "LEFT": TokenType.LEFT,
327        "LIKE": TokenType.LIKE,
328        "LIMIT": TokenType.LIMIT,
329        "LOAD": TokenType.LOAD,
330        "LOCALTIME": TokenType.LOCALTIME,
331        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
332        "LOCK": TokenType.LOCK,
333        "MERGE": TokenType.MERGE,
334        "NAMESPACE": TokenType.NAMESPACE,
335        "NATURAL": TokenType.NATURAL,
336        "NEXT": TokenType.NEXT,
337        "NOT": TokenType.NOT,
338        "NOTNULL": TokenType.NOTNULL,
339        "NULL": TokenType.NULL,
340        "OBJECT": TokenType.OBJECT,
341        "OFFSET": TokenType.OFFSET,
342        "ON": TokenType.ON,
343        "OR": TokenType.OR,
344        "XOR": TokenType.XOR,
345        "ORDER BY": TokenType.ORDER_BY,
346        "ORDINALITY": TokenType.ORDINALITY,
347        "OUT": TokenType.OUT,
348        "OUTER": TokenType.OUTER,
349        "OVER": TokenType.OVER,
350        "OVERLAPS": TokenType.OVERLAPS,
351        "OVERWRITE": TokenType.OVERWRITE,
352        "PARTITION": TokenType.PARTITION,
353        "PARTITION BY": TokenType.PARTITION_BY,
354        "PARTITIONED BY": TokenType.PARTITION_BY,
355        "PARTITIONED_BY": TokenType.PARTITION_BY,
356        "PERCENT": TokenType.PERCENT,
357        "PIVOT": TokenType.PIVOT,
358        "PRAGMA": TokenType.PRAGMA,
359        "PRIMARY KEY": TokenType.PRIMARY_KEY,
360        "PROCEDURE": TokenType.PROCEDURE,
361        "OPERATOR": TokenType.OPERATOR,
362        "QUALIFY": TokenType.QUALIFY,
363        "RANGE": TokenType.RANGE,
364        "RECURSIVE": TokenType.RECURSIVE,
365        "REGEXP": TokenType.RLIKE,
366        "RENAME": TokenType.RENAME,
367        "REPLACE": TokenType.REPLACE,
368        "RETURNING": TokenType.RETURNING,
369        "REFERENCES": TokenType.REFERENCES,
370        "RIGHT": TokenType.RIGHT,
371        "RLIKE": TokenType.RLIKE,
372        "ROLLBACK": TokenType.ROLLBACK,
373        "ROLLUP": TokenType.ROLLUP,
374        "ROW": TokenType.ROW,
375        "ROWS": TokenType.ROWS,
376        "SCHEMA": TokenType.SCHEMA,
377        "SELECT": TokenType.SELECT,
378        "SEMI": TokenType.SEMI,
379        "SESSION": TokenType.SESSION,
380        "SESSION_USER": TokenType.SESSION_USER,
381        "SET": TokenType.SET,
382        "SETTINGS": TokenType.SETTINGS,
383        "SHOW": TokenType.SHOW,
384        "SIMILAR TO": TokenType.SIMILAR_TO,
385        "SOME": TokenType.SOME,
386        "SORT BY": TokenType.SORT_BY,
387        "SQL SECURITY": TokenType.SQL_SECURITY,
388        "START WITH": TokenType.START_WITH,
389        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
390        "TABLE": TokenType.TABLE,
391        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
392        "TEMP": TokenType.TEMPORARY,
393        "TEMPORARY": TokenType.TEMPORARY,
394        "THEN": TokenType.THEN,
395        "TRUE": TokenType.TRUE,
396        "TRUNCATE": TokenType.TRUNCATE,
397        "TRIGGER": TokenType.TRIGGER,
398        "UNION": TokenType.UNION,
399        "UNKNOWN": TokenType.UNKNOWN,
400        "UNNEST": TokenType.UNNEST,
401        "UNPIVOT": TokenType.UNPIVOT,
402        "UPDATE": TokenType.UPDATE,
403        "USE": TokenType.USE,
404        "USING": TokenType.USING,
405        "UUID": TokenType.UUID,
406        "VALUES": TokenType.VALUES,
407        "VIEW": TokenType.VIEW,
408        "VOLATILE": TokenType.VOLATILE,
409        "WHEN": TokenType.WHEN,
410        "WHERE": TokenType.WHERE,
411        "WINDOW": TokenType.WINDOW,
412        "WITH": TokenType.WITH,
413        "APPLY": TokenType.APPLY,
414        "ARRAY": TokenType.ARRAY,
415        "BIT": TokenType.BIT,
416        "BOOL": TokenType.BOOLEAN,
417        "BOOLEAN": TokenType.BOOLEAN,
418        "BYTE": TokenType.TINYINT,
419        "MEDIUMINT": TokenType.MEDIUMINT,
420        "INT1": TokenType.TINYINT,
421        "TINYINT": TokenType.TINYINT,
422        "INT16": TokenType.SMALLINT,
423        "SHORT": TokenType.SMALLINT,
424        "SMALLINT": TokenType.SMALLINT,
425        "HUGEINT": TokenType.INT128,
426        "UHUGEINT": TokenType.UINT128,
427        "INT2": TokenType.SMALLINT,
428        "INTEGER": TokenType.INT,
429        "INT": TokenType.INT,
430        "INT4": TokenType.INT,
431        "INT32": TokenType.INT,
432        "INT64": TokenType.BIGINT,
433        "INT128": TokenType.INT128,
434        "INT256": TokenType.INT256,
435        "LONG": TokenType.BIGINT,
436        "BIGINT": TokenType.BIGINT,
437        "INT8": TokenType.TINYINT,
438        "UINT": TokenType.UINT,
439        "UINT128": TokenType.UINT128,
440        "UINT256": TokenType.UINT256,
441        "DEC": TokenType.DECIMAL,
442        "DECIMAL": TokenType.DECIMAL,
443        "DECIMAL32": TokenType.DECIMAL32,
444        "DECIMAL64": TokenType.DECIMAL64,
445        "DECIMAL128": TokenType.DECIMAL128,
446        "DECIMAL256": TokenType.DECIMAL256,
447        "DECFLOAT": TokenType.DECFLOAT,
448        "BIGDECIMAL": TokenType.BIGDECIMAL,
449        "BIGNUMERIC": TokenType.BIGDECIMAL,
450        "BIGNUM": TokenType.BIGNUM,
451        "LIST": TokenType.LIST,
452        "MAP": TokenType.MAP,
453        "NULLABLE": TokenType.NULLABLE,
454        "NUMBER": TokenType.DECIMAL,
455        "NUMERIC": TokenType.DECIMAL,
456        "FIXED": TokenType.DECIMAL,
457        "REAL": TokenType.FLOAT,
458        "FLOAT": TokenType.FLOAT,
459        "FLOAT4": TokenType.FLOAT,
460        "FLOAT8": TokenType.DOUBLE,
461        "DOUBLE": TokenType.DOUBLE,
462        "DOUBLE PRECISION": TokenType.DOUBLE,
463        "JSON": TokenType.JSON,
464        "JSONB": TokenType.JSONB,
465        "CHAR": TokenType.CHAR,
466        "CHARACTER": TokenType.CHAR,
467        "CHAR VARYING": TokenType.VARCHAR,
468        "CHARACTER VARYING": TokenType.VARCHAR,
469        "NCHAR": TokenType.NCHAR,
470        "VARCHAR": TokenType.VARCHAR,
471        "VARCHAR2": TokenType.VARCHAR,
472        "NVARCHAR": TokenType.NVARCHAR,
473        "NVARCHAR2": TokenType.NVARCHAR,
474        "BPCHAR": TokenType.BPCHAR,
475        "STR": TokenType.TEXT,
476        "STRING": TokenType.TEXT,
477        "TEXT": TokenType.TEXT,
478        "LONGTEXT": TokenType.LONGTEXT,
479        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
480        "TINYTEXT": TokenType.TINYTEXT,
481        "CLOB": TokenType.TEXT,
482        "LONGVARCHAR": TokenType.TEXT,
483        "BINARY": TokenType.BINARY,
484        "BLOB": TokenType.VARBINARY,
485        "LONGBLOB": TokenType.LONGBLOB,
486        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
487        "TINYBLOB": TokenType.TINYBLOB,
488        "BYTEA": TokenType.VARBINARY,
489        "VARBINARY": TokenType.VARBINARY,
490        "TIME": TokenType.TIME,
491        "TIMETZ": TokenType.TIMETZ,
492        "TIME_NS": TokenType.TIME_NS,
493        "TIMESTAMP": TokenType.TIMESTAMP,
494        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
495        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
496        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
497        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
498        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
499        "DATE": TokenType.DATE,
500        "DATETIME": TokenType.DATETIME,
501        "INT4RANGE": TokenType.INT4RANGE,
502        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
503        "INT8RANGE": TokenType.INT8RANGE,
504        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
505        "NUMRANGE": TokenType.NUMRANGE,
506        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
507        "TSRANGE": TokenType.TSRANGE,
508        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
509        "TSTZRANGE": TokenType.TSTZRANGE,
510        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
511        "DATERANGE": TokenType.DATERANGE,
512        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
513        "UNIQUE": TokenType.UNIQUE,
514        "VECTOR": TokenType.VECTOR,
515        "STRUCT": TokenType.STRUCT,
516        "SEQUENCE": TokenType.SEQUENCE,
517        "VARIANT": TokenType.VARIANT,
518        "ALTER": TokenType.ALTER,
519        "ANALYZE": TokenType.ANALYZE,
520        "CALL": TokenType.COMMAND,
521        "COMMENT": TokenType.COMMENT,
522        "EXPLAIN": TokenType.COMMAND,
523        "GRANT": TokenType.GRANT,
524        "REVOKE": TokenType.REVOKE,
525        "OPTIMIZE": TokenType.COMMAND,
526        "PREPARE": TokenType.COMMAND,
527        "VACUUM": TokenType.COMMAND,
528        "USER-DEFINED": TokenType.USERDEFINED,
529        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
530        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
531    }
532
533    COMMANDS = {
534        TokenType.COMMAND,
535        TokenType.EXECUTE,
536        TokenType.FETCH,
537        TokenType.SHOW,
538        TokenType.RENAME,
539    }
540
541    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
542
543    # Handle numeric literals like in hive (3L = BIGINT)
544    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}
545
546    # In tokenizers like JSONPath, dots are always key separators, never decimal points
547    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True
548
549    COMMENTS = ["--", ("/*", "*/")]
550
551    _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache()
552
553    __slots__ = (
554        "dialect",
555        "_core",
556    )
557
558    def __init__(self, dialect: DialectType = None) -> None:
559        from sqlglot.dialects.dialect import Dialect
560
561        self.dialect = Dialect.get_or_raise(dialect)
562        self._core = self._core_cache.get_or_build(type(self), self._init_core)
563
564    def _init_core(self) -> TokenizerCore:
565        return TokenizerCore(
566            single_tokens=self.SINGLE_TOKENS,
567            keywords=self.KEYWORDS,
568            quotes=self._QUOTES,
569            format_strings=self._FORMAT_STRINGS,
570            identifiers=self._IDENTIFIERS,
571            comments=self._COMMENTS,
572            string_escapes=self._STRING_ESCAPES,
573            byte_string_escapes=self._BYTE_STRING_ESCAPES,
574            identifier_escapes=self._IDENTIFIER_ESCAPES,
575            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
576            commands=self.COMMANDS,
577            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
578            nested_comments=self.NESTED_COMMENTS,
579            hint_start=self.HINT_START,
580            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
581            has_bit_strings=bool(self.BIT_STRINGS),
582            has_hex_strings=bool(self.HEX_STRINGS),
583            numeric_literals=self.NUMERIC_LITERALS,
584            var_single_tokens=self.VAR_SINGLE_TOKENS,
585            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
586            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
587            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
588            keyword_trie=self._KEYWORD_TRIE,
589            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
590            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
591            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
592            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
593        )
594
595    def tokenize(self, sql: str) -> list[Token]:
596        """Returns a list of tokens corresponding to the SQL string `sql`."""
597        return self._core.tokenize(sql)  # type: ignore
598
599    @property
600    def sql(self) -> str:
601        """The SQL string being tokenized."""
602        return self._core.sql
603
604    @property
605    def size(self) -> int:
606        """Length of the SQL string."""
607        return self._core.size
608
609    @property
610    def tokens(self) -> list[Token]:
611        """The list of tokens produced by tokenization."""
612        return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
558    def __init__(self, dialect: DialectType = None) -> None:
559        from sqlglot.dialects.dialect import Dialect
560
561        self.dialect = Dialect.get_or_raise(dialect)
562        self._core = self._core_cache.get_or_build(type(self), self._init_core)
SINGLE_TOKENS = {'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 36>, '^': <TokenType.CARET: 42>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 28>, '>': <TokenType.GT: 25>, '<': <TokenType.LT: 23>, '%': <TokenType.MOD: 327>, '!': <TokenType.NOT: 27>, '|': <TokenType.PIPE: 39>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 19>, '/': <TokenType.SLASH: 22>, '\\': <TokenType.BACKSLASH: 21>, '*': <TokenType.STAR: 20>, '~': <TokenType.TILDE: 44>, '?': <TokenType.PLACEHOLDER: 354>, '@': <TokenType.PARAMETER: 56>, '#': <TokenType.HASH: 48>, "'": <TokenType.UNKNOWN: 212>, '`': <TokenType.UNKNOWN: 212>, '"': <TokenType.UNKNOWN: 212>}
BIT_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
BYTE_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
HEX_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
RAW_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
HEREDOC_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
UNICODE_STRINGS: ClassVar[list[tuple[str, str] | str]] = []
IDENTIFIERS: ClassVar[list[tuple[str, str] | str]] = ['"']
QUOTES: ClassVar[list[tuple[str, str] | str]] = ["'"]
STRING_ESCAPES: ClassVar[list[str]] = ["'"]
BYTE_STRING_ESCAPES: ClassVar[list[str]] = []
VAR_SINGLE_TOKENS: ClassVar[set[str]] = set()
ESCAPE_FOLLOW_CHARS: ClassVar[list[str]] = []
IDENTIFIER_ESCAPES: ClassVar[list[str]] = []
HEREDOC_TAG_IS_IDENTIFIER = False
HEREDOC_STRING_ALTERNATIVE = <TokenType.VAR: 87>
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
NESTED_COMMENTS = True
HINT_START = '/*+'
TOKENS_PRECEDING_HINT = {<TokenType.SELECT: 384>, <TokenType.INSERT: 298>, <TokenType.DELETE: 255>, <TokenType.UPDATE: 415>}
KEYWORDS: ClassVar[dict[str, sqlglot.tokenizer_core.TokenType]] = {'{%': <TokenType.BLOCK_START: 71>, '{%+': <TokenType.BLOCK_START: 71>, '{%-': <TokenType.BLOCK_START: 71>, '%}': <TokenType.BLOCK_END: 72>, '+%}': <TokenType.BLOCK_END: 72>, '-%}': <TokenType.BLOCK_END: 72>, '{{+': <TokenType.BLOCK_START: 71>, '{{-': <TokenType.BLOCK_START: 71>, '+}}': <TokenType.BLOCK_END: 72>, '-}}': <TokenType.BLOCK_END: 72>, '/*+': <TokenType.HINT: 291>, '&<': <TokenType.AMP_LT: 61>, '&>': <TokenType.AMP_GT: 62>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 367>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '&&': <TokenType.DAMP: 60>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 285>, '~~': <TokenType.LIKE: 316>, '~~*': <TokenType.ILIKE: 293>, '~*': <TokenType.IRLIKE: 305>, '-|-': <TokenType.ADJACENT: 63>, 'ALL': <TokenType.ALL: 218>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 219>, 'ANY': <TokenType.ANY: 220>, 'ASC': <TokenType.ASC: 223>, 'AS': <TokenType.ALIAS: 216>, 'ASOF': <TokenType.ASOF: 224>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'BEGIN': <TokenType.BEGIN: 227>, 'BETWEEN': <TokenType.BETWEEN: 228>, 'CACHE': <TokenType.CACHE: 230>, 'UNCACHE': <TokenType.UNCACHE: 411>, 'CASE': <TokenType.CASE: 231>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 232>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 233>, 'COLLATE': <TokenType.COLLATE: 234>, 'COLUMN': <TokenType.COLUMN: 79>, 'COMMIT': <TokenType.COMMIT: 237>, 'CONNECT BY': <TokenType.CONNECT_BY: 238>, 'CONSTRAINT': <TokenType.CONSTRAINT: 239>, 
'COPY': <TokenType.COPY: 240>, 'CREATE': <TokenType.CREATE: 241>, 'CROSS': <TokenType.CROSS: 242>, 'CUBE': <TokenType.CUBE: 243>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 244>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 246>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 247>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 248>, 'CURRENT_USER': <TokenType.CURRENT_USER: 249>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 252>, 'DATABASE': <TokenType.DATABASE: 78>, 'DEFAULT': <TokenType.DEFAULT: 254>, 'DELETE': <TokenType.DELETE: 255>, 'DESC': <TokenType.DESC: 256>, 'DESCRIBE': <TokenType.DESCRIBE: 257>, 'DISTINCT': <TokenType.DISTINCT: 260>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 261>, 'DIV': <TokenType.DIV: 262>, 'DROP': <TokenType.DROP: 263>, 'ELSE': <TokenType.ELSE: 264>, 'END': <TokenType.END: 265>, 'ENUM': <TokenType.ENUM: 203>, 'ESCAPE': <TokenType.ESCAPE: 266>, 'EXCEPT': <TokenType.EXCEPT: 267>, 'EXECUTE': <TokenType.EXECUTE: 268>, 'EXISTS': <TokenType.EXISTS: 269>, 'FALSE': <TokenType.FALSE: 270>, 'FETCH': <TokenType.FETCH: 271>, 'FILTER': <TokenType.FILTER: 274>, 'FILE': <TokenType.FILE: 272>, 'FIRST': <TokenType.FIRST: 276>, 'FULL': <TokenType.FULL: 282>, 'FUNCTION': <TokenType.FUNCTION: 283>, 'FOR': <TokenType.FOR: 277>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 279>, 'FORMAT': <TokenType.FORMAT: 280>, 'FROM': <TokenType.FROM: 281>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 170>, 'GEOMETRY': <TokenType.GEOMETRY: 173>, 'GLOB': <TokenType.GLOB: 285>, 'GROUP BY': <TokenType.GROUP_BY: 288>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 289>, 'HAVING': <TokenType.HAVING: 290>, 'ILIKE': <TokenType.ILIKE: 293>, 'IN': <TokenType.IN: 294>, 'INDEX': <TokenType.INDEX: 295>, 'INET': <TokenType.INET: 198>, 'INNER': <TokenType.INNER: 297>, 'INSERT': <TokenType.INSERT: 298>, 'INTERVAL': <TokenType.INTERVAL: 302>, 'INTERSECT': <TokenType.INTERSECT: 301>, 'INTO': <TokenType.INTO: 303>, 'IS': <TokenType.IS: 306>, 'ISNULL': <TokenType.ISNULL: 307>, 'JOIN': <TokenType.JOIN: 308>, 
'KEEP': <TokenType.KEEP: 310>, 'KILL': <TokenType.KILL: 312>, 'LATERAL': <TokenType.LATERAL: 314>, 'LEFT': <TokenType.LEFT: 315>, 'LIKE': <TokenType.LIKE: 316>, 'LIMIT': <TokenType.LIMIT: 317>, 'LOAD': <TokenType.LOAD: 319>, 'LOCALTIME': <TokenType.LOCALTIME: 177>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 178>, 'LOCK': <TokenType.LOCK: 320>, 'MERGE': <TokenType.MERGE: 326>, 'NAMESPACE': <TokenType.NAMESPACE: 438>, 'NATURAL': <TokenType.NATURAL: 329>, 'NEXT': <TokenType.NEXT: 330>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 332>, 'NULL': <TokenType.NULL: 333>, 'OBJECT': <TokenType.OBJECT: 197>, 'OFFSET': <TokenType.OFFSET: 335>, 'ON': <TokenType.ON: 336>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 64>, 'ORDER BY': <TokenType.ORDER_BY: 339>, 'ORDINALITY': <TokenType.ORDINALITY: 342>, 'OUT': <TokenType.OUT: 343>, 'OUTER': <TokenType.OUTER: 345>, 'OVER': <TokenType.OVER: 346>, 'OVERLAPS': <TokenType.OVERLAPS: 347>, 'OVERWRITE': <TokenType.OVERWRITE: 348>, 'PARTITION': <TokenType.PARTITION: 350>, 'PARTITION BY': <TokenType.PARTITION_BY: 351>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 351>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 351>, 'PERCENT': <TokenType.PERCENT: 352>, 'PIVOT': <TokenType.PIVOT: 353>, 'PRAGMA': <TokenType.PRAGMA: 358>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 360>, 'PROCEDURE': <TokenType.PROCEDURE: 361>, 'OPERATOR': <TokenType.OPERATOR: 338>, 'QUALIFY': <TokenType.QUALIFY: 365>, 'RANGE': <TokenType.RANGE: 368>, 'RECURSIVE': <TokenType.RECURSIVE: 369>, 'REGEXP': <TokenType.RLIKE: 377>, 'RENAME': <TokenType.RENAME: 371>, 'REPLACE': <TokenType.REPLACE: 372>, 'RETURNING': <TokenType.RETURNING: 373>, 'REFERENCES': <TokenType.REFERENCES: 375>, 'RIGHT': <TokenType.RIGHT: 376>, 'RLIKE': <TokenType.RLIKE: 377>, 'ROLLBACK': <TokenType.ROLLBACK: 379>, 'ROLLUP': <TokenType.ROLLUP: 380>, 'ROW': <TokenType.ROW: 381>, 'ROWS': <TokenType.ROWS: 382>, 'SCHEMA': <TokenType.SCHEMA: 81>, 'SELECT': <TokenType.SELECT: 384>, 'SEMI': 
<TokenType.SEMI: 385>, 'SESSION': <TokenType.SESSION: 57>, 'SESSION_USER': <TokenType.SESSION_USER: 59>, 'SET': <TokenType.SET: 389>, 'SETTINGS': <TokenType.SETTINGS: 390>, 'SHOW': <TokenType.SHOW: 391>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 392>, 'SOME': <TokenType.SOME: 393>, 'SORT BY': <TokenType.SORT_BY: 394>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 396>, 'START WITH': <TokenType.START_WITH: 397>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 399>, 'TABLE': <TokenType.TABLE: 82>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 402>, 'TEMP': <TokenType.TEMPORARY: 404>, 'TEMPORARY': <TokenType.TEMPORARY: 404>, 'THEN': <TokenType.THEN: 406>, 'TRUE': <TokenType.TRUE: 407>, 'TRUNCATE': <TokenType.TRUNCATE: 408>, 'TRIGGER': <TokenType.TRIGGER: 409>, 'UNION': <TokenType.UNION: 412>, 'UNKNOWN': <TokenType.UNKNOWN: 212>, 'UNNEST': <TokenType.UNNEST: 413>, 'UNPIVOT': <TokenType.UNPIVOT: 414>, 'UPDATE': <TokenType.UPDATE: 415>, 'USE': <TokenType.USE: 416>, 'USING': <TokenType.USING: 417>, 'UUID': <TokenType.UUID: 169>, 'VALUES': <TokenType.VALUES: 418>, 'VIEW': <TokenType.VIEW: 420>, 'VOLATILE': <TokenType.VOLATILE: 422>, 'WHEN': <TokenType.WHEN: 424>, 'WHERE': <TokenType.WHERE: 425>, 'WINDOW': <TokenType.WINDOW: 426>, 'WITH': <TokenType.WITH: 427>, 'APPLY': <TokenType.APPLY: 221>, 'ARRAY': <TokenType.ARRAY: 222>, 'BIT': <TokenType.BIT: 95>, 'BOOL': <TokenType.BOOLEAN: 96>, 'BOOLEAN': <TokenType.BOOLEAN: 96>, 'BYTE': <TokenType.TINYINT: 97>, 'MEDIUMINT': <TokenType.MEDIUMINT: 101>, 'INT1': <TokenType.TINYINT: 97>, 'TINYINT': <TokenType.TINYINT: 97>, 'INT16': <TokenType.SMALLINT: 99>, 'SHORT': <TokenType.SMALLINT: 99>, 'SMALLINT': <TokenType.SMALLINT: 99>, 'HUGEINT': <TokenType.INT128: 108>, 'UHUGEINT': <TokenType.UINT128: 109>, 'INT2': <TokenType.SMALLINT: 99>, 'INTEGER': <TokenType.INT: 103>, 'INT': <TokenType.INT: 103>, 'INT4': <TokenType.INT: 103>, 'INT32': <TokenType.INT: 103>, 'INT64': <TokenType.BIGINT: 105>, 'INT128': <TokenType.INT128: 108>, 'INT256': <TokenType.INT256: 
110>, 'LONG': <TokenType.BIGINT: 105>, 'BIGINT': <TokenType.BIGINT: 105>, 'INT8': <TokenType.TINYINT: 97>, 'UINT': <TokenType.UINT: 104>, 'UINT128': <TokenType.UINT128: 109>, 'UINT256': <TokenType.UINT256: 111>, 'DEC': <TokenType.DECIMAL: 115>, 'DECIMAL': <TokenType.DECIMAL: 115>, 'DECIMAL32': <TokenType.DECIMAL32: 116>, 'DECIMAL64': <TokenType.DECIMAL64: 117>, 'DECIMAL128': <TokenType.DECIMAL128: 118>, 'DECIMAL256': <TokenType.DECIMAL256: 119>, 'DECFLOAT': <TokenType.DECFLOAT: 120>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 122>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 122>, 'BIGNUM': <TokenType.BIGNUM: 107>, 'LIST': <TokenType.LIST: 318>, 'MAP': <TokenType.MAP: 321>, 'NULLABLE': <TokenType.NULLABLE: 172>, 'NUMBER': <TokenType.DECIMAL: 115>, 'NUMERIC': <TokenType.DECIMAL: 115>, 'FIXED': <TokenType.DECIMAL: 115>, 'REAL': <TokenType.FLOAT: 112>, 'FLOAT': <TokenType.FLOAT: 112>, 'FLOAT4': <TokenType.FLOAT: 112>, 'FLOAT8': <TokenType.DOUBLE: 113>, 'DOUBLE': <TokenType.DOUBLE: 113>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 113>, 'JSON': <TokenType.JSON: 139>, 'JSONB': <TokenType.JSONB: 140>, 'CHAR': <TokenType.CHAR: 123>, 'CHARACTER': <TokenType.CHAR: 123>, 'CHAR VARYING': <TokenType.VARCHAR: 125>, 'CHARACTER VARYING': <TokenType.VARCHAR: 125>, 'NCHAR': <TokenType.NCHAR: 124>, 'VARCHAR': <TokenType.VARCHAR: 125>, 'VARCHAR2': <TokenType.VARCHAR: 125>, 'NVARCHAR': <TokenType.NVARCHAR: 126>, 'NVARCHAR2': <TokenType.NVARCHAR: 126>, 'BPCHAR': <TokenType.BPCHAR: 127>, 'STR': <TokenType.TEXT: 128>, 'STRING': <TokenType.TEXT: 128>, 'TEXT': <TokenType.TEXT: 128>, 'LONGTEXT': <TokenType.LONGTEXT: 130>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 129>, 'TINYTEXT': <TokenType.TINYTEXT: 135>, 'CLOB': <TokenType.TEXT: 128>, 'LONGVARCHAR': <TokenType.TEXT: 128>, 'BINARY': <TokenType.BINARY: 137>, 'BLOB': <TokenType.VARBINARY: 138>, 'LONGBLOB': <TokenType.LONGBLOB: 133>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 132>, 'TINYBLOB': <TokenType.TINYBLOB: 134>, 'BYTEA': <TokenType.VARBINARY: 138>, 'VARBINARY': 
<TokenType.VARBINARY: 138>, 'TIME': <TokenType.TIME: 141>, 'TIMETZ': <TokenType.TIMETZ: 142>, 'TIME_NS': <TokenType.TIME_NS: 143>, 'TIMESTAMP': <TokenType.TIMESTAMP: 144>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 145>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 147>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 147>, 'DATE': <TokenType.DATE: 155>, 'DATETIME': <TokenType.DATETIME: 151>, 'INT4RANGE': <TokenType.INT4RANGE: 157>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 158>, 'INT8RANGE': <TokenType.INT8RANGE: 159>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 160>, 'NUMRANGE': <TokenType.NUMRANGE: 161>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 162>, 'TSRANGE': <TokenType.TSRANGE: 163>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 164>, 'TSTZRANGE': <TokenType.TSTZRANGE: 165>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 166>, 'DATERANGE': <TokenType.DATERANGE: 167>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 168>, 'UNIQUE': <TokenType.UNIQUE: 428>, 'VECTOR': <TokenType.VECTOR: 213>, 'STRUCT': <TokenType.STRUCT: 400>, 'SEQUENCE': <TokenType.SEQUENCE: 387>, 'VARIANT': <TokenType.VARIANT: 196>, 'ALTER': <TokenType.ALTER: 217>, 'ANALYZE': <TokenType.ANALYZE: 437>, 'CALL': <TokenType.COMMAND: 235>, 'COMMENT': <TokenType.COMMENT: 236>, 'EXPLAIN': <TokenType.COMMAND: 235>, 'GRANT': <TokenType.GRANT: 287>, 'REVOKE': <TokenType.REVOKE: 374>, 'OPTIMIZE': <TokenType.COMMAND: 235>, 'PREPARE': <TokenType.COMMAND: 235>, 'VACUUM': <TokenType.COMMAND: 235>, 'USER-DEFINED': <TokenType.USERDEFINED: 191>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 432>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 433>}
COMMANDS = {<TokenType.SHOW: 391>, <TokenType.COMMAND: 235>, <TokenType.EXECUTE: 268>, <TokenType.FETCH: 271>, <TokenType.RENAME: 371>}
COMMAND_PREFIX_TOKENS = {<TokenType.SEMICOLON: 19>, <TokenType.BEGIN: 227>}
NUMERIC_LITERALS: ClassVar[dict[str, str]] = {}
NUMBERS_CAN_HAVE_DECIMALS: ClassVar[bool] = True
COMMENTS = ['--', ('/*', '*/')]
dialect
def tokenize(self, sql: str) -> list[sqlglot.tokenizer_core.Token]:
595    def tokenize(self, sql: str) -> list[Token]:
596        """Returns a list of tokens corresponding to the SQL string `sql`."""
597        return self._core.tokenize(sql)  # type: ignore

Returns a list of tokens corresponding to the SQL string sql.

sql: str
599    @property
600    def sql(self) -> str:
601        """The SQL string being tokenized."""
602        return self._core.sql

The SQL string being tokenized.

size: int
604    @property
605    def size(self) -> int:
606        """Length of the SQL string."""
607        return self._core.size

Length of the SQL string.

tokens: list[sqlglot.tokenizer_core.Token]
609    @property
610    def tokens(self) -> list[Token]:
611        """The list of tokens produced by tokenization."""
612        return self._core.tokens

The list of tokens produced by tokenization.