Edit on GitHub

sqlglot.tokens

  1from __future__ import annotations
  2
  3import typing as t
  4
  5from sqlglot.trie import new_trie
  6
  7# Import Token and TokenType from tokenizer_core (compiled with mypyc)
  8from sqlglot.tokenizer_core import Token, TokenType
  9
 10try:
 11    import sqlglotrs  # type: ignore # noqa: F401
 12    import warnings
 13
 14    warnings.warn(
 15        "sqlglot[rs] is deprecated and no longer compatible with sqlglot. "
 16        "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]",
 17    )
 18except ImportError:
 19    pass
 20
 21if t.TYPE_CHECKING:
 22    from sqlglot.dialects.dialect import DialectType
 23
 24
 25def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
 26    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)
 27
 28
 29def _quotes_to_format(
 30    token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
 31) -> t.Dict[str, t.Tuple[str, TokenType]]:
 32    return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
 33
 34
class _TokenizerBase:
    """Shared configuration machinery for tokenizers.

    Subclasses declare the public, overridable settings (``QUOTES``,
    ``KEYWORDS``, ...); ``__init_subclass__`` then derives the private,
    underscore-prefixed lookup structures (``_QUOTES``, ``_KEYWORD_TRIE``, ...)
    that the core tokenizer consumes.
    """

    # --- Public, subclass-overridable settings -------------------------------
    QUOTES: t.ClassVar[t.List[t.Tuple[str, str] | str]]
    IDENTIFIERS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    BIT_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    BYTE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    HEX_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    RAW_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    HEREDOC_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    UNICODE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    STRING_ESCAPES: t.ClassVar[t.List[str]]
    BYTE_STRING_ESCAPES: t.ClassVar[t.List[str]]
    ESCAPE_FOLLOW_CHARS: t.ClassVar[t.List[str]]
    IDENTIFIER_ESCAPES: t.ClassVar[t.List[str]]
    HINT_START: t.ClassVar[str]
    KEYWORDS: t.ClassVar[t.Dict[str, TokenType]]
    SINGLE_TOKENS: t.ClassVar[t.Dict[str, TokenType]]
    NUMERIC_LITERALS: t.ClassVar[t.Dict[str, str]]
    VAR_SINGLE_TOKENS: t.ClassVar[t.Set[str]]
    COMMANDS: t.ClassVar[t.Set[TokenType]]
    COMMAND_PREFIX_TOKENS: t.ClassVar[t.Set[TokenType]]
    HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool]
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool]
    NESTED_COMMENTS: t.ClassVar[bool]
    TOKENS_PRECEDING_HINT: t.ClassVar[t.Set[TokenType]]
    HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType]
    COMMENTS: t.ClassVar[t.List[str | t.Tuple[str, str]]]

    # --- Derived lookup structures (autofilled in __init_subclass__) ---------
    _QUOTES: t.ClassVar[t.Dict[str, str]]
    _IDENTIFIERS: t.ClassVar[t.Dict[str, str]]
    _FORMAT_STRINGS: t.ClassVar[t.Dict[str, t.Tuple[str, TokenType]]]
    _STRING_ESCAPES: t.ClassVar[t.Set[str]]
    _BYTE_STRING_ESCAPES: t.ClassVar[t.Set[str]]
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[t.Set[str]]
    _IDENTIFIER_ESCAPES: t.ClassVar[t.Set[str]]
    _COMMENTS: t.ClassVar[t.Dict[str, t.Optional[str]]]
    _KEYWORD_TRIE: t.ClassVar[t.Dict]

    @classmethod
    def __init_subclass__(cls, **kwargs: t.Any) -> None:
        """Precompute this subclass's lookup tables from its declared settings."""
        super().__init_subclass__(**kwargs)
        cls._QUOTES = _convert_quotes(cls.QUOTES)
        cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS)
        cls._FORMAT_STRINGS = {
            # Every plain quote also works with an n/N prefix (national strings).
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in cls._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS),
            **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS),
            **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS),
        }
        # Unless the subclass itself overrides BYTE_STRING_ESCAPES (checked via
        # __dict__ so inherited values don't count), fall back to a copy of its
        # string escapes.
        if "BYTE_STRING_ESCAPES" not in cls.__dict__:
            cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy()
        cls._STRING_ESCAPES = set(cls.STRING_ESCAPES)
        cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES)
        cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS)
        cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES)
        cls._COMMENTS = {
            **{c: None for c in cls.COMMENTS if isinstance(c, str)},
            **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)},
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }
        # When the hint prefix is also a keyword, treat it as a block comment too.
        if cls.HINT_START in cls.KEYWORDS:
            cls._COMMENTS[cls.HINT_START] = "*/"
        # The trie only needs keys the scanner can't match as a single word:
        # those containing spaces or any single-token character.
        cls._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *cls.KEYWORDS,
                *cls._COMMENTS,
                *cls._QUOTES,
                *cls._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in cls.SINGLE_TOKENS)
        )
112
113
class Tokenizer(_TokenizerBase):
    """Default SQL tokenizer.

    Declares the baseline lexical configuration (single-character tokens,
    keywords, quoting and escaping rules) that dialect tokenizers override,
    and delegates the actual scanning to
    ``sqlglot.tokenizer_core.TokenizerCore``.
    """

    # Characters that always form a token on their own.
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDE,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        "#": TokenType.HASH,
        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
        "'": TokenType.UNKNOWN,
        "`": TokenType.UNKNOWN,
        '"': TokenType.UNKNOWN,
    }

    # Per-dialect delimiter lists; empty by default and overridden as needed.
    BIT_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    BYTE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    HEX_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    RAW_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    HEREDOC_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    UNICODE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    IDENTIFIERS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = ['"']
    QUOTES: t.ClassVar[t.List[t.Tuple[str, str] | str]] = ["'"]
    STRING_ESCAPES = ["'"]
    BYTE_STRING_ESCAPES: t.ClassVar[t.List[str]] = []
    VAR_SINGLE_TOKENS: t.ClassVar[t.Set[str]] = set()
    ESCAPE_FOLLOW_CHARS: t.ClassVar[t.List[str]] = []

    # The strings in this list can always be used as escapes, regardless of the surrounding
    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
    IDENTIFIER_ESCAPES: t.ClassVar[t.List[str]] = []

    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
    HEREDOC_TAG_IS_IDENTIFIER = False

    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR

    # Whether string escape characters function as such when placed within raw strings
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True

    NESTED_COMMENTS = True

    HINT_START = "/*+"

    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}

    # Autofilled
    _COMMENTS: t.ClassVar[t.Dict[str, t.Optional[str]]] = {}
    _FORMAT_STRINGS: t.ClassVar[t.Dict[str, t.Tuple[str, TokenType]]] = {}
    _IDENTIFIERS: t.ClassVar[t.Dict[str, str]] = {}
    _IDENTIFIER_ESCAPES: t.ClassVar[t.Set[str]] = set()
    _QUOTES: t.ClassVar[t.Dict[str, str]] = {}
    _STRING_ESCAPES: t.ClassVar[t.Set[str]] = set()
    _BYTE_STRING_ESCAPES: t.ClassVar[t.Set[str]] = set()
    _KEYWORD_TRIE: t.ClassVar[t.Dict] = {}
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[t.Set[str]] = set()

    KEYWORDS: t.ClassVar[t.Dict[str, TokenType]] = {
        # Jinja/templating block delimiters, with optional whitespace-control +/-.
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        HINT_START: TokenType.HINT,
        "&<": TokenType.AMP_LT,
        "&>": TokenType.AMP_GT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "?::": TokenType.QDCOLON,
        "||": TokenType.DPIPE,
        "|>": TokenType.PIPE_GT,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "~~~": TokenType.GLOB,
        "~~": TokenType.LIKE,
        "~~*": TokenType.ILIKE,
        "~*": TokenType.IRLIKE,
        "-|-": TokenType.ADJACENT,
        "ALL": TokenType.ALL,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "COPY": TokenType.COPY,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ENUM": TokenType.ENUM,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FILE": TokenType.FILE,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCALTIME": TokenType.LOCALTIME,
        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NAMESPACE": TokenType.NAMESPACE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUT": TokenType.OUT,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "OPERATOR": TokenType.OPERATOR,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "RENAME": TokenType.RENAME,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SESSION": TokenType.SESSION,
        "SESSION_USER": TokenType.SESSION_USER,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "START WITH": TokenType.START_WITH,
        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRIGGER": TokenType.TRIGGER,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "HUGEINT": TokenType.INT128,
        "UHUGEINT": TokenType.UINT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "INT128": TokenType.INT128,
        "INT256": TokenType.INT256,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "UINT": TokenType.UINT,
        "UINT128": TokenType.UINT128,
        "UINT256": TokenType.UINT256,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "DECIMAL32": TokenType.DECIMAL32,
        "DECIMAL64": TokenType.DECIMAL64,
        "DECIMAL128": TokenType.DECIMAL128,
        "DECIMAL256": TokenType.DECIMAL256,
        "DECFLOAT": TokenType.DECFLOAT,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "BIGNUM": TokenType.BIGNUM,
        "LIST": TokenType.LIST,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "JSONB": TokenType.JSONB,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "CHAR VARYING": TokenType.VARCHAR,
        "CHARACTER VARYING": TokenType.VARCHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "BPCHAR": TokenType.BPCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIME_NS": TokenType.TIME_NS,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "VECTOR": TokenType.VECTOR,
        "STRUCT": TokenType.STRUCT,
        "SEQUENCE": TokenType.SEQUENCE,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.ANALYZE,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.GRANT,
        "REVOKE": TokenType.REVOKE,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    # Token types whose remaining statement text is consumed verbatim.
    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[t.Dict[str, str]] = {}

    COMMENTS = ["--", ("/*", "*/")]

    __slots__ = (
        "dialect",
        "_core",
    )

    def __init__(
        self,
        dialect: DialectType = None,
        **opts: t.Any,
    ) -> None:
        """Resolve ``dialect`` and build the core tokenizer from class settings.

        Args:
            dialect: A dialect name/instance, resolved via ``Dialect.get_or_raise``.
            **opts: Extra keyword arguments; accepted but not used here.
        """
        # Imported lazily to avoid import cycles at module load time.
        from sqlglot.dialects import Dialect
        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore

        self.dialect = Dialect.get_or_raise(dialect)

        # Hand the precomputed class-level configuration to the core scanner
        # in one shot, together with the dialect-level lexical flags.
        self._core = _TokenizerCore(
            single_tokens=self.SINGLE_TOKENS,
            keywords=self.KEYWORDS,
            quotes=self._QUOTES,
            format_strings=self._FORMAT_STRINGS,
            identifiers=self._IDENTIFIERS,
            comments=self._COMMENTS,
            string_escapes=self._STRING_ESCAPES,
            byte_string_escapes=self._BYTE_STRING_ESCAPES,
            identifier_escapes=self._IDENTIFIER_ESCAPES,
            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
            commands=self.COMMANDS,
            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
            nested_comments=self.NESTED_COMMENTS,
            hint_start=self.HINT_START,
            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
            bit_strings=list(self.BIT_STRINGS),
            hex_strings=list(self.HEX_STRINGS),
            numeric_literals=self.NUMERIC_LITERALS,
            var_single_tokens=self.VAR_SINGLE_TOKENS,
            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
            keyword_trie=self._KEYWORD_TRIE,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
        )

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        return self._core.tokenize(sql)  # type: ignore

    @property
    def sql(self) -> str:
        """The SQL string being tokenized."""
        return self._core.sql

    @property
    def size(self) -> int:
        """Length of the SQL string."""
        return self._core.size

    @property
    def tokens(self) -> t.List[Token]:
        """The list of tokens produced by tokenization."""
        return self._core.tokens
class Tokenizer(_TokenizerBase):
115class Tokenizer(_TokenizerBase):
116    SINGLE_TOKENS = {
117        "(": TokenType.L_PAREN,
118        ")": TokenType.R_PAREN,
119        "[": TokenType.L_BRACKET,
120        "]": TokenType.R_BRACKET,
121        "{": TokenType.L_BRACE,
122        "}": TokenType.R_BRACE,
123        "&": TokenType.AMP,
124        "^": TokenType.CARET,
125        ":": TokenType.COLON,
126        ",": TokenType.COMMA,
127        ".": TokenType.DOT,
128        "-": TokenType.DASH,
129        "=": TokenType.EQ,
130        ">": TokenType.GT,
131        "<": TokenType.LT,
132        "%": TokenType.MOD,
133        "!": TokenType.NOT,
134        "|": TokenType.PIPE,
135        "+": TokenType.PLUS,
136        ";": TokenType.SEMICOLON,
137        "/": TokenType.SLASH,
138        "\\": TokenType.BACKSLASH,
139        "*": TokenType.STAR,
140        "~": TokenType.TILDE,
141        "?": TokenType.PLACEHOLDER,
142        "@": TokenType.PARAMETER,
143        "#": TokenType.HASH,
144        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
145        "'": TokenType.UNKNOWN,
146        "`": TokenType.UNKNOWN,
147        '"': TokenType.UNKNOWN,
148    }
149
150    BIT_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
151    BYTE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
152    HEX_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
153    RAW_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
154    HEREDOC_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
155    UNICODE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
156    IDENTIFIERS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = ['"']
157    QUOTES: t.ClassVar[t.List[t.Tuple[str, str] | str]] = ["'"]
158    STRING_ESCAPES = ["'"]
159    BYTE_STRING_ESCAPES: t.ClassVar[t.List[str]] = []
160    VAR_SINGLE_TOKENS: t.ClassVar[t.Set[str]] = set()
161    ESCAPE_FOLLOW_CHARS: t.ClassVar[t.List[str]] = []
162
163    # The strings in this list can always be used as escapes, regardless of the surrounding
164    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
165    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
166    IDENTIFIER_ESCAPES: t.ClassVar[t.List[str]] = []
167
168    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
169    HEREDOC_TAG_IS_IDENTIFIER = False
170
171    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
172    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR
173
174    # Whether string escape characters function as such when placed within raw strings
175    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
176
177    NESTED_COMMENTS = True
178
179    HINT_START = "/*+"
180
181    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}
182
183    # Autofilled
184    _COMMENTS: t.ClassVar[t.Dict[str, t.Optional[str]]] = {}
185    _FORMAT_STRINGS: t.ClassVar[t.Dict[str, t.Tuple[str, TokenType]]] = {}
186    _IDENTIFIERS: t.ClassVar[t.Dict[str, str]] = {}
187    _IDENTIFIER_ESCAPES: t.ClassVar[t.Set[str]] = set()
188    _QUOTES: t.ClassVar[t.Dict[str, str]] = {}
189    _STRING_ESCAPES: t.ClassVar[t.Set[str]] = set()
190    _BYTE_STRING_ESCAPES: t.ClassVar[t.Set[str]] = set()
191    _KEYWORD_TRIE: t.ClassVar[t.Dict] = {}
192    _ESCAPE_FOLLOW_CHARS: t.ClassVar[t.Set[str]] = set()
193
194    KEYWORDS: t.ClassVar[t.Dict[str, TokenType]] = {
195        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
196        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
197        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
198        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
199        HINT_START: TokenType.HINT,
200        "&<": TokenType.AMP_LT,
201        "&>": TokenType.AMP_GT,
202        "==": TokenType.EQ,
203        "::": TokenType.DCOLON,
204        "?::": TokenType.QDCOLON,
205        "||": TokenType.DPIPE,
206        "|>": TokenType.PIPE_GT,
207        ">=": TokenType.GTE,
208        "<=": TokenType.LTE,
209        "<>": TokenType.NEQ,
210        "!=": TokenType.NEQ,
211        ":=": TokenType.COLON_EQ,
212        "<=>": TokenType.NULLSAFE_EQ,
213        "->": TokenType.ARROW,
214        "->>": TokenType.DARROW,
215        "=>": TokenType.FARROW,
216        "#>": TokenType.HASH_ARROW,
217        "#>>": TokenType.DHASH_ARROW,
218        "<->": TokenType.LR_ARROW,
219        "&&": TokenType.DAMP,
220        "??": TokenType.DQMARK,
221        "~~~": TokenType.GLOB,
222        "~~": TokenType.LIKE,
223        "~~*": TokenType.ILIKE,
224        "~*": TokenType.IRLIKE,
225        "-|-": TokenType.ADJACENT,
226        "ALL": TokenType.ALL,
227        "AND": TokenType.AND,
228        "ANTI": TokenType.ANTI,
229        "ANY": TokenType.ANY,
230        "ASC": TokenType.ASC,
231        "AS": TokenType.ALIAS,
232        "ASOF": TokenType.ASOF,
233        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
234        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
235        "BEGIN": TokenType.BEGIN,
236        "BETWEEN": TokenType.BETWEEN,
237        "CACHE": TokenType.CACHE,
238        "UNCACHE": TokenType.UNCACHE,
239        "CASE": TokenType.CASE,
240        "CHARACTER SET": TokenType.CHARACTER_SET,
241        "CLUSTER BY": TokenType.CLUSTER_BY,
242        "COLLATE": TokenType.COLLATE,
243        "COLUMN": TokenType.COLUMN,
244        "COMMIT": TokenType.COMMIT,
245        "CONNECT BY": TokenType.CONNECT_BY,
246        "CONSTRAINT": TokenType.CONSTRAINT,
247        "COPY": TokenType.COPY,
248        "CREATE": TokenType.CREATE,
249        "CROSS": TokenType.CROSS,
250        "CUBE": TokenType.CUBE,
251        "CURRENT_DATE": TokenType.CURRENT_DATE,
252        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
253        "CURRENT_TIME": TokenType.CURRENT_TIME,
254        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
255        "CURRENT_USER": TokenType.CURRENT_USER,
256        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
257        "DATABASE": TokenType.DATABASE,
258        "DEFAULT": TokenType.DEFAULT,
259        "DELETE": TokenType.DELETE,
260        "DESC": TokenType.DESC,
261        "DESCRIBE": TokenType.DESCRIBE,
262        "DISTINCT": TokenType.DISTINCT,
263        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
264        "DIV": TokenType.DIV,
265        "DROP": TokenType.DROP,
266        "ELSE": TokenType.ELSE,
267        "END": TokenType.END,
268        "ENUM": TokenType.ENUM,
269        "ESCAPE": TokenType.ESCAPE,
270        "EXCEPT": TokenType.EXCEPT,
271        "EXECUTE": TokenType.EXECUTE,
272        "EXISTS": TokenType.EXISTS,
273        "FALSE": TokenType.FALSE,
274        "FETCH": TokenType.FETCH,
275        "FILTER": TokenType.FILTER,
276        "FILE": TokenType.FILE,
277        "FIRST": TokenType.FIRST,
278        "FULL": TokenType.FULL,
279        "FUNCTION": TokenType.FUNCTION,
280        "FOR": TokenType.FOR,
281        "FOREIGN KEY": TokenType.FOREIGN_KEY,
282        "FORMAT": TokenType.FORMAT,
283        "FROM": TokenType.FROM,
284        "GEOGRAPHY": TokenType.GEOGRAPHY,
285        "GEOMETRY": TokenType.GEOMETRY,
286        "GLOB": TokenType.GLOB,
287        "GROUP BY": TokenType.GROUP_BY,
288        "GROUPING SETS": TokenType.GROUPING_SETS,
289        "HAVING": TokenType.HAVING,
290        "ILIKE": TokenType.ILIKE,
291        "IN": TokenType.IN,
292        "INDEX": TokenType.INDEX,
293        "INET": TokenType.INET,
294        "INNER": TokenType.INNER,
295        "INSERT": TokenType.INSERT,
296        "INTERVAL": TokenType.INTERVAL,
297        "INTERSECT": TokenType.INTERSECT,
298        "INTO": TokenType.INTO,
299        "IS": TokenType.IS,
300        "ISNULL": TokenType.ISNULL,
301        "JOIN": TokenType.JOIN,
302        "KEEP": TokenType.KEEP,
303        "KILL": TokenType.KILL,
304        "LATERAL": TokenType.LATERAL,
305        "LEFT": TokenType.LEFT,
306        "LIKE": TokenType.LIKE,
307        "LIMIT": TokenType.LIMIT,
308        "LOAD": TokenType.LOAD,
309        "LOCALTIME": TokenType.LOCALTIME,
310        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
311        "LOCK": TokenType.LOCK,
312        "MERGE": TokenType.MERGE,
313        "NAMESPACE": TokenType.NAMESPACE,
314        "NATURAL": TokenType.NATURAL,
315        "NEXT": TokenType.NEXT,
316        "NOT": TokenType.NOT,
317        "NOTNULL": TokenType.NOTNULL,
318        "NULL": TokenType.NULL,
319        "OBJECT": TokenType.OBJECT,
320        "OFFSET": TokenType.OFFSET,
321        "ON": TokenType.ON,
322        "OR": TokenType.OR,
323        "XOR": TokenType.XOR,
324        "ORDER BY": TokenType.ORDER_BY,
325        "ORDINALITY": TokenType.ORDINALITY,
326        "OUT": TokenType.OUT,
327        "OUTER": TokenType.OUTER,
328        "OVER": TokenType.OVER,
329        "OVERLAPS": TokenType.OVERLAPS,
330        "OVERWRITE": TokenType.OVERWRITE,
331        "PARTITION": TokenType.PARTITION,
332        "PARTITION BY": TokenType.PARTITION_BY,
333        "PARTITIONED BY": TokenType.PARTITION_BY,
334        "PARTITIONED_BY": TokenType.PARTITION_BY,
335        "PERCENT": TokenType.PERCENT,
336        "PIVOT": TokenType.PIVOT,
337        "PRAGMA": TokenType.PRAGMA,
338        "PRIMARY KEY": TokenType.PRIMARY_KEY,
339        "PROCEDURE": TokenType.PROCEDURE,
340        "OPERATOR": TokenType.OPERATOR,
341        "QUALIFY": TokenType.QUALIFY,
342        "RANGE": TokenType.RANGE,
343        "RECURSIVE": TokenType.RECURSIVE,
344        "REGEXP": TokenType.RLIKE,
345        "RENAME": TokenType.RENAME,
346        "REPLACE": TokenType.REPLACE,
347        "RETURNING": TokenType.RETURNING,
348        "REFERENCES": TokenType.REFERENCES,
349        "RIGHT": TokenType.RIGHT,
350        "RLIKE": TokenType.RLIKE,
351        "ROLLBACK": TokenType.ROLLBACK,
352        "ROLLUP": TokenType.ROLLUP,
353        "ROW": TokenType.ROW,
354        "ROWS": TokenType.ROWS,
355        "SCHEMA": TokenType.SCHEMA,
356        "SELECT": TokenType.SELECT,
357        "SEMI": TokenType.SEMI,
358        "SESSION": TokenType.SESSION,
359        "SESSION_USER": TokenType.SESSION_USER,
360        "SET": TokenType.SET,
361        "SETTINGS": TokenType.SETTINGS,
362        "SHOW": TokenType.SHOW,
363        "SIMILAR TO": TokenType.SIMILAR_TO,
364        "SOME": TokenType.SOME,
365        "SORT BY": TokenType.SORT_BY,
366        "START WITH": TokenType.START_WITH,
367        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
368        "TABLE": TokenType.TABLE,
369        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
370        "TEMP": TokenType.TEMPORARY,
371        "TEMPORARY": TokenType.TEMPORARY,
372        "THEN": TokenType.THEN,
373        "TRUE": TokenType.TRUE,
374        "TRUNCATE": TokenType.TRUNCATE,
375        "TRIGGER": TokenType.TRIGGER,
376        "UNION": TokenType.UNION,
377        "UNKNOWN": TokenType.UNKNOWN,
378        "UNNEST": TokenType.UNNEST,
379        "UNPIVOT": TokenType.UNPIVOT,
380        "UPDATE": TokenType.UPDATE,
381        "USE": TokenType.USE,
382        "USING": TokenType.USING,
383        "UUID": TokenType.UUID,
384        "VALUES": TokenType.VALUES,
385        "VIEW": TokenType.VIEW,
386        "VOLATILE": TokenType.VOLATILE,
387        "WHEN": TokenType.WHEN,
388        "WHERE": TokenType.WHERE,
389        "WINDOW": TokenType.WINDOW,
390        "WITH": TokenType.WITH,
391        "APPLY": TokenType.APPLY,
392        "ARRAY": TokenType.ARRAY,
393        "BIT": TokenType.BIT,
394        "BOOL": TokenType.BOOLEAN,
395        "BOOLEAN": TokenType.BOOLEAN,
396        "BYTE": TokenType.TINYINT,
397        "MEDIUMINT": TokenType.MEDIUMINT,
398        "INT1": TokenType.TINYINT,
399        "TINYINT": TokenType.TINYINT,
400        "INT16": TokenType.SMALLINT,
401        "SHORT": TokenType.SMALLINT,
402        "SMALLINT": TokenType.SMALLINT,
403        "HUGEINT": TokenType.INT128,
404        "UHUGEINT": TokenType.UINT128,
405        "INT2": TokenType.SMALLINT,
406        "INTEGER": TokenType.INT,
407        "INT": TokenType.INT,
408        "INT4": TokenType.INT,
409        "INT32": TokenType.INT,
410        "INT64": TokenType.BIGINT,
411        "INT128": TokenType.INT128,
412        "INT256": TokenType.INT256,
413        "LONG": TokenType.BIGINT,
414        "BIGINT": TokenType.BIGINT,
415        "INT8": TokenType.TINYINT,
416        "UINT": TokenType.UINT,
417        "UINT128": TokenType.UINT128,
418        "UINT256": TokenType.UINT256,
419        "DEC": TokenType.DECIMAL,
420        "DECIMAL": TokenType.DECIMAL,
421        "DECIMAL32": TokenType.DECIMAL32,
422        "DECIMAL64": TokenType.DECIMAL64,
423        "DECIMAL128": TokenType.DECIMAL128,
424        "DECIMAL256": TokenType.DECIMAL256,
425        "DECFLOAT": TokenType.DECFLOAT,
426        "BIGDECIMAL": TokenType.BIGDECIMAL,
427        "BIGNUMERIC": TokenType.BIGDECIMAL,
428        "BIGNUM": TokenType.BIGNUM,
429        "LIST": TokenType.LIST,
430        "MAP": TokenType.MAP,
431        "NULLABLE": TokenType.NULLABLE,
432        "NUMBER": TokenType.DECIMAL,
433        "NUMERIC": TokenType.DECIMAL,
434        "FIXED": TokenType.DECIMAL,
435        "REAL": TokenType.FLOAT,
436        "FLOAT": TokenType.FLOAT,
437        "FLOAT4": TokenType.FLOAT,
438        "FLOAT8": TokenType.DOUBLE,
439        "DOUBLE": TokenType.DOUBLE,
440        "DOUBLE PRECISION": TokenType.DOUBLE,
441        "JSON": TokenType.JSON,
442        "JSONB": TokenType.JSONB,
443        "CHAR": TokenType.CHAR,
444        "CHARACTER": TokenType.CHAR,
445        "CHAR VARYING": TokenType.VARCHAR,
446        "CHARACTER VARYING": TokenType.VARCHAR,
447        "NCHAR": TokenType.NCHAR,
448        "VARCHAR": TokenType.VARCHAR,
449        "VARCHAR2": TokenType.VARCHAR,
450        "NVARCHAR": TokenType.NVARCHAR,
451        "NVARCHAR2": TokenType.NVARCHAR,
452        "BPCHAR": TokenType.BPCHAR,
453        "STR": TokenType.TEXT,
454        "STRING": TokenType.TEXT,
455        "TEXT": TokenType.TEXT,
456        "LONGTEXT": TokenType.LONGTEXT,
457        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
458        "TINYTEXT": TokenType.TINYTEXT,
459        "CLOB": TokenType.TEXT,
460        "LONGVARCHAR": TokenType.TEXT,
461        "BINARY": TokenType.BINARY,
462        "BLOB": TokenType.VARBINARY,
463        "LONGBLOB": TokenType.LONGBLOB,
464        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
465        "TINYBLOB": TokenType.TINYBLOB,
466        "BYTEA": TokenType.VARBINARY,
467        "VARBINARY": TokenType.VARBINARY,
468        "TIME": TokenType.TIME,
469        "TIMETZ": TokenType.TIMETZ,
470        "TIME_NS": TokenType.TIME_NS,
471        "TIMESTAMP": TokenType.TIMESTAMP,
472        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
473        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
474        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
475        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
476        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
477        "DATE": TokenType.DATE,
478        "DATETIME": TokenType.DATETIME,
479        "INT4RANGE": TokenType.INT4RANGE,
480        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
481        "INT8RANGE": TokenType.INT8RANGE,
482        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
483        "NUMRANGE": TokenType.NUMRANGE,
484        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
485        "TSRANGE": TokenType.TSRANGE,
486        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
487        "TSTZRANGE": TokenType.TSTZRANGE,
488        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
489        "DATERANGE": TokenType.DATERANGE,
490        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
491        "UNIQUE": TokenType.UNIQUE,
492        "VECTOR": TokenType.VECTOR,
493        "STRUCT": TokenType.STRUCT,
494        "SEQUENCE": TokenType.SEQUENCE,
495        "VARIANT": TokenType.VARIANT,
496        "ALTER": TokenType.ALTER,
497        "ANALYZE": TokenType.ANALYZE,
498        "CALL": TokenType.COMMAND,
499        "COMMENT": TokenType.COMMENT,
500        "EXPLAIN": TokenType.COMMAND,
501        "GRANT": TokenType.GRANT,
502        "REVOKE": TokenType.REVOKE,
503        "OPTIMIZE": TokenType.COMMAND,
504        "PREPARE": TokenType.COMMAND,
505        "VACUUM": TokenType.COMMAND,
506        "USER-DEFINED": TokenType.USERDEFINED,
507        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
508        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
509    }
510
    # Token types that introduce "command"-style statements; presumably the
    # core treats the remainder of such statements opaquely — confirm against
    # tokenizer_core.
    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    # Tokens after which a COMMANDS keyword counts as starting a statement
    # (NOTE(review): exact use is inside the compiled core — verify there).
    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[t.Dict[str, str]] = {}

    # Comment markers: a bare string is a single-line prefix, a tuple is an
    # (open, close) delimiter pair.
    COMMENTS = ["--", ("/*", "*/")]

    # Only these instance attributes exist (no per-instance __dict__).
    __slots__ = (
        "dialect",
        "_core",
    )
530
531    def __init__(
532        self,
533        dialect: DialectType = None,
534        **opts: t.Any,
535    ) -> None:
536        from sqlglot.dialects import Dialect
537        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore
538
539        self.dialect = Dialect.get_or_raise(dialect)
540
541        self._core = _TokenizerCore(
542            single_tokens=self.SINGLE_TOKENS,
543            keywords=self.KEYWORDS,
544            quotes=self._QUOTES,
545            format_strings=self._FORMAT_STRINGS,
546            identifiers=self._IDENTIFIERS,
547            comments=self._COMMENTS,
548            string_escapes=self._STRING_ESCAPES,
549            byte_string_escapes=self._BYTE_STRING_ESCAPES,
550            identifier_escapes=self._IDENTIFIER_ESCAPES,
551            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
552            commands=self.COMMANDS,
553            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
554            nested_comments=self.NESTED_COMMENTS,
555            hint_start=self.HINT_START,
556            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
557            bit_strings=list(self.BIT_STRINGS),
558            hex_strings=list(self.HEX_STRINGS),
559            numeric_literals=self.NUMERIC_LITERALS,
560            var_single_tokens=self.VAR_SINGLE_TOKENS,
561            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
562            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
563            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
564            keyword_trie=self._KEYWORD_TRIE,
565            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
566            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
567            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
568        )
569
570    def tokenize(self, sql: str) -> t.List[Token]:
571        """Returns a list of tokens corresponding to the SQL string `sql`."""
572        return self._core.tokenize(sql)  # type: ignore
573
574    @property
575    def sql(self) -> str:
576        """The SQL string being tokenized."""
577        return self._core.sql
578
579    @property
580    def size(self) -> int:
581        """Length of the SQL string."""
582        return self._core.size
583
584    @property
585    def tokens(self) -> t.List[Token]:
586        """The list of tokens produced by tokenization."""
587        return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, Type[sqlglot.dialects.Dialect], NoneType] = None, **opts: Any)
531    def __init__(
532        self,
533        dialect: DialectType = None,
534        **opts: t.Any,
535    ) -> None:
536        from sqlglot.dialects import Dialect
537        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore
538
539        self.dialect = Dialect.get_or_raise(dialect)
540
541        self._core = _TokenizerCore(
542            single_tokens=self.SINGLE_TOKENS,
543            keywords=self.KEYWORDS,
544            quotes=self._QUOTES,
545            format_strings=self._FORMAT_STRINGS,
546            identifiers=self._IDENTIFIERS,
547            comments=self._COMMENTS,
548            string_escapes=self._STRING_ESCAPES,
549            byte_string_escapes=self._BYTE_STRING_ESCAPES,
550            identifier_escapes=self._IDENTIFIER_ESCAPES,
551            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
552            commands=self.COMMANDS,
553            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
554            nested_comments=self.NESTED_COMMENTS,
555            hint_start=self.HINT_START,
556            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
557            bit_strings=list(self.BIT_STRINGS),
558            hex_strings=list(self.HEX_STRINGS),
559            numeric_literals=self.NUMERIC_LITERALS,
560            var_single_tokens=self.VAR_SINGLE_TOKENS,
561            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
562            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
563            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
564            keyword_trie=self._KEYWORD_TRIE,
565            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
566            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
567            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
568        )
SINGLE_TOKENS = {'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 35>, '^': <TokenType.CARET: 41>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 27>, '>': <TokenType.GT: 24>, '<': <TokenType.LT: 22>, '%': <TokenType.MOD: 323>, '!': <TokenType.NOT: 26>, '|': <TokenType.PIPE: 38>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 18>, '/': <TokenType.SLASH: 21>, '\\': <TokenType.BACKSLASH: 20>, '*': <TokenType.STAR: 19>, '~': <TokenType.TILDE: 43>, '?': <TokenType.PLACEHOLDER: 349>, '@': <TokenType.PARAMETER: 55>, '#': <TokenType.HASH: 47>, "'": <TokenType.UNKNOWN: 210>, '`': <TokenType.UNKNOWN: 210>, '"': <TokenType.UNKNOWN: 210>}
BIT_STRINGS: ClassVar[List[Union[str, Tuple[str, str]]]] = []
BYTE_STRINGS: ClassVar[List[Union[str, Tuple[str, str]]]] = []
HEX_STRINGS: ClassVar[List[Union[str, Tuple[str, str]]]] = []
RAW_STRINGS: ClassVar[List[Union[str, Tuple[str, str]]]] = []
HEREDOC_STRINGS: ClassVar[List[Union[str, Tuple[str, str]]]] = []
UNICODE_STRINGS: ClassVar[List[Union[str, Tuple[str, str]]]] = []
IDENTIFIERS: ClassVar[List[Union[str, Tuple[str, str]]]] = ['"']
QUOTES: ClassVar[List[Union[str, Tuple[str, str]]]] = ["'"]
STRING_ESCAPES = ["'"]
BYTE_STRING_ESCAPES: ClassVar[List[str]] = []
VAR_SINGLE_TOKENS: ClassVar[Set[str]] = set()
ESCAPE_FOLLOW_CHARS: ClassVar[List[str]] = []
IDENTIFIER_ESCAPES: ClassVar[List[str]] = []
HEREDOC_TAG_IS_IDENTIFIER = False
HEREDOC_STRING_ALTERNATIVE = <TokenType.VAR: 85>
STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True
NESTED_COMMENTS = True
HINT_START = '/*+'
TOKENS_PRECEDING_HINT = {<TokenType.DELETE: 252>, <TokenType.UPDATE: 404>, <TokenType.INSERT: 295>, <TokenType.SELECT: 375>}
KEYWORDS: ClassVar[Dict[str, sqlglot.tokenizer_core.TokenType]] = {'{%': <TokenType.BLOCK_START: 70>, '{%+': <TokenType.BLOCK_START: 70>, '{%-': <TokenType.BLOCK_START: 70>, '%}': <TokenType.BLOCK_END: 71>, '+%}': <TokenType.BLOCK_END: 71>, '-%}': <TokenType.BLOCK_END: 71>, '{{+': <TokenType.BLOCK_START: 70>, '{{-': <TokenType.BLOCK_START: 70>, '+}}': <TokenType.BLOCK_END: 71>, '-}}': <TokenType.BLOCK_END: 71>, '/*+': <TokenType.HINT: 288>, '&<': <TokenType.AMP_LT: 60>, '&>': <TokenType.AMP_GT: 61>, '==': <TokenType.EQ: 27>, '::': <TokenType.DCOLON: 13>, '?::': <TokenType.QDCOLON: 360>, '||': <TokenType.DPIPE: 36>, '|>': <TokenType.PIPE_GT: 37>, '>=': <TokenType.GTE: 25>, '<=': <TokenType.LTE: 23>, '<>': <TokenType.NEQ: 28>, '!=': <TokenType.NEQ: 28>, ':=': <TokenType.COLON_EQ: 30>, '<=>': <TokenType.NULLSAFE_EQ: 29>, '->': <TokenType.ARROW: 44>, '->>': <TokenType.DARROW: 45>, '=>': <TokenType.FARROW: 46>, '#>': <TokenType.HASH_ARROW: 48>, '#>>': <TokenType.DHASH_ARROW: 49>, '<->': <TokenType.LR_ARROW: 50>, '&&': <TokenType.DAMP: 59>, '??': <TokenType.DQMARK: 17>, '~~~': <TokenType.GLOB: 282>, '~~': <TokenType.LIKE: 312>, '~~*': <TokenType.ILIKE: 290>, '~*': <TokenType.IRLIKE: 301>, '-|-': <TokenType.ADJACENT: 62>, 'ALL': <TokenType.ALL: 216>, 'AND': <TokenType.AND: 33>, 'ANTI': <TokenType.ANTI: 217>, 'ANY': <TokenType.ANY: 218>, 'ASC': <TokenType.ASC: 221>, 'AS': <TokenType.ALIAS: 214>, 'ASOF': <TokenType.ASOF: 222>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 224>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 224>, 'BEGIN': <TokenType.BEGIN: 225>, 'BETWEEN': <TokenType.BETWEEN: 226>, 'CACHE': <TokenType.CACHE: 228>, 'UNCACHE': <TokenType.UNCACHE: 400>, 'CASE': <TokenType.CASE: 229>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 230>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 231>, 'COLLATE': <TokenType.COLLATE: 232>, 'COLUMN': <TokenType.COLUMN: 78>, 'COMMIT': <TokenType.COMMIT: 235>, 'CONNECT BY': <TokenType.CONNECT_BY: 236>, 'CONSTRAINT': <TokenType.CONSTRAINT: 237>, 
'COPY': <TokenType.COPY: 238>, 'CREATE': <TokenType.CREATE: 239>, 'CROSS': <TokenType.CROSS: 240>, 'CUBE': <TokenType.CUBE: 241>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 242>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 244>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 245>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 246>, 'CURRENT_USER': <TokenType.CURRENT_USER: 247>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 249>, 'DATABASE': <TokenType.DATABASE: 77>, 'DEFAULT': <TokenType.DEFAULT: 251>, 'DELETE': <TokenType.DELETE: 252>, 'DESC': <TokenType.DESC: 253>, 'DESCRIBE': <TokenType.DESCRIBE: 254>, 'DISTINCT': <TokenType.DISTINCT: 257>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 258>, 'DIV': <TokenType.DIV: 259>, 'DROP': <TokenType.DROP: 260>, 'ELSE': <TokenType.ELSE: 261>, 'END': <TokenType.END: 262>, 'ENUM': <TokenType.ENUM: 201>, 'ESCAPE': <TokenType.ESCAPE: 263>, 'EXCEPT': <TokenType.EXCEPT: 264>, 'EXECUTE': <TokenType.EXECUTE: 265>, 'EXISTS': <TokenType.EXISTS: 266>, 'FALSE': <TokenType.FALSE: 267>, 'FETCH': <TokenType.FETCH: 268>, 'FILTER': <TokenType.FILTER: 271>, 'FILE': <TokenType.FILE: 269>, 'FIRST': <TokenType.FIRST: 273>, 'FULL': <TokenType.FULL: 279>, 'FUNCTION': <TokenType.FUNCTION: 280>, 'FOR': <TokenType.FOR: 274>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 276>, 'FORMAT': <TokenType.FORMAT: 277>, 'FROM': <TokenType.FROM: 278>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 168>, 'GEOMETRY': <TokenType.GEOMETRY: 171>, 'GLOB': <TokenType.GLOB: 282>, 'GROUP BY': <TokenType.GROUP_BY: 285>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 286>, 'HAVING': <TokenType.HAVING: 287>, 'ILIKE': <TokenType.ILIKE: 290>, 'IN': <TokenType.IN: 291>, 'INDEX': <TokenType.INDEX: 292>, 'INET': <TokenType.INET: 196>, 'INNER': <TokenType.INNER: 294>, 'INSERT': <TokenType.INSERT: 295>, 'INTERVAL': <TokenType.INTERVAL: 298>, 'INTERSECT': <TokenType.INTERSECT: 297>, 'INTO': <TokenType.INTO: 299>, 'IS': <TokenType.IS: 302>, 'ISNULL': <TokenType.ISNULL: 303>, 'JOIN': <TokenType.JOIN: 304>, 
'KEEP': <TokenType.KEEP: 306>, 'KILL': <TokenType.KILL: 308>, 'LATERAL': <TokenType.LATERAL: 310>, 'LEFT': <TokenType.LEFT: 311>, 'LIKE': <TokenType.LIKE: 312>, 'LIMIT': <TokenType.LIMIT: 313>, 'LOAD': <TokenType.LOAD: 315>, 'LOCALTIME': <TokenType.LOCALTIME: 175>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 176>, 'LOCK': <TokenType.LOCK: 316>, 'MERGE': <TokenType.MERGE: 322>, 'NAMESPACE': <TokenType.NAMESPACE: 426>, 'NATURAL': <TokenType.NATURAL: 325>, 'NEXT': <TokenType.NEXT: 326>, 'NOT': <TokenType.NOT: 26>, 'NOTNULL': <TokenType.NOTNULL: 328>, 'NULL': <TokenType.NULL: 329>, 'OBJECT': <TokenType.OBJECT: 195>, 'OFFSET': <TokenType.OFFSET: 331>, 'ON': <TokenType.ON: 332>, 'OR': <TokenType.OR: 34>, 'XOR': <TokenType.XOR: 63>, 'ORDER BY': <TokenType.ORDER_BY: 335>, 'ORDINALITY': <TokenType.ORDINALITY: 338>, 'OUT': <TokenType.OUT: 339>, 'OUTER': <TokenType.OUTER: 341>, 'OVER': <TokenType.OVER: 342>, 'OVERLAPS': <TokenType.OVERLAPS: 343>, 'OVERWRITE': <TokenType.OVERWRITE: 344>, 'PARTITION': <TokenType.PARTITION: 345>, 'PARTITION BY': <TokenType.PARTITION_BY: 346>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 346>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 346>, 'PERCENT': <TokenType.PERCENT: 347>, 'PIVOT': <TokenType.PIVOT: 348>, 'PRAGMA': <TokenType.PRAGMA: 351>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 353>, 'PROCEDURE': <TokenType.PROCEDURE: 354>, 'OPERATOR': <TokenType.OPERATOR: 334>, 'QUALIFY': <TokenType.QUALIFY: 358>, 'RANGE': <TokenType.RANGE: 361>, 'RECURSIVE': <TokenType.RECURSIVE: 362>, 'REGEXP': <TokenType.RLIKE: 370>, 'RENAME': <TokenType.RENAME: 364>, 'REPLACE': <TokenType.REPLACE: 365>, 'RETURNING': <TokenType.RETURNING: 366>, 'REFERENCES': <TokenType.REFERENCES: 368>, 'RIGHT': <TokenType.RIGHT: 369>, 'RLIKE': <TokenType.RLIKE: 370>, 'ROLLBACK': <TokenType.ROLLBACK: 371>, 'ROLLUP': <TokenType.ROLLUP: 372>, 'ROW': <TokenType.ROW: 373>, 'ROWS': <TokenType.ROWS: 374>, 'SCHEMA': <TokenType.SCHEMA: 80>, 'SELECT': <TokenType.SELECT: 375>, 'SEMI': 
<TokenType.SEMI: 376>, 'SESSION': <TokenType.SESSION: 56>, 'SESSION_USER': <TokenType.SESSION_USER: 58>, 'SET': <TokenType.SET: 380>, 'SETTINGS': <TokenType.SETTINGS: 381>, 'SHOW': <TokenType.SHOW: 382>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 383>, 'SOME': <TokenType.SOME: 384>, 'SORT BY': <TokenType.SORT_BY: 385>, 'START WITH': <TokenType.START_WITH: 387>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 389>, 'TABLE': <TokenType.TABLE: 81>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 392>, 'TEMP': <TokenType.TEMPORARY: 394>, 'TEMPORARY': <TokenType.TEMPORARY: 394>, 'THEN': <TokenType.THEN: 396>, 'TRUE': <TokenType.TRUE: 397>, 'TRUNCATE': <TokenType.TRUNCATE: 398>, 'TRIGGER': <TokenType.TRIGGER: 399>, 'UNION': <TokenType.UNION: 401>, 'UNKNOWN': <TokenType.UNKNOWN: 210>, 'UNNEST': <TokenType.UNNEST: 402>, 'UNPIVOT': <TokenType.UNPIVOT: 403>, 'UPDATE': <TokenType.UPDATE: 404>, 'USE': <TokenType.USE: 405>, 'USING': <TokenType.USING: 406>, 'UUID': <TokenType.UUID: 167>, 'VALUES': <TokenType.VALUES: 407>, 'VIEW': <TokenType.VIEW: 409>, 'VOLATILE': <TokenType.VOLATILE: 411>, 'WHEN': <TokenType.WHEN: 412>, 'WHERE': <TokenType.WHERE: 413>, 'WINDOW': <TokenType.WINDOW: 414>, 'WITH': <TokenType.WITH: 415>, 'APPLY': <TokenType.APPLY: 219>, 'ARRAY': <TokenType.ARRAY: 220>, 'BIT': <TokenType.BIT: 93>, 'BOOL': <TokenType.BOOLEAN: 94>, 'BOOLEAN': <TokenType.BOOLEAN: 94>, 'BYTE': <TokenType.TINYINT: 95>, 'MEDIUMINT': <TokenType.MEDIUMINT: 99>, 'INT1': <TokenType.TINYINT: 95>, 'TINYINT': <TokenType.TINYINT: 95>, 'INT16': <TokenType.SMALLINT: 97>, 'SHORT': <TokenType.SMALLINT: 97>, 'SMALLINT': <TokenType.SMALLINT: 97>, 'HUGEINT': <TokenType.INT128: 106>, 'UHUGEINT': <TokenType.UINT128: 107>, 'INT2': <TokenType.SMALLINT: 97>, 'INTEGER': <TokenType.INT: 101>, 'INT': <TokenType.INT: 101>, 'INT4': <TokenType.INT: 101>, 'INT32': <TokenType.INT: 101>, 'INT64': <TokenType.BIGINT: 103>, 'INT128': <TokenType.INT128: 106>, 'INT256': <TokenType.INT256: 108>, 'LONG': <TokenType.BIGINT: 103>, 'BIGINT': 
<TokenType.BIGINT: 103>, 'INT8': <TokenType.TINYINT: 95>, 'UINT': <TokenType.UINT: 102>, 'UINT128': <TokenType.UINT128: 107>, 'UINT256': <TokenType.UINT256: 109>, 'DEC': <TokenType.DECIMAL: 113>, 'DECIMAL': <TokenType.DECIMAL: 113>, 'DECIMAL32': <TokenType.DECIMAL32: 114>, 'DECIMAL64': <TokenType.DECIMAL64: 115>, 'DECIMAL128': <TokenType.DECIMAL128: 116>, 'DECIMAL256': <TokenType.DECIMAL256: 117>, 'DECFLOAT': <TokenType.DECFLOAT: 118>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 120>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 120>, 'BIGNUM': <TokenType.BIGNUM: 105>, 'LIST': <TokenType.LIST: 314>, 'MAP': <TokenType.MAP: 317>, 'NULLABLE': <TokenType.NULLABLE: 170>, 'NUMBER': <TokenType.DECIMAL: 113>, 'NUMERIC': <TokenType.DECIMAL: 113>, 'FIXED': <TokenType.DECIMAL: 113>, 'REAL': <TokenType.FLOAT: 110>, 'FLOAT': <TokenType.FLOAT: 110>, 'FLOAT4': <TokenType.FLOAT: 110>, 'FLOAT8': <TokenType.DOUBLE: 111>, 'DOUBLE': <TokenType.DOUBLE: 111>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 111>, 'JSON': <TokenType.JSON: 137>, 'JSONB': <TokenType.JSONB: 138>, 'CHAR': <TokenType.CHAR: 121>, 'CHARACTER': <TokenType.CHAR: 121>, 'CHAR VARYING': <TokenType.VARCHAR: 123>, 'CHARACTER VARYING': <TokenType.VARCHAR: 123>, 'NCHAR': <TokenType.NCHAR: 122>, 'VARCHAR': <TokenType.VARCHAR: 123>, 'VARCHAR2': <TokenType.VARCHAR: 123>, 'NVARCHAR': <TokenType.NVARCHAR: 124>, 'NVARCHAR2': <TokenType.NVARCHAR: 124>, 'BPCHAR': <TokenType.BPCHAR: 125>, 'STR': <TokenType.TEXT: 126>, 'STRING': <TokenType.TEXT: 126>, 'TEXT': <TokenType.TEXT: 126>, 'LONGTEXT': <TokenType.LONGTEXT: 128>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 127>, 'TINYTEXT': <TokenType.TINYTEXT: 133>, 'CLOB': <TokenType.TEXT: 126>, 'LONGVARCHAR': <TokenType.TEXT: 126>, 'BINARY': <TokenType.BINARY: 135>, 'BLOB': <TokenType.VARBINARY: 136>, 'LONGBLOB': <TokenType.LONGBLOB: 131>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 130>, 'TINYBLOB': <TokenType.TINYBLOB: 132>, 'BYTEA': <TokenType.VARBINARY: 136>, 'VARBINARY': <TokenType.VARBINARY: 136>, 'TIME': 
<TokenType.TIME: 139>, 'TIMETZ': <TokenType.TIMETZ: 140>, 'TIME_NS': <TokenType.TIME_NS: 141>, 'TIMESTAMP': <TokenType.TIMESTAMP: 142>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 143>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 144>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 144>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 145>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 145>, 'DATE': <TokenType.DATE: 153>, 'DATETIME': <TokenType.DATETIME: 149>, 'INT4RANGE': <TokenType.INT4RANGE: 155>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 156>, 'INT8RANGE': <TokenType.INT8RANGE: 157>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 158>, 'NUMRANGE': <TokenType.NUMRANGE: 159>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 160>, 'TSRANGE': <TokenType.TSRANGE: 161>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 162>, 'TSTZRANGE': <TokenType.TSTZRANGE: 163>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 164>, 'DATERANGE': <TokenType.DATERANGE: 165>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 166>, 'UNIQUE': <TokenType.UNIQUE: 416>, 'VECTOR': <TokenType.VECTOR: 211>, 'STRUCT': <TokenType.STRUCT: 390>, 'SEQUENCE': <TokenType.SEQUENCE: 378>, 'VARIANT': <TokenType.VARIANT: 194>, 'ALTER': <TokenType.ALTER: 215>, 'ANALYZE': <TokenType.ANALYZE: 425>, 'CALL': <TokenType.COMMAND: 233>, 'COMMENT': <TokenType.COMMENT: 234>, 'EXPLAIN': <TokenType.COMMAND: 233>, 'GRANT': <TokenType.GRANT: 284>, 'REVOKE': <TokenType.REVOKE: 367>, 'OPTIMIZE': <TokenType.COMMAND: 233>, 'PREPARE': <TokenType.COMMAND: 233>, 'VACUUM': <TokenType.COMMAND: 233>, 'USER-DEFINED': <TokenType.USERDEFINED: 189>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 420>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 421>}
COMMANDS = {<TokenType.COMMAND: 233>, <TokenType.EXECUTE: 265>, <TokenType.RENAME: 364>, <TokenType.FETCH: 268>, <TokenType.SHOW: 382>}
COMMAND_PREFIX_TOKENS = {<TokenType.BEGIN: 225>, <TokenType.SEMICOLON: 18>}
NUMERIC_LITERALS: ClassVar[Dict[str, str]] = {}
COMMENTS = ['--', ('/*', '*/')]
dialect
def tokenize(self, sql: str) -> List[sqlglot.tokenizer_core.Token]:
570    def tokenize(self, sql: str) -> t.List[Token]:
571        """Returns a list of tokens corresponding to the SQL string `sql`."""
572        return self._core.tokenize(sql)  # type: ignore

Returns a list of tokens corresponding to the SQL string sql.

sql: str
574    @property
575    def sql(self) -> str:
576        """The SQL string being tokenized."""
577        return self._core.sql

The SQL string being tokenized.

size: int
579    @property
580    def size(self) -> int:
581        """Length of the SQL string."""
582        return self._core.size

Length of the SQL string.

tokens: List[sqlglot.tokenizer_core.Token]
584    @property
585    def tokens(self) -> t.List[Token]:
586        """The list of tokens produced by tokenization."""
587        return self._core.tokens

The list of tokens produced by tokenization.