# sqlglot.tokens
1from __future__ import annotations 2 3import threading 4import typing as t 5 6from sqlglot.trie import new_trie 7 8from sqlglot.tokenizer_core import Token, TokenizerCore, TokenType 9 10T = t.TypeVar("T") 11 12 13class ThreadLocalCache(threading.local): 14 """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.""" 15 16 def __init__(self) -> None: 17 self.cache: dict[type, t.Any] = {} 18 19 def get_or_build(self, key: type, build: t.Callable[[], T]) -> T: 20 if not (obj := self.cache.get(key)): 21 self.cache[key] = obj = build() 22 return obj 23 24 25try: 26 import sqlglotc # noqa: F401 27except ImportError: 28 pass 29 30try: 31 import sqlglotrs # type: ignore # noqa: F401 32 import warnings 33 34 if "sqlglotc" not in globals(): 35 warnings.warn( 36 "sqlglot[rs] is deprecated and no longer compatible with sqlglot. " 37 "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]", 38 ) 39except ImportError: 40 pass 41 42if t.TYPE_CHECKING: 43 from sqlglot.dialects.dialect import DialectType 44 45 46def _convert_quotes(arr: list[str | tuple[str, str]]) -> dict[str, str]: 47 return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr) 48 49 50def _quotes_to_format( 51 token_type: TokenType, arr: list[str | tuple[str, str]] 52) -> dict[str, tuple[str, TokenType]]: 53 return {k: (v, token_type) for k, v in _convert_quotes(arr).items()} 54 55 56class _TokenizerBase: 57 QUOTES: t.ClassVar[list[tuple[str, str] | str]] 58 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] 59 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 60 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 61 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 62 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 63 HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 64 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 65 STRING_ESCAPES: t.ClassVar[list[str]] 66 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] 67 
ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] 68 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] 69 HINT_START: t.ClassVar[str] 70 KEYWORDS: t.ClassVar[dict[str, TokenType]] 71 SINGLE_TOKENS: t.ClassVar[dict[str, TokenType]] 72 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] 73 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] 74 COMMANDS: t.ClassVar[set[TokenType]] 75 COMMAND_PREFIX_TOKENS: t.ClassVar[set[TokenType]] 76 HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool] 77 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool] 78 NESTED_COMMENTS: t.ClassVar[bool] 79 TOKENS_PRECEDING_HINT: t.ClassVar[set[TokenType]] 80 HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType] 81 COMMENTS: t.ClassVar[list[str | tuple[str, str]]] 82 _QUOTES: t.ClassVar[dict[str, str]] 83 _IDENTIFIERS: t.ClassVar[dict[str, str]] 84 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] 85 _STRING_ESCAPES: t.ClassVar[set[str]] 86 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] 87 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] 88 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] 89 _COMMENTS: t.ClassVar[dict[str, str | None]] 90 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] 91 92 @classmethod 93 def __init_subclass__(cls, **kwargs: t.Any) -> None: 94 super().__init_subclass__(**kwargs) 95 cls._QUOTES = _convert_quotes(cls.QUOTES) 96 cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS) 97 cls._FORMAT_STRINGS = { 98 **{ 99 p + s: (e, TokenType.NATIONAL_STRING) 100 for s, e in cls._QUOTES.items() 101 for p in ("n", "N") 102 }, 103 **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS), 104 **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS), 105 **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS), 106 **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS), 107 **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS), 108 **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS), 109 } 110 if "BYTE_STRING_ESCAPES" not in cls.__dict__: 111 cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy() 112 
cls._STRING_ESCAPES = set(cls.STRING_ESCAPES) 113 cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES) 114 cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS) 115 cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES) 116 cls._COMMENTS = { 117 **{c: None for c in cls.COMMENTS if isinstance(c, str)}, 118 **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)}, 119 "{#": "#}", # Ensure Jinja comments are tokenized correctly in all dialects 120 } 121 if cls.HINT_START in cls.KEYWORDS: 122 cls._COMMENTS[cls.HINT_START] = "*/" 123 cls._KEYWORD_TRIE = new_trie( 124 key.upper() 125 for key in ( 126 *cls.KEYWORDS, 127 *cls._COMMENTS, 128 *cls._QUOTES, 129 *cls._FORMAT_STRINGS, 130 ) 131 if " " in key or any(single in key for single in cls.SINGLE_TOKENS) 132 ) 133 134 135class Tokenizer(_TokenizerBase): 136 SINGLE_TOKENS = { 137 "(": TokenType.L_PAREN, 138 ")": TokenType.R_PAREN, 139 "[": TokenType.L_BRACKET, 140 "]": TokenType.R_BRACKET, 141 "{": TokenType.L_BRACE, 142 "}": TokenType.R_BRACE, 143 "&": TokenType.AMP, 144 "^": TokenType.CARET, 145 ":": TokenType.COLON, 146 ",": TokenType.COMMA, 147 ".": TokenType.DOT, 148 "-": TokenType.DASH, 149 "=": TokenType.EQ, 150 ">": TokenType.GT, 151 "<": TokenType.LT, 152 "%": TokenType.MOD, 153 "!": TokenType.NOT, 154 "|": TokenType.PIPE, 155 "+": TokenType.PLUS, 156 ";": TokenType.SEMICOLON, 157 "/": TokenType.SLASH, 158 "\\": TokenType.BACKSLASH, 159 "*": TokenType.STAR, 160 "~": TokenType.TILDE, 161 "?": TokenType.PLACEHOLDER, 162 "@": TokenType.PARAMETER, 163 "#": TokenType.HASH, 164 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 165 "'": TokenType.UNKNOWN, 166 "`": TokenType.UNKNOWN, 167 '"': TokenType.UNKNOWN, 168 } 169 170 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 171 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 172 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 173 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 174 
HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 175 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 176 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"'] 177 QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"] 178 STRING_ESCAPES: t.ClassVar[list[str]] = ["'"] 179 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = [] 180 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set() 181 ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = [] 182 183 # The strings in this list can always be used as escapes, regardless of the surrounding 184 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 185 # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x""" 186 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = [] 187 188 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 189 HEREDOC_TAG_IS_IDENTIFIER = False 190 191 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 192 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 193 194 # Whether string escape characters function as such when placed within raw strings 195 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 196 197 NESTED_COMMENTS = True 198 199 HINT_START = "/*+" 200 201 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 202 203 # Autofilled 204 _COMMENTS: t.ClassVar[dict[str, str | None]] = {} 205 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {} 206 _IDENTIFIERS: t.ClassVar[dict[str, str]] = {} 207 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set() 208 _QUOTES: t.ClassVar[dict[str, str]] = {} 209 _STRING_ESCAPES: t.ClassVar[set[str]] = set() 210 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set() 211 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {} 212 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set() 213 214 KEYWORDS: t.ClassVar[dict[str, TokenType]] = { 215 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in 
("", "+", "-")}, 216 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 217 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 218 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 219 HINT_START: TokenType.HINT, 220 "&<": TokenType.AMP_LT, 221 "&>": TokenType.AMP_GT, 222 "==": TokenType.EQ, 223 "::": TokenType.DCOLON, 224 "?::": TokenType.QDCOLON, 225 "||": TokenType.DPIPE, 226 "|>": TokenType.PIPE_GT, 227 ">=": TokenType.GTE, 228 "<=": TokenType.LTE, 229 "<>": TokenType.NEQ, 230 "!=": TokenType.NEQ, 231 ":=": TokenType.COLON_EQ, 232 "<=>": TokenType.NULLSAFE_EQ, 233 "->": TokenType.ARROW, 234 "->>": TokenType.DARROW, 235 "=>": TokenType.FARROW, 236 "#>": TokenType.HASH_ARROW, 237 "#>>": TokenType.DHASH_ARROW, 238 "<->": TokenType.LR_ARROW, 239 "&&": TokenType.DAMP, 240 "??": TokenType.DQMARK, 241 "~~~": TokenType.GLOB, 242 "~~": TokenType.LIKE, 243 "~~*": TokenType.ILIKE, 244 "~*": TokenType.IRLIKE, 245 "-|-": TokenType.ADJACENT, 246 "ALL": TokenType.ALL, 247 "AND": TokenType.AND, 248 "ANTI": TokenType.ANTI, 249 "ANY": TokenType.ANY, 250 "ASC": TokenType.ASC, 251 "AS": TokenType.ALIAS, 252 "ASOF": TokenType.ASOF, 253 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 254 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 255 "BEGIN": TokenType.BEGIN, 256 "BETWEEN": TokenType.BETWEEN, 257 "CACHE": TokenType.CACHE, 258 "UNCACHE": TokenType.UNCACHE, 259 "CASE": TokenType.CASE, 260 "CHARACTER SET": TokenType.CHARACTER_SET, 261 "CLUSTER BY": TokenType.CLUSTER_BY, 262 "COLLATE": TokenType.COLLATE, 263 "COLUMN": TokenType.COLUMN, 264 "COMMIT": TokenType.COMMIT, 265 "CONNECT BY": TokenType.CONNECT_BY, 266 "CONSTRAINT": TokenType.CONSTRAINT, 267 "COPY": TokenType.COPY, 268 "CREATE": TokenType.CREATE, 269 "CROSS": TokenType.CROSS, 270 "CUBE": TokenType.CUBE, 271 "CURRENT_DATE": TokenType.CURRENT_DATE, 272 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 273 "CURRENT_TIME": TokenType.CURRENT_TIME, 274 "CURRENT_TIMESTAMP": 
TokenType.CURRENT_TIMESTAMP, 275 "CURRENT_USER": TokenType.CURRENT_USER, 276 "CURRENT_CATALOG": TokenType.CURRENT_CATALOG, 277 "DATABASE": TokenType.DATABASE, 278 "DEFAULT": TokenType.DEFAULT, 279 "DELETE": TokenType.DELETE, 280 "DESC": TokenType.DESC, 281 "DESCRIBE": TokenType.DESCRIBE, 282 "DISTINCT": TokenType.DISTINCT, 283 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 284 "DIV": TokenType.DIV, 285 "DROP": TokenType.DROP, 286 "ELSE": TokenType.ELSE, 287 "END": TokenType.END, 288 "ENUM": TokenType.ENUM, 289 "ESCAPE": TokenType.ESCAPE, 290 "EXCEPT": TokenType.EXCEPT, 291 "EXECUTE": TokenType.EXECUTE, 292 "EXISTS": TokenType.EXISTS, 293 "FALSE": TokenType.FALSE, 294 "FETCH": TokenType.FETCH, 295 "FILTER": TokenType.FILTER, 296 "FILE": TokenType.FILE, 297 "FIRST": TokenType.FIRST, 298 "FULL": TokenType.FULL, 299 "FUNCTION": TokenType.FUNCTION, 300 "FOR": TokenType.FOR, 301 "FOREIGN KEY": TokenType.FOREIGN_KEY, 302 "FORMAT": TokenType.FORMAT, 303 "FROM": TokenType.FROM, 304 "GEOGRAPHY": TokenType.GEOGRAPHY, 305 "GEOMETRY": TokenType.GEOMETRY, 306 "GLOB": TokenType.GLOB, 307 "GROUP BY": TokenType.GROUP_BY, 308 "GROUPING SETS": TokenType.GROUPING_SETS, 309 "HAVING": TokenType.HAVING, 310 "ILIKE": TokenType.ILIKE, 311 "IN": TokenType.IN, 312 "INDEX": TokenType.INDEX, 313 "INET": TokenType.INET, 314 "INNER": TokenType.INNER, 315 "INSERT": TokenType.INSERT, 316 "INTERVAL": TokenType.INTERVAL, 317 "INTERSECT": TokenType.INTERSECT, 318 "INTO": TokenType.INTO, 319 "IS": TokenType.IS, 320 "ISNULL": TokenType.ISNULL, 321 "JOIN": TokenType.JOIN, 322 "KEEP": TokenType.KEEP, 323 "KILL": TokenType.KILL, 324 "LATERAL": TokenType.LATERAL, 325 "LEFT": TokenType.LEFT, 326 "LIKE": TokenType.LIKE, 327 "LIMIT": TokenType.LIMIT, 328 "LOAD": TokenType.LOAD, 329 "LOCALTIME": TokenType.LOCALTIME, 330 "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP, 331 "LOCK": TokenType.LOCK, 332 "MERGE": TokenType.MERGE, 333 "NAMESPACE": TokenType.NAMESPACE, 334 "NATURAL": TokenType.NATURAL, 335 "NEXT": 
TokenType.NEXT, 336 "NOT": TokenType.NOT, 337 "NOTNULL": TokenType.NOTNULL, 338 "NULL": TokenType.NULL, 339 "OBJECT": TokenType.OBJECT, 340 "OFFSET": TokenType.OFFSET, 341 "ON": TokenType.ON, 342 "OR": TokenType.OR, 343 "XOR": TokenType.XOR, 344 "ORDER BY": TokenType.ORDER_BY, 345 "ORDINALITY": TokenType.ORDINALITY, 346 "OUT": TokenType.OUT, 347 "OUTER": TokenType.OUTER, 348 "OVER": TokenType.OVER, 349 "OVERLAPS": TokenType.OVERLAPS, 350 "OVERWRITE": TokenType.OVERWRITE, 351 "PARTITION": TokenType.PARTITION, 352 "PARTITION BY": TokenType.PARTITION_BY, 353 "PARTITIONED BY": TokenType.PARTITION_BY, 354 "PARTITIONED_BY": TokenType.PARTITION_BY, 355 "PERCENT": TokenType.PERCENT, 356 "PIVOT": TokenType.PIVOT, 357 "PRAGMA": TokenType.PRAGMA, 358 "PRIMARY KEY": TokenType.PRIMARY_KEY, 359 "PROCEDURE": TokenType.PROCEDURE, 360 "OPERATOR": TokenType.OPERATOR, 361 "QUALIFY": TokenType.QUALIFY, 362 "RANGE": TokenType.RANGE, 363 "RECURSIVE": TokenType.RECURSIVE, 364 "REGEXP": TokenType.RLIKE, 365 "RENAME": TokenType.RENAME, 366 "REPLACE": TokenType.REPLACE, 367 "RETURNING": TokenType.RETURNING, 368 "REFERENCES": TokenType.REFERENCES, 369 "RIGHT": TokenType.RIGHT, 370 "RLIKE": TokenType.RLIKE, 371 "ROLLBACK": TokenType.ROLLBACK, 372 "ROLLUP": TokenType.ROLLUP, 373 "ROW": TokenType.ROW, 374 "ROWS": TokenType.ROWS, 375 "SCHEMA": TokenType.SCHEMA, 376 "SELECT": TokenType.SELECT, 377 "SEMI": TokenType.SEMI, 378 "SESSION": TokenType.SESSION, 379 "SESSION_USER": TokenType.SESSION_USER, 380 "SET": TokenType.SET, 381 "SETTINGS": TokenType.SETTINGS, 382 "SHOW": TokenType.SHOW, 383 "SIMILAR TO": TokenType.SIMILAR_TO, 384 "SOME": TokenType.SOME, 385 "SORT BY": TokenType.SORT_BY, 386 "SQL SECURITY": TokenType.SQL_SECURITY, 387 "START WITH": TokenType.START_WITH, 388 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 389 "TABLE": TokenType.TABLE, 390 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 391 "TEMP": TokenType.TEMPORARY, 392 "TEMPORARY": TokenType.TEMPORARY, 393 "THEN": TokenType.THEN, 394 "TRUE": 
TokenType.TRUE, 395 "TRUNCATE": TokenType.TRUNCATE, 396 "TRIGGER": TokenType.TRIGGER, 397 "UNION": TokenType.UNION, 398 "UNKNOWN": TokenType.UNKNOWN, 399 "UNNEST": TokenType.UNNEST, 400 "UNPIVOT": TokenType.UNPIVOT, 401 "UPDATE": TokenType.UPDATE, 402 "USE": TokenType.USE, 403 "USING": TokenType.USING, 404 "UUID": TokenType.UUID, 405 "VALUES": TokenType.VALUES, 406 "VIEW": TokenType.VIEW, 407 "VOLATILE": TokenType.VOLATILE, 408 "WHEN": TokenType.WHEN, 409 "WHERE": TokenType.WHERE, 410 "WINDOW": TokenType.WINDOW, 411 "WITH": TokenType.WITH, 412 "APPLY": TokenType.APPLY, 413 "ARRAY": TokenType.ARRAY, 414 "BIT": TokenType.BIT, 415 "BOOL": TokenType.BOOLEAN, 416 "BOOLEAN": TokenType.BOOLEAN, 417 "BYTE": TokenType.TINYINT, 418 "MEDIUMINT": TokenType.MEDIUMINT, 419 "INT1": TokenType.TINYINT, 420 "TINYINT": TokenType.TINYINT, 421 "INT16": TokenType.SMALLINT, 422 "SHORT": TokenType.SMALLINT, 423 "SMALLINT": TokenType.SMALLINT, 424 "HUGEINT": TokenType.INT128, 425 "UHUGEINT": TokenType.UINT128, 426 "INT2": TokenType.SMALLINT, 427 "INTEGER": TokenType.INT, 428 "INT": TokenType.INT, 429 "INT4": TokenType.INT, 430 "INT32": TokenType.INT, 431 "INT64": TokenType.BIGINT, 432 "INT128": TokenType.INT128, 433 "INT256": TokenType.INT256, 434 "LONG": TokenType.BIGINT, 435 "BIGINT": TokenType.BIGINT, 436 "INT8": TokenType.TINYINT, 437 "UINT": TokenType.UINT, 438 "UINT128": TokenType.UINT128, 439 "UINT256": TokenType.UINT256, 440 "DEC": TokenType.DECIMAL, 441 "DECIMAL": TokenType.DECIMAL, 442 "DECIMAL32": TokenType.DECIMAL32, 443 "DECIMAL64": TokenType.DECIMAL64, 444 "DECIMAL128": TokenType.DECIMAL128, 445 "DECIMAL256": TokenType.DECIMAL256, 446 "DECFLOAT": TokenType.DECFLOAT, 447 "BIGDECIMAL": TokenType.BIGDECIMAL, 448 "BIGNUMERIC": TokenType.BIGDECIMAL, 449 "BIGNUM": TokenType.BIGNUM, 450 "LIST": TokenType.LIST, 451 "MAP": TokenType.MAP, 452 "NULLABLE": TokenType.NULLABLE, 453 "NUMBER": TokenType.DECIMAL, 454 "NUMERIC": TokenType.DECIMAL, 455 "FIXED": TokenType.DECIMAL, 456 "REAL": 
TokenType.FLOAT, 457 "FLOAT": TokenType.FLOAT, 458 "FLOAT4": TokenType.FLOAT, 459 "FLOAT8": TokenType.DOUBLE, 460 "DOUBLE": TokenType.DOUBLE, 461 "DOUBLE PRECISION": TokenType.DOUBLE, 462 "JSON": TokenType.JSON, 463 "JSONB": TokenType.JSONB, 464 "CHAR": TokenType.CHAR, 465 "CHARACTER": TokenType.CHAR, 466 "CHAR VARYING": TokenType.VARCHAR, 467 "CHARACTER VARYING": TokenType.VARCHAR, 468 "NCHAR": TokenType.NCHAR, 469 "VARCHAR": TokenType.VARCHAR, 470 "VARCHAR2": TokenType.VARCHAR, 471 "NVARCHAR": TokenType.NVARCHAR, 472 "NVARCHAR2": TokenType.NVARCHAR, 473 "BPCHAR": TokenType.BPCHAR, 474 "STR": TokenType.TEXT, 475 "STRING": TokenType.TEXT, 476 "TEXT": TokenType.TEXT, 477 "LONGTEXT": TokenType.LONGTEXT, 478 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 479 "TINYTEXT": TokenType.TINYTEXT, 480 "CLOB": TokenType.TEXT, 481 "LONGVARCHAR": TokenType.TEXT, 482 "BINARY": TokenType.BINARY, 483 "BLOB": TokenType.VARBINARY, 484 "LONGBLOB": TokenType.LONGBLOB, 485 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 486 "TINYBLOB": TokenType.TINYBLOB, 487 "BYTEA": TokenType.VARBINARY, 488 "VARBINARY": TokenType.VARBINARY, 489 "TIME": TokenType.TIME, 490 "TIMETZ": TokenType.TIMETZ, 491 "TIME_NS": TokenType.TIME_NS, 492 "TIMESTAMP": TokenType.TIMESTAMP, 493 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 494 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 495 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 496 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 497 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 498 "DATE": TokenType.DATE, 499 "DATETIME": TokenType.DATETIME, 500 "INT4RANGE": TokenType.INT4RANGE, 501 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 502 "INT8RANGE": TokenType.INT8RANGE, 503 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 504 "NUMRANGE": TokenType.NUMRANGE, 505 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 506 "TSRANGE": TokenType.TSRANGE, 507 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 508 "TSTZRANGE": TokenType.TSTZRANGE, 509 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 510 "DATERANGE": TokenType.DATERANGE, 511 "DATEMULTIRANGE": 
TokenType.DATEMULTIRANGE, 512 "UNIQUE": TokenType.UNIQUE, 513 "VECTOR": TokenType.VECTOR, 514 "STRUCT": TokenType.STRUCT, 515 "SEQUENCE": TokenType.SEQUENCE, 516 "VARIANT": TokenType.VARIANT, 517 "ALTER": TokenType.ALTER, 518 "ANALYZE": TokenType.ANALYZE, 519 "CALL": TokenType.COMMAND, 520 "COMMENT": TokenType.COMMENT, 521 "EXPLAIN": TokenType.COMMAND, 522 "GRANT": TokenType.GRANT, 523 "REVOKE": TokenType.REVOKE, 524 "OPTIMIZE": TokenType.COMMAND, 525 "PREPARE": TokenType.COMMAND, 526 "VACUUM": TokenType.COMMAND, 527 "USER-DEFINED": TokenType.USERDEFINED, 528 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 529 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 530 } 531 532 COMMANDS = { 533 TokenType.COMMAND, 534 TokenType.EXECUTE, 535 TokenType.FETCH, 536 TokenType.SHOW, 537 TokenType.RENAME, 538 } 539 540 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 541 542 # Handle numeric literals like in hive (3L = BIGINT) 543 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {} 544 545 # In tokenizers like JSONPath, dots are always key separators, never decimal points 546 NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True 547 548 COMMENTS = ["--", ("/*", "*/")] 549 550 _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache() 551 552 __slots__ = ( 553 "dialect", 554 "_core", 555 ) 556 557 def __init__(self, dialect: DialectType = None) -> None: 558 from sqlglot.dialects.dialect import Dialect 559 560 self.dialect = Dialect.get_or_raise(dialect) 561 self._core = self._core_cache.get_or_build(type(self), self._init_core) 562 563 def _init_core(self) -> TokenizerCore: 564 return TokenizerCore( 565 single_tokens=self.SINGLE_TOKENS, 566 keywords=self.KEYWORDS, 567 quotes=self._QUOTES, 568 format_strings=self._FORMAT_STRINGS, 569 identifiers=self._IDENTIFIERS, 570 comments=self._COMMENTS, 571 string_escapes=self._STRING_ESCAPES, 572 byte_string_escapes=self._BYTE_STRING_ESCAPES, 573 identifier_escapes=self._IDENTIFIER_ESCAPES, 574 
escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 575 commands=self.COMMANDS, 576 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 577 nested_comments=self.NESTED_COMMENTS, 578 hint_start=self.HINT_START, 579 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 580 has_bit_strings=bool(self.BIT_STRINGS), 581 has_hex_strings=bool(self.HEX_STRINGS), 582 numeric_literals=self.NUMERIC_LITERALS, 583 var_single_tokens=self.VAR_SINGLE_TOKENS, 584 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 585 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 586 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 587 keyword_trie=self._KEYWORD_TRIE, 588 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 589 numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS, 590 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 591 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 592 ) 593 594 def tokenize(self, sql: str) -> list[Token]: 595 """Returns a list of tokens corresponding to the SQL string `sql`.""" 596 return self._core.tokenize(sql) # type: ignore 597 598 @property 599 def sql(self) -> str: 600 """The SQL string being tokenized.""" 601 return self._core.sql 602 603 @property 604 def size(self) -> int: 605 """Length of the SQL string.""" 606 return self._core.size 607 608 @property 609 def tokens(self) -> list[Token]: 610 """The list of tokens produced by tokenization.""" 611 return self._core.tokens
class
ThreadLocalCache(_thread._local):
14class ThreadLocalCache(threading.local): 15 """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.""" 16 17 def __init__(self) -> None: 18 self.cache: dict[type, t.Any] = {} 19 20 def get_or_build(self, key: type, build: t.Callable[[], T]) -> T: 21 if not (obj := self.cache.get(key)): 22 self.cache[key] = obj = build() 23 return obj
Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.
136class Tokenizer(_TokenizerBase): 137 SINGLE_TOKENS = { 138 "(": TokenType.L_PAREN, 139 ")": TokenType.R_PAREN, 140 "[": TokenType.L_BRACKET, 141 "]": TokenType.R_BRACKET, 142 "{": TokenType.L_BRACE, 143 "}": TokenType.R_BRACE, 144 "&": TokenType.AMP, 145 "^": TokenType.CARET, 146 ":": TokenType.COLON, 147 ",": TokenType.COMMA, 148 ".": TokenType.DOT, 149 "-": TokenType.DASH, 150 "=": TokenType.EQ, 151 ">": TokenType.GT, 152 "<": TokenType.LT, 153 "%": TokenType.MOD, 154 "!": TokenType.NOT, 155 "|": TokenType.PIPE, 156 "+": TokenType.PLUS, 157 ";": TokenType.SEMICOLON, 158 "/": TokenType.SLASH, 159 "\\": TokenType.BACKSLASH, 160 "*": TokenType.STAR, 161 "~": TokenType.TILDE, 162 "?": TokenType.PLACEHOLDER, 163 "@": TokenType.PARAMETER, 164 "#": TokenType.HASH, 165 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 166 "'": TokenType.UNKNOWN, 167 "`": TokenType.UNKNOWN, 168 '"': TokenType.UNKNOWN, 169 } 170 171 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 172 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 173 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 174 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 175 HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 176 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 177 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"'] 178 QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"] 179 STRING_ESCAPES: t.ClassVar[list[str]] = ["'"] 180 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = [] 181 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set() 182 ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = [] 183 184 # The strings in this list can always be used as escapes, regardless of the surrounding 185 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 186 # identifier escape, e.g. 
if we use double-quotes, then they also act as escapes: "x""" 187 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = [] 188 189 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 190 HEREDOC_TAG_IS_IDENTIFIER = False 191 192 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 193 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 194 195 # Whether string escape characters function as such when placed within raw strings 196 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 197 198 NESTED_COMMENTS = True 199 200 HINT_START = "/*+" 201 202 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 203 204 # Autofilled 205 _COMMENTS: t.ClassVar[dict[str, str | None]] = {} 206 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {} 207 _IDENTIFIERS: t.ClassVar[dict[str, str]] = {} 208 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set() 209 _QUOTES: t.ClassVar[dict[str, str]] = {} 210 _STRING_ESCAPES: t.ClassVar[set[str]] = set() 211 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set() 212 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {} 213 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set() 214 215 KEYWORDS: t.ClassVar[dict[str, TokenType]] = { 216 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 217 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 218 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 219 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 220 HINT_START: TokenType.HINT, 221 "&<": TokenType.AMP_LT, 222 "&>": TokenType.AMP_GT, 223 "==": TokenType.EQ, 224 "::": TokenType.DCOLON, 225 "?::": TokenType.QDCOLON, 226 "||": TokenType.DPIPE, 227 "|>": TokenType.PIPE_GT, 228 ">=": TokenType.GTE, 229 "<=": TokenType.LTE, 230 "<>": TokenType.NEQ, 231 "!=": TokenType.NEQ, 232 ":=": TokenType.COLON_EQ, 233 "<=>": TokenType.NULLSAFE_EQ, 234 "->": TokenType.ARROW, 235 "->>": 
TokenType.DARROW, 236 "=>": TokenType.FARROW, 237 "#>": TokenType.HASH_ARROW, 238 "#>>": TokenType.DHASH_ARROW, 239 "<->": TokenType.LR_ARROW, 240 "&&": TokenType.DAMP, 241 "??": TokenType.DQMARK, 242 "~~~": TokenType.GLOB, 243 "~~": TokenType.LIKE, 244 "~~*": TokenType.ILIKE, 245 "~*": TokenType.IRLIKE, 246 "-|-": TokenType.ADJACENT, 247 "ALL": TokenType.ALL, 248 "AND": TokenType.AND, 249 "ANTI": TokenType.ANTI, 250 "ANY": TokenType.ANY, 251 "ASC": TokenType.ASC, 252 "AS": TokenType.ALIAS, 253 "ASOF": TokenType.ASOF, 254 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 255 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 256 "BEGIN": TokenType.BEGIN, 257 "BETWEEN": TokenType.BETWEEN, 258 "CACHE": TokenType.CACHE, 259 "UNCACHE": TokenType.UNCACHE, 260 "CASE": TokenType.CASE, 261 "CHARACTER SET": TokenType.CHARACTER_SET, 262 "CLUSTER BY": TokenType.CLUSTER_BY, 263 "COLLATE": TokenType.COLLATE, 264 "COLUMN": TokenType.COLUMN, 265 "COMMIT": TokenType.COMMIT, 266 "CONNECT BY": TokenType.CONNECT_BY, 267 "CONSTRAINT": TokenType.CONSTRAINT, 268 "COPY": TokenType.COPY, 269 "CREATE": TokenType.CREATE, 270 "CROSS": TokenType.CROSS, 271 "CUBE": TokenType.CUBE, 272 "CURRENT_DATE": TokenType.CURRENT_DATE, 273 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 274 "CURRENT_TIME": TokenType.CURRENT_TIME, 275 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 276 "CURRENT_USER": TokenType.CURRENT_USER, 277 "CURRENT_CATALOG": TokenType.CURRENT_CATALOG, 278 "DATABASE": TokenType.DATABASE, 279 "DEFAULT": TokenType.DEFAULT, 280 "DELETE": TokenType.DELETE, 281 "DESC": TokenType.DESC, 282 "DESCRIBE": TokenType.DESCRIBE, 283 "DISTINCT": TokenType.DISTINCT, 284 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 285 "DIV": TokenType.DIV, 286 "DROP": TokenType.DROP, 287 "ELSE": TokenType.ELSE, 288 "END": TokenType.END, 289 "ENUM": TokenType.ENUM, 290 "ESCAPE": TokenType.ESCAPE, 291 "EXCEPT": TokenType.EXCEPT, 292 "EXECUTE": TokenType.EXECUTE, 293 "EXISTS": TokenType.EXISTS, 294 "FALSE": TokenType.FALSE, 295 "FETCH": 
TokenType.FETCH, 296 "FILTER": TokenType.FILTER, 297 "FILE": TokenType.FILE, 298 "FIRST": TokenType.FIRST, 299 "FULL": TokenType.FULL, 300 "FUNCTION": TokenType.FUNCTION, 301 "FOR": TokenType.FOR, 302 "FOREIGN KEY": TokenType.FOREIGN_KEY, 303 "FORMAT": TokenType.FORMAT, 304 "FROM": TokenType.FROM, 305 "GEOGRAPHY": TokenType.GEOGRAPHY, 306 "GEOMETRY": TokenType.GEOMETRY, 307 "GLOB": TokenType.GLOB, 308 "GROUP BY": TokenType.GROUP_BY, 309 "GROUPING SETS": TokenType.GROUPING_SETS, 310 "HAVING": TokenType.HAVING, 311 "ILIKE": TokenType.ILIKE, 312 "IN": TokenType.IN, 313 "INDEX": TokenType.INDEX, 314 "INET": TokenType.INET, 315 "INNER": TokenType.INNER, 316 "INSERT": TokenType.INSERT, 317 "INTERVAL": TokenType.INTERVAL, 318 "INTERSECT": TokenType.INTERSECT, 319 "INTO": TokenType.INTO, 320 "IS": TokenType.IS, 321 "ISNULL": TokenType.ISNULL, 322 "JOIN": TokenType.JOIN, 323 "KEEP": TokenType.KEEP, 324 "KILL": TokenType.KILL, 325 "LATERAL": TokenType.LATERAL, 326 "LEFT": TokenType.LEFT, 327 "LIKE": TokenType.LIKE, 328 "LIMIT": TokenType.LIMIT, 329 "LOAD": TokenType.LOAD, 330 "LOCALTIME": TokenType.LOCALTIME, 331 "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP, 332 "LOCK": TokenType.LOCK, 333 "MERGE": TokenType.MERGE, 334 "NAMESPACE": TokenType.NAMESPACE, 335 "NATURAL": TokenType.NATURAL, 336 "NEXT": TokenType.NEXT, 337 "NOT": TokenType.NOT, 338 "NOTNULL": TokenType.NOTNULL, 339 "NULL": TokenType.NULL, 340 "OBJECT": TokenType.OBJECT, 341 "OFFSET": TokenType.OFFSET, 342 "ON": TokenType.ON, 343 "OR": TokenType.OR, 344 "XOR": TokenType.XOR, 345 "ORDER BY": TokenType.ORDER_BY, 346 "ORDINALITY": TokenType.ORDINALITY, 347 "OUT": TokenType.OUT, 348 "OUTER": TokenType.OUTER, 349 "OVER": TokenType.OVER, 350 "OVERLAPS": TokenType.OVERLAPS, 351 "OVERWRITE": TokenType.OVERWRITE, 352 "PARTITION": TokenType.PARTITION, 353 "PARTITION BY": TokenType.PARTITION_BY, 354 "PARTITIONED BY": TokenType.PARTITION_BY, 355 "PARTITIONED_BY": TokenType.PARTITION_BY, 356 "PERCENT": TokenType.PERCENT, 357 
"PIVOT": TokenType.PIVOT, 358 "PRAGMA": TokenType.PRAGMA, 359 "PRIMARY KEY": TokenType.PRIMARY_KEY, 360 "PROCEDURE": TokenType.PROCEDURE, 361 "OPERATOR": TokenType.OPERATOR, 362 "QUALIFY": TokenType.QUALIFY, 363 "RANGE": TokenType.RANGE, 364 "RECURSIVE": TokenType.RECURSIVE, 365 "REGEXP": TokenType.RLIKE, 366 "RENAME": TokenType.RENAME, 367 "REPLACE": TokenType.REPLACE, 368 "RETURNING": TokenType.RETURNING, 369 "REFERENCES": TokenType.REFERENCES, 370 "RIGHT": TokenType.RIGHT, 371 "RLIKE": TokenType.RLIKE, 372 "ROLLBACK": TokenType.ROLLBACK, 373 "ROLLUP": TokenType.ROLLUP, 374 "ROW": TokenType.ROW, 375 "ROWS": TokenType.ROWS, 376 "SCHEMA": TokenType.SCHEMA, 377 "SELECT": TokenType.SELECT, 378 "SEMI": TokenType.SEMI, 379 "SESSION": TokenType.SESSION, 380 "SESSION_USER": TokenType.SESSION_USER, 381 "SET": TokenType.SET, 382 "SETTINGS": TokenType.SETTINGS, 383 "SHOW": TokenType.SHOW, 384 "SIMILAR TO": TokenType.SIMILAR_TO, 385 "SOME": TokenType.SOME, 386 "SORT BY": TokenType.SORT_BY, 387 "SQL SECURITY": TokenType.SQL_SECURITY, 388 "START WITH": TokenType.START_WITH, 389 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 390 "TABLE": TokenType.TABLE, 391 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 392 "TEMP": TokenType.TEMPORARY, 393 "TEMPORARY": TokenType.TEMPORARY, 394 "THEN": TokenType.THEN, 395 "TRUE": TokenType.TRUE, 396 "TRUNCATE": TokenType.TRUNCATE, 397 "TRIGGER": TokenType.TRIGGER, 398 "UNION": TokenType.UNION, 399 "UNKNOWN": TokenType.UNKNOWN, 400 "UNNEST": TokenType.UNNEST, 401 "UNPIVOT": TokenType.UNPIVOT, 402 "UPDATE": TokenType.UPDATE, 403 "USE": TokenType.USE, 404 "USING": TokenType.USING, 405 "UUID": TokenType.UUID, 406 "VALUES": TokenType.VALUES, 407 "VIEW": TokenType.VIEW, 408 "VOLATILE": TokenType.VOLATILE, 409 "WHEN": TokenType.WHEN, 410 "WHERE": TokenType.WHERE, 411 "WINDOW": TokenType.WINDOW, 412 "WITH": TokenType.WITH, 413 "APPLY": TokenType.APPLY, 414 "ARRAY": TokenType.ARRAY, 415 "BIT": TokenType.BIT, 416 "BOOL": TokenType.BOOLEAN, 417 "BOOLEAN": 
TokenType.BOOLEAN, 418 "BYTE": TokenType.TINYINT, 419 "MEDIUMINT": TokenType.MEDIUMINT, 420 "INT1": TokenType.TINYINT, 421 "TINYINT": TokenType.TINYINT, 422 "INT16": TokenType.SMALLINT, 423 "SHORT": TokenType.SMALLINT, 424 "SMALLINT": TokenType.SMALLINT, 425 "HUGEINT": TokenType.INT128, 426 "UHUGEINT": TokenType.UINT128, 427 "INT2": TokenType.SMALLINT, 428 "INTEGER": TokenType.INT, 429 "INT": TokenType.INT, 430 "INT4": TokenType.INT, 431 "INT32": TokenType.INT, 432 "INT64": TokenType.BIGINT, 433 "INT128": TokenType.INT128, 434 "INT256": TokenType.INT256, 435 "LONG": TokenType.BIGINT, 436 "BIGINT": TokenType.BIGINT, 437 "INT8": TokenType.TINYINT, 438 "UINT": TokenType.UINT, 439 "UINT128": TokenType.UINT128, 440 "UINT256": TokenType.UINT256, 441 "DEC": TokenType.DECIMAL, 442 "DECIMAL": TokenType.DECIMAL, 443 "DECIMAL32": TokenType.DECIMAL32, 444 "DECIMAL64": TokenType.DECIMAL64, 445 "DECIMAL128": TokenType.DECIMAL128, 446 "DECIMAL256": TokenType.DECIMAL256, 447 "DECFLOAT": TokenType.DECFLOAT, 448 "BIGDECIMAL": TokenType.BIGDECIMAL, 449 "BIGNUMERIC": TokenType.BIGDECIMAL, 450 "BIGNUM": TokenType.BIGNUM, 451 "LIST": TokenType.LIST, 452 "MAP": TokenType.MAP, 453 "NULLABLE": TokenType.NULLABLE, 454 "NUMBER": TokenType.DECIMAL, 455 "NUMERIC": TokenType.DECIMAL, 456 "FIXED": TokenType.DECIMAL, 457 "REAL": TokenType.FLOAT, 458 "FLOAT": TokenType.FLOAT, 459 "FLOAT4": TokenType.FLOAT, 460 "FLOAT8": TokenType.DOUBLE, 461 "DOUBLE": TokenType.DOUBLE, 462 "DOUBLE PRECISION": TokenType.DOUBLE, 463 "JSON": TokenType.JSON, 464 "JSONB": TokenType.JSONB, 465 "CHAR": TokenType.CHAR, 466 "CHARACTER": TokenType.CHAR, 467 "CHAR VARYING": TokenType.VARCHAR, 468 "CHARACTER VARYING": TokenType.VARCHAR, 469 "NCHAR": TokenType.NCHAR, 470 "VARCHAR": TokenType.VARCHAR, 471 "VARCHAR2": TokenType.VARCHAR, 472 "NVARCHAR": TokenType.NVARCHAR, 473 "NVARCHAR2": TokenType.NVARCHAR, 474 "BPCHAR": TokenType.BPCHAR, 475 "STR": TokenType.TEXT, 476 "STRING": TokenType.TEXT, 477 "TEXT": TokenType.TEXT, 478 
"LONGTEXT": TokenType.LONGTEXT, 479 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 480 "TINYTEXT": TokenType.TINYTEXT, 481 "CLOB": TokenType.TEXT, 482 "LONGVARCHAR": TokenType.TEXT, 483 "BINARY": TokenType.BINARY, 484 "BLOB": TokenType.VARBINARY, 485 "LONGBLOB": TokenType.LONGBLOB, 486 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 487 "TINYBLOB": TokenType.TINYBLOB, 488 "BYTEA": TokenType.VARBINARY, 489 "VARBINARY": TokenType.VARBINARY, 490 "TIME": TokenType.TIME, 491 "TIMETZ": TokenType.TIMETZ, 492 "TIME_NS": TokenType.TIME_NS, 493 "TIMESTAMP": TokenType.TIMESTAMP, 494 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 495 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 496 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 497 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 498 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 499 "DATE": TokenType.DATE, 500 "DATETIME": TokenType.DATETIME, 501 "INT4RANGE": TokenType.INT4RANGE, 502 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 503 "INT8RANGE": TokenType.INT8RANGE, 504 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 505 "NUMRANGE": TokenType.NUMRANGE, 506 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 507 "TSRANGE": TokenType.TSRANGE, 508 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 509 "TSTZRANGE": TokenType.TSTZRANGE, 510 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 511 "DATERANGE": TokenType.DATERANGE, 512 "DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 513 "UNIQUE": TokenType.UNIQUE, 514 "VECTOR": TokenType.VECTOR, 515 "STRUCT": TokenType.STRUCT, 516 "SEQUENCE": TokenType.SEQUENCE, 517 "VARIANT": TokenType.VARIANT, 518 "ALTER": TokenType.ALTER, 519 "ANALYZE": TokenType.ANALYZE, 520 "CALL": TokenType.COMMAND, 521 "COMMENT": TokenType.COMMENT, 522 "EXPLAIN": TokenType.COMMAND, 523 "GRANT": TokenType.GRANT, 524 "REVOKE": TokenType.REVOKE, 525 "OPTIMIZE": TokenType.COMMAND, 526 "PREPARE": TokenType.COMMAND, 527 "VACUUM": TokenType.COMMAND, 528 "USER-DEFINED": TokenType.USERDEFINED, 529 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 530 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 531 } 532 533 COMMANDS = 
{ 534 TokenType.COMMAND, 535 TokenType.EXECUTE, 536 TokenType.FETCH, 537 TokenType.SHOW, 538 TokenType.RENAME, 539 } 540 541 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 542 543 # Handle numeric literals like in hive (3L = BIGINT) 544 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {} 545 546 # In tokenizers like JSONPath, dots are always key separators, never decimal points 547 NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True 548 549 COMMENTS = ["--", ("/*", "*/")] 550 551 _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache() 552 553 __slots__ = ( 554 "dialect", 555 "_core", 556 ) 557 558 def __init__(self, dialect: DialectType = None) -> None: 559 from sqlglot.dialects.dialect import Dialect 560 561 self.dialect = Dialect.get_or_raise(dialect) 562 self._core = self._core_cache.get_or_build(type(self), self._init_core) 563 564 def _init_core(self) -> TokenizerCore: 565 return TokenizerCore( 566 single_tokens=self.SINGLE_TOKENS, 567 keywords=self.KEYWORDS, 568 quotes=self._QUOTES, 569 format_strings=self._FORMAT_STRINGS, 570 identifiers=self._IDENTIFIERS, 571 comments=self._COMMENTS, 572 string_escapes=self._STRING_ESCAPES, 573 byte_string_escapes=self._BYTE_STRING_ESCAPES, 574 identifier_escapes=self._IDENTIFIER_ESCAPES, 575 escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 576 commands=self.COMMANDS, 577 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 578 nested_comments=self.NESTED_COMMENTS, 579 hint_start=self.HINT_START, 580 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 581 has_bit_strings=bool(self.BIT_STRINGS), 582 has_hex_strings=bool(self.HEX_STRINGS), 583 numeric_literals=self.NUMERIC_LITERALS, 584 var_single_tokens=self.VAR_SINGLE_TOKENS, 585 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 586 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 587 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 588 keyword_trie=self._KEYWORD_TRIE, 589 
numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 590 numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS, 591 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 592 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 593 ) 594 595 def tokenize(self, sql: str) -> list[Token]: 596 """Returns a list of tokens corresponding to the SQL string `sql`.""" 597 return self._core.tokenize(sql) # type: ignore 598 599 @property 600 def sql(self) -> str: 601 """The SQL string being tokenized.""" 602 return self._core.sql 603 604 @property 605 def size(self) -> int: 606 """Length of the SQL string.""" 607 return self._core.size 608 609 @property 610 def tokens(self) -> list[Token]: 611 """The list of tokens produced by tokenization.""" 612 return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
SINGLE_TOKENS =
{'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 36>, '^': <TokenType.CARET: 42>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 28>, '>': <TokenType.GT: 25>, '<': <TokenType.LT: 23>, '%': <TokenType.MOD: 327>, '!': <TokenType.NOT: 27>, '|': <TokenType.PIPE: 39>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 19>, '/': <TokenType.SLASH: 22>, '\\': <TokenType.BACKSLASH: 21>, '*': <TokenType.STAR: 20>, '~': <TokenType.TILDE: 44>, '?': <TokenType.PLACEHOLDER: 354>, '@': <TokenType.PARAMETER: 56>, '#': <TokenType.HASH: 48>, "'": <TokenType.UNKNOWN: 212>, '`': <TokenType.UNKNOWN: 212>, '"': <TokenType.UNKNOWN: 212>}
TOKENS_PRECEDING_HINT =
{<TokenType.SELECT: 384>, <TokenType.INSERT: 298>, <TokenType.DELETE: 255>, <TokenType.UPDATE: 415>}
KEYWORDS: ClassVar[dict[str, sqlglot.tokenizer_core.TokenType]] =
{'{%': <TokenType.BLOCK_START: 71>, '{%+': <TokenType.BLOCK_START: 71>, '{%-': <TokenType.BLOCK_START: 71>, '%}': <TokenType.BLOCK_END: 72>, '+%}': <TokenType.BLOCK_END: 72>, '-%}': <TokenType.BLOCK_END: 72>, '{{+': <TokenType.BLOCK_START: 71>, '{{-': <TokenType.BLOCK_START: 71>, '+}}': <TokenType.BLOCK_END: 72>, '-}}': <TokenType.BLOCK_END: 72>, '/*+': <TokenType.HINT: 291>, '&<': <TokenType.AMP_LT: 61>, '&>': <TokenType.AMP_GT: 62>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 367>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '&&': <TokenType.DAMP: 60>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 285>, '~~': <TokenType.LIKE: 316>, '~~*': <TokenType.ILIKE: 293>, '~*': <TokenType.IRLIKE: 305>, '-|-': <TokenType.ADJACENT: 63>, 'ALL': <TokenType.ALL: 218>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 219>, 'ANY': <TokenType.ANY: 220>, 'ASC': <TokenType.ASC: 223>, 'AS': <TokenType.ALIAS: 216>, 'ASOF': <TokenType.ASOF: 224>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'BEGIN': <TokenType.BEGIN: 227>, 'BETWEEN': <TokenType.BETWEEN: 228>, 'CACHE': <TokenType.CACHE: 230>, 'UNCACHE': <TokenType.UNCACHE: 411>, 'CASE': <TokenType.CASE: 231>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 232>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 233>, 'COLLATE': <TokenType.COLLATE: 234>, 'COLUMN': <TokenType.COLUMN: 79>, 'COMMIT': <TokenType.COMMIT: 237>, 'CONNECT BY': <TokenType.CONNECT_BY: 238>, 'CONSTRAINT': <TokenType.CONSTRAINT: 239>, 'COPY': <TokenType.COPY: 240>, 'CREATE': <TokenType.CREATE: 241>, 
'CROSS': <TokenType.CROSS: 242>, 'CUBE': <TokenType.CUBE: 243>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 244>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 246>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 247>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 248>, 'CURRENT_USER': <TokenType.CURRENT_USER: 249>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 252>, 'DATABASE': <TokenType.DATABASE: 78>, 'DEFAULT': <TokenType.DEFAULT: 254>, 'DELETE': <TokenType.DELETE: 255>, 'DESC': <TokenType.DESC: 256>, 'DESCRIBE': <TokenType.DESCRIBE: 257>, 'DISTINCT': <TokenType.DISTINCT: 260>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 261>, 'DIV': <TokenType.DIV: 262>, 'DROP': <TokenType.DROP: 263>, 'ELSE': <TokenType.ELSE: 264>, 'END': <TokenType.END: 265>, 'ENUM': <TokenType.ENUM: 203>, 'ESCAPE': <TokenType.ESCAPE: 266>, 'EXCEPT': <TokenType.EXCEPT: 267>, 'EXECUTE': <TokenType.EXECUTE: 268>, 'EXISTS': <TokenType.EXISTS: 269>, 'FALSE': <TokenType.FALSE: 270>, 'FETCH': <TokenType.FETCH: 271>, 'FILTER': <TokenType.FILTER: 274>, 'FILE': <TokenType.FILE: 272>, 'FIRST': <TokenType.FIRST: 276>, 'FULL': <TokenType.FULL: 282>, 'FUNCTION': <TokenType.FUNCTION: 283>, 'FOR': <TokenType.FOR: 277>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 279>, 'FORMAT': <TokenType.FORMAT: 280>, 'FROM': <TokenType.FROM: 281>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 170>, 'GEOMETRY': <TokenType.GEOMETRY: 173>, 'GLOB': <TokenType.GLOB: 285>, 'GROUP BY': <TokenType.GROUP_BY: 288>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 289>, 'HAVING': <TokenType.HAVING: 290>, 'ILIKE': <TokenType.ILIKE: 293>, 'IN': <TokenType.IN: 294>, 'INDEX': <TokenType.INDEX: 295>, 'INET': <TokenType.INET: 198>, 'INNER': <TokenType.INNER: 297>, 'INSERT': <TokenType.INSERT: 298>, 'INTERVAL': <TokenType.INTERVAL: 302>, 'INTERSECT': <TokenType.INTERSECT: 301>, 'INTO': <TokenType.INTO: 303>, 'IS': <TokenType.IS: 306>, 'ISNULL': <TokenType.ISNULL: 307>, 'JOIN': <TokenType.JOIN: 308>, 'KEEP': <TokenType.KEEP: 310>, 'KILL': <TokenType.KILL: 312>, 
'LATERAL': <TokenType.LATERAL: 314>, 'LEFT': <TokenType.LEFT: 315>, 'LIKE': <TokenType.LIKE: 316>, 'LIMIT': <TokenType.LIMIT: 317>, 'LOAD': <TokenType.LOAD: 319>, 'LOCALTIME': <TokenType.LOCALTIME: 177>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 178>, 'LOCK': <TokenType.LOCK: 320>, 'MERGE': <TokenType.MERGE: 326>, 'NAMESPACE': <TokenType.NAMESPACE: 438>, 'NATURAL': <TokenType.NATURAL: 329>, 'NEXT': <TokenType.NEXT: 330>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 332>, 'NULL': <TokenType.NULL: 333>, 'OBJECT': <TokenType.OBJECT: 197>, 'OFFSET': <TokenType.OFFSET: 335>, 'ON': <TokenType.ON: 336>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 64>, 'ORDER BY': <TokenType.ORDER_BY: 339>, 'ORDINALITY': <TokenType.ORDINALITY: 342>, 'OUT': <TokenType.OUT: 343>, 'OUTER': <TokenType.OUTER: 345>, 'OVER': <TokenType.OVER: 346>, 'OVERLAPS': <TokenType.OVERLAPS: 347>, 'OVERWRITE': <TokenType.OVERWRITE: 348>, 'PARTITION': <TokenType.PARTITION: 350>, 'PARTITION BY': <TokenType.PARTITION_BY: 351>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 351>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 351>, 'PERCENT': <TokenType.PERCENT: 352>, 'PIVOT': <TokenType.PIVOT: 353>, 'PRAGMA': <TokenType.PRAGMA: 358>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 360>, 'PROCEDURE': <TokenType.PROCEDURE: 361>, 'OPERATOR': <TokenType.OPERATOR: 338>, 'QUALIFY': <TokenType.QUALIFY: 365>, 'RANGE': <TokenType.RANGE: 368>, 'RECURSIVE': <TokenType.RECURSIVE: 369>, 'REGEXP': <TokenType.RLIKE: 377>, 'RENAME': <TokenType.RENAME: 371>, 'REPLACE': <TokenType.REPLACE: 372>, 'RETURNING': <TokenType.RETURNING: 373>, 'REFERENCES': <TokenType.REFERENCES: 375>, 'RIGHT': <TokenType.RIGHT: 376>, 'RLIKE': <TokenType.RLIKE: 377>, 'ROLLBACK': <TokenType.ROLLBACK: 379>, 'ROLLUP': <TokenType.ROLLUP: 380>, 'ROW': <TokenType.ROW: 381>, 'ROWS': <TokenType.ROWS: 382>, 'SCHEMA': <TokenType.SCHEMA: 81>, 'SELECT': <TokenType.SELECT: 384>, 'SEMI': <TokenType.SEMI: 385>, 'SESSION': <TokenType.SESSION: 57>, 'SESSION_USER': 
<TokenType.SESSION_USER: 59>, 'SET': <TokenType.SET: 389>, 'SETTINGS': <TokenType.SETTINGS: 390>, 'SHOW': <TokenType.SHOW: 391>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 392>, 'SOME': <TokenType.SOME: 393>, 'SORT BY': <TokenType.SORT_BY: 394>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 396>, 'START WITH': <TokenType.START_WITH: 397>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 399>, 'TABLE': <TokenType.TABLE: 82>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 402>, 'TEMP': <TokenType.TEMPORARY: 404>, 'TEMPORARY': <TokenType.TEMPORARY: 404>, 'THEN': <TokenType.THEN: 406>, 'TRUE': <TokenType.TRUE: 407>, 'TRUNCATE': <TokenType.TRUNCATE: 408>, 'TRIGGER': <TokenType.TRIGGER: 409>, 'UNION': <TokenType.UNION: 412>, 'UNKNOWN': <TokenType.UNKNOWN: 212>, 'UNNEST': <TokenType.UNNEST: 413>, 'UNPIVOT': <TokenType.UNPIVOT: 414>, 'UPDATE': <TokenType.UPDATE: 415>, 'USE': <TokenType.USE: 416>, 'USING': <TokenType.USING: 417>, 'UUID': <TokenType.UUID: 169>, 'VALUES': <TokenType.VALUES: 418>, 'VIEW': <TokenType.VIEW: 420>, 'VOLATILE': <TokenType.VOLATILE: 422>, 'WHEN': <TokenType.WHEN: 424>, 'WHERE': <TokenType.WHERE: 425>, 'WINDOW': <TokenType.WINDOW: 426>, 'WITH': <TokenType.WITH: 427>, 'APPLY': <TokenType.APPLY: 221>, 'ARRAY': <TokenType.ARRAY: 222>, 'BIT': <TokenType.BIT: 95>, 'BOOL': <TokenType.BOOLEAN: 96>, 'BOOLEAN': <TokenType.BOOLEAN: 96>, 'BYTE': <TokenType.TINYINT: 97>, 'MEDIUMINT': <TokenType.MEDIUMINT: 101>, 'INT1': <TokenType.TINYINT: 97>, 'TINYINT': <TokenType.TINYINT: 97>, 'INT16': <TokenType.SMALLINT: 99>, 'SHORT': <TokenType.SMALLINT: 99>, 'SMALLINT': <TokenType.SMALLINT: 99>, 'HUGEINT': <TokenType.INT128: 108>, 'UHUGEINT': <TokenType.UINT128: 109>, 'INT2': <TokenType.SMALLINT: 99>, 'INTEGER': <TokenType.INT: 103>, 'INT': <TokenType.INT: 103>, 'INT4': <TokenType.INT: 103>, 'INT32': <TokenType.INT: 103>, 'INT64': <TokenType.BIGINT: 105>, 'INT128': <TokenType.INT128: 108>, 'INT256': <TokenType.INT256: 110>, 'LONG': <TokenType.BIGINT: 105>, 'BIGINT': <TokenType.BIGINT: 105>, 
'INT8': <TokenType.TINYINT: 97>, 'UINT': <TokenType.UINT: 104>, 'UINT128': <TokenType.UINT128: 109>, 'UINT256': <TokenType.UINT256: 111>, 'DEC': <TokenType.DECIMAL: 115>, 'DECIMAL': <TokenType.DECIMAL: 115>, 'DECIMAL32': <TokenType.DECIMAL32: 116>, 'DECIMAL64': <TokenType.DECIMAL64: 117>, 'DECIMAL128': <TokenType.DECIMAL128: 118>, 'DECIMAL256': <TokenType.DECIMAL256: 119>, 'DECFLOAT': <TokenType.DECFLOAT: 120>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 122>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 122>, 'BIGNUM': <TokenType.BIGNUM: 107>, 'LIST': <TokenType.LIST: 318>, 'MAP': <TokenType.MAP: 321>, 'NULLABLE': <TokenType.NULLABLE: 172>, 'NUMBER': <TokenType.DECIMAL: 115>, 'NUMERIC': <TokenType.DECIMAL: 115>, 'FIXED': <TokenType.DECIMAL: 115>, 'REAL': <TokenType.FLOAT: 112>, 'FLOAT': <TokenType.FLOAT: 112>, 'FLOAT4': <TokenType.FLOAT: 112>, 'FLOAT8': <TokenType.DOUBLE: 113>, 'DOUBLE': <TokenType.DOUBLE: 113>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 113>, 'JSON': <TokenType.JSON: 139>, 'JSONB': <TokenType.JSONB: 140>, 'CHAR': <TokenType.CHAR: 123>, 'CHARACTER': <TokenType.CHAR: 123>, 'CHAR VARYING': <TokenType.VARCHAR: 125>, 'CHARACTER VARYING': <TokenType.VARCHAR: 125>, 'NCHAR': <TokenType.NCHAR: 124>, 'VARCHAR': <TokenType.VARCHAR: 125>, 'VARCHAR2': <TokenType.VARCHAR: 125>, 'NVARCHAR': <TokenType.NVARCHAR: 126>, 'NVARCHAR2': <TokenType.NVARCHAR: 126>, 'BPCHAR': <TokenType.BPCHAR: 127>, 'STR': <TokenType.TEXT: 128>, 'STRING': <TokenType.TEXT: 128>, 'TEXT': <TokenType.TEXT: 128>, 'LONGTEXT': <TokenType.LONGTEXT: 130>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 129>, 'TINYTEXT': <TokenType.TINYTEXT: 135>, 'CLOB': <TokenType.TEXT: 128>, 'LONGVARCHAR': <TokenType.TEXT: 128>, 'BINARY': <TokenType.BINARY: 137>, 'BLOB': <TokenType.VARBINARY: 138>, 'LONGBLOB': <TokenType.LONGBLOB: 133>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 132>, 'TINYBLOB': <TokenType.TINYBLOB: 134>, 'BYTEA': <TokenType.VARBINARY: 138>, 'VARBINARY': <TokenType.VARBINARY: 138>, 'TIME': <TokenType.TIME: 141>, 'TIMETZ': 
<TokenType.TIMETZ: 142>, 'TIME_NS': <TokenType.TIME_NS: 143>, 'TIMESTAMP': <TokenType.TIMESTAMP: 144>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 145>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 147>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 147>, 'DATE': <TokenType.DATE: 155>, 'DATETIME': <TokenType.DATETIME: 151>, 'INT4RANGE': <TokenType.INT4RANGE: 157>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 158>, 'INT8RANGE': <TokenType.INT8RANGE: 159>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 160>, 'NUMRANGE': <TokenType.NUMRANGE: 161>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 162>, 'TSRANGE': <TokenType.TSRANGE: 163>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 164>, 'TSTZRANGE': <TokenType.TSTZRANGE: 165>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 166>, 'DATERANGE': <TokenType.DATERANGE: 167>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 168>, 'UNIQUE': <TokenType.UNIQUE: 428>, 'VECTOR': <TokenType.VECTOR: 213>, 'STRUCT': <TokenType.STRUCT: 400>, 'SEQUENCE': <TokenType.SEQUENCE: 387>, 'VARIANT': <TokenType.VARIANT: 196>, 'ALTER': <TokenType.ALTER: 217>, 'ANALYZE': <TokenType.ANALYZE: 437>, 'CALL': <TokenType.COMMAND: 235>, 'COMMENT': <TokenType.COMMENT: 236>, 'EXPLAIN': <TokenType.COMMAND: 235>, 'GRANT': <TokenType.GRANT: 287>, 'REVOKE': <TokenType.REVOKE: 374>, 'OPTIMIZE': <TokenType.COMMAND: 235>, 'PREPARE': <TokenType.COMMAND: 235>, 'VACUUM': <TokenType.COMMAND: 235>, 'USER-DEFINED': <TokenType.USERDEFINED: 191>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 432>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 433>}
COMMANDS =
{<TokenType.SHOW: 391>, <TokenType.COMMAND: 235>, <TokenType.EXECUTE: 268>, <TokenType.FETCH: 271>, <TokenType.RENAME: 371>}
595 def tokenize(self, sql: str) -> list[Token]: 596 """Returns a list of tokens corresponding to the SQL string `sql`.""" 597 return self._core.tokenize(sql) # type: ignore
Returns a list of tokens corresponding to the SQL string sql.
sql: str
599 @property 600 def sql(self) -> str: 601 """The SQL string being tokenized.""" 602 return self._core.sql
The SQL string being tokenized.
size: int
604 @property 605 def size(self) -> int: 606 """Length of the SQL string.""" 607 return self._core.size
Length of the SQL string.
tokens: list[sqlglot.tokenizer_core.Token]
609 @property 610 def tokens(self) -> list[Token]: 611 """The list of tokens produced by tokenization.""" 612 return self._core.tokens
The list of tokens produced by tokenization.