sqlglot.tokens
1from __future__ import annotations 2 3import threading 4import typing as t 5 6from sqlglot.trie import new_trie 7 8from sqlglot.tokenizer_core import Token, TokenizerCore, TokenType 9 10T = t.TypeVar("T") 11 12 13class ThreadLocalCache(threading.local): 14 """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.""" 15 16 def __init__(self) -> None: 17 self.cache: dict[type, t.Any] = {} 18 19 def get_or_build(self, key: type, build: t.Callable[[], T]) -> T: 20 if not (obj := self.cache.get(key)): 21 self.cache[key] = obj = build() 22 return obj 23 24 25try: 26 import sqlglotc # noqa: F401 27except ImportError: 28 pass 29 30try: 31 import sqlglotrs # type: ignore # noqa: F401 32 import warnings 33 34 if "sqlglotc" not in globals(): 35 warnings.warn( 36 "sqlglot[rs] is deprecated and no longer compatible with sqlglot. " 37 "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]", 38 ) 39except ImportError: 40 pass 41 42if t.TYPE_CHECKING: 43 from sqlglot.dialects.dialect import DialectType 44 45 46def _convert_quotes(arr: list[str | tuple[str, str]]) -> dict[str, str]: 47 return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr) 48 49 50def _quotes_to_format( 51 token_type: TokenType, arr: list[str | tuple[str, str]] 52) -> dict[str, tuple[str, TokenType]]: 53 return {k: (v, token_type) for k, v in _convert_quotes(arr).items()} 54 55 56class _TokenizerBase: 57 QUOTES: t.ClassVar[list[tuple[str, str] | str]] 58 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] 59 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 60 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 61 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 62 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 63 HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 64 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] 65 STRING_ESCAPES: t.ClassVar[list[str]] 66 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] 67 
ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] 68 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] 69 HINT_START: t.ClassVar[str] 70 KEYWORDS: t.ClassVar[dict[str, TokenType]] 71 SINGLE_TOKENS: t.ClassVar[dict[str, TokenType]] 72 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] 73 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] 74 COMMANDS: t.ClassVar[set[TokenType]] 75 COMMAND_PREFIX_TOKENS: t.ClassVar[set[TokenType]] 76 HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool] 77 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool] 78 NESTED_COMMENTS: t.ClassVar[bool] 79 TOKENS_PRECEDING_HINT: t.ClassVar[set[TokenType]] 80 HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType] 81 COMMENTS: t.ClassVar[list[str | tuple[str, str]]] 82 _QUOTES: t.ClassVar[dict[str, str]] 83 _IDENTIFIERS: t.ClassVar[dict[str, str]] 84 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] 85 _STRING_ESCAPES: t.ClassVar[set[str]] 86 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] 87 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] 88 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] 89 _COMMENTS: t.ClassVar[dict[str, str | None]] 90 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] 91 92 @classmethod 93 def __init_subclass__(cls, **kwargs: t.Any) -> None: 94 super().__init_subclass__(**kwargs) 95 cls._QUOTES = _convert_quotes(cls.QUOTES) 96 cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS) 97 cls._FORMAT_STRINGS = { 98 **{ 99 p + s: (e, TokenType.NATIONAL_STRING) 100 for s, e in cls._QUOTES.items() 101 for p in ("n", "N") 102 }, 103 **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS), 104 **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS), 105 **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS), 106 **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS), 107 **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS), 108 **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS), 109 } 110 if "BYTE_STRING_ESCAPES" not in cls.__dict__: 111 cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy() 112 
cls._STRING_ESCAPES = set(cls.STRING_ESCAPES) 113 cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES) 114 cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS) 115 cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES) 116 cls._COMMENTS = { 117 **{c: None for c in cls.COMMENTS if isinstance(c, str)}, 118 **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)}, 119 "{#": "#}", # Ensure Jinja comments are tokenized correctly in all dialects 120 } 121 if cls.HINT_START in cls.KEYWORDS: 122 cls._COMMENTS[cls.HINT_START] = "*/" 123 cls._KEYWORD_TRIE = new_trie( 124 key.upper() 125 for key in ( 126 *cls.KEYWORDS, 127 *cls._COMMENTS, 128 *cls._QUOTES, 129 *cls._FORMAT_STRINGS, 130 ) 131 if " " in key or any(single in key for single in cls.SINGLE_TOKENS) 132 ) 133 134 135class Tokenizer(_TokenizerBase): 136 SINGLE_TOKENS = { 137 "(": TokenType.L_PAREN, 138 ")": TokenType.R_PAREN, 139 "[": TokenType.L_BRACKET, 140 "]": TokenType.R_BRACKET, 141 "{": TokenType.L_BRACE, 142 "}": TokenType.R_BRACE, 143 "&": TokenType.AMP, 144 "^": TokenType.CARET, 145 ":": TokenType.COLON, 146 ",": TokenType.COMMA, 147 ".": TokenType.DOT, 148 "-": TokenType.DASH, 149 "=": TokenType.EQ, 150 ">": TokenType.GT, 151 "<": TokenType.LT, 152 "%": TokenType.MOD, 153 "!": TokenType.NOT, 154 "|": TokenType.PIPE, 155 "+": TokenType.PLUS, 156 ";": TokenType.SEMICOLON, 157 "/": TokenType.SLASH, 158 "\\": TokenType.BACKSLASH, 159 "*": TokenType.STAR, 160 "~": TokenType.TILDE, 161 "?": TokenType.PLACEHOLDER, 162 "@": TokenType.PARAMETER, 163 "#": TokenType.HASH, 164 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 165 "'": TokenType.UNKNOWN, 166 "`": TokenType.UNKNOWN, 167 '"': TokenType.UNKNOWN, 168 } 169 170 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 171 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 172 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 173 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 174 
HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 175 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 176 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"'] 177 QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"] 178 STRING_ESCAPES: t.ClassVar[list[str]] = ["'"] 179 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = [] 180 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set() 181 ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = [] 182 183 # The strings in this list can always be used as escapes, regardless of the surrounding 184 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 185 # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x""" 186 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = [] 187 188 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 189 HEREDOC_TAG_IS_IDENTIFIER = False 190 191 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 192 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 193 194 # Whether string escape characters function as such when placed within raw strings 195 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 196 197 NESTED_COMMENTS = True 198 199 HINT_START = "/*+" 200 201 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 202 203 # Autofilled 204 _COMMENTS: t.ClassVar[dict[str, str | None]] = {} 205 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {} 206 _IDENTIFIERS: t.ClassVar[dict[str, str]] = {} 207 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set() 208 _QUOTES: t.ClassVar[dict[str, str]] = {} 209 _STRING_ESCAPES: t.ClassVar[set[str]] = set() 210 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set() 211 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {} 212 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set() 213 214 KEYWORDS: t.ClassVar[dict[str, TokenType]] = { 215 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in 
("", "+", "-")}, 216 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 217 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 218 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 219 HINT_START: TokenType.HINT, 220 "&<": TokenType.AMP_LT, 221 "&>": TokenType.AMP_GT, 222 "==": TokenType.EQ, 223 "::": TokenType.DCOLON, 224 "?::": TokenType.QDCOLON, 225 "||": TokenType.DPIPE, 226 "|>": TokenType.PIPE_GT, 227 ">=": TokenType.GTE, 228 "<=": TokenType.LTE, 229 "<>": TokenType.NEQ, 230 "!=": TokenType.NEQ, 231 ":=": TokenType.COLON_EQ, 232 "<=>": TokenType.NULLSAFE_EQ, 233 "->": TokenType.ARROW, 234 "->>": TokenType.DARROW, 235 "=>": TokenType.FARROW, 236 "#>": TokenType.HASH_ARROW, 237 "#>>": TokenType.DHASH_ARROW, 238 "<->": TokenType.LR_ARROW, 239 "<<->>": TokenType.LLRR_ARROW, 240 "&&": TokenType.DAMP, 241 "??": TokenType.DQMARK, 242 "~~~": TokenType.GLOB, 243 "~~": TokenType.LIKE, 244 "~~*": TokenType.ILIKE, 245 "~*": TokenType.IRLIKE, 246 "-|-": TokenType.ADJACENT, 247 "ALL": TokenType.ALL, 248 "AND": TokenType.AND, 249 "ANTI": TokenType.ANTI, 250 "ANY": TokenType.ANY, 251 "ASC": TokenType.ASC, 252 "AS": TokenType.ALIAS, 253 "ASOF": TokenType.ASOF, 254 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 255 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 256 "BEGIN": TokenType.BEGIN, 257 "BETWEEN": TokenType.BETWEEN, 258 "CACHE": TokenType.CACHE, 259 "UNCACHE": TokenType.UNCACHE, 260 "CASE": TokenType.CASE, 261 "CHARACTER SET": TokenType.CHARACTER_SET, 262 "CLUSTER BY": TokenType.CLUSTER_BY, 263 "COLLATE": TokenType.COLLATE, 264 "COLUMN": TokenType.COLUMN, 265 "COMMIT": TokenType.COMMIT, 266 "CONNECT BY": TokenType.CONNECT_BY, 267 "CONSTRAINT": TokenType.CONSTRAINT, 268 "COPY": TokenType.COPY, 269 "CREATE": TokenType.CREATE, 270 "CROSS": TokenType.CROSS, 271 "CUBE": TokenType.CUBE, 272 "CURRENT_DATE": TokenType.CURRENT_DATE, 273 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 274 "CURRENT_TIME": TokenType.CURRENT_TIME, 275 
"CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 276 "CURRENT_USER": TokenType.CURRENT_USER, 277 "CURRENT_CATALOG": TokenType.CURRENT_CATALOG, 278 "DATABASE": TokenType.DATABASE, 279 "DEFAULT": TokenType.DEFAULT, 280 "DELETE": TokenType.DELETE, 281 "DESC": TokenType.DESC, 282 "DESCRIBE": TokenType.DESCRIBE, 283 "DISTINCT": TokenType.DISTINCT, 284 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 285 "DIV": TokenType.DIV, 286 "DROP": TokenType.DROP, 287 "ELSE": TokenType.ELSE, 288 "END": TokenType.END, 289 "ENUM": TokenType.ENUM, 290 "ESCAPE": TokenType.ESCAPE, 291 "EXCEPT": TokenType.EXCEPT, 292 "EXECUTE": TokenType.EXECUTE, 293 "EXISTS": TokenType.EXISTS, 294 "FALSE": TokenType.FALSE, 295 "FETCH": TokenType.FETCH, 296 "FILTER": TokenType.FILTER, 297 "FILE": TokenType.FILE, 298 "FIRST": TokenType.FIRST, 299 "FULL": TokenType.FULL, 300 "FUNCTION": TokenType.FUNCTION, 301 "FOR": TokenType.FOR, 302 "FOREIGN KEY": TokenType.FOREIGN_KEY, 303 "FORMAT": TokenType.FORMAT, 304 "FROM": TokenType.FROM, 305 "GEOGRAPHY": TokenType.GEOGRAPHY, 306 "GEOMETRY": TokenType.GEOMETRY, 307 "GLOB": TokenType.GLOB, 308 "GROUP BY": TokenType.GROUP_BY, 309 "GROUPING SETS": TokenType.GROUPING_SETS, 310 "HAVING": TokenType.HAVING, 311 "ILIKE": TokenType.ILIKE, 312 "IN": TokenType.IN, 313 "INDEX": TokenType.INDEX, 314 "INET": TokenType.INET, 315 "INNER": TokenType.INNER, 316 "INSERT": TokenType.INSERT, 317 "INTERVAL": TokenType.INTERVAL, 318 "INTERSECT": TokenType.INTERSECT, 319 "INTO": TokenType.INTO, 320 "IS": TokenType.IS, 321 "ISNULL": TokenType.ISNULL, 322 "JOIN": TokenType.JOIN, 323 "KEEP": TokenType.KEEP, 324 "KILL": TokenType.KILL, 325 "LATERAL": TokenType.LATERAL, 326 "LEFT": TokenType.LEFT, 327 "LIKE": TokenType.LIKE, 328 "LIMIT": TokenType.LIMIT, 329 "LOAD": TokenType.LOAD, 330 "LOCALTIME": TokenType.LOCALTIME, 331 "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP, 332 "LOCK": TokenType.LOCK, 333 "MERGE": TokenType.MERGE, 334 "NAMESPACE": TokenType.NAMESPACE, 335 "NATURAL": TokenType.NATURAL, 336 
"NEXT": TokenType.NEXT, 337 "NOT": TokenType.NOT, 338 "NOTNULL": TokenType.NOTNULL, 339 "NULL": TokenType.NULL, 340 "OBJECT": TokenType.OBJECT, 341 "OFFSET": TokenType.OFFSET, 342 "ON": TokenType.ON, 343 "OR": TokenType.OR, 344 "XOR": TokenType.XOR, 345 "ORDER BY": TokenType.ORDER_BY, 346 "ORDINALITY": TokenType.ORDINALITY, 347 "OUT": TokenType.OUT, 348 "OUTER": TokenType.OUTER, 349 "OVER": TokenType.OVER, 350 "OVERLAPS": TokenType.OVERLAPS, 351 "OVERWRITE": TokenType.OVERWRITE, 352 "PARTITION": TokenType.PARTITION, 353 "PARTITION BY": TokenType.PARTITION_BY, 354 "PARTITIONED BY": TokenType.PARTITION_BY, 355 "PARTITIONED_BY": TokenType.PARTITION_BY, 356 "PERCENT": TokenType.PERCENT, 357 "PIVOT": TokenType.PIVOT, 358 "PRAGMA": TokenType.PRAGMA, 359 "PRIMARY KEY": TokenType.PRIMARY_KEY, 360 "PROCEDURE": TokenType.PROCEDURE, 361 "OPERATOR": TokenType.OPERATOR, 362 "QUALIFY": TokenType.QUALIFY, 363 "RANGE": TokenType.RANGE, 364 "RECURSIVE": TokenType.RECURSIVE, 365 "REGEXP": TokenType.RLIKE, 366 "RENAME": TokenType.RENAME, 367 "REPLACE": TokenType.REPLACE, 368 "RETURNING": TokenType.RETURNING, 369 "REFERENCES": TokenType.REFERENCES, 370 "RIGHT": TokenType.RIGHT, 371 "RLIKE": TokenType.RLIKE, 372 "ROLLBACK": TokenType.ROLLBACK, 373 "ROLLUP": TokenType.ROLLUP, 374 "ROW": TokenType.ROW, 375 "ROWS": TokenType.ROWS, 376 "SCHEMA": TokenType.SCHEMA, 377 "SELECT": TokenType.SELECT, 378 "SEMI": TokenType.SEMI, 379 "SESSION": TokenType.SESSION, 380 "SESSION_USER": TokenType.SESSION_USER, 381 "SET": TokenType.SET, 382 "SETTINGS": TokenType.SETTINGS, 383 "SHOW": TokenType.SHOW, 384 "SIMILAR TO": TokenType.SIMILAR_TO, 385 "SOME": TokenType.SOME, 386 "SORT BY": TokenType.SORT_BY, 387 "SQL SECURITY": TokenType.SQL_SECURITY, 388 "START WITH": TokenType.START_WITH, 389 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 390 "TABLE": TokenType.TABLE, 391 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 392 "TEMP": TokenType.TEMPORARY, 393 "TEMPORARY": TokenType.TEMPORARY, 394 "THEN": TokenType.THEN, 395 
"TRUE": TokenType.TRUE, 396 "TRUNCATE": TokenType.TRUNCATE, 397 "TRIGGER": TokenType.TRIGGER, 398 "UNION": TokenType.UNION, 399 "UNKNOWN": TokenType.UNKNOWN, 400 "UNNEST": TokenType.UNNEST, 401 "UNPIVOT": TokenType.UNPIVOT, 402 "UPDATE": TokenType.UPDATE, 403 "USE": TokenType.USE, 404 "USING": TokenType.USING, 405 "UUID": TokenType.UUID, 406 "VALUES": TokenType.VALUES, 407 "VIEW": TokenType.VIEW, 408 "VOLATILE": TokenType.VOLATILE, 409 "WHEN": TokenType.WHEN, 410 "WHERE": TokenType.WHERE, 411 "WINDOW": TokenType.WINDOW, 412 "WITH": TokenType.WITH, 413 "APPLY": TokenType.APPLY, 414 "ARRAY": TokenType.ARRAY, 415 "BIT": TokenType.BIT, 416 "BOOL": TokenType.BOOLEAN, 417 "BOOLEAN": TokenType.BOOLEAN, 418 "BYTE": TokenType.TINYINT, 419 "MEDIUMINT": TokenType.MEDIUMINT, 420 "INT1": TokenType.TINYINT, 421 "TINYINT": TokenType.TINYINT, 422 "INT16": TokenType.SMALLINT, 423 "SHORT": TokenType.SMALLINT, 424 "SMALLINT": TokenType.SMALLINT, 425 "HUGEINT": TokenType.INT128, 426 "UHUGEINT": TokenType.UINT128, 427 "INT2": TokenType.SMALLINT, 428 "INTEGER": TokenType.INT, 429 "INT": TokenType.INT, 430 "INT4": TokenType.INT, 431 "INT32": TokenType.INT, 432 "INT64": TokenType.BIGINT, 433 "INT128": TokenType.INT128, 434 "INT256": TokenType.INT256, 435 "LONG": TokenType.BIGINT, 436 "BIGINT": TokenType.BIGINT, 437 "INT8": TokenType.TINYINT, 438 "UINT": TokenType.UINT, 439 "UINT128": TokenType.UINT128, 440 "UINT256": TokenType.UINT256, 441 "DEC": TokenType.DECIMAL, 442 "DECIMAL": TokenType.DECIMAL, 443 "DECIMAL32": TokenType.DECIMAL32, 444 "DECIMAL64": TokenType.DECIMAL64, 445 "DECIMAL128": TokenType.DECIMAL128, 446 "DECIMAL256": TokenType.DECIMAL256, 447 "DECFLOAT": TokenType.DECFLOAT, 448 "BIGDECIMAL": TokenType.BIGDECIMAL, 449 "BIGNUMERIC": TokenType.BIGDECIMAL, 450 "BIGNUM": TokenType.BIGNUM, 451 "LIST": TokenType.LIST, 452 "MAP": TokenType.MAP, 453 "NULLABLE": TokenType.NULLABLE, 454 "NUMBER": TokenType.DECIMAL, 455 "NUMERIC": TokenType.DECIMAL, 456 "FIXED": TokenType.DECIMAL, 457 
"REAL": TokenType.FLOAT, 458 "FLOAT": TokenType.FLOAT, 459 "FLOAT4": TokenType.FLOAT, 460 "FLOAT8": TokenType.DOUBLE, 461 "DOUBLE": TokenType.DOUBLE, 462 "DOUBLE PRECISION": TokenType.DOUBLE, 463 "JSON": TokenType.JSON, 464 "JSONB": TokenType.JSONB, 465 "CHAR": TokenType.CHAR, 466 "CHARACTER": TokenType.CHAR, 467 "CHAR VARYING": TokenType.VARCHAR, 468 "CHARACTER VARYING": TokenType.VARCHAR, 469 "NCHAR": TokenType.NCHAR, 470 "VARCHAR": TokenType.VARCHAR, 471 "VARCHAR2": TokenType.VARCHAR, 472 "NVARCHAR": TokenType.NVARCHAR, 473 "NVARCHAR2": TokenType.NVARCHAR, 474 "BPCHAR": TokenType.BPCHAR, 475 "STR": TokenType.TEXT, 476 "STRING": TokenType.TEXT, 477 "TEXT": TokenType.TEXT, 478 "LONGTEXT": TokenType.LONGTEXT, 479 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 480 "TINYTEXT": TokenType.TINYTEXT, 481 "CLOB": TokenType.TEXT, 482 "LONGVARCHAR": TokenType.TEXT, 483 "BINARY": TokenType.BINARY, 484 "BLOB": TokenType.VARBINARY, 485 "LONGBLOB": TokenType.LONGBLOB, 486 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 487 "TINYBLOB": TokenType.TINYBLOB, 488 "BYTEA": TokenType.VARBINARY, 489 "VARBINARY": TokenType.VARBINARY, 490 "TIME": TokenType.TIME, 491 "TIMETZ": TokenType.TIMETZ, 492 "TIME_NS": TokenType.TIME_NS, 493 "TIMESTAMP": TokenType.TIMESTAMP, 494 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 495 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 496 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 497 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 498 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 499 "DATE": TokenType.DATE, 500 "DATETIME": TokenType.DATETIME, 501 "INT4RANGE": TokenType.INT4RANGE, 502 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 503 "INT8RANGE": TokenType.INT8RANGE, 504 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 505 "NUMRANGE": TokenType.NUMRANGE, 506 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 507 "TSRANGE": TokenType.TSRANGE, 508 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 509 "TSTZRANGE": TokenType.TSTZRANGE, 510 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 511 "DATERANGE": TokenType.DATERANGE, 512 
"DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 513 "UNIQUE": TokenType.UNIQUE, 514 "VECTOR": TokenType.VECTOR, 515 "STRUCT": TokenType.STRUCT, 516 "SEQUENCE": TokenType.SEQUENCE, 517 "VARIANT": TokenType.VARIANT, 518 "ALTER": TokenType.ALTER, 519 "ANALYZE": TokenType.ANALYZE, 520 "CALL": TokenType.COMMAND, 521 "COMMENT": TokenType.COMMENT, 522 "EXPLAIN": TokenType.COMMAND, 523 "GRANT": TokenType.GRANT, 524 "REVOKE": TokenType.REVOKE, 525 "OPTIMIZE": TokenType.COMMAND, 526 "PREPARE": TokenType.COMMAND, 527 "VACUUM": TokenType.COMMAND, 528 "USER-DEFINED": TokenType.USERDEFINED, 529 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 530 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 531 } 532 533 COMMANDS = { 534 TokenType.COMMAND, 535 TokenType.EXECUTE, 536 TokenType.FETCH, 537 TokenType.SHOW, 538 TokenType.RENAME, 539 } 540 541 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 542 543 # Handle numeric literals like in hive (3L = BIGINT) 544 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {} 545 546 # In tokenizers like JSONPath, dots are always key separators, never decimal points 547 NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True 548 549 COMMENTS = ["--", ("/*", "*/")] 550 551 _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache() 552 553 __slots__ = ( 554 "dialect", 555 "_core", 556 ) 557 558 def __init__(self, dialect: DialectType = None) -> None: 559 from sqlglot.dialects.dialect import Dialect 560 561 self.dialect = Dialect.get_or_raise(dialect) 562 self._core = self._core_cache.get_or_build(type(self), self._init_core) 563 564 def _init_core(self) -> TokenizerCore: 565 return TokenizerCore( 566 single_tokens=self.SINGLE_TOKENS, 567 keywords=self.KEYWORDS, 568 quotes=self._QUOTES, 569 format_strings=self._FORMAT_STRINGS, 570 identifiers=self._IDENTIFIERS, 571 comments=self._COMMENTS, 572 string_escapes=self._STRING_ESCAPES, 573 byte_string_escapes=self._BYTE_STRING_ESCAPES, 574 identifier_escapes=self._IDENTIFIER_ESCAPES, 575 
escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 576 commands=self.COMMANDS, 577 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 578 nested_comments=self.NESTED_COMMENTS, 579 hint_start=self.HINT_START, 580 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 581 has_bit_strings=bool(self.BIT_STRINGS), 582 has_hex_strings=bool(self.HEX_STRINGS), 583 numeric_literals=self.NUMERIC_LITERALS, 584 var_single_tokens=self.VAR_SINGLE_TOKENS, 585 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 586 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 587 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 588 keyword_trie=self._KEYWORD_TRIE, 589 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 590 numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS, 591 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 592 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 593 ) 594 595 def tokenize(self, sql: str) -> list[Token]: 596 """Returns a list of tokens corresponding to the SQL string `sql`.""" 597 return self._core.tokenize(sql) # type: ignore 598 599 @property 600 def sql(self) -> str: 601 """The SQL string being tokenized.""" 602 return self._core.sql 603 604 @property 605 def size(self) -> int: 606 """Length of the SQL string.""" 607 return self._core.size 608 609 @property 610 def tokens(self) -> list[Token]: 611 """The list of tokens produced by tokenization.""" 612 return self._core.tokens
class
ThreadLocalCache(_thread._local):
class ThreadLocalCache(threading.local):
    """Per-thread cache. Each thread sees its own dict; safe for caching stateful objects."""

    def __init__(self) -> None:
        self.cache: dict[type, t.Any] = {}

    def get_or_build(self, key: type, build: t.Callable[[], T]) -> T:
        """Return the object cached under `key`, building and storing it on a miss.

        Args:
            key: The type used as the cache key.
            build: Zero-argument factory, invoked only when `key` is not cached yet.

        Returns:
            The cached object, or the freshly built one.
        """
        # Test for key presence rather than truthiness of the value: a falsy
        # cached object (0, an empty container, an object whose __bool__ or
        # __len__ evaluates false) must be returned as-is, not rebuilt on every call.
        try:
            return self.cache[key]
        except KeyError:
            obj = self.cache[key] = build()
            return obj
Per-thread cache. Each thread sees its own dict; safe for caching stateful objects.
136class Tokenizer(_TokenizerBase): 137 SINGLE_TOKENS = { 138 "(": TokenType.L_PAREN, 139 ")": TokenType.R_PAREN, 140 "[": TokenType.L_BRACKET, 141 "]": TokenType.R_BRACKET, 142 "{": TokenType.L_BRACE, 143 "}": TokenType.R_BRACE, 144 "&": TokenType.AMP, 145 "^": TokenType.CARET, 146 ":": TokenType.COLON, 147 ",": TokenType.COMMA, 148 ".": TokenType.DOT, 149 "-": TokenType.DASH, 150 "=": TokenType.EQ, 151 ">": TokenType.GT, 152 "<": TokenType.LT, 153 "%": TokenType.MOD, 154 "!": TokenType.NOT, 155 "|": TokenType.PIPE, 156 "+": TokenType.PLUS, 157 ";": TokenType.SEMICOLON, 158 "/": TokenType.SLASH, 159 "\\": TokenType.BACKSLASH, 160 "*": TokenType.STAR, 161 "~": TokenType.TILDE, 162 "?": TokenType.PLACEHOLDER, 163 "@": TokenType.PARAMETER, 164 "#": TokenType.HASH, 165 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 166 "'": TokenType.UNKNOWN, 167 "`": TokenType.UNKNOWN, 168 '"': TokenType.UNKNOWN, 169 } 170 171 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 172 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 173 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 174 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 175 HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 176 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 177 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"'] 178 QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"] 179 STRING_ESCAPES: t.ClassVar[list[str]] = ["'"] 180 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = [] 181 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set() 182 ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = [] 183 184 # The strings in this list can always be used as escapes, regardless of the surrounding 185 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 186 # identifier escape, e.g. 
if we use double-quotes, then they also act as escapes: "x""" 187 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = [] 188 189 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 190 HEREDOC_TAG_IS_IDENTIFIER = False 191 192 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 193 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 194 195 # Whether string escape characters function as such when placed within raw strings 196 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 197 198 NESTED_COMMENTS = True 199 200 HINT_START = "/*+" 201 202 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 203 204 # Autofilled 205 _COMMENTS: t.ClassVar[dict[str, str | None]] = {} 206 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {} 207 _IDENTIFIERS: t.ClassVar[dict[str, str]] = {} 208 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set() 209 _QUOTES: t.ClassVar[dict[str, str]] = {} 210 _STRING_ESCAPES: t.ClassVar[set[str]] = set() 211 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set() 212 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {} 213 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set() 214 215 KEYWORDS: t.ClassVar[dict[str, TokenType]] = { 216 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 217 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 218 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 219 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 220 HINT_START: TokenType.HINT, 221 "&<": TokenType.AMP_LT, 222 "&>": TokenType.AMP_GT, 223 "==": TokenType.EQ, 224 "::": TokenType.DCOLON, 225 "?::": TokenType.QDCOLON, 226 "||": TokenType.DPIPE, 227 "|>": TokenType.PIPE_GT, 228 ">=": TokenType.GTE, 229 "<=": TokenType.LTE, 230 "<>": TokenType.NEQ, 231 "!=": TokenType.NEQ, 232 ":=": TokenType.COLON_EQ, 233 "<=>": TokenType.NULLSAFE_EQ, 234 "->": TokenType.ARROW, 235 "->>": 
TokenType.DARROW, 236 "=>": TokenType.FARROW, 237 "#>": TokenType.HASH_ARROW, 238 "#>>": TokenType.DHASH_ARROW, 239 "<->": TokenType.LR_ARROW, 240 "<<->>": TokenType.LLRR_ARROW, 241 "&&": TokenType.DAMP, 242 "??": TokenType.DQMARK, 243 "~~~": TokenType.GLOB, 244 "~~": TokenType.LIKE, 245 "~~*": TokenType.ILIKE, 246 "~*": TokenType.IRLIKE, 247 "-|-": TokenType.ADJACENT, 248 "ALL": TokenType.ALL, 249 "AND": TokenType.AND, 250 "ANTI": TokenType.ANTI, 251 "ANY": TokenType.ANY, 252 "ASC": TokenType.ASC, 253 "AS": TokenType.ALIAS, 254 "ASOF": TokenType.ASOF, 255 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 256 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 257 "BEGIN": TokenType.BEGIN, 258 "BETWEEN": TokenType.BETWEEN, 259 "CACHE": TokenType.CACHE, 260 "UNCACHE": TokenType.UNCACHE, 261 "CASE": TokenType.CASE, 262 "CHARACTER SET": TokenType.CHARACTER_SET, 263 "CLUSTER BY": TokenType.CLUSTER_BY, 264 "COLLATE": TokenType.COLLATE, 265 "COLUMN": TokenType.COLUMN, 266 "COMMIT": TokenType.COMMIT, 267 "CONNECT BY": TokenType.CONNECT_BY, 268 "CONSTRAINT": TokenType.CONSTRAINT, 269 "COPY": TokenType.COPY, 270 "CREATE": TokenType.CREATE, 271 "CROSS": TokenType.CROSS, 272 "CUBE": TokenType.CUBE, 273 "CURRENT_DATE": TokenType.CURRENT_DATE, 274 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 275 "CURRENT_TIME": TokenType.CURRENT_TIME, 276 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 277 "CURRENT_USER": TokenType.CURRENT_USER, 278 "CURRENT_CATALOG": TokenType.CURRENT_CATALOG, 279 "DATABASE": TokenType.DATABASE, 280 "DEFAULT": TokenType.DEFAULT, 281 "DELETE": TokenType.DELETE, 282 "DESC": TokenType.DESC, 283 "DESCRIBE": TokenType.DESCRIBE, 284 "DISTINCT": TokenType.DISTINCT, 285 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 286 "DIV": TokenType.DIV, 287 "DROP": TokenType.DROP, 288 "ELSE": TokenType.ELSE, 289 "END": TokenType.END, 290 "ENUM": TokenType.ENUM, 291 "ESCAPE": TokenType.ESCAPE, 292 "EXCEPT": TokenType.EXCEPT, 293 "EXECUTE": TokenType.EXECUTE, 294 "EXISTS": TokenType.EXISTS, 295 
"FALSE": TokenType.FALSE, 296 "FETCH": TokenType.FETCH, 297 "FILTER": TokenType.FILTER, 298 "FILE": TokenType.FILE, 299 "FIRST": TokenType.FIRST, 300 "FULL": TokenType.FULL, 301 "FUNCTION": TokenType.FUNCTION, 302 "FOR": TokenType.FOR, 303 "FOREIGN KEY": TokenType.FOREIGN_KEY, 304 "FORMAT": TokenType.FORMAT, 305 "FROM": TokenType.FROM, 306 "GEOGRAPHY": TokenType.GEOGRAPHY, 307 "GEOMETRY": TokenType.GEOMETRY, 308 "GLOB": TokenType.GLOB, 309 "GROUP BY": TokenType.GROUP_BY, 310 "GROUPING SETS": TokenType.GROUPING_SETS, 311 "HAVING": TokenType.HAVING, 312 "ILIKE": TokenType.ILIKE, 313 "IN": TokenType.IN, 314 "INDEX": TokenType.INDEX, 315 "INET": TokenType.INET, 316 "INNER": TokenType.INNER, 317 "INSERT": TokenType.INSERT, 318 "INTERVAL": TokenType.INTERVAL, 319 "INTERSECT": TokenType.INTERSECT, 320 "INTO": TokenType.INTO, 321 "IS": TokenType.IS, 322 "ISNULL": TokenType.ISNULL, 323 "JOIN": TokenType.JOIN, 324 "KEEP": TokenType.KEEP, 325 "KILL": TokenType.KILL, 326 "LATERAL": TokenType.LATERAL, 327 "LEFT": TokenType.LEFT, 328 "LIKE": TokenType.LIKE, 329 "LIMIT": TokenType.LIMIT, 330 "LOAD": TokenType.LOAD, 331 "LOCALTIME": TokenType.LOCALTIME, 332 "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP, 333 "LOCK": TokenType.LOCK, 334 "MERGE": TokenType.MERGE, 335 "NAMESPACE": TokenType.NAMESPACE, 336 "NATURAL": TokenType.NATURAL, 337 "NEXT": TokenType.NEXT, 338 "NOT": TokenType.NOT, 339 "NOTNULL": TokenType.NOTNULL, 340 "NULL": TokenType.NULL, 341 "OBJECT": TokenType.OBJECT, 342 "OFFSET": TokenType.OFFSET, 343 "ON": TokenType.ON, 344 "OR": TokenType.OR, 345 "XOR": TokenType.XOR, 346 "ORDER BY": TokenType.ORDER_BY, 347 "ORDINALITY": TokenType.ORDINALITY, 348 "OUT": TokenType.OUT, 349 "OUTER": TokenType.OUTER, 350 "OVER": TokenType.OVER, 351 "OVERLAPS": TokenType.OVERLAPS, 352 "OVERWRITE": TokenType.OVERWRITE, 353 "PARTITION": TokenType.PARTITION, 354 "PARTITION BY": TokenType.PARTITION_BY, 355 "PARTITIONED BY": TokenType.PARTITION_BY, 356 "PARTITIONED_BY": TokenType.PARTITION_BY, 357 
"PERCENT": TokenType.PERCENT, 358 "PIVOT": TokenType.PIVOT, 359 "PRAGMA": TokenType.PRAGMA, 360 "PRIMARY KEY": TokenType.PRIMARY_KEY, 361 "PROCEDURE": TokenType.PROCEDURE, 362 "OPERATOR": TokenType.OPERATOR, 363 "QUALIFY": TokenType.QUALIFY, 364 "RANGE": TokenType.RANGE, 365 "RECURSIVE": TokenType.RECURSIVE, 366 "REGEXP": TokenType.RLIKE, 367 "RENAME": TokenType.RENAME, 368 "REPLACE": TokenType.REPLACE, 369 "RETURNING": TokenType.RETURNING, 370 "REFERENCES": TokenType.REFERENCES, 371 "RIGHT": TokenType.RIGHT, 372 "RLIKE": TokenType.RLIKE, 373 "ROLLBACK": TokenType.ROLLBACK, 374 "ROLLUP": TokenType.ROLLUP, 375 "ROW": TokenType.ROW, 376 "ROWS": TokenType.ROWS, 377 "SCHEMA": TokenType.SCHEMA, 378 "SELECT": TokenType.SELECT, 379 "SEMI": TokenType.SEMI, 380 "SESSION": TokenType.SESSION, 381 "SESSION_USER": TokenType.SESSION_USER, 382 "SET": TokenType.SET, 383 "SETTINGS": TokenType.SETTINGS, 384 "SHOW": TokenType.SHOW, 385 "SIMILAR TO": TokenType.SIMILAR_TO, 386 "SOME": TokenType.SOME, 387 "SORT BY": TokenType.SORT_BY, 388 "SQL SECURITY": TokenType.SQL_SECURITY, 389 "START WITH": TokenType.START_WITH, 390 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 391 "TABLE": TokenType.TABLE, 392 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 393 "TEMP": TokenType.TEMPORARY, 394 "TEMPORARY": TokenType.TEMPORARY, 395 "THEN": TokenType.THEN, 396 "TRUE": TokenType.TRUE, 397 "TRUNCATE": TokenType.TRUNCATE, 398 "TRIGGER": TokenType.TRIGGER, 399 "UNION": TokenType.UNION, 400 "UNKNOWN": TokenType.UNKNOWN, 401 "UNNEST": TokenType.UNNEST, 402 "UNPIVOT": TokenType.UNPIVOT, 403 "UPDATE": TokenType.UPDATE, 404 "USE": TokenType.USE, 405 "USING": TokenType.USING, 406 "UUID": TokenType.UUID, 407 "VALUES": TokenType.VALUES, 408 "VIEW": TokenType.VIEW, 409 "VOLATILE": TokenType.VOLATILE, 410 "WHEN": TokenType.WHEN, 411 "WHERE": TokenType.WHERE, 412 "WINDOW": TokenType.WINDOW, 413 "WITH": TokenType.WITH, 414 "APPLY": TokenType.APPLY, 415 "ARRAY": TokenType.ARRAY, 416 "BIT": TokenType.BIT, 417 "BOOL": 
TokenType.BOOLEAN, 418 "BOOLEAN": TokenType.BOOLEAN, 419 "BYTE": TokenType.TINYINT, 420 "MEDIUMINT": TokenType.MEDIUMINT, 421 "INT1": TokenType.TINYINT, 422 "TINYINT": TokenType.TINYINT, 423 "INT16": TokenType.SMALLINT, 424 "SHORT": TokenType.SMALLINT, 425 "SMALLINT": TokenType.SMALLINT, 426 "HUGEINT": TokenType.INT128, 427 "UHUGEINT": TokenType.UINT128, 428 "INT2": TokenType.SMALLINT, 429 "INTEGER": TokenType.INT, 430 "INT": TokenType.INT, 431 "INT4": TokenType.INT, 432 "INT32": TokenType.INT, 433 "INT64": TokenType.BIGINT, 434 "INT128": TokenType.INT128, 435 "INT256": TokenType.INT256, 436 "LONG": TokenType.BIGINT, 437 "BIGINT": TokenType.BIGINT, 438 "INT8": TokenType.TINYINT, 439 "UINT": TokenType.UINT, 440 "UINT128": TokenType.UINT128, 441 "UINT256": TokenType.UINT256, 442 "DEC": TokenType.DECIMAL, 443 "DECIMAL": TokenType.DECIMAL, 444 "DECIMAL32": TokenType.DECIMAL32, 445 "DECIMAL64": TokenType.DECIMAL64, 446 "DECIMAL128": TokenType.DECIMAL128, 447 "DECIMAL256": TokenType.DECIMAL256, 448 "DECFLOAT": TokenType.DECFLOAT, 449 "BIGDECIMAL": TokenType.BIGDECIMAL, 450 "BIGNUMERIC": TokenType.BIGDECIMAL, 451 "BIGNUM": TokenType.BIGNUM, 452 "LIST": TokenType.LIST, 453 "MAP": TokenType.MAP, 454 "NULLABLE": TokenType.NULLABLE, 455 "NUMBER": TokenType.DECIMAL, 456 "NUMERIC": TokenType.DECIMAL, 457 "FIXED": TokenType.DECIMAL, 458 "REAL": TokenType.FLOAT, 459 "FLOAT": TokenType.FLOAT, 460 "FLOAT4": TokenType.FLOAT, 461 "FLOAT8": TokenType.DOUBLE, 462 "DOUBLE": TokenType.DOUBLE, 463 "DOUBLE PRECISION": TokenType.DOUBLE, 464 "JSON": TokenType.JSON, 465 "JSONB": TokenType.JSONB, 466 "CHAR": TokenType.CHAR, 467 "CHARACTER": TokenType.CHAR, 468 "CHAR VARYING": TokenType.VARCHAR, 469 "CHARACTER VARYING": TokenType.VARCHAR, 470 "NCHAR": TokenType.NCHAR, 471 "VARCHAR": TokenType.VARCHAR, 472 "VARCHAR2": TokenType.VARCHAR, 473 "NVARCHAR": TokenType.NVARCHAR, 474 "NVARCHAR2": TokenType.NVARCHAR, 475 "BPCHAR": TokenType.BPCHAR, 476 "STR": TokenType.TEXT, 477 "STRING": TokenType.TEXT, 
478 "TEXT": TokenType.TEXT, 479 "LONGTEXT": TokenType.LONGTEXT, 480 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 481 "TINYTEXT": TokenType.TINYTEXT, 482 "CLOB": TokenType.TEXT, 483 "LONGVARCHAR": TokenType.TEXT, 484 "BINARY": TokenType.BINARY, 485 "BLOB": TokenType.VARBINARY, 486 "LONGBLOB": TokenType.LONGBLOB, 487 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 488 "TINYBLOB": TokenType.TINYBLOB, 489 "BYTEA": TokenType.VARBINARY, 490 "VARBINARY": TokenType.VARBINARY, 491 "TIME": TokenType.TIME, 492 "TIMETZ": TokenType.TIMETZ, 493 "TIME_NS": TokenType.TIME_NS, 494 "TIMESTAMP": TokenType.TIMESTAMP, 495 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 496 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 497 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 498 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 499 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 500 "DATE": TokenType.DATE, 501 "DATETIME": TokenType.DATETIME, 502 "INT4RANGE": TokenType.INT4RANGE, 503 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 504 "INT8RANGE": TokenType.INT8RANGE, 505 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 506 "NUMRANGE": TokenType.NUMRANGE, 507 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 508 "TSRANGE": TokenType.TSRANGE, 509 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 510 "TSTZRANGE": TokenType.TSTZRANGE, 511 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 512 "DATERANGE": TokenType.DATERANGE, 513 "DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 514 "UNIQUE": TokenType.UNIQUE, 515 "VECTOR": TokenType.VECTOR, 516 "STRUCT": TokenType.STRUCT, 517 "SEQUENCE": TokenType.SEQUENCE, 518 "VARIANT": TokenType.VARIANT, 519 "ALTER": TokenType.ALTER, 520 "ANALYZE": TokenType.ANALYZE, 521 "CALL": TokenType.COMMAND, 522 "COMMENT": TokenType.COMMENT, 523 "EXPLAIN": TokenType.COMMAND, 524 "GRANT": TokenType.GRANT, 525 "REVOKE": TokenType.REVOKE, 526 "OPTIMIZE": TokenType.COMMAND, 527 "PREPARE": TokenType.COMMAND, 528 "VACUUM": TokenType.COMMAND, 529 "USER-DEFINED": TokenType.USERDEFINED, 530 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 531 "FOR TIMESTAMP": 
TokenType.TIMESTAMP_SNAPSHOT, 532 } 533 534 COMMANDS = { 535 TokenType.COMMAND, 536 TokenType.EXECUTE, 537 TokenType.FETCH, 538 TokenType.SHOW, 539 TokenType.RENAME, 540 } 541 542 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 543 544 # Handle numeric literals like in hive (3L = BIGINT) 545 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {} 546 547 # In tokenizers like JSONPath, dots are always key separators, never decimal points 548 NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True 549 550 COMMENTS = ["--", ("/*", "*/")] 551 552 _core_cache: t.ClassVar[ThreadLocalCache] = ThreadLocalCache() 553 554 __slots__ = ( 555 "dialect", 556 "_core", 557 ) 558 559 def __init__(self, dialect: DialectType = None) -> None: 560 from sqlglot.dialects.dialect import Dialect 561 562 self.dialect = Dialect.get_or_raise(dialect) 563 self._core = self._core_cache.get_or_build(type(self), self._init_core) 564 565 def _init_core(self) -> TokenizerCore: 566 return TokenizerCore( 567 single_tokens=self.SINGLE_TOKENS, 568 keywords=self.KEYWORDS, 569 quotes=self._QUOTES, 570 format_strings=self._FORMAT_STRINGS, 571 identifiers=self._IDENTIFIERS, 572 comments=self._COMMENTS, 573 string_escapes=self._STRING_ESCAPES, 574 byte_string_escapes=self._BYTE_STRING_ESCAPES, 575 identifier_escapes=self._IDENTIFIER_ESCAPES, 576 escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 577 commands=self.COMMANDS, 578 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 579 nested_comments=self.NESTED_COMMENTS, 580 hint_start=self.HINT_START, 581 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 582 has_bit_strings=bool(self.BIT_STRINGS), 583 has_hex_strings=bool(self.HEX_STRINGS), 584 numeric_literals=self.NUMERIC_LITERALS, 585 var_single_tokens=self.VAR_SINGLE_TOKENS, 586 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 587 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 588 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 589 
keyword_trie=self._KEYWORD_TRIE, 590 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 591 numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS, 592 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 593 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 594 ) 595 596 def tokenize(self, sql: str) -> list[Token]: 597 """Returns a list of tokens corresponding to the SQL string `sql`.""" 598 return self._core.tokenize(sql) # type: ignore 599 600 @property 601 def sql(self) -> str: 602 """The SQL string being tokenized.""" 603 return self._core.sql 604 605 @property 606 def size(self) -> int: 607 """Length of the SQL string.""" 608 return self._core.size 609 610 @property 611 def tokens(self) -> list[Token]: 612 """The list of tokens produced by tokenization.""" 613 return self._core.tokens
Tokenizer(dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
SINGLE_TOKENS =
{'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 36>, '^': <TokenType.CARET: 42>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 28>, '>': <TokenType.GT: 25>, '<': <TokenType.LT: 23>, '%': <TokenType.MOD: 328>, '!': <TokenType.NOT: 27>, '|': <TokenType.PIPE: 39>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 19>, '/': <TokenType.SLASH: 22>, '\\': <TokenType.BACKSLASH: 21>, '*': <TokenType.STAR: 20>, '~': <TokenType.TILDE: 44>, '?': <TokenType.PLACEHOLDER: 355>, '@': <TokenType.PARAMETER: 57>, '#': <TokenType.HASH: 48>, "'": <TokenType.UNKNOWN: 213>, '`': <TokenType.UNKNOWN: 213>, '"': <TokenType.UNKNOWN: 213>}
TOKENS_PRECEDING_HINT =
{<TokenType.UPDATE: 416>, <TokenType.SELECT: 385>, <TokenType.INSERT: 299>, <TokenType.DELETE: 256>}
KEYWORDS: ClassVar[dict[str, sqlglot.tokenizer_core.TokenType]] =
{'{%': <TokenType.BLOCK_START: 72>, '{%+': <TokenType.BLOCK_START: 72>, '{%-': <TokenType.BLOCK_START: 72>, '%}': <TokenType.BLOCK_END: 73>, '+%}': <TokenType.BLOCK_END: 73>, '-%}': <TokenType.BLOCK_END: 73>, '{{+': <TokenType.BLOCK_START: 72>, '{{-': <TokenType.BLOCK_START: 72>, '+}}': <TokenType.BLOCK_END: 73>, '-}}': <TokenType.BLOCK_END: 73>, '/*+': <TokenType.HINT: 292>, '&<': <TokenType.AMP_LT: 62>, '&>': <TokenType.AMP_GT: 63>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 368>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '<<->>': <TokenType.LLRR_ARROW: 52>, '&&': <TokenType.DAMP: 61>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 286>, '~~': <TokenType.LIKE: 317>, '~~*': <TokenType.ILIKE: 294>, '~*': <TokenType.IRLIKE: 306>, '-|-': <TokenType.ADJACENT: 64>, 'ALL': <TokenType.ALL: 219>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 220>, 'ANY': <TokenType.ANY: 221>, 'ASC': <TokenType.ASC: 224>, 'AS': <TokenType.ALIAS: 217>, 'ASOF': <TokenType.ASOF: 225>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 227>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 227>, 'BEGIN': <TokenType.BEGIN: 228>, 'BETWEEN': <TokenType.BETWEEN: 229>, 'CACHE': <TokenType.CACHE: 231>, 'UNCACHE': <TokenType.UNCACHE: 412>, 'CASE': <TokenType.CASE: 232>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 233>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 234>, 'COLLATE': <TokenType.COLLATE: 235>, 'COLUMN': <TokenType.COLUMN: 80>, 'COMMIT': <TokenType.COMMIT: 238>, 'CONNECT BY': <TokenType.CONNECT_BY: 239>, 'CONSTRAINT': <TokenType.CONSTRAINT: 240>, 'COPY': <TokenType.COPY: 241>, 
'CREATE': <TokenType.CREATE: 242>, 'CROSS': <TokenType.CROSS: 243>, 'CUBE': <TokenType.CUBE: 244>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 245>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 247>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 248>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 249>, 'CURRENT_USER': <TokenType.CURRENT_USER: 250>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 253>, 'DATABASE': <TokenType.DATABASE: 79>, 'DEFAULT': <TokenType.DEFAULT: 255>, 'DELETE': <TokenType.DELETE: 256>, 'DESC': <TokenType.DESC: 257>, 'DESCRIBE': <TokenType.DESCRIBE: 258>, 'DISTINCT': <TokenType.DISTINCT: 261>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 262>, 'DIV': <TokenType.DIV: 263>, 'DROP': <TokenType.DROP: 264>, 'ELSE': <TokenType.ELSE: 265>, 'END': <TokenType.END: 266>, 'ENUM': <TokenType.ENUM: 204>, 'ESCAPE': <TokenType.ESCAPE: 267>, 'EXCEPT': <TokenType.EXCEPT: 268>, 'EXECUTE': <TokenType.EXECUTE: 269>, 'EXISTS': <TokenType.EXISTS: 270>, 'FALSE': <TokenType.FALSE: 271>, 'FETCH': <TokenType.FETCH: 272>, 'FILTER': <TokenType.FILTER: 275>, 'FILE': <TokenType.FILE: 273>, 'FIRST': <TokenType.FIRST: 277>, 'FULL': <TokenType.FULL: 283>, 'FUNCTION': <TokenType.FUNCTION: 284>, 'FOR': <TokenType.FOR: 278>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 280>, 'FORMAT': <TokenType.FORMAT: 281>, 'FROM': <TokenType.FROM: 282>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 171>, 'GEOMETRY': <TokenType.GEOMETRY: 174>, 'GLOB': <TokenType.GLOB: 286>, 'GROUP BY': <TokenType.GROUP_BY: 289>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 290>, 'HAVING': <TokenType.HAVING: 291>, 'ILIKE': <TokenType.ILIKE: 294>, 'IN': <TokenType.IN: 295>, 'INDEX': <TokenType.INDEX: 296>, 'INET': <TokenType.INET: 199>, 'INNER': <TokenType.INNER: 298>, 'INSERT': <TokenType.INSERT: 299>, 'INTERVAL': <TokenType.INTERVAL: 303>, 'INTERSECT': <TokenType.INTERSECT: 302>, 'INTO': <TokenType.INTO: 304>, 'IS': <TokenType.IS: 307>, 'ISNULL': <TokenType.ISNULL: 308>, 'JOIN': <TokenType.JOIN: 309>, 'KEEP': <TokenType.KEEP: 311>, 
'KILL': <TokenType.KILL: 313>, 'LATERAL': <TokenType.LATERAL: 315>, 'LEFT': <TokenType.LEFT: 316>, 'LIKE': <TokenType.LIKE: 317>, 'LIMIT': <TokenType.LIMIT: 318>, 'LOAD': <TokenType.LOAD: 320>, 'LOCALTIME': <TokenType.LOCALTIME: 178>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 179>, 'LOCK': <TokenType.LOCK: 321>, 'MERGE': <TokenType.MERGE: 327>, 'NAMESPACE': <TokenType.NAMESPACE: 439>, 'NATURAL': <TokenType.NATURAL: 330>, 'NEXT': <TokenType.NEXT: 331>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 333>, 'NULL': <TokenType.NULL: 334>, 'OBJECT': <TokenType.OBJECT: 198>, 'OFFSET': <TokenType.OFFSET: 336>, 'ON': <TokenType.ON: 337>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 65>, 'ORDER BY': <TokenType.ORDER_BY: 340>, 'ORDINALITY': <TokenType.ORDINALITY: 343>, 'OUT': <TokenType.OUT: 344>, 'OUTER': <TokenType.OUTER: 346>, 'OVER': <TokenType.OVER: 347>, 'OVERLAPS': <TokenType.OVERLAPS: 348>, 'OVERWRITE': <TokenType.OVERWRITE: 349>, 'PARTITION': <TokenType.PARTITION: 351>, 'PARTITION BY': <TokenType.PARTITION_BY: 352>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 352>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 352>, 'PERCENT': <TokenType.PERCENT: 353>, 'PIVOT': <TokenType.PIVOT: 354>, 'PRAGMA': <TokenType.PRAGMA: 359>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 361>, 'PROCEDURE': <TokenType.PROCEDURE: 362>, 'OPERATOR': <TokenType.OPERATOR: 339>, 'QUALIFY': <TokenType.QUALIFY: 366>, 'RANGE': <TokenType.RANGE: 369>, 'RECURSIVE': <TokenType.RECURSIVE: 370>, 'REGEXP': <TokenType.RLIKE: 378>, 'RENAME': <TokenType.RENAME: 372>, 'REPLACE': <TokenType.REPLACE: 373>, 'RETURNING': <TokenType.RETURNING: 374>, 'REFERENCES': <TokenType.REFERENCES: 376>, 'RIGHT': <TokenType.RIGHT: 377>, 'RLIKE': <TokenType.RLIKE: 378>, 'ROLLBACK': <TokenType.ROLLBACK: 380>, 'ROLLUP': <TokenType.ROLLUP: 381>, 'ROW': <TokenType.ROW: 382>, 'ROWS': <TokenType.ROWS: 383>, 'SCHEMA': <TokenType.SCHEMA: 82>, 'SELECT': <TokenType.SELECT: 385>, 'SEMI': <TokenType.SEMI: 386>, 'SESSION': 
<TokenType.SESSION: 58>, 'SESSION_USER': <TokenType.SESSION_USER: 60>, 'SET': <TokenType.SET: 390>, 'SETTINGS': <TokenType.SETTINGS: 391>, 'SHOW': <TokenType.SHOW: 392>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 393>, 'SOME': <TokenType.SOME: 394>, 'SORT BY': <TokenType.SORT_BY: 395>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 397>, 'START WITH': <TokenType.START_WITH: 398>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 400>, 'TABLE': <TokenType.TABLE: 83>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 403>, 'TEMP': <TokenType.TEMPORARY: 405>, 'TEMPORARY': <TokenType.TEMPORARY: 405>, 'THEN': <TokenType.THEN: 407>, 'TRUE': <TokenType.TRUE: 408>, 'TRUNCATE': <TokenType.TRUNCATE: 409>, 'TRIGGER': <TokenType.TRIGGER: 410>, 'UNION': <TokenType.UNION: 413>, 'UNKNOWN': <TokenType.UNKNOWN: 213>, 'UNNEST': <TokenType.UNNEST: 414>, 'UNPIVOT': <TokenType.UNPIVOT: 415>, 'UPDATE': <TokenType.UPDATE: 416>, 'USE': <TokenType.USE: 417>, 'USING': <TokenType.USING: 418>, 'UUID': <TokenType.UUID: 170>, 'VALUES': <TokenType.VALUES: 419>, 'VIEW': <TokenType.VIEW: 421>, 'VOLATILE': <TokenType.VOLATILE: 423>, 'WHEN': <TokenType.WHEN: 425>, 'WHERE': <TokenType.WHERE: 426>, 'WINDOW': <TokenType.WINDOW: 427>, 'WITH': <TokenType.WITH: 428>, 'APPLY': <TokenType.APPLY: 222>, 'ARRAY': <TokenType.ARRAY: 223>, 'BIT': <TokenType.BIT: 96>, 'BOOL': <TokenType.BOOLEAN: 97>, 'BOOLEAN': <TokenType.BOOLEAN: 97>, 'BYTE': <TokenType.TINYINT: 98>, 'MEDIUMINT': <TokenType.MEDIUMINT: 102>, 'INT1': <TokenType.TINYINT: 98>, 'TINYINT': <TokenType.TINYINT: 98>, 'INT16': <TokenType.SMALLINT: 100>, 'SHORT': <TokenType.SMALLINT: 100>, 'SMALLINT': <TokenType.SMALLINT: 100>, 'HUGEINT': <TokenType.INT128: 109>, 'UHUGEINT': <TokenType.UINT128: 110>, 'INT2': <TokenType.SMALLINT: 100>, 'INTEGER': <TokenType.INT: 104>, 'INT': <TokenType.INT: 104>, 'INT4': <TokenType.INT: 104>, 'INT32': <TokenType.INT: 104>, 'INT64': <TokenType.BIGINT: 106>, 'INT128': <TokenType.INT128: 109>, 'INT256': <TokenType.INT256: 111>, 'LONG': <TokenType.BIGINT: 
106>, 'BIGINT': <TokenType.BIGINT: 106>, 'INT8': <TokenType.TINYINT: 98>, 'UINT': <TokenType.UINT: 105>, 'UINT128': <TokenType.UINT128: 110>, 'UINT256': <TokenType.UINT256: 112>, 'DEC': <TokenType.DECIMAL: 116>, 'DECIMAL': <TokenType.DECIMAL: 116>, 'DECIMAL32': <TokenType.DECIMAL32: 117>, 'DECIMAL64': <TokenType.DECIMAL64: 118>, 'DECIMAL128': <TokenType.DECIMAL128: 119>, 'DECIMAL256': <TokenType.DECIMAL256: 120>, 'DECFLOAT': <TokenType.DECFLOAT: 121>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 123>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 123>, 'BIGNUM': <TokenType.BIGNUM: 108>, 'LIST': <TokenType.LIST: 319>, 'MAP': <TokenType.MAP: 322>, 'NULLABLE': <TokenType.NULLABLE: 173>, 'NUMBER': <TokenType.DECIMAL: 116>, 'NUMERIC': <TokenType.DECIMAL: 116>, 'FIXED': <TokenType.DECIMAL: 116>, 'REAL': <TokenType.FLOAT: 113>, 'FLOAT': <TokenType.FLOAT: 113>, 'FLOAT4': <TokenType.FLOAT: 113>, 'FLOAT8': <TokenType.DOUBLE: 114>, 'DOUBLE': <TokenType.DOUBLE: 114>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 114>, 'JSON': <TokenType.JSON: 140>, 'JSONB': <TokenType.JSONB: 141>, 'CHAR': <TokenType.CHAR: 124>, 'CHARACTER': <TokenType.CHAR: 124>, 'CHAR VARYING': <TokenType.VARCHAR: 126>, 'CHARACTER VARYING': <TokenType.VARCHAR: 126>, 'NCHAR': <TokenType.NCHAR: 125>, 'VARCHAR': <TokenType.VARCHAR: 126>, 'VARCHAR2': <TokenType.VARCHAR: 126>, 'NVARCHAR': <TokenType.NVARCHAR: 127>, 'NVARCHAR2': <TokenType.NVARCHAR: 127>, 'BPCHAR': <TokenType.BPCHAR: 128>, 'STR': <TokenType.TEXT: 129>, 'STRING': <TokenType.TEXT: 129>, 'TEXT': <TokenType.TEXT: 129>, 'LONGTEXT': <TokenType.LONGTEXT: 131>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 130>, 'TINYTEXT': <TokenType.TINYTEXT: 136>, 'CLOB': <TokenType.TEXT: 129>, 'LONGVARCHAR': <TokenType.TEXT: 129>, 'BINARY': <TokenType.BINARY: 138>, 'BLOB': <TokenType.VARBINARY: 139>, 'LONGBLOB': <TokenType.LONGBLOB: 134>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 133>, 'TINYBLOB': <TokenType.TINYBLOB: 135>, 'BYTEA': <TokenType.VARBINARY: 139>, 'VARBINARY': <TokenType.VARBINARY: 139>, 
'TIME': <TokenType.TIME: 142>, 'TIMETZ': <TokenType.TIMETZ: 143>, 'TIME_NS': <TokenType.TIME_NS: 144>, 'TIMESTAMP': <TokenType.TIMESTAMP: 145>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 146>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 147>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 147>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 148>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 148>, 'DATE': <TokenType.DATE: 156>, 'DATETIME': <TokenType.DATETIME: 152>, 'INT4RANGE': <TokenType.INT4RANGE: 158>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 159>, 'INT8RANGE': <TokenType.INT8RANGE: 160>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 161>, 'NUMRANGE': <TokenType.NUMRANGE: 162>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 163>, 'TSRANGE': <TokenType.TSRANGE: 164>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 165>, 'TSTZRANGE': <TokenType.TSTZRANGE: 166>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 167>, 'DATERANGE': <TokenType.DATERANGE: 168>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 169>, 'UNIQUE': <TokenType.UNIQUE: 429>, 'VECTOR': <TokenType.VECTOR: 214>, 'STRUCT': <TokenType.STRUCT: 401>, 'SEQUENCE': <TokenType.SEQUENCE: 388>, 'VARIANT': <TokenType.VARIANT: 197>, 'ALTER': <TokenType.ALTER: 218>, 'ANALYZE': <TokenType.ANALYZE: 438>, 'CALL': <TokenType.COMMAND: 236>, 'COMMENT': <TokenType.COMMENT: 237>, 'EXPLAIN': <TokenType.COMMAND: 236>, 'GRANT': <TokenType.GRANT: 288>, 'REVOKE': <TokenType.REVOKE: 375>, 'OPTIMIZE': <TokenType.COMMAND: 236>, 'PREPARE': <TokenType.COMMAND: 236>, 'VACUUM': <TokenType.COMMAND: 236>, 'USER-DEFINED': <TokenType.USERDEFINED: 192>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 433>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 434>}
COMMANDS =
{<TokenType.SHOW: 392>, <TokenType.COMMAND: 236>, <TokenType.EXECUTE: 269>, <TokenType.FETCH: 272>, <TokenType.RENAME: 372>}
def tokenize(self, sql: str) -> list[Token]:
    """Tokenize `sql` by delegating to the wrapped core tokenizer.

    Args:
        sql: The SQL text to hand to the core tokenizer.

    Returns:
        The result of calling `tokenize(sql)` on `self._core`.
    """
    core = self._core
    produced = core.tokenize(sql)  # type: ignore
    return produced
Returns a list of tokens corresponding to the SQL string `sql`.
sql: str
@property
def sql(self) -> str:
    """SQL string being tokenized, read from the `sql` attribute of `self._core`."""
    core = self._core
    return core.sql
The SQL string being tokenized.
size: int
@property
def size(self) -> int:
    """Length of the SQL string, read from the `size` attribute of `self._core`."""
    core = self._core
    return core.size
Length of the SQL string.
tokens: list[sqlglot.tokenizer_core.Token]
@property
def tokens(self) -> list[Token]:
    """Tokens produced by tokenization, read from the `tokens` attribute of `self._core`."""
    core = self._core
    return core.tokens
The list of tokens produced by tokenization.