# sqlglot/tokens.py
from __future__ import annotations

import typing as t

from sqlglot.trie import new_trie

# Import Token and TokenType from tokenizer_core (compiled with mypyc)
from sqlglot.tokenizer_core import Token, TokenType

try:
    import sqlglotc  # noqa: F401
except ImportError:
    pass

try:
    import sqlglotrs  # type: ignore # noqa: F401
    import warnings

    # Only warn if the supported accelerator (sqlglotc) was not imported above.
    if "sqlglotc" not in globals():
        warnings.warn(
            "sqlglot[rs] is deprecated and no longer compatible with sqlglot. "
            "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]",
        )
except ImportError:
    pass

if t.TYPE_CHECKING:
    from sqlglot.dialects.dialect import DialectType


def _convert_quotes(arr: list[str | tuple[str, str]]) -> dict[str, str]:
    """Map each quote spec to a (start, end) pair; a bare string is both start and end."""
    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)


def _quotes_to_format(
    token_type: TokenType, arr: list[str | tuple[str, str]]
) -> dict[str, tuple[str, TokenType]]:
    """Attach `token_type` to each converted quote pair: start -> (end, token_type)."""
    return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}


class _TokenizerBase:
    """Declares the tokenizer's class-level settings and autofills the derived
    (underscore-prefixed) lookup tables for every subclass via __init_subclass__."""

    QUOTES: t.ClassVar[list[tuple[str, str] | str]]
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]]
    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]]
    STRING_ESCAPES: t.ClassVar[list[str]]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]]
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]]
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]]
    HINT_START: t.ClassVar[str]
    KEYWORDS: t.ClassVar[dict[str, TokenType]]
    SINGLE_TOKENS: t.ClassVar[dict[str, TokenType]]
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]]
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]]
    COMMANDS: t.ClassVar[set[TokenType]]
    COMMAND_PREFIX_TOKENS: t.ClassVar[set[TokenType]]
    HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool]
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool]
    NESTED_COMMENTS: t.ClassVar[bool]
    TOKENS_PRECEDING_HINT: t.ClassVar[set[TokenType]]
    HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType]
    COMMENTS: t.ClassVar[list[str | tuple[str, str]]]
    _QUOTES: t.ClassVar[dict[str, str]]
    _IDENTIFIERS: t.ClassVar[dict[str, str]]
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]]
    _STRING_ESCAPES: t.ClassVar[set[str]]
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]]
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]]
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]]
    _COMMENTS: t.ClassVar[dict[str, str | None]]
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]]

    @classmethod
    def __init_subclass__(cls, **kwargs: t.Any) -> None:
        """Compute the derived lookup tables (quotes, format strings, comments,
        escapes and the keyword trie) from the subclass's declared settings."""
        super().__init_subclass__(**kwargs)
        cls._QUOTES = _convert_quotes(cls.QUOTES)
        cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS)
        cls._FORMAT_STRINGS = {
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in cls._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS),
            **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS),
            **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS),
        }
        # Byte-string escapes default to the regular string escapes unless the
        # subclass overrides them explicitly.
        if "BYTE_STRING_ESCAPES" not in cls.__dict__:
            cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy()
        cls._STRING_ESCAPES = set(cls.STRING_ESCAPES)
        cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES)
        cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS)
        cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES)
        cls._COMMENTS = {
            **{c: None for c in cls.COMMENTS if isinstance(c, str)},
            **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)},
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }
        if cls.HINT_START in cls.KEYWORDS:
            cls._COMMENTS[cls.HINT_START] = "*/"
        # The trie only needs keys that the single-char scanner cannot resolve
        # on its own: multi-word keywords or keys containing a single-token char.
        cls._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *cls.KEYWORDS,
                *cls._COMMENTS,
                *cls._QUOTES,
                *cls._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in cls.SINGLE_TOKENS)
        )


class Tokenizer(_TokenizerBase):
    """Default SQL tokenizer; dialects subclass this and override the class-level
    settings, while the actual scanning is delegated to TokenizerCore."""

    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDE,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        "#": TokenType.HASH,
        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
        "'": TokenType.UNKNOWN,
        "`": TokenType.UNKNOWN,
        '"': TokenType.UNKNOWN,
    }

    BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = []
    IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"']
    QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"]
    STRING_ESCAPES: t.ClassVar[list[str]] = ["'"]
    BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = []
    VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set()
    ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = []

    # The strings in this list can always be used as escapes, regardless of the surrounding
    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
    IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = []

    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
    HEREDOC_TAG_IS_IDENTIFIER = False

    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR

    # Whether string escape characters function as such when placed within raw strings
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True

    NESTED_COMMENTS = True

    HINT_START = "/*+"

    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}

    # Autofilled
    _COMMENTS: t.ClassVar[dict[str, str | None]] = {}
    _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {}
    _IDENTIFIERS: t.ClassVar[dict[str, str]] = {}
    _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set()
    _QUOTES: t.ClassVar[dict[str, str]] = {}
    _STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set()
    _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {}
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set()

    KEYWORDS: t.ClassVar[dict[str, TokenType]] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        HINT_START: TokenType.HINT,
        "&<": TokenType.AMP_LT,
        "&>": TokenType.AMP_GT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "?::": TokenType.QDCOLON,
        "||": TokenType.DPIPE,
        "|>": TokenType.PIPE_GT,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "~~~": TokenType.GLOB,
        "~~": TokenType.LIKE,
        "~~*": TokenType.ILIKE,
        "~*": TokenType.IRLIKE,
        "-|-": TokenType.ADJACENT,
        "ALL": TokenType.ALL,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "COPY": TokenType.COPY,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ENUM": TokenType.ENUM,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FILE": TokenType.FILE,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCALTIME": TokenType.LOCALTIME,
        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NAMESPACE": TokenType.NAMESPACE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUT": TokenType.OUT,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "OPERATOR": TokenType.OPERATOR,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "RENAME": TokenType.RENAME,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SESSION": TokenType.SESSION,
        "SESSION_USER": TokenType.SESSION_USER,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "SQL SECURITY": TokenType.SQL_SECURITY,
        "START WITH": TokenType.START_WITH,
        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRIGGER": TokenType.TRIGGER,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "HUGEINT": TokenType.INT128,
        "UHUGEINT": TokenType.UINT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "INT128": TokenType.INT128,
        "INT256": TokenType.INT256,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "UINT": TokenType.UINT,
        "UINT128": TokenType.UINT128,
        "UINT256": TokenType.UINT256,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "DECIMAL32": TokenType.DECIMAL32,
        "DECIMAL64": TokenType.DECIMAL64,
        "DECIMAL128": TokenType.DECIMAL128,
        "DECIMAL256": TokenType.DECIMAL256,
        "DECFLOAT": TokenType.DECFLOAT,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "BIGNUM": TokenType.BIGNUM,
        "LIST": TokenType.LIST,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "JSONB": TokenType.JSONB,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "CHAR VARYING": TokenType.VARCHAR,
        "CHARACTER VARYING": TokenType.VARCHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "BPCHAR": TokenType.BPCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIME_NS": TokenType.TIME_NS,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "VECTOR": TokenType.VECTOR,
        "STRUCT": TokenType.STRUCT,
        "SEQUENCE": TokenType.SEQUENCE,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.ANALYZE,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.GRANT,
        "REVOKE": TokenType.REVOKE,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {}

    # In tokenizers like JSONPath, dots are always key separators, never decimal points
    NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True

    COMMENTS = ["--", ("/*", "*/")]

    __slots__ = (
        "dialect",
        "_core",
    )

    def __init__(self, dialect: DialectType = None) -> None:
        """Resolve `dialect` and build the TokenizerCore configured from this
        class's settings plus the dialect's tokenization flags."""
        from sqlglot.dialects.dialect import Dialect
        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore

        self.dialect = Dialect.get_or_raise(dialect)

        self._core = _TokenizerCore(
            single_tokens=self.SINGLE_TOKENS,
            keywords=self.KEYWORDS,
            quotes=self._QUOTES,
            format_strings=self._FORMAT_STRINGS,
            identifiers=self._IDENTIFIERS,
            comments=self._COMMENTS,
            string_escapes=self._STRING_ESCAPES,
            byte_string_escapes=self._BYTE_STRING_ESCAPES,
            identifier_escapes=self._IDENTIFIER_ESCAPES,
            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
            commands=self.COMMANDS,
            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
            nested_comments=self.NESTED_COMMENTS,
            hint_start=self.HINT_START,
            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
            bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS),
            hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS),
            numeric_literals=self.NUMERIC_LITERALS,
            var_single_tokens=self.VAR_SINGLE_TOKENS,
            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
            keyword_trie=self._KEYWORD_TRIE,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
            numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
        )

    def tokenize(self, sql: str) -> list[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        return self._core.tokenize(sql)  # type: ignore

    @property
    def sql(self) -> str:
        """The SQL string being tokenized."""
        return self._core.sql

    @property
    def size(self) -> int:
        """Length of the SQL string."""
        return self._core.size

    @property
    def tokens(self) -> list[Token]:
        """The list of tokens produced by tokenization."""
        return self._core.tokens
121class Tokenizer(_TokenizerBase): 122 SINGLE_TOKENS = { 123 "(": TokenType.L_PAREN, 124 ")": TokenType.R_PAREN, 125 "[": TokenType.L_BRACKET, 126 "]": TokenType.R_BRACKET, 127 "{": TokenType.L_BRACE, 128 "}": TokenType.R_BRACE, 129 "&": TokenType.AMP, 130 "^": TokenType.CARET, 131 ":": TokenType.COLON, 132 ",": TokenType.COMMA, 133 ".": TokenType.DOT, 134 "-": TokenType.DASH, 135 "=": TokenType.EQ, 136 ">": TokenType.GT, 137 "<": TokenType.LT, 138 "%": TokenType.MOD, 139 "!": TokenType.NOT, 140 "|": TokenType.PIPE, 141 "+": TokenType.PLUS, 142 ";": TokenType.SEMICOLON, 143 "/": TokenType.SLASH, 144 "\\": TokenType.BACKSLASH, 145 "*": TokenType.STAR, 146 "~": TokenType.TILDE, 147 "?": TokenType.PLACEHOLDER, 148 "@": TokenType.PARAMETER, 149 "#": TokenType.HASH, 150 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 151 "'": TokenType.UNKNOWN, 152 "`": TokenType.UNKNOWN, 153 '"': TokenType.UNKNOWN, 154 } 155 156 BIT_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 157 BYTE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 158 HEX_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 159 RAW_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 160 HEREDOC_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 161 UNICODE_STRINGS: t.ClassVar[list[str | tuple[str, str]]] = [] 162 IDENTIFIERS: t.ClassVar[list[str | tuple[str, str]]] = ['"'] 163 QUOTES: t.ClassVar[list[tuple[str, str] | str]] = ["'"] 164 STRING_ESCAPES: t.ClassVar[list[str]] = ["'"] 165 BYTE_STRING_ESCAPES: t.ClassVar[list[str]] = [] 166 VAR_SINGLE_TOKENS: t.ClassVar[set[str]] = set() 167 ESCAPE_FOLLOW_CHARS: t.ClassVar[list[str]] = [] 168 169 # The strings in this list can always be used as escapes, regardless of the surrounding 170 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 171 # identifier escape, e.g. 
if we use double-quotes, then they also act as escapes: "x""" 172 IDENTIFIER_ESCAPES: t.ClassVar[list[str]] = [] 173 174 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 175 HEREDOC_TAG_IS_IDENTIFIER = False 176 177 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 178 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 179 180 # Whether string escape characters function as such when placed within raw strings 181 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 182 183 NESTED_COMMENTS = True 184 185 HINT_START = "/*+" 186 187 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 188 189 # Autofilled 190 _COMMENTS: t.ClassVar[dict[str, str | None]] = {} 191 _FORMAT_STRINGS: t.ClassVar[dict[str, tuple[str, TokenType]]] = {} 192 _IDENTIFIERS: t.ClassVar[dict[str, str]] = {} 193 _IDENTIFIER_ESCAPES: t.ClassVar[set[str]] = set() 194 _QUOTES: t.ClassVar[dict[str, str]] = {} 195 _STRING_ESCAPES: t.ClassVar[set[str]] = set() 196 _BYTE_STRING_ESCAPES: t.ClassVar[set[str]] = set() 197 _KEYWORD_TRIE: t.ClassVar[dict[str, object]] = {} 198 _ESCAPE_FOLLOW_CHARS: t.ClassVar[set[str]] = set() 199 200 KEYWORDS: t.ClassVar[dict[str, TokenType]] = { 201 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 202 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 203 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 204 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 205 HINT_START: TokenType.HINT, 206 "&<": TokenType.AMP_LT, 207 "&>": TokenType.AMP_GT, 208 "==": TokenType.EQ, 209 "::": TokenType.DCOLON, 210 "?::": TokenType.QDCOLON, 211 "||": TokenType.DPIPE, 212 "|>": TokenType.PIPE_GT, 213 ">=": TokenType.GTE, 214 "<=": TokenType.LTE, 215 "<>": TokenType.NEQ, 216 "!=": TokenType.NEQ, 217 ":=": TokenType.COLON_EQ, 218 "<=>": TokenType.NULLSAFE_EQ, 219 "->": TokenType.ARROW, 220 "->>": 
TokenType.DARROW, 221 "=>": TokenType.FARROW, 222 "#>": TokenType.HASH_ARROW, 223 "#>>": TokenType.DHASH_ARROW, 224 "<->": TokenType.LR_ARROW, 225 "&&": TokenType.DAMP, 226 "??": TokenType.DQMARK, 227 "~~~": TokenType.GLOB, 228 "~~": TokenType.LIKE, 229 "~~*": TokenType.ILIKE, 230 "~*": TokenType.IRLIKE, 231 "-|-": TokenType.ADJACENT, 232 "ALL": TokenType.ALL, 233 "AND": TokenType.AND, 234 "ANTI": TokenType.ANTI, 235 "ANY": TokenType.ANY, 236 "ASC": TokenType.ASC, 237 "AS": TokenType.ALIAS, 238 "ASOF": TokenType.ASOF, 239 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 240 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 241 "BEGIN": TokenType.BEGIN, 242 "BETWEEN": TokenType.BETWEEN, 243 "CACHE": TokenType.CACHE, 244 "UNCACHE": TokenType.UNCACHE, 245 "CASE": TokenType.CASE, 246 "CHARACTER SET": TokenType.CHARACTER_SET, 247 "CLUSTER BY": TokenType.CLUSTER_BY, 248 "COLLATE": TokenType.COLLATE, 249 "COLUMN": TokenType.COLUMN, 250 "COMMIT": TokenType.COMMIT, 251 "CONNECT BY": TokenType.CONNECT_BY, 252 "CONSTRAINT": TokenType.CONSTRAINT, 253 "COPY": TokenType.COPY, 254 "CREATE": TokenType.CREATE, 255 "CROSS": TokenType.CROSS, 256 "CUBE": TokenType.CUBE, 257 "CURRENT_DATE": TokenType.CURRENT_DATE, 258 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 259 "CURRENT_TIME": TokenType.CURRENT_TIME, 260 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 261 "CURRENT_USER": TokenType.CURRENT_USER, 262 "CURRENT_CATALOG": TokenType.CURRENT_CATALOG, 263 "DATABASE": TokenType.DATABASE, 264 "DEFAULT": TokenType.DEFAULT, 265 "DELETE": TokenType.DELETE, 266 "DESC": TokenType.DESC, 267 "DESCRIBE": TokenType.DESCRIBE, 268 "DISTINCT": TokenType.DISTINCT, 269 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 270 "DIV": TokenType.DIV, 271 "DROP": TokenType.DROP, 272 "ELSE": TokenType.ELSE, 273 "END": TokenType.END, 274 "ENUM": TokenType.ENUM, 275 "ESCAPE": TokenType.ESCAPE, 276 "EXCEPT": TokenType.EXCEPT, 277 "EXECUTE": TokenType.EXECUTE, 278 "EXISTS": TokenType.EXISTS, 279 "FALSE": TokenType.FALSE, 280 "FETCH": 
TokenType.FETCH, 281 "FILTER": TokenType.FILTER, 282 "FILE": TokenType.FILE, 283 "FIRST": TokenType.FIRST, 284 "FULL": TokenType.FULL, 285 "FUNCTION": TokenType.FUNCTION, 286 "FOR": TokenType.FOR, 287 "FOREIGN KEY": TokenType.FOREIGN_KEY, 288 "FORMAT": TokenType.FORMAT, 289 "FROM": TokenType.FROM, 290 "GEOGRAPHY": TokenType.GEOGRAPHY, 291 "GEOMETRY": TokenType.GEOMETRY, 292 "GLOB": TokenType.GLOB, 293 "GROUP BY": TokenType.GROUP_BY, 294 "GROUPING SETS": TokenType.GROUPING_SETS, 295 "HAVING": TokenType.HAVING, 296 "ILIKE": TokenType.ILIKE, 297 "IN": TokenType.IN, 298 "INDEX": TokenType.INDEX, 299 "INET": TokenType.INET, 300 "INNER": TokenType.INNER, 301 "INSERT": TokenType.INSERT, 302 "INTERVAL": TokenType.INTERVAL, 303 "INTERSECT": TokenType.INTERSECT, 304 "INTO": TokenType.INTO, 305 "IS": TokenType.IS, 306 "ISNULL": TokenType.ISNULL, 307 "JOIN": TokenType.JOIN, 308 "KEEP": TokenType.KEEP, 309 "KILL": TokenType.KILL, 310 "LATERAL": TokenType.LATERAL, 311 "LEFT": TokenType.LEFT, 312 "LIKE": TokenType.LIKE, 313 "LIMIT": TokenType.LIMIT, 314 "LOAD": TokenType.LOAD, 315 "LOCALTIME": TokenType.LOCALTIME, 316 "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP, 317 "LOCK": TokenType.LOCK, 318 "MERGE": TokenType.MERGE, 319 "NAMESPACE": TokenType.NAMESPACE, 320 "NATURAL": TokenType.NATURAL, 321 "NEXT": TokenType.NEXT, 322 "NOT": TokenType.NOT, 323 "NOTNULL": TokenType.NOTNULL, 324 "NULL": TokenType.NULL, 325 "OBJECT": TokenType.OBJECT, 326 "OFFSET": TokenType.OFFSET, 327 "ON": TokenType.ON, 328 "OR": TokenType.OR, 329 "XOR": TokenType.XOR, 330 "ORDER BY": TokenType.ORDER_BY, 331 "ORDINALITY": TokenType.ORDINALITY, 332 "OUT": TokenType.OUT, 333 "OUTER": TokenType.OUTER, 334 "OVER": TokenType.OVER, 335 "OVERLAPS": TokenType.OVERLAPS, 336 "OVERWRITE": TokenType.OVERWRITE, 337 "PARTITION": TokenType.PARTITION, 338 "PARTITION BY": TokenType.PARTITION_BY, 339 "PARTITIONED BY": TokenType.PARTITION_BY, 340 "PARTITIONED_BY": TokenType.PARTITION_BY, 341 "PERCENT": TokenType.PERCENT, 342 
"PIVOT": TokenType.PIVOT, 343 "PRAGMA": TokenType.PRAGMA, 344 "PRIMARY KEY": TokenType.PRIMARY_KEY, 345 "PROCEDURE": TokenType.PROCEDURE, 346 "OPERATOR": TokenType.OPERATOR, 347 "QUALIFY": TokenType.QUALIFY, 348 "RANGE": TokenType.RANGE, 349 "RECURSIVE": TokenType.RECURSIVE, 350 "REGEXP": TokenType.RLIKE, 351 "RENAME": TokenType.RENAME, 352 "REPLACE": TokenType.REPLACE, 353 "RETURNING": TokenType.RETURNING, 354 "REFERENCES": TokenType.REFERENCES, 355 "RIGHT": TokenType.RIGHT, 356 "RLIKE": TokenType.RLIKE, 357 "ROLLBACK": TokenType.ROLLBACK, 358 "ROLLUP": TokenType.ROLLUP, 359 "ROW": TokenType.ROW, 360 "ROWS": TokenType.ROWS, 361 "SCHEMA": TokenType.SCHEMA, 362 "SELECT": TokenType.SELECT, 363 "SEMI": TokenType.SEMI, 364 "SESSION": TokenType.SESSION, 365 "SESSION_USER": TokenType.SESSION_USER, 366 "SET": TokenType.SET, 367 "SETTINGS": TokenType.SETTINGS, 368 "SHOW": TokenType.SHOW, 369 "SIMILAR TO": TokenType.SIMILAR_TO, 370 "SOME": TokenType.SOME, 371 "SORT BY": TokenType.SORT_BY, 372 "SQL SECURITY": TokenType.SQL_SECURITY, 373 "START WITH": TokenType.START_WITH, 374 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 375 "TABLE": TokenType.TABLE, 376 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 377 "TEMP": TokenType.TEMPORARY, 378 "TEMPORARY": TokenType.TEMPORARY, 379 "THEN": TokenType.THEN, 380 "TRUE": TokenType.TRUE, 381 "TRUNCATE": TokenType.TRUNCATE, 382 "TRIGGER": TokenType.TRIGGER, 383 "UNION": TokenType.UNION, 384 "UNKNOWN": TokenType.UNKNOWN, 385 "UNNEST": TokenType.UNNEST, 386 "UNPIVOT": TokenType.UNPIVOT, 387 "UPDATE": TokenType.UPDATE, 388 "USE": TokenType.USE, 389 "USING": TokenType.USING, 390 "UUID": TokenType.UUID, 391 "VALUES": TokenType.VALUES, 392 "VIEW": TokenType.VIEW, 393 "VOLATILE": TokenType.VOLATILE, 394 "WHEN": TokenType.WHEN, 395 "WHERE": TokenType.WHERE, 396 "WINDOW": TokenType.WINDOW, 397 "WITH": TokenType.WITH, 398 "APPLY": TokenType.APPLY, 399 "ARRAY": TokenType.ARRAY, 400 "BIT": TokenType.BIT, 401 "BOOL": TokenType.BOOLEAN, 402 "BOOLEAN": 
TokenType.BOOLEAN, 403 "BYTE": TokenType.TINYINT, 404 "MEDIUMINT": TokenType.MEDIUMINT, 405 "INT1": TokenType.TINYINT, 406 "TINYINT": TokenType.TINYINT, 407 "INT16": TokenType.SMALLINT, 408 "SHORT": TokenType.SMALLINT, 409 "SMALLINT": TokenType.SMALLINT, 410 "HUGEINT": TokenType.INT128, 411 "UHUGEINT": TokenType.UINT128, 412 "INT2": TokenType.SMALLINT, 413 "INTEGER": TokenType.INT, 414 "INT": TokenType.INT, 415 "INT4": TokenType.INT, 416 "INT32": TokenType.INT, 417 "INT64": TokenType.BIGINT, 418 "INT128": TokenType.INT128, 419 "INT256": TokenType.INT256, 420 "LONG": TokenType.BIGINT, 421 "BIGINT": TokenType.BIGINT, 422 "INT8": TokenType.TINYINT, 423 "UINT": TokenType.UINT, 424 "UINT128": TokenType.UINT128, 425 "UINT256": TokenType.UINT256, 426 "DEC": TokenType.DECIMAL, 427 "DECIMAL": TokenType.DECIMAL, 428 "DECIMAL32": TokenType.DECIMAL32, 429 "DECIMAL64": TokenType.DECIMAL64, 430 "DECIMAL128": TokenType.DECIMAL128, 431 "DECIMAL256": TokenType.DECIMAL256, 432 "DECFLOAT": TokenType.DECFLOAT, 433 "BIGDECIMAL": TokenType.BIGDECIMAL, 434 "BIGNUMERIC": TokenType.BIGDECIMAL, 435 "BIGNUM": TokenType.BIGNUM, 436 "LIST": TokenType.LIST, 437 "MAP": TokenType.MAP, 438 "NULLABLE": TokenType.NULLABLE, 439 "NUMBER": TokenType.DECIMAL, 440 "NUMERIC": TokenType.DECIMAL, 441 "FIXED": TokenType.DECIMAL, 442 "REAL": TokenType.FLOAT, 443 "FLOAT": TokenType.FLOAT, 444 "FLOAT4": TokenType.FLOAT, 445 "FLOAT8": TokenType.DOUBLE, 446 "DOUBLE": TokenType.DOUBLE, 447 "DOUBLE PRECISION": TokenType.DOUBLE, 448 "JSON": TokenType.JSON, 449 "JSONB": TokenType.JSONB, 450 "CHAR": TokenType.CHAR, 451 "CHARACTER": TokenType.CHAR, 452 "CHAR VARYING": TokenType.VARCHAR, 453 "CHARACTER VARYING": TokenType.VARCHAR, 454 "NCHAR": TokenType.NCHAR, 455 "VARCHAR": TokenType.VARCHAR, 456 "VARCHAR2": TokenType.VARCHAR, 457 "NVARCHAR": TokenType.NVARCHAR, 458 "NVARCHAR2": TokenType.NVARCHAR, 459 "BPCHAR": TokenType.BPCHAR, 460 "STR": TokenType.TEXT, 461 "STRING": TokenType.TEXT, 462 "TEXT": TokenType.TEXT, 463 
"LONGTEXT": TokenType.LONGTEXT, 464 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 465 "TINYTEXT": TokenType.TINYTEXT, 466 "CLOB": TokenType.TEXT, 467 "LONGVARCHAR": TokenType.TEXT, 468 "BINARY": TokenType.BINARY, 469 "BLOB": TokenType.VARBINARY, 470 "LONGBLOB": TokenType.LONGBLOB, 471 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 472 "TINYBLOB": TokenType.TINYBLOB, 473 "BYTEA": TokenType.VARBINARY, 474 "VARBINARY": TokenType.VARBINARY, 475 "TIME": TokenType.TIME, 476 "TIMETZ": TokenType.TIMETZ, 477 "TIME_NS": TokenType.TIME_NS, 478 "TIMESTAMP": TokenType.TIMESTAMP, 479 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 480 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 481 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 482 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 483 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 484 "DATE": TokenType.DATE, 485 "DATETIME": TokenType.DATETIME, 486 "INT4RANGE": TokenType.INT4RANGE, 487 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 488 "INT8RANGE": TokenType.INT8RANGE, 489 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 490 "NUMRANGE": TokenType.NUMRANGE, 491 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 492 "TSRANGE": TokenType.TSRANGE, 493 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 494 "TSTZRANGE": TokenType.TSTZRANGE, 495 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 496 "DATERANGE": TokenType.DATERANGE, 497 "DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 498 "UNIQUE": TokenType.UNIQUE, 499 "VECTOR": TokenType.VECTOR, 500 "STRUCT": TokenType.STRUCT, 501 "SEQUENCE": TokenType.SEQUENCE, 502 "VARIANT": TokenType.VARIANT, 503 "ALTER": TokenType.ALTER, 504 "ANALYZE": TokenType.ANALYZE, 505 "CALL": TokenType.COMMAND, 506 "COMMENT": TokenType.COMMENT, 507 "EXPLAIN": TokenType.COMMAND, 508 "GRANT": TokenType.GRANT, 509 "REVOKE": TokenType.REVOKE, 510 "OPTIMIZE": TokenType.COMMAND, 511 "PREPARE": TokenType.COMMAND, 512 "VACUUM": TokenType.COMMAND, 513 "USER-DEFINED": TokenType.USERDEFINED, 514 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 515 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 516 } 517 518 COMMANDS = 
{ 519 TokenType.COMMAND, 520 TokenType.EXECUTE, 521 TokenType.FETCH, 522 TokenType.SHOW, 523 TokenType.RENAME, 524 } 525 526 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 527 528 # Handle numeric literals like in hive (3L = BIGINT) 529 NUMERIC_LITERALS: t.ClassVar[dict[str, str]] = {} 530 531 # In tokenizers like JSONPath, dots are always key separators, never decimal points 532 NUMBERS_CAN_HAVE_DECIMALS: t.ClassVar[bool] = True 533 534 COMMENTS = ["--", ("/*", "*/")] 535 536 __slots__ = ( 537 "dialect", 538 "_core", 539 ) 540 541 def __init__(self, dialect: DialectType = None) -> None: 542 from sqlglot.dialects.dialect import Dialect 543 from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore 544 545 self.dialect = Dialect.get_or_raise(dialect) 546 547 self._core = _TokenizerCore( 548 single_tokens=self.SINGLE_TOKENS, 549 keywords=self.KEYWORDS, 550 quotes=self._QUOTES, 551 format_strings=self._FORMAT_STRINGS, 552 identifiers=self._IDENTIFIERS, 553 comments=self._COMMENTS, 554 string_escapes=self._STRING_ESCAPES, 555 byte_string_escapes=self._BYTE_STRING_ESCAPES, 556 identifier_escapes=self._IDENTIFIER_ESCAPES, 557 escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 558 commands=self.COMMANDS, 559 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 560 nested_comments=self.NESTED_COMMENTS, 561 hint_start=self.HINT_START, 562 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 563 bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS), 564 hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS), 565 numeric_literals=self.NUMERIC_LITERALS, 566 var_single_tokens=self.VAR_SINGLE_TOKENS, 567 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 568 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 569 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 570 keyword_trie=self._KEYWORD_TRIE, 571 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 572 
numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS, 573 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 574 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 575 ) 576 577 def tokenize(self, sql: str) -> list[Token]: 578 """Returns a list of tokens corresponding to the SQL string `sql`.""" 579 return self._core.tokenize(sql) # type: ignore 580 581 @property 582 def sql(self) -> str: 583 """The SQL string being tokenized.""" 584 return self._core.sql 585 586 @property 587 def size(self) -> int: 588 """Length of the SQL string.""" 589 return self._core.size 590 591 @property 592 def tokens(self) -> list[Token]: 593 """The list of tokens produced by tokenization.""" 594 return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
541 def __init__(self, dialect: DialectType = None) -> None: 542 from sqlglot.dialects.dialect import Dialect 543 from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore 544 545 self.dialect = Dialect.get_or_raise(dialect) 546 547 self._core = _TokenizerCore( 548 single_tokens=self.SINGLE_TOKENS, 549 keywords=self.KEYWORDS, 550 quotes=self._QUOTES, 551 format_strings=self._FORMAT_STRINGS, 552 identifiers=self._IDENTIFIERS, 553 comments=self._COMMENTS, 554 string_escapes=self._STRING_ESCAPES, 555 byte_string_escapes=self._BYTE_STRING_ESCAPES, 556 identifier_escapes=self._IDENTIFIER_ESCAPES, 557 escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 558 commands=self.COMMANDS, 559 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 560 nested_comments=self.NESTED_COMMENTS, 561 hint_start=self.HINT_START, 562 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 563 bit_strings=list[t.Union[str, tuple[str, str]]](self.BIT_STRINGS), 564 hex_strings=list[t.Union[str, tuple[str, str]]](self.HEX_STRINGS), 565 numeric_literals=self.NUMERIC_LITERALS, 566 var_single_tokens=self.VAR_SINGLE_TOKENS, 567 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 568 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 569 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 570 keyword_trie=self._KEYWORD_TRIE, 571 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 572 numbers_can_have_decimals=self.NUMBERS_CAN_HAVE_DECIMALS, 573 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 574 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 575 )
SINGLE_TOKENS =
{'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 36>, '^': <TokenType.CARET: 42>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 28>, '>': <TokenType.GT: 25>, '<': <TokenType.LT: 23>, '%': <TokenType.MOD: 326>, '!': <TokenType.NOT: 27>, '|': <TokenType.PIPE: 39>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 19>, '/': <TokenType.SLASH: 22>, '\\': <TokenType.BACKSLASH: 21>, '*': <TokenType.STAR: 20>, '~': <TokenType.TILDE: 44>, '?': <TokenType.PLACEHOLDER: 353>, '@': <TokenType.PARAMETER: 56>, '#': <TokenType.HASH: 48>, "'": <TokenType.UNKNOWN: 212>, '`': <TokenType.UNKNOWN: 212>, '"': <TokenType.UNKNOWN: 212>}
TOKENS_PRECEDING_HINT =
{<TokenType.INSERT: 297>, <TokenType.UPDATE: 413>, <TokenType.DELETE: 254>, <TokenType.SELECT: 383>}
KEYWORDS: ClassVar[dict[str, sqlglot.tokenizer_core.TokenType]] =
{'{%': <TokenType.BLOCK_START: 71>, '{%+': <TokenType.BLOCK_START: 71>, '{%-': <TokenType.BLOCK_START: 71>, '%}': <TokenType.BLOCK_END: 72>, '+%}': <TokenType.BLOCK_END: 72>, '-%}': <TokenType.BLOCK_END: 72>, '{{+': <TokenType.BLOCK_START: 71>, '{{-': <TokenType.BLOCK_START: 71>, '+}}': <TokenType.BLOCK_END: 72>, '-}}': <TokenType.BLOCK_END: 72>, '/*+': <TokenType.HINT: 290>, '&<': <TokenType.AMP_LT: 61>, '&>': <TokenType.AMP_GT: 62>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 366>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '&&': <TokenType.DAMP: 60>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 284>, '~~': <TokenType.LIKE: 315>, '~~*': <TokenType.ILIKE: 292>, '~*': <TokenType.IRLIKE: 304>, '-|-': <TokenType.ADJACENT: 63>, 'ALL': <TokenType.ALL: 218>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 219>, 'ANY': <TokenType.ANY: 220>, 'ASC': <TokenType.ASC: 223>, 'AS': <TokenType.ALIAS: 216>, 'ASOF': <TokenType.ASOF: 224>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'BEGIN': <TokenType.BEGIN: 227>, 'BETWEEN': <TokenType.BETWEEN: 228>, 'CACHE': <TokenType.CACHE: 230>, 'UNCACHE': <TokenType.UNCACHE: 409>, 'CASE': <TokenType.CASE: 231>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 232>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 233>, 'COLLATE': <TokenType.COLLATE: 234>, 'COLUMN': <TokenType.COLUMN: 79>, 'COMMIT': <TokenType.COMMIT: 237>, 'CONNECT BY': <TokenType.CONNECT_BY: 238>, 'CONSTRAINT': <TokenType.CONSTRAINT: 239>, 'COPY': <TokenType.COPY: 240>, 'CREATE': <TokenType.CREATE: 241>, 
'CROSS': <TokenType.CROSS: 242>, 'CUBE': <TokenType.CUBE: 243>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 244>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 246>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 247>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 248>, 'CURRENT_USER': <TokenType.CURRENT_USER: 249>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 251>, 'DATABASE': <TokenType.DATABASE: 78>, 'DEFAULT': <TokenType.DEFAULT: 253>, 'DELETE': <TokenType.DELETE: 254>, 'DESC': <TokenType.DESC: 255>, 'DESCRIBE': <TokenType.DESCRIBE: 256>, 'DISTINCT': <TokenType.DISTINCT: 259>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 260>, 'DIV': <TokenType.DIV: 261>, 'DROP': <TokenType.DROP: 262>, 'ELSE': <TokenType.ELSE: 263>, 'END': <TokenType.END: 264>, 'ENUM': <TokenType.ENUM: 203>, 'ESCAPE': <TokenType.ESCAPE: 265>, 'EXCEPT': <TokenType.EXCEPT: 266>, 'EXECUTE': <TokenType.EXECUTE: 267>, 'EXISTS': <TokenType.EXISTS: 268>, 'FALSE': <TokenType.FALSE: 269>, 'FETCH': <TokenType.FETCH: 270>, 'FILTER': <TokenType.FILTER: 273>, 'FILE': <TokenType.FILE: 271>, 'FIRST': <TokenType.FIRST: 275>, 'FULL': <TokenType.FULL: 281>, 'FUNCTION': <TokenType.FUNCTION: 282>, 'FOR': <TokenType.FOR: 276>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 278>, 'FORMAT': <TokenType.FORMAT: 279>, 'FROM': <TokenType.FROM: 280>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 170>, 'GEOMETRY': <TokenType.GEOMETRY: 173>, 'GLOB': <TokenType.GLOB: 284>, 'GROUP BY': <TokenType.GROUP_BY: 287>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 288>, 'HAVING': <TokenType.HAVING: 289>, 'ILIKE': <TokenType.ILIKE: 292>, 'IN': <TokenType.IN: 293>, 'INDEX': <TokenType.INDEX: 294>, 'INET': <TokenType.INET: 198>, 'INNER': <TokenType.INNER: 296>, 'INSERT': <TokenType.INSERT: 297>, 'INTERVAL': <TokenType.INTERVAL: 301>, 'INTERSECT': <TokenType.INTERSECT: 300>, 'INTO': <TokenType.INTO: 302>, 'IS': <TokenType.IS: 305>, 'ISNULL': <TokenType.ISNULL: 306>, 'JOIN': <TokenType.JOIN: 307>, 'KEEP': <TokenType.KEEP: 309>, 'KILL': <TokenType.KILL: 311>, 
'LATERAL': <TokenType.LATERAL: 313>, 'LEFT': <TokenType.LEFT: 314>, 'LIKE': <TokenType.LIKE: 315>, 'LIMIT': <TokenType.LIMIT: 316>, 'LOAD': <TokenType.LOAD: 318>, 'LOCALTIME': <TokenType.LOCALTIME: 177>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 178>, 'LOCK': <TokenType.LOCK: 319>, 'MERGE': <TokenType.MERGE: 325>, 'NAMESPACE': <TokenType.NAMESPACE: 436>, 'NATURAL': <TokenType.NATURAL: 328>, 'NEXT': <TokenType.NEXT: 329>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 331>, 'NULL': <TokenType.NULL: 332>, 'OBJECT': <TokenType.OBJECT: 197>, 'OFFSET': <TokenType.OFFSET: 334>, 'ON': <TokenType.ON: 335>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 64>, 'ORDER BY': <TokenType.ORDER_BY: 338>, 'ORDINALITY': <TokenType.ORDINALITY: 341>, 'OUT': <TokenType.OUT: 342>, 'OUTER': <TokenType.OUTER: 344>, 'OVER': <TokenType.OVER: 345>, 'OVERLAPS': <TokenType.OVERLAPS: 346>, 'OVERWRITE': <TokenType.OVERWRITE: 347>, 'PARTITION': <TokenType.PARTITION: 349>, 'PARTITION BY': <TokenType.PARTITION_BY: 350>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 350>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 350>, 'PERCENT': <TokenType.PERCENT: 351>, 'PIVOT': <TokenType.PIVOT: 352>, 'PRAGMA': <TokenType.PRAGMA: 357>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 359>, 'PROCEDURE': <TokenType.PROCEDURE: 360>, 'OPERATOR': <TokenType.OPERATOR: 337>, 'QUALIFY': <TokenType.QUALIFY: 364>, 'RANGE': <TokenType.RANGE: 367>, 'RECURSIVE': <TokenType.RECURSIVE: 368>, 'REGEXP': <TokenType.RLIKE: 376>, 'RENAME': <TokenType.RENAME: 370>, 'REPLACE': <TokenType.REPLACE: 371>, 'RETURNING': <TokenType.RETURNING: 372>, 'REFERENCES': <TokenType.REFERENCES: 374>, 'RIGHT': <TokenType.RIGHT: 375>, 'RLIKE': <TokenType.RLIKE: 376>, 'ROLLBACK': <TokenType.ROLLBACK: 378>, 'ROLLUP': <TokenType.ROLLUP: 379>, 'ROW': <TokenType.ROW: 380>, 'ROWS': <TokenType.ROWS: 381>, 'SCHEMA': <TokenType.SCHEMA: 81>, 'SELECT': <TokenType.SELECT: 383>, 'SEMI': <TokenType.SEMI: 384>, 'SESSION': <TokenType.SESSION: 57>, 'SESSION_USER': 
<TokenType.SESSION_USER: 59>, 'SET': <TokenType.SET: 388>, 'SETTINGS': <TokenType.SETTINGS: 389>, 'SHOW': <TokenType.SHOW: 390>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 391>, 'SOME': <TokenType.SOME: 392>, 'SORT BY': <TokenType.SORT_BY: 393>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 395>, 'START WITH': <TokenType.START_WITH: 396>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 398>, 'TABLE': <TokenType.TABLE: 82>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 401>, 'TEMP': <TokenType.TEMPORARY: 403>, 'TEMPORARY': <TokenType.TEMPORARY: 403>, 'THEN': <TokenType.THEN: 405>, 'TRUE': <TokenType.TRUE: 406>, 'TRUNCATE': <TokenType.TRUNCATE: 407>, 'TRIGGER': <TokenType.TRIGGER: 408>, 'UNION': <TokenType.UNION: 410>, 'UNKNOWN': <TokenType.UNKNOWN: 212>, 'UNNEST': <TokenType.UNNEST: 411>, 'UNPIVOT': <TokenType.UNPIVOT: 412>, 'UPDATE': <TokenType.UPDATE: 413>, 'USE': <TokenType.USE: 414>, 'USING': <TokenType.USING: 415>, 'UUID': <TokenType.UUID: 169>, 'VALUES': <TokenType.VALUES: 416>, 'VIEW': <TokenType.VIEW: 418>, 'VOLATILE': <TokenType.VOLATILE: 420>, 'WHEN': <TokenType.WHEN: 422>, 'WHERE': <TokenType.WHERE: 423>, 'WINDOW': <TokenType.WINDOW: 424>, 'WITH': <TokenType.WITH: 425>, 'APPLY': <TokenType.APPLY: 221>, 'ARRAY': <TokenType.ARRAY: 222>, 'BIT': <TokenType.BIT: 95>, 'BOOL': <TokenType.BOOLEAN: 96>, 'BOOLEAN': <TokenType.BOOLEAN: 96>, 'BYTE': <TokenType.TINYINT: 97>, 'MEDIUMINT': <TokenType.MEDIUMINT: 101>, 'INT1': <TokenType.TINYINT: 97>, 'TINYINT': <TokenType.TINYINT: 97>, 'INT16': <TokenType.SMALLINT: 99>, 'SHORT': <TokenType.SMALLINT: 99>, 'SMALLINT': <TokenType.SMALLINT: 99>, 'HUGEINT': <TokenType.INT128: 108>, 'UHUGEINT': <TokenType.UINT128: 109>, 'INT2': <TokenType.SMALLINT: 99>, 'INTEGER': <TokenType.INT: 103>, 'INT': <TokenType.INT: 103>, 'INT4': <TokenType.INT: 103>, 'INT32': <TokenType.INT: 103>, 'INT64': <TokenType.BIGINT: 105>, 'INT128': <TokenType.INT128: 108>, 'INT256': <TokenType.INT256: 110>, 'LONG': <TokenType.BIGINT: 105>, 'BIGINT': <TokenType.BIGINT: 105>, 
'INT8': <TokenType.TINYINT: 97>, 'UINT': <TokenType.UINT: 104>, 'UINT128': <TokenType.UINT128: 109>, 'UINT256': <TokenType.UINT256: 111>, 'DEC': <TokenType.DECIMAL: 115>, 'DECIMAL': <TokenType.DECIMAL: 115>, 'DECIMAL32': <TokenType.DECIMAL32: 116>, 'DECIMAL64': <TokenType.DECIMAL64: 117>, 'DECIMAL128': <TokenType.DECIMAL128: 118>, 'DECIMAL256': <TokenType.DECIMAL256: 119>, 'DECFLOAT': <TokenType.DECFLOAT: 120>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 122>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 122>, 'BIGNUM': <TokenType.BIGNUM: 107>, 'LIST': <TokenType.LIST: 317>, 'MAP': <TokenType.MAP: 320>, 'NULLABLE': <TokenType.NULLABLE: 172>, 'NUMBER': <TokenType.DECIMAL: 115>, 'NUMERIC': <TokenType.DECIMAL: 115>, 'FIXED': <TokenType.DECIMAL: 115>, 'REAL': <TokenType.FLOAT: 112>, 'FLOAT': <TokenType.FLOAT: 112>, 'FLOAT4': <TokenType.FLOAT: 112>, 'FLOAT8': <TokenType.DOUBLE: 113>, 'DOUBLE': <TokenType.DOUBLE: 113>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 113>, 'JSON': <TokenType.JSON: 139>, 'JSONB': <TokenType.JSONB: 140>, 'CHAR': <TokenType.CHAR: 123>, 'CHARACTER': <TokenType.CHAR: 123>, 'CHAR VARYING': <TokenType.VARCHAR: 125>, 'CHARACTER VARYING': <TokenType.VARCHAR: 125>, 'NCHAR': <TokenType.NCHAR: 124>, 'VARCHAR': <TokenType.VARCHAR: 125>, 'VARCHAR2': <TokenType.VARCHAR: 125>, 'NVARCHAR': <TokenType.NVARCHAR: 126>, 'NVARCHAR2': <TokenType.NVARCHAR: 126>, 'BPCHAR': <TokenType.BPCHAR: 127>, 'STR': <TokenType.TEXT: 128>, 'STRING': <TokenType.TEXT: 128>, 'TEXT': <TokenType.TEXT: 128>, 'LONGTEXT': <TokenType.LONGTEXT: 130>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 129>, 'TINYTEXT': <TokenType.TINYTEXT: 135>, 'CLOB': <TokenType.TEXT: 128>, 'LONGVARCHAR': <TokenType.TEXT: 128>, 'BINARY': <TokenType.BINARY: 137>, 'BLOB': <TokenType.VARBINARY: 138>, 'LONGBLOB': <TokenType.LONGBLOB: 133>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 132>, 'TINYBLOB': <TokenType.TINYBLOB: 134>, 'BYTEA': <TokenType.VARBINARY: 138>, 'VARBINARY': <TokenType.VARBINARY: 138>, 'TIME': <TokenType.TIME: 141>, 'TIMETZ': 
<TokenType.TIMETZ: 142>, 'TIME_NS': <TokenType.TIME_NS: 143>, 'TIMESTAMP': <TokenType.TIMESTAMP: 144>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 145>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 147>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 147>, 'DATE': <TokenType.DATE: 155>, 'DATETIME': <TokenType.DATETIME: 151>, 'INT4RANGE': <TokenType.INT4RANGE: 157>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 158>, 'INT8RANGE': <TokenType.INT8RANGE: 159>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 160>, 'NUMRANGE': <TokenType.NUMRANGE: 161>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 162>, 'TSRANGE': <TokenType.TSRANGE: 163>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 164>, 'TSTZRANGE': <TokenType.TSTZRANGE: 165>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 166>, 'DATERANGE': <TokenType.DATERANGE: 167>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 168>, 'UNIQUE': <TokenType.UNIQUE: 426>, 'VECTOR': <TokenType.VECTOR: 213>, 'STRUCT': <TokenType.STRUCT: 399>, 'SEQUENCE': <TokenType.SEQUENCE: 386>, 'VARIANT': <TokenType.VARIANT: 196>, 'ALTER': <TokenType.ALTER: 217>, 'ANALYZE': <TokenType.ANALYZE: 435>, 'CALL': <TokenType.COMMAND: 235>, 'COMMENT': <TokenType.COMMENT: 236>, 'EXPLAIN': <TokenType.COMMAND: 235>, 'GRANT': <TokenType.GRANT: 286>, 'REVOKE': <TokenType.REVOKE: 373>, 'OPTIMIZE': <TokenType.COMMAND: 235>, 'PREPARE': <TokenType.COMMAND: 235>, 'VACUUM': <TokenType.COMMAND: 235>, 'USER-DEFINED': <TokenType.USERDEFINED: 191>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 430>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 431>}
COMMANDS =
{<TokenType.SHOW: 390>, <TokenType.EXECUTE: 267>, <TokenType.COMMAND: 235>, <TokenType.FETCH: 270>, <TokenType.RENAME: 370>}
577 def tokenize(self, sql: str) -> list[Token]: 578 """Returns a list of tokens corresponding to the SQL string `sql`.""" 579 return self._core.tokenize(sql) # type: ignore
Returns a list of tokens corresponding to the SQL string sql.
sql: str
581 @property 582 def sql(self) -> str: 583 """The SQL string being tokenized.""" 584 return self._core.sql
The SQL string being tokenized.
size: int
586 @property 587 def size(self) -> int: 588 """Length of the SQL string.""" 589 return self._core.size
Length of the SQL string.
tokens: list[sqlglot.tokenizer_core.Token]
591 @property 592 def tokens(self) -> list[Token]: 593 """The list of tokens produced by tokenization.""" 594 return self._core.tokens
The list of tokens produced by tokenization.