"""sqlglot.tokens

Python-facing tokenizer configuration for sqlglot.

The actual tokenization is performed by ``sqlglot.tokenizer_core`` (compiled
with mypyc).  This module declares the per-dialect tokenizer settings —
quotes, identifiers, keywords, comments, escapes, etc. — and wires them into
the compiled core via :class:`Tokenizer`.
"""

from __future__ import annotations

import typing as t

from sqlglot.trie import new_trie

# Import Token and TokenType from tokenizer_core (compiled with mypyc)
from sqlglot.tokenizer_core import Token, TokenType

try:
    import sqlglotrs  # type: ignore # noqa: F401
    import warnings

    warnings.warn(
        "sqlglot[rs] is deprecated and no longer compatible with sqlglot. "
        "Please use sqlglotc instead for faster parsing: pip install sqlglot[c]",
    )
except ImportError:
    pass

if t.TYPE_CHECKING:
    from sqlglot.dialects.dialect import DialectType


def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
    """Normalize quote specs into a {start: end} mapping.

    A bare string acts as both the opening and closing delimiter; a 2-tuple
    supplies distinct open/close delimiters.
    """
    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)


def _quotes_to_format(
    token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
) -> t.Dict[str, t.Tuple[str, TokenType]]:
    """Map each opening delimiter to its (closing delimiter, token type) pair."""
    return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}


class _TokenizerBase:
    """Declarative tokenizer settings plus the hook that derives the autofilled ones.

    Subclasses (i.e. :class:`Tokenizer` and dialect-specific tokenizers)
    declare the public, list/str-based settings; ``__init_subclass__``
    precomputes the underscore-prefixed derived structures from them each
    time a subclass is created.
    """

    QUOTES: t.ClassVar[t.List[t.Tuple[str, str] | str]]
    IDENTIFIERS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    BIT_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    BYTE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    HEX_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    RAW_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    HEREDOC_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    UNICODE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    STRING_ESCAPES: t.ClassVar[t.List[str]]
    BYTE_STRING_ESCAPES: t.ClassVar[t.List[str]]
    ESCAPE_FOLLOW_CHARS: t.ClassVar[t.List[str]]
    IDENTIFIER_ESCAPES: t.ClassVar[t.List[str]]
    HINT_START: t.ClassVar[str]
    KEYWORDS: t.ClassVar[t.Dict[str, TokenType]]
    SINGLE_TOKENS: t.ClassVar[t.Dict[str, TokenType]]
    NUMERIC_LITERALS: t.ClassVar[t.Dict[str, str]]
    VAR_SINGLE_TOKENS: t.ClassVar[t.Set[str]]
    COMMANDS: t.ClassVar[t.Set[TokenType]]
    COMMAND_PREFIX_TOKENS: t.ClassVar[t.Set[TokenType]]
    HEREDOC_TAG_IS_IDENTIFIER: t.ClassVar[bool]
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS: t.ClassVar[bool]
    NESTED_COMMENTS: t.ClassVar[bool]
    TOKENS_PRECEDING_HINT: t.ClassVar[t.Set[TokenType]]
    HEREDOC_STRING_ALTERNATIVE: t.ClassVar[TokenType]
    COMMENTS: t.ClassVar[t.List[str | t.Tuple[str, str]]]
    # Derived ("autofilled") settings, computed in __init_subclass__.
    _QUOTES: t.ClassVar[t.Dict[str, str]]
    _IDENTIFIERS: t.ClassVar[t.Dict[str, str]]
    _FORMAT_STRINGS: t.ClassVar[t.Dict[str, t.Tuple[str, TokenType]]]
    _STRING_ESCAPES: t.ClassVar[t.Set[str]]
    _BYTE_STRING_ESCAPES: t.ClassVar[t.Set[str]]
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[t.Set[str]]
    _IDENTIFIER_ESCAPES: t.ClassVar[t.Set[str]]
    _COMMENTS: t.ClassVar[t.Dict[str, t.Optional[str]]]
    _KEYWORD_TRIE: t.ClassVar[t.Dict]

    @classmethod
    def __init_subclass__(cls, **kwargs: t.Any) -> None:
        """Precompute the derived class attributes for every new subclass."""
        super().__init_subclass__(**kwargs)
        cls._QUOTES = _convert_quotes(cls.QUOTES)
        cls._IDENTIFIERS = _convert_quotes(cls.IDENTIFIERS)
        cls._FORMAT_STRINGS = {
            # National strings: any regular quote prefixed with n/N.
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in cls._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, cls.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, cls.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, cls.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, cls.RAW_STRINGS),
            **_quotes_to_format(TokenType.HEREDOC_STRING, cls.HEREDOC_STRINGS),
            **_quotes_to_format(TokenType.UNICODE_STRING, cls.UNICODE_STRINGS),
        }
        # Byte strings reuse the regular string escapes unless the subclass
        # explicitly declares its own BYTE_STRING_ESCAPES.
        if "BYTE_STRING_ESCAPES" not in cls.__dict__:
            cls.BYTE_STRING_ESCAPES = cls.STRING_ESCAPES.copy()
        cls._STRING_ESCAPES = set(cls.STRING_ESCAPES)
        cls._BYTE_STRING_ESCAPES = set(cls.BYTE_STRING_ESCAPES)
        cls._ESCAPE_FOLLOW_CHARS = set(cls.ESCAPE_FOLLOW_CHARS)
        cls._IDENTIFIER_ESCAPES = set(cls.IDENTIFIER_ESCAPES)
        cls._COMMENTS = {
            **{c: None for c in cls.COMMENTS if isinstance(c, str)},
            **{c[0]: c[1] for c in cls.COMMENTS if not isinstance(c, str)},
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }
        if cls.HINT_START in cls.KEYWORDS:
            cls._COMMENTS[cls.HINT_START] = "*/"
        # Only multi-char keys that can't be matched as plain words need the
        # trie: keys with spaces or keys containing a single-token character.
        cls._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *cls.KEYWORDS,
                *cls._COMMENTS,
                *cls._QUOTES,
                *cls._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in cls.SINGLE_TOKENS)
        )


class Tokenizer(_TokenizerBase):
    """Default SQL tokenizer; dialects subclass this and override settings."""

    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDE,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        "#": TokenType.HASH,
        # Used for breaking a var like x'y' but nothing else the token type doesn't matter
        "'": TokenType.UNKNOWN,
        "`": TokenType.UNKNOWN,
        '"': TokenType.UNKNOWN,
    }

    BIT_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    BYTE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    HEX_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    RAW_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    HEREDOC_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    UNICODE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = []
    IDENTIFIERS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = ['"']
    QUOTES: t.ClassVar[t.List[t.Tuple[str, str] | str]] = ["'"]
    STRING_ESCAPES = ["'"]
    BYTE_STRING_ESCAPES: t.ClassVar[t.List[str]] = []
    VAR_SINGLE_TOKENS: t.ClassVar[t.Set[str]] = set()
    ESCAPE_FOLLOW_CHARS: t.ClassVar[t.List[str]] = []

    # The strings in this list can always be used as escapes, regardless of the surrounding
    # identifier delimiters. By default, the closing delimiter is assumed to also act as an
    # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x"""
    IDENTIFIER_ESCAPES: t.ClassVar[t.List[str]] = []

    # Whether the heredoc tags follow the same lexical rules as unquoted identifiers
    HEREDOC_TAG_IS_IDENTIFIER = False

    # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc
    HEREDOC_STRING_ALTERNATIVE = TokenType.VAR

    # Whether string escape characters function as such when placed within raw strings
    STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True

    NESTED_COMMENTS = True

    HINT_START = "/*+"

    TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE}

    # Autofilled
    _COMMENTS: t.ClassVar[t.Dict[str, t.Optional[str]]] = {}
    _FORMAT_STRINGS: t.ClassVar[t.Dict[str, t.Tuple[str, TokenType]]] = {}
    _IDENTIFIERS: t.ClassVar[t.Dict[str, str]] = {}
    _IDENTIFIER_ESCAPES: t.ClassVar[t.Set[str]] = set()
    _QUOTES: t.ClassVar[t.Dict[str, str]] = {}
    _STRING_ESCAPES: t.ClassVar[t.Set[str]] = set()
    _BYTE_STRING_ESCAPES: t.ClassVar[t.Set[str]] = set()
    _KEYWORD_TRIE: t.ClassVar[t.Dict] = {}
    _ESCAPE_FOLLOW_CHARS: t.ClassVar[t.Set[str]] = set()

    KEYWORDS: t.ClassVar[t.Dict[str, TokenType]] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        HINT_START: TokenType.HINT,
        "&<": TokenType.AMP_LT,
        "&>": TokenType.AMP_GT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "?::": TokenType.QDCOLON,
        "||": TokenType.DPIPE,
        "|>": TokenType.PIPE_GT,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "~~~": TokenType.GLOB,
        "~~": TokenType.LIKE,
        "~~*": TokenType.ILIKE,
        "~*": TokenType.IRLIKE,
        "-|-": TokenType.ADJACENT,
        "ALL": TokenType.ALL,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "COPY": TokenType.COPY,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "CURRENT_CATALOG": TokenType.CURRENT_CATALOG,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ENUM": TokenType.ENUM,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FILE": TokenType.FILE,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCALTIME": TokenType.LOCALTIME,
        "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NAMESPACE": TokenType.NAMESPACE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUT": TokenType.OUT,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "OPERATOR": TokenType.OPERATOR,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "RENAME": TokenType.RENAME,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SESSION": TokenType.SESSION,
        "SESSION_USER": TokenType.SESSION_USER,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "START WITH": TokenType.START_WITH,
        "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRIGGER": TokenType.TRIGGER,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "HUGEINT": TokenType.INT128,
        "UHUGEINT": TokenType.UINT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "INT128": TokenType.INT128,
        "INT256": TokenType.INT256,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "UINT": TokenType.UINT,
        "UINT128": TokenType.UINT128,
        "UINT256": TokenType.UINT256,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "DECIMAL32": TokenType.DECIMAL32,
        "DECIMAL64": TokenType.DECIMAL64,
        "DECIMAL128": TokenType.DECIMAL128,
        "DECIMAL256": TokenType.DECIMAL256,
        "DECFLOAT": TokenType.DECFLOAT,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "BIGNUM": TokenType.BIGNUM,
        "LIST": TokenType.LIST,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "JSONB": TokenType.JSONB,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "CHAR VARYING": TokenType.VARCHAR,
        "CHARACTER VARYING": TokenType.VARCHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "BPCHAR": TokenType.BPCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIME_NS": TokenType.TIME_NS,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
        "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ,
        "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "VECTOR": TokenType.VECTOR,
        "STRUCT": TokenType.STRUCT,
        "SEQUENCE": TokenType.SEQUENCE,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.ANALYZE,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.GRANT,
        "REVOKE": TokenType.REVOKE,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
        TokenType.RENAME,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # Handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.ClassVar[t.Dict[str, str]] = {}

    COMMENTS = ["--", ("/*", "*/")]

    __slots__ = (
        "dialect",
        "_core",
    )

    def __init__(
        self,
        dialect: DialectType = None,
        **opts: t.Any,
    ) -> None:
        """Build a tokenizer for `dialect`, wiring this class's settings into
        the compiled core.

        Args:
            dialect: The dialect (name, class, or instance) whose settings to use.
            **opts: Accepted for backward compatibility; currently unused.
        """
        # Local imports avoid a circular dependency at module load time.
        from sqlglot.dialects import Dialect
        from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore

        self.dialect = Dialect.get_or_raise(dialect)

        self._core = _TokenizerCore(
            single_tokens=self.SINGLE_TOKENS,
            keywords=self.KEYWORDS,
            quotes=self._QUOTES,
            format_strings=self._FORMAT_STRINGS,
            identifiers=self._IDENTIFIERS,
            comments=self._COMMENTS,
            string_escapes=self._STRING_ESCAPES,
            byte_string_escapes=self._BYTE_STRING_ESCAPES,
            identifier_escapes=self._IDENTIFIER_ESCAPES,
            escape_follow_chars=self._ESCAPE_FOLLOW_CHARS,
            commands=self.COMMANDS,
            command_prefix_tokens=self.COMMAND_PREFIX_TOKENS,
            nested_comments=self.NESTED_COMMENTS,
            hint_start=self.HINT_START,
            tokens_preceding_hint=self.TOKENS_PRECEDING_HINT,
            bit_strings=list(self.BIT_STRINGS),
            hex_strings=list(self.HEX_STRINGS),
            numeric_literals=self.NUMERIC_LITERALS,
            var_single_tokens=self.VAR_SINGLE_TOKENS,
            string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS,
            heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER,
            heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE,
            keyword_trie=self._KEYWORD_TRIE,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
        )

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        return self._core.tokenize(sql)  # type: ignore

    @property
    def sql(self) -> str:
        """The SQL string being tokenized."""
        return self._core.sql

    @property
    def size(self) -> int:
        """Length of the SQL string."""
        return self._core.size

    @property
    def tokens(self) -> t.List[Token]:
        """The list of tokens produced by tokenization."""
        return self._core.tokens
115class Tokenizer(_TokenizerBase): 116 SINGLE_TOKENS = { 117 "(": TokenType.L_PAREN, 118 ")": TokenType.R_PAREN, 119 "[": TokenType.L_BRACKET, 120 "]": TokenType.R_BRACKET, 121 "{": TokenType.L_BRACE, 122 "}": TokenType.R_BRACE, 123 "&": TokenType.AMP, 124 "^": TokenType.CARET, 125 ":": TokenType.COLON, 126 ",": TokenType.COMMA, 127 ".": TokenType.DOT, 128 "-": TokenType.DASH, 129 "=": TokenType.EQ, 130 ">": TokenType.GT, 131 "<": TokenType.LT, 132 "%": TokenType.MOD, 133 "!": TokenType.NOT, 134 "|": TokenType.PIPE, 135 "+": TokenType.PLUS, 136 ";": TokenType.SEMICOLON, 137 "/": TokenType.SLASH, 138 "\\": TokenType.BACKSLASH, 139 "*": TokenType.STAR, 140 "~": TokenType.TILDE, 141 "?": TokenType.PLACEHOLDER, 142 "@": TokenType.PARAMETER, 143 "#": TokenType.HASH, 144 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 145 "'": TokenType.UNKNOWN, 146 "`": TokenType.UNKNOWN, 147 '"': TokenType.UNKNOWN, 148 } 149 150 BIT_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = [] 151 BYTE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = [] 152 HEX_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = [] 153 RAW_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = [] 154 HEREDOC_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = [] 155 UNICODE_STRINGS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = [] 156 IDENTIFIERS: t.ClassVar[t.List[str | t.Tuple[str, str]]] = ['"'] 157 QUOTES: t.ClassVar[t.List[t.Tuple[str, str] | str]] = ["'"] 158 STRING_ESCAPES = ["'"] 159 BYTE_STRING_ESCAPES: t.ClassVar[t.List[str]] = [] 160 VAR_SINGLE_TOKENS: t.ClassVar[t.Set[str]] = set() 161 ESCAPE_FOLLOW_CHARS: t.ClassVar[t.List[str]] = [] 162 163 # The strings in this list can always be used as escapes, regardless of the surrounding 164 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 165 # identifier escape, e.g. 
if we use double-quotes, then they also act as escapes: "x""" 166 IDENTIFIER_ESCAPES: t.ClassVar[t.List[str]] = [] 167 168 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 169 HEREDOC_TAG_IS_IDENTIFIER = False 170 171 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 172 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 173 174 # Whether string escape characters function as such when placed within raw strings 175 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 176 177 NESTED_COMMENTS = True 178 179 HINT_START = "/*+" 180 181 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 182 183 # Autofilled 184 _COMMENTS: t.ClassVar[t.Dict[str, t.Optional[str]]] = {} 185 _FORMAT_STRINGS: t.ClassVar[t.Dict[str, t.Tuple[str, TokenType]]] = {} 186 _IDENTIFIERS: t.ClassVar[t.Dict[str, str]] = {} 187 _IDENTIFIER_ESCAPES: t.ClassVar[t.Set[str]] = set() 188 _QUOTES: t.ClassVar[t.Dict[str, str]] = {} 189 _STRING_ESCAPES: t.ClassVar[t.Set[str]] = set() 190 _BYTE_STRING_ESCAPES: t.ClassVar[t.Set[str]] = set() 191 _KEYWORD_TRIE: t.ClassVar[t.Dict] = {} 192 _ESCAPE_FOLLOW_CHARS: t.ClassVar[t.Set[str]] = set() 193 194 KEYWORDS: t.ClassVar[t.Dict[str, TokenType]] = { 195 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 196 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 197 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 198 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 199 HINT_START: TokenType.HINT, 200 "&<": TokenType.AMP_LT, 201 "&>": TokenType.AMP_GT, 202 "==": TokenType.EQ, 203 "::": TokenType.DCOLON, 204 "?::": TokenType.QDCOLON, 205 "||": TokenType.DPIPE, 206 "|>": TokenType.PIPE_GT, 207 ">=": TokenType.GTE, 208 "<=": TokenType.LTE, 209 "<>": TokenType.NEQ, 210 "!=": TokenType.NEQ, 211 ":=": TokenType.COLON_EQ, 212 "<=>": TokenType.NULLSAFE_EQ, 213 "->": TokenType.ARROW, 214 "->>": 
TokenType.DARROW, 215 "=>": TokenType.FARROW, 216 "#>": TokenType.HASH_ARROW, 217 "#>>": TokenType.DHASH_ARROW, 218 "<->": TokenType.LR_ARROW, 219 "&&": TokenType.DAMP, 220 "??": TokenType.DQMARK, 221 "~~~": TokenType.GLOB, 222 "~~": TokenType.LIKE, 223 "~~*": TokenType.ILIKE, 224 "~*": TokenType.IRLIKE, 225 "-|-": TokenType.ADJACENT, 226 "ALL": TokenType.ALL, 227 "AND": TokenType.AND, 228 "ANTI": TokenType.ANTI, 229 "ANY": TokenType.ANY, 230 "ASC": TokenType.ASC, 231 "AS": TokenType.ALIAS, 232 "ASOF": TokenType.ASOF, 233 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 234 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 235 "BEGIN": TokenType.BEGIN, 236 "BETWEEN": TokenType.BETWEEN, 237 "CACHE": TokenType.CACHE, 238 "UNCACHE": TokenType.UNCACHE, 239 "CASE": TokenType.CASE, 240 "CHARACTER SET": TokenType.CHARACTER_SET, 241 "CLUSTER BY": TokenType.CLUSTER_BY, 242 "COLLATE": TokenType.COLLATE, 243 "COLUMN": TokenType.COLUMN, 244 "COMMIT": TokenType.COMMIT, 245 "CONNECT BY": TokenType.CONNECT_BY, 246 "CONSTRAINT": TokenType.CONSTRAINT, 247 "COPY": TokenType.COPY, 248 "CREATE": TokenType.CREATE, 249 "CROSS": TokenType.CROSS, 250 "CUBE": TokenType.CUBE, 251 "CURRENT_DATE": TokenType.CURRENT_DATE, 252 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 253 "CURRENT_TIME": TokenType.CURRENT_TIME, 254 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 255 "CURRENT_USER": TokenType.CURRENT_USER, 256 "CURRENT_CATALOG": TokenType.CURRENT_CATALOG, 257 "DATABASE": TokenType.DATABASE, 258 "DEFAULT": TokenType.DEFAULT, 259 "DELETE": TokenType.DELETE, 260 "DESC": TokenType.DESC, 261 "DESCRIBE": TokenType.DESCRIBE, 262 "DISTINCT": TokenType.DISTINCT, 263 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 264 "DIV": TokenType.DIV, 265 "DROP": TokenType.DROP, 266 "ELSE": TokenType.ELSE, 267 "END": TokenType.END, 268 "ENUM": TokenType.ENUM, 269 "ESCAPE": TokenType.ESCAPE, 270 "EXCEPT": TokenType.EXCEPT, 271 "EXECUTE": TokenType.EXECUTE, 272 "EXISTS": TokenType.EXISTS, 273 "FALSE": TokenType.FALSE, 274 "FETCH": 
TokenType.FETCH, 275 "FILTER": TokenType.FILTER, 276 "FILE": TokenType.FILE, 277 "FIRST": TokenType.FIRST, 278 "FULL": TokenType.FULL, 279 "FUNCTION": TokenType.FUNCTION, 280 "FOR": TokenType.FOR, 281 "FOREIGN KEY": TokenType.FOREIGN_KEY, 282 "FORMAT": TokenType.FORMAT, 283 "FROM": TokenType.FROM, 284 "GEOGRAPHY": TokenType.GEOGRAPHY, 285 "GEOMETRY": TokenType.GEOMETRY, 286 "GLOB": TokenType.GLOB, 287 "GROUP BY": TokenType.GROUP_BY, 288 "GROUPING SETS": TokenType.GROUPING_SETS, 289 "HAVING": TokenType.HAVING, 290 "ILIKE": TokenType.ILIKE, 291 "IN": TokenType.IN, 292 "INDEX": TokenType.INDEX, 293 "INET": TokenType.INET, 294 "INNER": TokenType.INNER, 295 "INSERT": TokenType.INSERT, 296 "INTERVAL": TokenType.INTERVAL, 297 "INTERSECT": TokenType.INTERSECT, 298 "INTO": TokenType.INTO, 299 "IS": TokenType.IS, 300 "ISNULL": TokenType.ISNULL, 301 "JOIN": TokenType.JOIN, 302 "KEEP": TokenType.KEEP, 303 "KILL": TokenType.KILL, 304 "LATERAL": TokenType.LATERAL, 305 "LEFT": TokenType.LEFT, 306 "LIKE": TokenType.LIKE, 307 "LIMIT": TokenType.LIMIT, 308 "LOAD": TokenType.LOAD, 309 "LOCALTIME": TokenType.LOCALTIME, 310 "LOCALTIMESTAMP": TokenType.LOCALTIMESTAMP, 311 "LOCK": TokenType.LOCK, 312 "MERGE": TokenType.MERGE, 313 "NAMESPACE": TokenType.NAMESPACE, 314 "NATURAL": TokenType.NATURAL, 315 "NEXT": TokenType.NEXT, 316 "NOT": TokenType.NOT, 317 "NOTNULL": TokenType.NOTNULL, 318 "NULL": TokenType.NULL, 319 "OBJECT": TokenType.OBJECT, 320 "OFFSET": TokenType.OFFSET, 321 "ON": TokenType.ON, 322 "OR": TokenType.OR, 323 "XOR": TokenType.XOR, 324 "ORDER BY": TokenType.ORDER_BY, 325 "ORDINALITY": TokenType.ORDINALITY, 326 "OUT": TokenType.OUT, 327 "OUTER": TokenType.OUTER, 328 "OVER": TokenType.OVER, 329 "OVERLAPS": TokenType.OVERLAPS, 330 "OVERWRITE": TokenType.OVERWRITE, 331 "PARTITION": TokenType.PARTITION, 332 "PARTITION BY": TokenType.PARTITION_BY, 333 "PARTITIONED BY": TokenType.PARTITION_BY, 334 "PARTITIONED_BY": TokenType.PARTITION_BY, 335 "PERCENT": TokenType.PERCENT, 336 
"PIVOT": TokenType.PIVOT, 337 "PRAGMA": TokenType.PRAGMA, 338 "PRIMARY KEY": TokenType.PRIMARY_KEY, 339 "PROCEDURE": TokenType.PROCEDURE, 340 "OPERATOR": TokenType.OPERATOR, 341 "QUALIFY": TokenType.QUALIFY, 342 "RANGE": TokenType.RANGE, 343 "RECURSIVE": TokenType.RECURSIVE, 344 "REGEXP": TokenType.RLIKE, 345 "RENAME": TokenType.RENAME, 346 "REPLACE": TokenType.REPLACE, 347 "RETURNING": TokenType.RETURNING, 348 "REFERENCES": TokenType.REFERENCES, 349 "RIGHT": TokenType.RIGHT, 350 "RLIKE": TokenType.RLIKE, 351 "ROLLBACK": TokenType.ROLLBACK, 352 "ROLLUP": TokenType.ROLLUP, 353 "ROW": TokenType.ROW, 354 "ROWS": TokenType.ROWS, 355 "SCHEMA": TokenType.SCHEMA, 356 "SELECT": TokenType.SELECT, 357 "SEMI": TokenType.SEMI, 358 "SESSION": TokenType.SESSION, 359 "SESSION_USER": TokenType.SESSION_USER, 360 "SET": TokenType.SET, 361 "SETTINGS": TokenType.SETTINGS, 362 "SHOW": TokenType.SHOW, 363 "SIMILAR TO": TokenType.SIMILAR_TO, 364 "SOME": TokenType.SOME, 365 "SORT BY": TokenType.SORT_BY, 366 "START WITH": TokenType.START_WITH, 367 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 368 "TABLE": TokenType.TABLE, 369 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 370 "TEMP": TokenType.TEMPORARY, 371 "TEMPORARY": TokenType.TEMPORARY, 372 "THEN": TokenType.THEN, 373 "TRUE": TokenType.TRUE, 374 "TRUNCATE": TokenType.TRUNCATE, 375 "TRIGGER": TokenType.TRIGGER, 376 "UNION": TokenType.UNION, 377 "UNKNOWN": TokenType.UNKNOWN, 378 "UNNEST": TokenType.UNNEST, 379 "UNPIVOT": TokenType.UNPIVOT, 380 "UPDATE": TokenType.UPDATE, 381 "USE": TokenType.USE, 382 "USING": TokenType.USING, 383 "UUID": TokenType.UUID, 384 "VALUES": TokenType.VALUES, 385 "VIEW": TokenType.VIEW, 386 "VOLATILE": TokenType.VOLATILE, 387 "WHEN": TokenType.WHEN, 388 "WHERE": TokenType.WHERE, 389 "WINDOW": TokenType.WINDOW, 390 "WITH": TokenType.WITH, 391 "APPLY": TokenType.APPLY, 392 "ARRAY": TokenType.ARRAY, 393 "BIT": TokenType.BIT, 394 "BOOL": TokenType.BOOLEAN, 395 "BOOLEAN": TokenType.BOOLEAN, 396 "BYTE": TokenType.TINYINT, 397 
"MEDIUMINT": TokenType.MEDIUMINT, 398 "INT1": TokenType.TINYINT, 399 "TINYINT": TokenType.TINYINT, 400 "INT16": TokenType.SMALLINT, 401 "SHORT": TokenType.SMALLINT, 402 "SMALLINT": TokenType.SMALLINT, 403 "HUGEINT": TokenType.INT128, 404 "UHUGEINT": TokenType.UINT128, 405 "INT2": TokenType.SMALLINT, 406 "INTEGER": TokenType.INT, 407 "INT": TokenType.INT, 408 "INT4": TokenType.INT, 409 "INT32": TokenType.INT, 410 "INT64": TokenType.BIGINT, 411 "INT128": TokenType.INT128, 412 "INT256": TokenType.INT256, 413 "LONG": TokenType.BIGINT, 414 "BIGINT": TokenType.BIGINT, 415 "INT8": TokenType.TINYINT, 416 "UINT": TokenType.UINT, 417 "UINT128": TokenType.UINT128, 418 "UINT256": TokenType.UINT256, 419 "DEC": TokenType.DECIMAL, 420 "DECIMAL": TokenType.DECIMAL, 421 "DECIMAL32": TokenType.DECIMAL32, 422 "DECIMAL64": TokenType.DECIMAL64, 423 "DECIMAL128": TokenType.DECIMAL128, 424 "DECIMAL256": TokenType.DECIMAL256, 425 "DECFLOAT": TokenType.DECFLOAT, 426 "BIGDECIMAL": TokenType.BIGDECIMAL, 427 "BIGNUMERIC": TokenType.BIGDECIMAL, 428 "BIGNUM": TokenType.BIGNUM, 429 "LIST": TokenType.LIST, 430 "MAP": TokenType.MAP, 431 "NULLABLE": TokenType.NULLABLE, 432 "NUMBER": TokenType.DECIMAL, 433 "NUMERIC": TokenType.DECIMAL, 434 "FIXED": TokenType.DECIMAL, 435 "REAL": TokenType.FLOAT, 436 "FLOAT": TokenType.FLOAT, 437 "FLOAT4": TokenType.FLOAT, 438 "FLOAT8": TokenType.DOUBLE, 439 "DOUBLE": TokenType.DOUBLE, 440 "DOUBLE PRECISION": TokenType.DOUBLE, 441 "JSON": TokenType.JSON, 442 "JSONB": TokenType.JSONB, 443 "CHAR": TokenType.CHAR, 444 "CHARACTER": TokenType.CHAR, 445 "CHAR VARYING": TokenType.VARCHAR, 446 "CHARACTER VARYING": TokenType.VARCHAR, 447 "NCHAR": TokenType.NCHAR, 448 "VARCHAR": TokenType.VARCHAR, 449 "VARCHAR2": TokenType.VARCHAR, 450 "NVARCHAR": TokenType.NVARCHAR, 451 "NVARCHAR2": TokenType.NVARCHAR, 452 "BPCHAR": TokenType.BPCHAR, 453 "STR": TokenType.TEXT, 454 "STRING": TokenType.TEXT, 455 "TEXT": TokenType.TEXT, 456 "LONGTEXT": TokenType.LONGTEXT, 457 "MEDIUMTEXT": 
TokenType.MEDIUMTEXT, 458 "TINYTEXT": TokenType.TINYTEXT, 459 "CLOB": TokenType.TEXT, 460 "LONGVARCHAR": TokenType.TEXT, 461 "BINARY": TokenType.BINARY, 462 "BLOB": TokenType.VARBINARY, 463 "LONGBLOB": TokenType.LONGBLOB, 464 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 465 "TINYBLOB": TokenType.TINYBLOB, 466 "BYTEA": TokenType.VARBINARY, 467 "VARBINARY": TokenType.VARBINARY, 468 "TIME": TokenType.TIME, 469 "TIMETZ": TokenType.TIMETZ, 470 "TIME_NS": TokenType.TIME_NS, 471 "TIMESTAMP": TokenType.TIMESTAMP, 472 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 473 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 474 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 475 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 476 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 477 "DATE": TokenType.DATE, 478 "DATETIME": TokenType.DATETIME, 479 "INT4RANGE": TokenType.INT4RANGE, 480 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 481 "INT8RANGE": TokenType.INT8RANGE, 482 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 483 "NUMRANGE": TokenType.NUMRANGE, 484 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 485 "TSRANGE": TokenType.TSRANGE, 486 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 487 "TSTZRANGE": TokenType.TSTZRANGE, 488 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 489 "DATERANGE": TokenType.DATERANGE, 490 "DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 491 "UNIQUE": TokenType.UNIQUE, 492 "VECTOR": TokenType.VECTOR, 493 "STRUCT": TokenType.STRUCT, 494 "SEQUENCE": TokenType.SEQUENCE, 495 "VARIANT": TokenType.VARIANT, 496 "ALTER": TokenType.ALTER, 497 "ANALYZE": TokenType.ANALYZE, 498 "CALL": TokenType.COMMAND, 499 "COMMENT": TokenType.COMMENT, 500 "EXPLAIN": TokenType.COMMAND, 501 "GRANT": TokenType.GRANT, 502 "REVOKE": TokenType.REVOKE, 503 "OPTIMIZE": TokenType.COMMAND, 504 "PREPARE": TokenType.COMMAND, 505 "VACUUM": TokenType.COMMAND, 506 "USER-DEFINED": TokenType.USERDEFINED, 507 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 508 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 509 } 510 511 COMMANDS = { 512 TokenType.COMMAND, 513 TokenType.EXECUTE, 
514 TokenType.FETCH, 515 TokenType.SHOW, 516 TokenType.RENAME, 517 } 518 519 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 520 521 # Handle numeric literals like in hive (3L = BIGINT) 522 NUMERIC_LITERALS: t.ClassVar[t.Dict[str, str]] = {} 523 524 COMMENTS = ["--", ("/*", "*/")] 525 526 __slots__ = ( 527 "dialect", 528 "_core", 529 ) 530 531 def __init__( 532 self, 533 dialect: DialectType = None, 534 **opts: t.Any, 535 ) -> None: 536 from sqlglot.dialects import Dialect 537 from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore 538 539 self.dialect = Dialect.get_or_raise(dialect) 540 541 self._core = _TokenizerCore( 542 single_tokens=self.SINGLE_TOKENS, 543 keywords=self.KEYWORDS, 544 quotes=self._QUOTES, 545 format_strings=self._FORMAT_STRINGS, 546 identifiers=self._IDENTIFIERS, 547 comments=self._COMMENTS, 548 string_escapes=self._STRING_ESCAPES, 549 byte_string_escapes=self._BYTE_STRING_ESCAPES, 550 identifier_escapes=self._IDENTIFIER_ESCAPES, 551 escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 552 commands=self.COMMANDS, 553 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 554 nested_comments=self.NESTED_COMMENTS, 555 hint_start=self.HINT_START, 556 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 557 bit_strings=list(self.BIT_STRINGS), 558 hex_strings=list(self.HEX_STRINGS), 559 numeric_literals=self.NUMERIC_LITERALS, 560 var_single_tokens=self.VAR_SINGLE_TOKENS, 561 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 562 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 563 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 564 keyword_trie=self._KEYWORD_TRIE, 565 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 566 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 567 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 568 ) 569 570 def tokenize(self, sql: str) -> t.List[Token]: 571 """Returns a list of tokens 
corresponding to the SQL string `sql`.""" 572 return self._core.tokenize(sql) # type: ignore 573 574 @property 575 def sql(self) -> str: 576 """The SQL string being tokenized.""" 577 return self._core.sql 578 579 @property 580 def size(self) -> int: 581 """Length of the SQL string.""" 582 return self._core.size 583 584 @property 585 def tokens(self) -> t.List[Token]: 586 """The list of tokens produced by tokenization.""" 587 return self._core.tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, Type[sqlglot.dialects.Dialect], NoneType] = None, **opts: Any)
531 def __init__( 532 self, 533 dialect: DialectType = None, 534 **opts: t.Any, 535 ) -> None: 536 from sqlglot.dialects import Dialect 537 from sqlglot.tokenizer_core import TokenizerCore as _TokenizerCore 538 539 self.dialect = Dialect.get_or_raise(dialect) 540 541 self._core = _TokenizerCore( 542 single_tokens=self.SINGLE_TOKENS, 543 keywords=self.KEYWORDS, 544 quotes=self._QUOTES, 545 format_strings=self._FORMAT_STRINGS, 546 identifiers=self._IDENTIFIERS, 547 comments=self._COMMENTS, 548 string_escapes=self._STRING_ESCAPES, 549 byte_string_escapes=self._BYTE_STRING_ESCAPES, 550 identifier_escapes=self._IDENTIFIER_ESCAPES, 551 escape_follow_chars=self._ESCAPE_FOLLOW_CHARS, 552 commands=self.COMMANDS, 553 command_prefix_tokens=self.COMMAND_PREFIX_TOKENS, 554 nested_comments=self.NESTED_COMMENTS, 555 hint_start=self.HINT_START, 556 tokens_preceding_hint=self.TOKENS_PRECEDING_HINT, 557 bit_strings=list(self.BIT_STRINGS), 558 hex_strings=list(self.HEX_STRINGS), 559 numeric_literals=self.NUMERIC_LITERALS, 560 var_single_tokens=self.VAR_SINGLE_TOKENS, 561 string_escapes_allowed_in_raw_strings=self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 562 heredoc_tag_is_identifier=self.HEREDOC_TAG_IS_IDENTIFIER, 563 heredoc_string_alternative=self.HEREDOC_STRING_ALTERNATIVE, 564 keyword_trie=self._KEYWORD_TRIE, 565 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 566 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 567 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 568 )
SINGLE_TOKENS =
{'(': <TokenType.L_PAREN: 1>, ')': <TokenType.R_PAREN: 2>, '[': <TokenType.L_BRACKET: 3>, ']': <TokenType.R_BRACKET: 4>, '{': <TokenType.L_BRACE: 5>, '}': <TokenType.R_BRACE: 6>, '&': <TokenType.AMP: 35>, '^': <TokenType.CARET: 41>, ':': <TokenType.COLON: 11>, ',': <TokenType.COMMA: 7>, '.': <TokenType.DOT: 8>, '-': <TokenType.DASH: 9>, '=': <TokenType.EQ: 27>, '>': <TokenType.GT: 24>, '<': <TokenType.LT: 22>, '%': <TokenType.MOD: 323>, '!': <TokenType.NOT: 26>, '|': <TokenType.PIPE: 38>, '+': <TokenType.PLUS: 10>, ';': <TokenType.SEMICOLON: 18>, '/': <TokenType.SLASH: 21>, '\\': <TokenType.BACKSLASH: 20>, '*': <TokenType.STAR: 19>, '~': <TokenType.TILDE: 43>, '?': <TokenType.PLACEHOLDER: 349>, '@': <TokenType.PARAMETER: 55>, '#': <TokenType.HASH: 47>, "'": <TokenType.UNKNOWN: 210>, '`': <TokenType.UNKNOWN: 210>, '"': <TokenType.UNKNOWN: 210>}
TOKENS_PRECEDING_HINT =
{<TokenType.DELETE: 252>, <TokenType.UPDATE: 404>, <TokenType.INSERT: 295>, <TokenType.SELECT: 375>}
KEYWORDS: ClassVar[Dict[str, sqlglot.tokenizer_core.TokenType]] =
{'{%': <TokenType.BLOCK_START: 70>, '{%+': <TokenType.BLOCK_START: 70>, '{%-': <TokenType.BLOCK_START: 70>, '%}': <TokenType.BLOCK_END: 71>, '+%}': <TokenType.BLOCK_END: 71>, '-%}': <TokenType.BLOCK_END: 71>, '{{+': <TokenType.BLOCK_START: 70>, '{{-': <TokenType.BLOCK_START: 70>, '+}}': <TokenType.BLOCK_END: 71>, '-}}': <TokenType.BLOCK_END: 71>, '/*+': <TokenType.HINT: 288>, '&<': <TokenType.AMP_LT: 60>, '&>': <TokenType.AMP_GT: 61>, '==': <TokenType.EQ: 27>, '::': <TokenType.DCOLON: 13>, '?::': <TokenType.QDCOLON: 360>, '||': <TokenType.DPIPE: 36>, '|>': <TokenType.PIPE_GT: 37>, '>=': <TokenType.GTE: 25>, '<=': <TokenType.LTE: 23>, '<>': <TokenType.NEQ: 28>, '!=': <TokenType.NEQ: 28>, ':=': <TokenType.COLON_EQ: 30>, '<=>': <TokenType.NULLSAFE_EQ: 29>, '->': <TokenType.ARROW: 44>, '->>': <TokenType.DARROW: 45>, '=>': <TokenType.FARROW: 46>, '#>': <TokenType.HASH_ARROW: 48>, '#>>': <TokenType.DHASH_ARROW: 49>, '<->': <TokenType.LR_ARROW: 50>, '&&': <TokenType.DAMP: 59>, '??': <TokenType.DQMARK: 17>, '~~~': <TokenType.GLOB: 282>, '~~': <TokenType.LIKE: 312>, '~~*': <TokenType.ILIKE: 290>, '~*': <TokenType.IRLIKE: 301>, '-|-': <TokenType.ADJACENT: 62>, 'ALL': <TokenType.ALL: 216>, 'AND': <TokenType.AND: 33>, 'ANTI': <TokenType.ANTI: 217>, 'ANY': <TokenType.ANY: 218>, 'ASC': <TokenType.ASC: 221>, 'AS': <TokenType.ALIAS: 214>, 'ASOF': <TokenType.ASOF: 222>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 224>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 224>, 'BEGIN': <TokenType.BEGIN: 225>, 'BETWEEN': <TokenType.BETWEEN: 226>, 'CACHE': <TokenType.CACHE: 228>, 'UNCACHE': <TokenType.UNCACHE: 400>, 'CASE': <TokenType.CASE: 229>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 230>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 231>, 'COLLATE': <TokenType.COLLATE: 232>, 'COLUMN': <TokenType.COLUMN: 78>, 'COMMIT': <TokenType.COMMIT: 235>, 'CONNECT BY': <TokenType.CONNECT_BY: 236>, 'CONSTRAINT': <TokenType.CONSTRAINT: 237>, 'COPY': <TokenType.COPY: 238>, 'CREATE': <TokenType.CREATE: 239>, 
'CROSS': <TokenType.CROSS: 240>, 'CUBE': <TokenType.CUBE: 241>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 242>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 244>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 245>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 246>, 'CURRENT_USER': <TokenType.CURRENT_USER: 247>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 249>, 'DATABASE': <TokenType.DATABASE: 77>, 'DEFAULT': <TokenType.DEFAULT: 251>, 'DELETE': <TokenType.DELETE: 252>, 'DESC': <TokenType.DESC: 253>, 'DESCRIBE': <TokenType.DESCRIBE: 254>, 'DISTINCT': <TokenType.DISTINCT: 257>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 258>, 'DIV': <TokenType.DIV: 259>, 'DROP': <TokenType.DROP: 260>, 'ELSE': <TokenType.ELSE: 261>, 'END': <TokenType.END: 262>, 'ENUM': <TokenType.ENUM: 201>, 'ESCAPE': <TokenType.ESCAPE: 263>, 'EXCEPT': <TokenType.EXCEPT: 264>, 'EXECUTE': <TokenType.EXECUTE: 265>, 'EXISTS': <TokenType.EXISTS: 266>, 'FALSE': <TokenType.FALSE: 267>, 'FETCH': <TokenType.FETCH: 268>, 'FILTER': <TokenType.FILTER: 271>, 'FILE': <TokenType.FILE: 269>, 'FIRST': <TokenType.FIRST: 273>, 'FULL': <TokenType.FULL: 279>, 'FUNCTION': <TokenType.FUNCTION: 280>, 'FOR': <TokenType.FOR: 274>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 276>, 'FORMAT': <TokenType.FORMAT: 277>, 'FROM': <TokenType.FROM: 278>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 168>, 'GEOMETRY': <TokenType.GEOMETRY: 171>, 'GLOB': <TokenType.GLOB: 282>, 'GROUP BY': <TokenType.GROUP_BY: 285>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 286>, 'HAVING': <TokenType.HAVING: 287>, 'ILIKE': <TokenType.ILIKE: 290>, 'IN': <TokenType.IN: 291>, 'INDEX': <TokenType.INDEX: 292>, 'INET': <TokenType.INET: 196>, 'INNER': <TokenType.INNER: 294>, 'INSERT': <TokenType.INSERT: 295>, 'INTERVAL': <TokenType.INTERVAL: 298>, 'INTERSECT': <TokenType.INTERSECT: 297>, 'INTO': <TokenType.INTO: 299>, 'IS': <TokenType.IS: 302>, 'ISNULL': <TokenType.ISNULL: 303>, 'JOIN': <TokenType.JOIN: 304>, 'KEEP': <TokenType.KEEP: 306>, 'KILL': <TokenType.KILL: 308>, 
'LATERAL': <TokenType.LATERAL: 310>, 'LEFT': <TokenType.LEFT: 311>, 'LIKE': <TokenType.LIKE: 312>, 'LIMIT': <TokenType.LIMIT: 313>, 'LOAD': <TokenType.LOAD: 315>, 'LOCALTIME': <TokenType.LOCALTIME: 175>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 176>, 'LOCK': <TokenType.LOCK: 316>, 'MERGE': <TokenType.MERGE: 322>, 'NAMESPACE': <TokenType.NAMESPACE: 426>, 'NATURAL': <TokenType.NATURAL: 325>, 'NEXT': <TokenType.NEXT: 326>, 'NOT': <TokenType.NOT: 26>, 'NOTNULL': <TokenType.NOTNULL: 328>, 'NULL': <TokenType.NULL: 329>, 'OBJECT': <TokenType.OBJECT: 195>, 'OFFSET': <TokenType.OFFSET: 331>, 'ON': <TokenType.ON: 332>, 'OR': <TokenType.OR: 34>, 'XOR': <TokenType.XOR: 63>, 'ORDER BY': <TokenType.ORDER_BY: 335>, 'ORDINALITY': <TokenType.ORDINALITY: 338>, 'OUT': <TokenType.OUT: 339>, 'OUTER': <TokenType.OUTER: 341>, 'OVER': <TokenType.OVER: 342>, 'OVERLAPS': <TokenType.OVERLAPS: 343>, 'OVERWRITE': <TokenType.OVERWRITE: 344>, 'PARTITION': <TokenType.PARTITION: 345>, 'PARTITION BY': <TokenType.PARTITION_BY: 346>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 346>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 346>, 'PERCENT': <TokenType.PERCENT: 347>, 'PIVOT': <TokenType.PIVOT: 348>, 'PRAGMA': <TokenType.PRAGMA: 351>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 353>, 'PROCEDURE': <TokenType.PROCEDURE: 354>, 'OPERATOR': <TokenType.OPERATOR: 334>, 'QUALIFY': <TokenType.QUALIFY: 358>, 'RANGE': <TokenType.RANGE: 361>, 'RECURSIVE': <TokenType.RECURSIVE: 362>, 'REGEXP': <TokenType.RLIKE: 370>, 'RENAME': <TokenType.RENAME: 364>, 'REPLACE': <TokenType.REPLACE: 365>, 'RETURNING': <TokenType.RETURNING: 366>, 'REFERENCES': <TokenType.REFERENCES: 368>, 'RIGHT': <TokenType.RIGHT: 369>, 'RLIKE': <TokenType.RLIKE: 370>, 'ROLLBACK': <TokenType.ROLLBACK: 371>, 'ROLLUP': <TokenType.ROLLUP: 372>, 'ROW': <TokenType.ROW: 373>, 'ROWS': <TokenType.ROWS: 374>, 'SCHEMA': <TokenType.SCHEMA: 80>, 'SELECT': <TokenType.SELECT: 375>, 'SEMI': <TokenType.SEMI: 376>, 'SESSION': <TokenType.SESSION: 56>, 'SESSION_USER': 
<TokenType.SESSION_USER: 58>, 'SET': <TokenType.SET: 380>, 'SETTINGS': <TokenType.SETTINGS: 381>, 'SHOW': <TokenType.SHOW: 382>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 383>, 'SOME': <TokenType.SOME: 384>, 'SORT BY': <TokenType.SORT_BY: 385>, 'START WITH': <TokenType.START_WITH: 387>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 389>, 'TABLE': <TokenType.TABLE: 81>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 392>, 'TEMP': <TokenType.TEMPORARY: 394>, 'TEMPORARY': <TokenType.TEMPORARY: 394>, 'THEN': <TokenType.THEN: 396>, 'TRUE': <TokenType.TRUE: 397>, 'TRUNCATE': <TokenType.TRUNCATE: 398>, 'TRIGGER': <TokenType.TRIGGER: 399>, 'UNION': <TokenType.UNION: 401>, 'UNKNOWN': <TokenType.UNKNOWN: 210>, 'UNNEST': <TokenType.UNNEST: 402>, 'UNPIVOT': <TokenType.UNPIVOT: 403>, 'UPDATE': <TokenType.UPDATE: 404>, 'USE': <TokenType.USE: 405>, 'USING': <TokenType.USING: 406>, 'UUID': <TokenType.UUID: 167>, 'VALUES': <TokenType.VALUES: 407>, 'VIEW': <TokenType.VIEW: 409>, 'VOLATILE': <TokenType.VOLATILE: 411>, 'WHEN': <TokenType.WHEN: 412>, 'WHERE': <TokenType.WHERE: 413>, 'WINDOW': <TokenType.WINDOW: 414>, 'WITH': <TokenType.WITH: 415>, 'APPLY': <TokenType.APPLY: 219>, 'ARRAY': <TokenType.ARRAY: 220>, 'BIT': <TokenType.BIT: 93>, 'BOOL': <TokenType.BOOLEAN: 94>, 'BOOLEAN': <TokenType.BOOLEAN: 94>, 'BYTE': <TokenType.TINYINT: 95>, 'MEDIUMINT': <TokenType.MEDIUMINT: 99>, 'INT1': <TokenType.TINYINT: 95>, 'TINYINT': <TokenType.TINYINT: 95>, 'INT16': <TokenType.SMALLINT: 97>, 'SHORT': <TokenType.SMALLINT: 97>, 'SMALLINT': <TokenType.SMALLINT: 97>, 'HUGEINT': <TokenType.INT128: 106>, 'UHUGEINT': <TokenType.UINT128: 107>, 'INT2': <TokenType.SMALLINT: 97>, 'INTEGER': <TokenType.INT: 101>, 'INT': <TokenType.INT: 101>, 'INT4': <TokenType.INT: 101>, 'INT32': <TokenType.INT: 101>, 'INT64': <TokenType.BIGINT: 103>, 'INT128': <TokenType.INT128: 106>, 'INT256': <TokenType.INT256: 108>, 'LONG': <TokenType.BIGINT: 103>, 'BIGINT': <TokenType.BIGINT: 103>, 'INT8': <TokenType.TINYINT: 95>, 'UINT': 
<TokenType.UINT: 102>, 'UINT128': <TokenType.UINT128: 107>, 'UINT256': <TokenType.UINT256: 109>, 'DEC': <TokenType.DECIMAL: 113>, 'DECIMAL': <TokenType.DECIMAL: 113>, 'DECIMAL32': <TokenType.DECIMAL32: 114>, 'DECIMAL64': <TokenType.DECIMAL64: 115>, 'DECIMAL128': <TokenType.DECIMAL128: 116>, 'DECIMAL256': <TokenType.DECIMAL256: 117>, 'DECFLOAT': <TokenType.DECFLOAT: 118>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 120>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 120>, 'BIGNUM': <TokenType.BIGNUM: 105>, 'LIST': <TokenType.LIST: 314>, 'MAP': <TokenType.MAP: 317>, 'NULLABLE': <TokenType.NULLABLE: 170>, 'NUMBER': <TokenType.DECIMAL: 113>, 'NUMERIC': <TokenType.DECIMAL: 113>, 'FIXED': <TokenType.DECIMAL: 113>, 'REAL': <TokenType.FLOAT: 110>, 'FLOAT': <TokenType.FLOAT: 110>, 'FLOAT4': <TokenType.FLOAT: 110>, 'FLOAT8': <TokenType.DOUBLE: 111>, 'DOUBLE': <TokenType.DOUBLE: 111>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 111>, 'JSON': <TokenType.JSON: 137>, 'JSONB': <TokenType.JSONB: 138>, 'CHAR': <TokenType.CHAR: 121>, 'CHARACTER': <TokenType.CHAR: 121>, 'CHAR VARYING': <TokenType.VARCHAR: 123>, 'CHARACTER VARYING': <TokenType.VARCHAR: 123>, 'NCHAR': <TokenType.NCHAR: 122>, 'VARCHAR': <TokenType.VARCHAR: 123>, 'VARCHAR2': <TokenType.VARCHAR: 123>, 'NVARCHAR': <TokenType.NVARCHAR: 124>, 'NVARCHAR2': <TokenType.NVARCHAR: 124>, 'BPCHAR': <TokenType.BPCHAR: 125>, 'STR': <TokenType.TEXT: 126>, 'STRING': <TokenType.TEXT: 126>, 'TEXT': <TokenType.TEXT: 126>, 'LONGTEXT': <TokenType.LONGTEXT: 128>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 127>, 'TINYTEXT': <TokenType.TINYTEXT: 133>, 'CLOB': <TokenType.TEXT: 126>, 'LONGVARCHAR': <TokenType.TEXT: 126>, 'BINARY': <TokenType.BINARY: 135>, 'BLOB': <TokenType.VARBINARY: 136>, 'LONGBLOB': <TokenType.LONGBLOB: 131>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 130>, 'TINYBLOB': <TokenType.TINYBLOB: 132>, 'BYTEA': <TokenType.VARBINARY: 136>, 'VARBINARY': <TokenType.VARBINARY: 136>, 'TIME': <TokenType.TIME: 139>, 'TIMETZ': <TokenType.TIMETZ: 140>, 'TIME_NS': 
<TokenType.TIME_NS: 141>, 'TIMESTAMP': <TokenType.TIMESTAMP: 142>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 143>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 144>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 144>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 145>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 145>, 'DATE': <TokenType.DATE: 153>, 'DATETIME': <TokenType.DATETIME: 149>, 'INT4RANGE': <TokenType.INT4RANGE: 155>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 156>, 'INT8RANGE': <TokenType.INT8RANGE: 157>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 158>, 'NUMRANGE': <TokenType.NUMRANGE: 159>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 160>, 'TSRANGE': <TokenType.TSRANGE: 161>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 162>, 'TSTZRANGE': <TokenType.TSTZRANGE: 163>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 164>, 'DATERANGE': <TokenType.DATERANGE: 165>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 166>, 'UNIQUE': <TokenType.UNIQUE: 416>, 'VECTOR': <TokenType.VECTOR: 211>, 'STRUCT': <TokenType.STRUCT: 390>, 'SEQUENCE': <TokenType.SEQUENCE: 378>, 'VARIANT': <TokenType.VARIANT: 194>, 'ALTER': <TokenType.ALTER: 215>, 'ANALYZE': <TokenType.ANALYZE: 425>, 'CALL': <TokenType.COMMAND: 233>, 'COMMENT': <TokenType.COMMENT: 234>, 'EXPLAIN': <TokenType.COMMAND: 233>, 'GRANT': <TokenType.GRANT: 284>, 'REVOKE': <TokenType.REVOKE: 367>, 'OPTIMIZE': <TokenType.COMMAND: 233>, 'PREPARE': <TokenType.COMMAND: 233>, 'VACUUM': <TokenType.COMMAND: 233>, 'USER-DEFINED': <TokenType.USERDEFINED: 189>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 420>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 421>}
COMMANDS =
{<TokenType.COMMAND: 233>, <TokenType.EXECUTE: 265>, <TokenType.RENAME: 364>, <TokenType.FETCH: 268>, <TokenType.SHOW: 382>}
570 def tokenize(self, sql: str) -> t.List[Token]: 571 """Returns a list of tokens corresponding to the SQL string `sql`.""" 572 return self._core.tokenize(sql) # type: ignore
Returns a list of tokens corresponding to the SQL string sql.
sql: str
574 @property 575 def sql(self) -> str: 576 """The SQL string being tokenized.""" 577 return self._core.sql
The SQL string being tokenized.
size: int
579 @property 580 def size(self) -> int: 581 """Length of the SQL string.""" 582 return self._core.size
Length of the SQL string.
tokens: List[sqlglot.tokenizer_core.Token]
584 @property 585 def tokens(self) -> t.List[Token]: 586 """The list of tokens produced by tokenization.""" 587 return self._core.tokens
The list of tokens produced by tokenization.