
sqlglot.dialects.athena

from __future__ import annotations


from sqlglot import tokens
from sqlglot.dialects.dialect import Dialect, DialectType
from sqlglot.generators.athena import AthenaGenerator
from sqlglot.parsers.athena import AthenaParser
from sqlglot.tokens import TokenType, Token
from sqlglot.dialects.trino import Trino
from sqlglot.dialects.hive import Hive


class Athena(Dialect):
    """
    Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific
    modifications, and then built the Athena service around them.

    Thus, Athena is not simply hosted Trino; it's more like a router that routes SQL queries to an
    execution engine depending on the query type.

    As of 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3",
    the following engines exist:

    Hive:
     - Accepts mostly the same syntax as Hadoop / Hive
     - Uses backticks to quote identifiers
     - Has a distinctive DDL syntax (around things like setting table properties, storage locations, etc.)
       that is different from Trino
     - Used for *most* DDL (the exceptions are routed to the Trino engine; see below), e.g.:
        - CREATE [EXTERNAL] TABLE (without AS SELECT)
        - ALTER
        - DROP

    Trino:
      - Uses double quotes to quote identifiers
      - Used for DDL operations that involve SELECT queries, e.g.:
        - CREATE VIEW / DROP VIEW
        - CREATE TABLE... AS SELECT
      - Used for DML operations:
        - SELECT, INSERT, UPDATE, DELETE, MERGE

    The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the
    tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain
    incompatibilities between the engines' dialects that can't be handled by a single, unifying dialect.

    References:
    - https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
    - https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
    """

    # This Tokenizer consumes a combination of HiveQL and Trino SQL and then inspects the tokens
    # to disambiguate which dialect actually needs to be used in order to tokenize correctly.
    class Tokenizer(tokens.Tokenizer):
        IDENTIFIERS = Trino.Tokenizer.IDENTIFIERS + Hive.Tokenizer.IDENTIFIERS
        STRING_ESCAPES = Trino.Tokenizer.STRING_ESCAPES + Hive.Tokenizer.STRING_ESCAPES
        HEX_STRINGS = Trino.Tokenizer.HEX_STRINGS + Hive.Tokenizer.HEX_STRINGS
        UNICODE_STRINGS = Trino.Tokenizer.UNICODE_STRINGS + Hive.Tokenizer.UNICODE_STRINGS

        NUMERIC_LITERALS = {
            **Trino.Tokenizer.NUMERIC_LITERALS,
            **Hive.Tokenizer.NUMERIC_LITERALS,
        }

        KEYWORDS = {
            **Hive.Tokenizer.KEYWORDS,
            **Trino.Tokenizer.KEYWORDS,
            "UNLOAD": TokenType.COMMAND,
        }

        def __init__(self, dialect: DialectType = None) -> None:
            super().__init__(dialect=dialect)

            # Engine-specific tokenizers used for the second, dialect-aware pass
            self._hive_tokenizer = Hive().tokenizer()
            self._trino_tokenizer = _TrinoTokenizer(Trino())

        def tokenize(self, sql: str) -> list[Token]:
            tokens = super().tokenize(sql)

            if _tokenize_as_hive(tokens):
                # Prefix a marker token so downstream consumers know this is a Hive token stream
                return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)

            return self._trino_tokenizer.tokenize(sql)

    Parser = AthenaParser

    Generator = AthenaGenerator


def _tokenize_as_hive(tokens: list[Token]) -> bool:
    if len(tokens) < 2:
        return False

    first, second, *rest = tokens

    first_type = first.token_type
    first_text = first.text.upper()
    second_type = second.token_type
    second_text = second.text.upper()

    if first_type in (TokenType.DESCRIBE, TokenType.SHOW) or first_text == "MSCK REPAIR":
        return True

    if first_type in (TokenType.ALTER, TokenType.CREATE, TokenType.DROP):
        if second_text in ("DATABASE", "EXTERNAL", "SCHEMA"):
            return True
        if second_type == TokenType.VIEW:
            return False

        return all(t.token_type != TokenType.SELECT for t in rest)

    return False


# Athena extensions to Trino's tokenizer
class _TrinoTokenizer(Trino.Tokenizer):
    KEYWORDS = {
        **Trino.Tokenizer.KEYWORDS,
        "UNLOAD": TokenType.COMMAND,
    }
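
To make the routing heuristic concrete, here is a minimal sketch (assuming sqlglot is installed; the class and token names come from the source above, though exact routing behavior can vary by version):

from sqlglot.dialects.athena import Athena
from sqlglot.tokens import TokenType

tokenizer = Athena.Tokenizer()

def routed_to_hive(sql: str) -> bool:
    # Hive-routed statements get a HIVE_TOKEN_STREAM marker token prepended by tokenize()
    return tokenizer.tokenize(sql)[0].token_type == TokenType.HIVE_TOKEN_STREAM

routed_to_hive("CREATE EXTERNAL TABLE t (a INT)")      # True: EXTERNAL after CREATE -> Hive DDL
routed_to_hive("MSCK REPAIR TABLE t")                  # True: Hive maintenance command
routed_to_hive("CREATE VIEW v AS SELECT 1")            # False: views are handled by Trino
routed_to_hive("CREATE TABLE t AS SELECT * FROM src")  # False: CTAS involves SELECT -> Trino
routed_to_hive("SELECT 1")                             # False: DML always goes to Trino
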
class Athena(sqlglot.dialects.dialect.Dialect):

Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific modifications, and then built the Athena service around them.

Thus, Athena is not simply hosted Trino; it's more like a router that routes SQL queries to an execution engine depending on the query type.

As of 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3", the following engines exist:

Hive:
  • Accepts mostly the same syntax as Hadoop / Hive
  • Uses backticks to quote identifiers
  • Has a distinctive DDL syntax (around things like setting table properties, storage locations, etc.) that is different from Trino
  • Used for most DDL (the exceptions are routed to the Trino engine; see below), e.g.:
    • CREATE [EXTERNAL] TABLE (without AS SELECT)
    • ALTER
    • DROP

Trino:
  • Uses double quotes to quote identifiers
  • Used for DDL operations that involve SELECT queries, e.g.:
    • CREATE VIEW / DROP VIEW
    • CREATE TABLE... AS SELECT
  • Used for DML operations:
    • SELECT, INSERT, UPDATE, DELETE, MERGE

The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain incompatibilities between the engines' dialects that can't be handled by a single, unifying dialect.

References:
  • https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
  • https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html

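As a quick usage sketch (hedged: exact output depends on your sqlglot version, and identify=True is just a standard generator option used here for illustration):

import sqlglot

# DML is executed by the Trino engine, so identifiers come out double-quoted
sqlglot.transpile("SELECT a FROM t", write="athena", identify=True)
# ['SELECT "a" FROM "t"']

# Hive-flavored DDL parses under the same single dialect name
expr = sqlglot.parse_one("CREATE EXTERNAL TABLE t (a INT)", read="athena")
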
SUPPORTS_COLUMN_JOIN_MARKS = False

Whether the old-style outer join (+) syntax is supported.

UNESCAPED_SEQUENCES: dict[str, str] = {'\\a': '\x07', '\\b': '\x08', '\\f': '\x0c', '\\n': '\n', '\\r': '\r', '\\t': '\t', '\\v': '\x0b', '\\\\': '\\'}

Mapping of an escaped sequence (e.g. \n) to its unescaped version (e.g. a literal newline character).

STRINGS_SUPPORT_ESCAPED_SEQUENCES: bool = True

Whether string literals support escape sequences (e.g. \n). Set by the metaclass based on the tokenizer's STRING_ESCAPES.

BYTE_STRINGS_SUPPORT_ESCAPED_SEQUENCES: bool = True

Whether byte string literals support escape sequences. Set by the metaclass based on the tokenizer's BYTE_STRING_ESCAPES.

INITCAP_SUPPORTS_CUSTOM_DELIMITERS = False
tokenizer_class = <class 'Athena.Tokenizer'>
jsonpath_tokenizer_class = <class 'sqlglot.dialects.dialect.JSONPathTokenizer'>
parser_class = <class 'sqlglot.parsers.athena.AthenaParser'>
generator_class = <class 'sqlglot.generators.athena.AthenaGenerator'>
TIME_TRIE: dict = {}
FORMAT_TRIE: dict = {}
INVERSE_TIME_MAPPING: dict[str, str] = {}
INVERSE_TIME_TRIE: dict = {}
INVERSE_FORMAT_MAPPING: dict[str, str] = {}
INVERSE_FORMAT_TRIE: dict = {}
INVERSE_CREATABLE_KIND_MAPPING: dict[str, str] = {}
ESCAPED_SEQUENCES: dict[str, str] = {'\x07': '\\a', '\x08': '\\b', '\x0c': '\\f', '\n': '\\n', '\r': '\\r', '\t': '\\t', '\x0b': '\\v', '\\': '\\\\'}
QUOTE_START = "'"
QUOTE_END = "'"
IDENTIFIER_START = '"'
IDENTIFIER_END = '"'
VALID_INTERVAL_UNITS: set[str] = {'DAYOFYEAR', 'H', 'MICROSEC', 'NSECONDS', 'S', 'MONTHS', 'DEC', 'EPOCH_SECOND', 'WEEKOFYEAR', 'CENTS', 'DW', 'QUARTERS', 'WEEKDAY_ISO', 'DECADE', 'NANOSEC', 'CENTURY', 'EPOCH', 'MSECS', 'M', 'DAYOFWEEK', 'WEEKDAY', 'CENTURIES', 'MILS', 'MI', 'DAYOFWEEKISO', 'MONTH', 'HOURS', 'DOW_ISO', 'WEEK_ISO', 'SEC', 'NSECOND', 'DAY', 'HH', 'MSEC', 'EPOCH_MICROSECONDS', 'CENT', 'DAYOFWEEK_ISO', 'YYY', 'YEAR', 'USECOND', 'C', 'EPOCH_SECONDS', 'YR', 'MIN', 'NSEC', 'WY', 'USECS', 'WEEKOFYEAR_ISO', 'DAY OF YEAR', 'YEARS', 'NANOSECOND', 'QTR', 'DOW', 'WEEKISO', 'EPOCH_MICROSECOND', 'DY', 'MILLISEC', 'MM', 'W', 'WEEK', 'SECONDS', 'NANOSECS', 'MILLISECON', 'SECOND', 'USECONDS', 'D', 'US', 'EPOCH_NANOSECONDS', 'Y', 'NS', 'EPOCH_NANOSECOND', 'YYYY', 'DW_ISO', 'DAYOFMONTH', 'QUARTER', 'MON', 'Q', 'MIL', 'MONS', 'WOY', 'WEEKOFYEARISO', 'MINUTE', 'YY', 'USEC', 'YRS', 'MILLISECONDS', 'MILLENNIUM', 'TIMEZONE_MINUTE', 'HR', 'HOUR', 'MICROSECONDS', 'EPOCH_MILLISECONDS', 'MILLISECOND', 'MILLISECS', 'MINS', 'MSECOND', 'WK', 'MSECONDS', 'DD', 'TIMEZONE_HOUR', 'TZM', 'MICROSECOND', 'MINUTES', 'DECS', 'QTRS', 'MS', 'TZH', 'MICROSECS', 'EPOCH_MILLISECOND', 'DECADES', 'HRS', 'DAY OF WEEK', 'SECS', 'DAYS', 'DOY', 'MILLENIA'}
BIT_START: str | None = None
BIT_END: str | None = None
HEX_START: str | None = "x'"
HEX_END: str | None = "'"
BYTE_START: str | None = None
BYTE_END: str | None = None
UNICODE_START: str | None = "U&'"
UNICODE_END: str | None = "'"
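
A small sketch of inspecting these dialect settings at runtime (values as documented above):

from sqlglot.dialects.athena import Athena

Athena.UNESCAPED_SEQUENCES["\\n"]            # '\n': escaped sequence -> literal newline
Athena.STRINGS_SUPPORT_ESCAPED_SEQUENCES     # True, derived from the tokenizer's STRING_ESCAPES
Athena.tokenizer_class is Athena.Tokenizer   # True
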
class Athena.Tokenizer(sqlglot.tokens.Tokenizer):
Consumes a combination of HiveQL and Trino SQL, then inspects the resulting tokens to decide which engine's tokenizer actually needs to be used to tokenize correctly.

Athena.Tokenizer(dialect: Union[str, sqlglot.dialects.Dialect, type[sqlglot.dialects.Dialect], NoneType] = None)
Creates the underlying Hive and Trino tokenizers used for the second, engine-specific tokenization pass.
IDENTIFIERS = ['"', '`']
STRING_ESCAPES = ["'", '\\']
HEX_STRINGS = [("x'", "'"), ("X'", "'")]
UNICODE_STRINGS = [("U&'", "'"), ("u&'", "'")]
NUMERIC_LITERALS = {'L': 'BIGINT', 'S': 'SMALLINT', 'Y': 'TINYINT', 'D': 'DOUBLE', 'F': 'FLOAT', 'BD': 'DECIMAL'}
KEYWORDS = {'{%': <TokenType.BLOCK_START: 71>, '{%+': <TokenType.BLOCK_START: 71>, '{%-': <TokenType.BLOCK_START: 71>, '%}': <TokenType.BLOCK_END: 72>, '+%}': <TokenType.BLOCK_END: 72>, '-%}': <TokenType.BLOCK_END: 72>, '{{+': <TokenType.BLOCK_START: 71>, '{{-': <TokenType.BLOCK_START: 71>, '+}}': <TokenType.BLOCK_END: 72>, '-}}': <TokenType.BLOCK_END: 72>, '/*+': <TokenType.HINT: 291>, '&<': <TokenType.AMP_LT: 61>, '&>': <TokenType.AMP_GT: 62>, '==': <TokenType.EQ: 28>, '::': <TokenType.DCOLON: 14>, '?::': <TokenType.QDCOLON: 367>, '||': <TokenType.DPIPE: 37>, '|>': <TokenType.PIPE_GT: 38>, '>=': <TokenType.GTE: 26>, '<=': <TokenType.LTE: 24>, '<>': <TokenType.NEQ: 29>, '!=': <TokenType.NEQ: 29>, ':=': <TokenType.COLON_EQ: 31>, '<=>': <TokenType.NULLSAFE_EQ: 30>, '->': <TokenType.ARROW: 45>, '->>': <TokenType.DARROW: 46>, '=>': <TokenType.FARROW: 47>, '#>': <TokenType.HASH_ARROW: 49>, '#>>': <TokenType.DHASH_ARROW: 50>, '<->': <TokenType.LR_ARROW: 51>, '&&': <TokenType.DAMP: 60>, '??': <TokenType.DQMARK: 18>, '~~~': <TokenType.GLOB: 285>, '~~': <TokenType.LIKE: 316>, '~~*': <TokenType.ILIKE: 293>, '~*': <TokenType.IRLIKE: 305>, '-|-': <TokenType.ADJACENT: 63>, 'ALL': <TokenType.ALL: 218>, 'AND': <TokenType.AND: 34>, 'ANTI': <TokenType.ANTI: 219>, 'ANY': <TokenType.ANY: 220>, 'ASC': <TokenType.ASC: 223>, 'AS': <TokenType.ALIAS: 216>, 'ASOF': <TokenType.ASOF: 224>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 226>, 'BEGIN': <TokenType.BEGIN: 227>, 'BETWEEN': <TokenType.BETWEEN: 228>, 'CACHE': <TokenType.CACHE: 230>, 'UNCACHE': <TokenType.UNCACHE: 411>, 'CASE': <TokenType.CASE: 231>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 232>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 233>, 'COLLATE': <TokenType.COLLATE: 234>, 'COLUMN': <TokenType.COLUMN: 79>, 'COMMIT': <TokenType.COMMIT: 237>, 'CONNECT BY': <TokenType.CONNECT_BY: 238>, 'CONSTRAINT': <TokenType.CONSTRAINT: 239>, 'COPY': <TokenType.COPY: 240>, 'CREATE': <TokenType.CREATE: 241>, 'CROSS': <TokenType.CROSS: 242>, 'CUBE': <TokenType.CUBE: 243>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 244>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 246>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 247>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 248>, 'CURRENT_USER': <TokenType.CURRENT_USER: 249>, 'CURRENT_CATALOG': <TokenType.CURRENT_CATALOG: 252>, 'DATABASE': <TokenType.DATABASE: 78>, 'DEFAULT': <TokenType.DEFAULT: 254>, 'DELETE': <TokenType.DELETE: 255>, 'DESC': <TokenType.DESC: 256>, 'DESCRIBE': <TokenType.DESCRIBE: 257>, 'DISTINCT': <TokenType.DISTINCT: 260>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 261>, 'DIV': <TokenType.DIV: 262>, 'DROP': <TokenType.DROP: 263>, 'ELSE': <TokenType.ELSE: 264>, 'END': <TokenType.END: 265>, 'ENUM': <TokenType.ENUM: 203>, 'ESCAPE': <TokenType.ESCAPE: 266>, 'EXCEPT': <TokenType.EXCEPT: 267>, 'EXECUTE': <TokenType.EXECUTE: 268>, 'EXISTS': <TokenType.EXISTS: 269>, 'FALSE': <TokenType.FALSE: 270>, 'FETCH': <TokenType.FETCH: 271>, 'FILTER': <TokenType.FILTER: 274>, 'FILE': <TokenType.FILE: 272>, 'FIRST': <TokenType.FIRST: 276>, 'FULL': <TokenType.FULL: 282>, 'FUNCTION': <TokenType.FUNCTION: 283>, 'FOR': <TokenType.FOR: 277>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 279>, 'FORMAT': <TokenType.FORMAT: 280>, 'FROM': <TokenType.FROM: 281>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 170>, 'GEOMETRY': <TokenType.GEOMETRY: 173>, 'GLOB': <TokenType.GLOB: 285>, 'GROUP BY': <TokenType.GROUP_BY: 288>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 289>, 'HAVING': <TokenType.HAVING: 290>, 'ILIKE': 
<TokenType.ILIKE: 293>, 'IN': <TokenType.IN: 294>, 'INDEX': <TokenType.INDEX: 295>, 'INET': <TokenType.INET: 198>, 'INNER': <TokenType.INNER: 297>, 'INSERT': <TokenType.INSERT: 298>, 'INTERVAL': <TokenType.INTERVAL: 302>, 'INTERSECT': <TokenType.INTERSECT: 301>, 'INTO': <TokenType.INTO: 303>, 'IS': <TokenType.IS: 306>, 'ISNULL': <TokenType.ISNULL: 307>, 'JOIN': <TokenType.JOIN: 308>, 'KEEP': <TokenType.KEEP: 310>, 'KILL': <TokenType.KILL: 312>, 'LATERAL': <TokenType.LATERAL: 314>, 'LEFT': <TokenType.LEFT: 315>, 'LIKE': <TokenType.LIKE: 316>, 'LIMIT': <TokenType.LIMIT: 317>, 'LOAD': <TokenType.LOAD: 319>, 'LOCALTIME': <TokenType.LOCALTIME: 177>, 'LOCALTIMESTAMP': <TokenType.LOCALTIMESTAMP: 178>, 'LOCK': <TokenType.LOCK: 320>, 'MERGE': <TokenType.MERGE: 326>, 'NAMESPACE': <TokenType.NAMESPACE: 438>, 'NATURAL': <TokenType.NATURAL: 329>, 'NEXT': <TokenType.NEXT: 330>, 'NOT': <TokenType.NOT: 27>, 'NOTNULL': <TokenType.NOTNULL: 332>, 'NULL': <TokenType.NULL: 333>, 'OBJECT': <TokenType.OBJECT: 197>, 'OFFSET': <TokenType.OFFSET: 335>, 'ON': <TokenType.ON: 336>, 'OR': <TokenType.OR: 35>, 'XOR': <TokenType.XOR: 64>, 'ORDER BY': <TokenType.ORDER_BY: 339>, 'ORDINALITY': <TokenType.ORDINALITY: 342>, 'OUT': <TokenType.OUT: 343>, 'OUTER': <TokenType.OUTER: 345>, 'OVER': <TokenType.OVER: 346>, 'OVERLAPS': <TokenType.OVERLAPS: 347>, 'OVERWRITE': <TokenType.OVERWRITE: 348>, 'PARTITION': <TokenType.PARTITION: 350>, 'PARTITION BY': <TokenType.PARTITION_BY: 351>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 351>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 351>, 'PERCENT': <TokenType.PERCENT: 352>, 'PIVOT': <TokenType.PIVOT: 353>, 'PRAGMA': <TokenType.PRAGMA: 358>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 360>, 'PROCEDURE': <TokenType.PROCEDURE: 361>, 'OPERATOR': <TokenType.OPERATOR: 338>, 'QUALIFY': <TokenType.QUALIFY: 365>, 'RANGE': <TokenType.RANGE: 368>, 'RECURSIVE': <TokenType.RECURSIVE: 369>, 'REGEXP': <TokenType.RLIKE: 377>, 'RENAME': <TokenType.RENAME: 371>, 'REPLACE': <TokenType.REPLACE: 372>, 'RETURNING': <TokenType.RETURNING: 373>, 'REFERENCES': <TokenType.REFERENCES: 375>, 'RIGHT': <TokenType.RIGHT: 376>, 'RLIKE': <TokenType.RLIKE: 377>, 'ROLLBACK': <TokenType.ROLLBACK: 379>, 'ROLLUP': <TokenType.ROLLUP: 380>, 'ROW': <TokenType.STRUCT: 400>, 'ROWS': <TokenType.ROWS: 382>, 'SCHEMA': <TokenType.SCHEMA: 81>, 'SELECT': <TokenType.SELECT: 384>, 'SEMI': <TokenType.SEMI: 385>, 'SESSION': <TokenType.SESSION: 57>, 'SESSION_USER': <TokenType.SESSION_USER: 59>, 'SET': <TokenType.SET: 389>, 'SETTINGS': <TokenType.SETTINGS: 390>, 'SHOW': <TokenType.SHOW: 391>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 392>, 'SOME': <TokenType.SOME: 393>, 'SORT BY': <TokenType.SORT_BY: 394>, 'SQL SECURITY': <TokenType.SQL_SECURITY: 396>, 'START WITH': <TokenType.START_WITH: 397>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 399>, 'TABLE': <TokenType.TABLE: 82>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 402>, 'TEMP': <TokenType.TEMPORARY: 404>, 'TEMPORARY': <TokenType.TEMPORARY: 404>, 'THEN': <TokenType.THEN: 406>, 'TRUE': <TokenType.TRUE: 407>, 'TRUNCATE': <TokenType.TRUNCATE: 408>, 'TRIGGER': <TokenType.TRIGGER: 409>, 'UNION': <TokenType.UNION: 412>, 'UNKNOWN': <TokenType.UNKNOWN: 212>, 'UNNEST': <TokenType.UNNEST: 413>, 'UNPIVOT': <TokenType.UNPIVOT: 414>, 'UPDATE': <TokenType.UPDATE: 415>, 'USE': <TokenType.USE: 416>, 'USING': <TokenType.USING: 417>, 'UUID': <TokenType.UUID: 169>, 'VALUES': <TokenType.VALUES: 418>, 'VIEW': <TokenType.VIEW: 420>, 'VOLATILE': <TokenType.VOLATILE: 422>, 'WHEN': <TokenType.WHEN: 424>, 'WHERE': 
<TokenType.WHERE: 425>, 'WINDOW': <TokenType.WINDOW: 426>, 'WITH': <TokenType.WITH: 427>, 'APPLY': <TokenType.APPLY: 221>, 'ARRAY': <TokenType.ARRAY: 222>, 'BIT': <TokenType.BIT: 95>, 'BOOL': <TokenType.BOOLEAN: 96>, 'BOOLEAN': <TokenType.BOOLEAN: 96>, 'BYTE': <TokenType.TINYINT: 97>, 'MEDIUMINT': <TokenType.MEDIUMINT: 101>, 'INT1': <TokenType.TINYINT: 97>, 'TINYINT': <TokenType.TINYINT: 97>, 'INT16': <TokenType.SMALLINT: 99>, 'SHORT': <TokenType.SMALLINT: 99>, 'SMALLINT': <TokenType.SMALLINT: 99>, 'HUGEINT': <TokenType.INT128: 108>, 'UHUGEINT': <TokenType.UINT128: 109>, 'INT2': <TokenType.SMALLINT: 99>, 'INTEGER': <TokenType.INT: 103>, 'INT': <TokenType.INT: 103>, 'INT4': <TokenType.INT: 103>, 'INT32': <TokenType.INT: 103>, 'INT64': <TokenType.BIGINT: 105>, 'INT128': <TokenType.INT128: 108>, 'INT256': <TokenType.INT256: 110>, 'LONG': <TokenType.BIGINT: 105>, 'BIGINT': <TokenType.BIGINT: 105>, 'INT8': <TokenType.TINYINT: 97>, 'UINT': <TokenType.UINT: 104>, 'UINT128': <TokenType.UINT128: 109>, 'UINT256': <TokenType.UINT256: 111>, 'DEC': <TokenType.DECIMAL: 115>, 'DECIMAL': <TokenType.DECIMAL: 115>, 'DECIMAL32': <TokenType.DECIMAL32: 116>, 'DECIMAL64': <TokenType.DECIMAL64: 117>, 'DECIMAL128': <TokenType.DECIMAL128: 118>, 'DECIMAL256': <TokenType.DECIMAL256: 119>, 'DECFLOAT': <TokenType.DECFLOAT: 120>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 122>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 122>, 'BIGNUM': <TokenType.BIGNUM: 107>, 'LIST': <TokenType.LIST: 318>, 'MAP': <TokenType.MAP: 321>, 'NULLABLE': <TokenType.NULLABLE: 172>, 'NUMBER': <TokenType.DECIMAL: 115>, 'NUMERIC': <TokenType.DECIMAL: 115>, 'FIXED': <TokenType.DECIMAL: 115>, 'REAL': <TokenType.FLOAT: 112>, 'FLOAT': <TokenType.FLOAT: 112>, 'FLOAT4': <TokenType.FLOAT: 112>, 'FLOAT8': <TokenType.DOUBLE: 113>, 'DOUBLE': <TokenType.DOUBLE: 113>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 113>, 'JSON': <TokenType.JSON: 139>, 'JSONB': <TokenType.JSONB: 140>, 'CHAR': <TokenType.CHAR: 123>, 'CHARACTER': <TokenType.CHAR: 123>, 'CHAR VARYING': <TokenType.VARCHAR: 125>, 'CHARACTER VARYING': <TokenType.VARCHAR: 125>, 'NCHAR': <TokenType.NCHAR: 124>, 'VARCHAR': <TokenType.VARCHAR: 125>, 'VARCHAR2': <TokenType.VARCHAR: 125>, 'NVARCHAR': <TokenType.NVARCHAR: 126>, 'NVARCHAR2': <TokenType.NVARCHAR: 126>, 'BPCHAR': <TokenType.BPCHAR: 127>, 'STR': <TokenType.TEXT: 128>, 'STRING': <TokenType.TEXT: 128>, 'TEXT': <TokenType.TEXT: 128>, 'LONGTEXT': <TokenType.LONGTEXT: 130>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 129>, 'TINYTEXT': <TokenType.TINYTEXT: 135>, 'CLOB': <TokenType.TEXT: 128>, 'LONGVARCHAR': <TokenType.TEXT: 128>, 'BINARY': <TokenType.BINARY: 137>, 'BLOB': <TokenType.VARBINARY: 138>, 'LONGBLOB': <TokenType.LONGBLOB: 133>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 132>, 'TINYBLOB': <TokenType.TINYBLOB: 134>, 'BYTEA': <TokenType.VARBINARY: 138>, 'VARBINARY': <TokenType.VARBINARY: 138>, 'TIME': <TokenType.TIME: 141>, 'TIMETZ': <TokenType.TIMETZ: 142>, 'TIME_NS': <TokenType.TIME_NS: 143>, 'TIMESTAMP': <TokenType.TIMESTAMP: 144>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 145>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 146>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 147>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 147>, 'DATE': <TokenType.DATE: 155>, 'DATETIME': <TokenType.DATETIME: 151>, 'INT4RANGE': <TokenType.INT4RANGE: 157>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 158>, 'INT8RANGE': <TokenType.INT8RANGE: 159>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 160>, 'NUMRANGE': <TokenType.NUMRANGE: 161>, 'NUMMULTIRANGE': 
<TokenType.NUMMULTIRANGE: 162>, 'TSRANGE': <TokenType.TSRANGE: 163>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 164>, 'TSTZRANGE': <TokenType.TSTZRANGE: 165>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 166>, 'DATERANGE': <TokenType.DATERANGE: 167>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 168>, 'UNIQUE': <TokenType.UNIQUE: 428>, 'VECTOR': <TokenType.VECTOR: 213>, 'STRUCT': <TokenType.STRUCT: 400>, 'SEQUENCE': <TokenType.SEQUENCE: 387>, 'VARIANT': <TokenType.VARIANT: 196>, 'ALTER': <TokenType.ALTER: 217>, 'ANALYZE': <TokenType.ANALYZE: 437>, 'CALL': <TokenType.COMMAND: 235>, 'COMMENT': <TokenType.COMMENT: 236>, 'EXPLAIN': <TokenType.COMMAND: 235>, 'GRANT': <TokenType.GRANT: 287>, 'REVOKE': <TokenType.REVOKE: 374>, 'OPTIMIZE': <TokenType.COMMAND: 235>, 'PREPARE': <TokenType.COMMAND: 235>, 'VACUUM': <TokenType.COMMAND: 235>, 'USER-DEFINED': <TokenType.USERDEFINED: 191>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 432>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 433>, 'ADD ARCHIVE': <TokenType.COMMAND: 235>, 'ADD ARCHIVES': <TokenType.COMMAND: 235>, 'ADD FILE': <TokenType.COMMAND: 235>, 'ADD FILES': <TokenType.COMMAND: 235>, 'ADD JAR': <TokenType.COMMAND: 235>, 'ADD JARS': <TokenType.COMMAND: 235>, 'MINUS': <TokenType.EXCEPT: 267>, 'MSCK REPAIR': <TokenType.COMMAND: 235>, 'REFRESH': <TokenType.REFRESH: 370>, 'TIMESTAMP AS OF': <TokenType.TIMESTAMP_SNAPSHOT: 433>, 'VERSION AS OF': <TokenType.VERSION_SNAPSHOT: 432>, 'SERDEPROPERTIES': <TokenType.SERDE_PROPERTIES: 388>, 'DEALLOCATE PREPARE': <TokenType.COMMAND: 235>, 'DESCRIBE INPUT': <TokenType.COMMAND: 235>, 'DESCRIBE OUTPUT': <TokenType.COMMAND: 235>, 'RESET SESSION': <TokenType.COMMAND: 235>, 'START': <TokenType.BEGIN: 227>, 'MATCH_RECOGNIZE': <TokenType.MATCH_RECOGNIZE: 324>, 'IPADDRESS': <TokenType.IPADDRESS: 199>, 'IPPREFIX': <TokenType.IPPREFIX: 200>, 'TDIGEST': <TokenType.TDIGEST: 211>, 'HYPERLOGLOG': <TokenType.HLLSKETCH: 183>, 'UNLOAD': <TokenType.COMMAND: 235>}
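
Because the settings above merge Trino's and Hive's tokenizer configuration, both quoting styles and both sets of numeric suffixes survive the first tokenization pass. A quick sketch (values as listed above):

from sqlglot.dialects.athena import Athena

Athena.Tokenizer.IDENTIFIERS               # ['"', '`']: Trino double quotes + Hive backticks
Athena.Tokenizer.NUMERIC_LITERALS["BD"]    # 'DECIMAL': Hive-style numeric literal suffix
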
def tokenize(self, sql: str) -> list[sqlglot.tokenizer_core.Token]:

Returns a list of tokens corresponding to the SQL string sql.
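
For example (a sketch; the HIVE_TOKEN_STREAM marker comes from the source above):

from sqlglot.dialects.athena import Athena
from sqlglot.tokens import TokenType

Athena.Tokenizer().tokenize("DESCRIBE t")[0].token_type  # TokenType.HIVE_TOKEN_STREAM (marker prepended)
Athena.Tokenizer().tokenize("SELECT 1")[0].token_type    # TokenType.SELECT (plain Trino tokens)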

BYTE_STRING_ESCAPES: ClassVar[list[str]] = ["'", '\\']