sqlglot.dialects.athena
"""Athena dialect for SQLGlot: routes queries to a Hive- or Trino-flavoured pipeline."""
from __future__ import annotations

from sqlglot import tokens
from sqlglot.dialects.dialect import Dialect, DialectType
from sqlglot.generators.athena import AthenaGenerator
from sqlglot.parsers.athena import AthenaParser
from sqlglot.tokens import TokenType, Token
from sqlglot.dialects.trino import Trino
from sqlglot.dialects.hive import Hive


class Athena(Dialect):
    """
    Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific
    modifications and then built the Athena service around them.

    Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an
    execution engine depending on the query type.

    As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3",
    the following engines exist:

    Hive:
     - Accepts mostly the same syntax as Hadoop / Hive
     - Uses backticks to quote identifiers
     - Has a distinctive DDL syntax (around things like setting table properties, storage locations etc)
       that is different from Trino
     - Used for *most* DDL, with some exceptions that get routed to the Trino engine instead:
        - CREATE [EXTERNAL] TABLE (without AS SELECT)
        - ALTER
        - DROP

    Trino:
     - Uses double quotes to quote identifiers
     - Used for DDL operations that involve SELECT queries, eg:
        - CREATE VIEW / DROP VIEW
        - CREATE TABLE... AS SELECT
     - Used for DML operations
        - SELECT, INSERT, UPDATE, DELETE, MERGE

    The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the
    tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain
    incompatibilities between the engines' dialects and thus can't be handled by a single, unifying dialect.

    References:
    - https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
    - https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
    """

    # This Tokenizer consumes a combination of HiveQL and Trino SQL and then processes the tokens
    # to disambiguate which dialect needs to be actually used in order to tokenize correctly.
    class Tokenizer(tokens.Tokenizer):
        # Accept both dialects' quoting/escaping conventions in the first, classifying pass.
        IDENTIFIERS = Trino.Tokenizer.IDENTIFIERS + Hive.Tokenizer.IDENTIFIERS
        STRING_ESCAPES = Trino.Tokenizer.STRING_ESCAPES + Hive.Tokenizer.STRING_ESCAPES
        HEX_STRINGS = Trino.Tokenizer.HEX_STRINGS + Hive.Tokenizer.HEX_STRINGS
        UNICODE_STRINGS = Trino.Tokenizer.UNICODE_STRINGS + Hive.Tokenizer.UNICODE_STRINGS

        NUMERIC_LITERALS = {
            **Trino.Tokenizer.NUMERIC_LITERALS,
            **Hive.Tokenizer.NUMERIC_LITERALS,
        }

        # Trino's keywords win on conflicts (the later spread overrides).
        # UNLOAD is Athena-specific and treated as an opaque command.
        KEYWORDS = {
            **Hive.Tokenizer.KEYWORDS,
            **Trino.Tokenizer.KEYWORDS,
            "UNLOAD": TokenType.COMMAND,
        }

        def __init__(self, dialect: DialectType = None) -> None:
            super().__init__(dialect=dialect)

            # Engine-specific tokenizers used to re-tokenize the raw SQL once the
            # target engine has been determined from the combined first pass.
            self._hive_tokenizer = Hive().tokenizer()
            self._trino_tokenizer = _TrinoTokenizer(Trino())

        def tokenize(self, sql: str) -> list[Token]:
            """Tokenize `sql` with the engine (Hive or Trino) Athena would route it to."""
            # First pass with the combined keyword tables, used only for classification.
            tokens = super().tokenize(sql)

            if _tokenize_as_hive(tokens):
                # Prepend a sentinel token so downstream consumers can tell this stream
                # came from the Hive tokenizer — presumably read by AthenaParser; the
                # consumer is not visible in this file, so confirm against it.
                return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)

            return self._trino_tokenizer.tokenize(sql)

    Parser = AthenaParser

    Generator = AthenaGenerator


def _tokenize_as_hive(tokens: list[Token]) -> bool:
    """Decide from a (combined-dialect) token stream whether Athena would use the Hive engine.

    Heuristic over the first two tokens:
    - DESCRIBE / SHOW / MSCK REPAIR            -> Hive
    - ALTER / CREATE / DROP DATABASE|SCHEMA    -> Hive
    - CREATE EXTERNAL ...                      -> Hive
    - ... VIEW                                 -> Trino
    - other ALTER/CREATE/DROP: Hive, unless a SELECT appears later
      (e.g. CREATE TABLE ... AS SELECT)        -> Trino
    - everything else (DML etc.)               -> Trino
    """
    if len(tokens) < 2:
        return False

    first, second, *rest = tokens

    first_type = first.token_type
    first_text = first.text.upper()
    second_type = second.token_type
    second_text = second.text.upper()

    if first_type in (TokenType.DESCRIBE, TokenType.SHOW) or first_text == "MSCK REPAIR":
        return True

    if first_type in (TokenType.ALTER, TokenType.CREATE, TokenType.DROP):
        if second_text in ("DATABASE", "EXTERNAL", "SCHEMA"):
            return True
        if second_type == TokenType.VIEW:
            return False

        # No SELECT anywhere after the first two tokens -> plain DDL, handled by Hive.
        return all(t.token_type != TokenType.SELECT for t in rest)

    return False


# Athena extensions to Trino's tokenizer
class _TrinoTokenizer(Trino.Tokenizer):
    KEYWORDS = {
        **Trino.Tokenizer.KEYWORDS,
        "UNLOAD": TokenType.COMMAND,
    }
class Athena(Dialect):
    """
    Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific
    modifications and then built the Athena service around them.

    Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an
    execution engine depending on the query type.

    As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3",
    the following engines exist:

    Hive:
     - Accepts mostly the same syntax as Hadoop / Hive
     - Uses backticks to quote identifiers
     - Has a distinctive DDL syntax (around things like setting table properties, storage locations etc)
       that is different from Trino
     - Used for *most* DDL, with some exceptions that get routed to the Trino engine instead:
        - CREATE [EXTERNAL] TABLE (without AS SELECT)
        - ALTER
        - DROP

    Trino:
     - Uses double quotes to quote identifiers
     - Used for DDL operations that involve SELECT queries, eg:
        - CREATE VIEW / DROP VIEW
        - CREATE TABLE... AS SELECT
     - Used for DML operations
        - SELECT, INSERT, UPDATE, DELETE, MERGE

    The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the
    tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain
    incompatibilities between the engines' dialects and thus can't be handled by a single, unifying dialect.

    References:
    - https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
    - https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
    """

    # This Tokenizer consumes a combination of HiveQL and Trino SQL and then processes the tokens
    # to disambiguate which dialect needs to be actually used in order to tokenize correctly.
    class Tokenizer(tokens.Tokenizer):
        # Accept both dialects' quoting/escaping conventions in the first, classifying pass.
        IDENTIFIERS = Trino.Tokenizer.IDENTIFIERS + Hive.Tokenizer.IDENTIFIERS
        STRING_ESCAPES = Trino.Tokenizer.STRING_ESCAPES + Hive.Tokenizer.STRING_ESCAPES
        HEX_STRINGS = Trino.Tokenizer.HEX_STRINGS + Hive.Tokenizer.HEX_STRINGS
        UNICODE_STRINGS = Trino.Tokenizer.UNICODE_STRINGS + Hive.Tokenizer.UNICODE_STRINGS

        NUMERIC_LITERALS = {
            **Trino.Tokenizer.NUMERIC_LITERALS,
            **Hive.Tokenizer.NUMERIC_LITERALS,
        }

        # Trino's keywords win on conflicts (the later spread overrides).
        # UNLOAD is Athena-specific and treated as an opaque command.
        KEYWORDS = {
            **Hive.Tokenizer.KEYWORDS,
            **Trino.Tokenizer.KEYWORDS,
            "UNLOAD": TokenType.COMMAND,
        }

        def __init__(self, dialect: DialectType = None) -> None:
            super().__init__(dialect=dialect)

            # Engine-specific tokenizers used to re-tokenize the raw SQL once the
            # target engine has been determined from the combined first pass.
            self._hive_tokenizer = Hive().tokenizer()
            self._trino_tokenizer = _TrinoTokenizer(Trino())

        def tokenize(self, sql: str) -> list[Token]:
            """Tokenize `sql` with the engine (Hive or Trino) Athena would route it to."""
            # First pass with the combined keyword tables, used only for classification.
            tokens = super().tokenize(sql)

            if _tokenize_as_hive(tokens):
                # Sentinel token marks the stream as coming from the Hive tokenizer —
                # its consumer is not visible here; verify against the parser.
                return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)

            return self._trino_tokenizer.tokenize(sql)

    Parser = AthenaParser

    Generator = AthenaGenerator
Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific modifications and then built the Athena service around them.
Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an execution engine depending on the query type.
As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3", the following engines exist:
Hive:
- Accepts mostly the same syntax as Hadoop / Hive
- Uses backticks to quote identifiers
- Has a distinctive DDL syntax (around things like setting table properties, storage locations etc) that is different from Trino
- Used for most DDL, with some exceptions that get routed to the Trino engine instead:
- CREATE [EXTERNAL] TABLE (without AS SELECT)
- ALTER
- DROP
Trino:
- Uses double quotes to quote identifiers
- Used for DDL operations that involve SELECT queries, eg:
- CREATE VIEW / DROP VIEW
- CREATE TABLE... AS SELECT
- Used for DML operations
- SELECT, INSERT, UPDATE, DELETE, MERGE
The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain incompatibilities between the engines' dialects and thus can't be handled by a single, unifying dialect.
References:
- https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
- https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
Mapping of an escaped sequence (e.g. `\n`) to its unescaped version (a literal newline character).
Whether string literals support escape sequences (e.g. \n). Set by the metaclass based on the tokenizer's STRING_ESCAPES.
Whether byte string literals support escape sequences. Set by the metaclass based on the tokenizer's BYTE_STRING_ESCAPES.
class Tokenizer(tokens.Tokenizer):
    """Combined Hive+Trino tokenizer that classifies a query, then re-tokenizes it
    with the engine-specific tokenizer Athena would route the query to."""

    # Accept both dialects' quoting/escaping conventions in the first, classifying pass.
    IDENTIFIERS = Trino.Tokenizer.IDENTIFIERS + Hive.Tokenizer.IDENTIFIERS
    STRING_ESCAPES = Trino.Tokenizer.STRING_ESCAPES + Hive.Tokenizer.STRING_ESCAPES
    HEX_STRINGS = Trino.Tokenizer.HEX_STRINGS + Hive.Tokenizer.HEX_STRINGS
    UNICODE_STRINGS = Trino.Tokenizer.UNICODE_STRINGS + Hive.Tokenizer.UNICODE_STRINGS

    NUMERIC_LITERALS = {
        **Trino.Tokenizer.NUMERIC_LITERALS,
        **Hive.Tokenizer.NUMERIC_LITERALS,
    }

    # Trino's keywords win on conflicts (the later spread overrides).
    # UNLOAD is Athena-specific and treated as an opaque command.
    KEYWORDS = {
        **Hive.Tokenizer.KEYWORDS,
        **Trino.Tokenizer.KEYWORDS,
        "UNLOAD": TokenType.COMMAND,
    }

    def __init__(self, dialect: DialectType = None) -> None:
        super().__init__(dialect=dialect)

        # Engine-specific tokenizers used to re-tokenize the raw SQL once the
        # target engine has been determined from the combined first pass.
        self._hive_tokenizer = Hive().tokenizer()
        self._trino_tokenizer = _TrinoTokenizer(Trino())

    def tokenize(self, sql: str) -> list[Token]:
        """Tokenize `sql` with the engine (Hive or Trino) Athena would route it to."""
        # First pass with the combined keyword tables, used only for classification.
        tokens = super().tokenize(sql)

        if _tokenize_as_hive(tokens):
            # Sentinel token marks the stream as coming from the Hive tokenizer —
            # its consumer is not visible here; verify against the parser.
            return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)

        return self._trino_tokenizer.tokenize(sql)
def tokenize(self, sql: str) -> list[Token]:
    """Tokenize `sql`, delegating to the Hive or Trino tokenizer based on a routing heuristic."""
    # First pass with the combined Hive+Trino keyword tables, used only for classification.
    tokens = super().tokenize(sql)

    if _tokenize_as_hive(tokens):
        # Sentinel token marks the stream as coming from the Hive tokenizer —
        # its consumer is not visible here; verify against the parser.
        return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)

    return self._trino_tokenizer.tokenize(sql)
Returns the list of tokens produced by tokenizing the SQL string `sql`.
Inherited Members
- sqlglot.tokens.Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- BYTE_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- QUOTES
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMBERS_CAN_HAVE_DECIMALS
- COMMENTS
- dialect
- sql
- size
- tokens