sqlglot.dialects.athena
from __future__ import annotations

import typing as t

from sqlglot import exp, generator, parser, tokens
from sqlglot.dialects import Dialect, Hive, Trino
from sqlglot.tokens import TokenType, Token


class Athena(Dialect):
    """
    Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific
    modifications and then built the Athena service around them.

    Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an
    execution engine depending on the query type.

    As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3",
    the following engines exist:

    Hive:
     - Accepts mostly the same syntax as Hadoop / Hive
     - Uses backticks to quote identifiers
     - Has a distinctive DDL syntax (around things like setting table properties, storage locations etc)
       that is different from Trino
     - Used for *most* DDL, with some exceptions that get routed to the Trino engine instead:
        - CREATE [EXTERNAL] TABLE (without AS SELECT)
        - ALTER
        - DROP

    Trino:
      - Uses double quotes to quote identifiers
      - Used for DDL operations that involve SELECT queries, eg:
        - CREATE VIEW / DROP VIEW
        - CREATE TABLE... AS SELECT
      - Used for DML operations
        - SELECT, INSERT, UPDATE, DELETE, MERGE

    The SQLGlot Athena dialect tries to identify which engine a query would be routed to and then uses the
    tokenizer / parser / generator for that engine. This is unfortunately necessary, as there are certain
    incompatibilities between the engines' dialects and thus can't be handled by a single, unifying dialect.

    References:
    - https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
    - https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self._hive = Hive(**kwargs)
        self._trino = Trino(**kwargs)

    def tokenize(self, sql: str, **opts) -> t.List[Token]:
        opts["hive"] = self._hive
        opts["trino"] = self._trino
        return super().tokenize(sql, **opts)

    def parse(self, sql: str, **opts) -> t.List[t.Optional[exp.Expression]]:
        opts["hive"] = self._hive
        opts["trino"] = self._trino
        return super().parse(sql, **opts)

    def parse_into(
        self, expression_type: exp.IntoType, sql: str, **opts
    ) -> t.List[t.Optional[exp.Expression]]:
        opts["hive"] = self._hive
        opts["trino"] = self._trino
        return super().parse_into(expression_type, sql, **opts)

    def generate(self, expression: exp.Expression, copy: bool = True, **opts) -> str:
        opts["hive"] = self._hive
        opts["trino"] = self._trino
        return super().generate(expression, copy=copy, **opts)

    # This Tokenizer consumes a combination of HiveQL and Trino SQL and then processes the tokens
    # to disambiguate which dialect needs to be actually used in order to tokenize correctly.
    class Tokenizer(tokens.Tokenizer):
        IDENTIFIERS = Trino.Tokenizer.IDENTIFIERS + Hive.Tokenizer.IDENTIFIERS
        STRING_ESCAPES = Trino.Tokenizer.STRING_ESCAPES + Hive.Tokenizer.STRING_ESCAPES
        HEX_STRINGS = Trino.Tokenizer.HEX_STRINGS + Hive.Tokenizer.HEX_STRINGS
        UNICODE_STRINGS = Trino.Tokenizer.UNICODE_STRINGS + Hive.Tokenizer.UNICODE_STRINGS

        NUMERIC_LITERALS = {
            **Trino.Tokenizer.NUMERIC_LITERALS,
            **Hive.Tokenizer.NUMERIC_LITERALS,
        }

        KEYWORDS = {
            **Hive.Tokenizer.KEYWORDS,
            **Trino.Tokenizer.KEYWORDS,
            "UNLOAD": TokenType.COMMAND,
        }

        def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
            hive = kwargs.pop("hive", None) or Hive()
            trino = kwargs.pop("trino", None) or Trino()

            super().__init__(*args, **kwargs)

            self._hive_tokenizer = hive.tokenizer(*args, **{**kwargs, "dialect": hive})
            self._trino_tokenizer = _TrinoTokenizer(*args, **{**kwargs, "dialect": trino})

        def tokenize(self, sql: str) -> t.List[Token]:
            tokens = super().tokenize(sql)

            if _tokenize_as_hive(tokens):
                return [Token(TokenType.HIVE_TOKEN_STREAM, "")] + self._hive_tokenizer.tokenize(sql)

            return self._trino_tokenizer.tokenize(sql)

    class Parser(parser.Parser):
        def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
            hive = kwargs.pop("hive", None) or Hive()
            trino = kwargs.pop("trino", None) or Trino()

            super().__init__(*args, **kwargs)

            self._hive_parser = hive.parser(*args, **{**kwargs, "dialect": hive})
            self._trino_parser = _TrinoParser(*args, **{**kwargs, "dialect": trino})

        def parse(
            self, raw_tokens: t.List[Token], sql: t.Optional[str] = None
        ) -> t.List[t.Optional[exp.Expression]]:
            if raw_tokens and raw_tokens[0].token_type == TokenType.HIVE_TOKEN_STREAM:
                return self._hive_parser.parse(raw_tokens[1:], sql)

            return self._trino_parser.parse(raw_tokens, sql)

        def parse_into(
            self,
            expression_types: exp.IntoType,
            raw_tokens: t.List[Token],
            sql: t.Optional[str] = None,
        ) -> t.List[t.Optional[exp.Expression]]:
            if raw_tokens and raw_tokens[0].token_type == TokenType.HIVE_TOKEN_STREAM:
                return self._hive_parser.parse_into(expression_types, raw_tokens[1:], sql)

            return self._trino_parser.parse_into(expression_types, raw_tokens, sql)

    class Generator(generator.Generator):
        def __init__(self, *args: t.Any, **kwargs: t.Any) -> None:
            hive = kwargs.pop("hive", None) or Hive()
            trino = kwargs.pop("trino", None) or Trino()

            super().__init__(*args, **kwargs)

            self._hive_generator = _HiveGenerator(*args, **{**kwargs, "dialect": hive})
            self._trino_generator = _TrinoGenerator(*args, **{**kwargs, "dialect": trino})

        def generate(self, expression: exp.Expression, copy: bool = True) -> str:
            if _generate_as_hive(expression):
                generator = self._hive_generator
            else:
                generator = self._trino_generator

            return generator.generate(expression, copy=copy)


def _tokenize_as_hive(tokens: t.List[Token]) -> bool:
    if len(tokens) < 2:
        return False

    first, second, *rest = tokens

    first_type = first.token_type
    first_text = first.text.upper()
    second_type = second.token_type
    second_text = second.text.upper()

    if first_type in (TokenType.DESCRIBE, TokenType.SHOW) or first_text == "MSCK REPAIR":
        return True

    if first_type in (TokenType.ALTER, TokenType.CREATE, TokenType.DROP):
        if second_text in ("DATABASE", "EXTERNAL", "SCHEMA"):
            return True
        if second_type == TokenType.VIEW:
            return False

        return all(t.token_type != TokenType.SELECT for t in rest)

    return False


def _generate_as_hive(expression: exp.Expression) -> bool:
    if isinstance(expression, exp.Create):
        if expression.kind == "TABLE":
            properties = expression.args.get("properties")

            # CREATE EXTERNAL TABLE is Hive
            if properties and properties.find(exp.ExternalProperty):
                return True

            # Any CREATE TABLE other than CREATE TABLE ... AS <query> is Hive
            if not isinstance(expression.expression, exp.Query):
                return True
        else:
            # CREATE VIEW is Trino, but CREATE SCHEMA, CREATE DATABASE, etc, is Hive
            return expression.kind != "VIEW"
    elif isinstance(expression, (exp.Alter, exp.Drop, exp.Describe, exp.Show)):
        if isinstance(expression, exp.Drop) and expression.kind == "VIEW":
            # DROP VIEW is Trino, because CREATE VIEW is as well
            return False

        # Everything else, e.g., ALTER statements, is Hive
        return True

    return False


def _is_iceberg_table(properties: exp.Properties) -> bool:
    for p in properties.expressions:
        if isinstance(p, exp.Property) and p.name == "table_type":
            return p.text("value").lower() == "iceberg"

    return False


def _location_property_sql(self: Athena.Generator, e: exp.LocationProperty):
    # If table_type='iceberg', the LocationProperty is called 'location'
    # Otherwise, it's called 'external_location'
    # ref: https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html

    prop_name = "external_location"

    if isinstance(e.parent, exp.Properties):
        if _is_iceberg_table(e.parent):
            prop_name = "location"

    return f"{prop_name}={self.sql(e, 'this')}"


def _partitioned_by_property_sql(self: Athena.Generator, e: exp.PartitionedByProperty) -> str:
    # If table_type='iceberg' then the table property for partitioning is called 'partitioning'
    # If table_type='hive' it's called 'partitioned_by'
    # ref: https://docs.aws.amazon.com/athena/latest/ug/create-table-as.html#ctas-table-properties

    prop_name = "partitioned_by"

    if isinstance(e.parent, exp.Properties):
        if _is_iceberg_table(e.parent):
            prop_name = "partitioning"

    return f"{prop_name}={self.sql(e, 'this')}"


# Athena extensions to Hive's generator
class _HiveGenerator(Hive.Generator):
    def alter_sql(self, expression: exp.Alter) -> str:
        # Package any ALTER TABLE ADD actions into a Schema object, so it gets generated as
        # `ALTER TABLE .. ADD COLUMNS(...)`, instead of `ALTER TABLE ... ADD COLUMN`, which
        # is invalid syntax on Athena
        if isinstance(expression, exp.Alter) and expression.kind == "TABLE":
            if expression.actions and isinstance(expression.actions[0], exp.ColumnDef):
                new_actions = exp.Schema(expressions=expression.actions)
                expression.set("actions", [new_actions])

        return super().alter_sql(expression)


# Athena extensions to Trino's tokenizer
class _TrinoTokenizer(Trino.Tokenizer):
    KEYWORDS = {
        **Trino.Tokenizer.KEYWORDS,
        "UNLOAD": TokenType.COMMAND,
    }


# Athena extensions to Trino's parser
class _TrinoParser(Trino.Parser):
    STATEMENT_PARSERS = {
        **Trino.Parser.STATEMENT_PARSERS,
        TokenType.USING: lambda self: self._parse_as_command(self._prev),
    }


# Athena extensions to Trino's generator
class _TrinoGenerator(Trino.Generator):
    PROPERTIES_LOCATION = {
        **Trino.Generator.PROPERTIES_LOCATION,
        exp.LocationProperty: exp.Properties.Location.POST_WITH,
    }

    TRANSFORMS = {
        **Trino.Generator.TRANSFORMS,
        exp.PartitionedByProperty: _partitioned_by_property_sql,
        exp.LocationProperty: _location_property_sql,
    }
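A minimal sketch (not part of the module) of how the _generate_as_hive helper above classifies parsed statements; the table name foo is a placeholder and the private import is for illustration only:

from sqlglot import parse_one
from sqlglot.dialects.athena import _generate_as_hive

# Plain CREATE TABLE (no AS SELECT) is expected to be flagged as Hive DDL,
# while CREATE TABLE ... AS SELECT should fall through to the Trino generator.
plain_ddl = parse_one("CREATE TABLE foo (id INT)", read="athena")
ctas = parse_one("CREATE TABLE foo AS SELECT 1 AS id", read="athena")

print(_generate_as_hive(plain_ddl))  # expected: True
print(_generate_as_hive(ctas))       # expected: False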
Over the years, it looks like AWS has taken various execution engines, bolted on AWS-specific modifications and then built the Athena service around them.
Thus, Athena is not simply hosted Trino, it's more like a router that routes SQL queries to an execution engine depending on the query type.
As at 2024-09-10, assuming your Athena workgroup is configured to use "Athena engine version 3", the following engines exist:
Hive:
- Accepts mostly the same syntax as Hadoop / Hive
- Uses backticks to quote identifiers
- Has a distinctive DDL syntax (around things like setting table properties, storage locations etc) that is different from Trino
- Used for most DDL, with some exceptions that get routed to the Trino engine instead:
  - CREATE [EXTERNAL] TABLE (without AS SELECT)
  - ALTER
  - DROP

Trino:
- Uses double quotes to quote identifiers
- Used for DDL operations that involve SELECT queries, e.g.:
  - CREATE VIEW / DROP VIEW
  - CREATE TABLE ... AS SELECT
- Used for DML operations
  - SELECT, INSERT, UPDATE, DELETE, MERGE
The SQLGlot Athena dialect tries to identify which engine a query would be routed to, and then uses the tokenizer / parser / generator for that engine. This is unfortunately necessary: there are enough incompatibilities between the engines' dialects that they can't be handled by a single, unifying dialect.
References:
- https://docs.aws.amazon.com/athena/latest/ug/ddl-reference.html
- https://docs.aws.amazon.com/athena/latest/ug/dml-queries-functions-operators.html
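To illustrate the identifier-quoting difference, here is a minimal sketch that round-trips two statements through the dialect (placeholder table and S3 path; exact output can vary between sqlglot versions):

import sqlglot

# DDL without AS SELECT is expected to be routed to the Hive engine, so the
# regenerated SQL quotes identifiers with backticks.
ddl = "CREATE EXTERNAL TABLE foo (id INT) LOCATION 's3://bucket/foo/'"
print(sqlglot.transpile(ddl, read="athena", write="athena", identify=True)[0])

# DML is routed to the Trino engine, which quotes identifiers with double quotes.
dml = "SELECT id FROM foo"
print(sqlglot.transpile(dml, read="athena", write="athena", identify=True)[0])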
Athena.Tokenizer.tokenize(sql: str) -> List[Token]
Returns a list of tokens corresponding to the SQL string sql.
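For example, the HIVE_TOKEN_STREAM sentinel described in the source above can be observed directly; a minimal sketch, assuming a sqlglot version matching the source shown earlier (foo is a placeholder table name):

from sqlglot.dialects.athena import Athena
from sqlglot.tokens import TokenType

athena = Athena()

# DESCRIBE/SHOW/MSCK REPAIR and most DDL are tokenized by the Hive tokenizer
# and marked with a leading HIVE_TOKEN_STREAM sentinel token.
hive_stream = athena.tokenize("DESCRIBE foo")
print(hive_stream[0].token_type == TokenType.HIVE_TOKEN_STREAM)  # expected: True

# DML goes through the Trino tokenizer and carries no sentinel.
trino_stream = athena.tokenize("SELECT 1")
print(trino_stream[0].token_type == TokenType.HIVE_TOKEN_STREAM)  # expected: False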
Athena.Parser
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
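These options are usually supplied through the top-level helpers, which forward them to the parser; a small sketch (the statement and max_errors value are arbitrary):

import sqlglot
from sqlglot.errors import ErrorLevel

# Forwarded to Athena.Parser via the dialect; WARN logs parse errors
# instead of raising them immediately.
expressions = sqlglot.parse(
    "SELECT col FROM tbl",
    read="athena",
    error_level=ErrorLevel.WARN,
    max_errors=5,
)
print(len(expressions))  # expected: 1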
Athena.Parser.parse(raw_tokens, sql=None)
Parses a list of tokens and returns a list of syntax trees, one tree per parsed SQL statement.
Arguments:
- raw_tokens: The list of tokens.
- sql: The original SQL string, used to produce helpful debug messages.
Returns:
The list of the produced syntax trees.
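For instance, a multi-statement script yields one tree per statement; a minimal sketch using the top-level helper rather than calling parse() directly (foo and bar are placeholder tables):

import sqlglot

# Both statements are tokenized and parsed through the Athena dialect; each
# semicolon-separated statement yields its own syntax tree.
trees = sqlglot.parse("DROP TABLE foo; DROP TABLE bar", read="athena")
print([type(tree).__name__ for tree in trees])  # expected: ['Drop', 'Drop']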
Athena.Parser.parse_into(expression_types, raw_tokens, sql=None)
Parses a list of tokens into a given Expression type. If a collection of Expression types is given instead, this method will try to parse the token list into each one of them, stopping at the first for which the parsing succeeds.
Arguments:
- expression_types: The expression type(s) to try and parse the token list into.
- raw_tokens: The list of tokens.
- sql: The original SQL string, used to produce helpful debug messages.
Returns:
The target Expression.
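A minimal sketch of parse_into via the dialect-level wrapper, which tokenizes the SQL and hands the raw tokens to this method (the query is a placeholder):

from sqlglot import exp
from sqlglot.dialects.athena import Athena

athena = Athena()

# Ask specifically for a Select tree; parsing fails fast if the SQL cannot be
# parsed into the requested expression type.
tree = athena.parse_into(exp.Select, "SELECT 1 AS x")[0]
print(isinstance(tree, exp.Select))  # expected: True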
Athena.Generator
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are:
  - False (default): Never quote, except in cases where it's mandatory by the dialect.
  - True or 'always': Always quote.
  - 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are:
  - "upper" or True (default): Convert names to uppercase.
  - "lower": Convert names to lowercase.
  - False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
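Generator options are normally passed through the top-level API rather than by constructing the class directly; a short sketch using the pretty and identify options documented above (table and column names are placeholders):

import sqlglot

# pretty/identify are forwarded to Athena.Generator (and on to whichever of
# the Hive or Trino generators ends up producing the SQL).
sql = sqlglot.transpile(
    "SELECT a, b FROM t WHERE a > 1",
    read="athena",
    write="athena",
    pretty=True,
    identify=True,
)[0]
print(sql)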
Athena.Generator.generate(expression, copy=True)
Generates the SQL string corresponding to the given syntax tree.
Arguments:
- expression: The syntax tree.
- copy: Whether to copy the expression. The generator performs mutations so it is safer to copy.
Returns:
The SQL string corresponding to expression.
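To see the per-expression routing, a minimal sketch that generates two parsed statements with the same dialect instance; which engine's syntax is produced depends on the _generate_as_hive helper shown in the module source (foo is a placeholder table):

from sqlglot import parse_one
from sqlglot.dialects.athena import Athena

athena = Athena()

# ALTER/DROP/DESCRIBE and plain CREATE TABLE are expected to go to the Hive generator...
hive_sql = athena.generate(parse_one("DROP TABLE foo", read="athena"))

# ...while queries and CREATE TABLE ... AS SELECT go to the Trino generator.
trino_sql = athena.generate(parse_one("SELECT 1 AS x", read="athena"))

print(hive_sql)
print(trino_sql)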