sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType

if t.TYPE_CHECKING:
    from sqlglot._typing import E, Lit

logger = logging.getLogger("sqlglot")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    """Render a VALUES derived table as UNNEST([STRUCT(...), ...]).

    BigQuery doesn't support VALUES in a FROM/JOIN context, so each tuple is
    converted into a STRUCT whose field names come from the alias columns (or
    synthetic `_c<i>` names when no columns were given).
    """
    # VALUES outside of FROM/JOIN (e.g. in INSERT) is supported as-is
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    """Render a RETURNS clause; a Schema becomes `RETURNS <name> <cols>` (table function)."""
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    """Render CREATE, rewriting table-returning functions to `TABLE FUNCTION`
    and unwrapping subquery/literal function bodies as BigQuery expects."""
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER by z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    """Replace GROUP BY expressions with their select-list aliases when both
    GROUP BY and ORDER BY are present (see BigQuery issue linked above)."""
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            # Map each aliased projection's expression to its alias identifier
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                # Integer ordinals (GROUP BY 1) are already safe
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        # A star projection can't be renamed positionally without expanding it first
        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    """PARSE_TIMESTAMP(fmt, value[, zone]) -> StrToTime (note fmt/value order is swapped)."""
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    """TIMESTAMP(...) -> Timestamp marked as timezone-aware (BigQuery TIMESTAMP is TZ-based)."""
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    """DATE(y, m, d) -> DateFromParts; any other arity -> Date."""
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    """Render ARRAY_CONTAINS as EXISTS(SELECT 1 FROM UNNEST(arr) AS _unnest(_col) WHERE _col = value)."""
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    """Render TsOrDsAdd as DATE_ADD, casting the operand first via ts_or_ds_add_cast."""
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    """Render TsOrDsDiff as DATE_DIFF, casting both operands to TIMESTAMP in place."""
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP)) 180 unit = unit_to_var(expression) 181 return self.func("DATE_DIFF", expression.this, expression.expression, unit) 182 183 184def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str: 185 scale = expression.args.get("scale") 186 timestamp = expression.this 187 188 if scale in (None, exp.UnixToTime.SECONDS): 189 return self.func("TIMESTAMP_SECONDS", timestamp) 190 if scale == exp.UnixToTime.MILLIS: 191 return self.func("TIMESTAMP_MILLIS", timestamp) 192 if scale == exp.UnixToTime.MICROS: 193 return self.func("TIMESTAMP_MICROS", timestamp) 194 195 unix_seconds = exp.cast( 196 exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT 197 ) 198 return self.func("TIMESTAMP_SECONDS", unix_seconds) 199 200 201def _build_time(args: t.List) -> exp.Func: 202 if len(args) == 1: 203 return exp.TsOrDsToTime(this=args[0]) 204 if len(args) == 2: 205 return exp.Time.from_arg_list(args) 206 return exp.TimeFromParts.from_arg_list(args) 207 208 209def _build_datetime(args: t.List) -> exp.Func: 210 if len(args) == 1: 211 return exp.TsOrDsToTimestamp.from_arg_list(args) 212 if len(args) == 2: 213 return exp.Datetime.from_arg_list(args) 214 return exp.TimestampFromParts.from_arg_list(args) 215 216 217def _str_to_datetime_sql( 218 self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime 219) -> str: 220 this = self.sql(expression, "this") 221 dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP" 222 223 if expression.args.get("safe"): 224 fmt = self.format_time( 225 expression, 226 self.dialect.INVERSE_FORMAT_MAPPING, 227 self.dialect.INVERSE_FORMAT_TRIE, 228 ) 229 return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})" 230 231 fmt = self.format_time(expression) 232 return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone")) 233 234 235class BigQuery(Dialect): 236 WEEK_OFFSET = -1 237 
    # UNNEST produces a single column (aliases name the value, not the table)
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    # LOG(base, value) rather than LOG(value, base)
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    def normalize_identifier(self, expression: E) -> E:
        """Lowercase identifiers except where BigQuery treats names as case-sensitive
        (UDFs, qualified/quoted table names, identifiers flagged as tables)."""
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
        ):
            parent = expression.parent
            # Walk above dotted paths so the check below sees the real container
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

        return expression

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        # b'...' / B'...' byte strings over every quote style
        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        # r'...' / R'...' raw strings over every quote style
        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        # DIV is a function, VALUES is handled specially, and /*+ hints don't exist in BigQuery
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=exp.Literal.string(str(seq_get(args, 1))),
                this=seq_get(args, 0),
            ),
            "DATETIME":
                _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            # FORMAT_DATE(fmt, value): argument order is reversed vs. TimeToStr
            "FORMAT_DATE": lambda args: exp.TimeToStr(
                this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
            ),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": lambda args: exp.JSONExtractScalar(
                this=seq_get(args, 0), expression=seq_get(args, 1) or exp.Literal.string("$")
            ),
            # BigQuery's LENGTH counts bytes for BYTES values, hence binary=True
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            # PARSE_DATE(fmt, value): swap args into the canonical (value, fmt) order
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            # NOTE(review): re.compile(args[1].name) assumes the pattern argument is a
            # string literal; a non-literal pattern would make `.name` empty or raise —
            # confirm callers always pass literals here
            "REGEXP_EXTRACT": lambda args: exp.RegexpExtract(
                this=seq_get(args, 0),
                expression=seq_get(args, 1),
                position=seq_get(args, 2),
                occurrence=seq_get(args, 3),
                group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None,
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            # ARRAY(SELECT ...) takes a full statement, not an expression list
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
        }
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        # UNKNOWN doubles as NULL in BigQuery's type system
        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
        }

        # name -> (index base, SAFE_ variant) for array subscript wrappers
        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            """Parse a procedural `FOR <expr> DO <statement>` loop."""
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            """Parse one dotted table-name part, handling BigQuery's dashed names
            (e.g. `my-project.dataset.table`) and numeric leading segments."""
            this =
                super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                # Glue dash-separated segments back together until the next dot
                while self._match(TokenType.DASH, advance=False) and self._next:
                    text = ""
                    while self._curr and self._curr.token_type != TokenType.DOT:
                        self._advance()
                        text += self._prev.text
                    table_name += text

                this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
            elif isinstance(this, exp.Literal):
                # A leading numeric segment (e.g. `123abc`) tokenizes as a literal
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            """Parse a table reference, unraveling float-tokenized parts and splitting
            dotted quoted names like `project.dataset.table` into catalog/db/this."""
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set("catalog", exp.Identifier(this=parts[0]))
                        table.set("db", exp.Identifier(this=parts[1]))
                else:
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set("db", exp.Identifier(this=parts[0]))
                        table.set("this", exp.Identifier(this=parts[1]))

            # A quoted name like `a.b.c` arrives as one part containing dots:
            # split it into at most catalog.db.this, keeping any extra parts dotted
            if any("." in p.name for p in table.parts):
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                # Marker consumed by Generator.table_parts to re-quote the whole path
                table.meta["quoted_table"] = True

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            """Parse a column, splitting quoted dotted references (`a.b.c`.d) into
            catalog/db/table/this, mirroring _parse_table_parts."""
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    # Marker consumed by Generator.column_parts to re-quote the table path
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...
541 542 def _parse_json_object(self, agg=False): 543 json_object = super()._parse_json_object() 544 array_kv_pair = seq_get(json_object.expressions, 0) 545 546 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 547 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 548 if ( 549 array_kv_pair 550 and isinstance(array_kv_pair.this, exp.Array) 551 and isinstance(array_kv_pair.expression, exp.Array) 552 ): 553 keys = array_kv_pair.this.expressions 554 values = array_kv_pair.expression.expressions 555 556 json_object.set( 557 "expressions", 558 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 559 ) 560 561 return json_object 562 563 def _parse_bracket( 564 self, this: t.Optional[exp.Expression] = None 565 ) -> t.Optional[exp.Expression]: 566 bracket = super()._parse_bracket(this) 567 568 if this is bracket: 569 return bracket 570 571 if isinstance(bracket, exp.Bracket): 572 for expression in bracket.expressions: 573 name = expression.name.upper() 574 575 if name not in self.BRACKET_OFFSETS: 576 break 577 578 offset, safe = self.BRACKET_OFFSETS[name] 579 bracket.set("offset", offset) 580 bracket.set("safe", safe) 581 expression.replace(expression.expressions[0]) 582 583 return bracket 584 585 class Generator(generator.Generator): 586 EXPLICIT_SET_OP = True 587 INTERVAL_ALLOWS_PLURAL_FORM = False 588 JOIN_HINTS = False 589 QUERY_HINTS = False 590 TABLE_HINTS = False 591 LIMIT_FETCH = "LIMIT" 592 RENAME_TABLE_WITH_DB = False 593 NVL2_SUPPORTED = False 594 UNNEST_WITH_ORDINALITY = False 595 COLLATE_IS_FUNC = True 596 LIMIT_ONLY_LITERALS = True 597 SUPPORTS_TABLE_ALIAS_COLUMNS = False 598 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 599 JSON_KEY_VALUE_PAIR_SEP = "," 600 NULL_ORDERING_SUPPORTED = False 601 IGNORE_NULLS_IN_FUNC = True 602 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 603 CAN_IMPLEMENT_ARRAY_ANY = True 604 SUPPORTS_TO_NUMBER = False 605 NAMED_PLACEHOLDER_TOKEN = "@" 
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArraySize: rename_func("ARRAY_LENGTH"),
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: rename_func("STRING_AGG"),
            # Canonical Hex is uppercase; BigQuery's TO_HEX is lowercase, so wrap in UPPER
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Max: max_or_greatest,
            # MD5 (hex digest) renders as TO_HEX(MD5(..)); MD5Digest is the raw bytes
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.Trim: lambda self, e: self.func("TRIM", e.this, e.expression),
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToTimestamp: rename_func("DATETIME"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
        }
        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def mod_sql(self, expression: exp.Mod) -> str:
            """Render % as the MOD() function, unwrapping redundant parentheses
            since the function call makes grouping unnecessary."""
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            """Render a column, preserving a backquoted dotted table path when the
            parser marked it with `quoted_column`."""
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in
            # each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`    -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y`  -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            """Render TimeToStr as FORMAT_DATE(fmt, value), unwrapping a TsOrDsToDate operand."""
            this = expression.this if isinstance(expression.this, exp.TsOrDsToDate) else expression
            return self.func("FORMAT_DATE", self.format_time(expression), this.this)

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                # UPDATE ... SET col = NULL is still legal, so only fold outside of UPDATE
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            """Render AT TIME ZONE as TIMESTAMP(DATETIME(..., zone)) unless it is part
            of a CAST(... AS STRING FORMAT ... AT TIME ZONE ...) clause."""
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            """TRY_CAST renders as SAFE_CAST in BigQuery."""
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            """Render subscripts, using struct field access for string keys on STRUCTs
            and OFFSET/ORDINAL/SAFE_* wrappers for array indices."""
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            # IN UNNEST(...) keeps the UNNEST call verbatim (no parenthesization)
            return self.sql(expression)

        def except_op(self, expression: exp.Except) -> str:
            """EXCEPT requires an explicit DISTINCT; ALL is flagged as unsupported."""
            if not expression.args.get("distinct"):
                self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery")
            return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

        def intersect_op(self, expression: exp.Intersect) -> str:
            """INTERSECT requires an explicit DISTINCT; ALL is flagged as unsupported."""
            if not expression.args.get("distinct"):
                self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery")
            return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

        def version_sql(self, expression: exp.Version) -> str:
            """FOR TIMESTAMP AS OF -> FOR SYSTEM_TIME AS OF (BigQuery time travel syntax)."""
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)
class BigQuery(Dialect):
    """Dialect definition for Google BigQuery (GoogleSQL).

    Bundles the BigQuery-specific ``Tokenizer``, ``Parser`` and ``Generator``
    overrides, plus dialect-level settings (identifier normalization rules,
    time-format mappings, pseudo-columns, etc.).
    """

    # -1: weeks start on Sunday in DATE_TRUNC(..., WEEK) (see Dialect.WEEK_OFFSET).
    WEEK_OFFSET = -1
    # UNNEST aliases name the produced column, not the table.
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    # LOG(value, base) rather than LOG(base, value).
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    # BigQuery forwards SELECT aliases into GROUP BY / HAVING, so alias
    # references must be expanded before column qualification.
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
    }

    # CAST(x AS DATE FORMAT 'yyyy')-style format elements -> strftime.
    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    def normalize_identifier(self, expression: E) -> E:
        """Lowercase identifiers that BigQuery resolves case-insensitively.

        UDF names and qualified table names are left untouched (they are
        case-sensitive in BigQuery); everything else is lowercased when the
        dialect's normalization strategy is not CASE_SENSITIVE.
        """
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
        ):
            parent = expression.parent
            # Walk up through dotted paths to find the semantic parent node.
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

        return expression

    class Tokenizer(tokens.Tokenizer):
        # BigQuery accepts single-, double- and triple-quoted strings.
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        # Backticks quote identifiers (e.g. `project.dataset.table`).
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        # b'...' / B"..." byte literals, for every quote style above.
        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        # r'...' / R"..." raw strings, for every quote style above.
        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            # BigQuery's TIMESTAMP is timezone-aware; plain DATETIME maps to
            # TIMESTAMP above.
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        # DIV is a function (see Parser.FUNCTIONS) and VALUES/"/*+" have no
        # keyword role in BigQuery, so drop them from the keyword trie.
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        # LOG(x) with a single argument means natural log.
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=exp.Literal.string(str(seq_get(args, 1))),
                this=seq_get(args, 0),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            # FORMAT_DATE(fmt, date) -> TimeToStr: note the swapped argument order.
            "FORMAT_DATE": lambda args: exp.TimeToStr(
                this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
            ),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            # The JSON path defaults to "$" when omitted.
            "JSON_EXTRACT_SCALAR": lambda args: exp.JSONExtractScalar(
                this=seq_get(args, 0), expression=seq_get(args, 1) or exp.Literal.string("$")
            ),
            # LENGTH counts bytes for BYTES input, hence binary=True.
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            # PARSE_DATE(fmt, value) -> StrToDate: arguments are swapped.
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": lambda args: exp.RegexpExtract(
                this=seq_get(args, 0),
                expression=seq_get(args, 1),
                position=seq_get(args, 2),
                occurrence=seq_get(args, 3),
                # A single capturing group means the match returns that group.
                group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None,
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
        }
        # TRIM is generated as a plain two-argument function (see Generator.TRANSFORMS).
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            # Procedural ELSE/END blocks are passed through as opaque commands.
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
        }

        # name -> (index offset, SAFE_ flag) for array subscript helpers.
        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            """Parse a procedural ``FOR <expr> [DO] <statement>`` loop."""
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            """Parse one dotted table-path component, tolerating dashes and
            leading digits in unquoted BigQuery table names."""
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                # Glue dash-separated pieces (e.g. my-project) back into one name.
                while self._match(TokenType.DASH, advance=False) and self._next:
                    text = ""
                    while self._curr and self._curr.token_type != TokenType.DOT:
                        self._advance()
                        text += self._prev.text
                    table_name += text

                this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
            elif isinstance(this, exp.Literal):
                # A part that starts with a digit was tokenized as a number.
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            """Parse a (possibly quoted, possibly dashed) table path into an exp.Table."""
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set("catalog", exp.Identifier(this=parts[0]))
                        table.set("db", exp.Identifier(this=parts[1]))
                else:
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set("db", exp.Identifier(this=parts[0]))
                        table.set("this", exp.Identifier(this=parts[1]))

            # A quoted multi-part name like `proj.dataset.tbl` arrives as a single
            # identifier containing dots: re-split it into catalog/db/this.
            if any("." in p.name for p in table.parts):
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                # Remember the original quoting so the generator can reproduce it.
                table.meta["quoted_table"] = True

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            """Parse a column, re-splitting quoted dotted references like
            `dataset.table`.col into their individual parts."""
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            """Parse a bracket subscript, folding OFFSET/ORDINAL/SAFE_* wrappers
            into the Bracket node's offset/safe args."""
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    # Unwrap e.g. OFFSET(i) down to the bare index expression.
                    expression.replace(expression.expressions[0])

            return bracket

    class Generator(generator.Generator):
        EXPLICIT_SET_OP = True
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArraySize: rename_func("ARRAY_LENGTH"),
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: rename_func("STRING_AGG"),
            # TO_HEX yields lowercase hex (HEX_LOWERCASE), so wrap in UPPER for exp.Hex.
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.Trim: lambda self, e: self.func("TRIM", e.this, e.expression),
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToTimestamp: rename_func("DATETIME"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all", "and", "any", "array", "as", "asc", "assert_rows_modified", "at",
            "between", "by", "case", "cast", "collate", "contains", "create", "cross",
            "cube", "current", "default", "define", "desc", "distinct", "else", "end",
            "enum", "escape", "except", "exclude", "exists", "extract", "false",
            "fetch", "following", "for", "from", "full", "group", "grouping",
            "groups", "hash", "having", "if", "ignore", "in", "inner", "intersect",
            "interval", "into", "is", "join", "lateral", "left", "like", "limit",
            "lookup", "merge", "natural", "new", "no", "not", "null", "nulls", "of",
            "on", "or", "order", "outer", "over", "partition", "preceding", "proto",
            "qualify", "range", "recursive", "respect", "right", "rollup", "rows",
            "select", "set", "some", "struct", "tablesample", "then", "to", "treat",
            "true", "unbounded", "union", "unnest", "using", "when", "where",
            "window", "with", "within",
        }

        def mod_sql(self, expression: exp.Mod) -> str:
            """Render % as MOD(a, b), unwrapping redundant parentheses on the operands."""
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            """Render TimeToStr as FORMAT_DATE(fmt, value) — format argument first."""
            this = expression.this if isinstance(expression.this, exp.TsOrDsToDate) else expression
            return self.func("FORMAT_DATE", self.format_time(expression), this.this)

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            """TRY_CAST is spelled SAFE_CAST in BigQuery."""
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            # Map the parsed offset back onto OFFSET/ORDINAL (see Parser.BRACKET_OFFSETS).
            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def except_op(self, expression: exp.Except) -> str:
            if not expression.args.get("distinct"):
                self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery")
            return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

        def intersect_op(self, expression: exp.Intersect) -> str:
            if not expression.args.get("distinct"):
                self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery")
            return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

        def version_sql(self, expression: exp.Version) -> str:
            """FOR SYSTEM_TIME AS OF: rewrite the TIMESTAMP keyword to SYSTEM_TIME."""
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)
First day of the week used by DATE_TRUNC(..., WEEK). Defaults to 0 (Monday); -1 means Sunday.
Whether the base comes first in the LOG function. Possible values: True, False, or None (None means the two-argument form of LOG is not supported).
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS ( SELECT 1 AS id, 2 AS my_id ) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects "my_id" would refer to "data.my_id" (which is done in _qualify_columns()) across the query, except: - BigQuery, which will forward the alias to GROUP BY + HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1" - Clickhouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether alias reference expansion before qualification should only happen for the GROUP BY clause.
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized. Possible values: "upper" or True — convert names to uppercase; "lower" — convert names to lowercase; False — disable function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper mapping used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
Columns that are auto-generated by the engine corresponding to this dialect. For example, such columns may be excluded from SELECT * queries.
278 def normalize_identifier(self, expression: E) -> E: 279 if ( 280 isinstance(expression, exp.Identifier) 281 and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE 282 ): 283 parent = expression.parent 284 while isinstance(parent, exp.Dot): 285 parent = parent.parent 286 287 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 288 # by default. The following check uses a heuristic to detect tables based on whether 289 # they are qualified. This should generally be correct, because tables in BigQuery 290 # must be qualified with at least a dataset, unless @@dataset_id is set. 291 case_sensitive = ( 292 isinstance(parent, exp.UserDefinedFunction) 293 or ( 294 isinstance(parent, exp.Table) 295 and parent.db 296 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 297 ) 298 or expression.meta.get("is_table") 299 ) 300 if not case_sensitive: 301 expression.set("this", expression.this.lower()) 302 303 return expression
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it would need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system, for example they may always be case-sensitive in Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
Mapping of an escaped sequence (e.g. the two characters \n) to its unescaped version (e.g. a literal newline character).
Inherited Members
- sqlglot.dialects.dialect.Dialect
- Dialect
- INDEX_OFFSET
- ALIAS_POST_TABLESAMPLE
- TABLESAMPLE_SIZE_IS_PERCENT
- IDENTIFIERS_CAN_START_WITH_DIGIT
- DPIPE_IS_STRING_CONCAT
- STRICT_STRING_CONCAT
- COPY_PARAMS_ARE_CSV
- NULL_ORDERING
- TYPED_DIVISION
- SAFE_DIVISION
- CONCAT_COALESCE
- DATE_FORMAT
- DATEINT_FORMAT
- TIME_FORMAT
- PREFER_CTE_ALIAS_COLUMN
- SUPPORTS_ORDER_BY_ALL
- DATE_PART_MAPPING
- TYPE_TO_EXPRESSIONS
- ANNOTATORS
- get_or_raise
- format_time
- settings
- case_sensitive
- can_identify
- quote_identifier
- to_json_path
- parse
- parse_into
- generate
- transpile
- tokenize
- tokenizer
- jsonpath_tokenizer
- parser
- generator
305 class Tokenizer(tokens.Tokenizer): 306 QUOTES = ["'", '"', '"""', "'''"] 307 COMMENTS = ["--", "#", ("/*", "*/")] 308 IDENTIFIERS = ["`"] 309 STRING_ESCAPES = ["\\"] 310 311 HEX_STRINGS = [("0x", ""), ("0X", "")] 312 313 BYTE_STRINGS = [ 314 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 315 ] 316 317 RAW_STRINGS = [ 318 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 319 ] 320 321 KEYWORDS = { 322 **tokens.Tokenizer.KEYWORDS, 323 "ANY TYPE": TokenType.VARIANT, 324 "BEGIN": TokenType.COMMAND, 325 "BEGIN TRANSACTION": TokenType.BEGIN, 326 "BYTES": TokenType.BINARY, 327 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 328 "DATETIME": TokenType.TIMESTAMP, 329 "DECLARE": TokenType.COMMAND, 330 "ELSEIF": TokenType.COMMAND, 331 "EXCEPTION": TokenType.COMMAND, 332 "FLOAT64": TokenType.DOUBLE, 333 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 334 "MODEL": TokenType.MODEL, 335 "NOT DETERMINISTIC": TokenType.VOLATILE, 336 "RECORD": TokenType.STRUCT, 337 "TIMESTAMP": TokenType.TIMESTAMPTZ, 338 } 339 KEYWORDS.pop("DIV") 340 KEYWORDS.pop("VALUES") 341 KEYWORDS.pop("/*+")
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIER_ESCAPES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
def _build_regexp_extract(args: t.List) -> exp.RegexpExtract:
    """Build a RegexpExtract node, inferring the implicit capture group safely.

    BigQuery's REGEXP_EXTRACT returns the text of the first capturing group
    when the pattern has exactly one, so we record `group=1` in that case to
    preserve semantics when transpiling to other dialects.

    BigQuery regexes use re2 syntax; patterns that are valid there (e.g.
    `\\p{L}`) may not compile with Python's `re`, and the second argument may
    not even be a literal pattern. Guard the introspection so parsing never
    crashes on valid BigQuery SQL — when in doubt, omit the group argument.
    """
    try:
        group = re.compile(args[1].name).groups == 1
    except re.error:
        group = False

    return exp.RegexpExtract(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        position=seq_get(args, 2),
        occurrence=seq_get(args, 3),
        group=exp.Literal.number(1) if group else None,
    )


class Parser(parser.Parser):
    """Parser for BigQuery's GoogleSQL dialect.

    Handles BigQuery-specific syntax such as dashed table names
    (`proj-1.db.tbl`), quoted multi-part identifiers (`` `a.b.c` ``),
    OFFSET/ORDINAL/SAFE_* array subscripts, and FOR ... IN ... DO statements.
    """

    PREFIXED_PIVOT_COLUMNS = True
    LOG_DEFAULTS_TO_LN = True
    SUPPORTS_IMPLICIT_UNNEST = True

    FUNCTIONS = {
        **parser.Parser.FUNCTIONS,
        "DATE": _build_date,
        "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
        "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
        "DATE_TRUNC": lambda args: exp.DateTrunc(
            unit=exp.Literal.string(str(seq_get(args, 1))),
            this=seq_get(args, 0),
        ),
        "DATETIME": _build_datetime,
        "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
        "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
        "DIV": binary_from_function(exp.IntDiv),
        "FORMAT_DATE": lambda args: exp.TimeToStr(
            this=exp.TsOrDsToDate(this=seq_get(args, 1)), format=seq_get(args, 0)
        ),
        "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
        "JSON_EXTRACT_SCALAR": lambda args: exp.JSONExtractScalar(
            this=seq_get(args, 0), expression=seq_get(args, 1) or exp.Literal.string("$")
        ),
        # BigQuery's LENGTH counts bytes for BYTES input, hence binary=True
        "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
        "MD5": exp.MD5Digest.from_arg_list,
        "TO_HEX": _build_to_hex,
        "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
            [seq_get(args, 1), seq_get(args, 0)]
        ),
        "PARSE_TIMESTAMP": _build_parse_timestamp,
        "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
        # Guarded helper: introspecting the pattern with Python's `re` must
        # not raise for re2-only syntax or non-literal patterns.
        "REGEXP_EXTRACT": _build_regexp_extract,
        "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
        "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
        "SPLIT": lambda args: exp.Split(
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
            this=seq_get(args, 0),
            expression=seq_get(args, 1) or exp.Literal.string(","),
        ),
        "TIME": _build_time,
        "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
        "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
        "TIMESTAMP": _build_timestamp,
        "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
        "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
        "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
        ),
        "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
        ),
        "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
        "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
    }

    FUNCTION_PARSERS = {
        **parser.Parser.FUNCTION_PARSERS,
        "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
    }
    # TRIM in BigQuery uses plain function-call syntax, not TRIM(x FROM y)
    FUNCTION_PARSERS.pop("TRIM")

    NO_PAREN_FUNCTIONS = {
        **parser.Parser.NO_PAREN_FUNCTIONS,
        TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
    }

    NESTED_TYPE_TOKENS = {
        *parser.Parser.NESTED_TYPE_TOKENS,
        TokenType.TABLE,
    }

    PROPERTY_PARSERS = {
        **parser.Parser.PROPERTY_PARSERS,
        "NOT DETERMINISTIC": lambda self: self.expression(
            exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
        ),
        "OPTIONS": lambda self: self._parse_with_property(),
    }

    CONSTRAINT_PARSERS = {
        **parser.Parser.CONSTRAINT_PARSERS,
        "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
    }

    RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
    RANGE_PARSERS.pop(TokenType.OVERLAPS)

    NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

    STATEMENT_PARSERS = {
        **parser.Parser.STATEMENT_PARSERS,
        TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
        TokenType.END: lambda self: self._parse_as_command(self._prev),
        TokenType.FOR: lambda self: self._parse_for_in(),
    }

    # Maps subscript wrappers to (index base, SAFE_ flag)
    BRACKET_OFFSETS = {
        "OFFSET": (0, False),
        "ORDINAL": (1, False),
        "SAFE_OFFSET": (0, True),
        "SAFE_ORDINAL": (1, True),
    }

    def _parse_for_in(self) -> exp.ForIn:
        """Parse BigQuery's procedural `FOR <expr> IN ... DO <stmt>` construct."""
        this = self._parse_range()
        self._match_text_seq("DO")
        return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

    def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
        """Parse a single table-name part, gluing dashed names back together.

        https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
        """
        this = super()._parse_table_part(schema=schema) or self._parse_number()

        if isinstance(this, exp.Identifier):
            # Unquoted names may contain dashes (e.g. my-project); consume
            # tokens up to the next dot and fold them into one identifier.
            table_name = this.name
            while self._match(TokenType.DASH, advance=False) and self._next:
                text = ""
                while self._curr and self._curr.token_type != TokenType.DOT:
                    self._advance()
                    text += self._prev.text
                table_name += text

            this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
        elif isinstance(this, exp.Literal):
            # A numeric-looking part tokenizes as a literal; re-attach any
            # immediately-following token and treat the result as quoted.
            table_name = this.name

            if self._is_connected() and self._parse_var(any_token=True):
                table_name += self._prev.text

            this = exp.Identifier(this=table_name, quoted=True)

        return this

    def _parse_table_parts(
        self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
    ) -> exp.Table:
        """Parse a (possibly quoted, possibly dashed) multi-part table name."""
        table = super()._parse_table_parts(
            schema=schema, is_db_reference=is_db_reference, wildcard=True
        )

        # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
        if not table.catalog:
            if table.db:
                parts = table.db.split(".")
                if len(parts) == 2 and not table.args["db"].quoted:
                    table.set("catalog", exp.Identifier(this=parts[0]))
                    table.set("db", exp.Identifier(this=parts[1]))
            else:
                parts = table.name.split(".")
                if len(parts) == 2 and not table.this.quoted:
                    table.set("db", exp.Identifier(this=parts[0]))
                    table.set("this", exp.Identifier(this=parts[1]))

        # A quoted identifier like `a.b.c` carries dots inside one part; split
        # it into catalog/db/name and remember it was a single quoted path.
        if any("." in p.name for p in table.parts):
            catalog, db, this, *rest = (
                exp.to_identifier(p, quoted=True)
                for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
            )

            if rest and this:
                this = exp.Dot.build([this, *rest])  # type: ignore

            table = exp.Table(
                this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
            )
            table.meta["quoted_table"] = True

        return table

    def _parse_column(self) -> t.Optional[exp.Expression]:
        """Parse a column, splitting dotted names inside quoted identifiers."""
        column = super()._parse_column()
        if isinstance(column, exp.Column):
            parts = column.parts
            if any("." in p.name for p in parts):
                catalog, db, table, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                column.meta["quoted_column"] = True

        return column

    @t.overload
    def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

    @t.overload
    def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

    def _parse_json_object(self, agg=False):
        json_object = super()._parse_json_object()
        array_kv_pair = seq_get(json_object.expressions, 0)

        # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
        if (
            array_kv_pair
            and isinstance(array_kv_pair.this, exp.Array)
            and isinstance(array_kv_pair.expression, exp.Array)
        ):
            keys = array_kv_pair.this.expressions
            values = array_kv_pair.expression.expressions

            json_object.set(
                "expressions",
                [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
            )

        return json_object

    def _parse_bracket(
        self, this: t.Optional[exp.Expression] = None
    ) -> t.Optional[exp.Expression]:
        """Parse array subscripts, normalizing OFFSET/ORDINAL/SAFE_* wrappers."""
        bracket = super()._parse_bracket(this)

        if this is bracket:
            return bracket

        if isinstance(bracket, exp.Bracket):
            for expression in bracket.expressions:
                name = expression.name.upper()

                if name not in self.BRACKET_OFFSETS:
                    break

                # Record base/safety on the bracket and unwrap the index expr
                offset, safe = self.BRACKET_OFFSETS[name]
                bracket.set("offset", offset)
                bracket.set("safe", safe)
                expression.replace(expression.expressions[0])

        return bracket
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- INTERVAL_VARS
- ALIAS_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- errors
- sql
class Generator(generator.Generator):
    """SQL generator for BigQuery's GoogleSQL dialect.

    Configures BigQuery-specific output: explicit UNION DISTINCT/ALL, native
    type names (INT64, STRING, ...), OPTIONS(...) properties, backtick-quoted
    multi-part identifiers, and OFFSET/ORDINAL array subscripts.
    """

    # Dialect capability flags consumed by the base Generator.
    EXPLICIT_SET_OP = True
    INTERVAL_ALLOWS_PLURAL_FORM = False
    JOIN_HINTS = False
    QUERY_HINTS = False
    TABLE_HINTS = False
    LIMIT_FETCH = "LIMIT"
    RENAME_TABLE_WITH_DB = False
    NVL2_SUPPORTED = False
    UNNEST_WITH_ORDINALITY = False
    COLLATE_IS_FUNC = True
    LIMIT_ONLY_LITERALS = True
    SUPPORTS_TABLE_ALIAS_COLUMNS = False
    UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
    JSON_KEY_VALUE_PAIR_SEP = ","
    NULL_ORDERING_SUPPORTED = False
    IGNORE_NULLS_IN_FUNC = True
    JSON_PATH_SINGLE_QUOTE_ESCAPE = True
    CAN_IMPLEMENT_ARRAY_ANY = True
    SUPPORTS_TO_NUMBER = False
    NAMED_PLACEHOLDER_TOKEN = "@"
    HEX_FUNC = "TO_HEX"
    WITH_PROPERTIES_PREFIX = "OPTIONS"

    # Per-expression SQL generation overrides; later keys shadow base ones.
    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
        exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
        exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
        exp.Array: inline_array_unless_query,
        exp.ArrayContains: _array_contains_sql,
        exp.ArrayFilter: filter_array_using_unnest,
        exp.ArraySize: rename_func("ARRAY_LENGTH"),
        exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
        exp.CollateProperty: lambda self, e: (
            f"DEFAULT COLLATE {self.sql(e, 'this')}"
            if e.args.get("default")
            else f"COLLATE {self.sql(e, 'this')}"
        ),
        exp.Commit: lambda *_: "COMMIT TRANSACTION",
        exp.CountIf: rename_func("COUNTIF"),
        exp.Create: _create_sql,
        exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
        exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
        exp.DateDiff: lambda self, e: self.func(
            "DATE_DIFF", e.this, e.expression, unit_to_var(e)
        ),
        exp.DateFromParts: rename_func("DATE"),
        exp.DateStrToDate: datestrtodate_sql,
        exp.DateSub: date_add_interval_sql("DATE", "SUB"),
        exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
        exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
        exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")),
        exp.FromTimeZone: lambda self, e: self.func(
            "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
        ),
        exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
        exp.GroupConcat: rename_func("STRING_AGG"),
        exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
        exp.If: if_sql(false_value="NULL"),
        exp.ILike: no_ilike_sql,
        exp.IntDiv: rename_func("DIV"),
        exp.JSONFormat: rename_func("TO_JSON_STRING"),
        exp.Max: max_or_greatest,
        exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
        exp.MD5Digest: rename_func("MD5"),
        exp.Min: min_or_least,
        exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
        exp.RegexpExtract: lambda self, e: self.func(
            "REGEXP_EXTRACT",
            e.this,
            e.expression,
            e.args.get("position"),
            e.args.get("occurrence"),
        ),
        exp.RegexpReplace: regexp_replace_sql,
        exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
        exp.ReturnsProperty: _returnsproperty_sql,
        exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
        exp.Select: transforms.preprocess(
            [
                transforms.explode_to_unnest(),
                transforms.unqualify_unnest,
                transforms.eliminate_distinct_on,
                _alias_ordered_group,
                transforms.eliminate_semi_and_anti_joins,
            ]
        ),
        exp.SHA: rename_func("SHA1"),
        exp.SHA2: sha256_sql,
        exp.StabilityProperty: lambda self, e: (
            "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
        ),
        exp.StrToDate: _str_to_datetime_sql,
        exp.StrToTime: _str_to_datetime_sql,
        exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
        exp.TimeFromParts: rename_func("TIME"),
        exp.TimestampFromParts: rename_func("DATETIME"),
        exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
        exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
        exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
        exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
        exp.TimeStrToTime: timestrtotime_sql,
        exp.Transaction: lambda *_: "BEGIN TRANSACTION",
        exp.Trim: lambda self, e: self.func("TRIM", e.this, e.expression),
        exp.TsOrDsAdd: _ts_or_ds_add_sql,
        exp.TsOrDsDiff: _ts_or_ds_diff_sql,
        exp.TsOrDsToTime: rename_func("TIME"),
        exp.TsOrDsToTimestamp: rename_func("DATETIME"),
        exp.Unhex: rename_func("FROM_HEX"),
        exp.UnixDate: rename_func("UNIX_DATE"),
        exp.UnixToTime: _unix_to_time_sql,
        exp.Values: _derived_table_values_to_unnest,
        exp.VariancePop: rename_func("VAR_POP"),
    }

    # JSON path components BigQuery can express natively.
    SUPPORTED_JSON_PATH_PARTS = {
        exp.JSONPathKey,
        exp.JSONPathRoot,
        exp.JSONPathSubscript,
    }

    # Canonical BigQuery names for generic SQLGlot data types.
    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
        exp.DataType.Type.BIGINT: "INT64",
        exp.DataType.Type.BINARY: "BYTES",
        exp.DataType.Type.BOOLEAN: "BOOL",
        exp.DataType.Type.CHAR: "STRING",
        exp.DataType.Type.DECIMAL: "NUMERIC",
        exp.DataType.Type.DOUBLE: "FLOAT64",
        exp.DataType.Type.FLOAT: "FLOAT64",
        exp.DataType.Type.INT: "INT64",
        exp.DataType.Type.NCHAR: "STRING",
        exp.DataType.Type.NVARCHAR: "STRING",
        exp.DataType.Type.SMALLINT: "INT64",
        exp.DataType.Type.TEXT: "STRING",
        exp.DataType.Type.TIMESTAMP: "DATETIME",
        exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
        exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
        exp.DataType.Type.TINYINT: "INT64",
        exp.DataType.Type.VARBINARY: "BYTES",
        exp.DataType.Type.ROWVERSION: "BYTES",
        exp.DataType.Type.VARCHAR: "STRING",
        exp.DataType.Type.VARIANT: "ANY TYPE",
    }

    PROPERTIES_LOCATION = {
        **generator.Generator.PROPERTIES_LOCATION,
        exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
        exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
    }

    # WINDOW comes after QUALIFY
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
    AFTER_HAVING_MODIFIER_TRANSFORMS = {
        "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
        "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
    }

    # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
    RESERVED_KEYWORDS = {
        "all",
        "and",
        "any",
        "array",
        "as",
        "asc",
        "assert_rows_modified",
        "at",
        "between",
        "by",
        "case",
        "cast",
        "collate",
        "contains",
        "create",
        "cross",
        "cube",
        "current",
        "default",
        "define",
        "desc",
        "distinct",
        "else",
        "end",
        "enum",
        "escape",
        "except",
        "exclude",
        "exists",
        "extract",
        "false",
        "fetch",
        "following",
        "for",
        "from",
        "full",
        "group",
        "grouping",
        "groups",
        "hash",
        "having",
        "if",
        "ignore",
        "in",
        "inner",
        "intersect",
        "interval",
        "into",
        "is",
        "join",
        "lateral",
        "left",
        "like",
        "limit",
        "lookup",
        "merge",
        "natural",
        "new",
        "no",
        "not",
        "null",
        "nulls",
        "of",
        "on",
        "or",
        "order",
        "outer",
        "over",
        "partition",
        "preceding",
        "proto",
        "qualify",
        "range",
        "recursive",
        "respect",
        "right",
        "rollup",
        "rows",
        "select",
        "set",
        "some",
        "struct",
        "tablesample",
        "then",
        "to",
        "treat",
        "true",
        "unbounded",
        "union",
        "unnest",
        "using",
        "when",
        "where",
        "window",
        "with",
        "within",
    }

    def mod_sql(self, expression: exp.Mod) -> str:
        """Render % as the MOD() function, unwrapping parenthesized operands."""
        this = expression.this
        expr = expression.expression
        return self.func(
            "MOD",
            this.unnest() if isinstance(this, exp.Paren) else this,
            expr.unnest() if isinstance(expr, exp.Paren) else expr,
        )

    def column_parts(self, expression: exp.Column) -> str:
        if expression.meta.get("quoted_column"):
            # If a column reference is of the form `dataset.table`.name, we need
            # to preserve the quoted table path, otherwise the reference breaks
            table_parts = ".".join(p.name for p in expression.parts[:-1])
            table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
            return f"{table_path}.{self.sql(expression, 'this')}"

        return super().column_parts(expression)

    def table_parts(self, expression: exp.Table) -> str:
        # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
        # we need to make sure the correct quoting is used in each case.
        #
        # For example, if there is a CTE x that clashes with a schema name, then the former will
        # return the table y in that schema, whereas the latter will return the CTE's y column:
        #
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
        # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
        if expression.meta.get("quoted_table"):
            table_parts = ".".join(p.name for p in expression.parts)
            return self.sql(exp.Identifier(this=table_parts, quoted=True))

        return super().table_parts(expression)

    def timetostr_sql(self, expression: exp.TimeToStr) -> str:
        # FORMAT_DATE takes (format, date); unwrap a TsOrDsToDate wrapper if present.
        this = expression.this if isinstance(expression.this, exp.TsOrDsToDate) else expression
        return self.func("FORMAT_DATE", self.format_time(expression), this.this)

    def eq_sql(self, expression: exp.EQ) -> str:
        # Operands of = cannot be NULL in BigQuery
        if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
            if not isinstance(expression.parent, exp.Update):
                return "NULL"

        return self.binary(expression, "=")

    def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
        parent = expression.parent

        # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
        # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
        if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
            return self.func(
                "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
            )

        return super().attimezone_sql(expression)

    def trycast_sql(self, expression: exp.TryCast) -> str:
        # TRY_CAST is spelled SAFE_CAST in BigQuery.
        return self.cast_sql(expression, safe_prefix="SAFE_")

    def bracket_sql(self, expression: exp.Bracket) -> str:
        """Render subscripts, mapping struct string keys to dot access and
        array indices to OFFSET/ORDINAL (optionally SAFE_-prefixed)."""
        this = expression.this
        expressions = expression.expressions

        if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
            arg = expressions[0]
            if arg.type is None:
                from sqlglot.optimizer.annotate_types import annotate_types

                arg = annotate_types(arg)

            if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                # BQ doesn't support bracket syntax with string values for structs
                return f"{self.sql(this)}.{arg.name}"

        expressions_sql = self.expressions(expression, flat=True)
        offset = expression.args.get("offset")

        if offset == 0:
            expressions_sql = f"OFFSET({expressions_sql})"
        elif offset == 1:
            expressions_sql = f"ORDINAL({expressions_sql})"
        elif offset is not None:
            self.unsupported(f"Unsupported array offset: {offset}")

        if expression.args.get("safe"):
            expressions_sql = f"SAFE_{expressions_sql}"

        return f"{self.sql(this)}[{expressions_sql}]"

    def in_unnest_op(self, expression: exp.Unnest) -> str:
        # BigQuery's IN UNNEST(...) takes the unnest expression without parens.
        return self.sql(expression)

    def except_op(self, expression: exp.Except) -> str:
        if not expression.args.get("distinct"):
            self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery")
        return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

    def intersect_op(self, expression: exp.Intersect) -> str:
        if not expression.args.get("distinct"):
            self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery")
        return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

    def version_sql(self, expression: exp.Version) -> str:
        # Time travel is spelled FOR SYSTEM_TIME AS OF in BigQuery.
        if expression.name == "TIMESTAMP":
            expression.set("this", "SYSTEM_TIME")
        return super().version_sql(expression)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
def column_parts(self, expression: exp.Column) -> str:
    """Render the parts of a column reference, preserving quoted table paths.

    A reference of the form `dataset.table`.name must keep its quoted table
    path as a single identifier; splitting it would break the reference.
    """
    if not expression.meta.get("quoted_column"):
        return super().column_parts(expression)

    prefix = ".".join(part.name for part in expression.parts[:-1])
    quoted_prefix = self.sql(exp.Identifier(this=prefix, quoted=True))
    return f"{quoted_prefix}.{self.sql(expression, 'this')}"
def table_parts(self, expression: exp.Table) -> str:
    """Render a table path, keeping single-identifier quoting when required.

    Depending on the context, `x.y` may not resolve to the same data source
    as `x`.`y`, so the original quoting style must be reproduced. For example,
    when a CTE x clashes with a schema name, the former yields table y in
    that schema while the latter yields the CTE's y column:

      - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
      - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
    """
    if not expression.meta.get("quoted_table"):
        return super().table_parts(expression)

    joined_path = ".".join(part.name for part in expression.parts)
    return self.sql(exp.Identifier(this=joined_path, quoted=True))
def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
    """Render AT TIME ZONE as TIMESTAMP(DATETIME(..., zone)) when needed.

    BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]);
    only the text-typed CAST keeps the native AT TIME ZONE rendering, so
    every other context is converted to the function form.
    """
    parent = expression.parent
    in_text_cast = isinstance(parent, exp.Cast) and parent.to.is_type("text")

    if in_text_cast:
        return super().attimezone_sql(expression)

    datetime_sql = self.func("DATETIME", expression.this, expression.args.get("zone"))
    return self.func("TIMESTAMP", datetime_sql)
def bracket_sql(self, expression: exp.Bracket) -> str:
    """Render a subscript expression.

    Struct access with a string key becomes dot access (BigQuery has no
    bracket syntax for struct fields); array indices are wrapped in
    OFFSET/ORDINAL according to the recorded index base, with a SAFE_
    prefix when safe access was requested.
    """
    this = expression.this
    expressions = expression.expressions

    if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
        arg = expressions[0]
        # Lazily annotate types when the index expression wasn't typed yet
        if arg.type is None:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg)

        if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
            # BQ doesn't support bracket syntax with string values for structs
            return f"{self.sql(this)}.{arg.name}"

    expressions_sql = self.expressions(expression, flat=True)
    offset = expression.args.get("offset")

    # offset 0 -> OFFSET (0-based), offset 1 -> ORDINAL (1-based)
    if offset == 0:
        expressions_sql = f"OFFSET({expressions_sql})"
    elif offset == 1:
        expressions_sql = f"ORDINAL({expressions_sql})"
    elif offset is not None:
        self.unsupported(f"Unsupported array offset: {offset}")

    if expression.args.get("safe"):
        expressions_sql = f"SAFE_{expressions_sql}"

    return f"{self.sql(this)}[{expressions_sql}]"
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- PARSE_JSON_NAME
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- pad_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- transformcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- except_sql
- fetch_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- intersect_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- set_operations
- union_sql
- union_op
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- cast_sql
- currentdate_sql
- currenttimestamp_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterdiststyle_sql
- altersortkey_sql
- renametable_sql
- renamecolumn_sql
- alterset_sql
- altertable_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- try_sql
- log_sql
- use_sql
- binary
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- operator_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- generateseries_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- length_sql
- rand_sql
- strtodate_sql
- strtotime_sql
- changes_sql
- pad_sql