sqlglot.dialects.bigquery
1from __future__ import annotations 2 3import logging 4import re 5import typing as t 6 7from sqlglot import exp, generator, parser, tokens, transforms 8from sqlglot._typing import E 9from sqlglot.dialects.dialect import ( 10 Dialect, 11 NormalizationStrategy, 12 annotate_with_type_lambda, 13 arg_max_or_min_no_count, 14 binary_from_function, 15 date_add_interval_sql, 16 datestrtodate_sql, 17 build_formatted_time, 18 filter_array_using_unnest, 19 if_sql, 20 inline_array_unless_query, 21 max_or_greatest, 22 min_or_least, 23 no_ilike_sql, 24 build_date_delta_with_interval, 25 regexp_replace_sql, 26 rename_func, 27 sha256_sql, 28 timestrtotime_sql, 29 ts_or_ds_add_cast, 30 unit_to_var, 31 strposition_sql, 32 groupconcat_sql, 33 space_sql, 34) 35from sqlglot.helper import seq_get, split_num_words 36from sqlglot.tokens import TokenType 37from sqlglot.generator import unsupported_args 38 39if t.TYPE_CHECKING: 40 from sqlglot._typing import Lit 41 42 from sqlglot.optimizer.annotate_types import TypeAnnotator 43 44logger = logging.getLogger("sqlglot") 45 46 47JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray] 48 49DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY") 50 51 52def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str: 53 if not expression.find_ancestor(exp.From, exp.Join): 54 return self.values_sql(expression) 55 56 structs = [] 57 alias = expression.args.get("alias") 58 for tup in expression.find_all(exp.Tuple): 59 field_aliases = ( 60 alias.columns 61 if alias and alias.columns 62 else (f"_c{i}" for i in range(len(tup.expressions))) 63 ) 64 expressions = [ 65 exp.PropertyEQ(this=exp.to_identifier(name), expression=fld) 66 for name, fld in zip(field_aliases, tup.expressions) 67 ] 68 structs.append(exp.Struct(expressions=expressions)) 69 70 # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression 71 alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None 72 return self.unnest_sql( 73 exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only) 74 ) 75 76 77def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str: 78 this = expression.this 79 if isinstance(this, exp.Schema): 80 this = f"{self.sql(this, 'this')} <{self.expressions(this)}>" 81 else: 82 this = self.sql(this) 83 return f"RETURNS {this}" 84 85 86def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str: 87 returns = expression.find(exp.ReturnsProperty) 88 if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"): 89 expression.set("kind", "TABLE FUNCTION") 90 91 if isinstance(expression.expression, (exp.Subquery, exp.Literal)): 92 expression.set("expression", expression.expression.this) 93 94 return self.create_sql(expression) 95 96 97# https://issuetracker.google.com/issues/162294746 98# workaround for bigquery bug when grouping by an expression and then ordering 99# WITH x AS (SELECT 1 y) 100# SELECT y + 1 z 101# FROM x 102# GROUP BY x + 1 103# ORDER by z 104def _alias_ordered_group(expression: exp.Expression) -> exp.Expression: 105 if isinstance(expression, exp.Select): 106 group = expression.args.get("group") 107 order = expression.args.get("order") 108 109 if group and order: 110 aliases = { 111 select.this: select.args["alias"] 112 for select in expression.selects 113 if isinstance(select, exp.Alias) 114 } 115 116 for grouped in group.expressions: 117 if 
grouped.is_int: 118 continue 119 alias = aliases.get(grouped) 120 if alias: 121 grouped.replace(exp.column(alias)) 122 123 return expression 124 125 126def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression: 127 """BigQuery doesn't allow column names when defining a CTE, so we try to push them down.""" 128 if isinstance(expression, exp.CTE) and expression.alias_column_names: 129 cte_query = expression.this 130 131 if cte_query.is_star: 132 logger.warning( 133 "Can't push down CTE column names for star queries. Run the query through" 134 " the optimizer or use 'qualify' to expand the star projections first." 135 ) 136 return expression 137 138 column_names = expression.alias_column_names 139 expression.args["alias"].set("columns", None) 140 141 for name, select in zip(column_names, cte_query.selects): 142 to_replace = select 143 144 if isinstance(select, exp.Alias): 145 select = select.this 146 147 # Inner aliases are shadowed by the CTE column names 148 to_replace.replace(exp.alias_(select, name)) 149 150 return expression 151 152 153def _build_parse_timestamp(args: t.List) -> exp.StrToTime: 154 this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)]) 155 this.set("zone", seq_get(args, 2)) 156 return this 157 158 159def _build_timestamp(args: t.List) -> exp.Timestamp: 160 timestamp = exp.Timestamp.from_arg_list(args) 161 timestamp.set("with_tz", True) 162 return timestamp 163 164 165def _build_date(args: t.List) -> exp.Date | exp.DateFromParts: 166 expr_type = exp.DateFromParts if len(args) == 3 else exp.Date 167 return expr_type.from_arg_list(args) 168 169 170def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5: 171 # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation 172 arg = seq_get(args, 0) 173 return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg) 174 175 176def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str: 177 return self.sql( 178 exp.Exists( 179 this=exp.select("1") 180 .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"])) 181 .where(exp.column("_col").eq(expression.right)) 182 ) 183 ) 184 185 186def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str: 187 return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression)) 188 189 190def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str: 191 expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP)) 192 expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP)) 193 unit = unit_to_var(expression) 194 return self.func("DATE_DIFF", expression.this, expression.expression, unit) 195 196 197def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str: 198 scale = expression.args.get("scale") 199 timestamp = expression.this 200 201 if scale in (None, exp.UnixToTime.SECONDS): 202 return self.func("TIMESTAMP_SECONDS", timestamp) 203 if scale == exp.UnixToTime.MILLIS: 204 return self.func("TIMESTAMP_MILLIS", timestamp) 205 if scale == exp.UnixToTime.MICROS: 206 return self.func("TIMESTAMP_MICROS", timestamp) 207 208 unix_seconds = exp.cast( 209 exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT 210 ) 211 return self.func("TIMESTAMP_SECONDS", unix_seconds) 212 213 214def _build_time(args: t.List) -> exp.Func: 215 if len(args) == 1: 216 return 
exp.TsOrDsToTime(this=args[0]) 217 if len(args) == 2: 218 return exp.Time.from_arg_list(args) 219 return exp.TimeFromParts.from_arg_list(args) 220 221 222def _build_datetime(args: t.List) -> exp.Func: 223 if len(args) == 1: 224 return exp.TsOrDsToDatetime.from_arg_list(args) 225 if len(args) == 2: 226 return exp.Datetime.from_arg_list(args) 227 return exp.TimestampFromParts.from_arg_list(args) 228 229 230def _build_regexp_extract( 231 expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None 232) -> t.Callable[[t.List], E]: 233 def _builder(args: t.List) -> E: 234 try: 235 group = re.compile(args[1].name).groups == 1 236 except re.error: 237 group = False 238 239 # Default group is used for the transpilation of REGEXP_EXTRACT_ALL 240 return expr_type( 241 this=seq_get(args, 0), 242 expression=seq_get(args, 1), 243 position=seq_get(args, 2), 244 occurrence=seq_get(args, 3), 245 group=exp.Literal.number(1) if group else default_group, 246 ) 247 248 return _builder 249 250 251def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]: 252 def _builder(args: t.List, dialect: Dialect) -> E: 253 if len(args) == 1: 254 # The default value for the JSONPath is '$' i.e all of the data 255 args.append(exp.Literal.string("$")) 256 return parser.build_extract_json_with_path(expr_type)(args, dialect) 257 258 return _builder 259 260 261def _str_to_datetime_sql( 262 self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime 263) -> str: 264 this = self.sql(expression, "this") 265 dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP" 266 267 if expression.args.get("safe"): 268 fmt = self.format_time( 269 expression, 270 self.dialect.INVERSE_FORMAT_MAPPING, 271 self.dialect.INVERSE_FORMAT_TRIE, 272 ) 273 return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})" 274 275 fmt = self.format_time(expression) 276 return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone")) 277 278 279def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E: 280 """ 281 Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention: 282 +---------+---------+---------+------------+---------+ 283 | INPUT | INT64 | NUMERIC | BIGNUMERIC | FLOAT64 | 284 +---------+---------+---------+------------+---------+ 285 | OUTPUT | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 | 286 +---------+---------+---------+------------+---------+ 287 """ 288 self._annotate_args(expression) 289 290 this: exp.Expression = expression.this 291 292 self._set_type( 293 expression, 294 exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type, 295 ) 296 return expression 297 298 299@unsupported_args("ins_cost", "del_cost", "sub_cost") 300def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str: 301 max_dist = expression.args.get("max_dist") 302 if max_dist: 303 max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist) 304 305 return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist) 306 307 308def _build_levenshtein(args: t.List) -> exp.Levenshtein: 309 max_dist = seq_get(args, 2) 310 return exp.Levenshtein( 311 this=seq_get(args, 0), 312 expression=seq_get(args, 1), 313 max_dist=max_dist.expression if max_dist else None, 314 ) 315 316 317def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]: 318 def _builder(args: t.List) -> exp.TimeToStr: 319 return exp.TimeToStr( 320 this=expr_type(this=seq_get(args, 1)), 
321 format=seq_get(args, 0), 322 zone=seq_get(args, 2), 323 ) 324 325 return _builder 326 327 328def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous: 329 if len(args) == 3: 330 return exp.Anonymous(this="CONTAINS_SUBSTR", expressions=args) 331 332 # Lowercase the operands in case of transpilation, as exp.Contains 333 # is case-sensitive on other dialects 334 this = exp.Lower(this=seq_get(args, 0)) 335 expr = exp.Lower(this=seq_get(args, 1)) 336 337 return exp.Contains(this=this, expression=expr) 338 339 340def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str: 341 name = (expression._meta and expression.meta.get("name")) or expression.sql_name() 342 upper = name.upper() 343 344 dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS 345 346 if dquote_escaping: 347 self._quote_json_path_key_using_brackets = False 348 349 sql = rename_func(upper)(self, expression) 350 351 if dquote_escaping: 352 self._quote_json_path_key_using_brackets = True 353 354 return sql 355 356 357def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat: 358 annotated = self._annotate_by_args(expression, "expressions") 359 360 # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING 361 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat 362 if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN): 363 annotated.type = exp.DataType.Type.VARCHAR 364 365 return annotated 366 367 368class BigQuery(Dialect): 369 WEEK_OFFSET = -1 370 UNNEST_COLUMN_ONLY = True 371 SUPPORTS_USER_DEFINED_TYPES = False 372 SUPPORTS_SEMI_ANTI_JOIN = False 373 LOG_BASE_FIRST = False 374 HEX_LOWERCASE = True 375 FORCE_EARLY_ALIAS_REF_EXPANSION = True 376 PRESERVE_ORIGINAL_NAMES = True 377 HEX_STRING_IS_INTEGER_TYPE = True 378 379 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 380 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 381 382 # bigquery udfs are case sensitive 383 NORMALIZE_FUNCTIONS = False 384 385 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 386 TIME_MAPPING = { 387 "%D": "%m/%d/%y", 388 "%E6S": "%S.%f", 389 "%e": "%-d", 390 } 391 392 FORMAT_MAPPING = { 393 "DD": "%d", 394 "MM": "%m", 395 "MON": "%b", 396 "MONTH": "%B", 397 "YYYY": "%Y", 398 "YY": "%y", 399 "HH": "%I", 400 "HH12": "%I", 401 "HH24": "%H", 402 "MI": "%M", 403 "SS": "%S", 404 "SSSSS": "%f", 405 "TZH": "%z", 406 } 407 408 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 409 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 410 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"} 411 412 # All set operations require either a DISTINCT or ALL specifier 413 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 414 415 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 416 TYPE_TO_EXPRESSIONS = { 417 **Dialect.TYPE_TO_EXPRESSIONS, 418 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 419 } 420 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 421 422 ANNOTATORS = { 423 **Dialect.ANNOTATORS, 424 **{ 425 expr_type: annotate_with_type_lambda(data_type) 426 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 427 for expr_type in expressions 428 }, 429 **{ 430 expr_type: 
lambda self, e: _annotate_math_functions(self, e) 431 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 432 }, 433 **{ 434 expr_type: lambda self, e: self._annotate_by_args(e, "this") 435 for expr_type in ( 436 exp.Left, 437 exp.Right, 438 exp.Lower, 439 exp.Upper, 440 exp.Pad, 441 exp.Trim, 442 exp.RegexpExtract, 443 exp.RegexpReplace, 444 exp.Repeat, 445 exp.Substring, 446 ) 447 }, 448 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 449 exp.Concat: _annotate_concat, 450 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 451 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 452 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 453 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 454 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 455 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 456 exp.Sign: lambda self, e: self._annotate_by_args(e, "this"), 457 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 458 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 459 e, exp.DataType.Type.DATETIME 460 ), 461 } 462 463 def normalize_identifier(self, expression: E) -> E: 464 if ( 465 isinstance(expression, exp.Identifier) 466 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 467 ): 468 parent = expression.parent 469 while isinstance(parent, exp.Dot): 470 parent = parent.parent 471 472 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 473 # by default. The following check uses a heuristic to detect tables based on whether 474 # they are qualified. This should generally be correct, because tables in BigQuery 475 # must be qualified with at least a dataset, unless @@dataset_id is set. 
476 case_sensitive = ( 477 isinstance(parent, exp.UserDefinedFunction) 478 or ( 479 isinstance(parent, exp.Table) 480 and parent.db 481 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 482 ) 483 or expression.meta.get("is_table") 484 ) 485 if not case_sensitive: 486 expression.set("this", expression.this.lower()) 487 488 return t.cast(E, expression) 489 490 return super().normalize_identifier(expression) 491 492 class Tokenizer(tokens.Tokenizer): 493 QUOTES = ["'", '"', '"""', "'''"] 494 COMMENTS = ["--", "#", ("/*", "*/")] 495 IDENTIFIERS = ["`"] 496 STRING_ESCAPES = ["\\"] 497 498 HEX_STRINGS = [("0x", ""), ("0X", "")] 499 500 BYTE_STRINGS = [ 501 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 502 ] 503 504 RAW_STRINGS = [ 505 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 506 ] 507 508 NESTED_COMMENTS = False 509 510 KEYWORDS = { 511 **tokens.Tokenizer.KEYWORDS, 512 "ANY TYPE": TokenType.VARIANT, 513 "BEGIN": TokenType.COMMAND, 514 "BEGIN TRANSACTION": TokenType.BEGIN, 515 "BYTEINT": TokenType.INT, 516 "BYTES": TokenType.BINARY, 517 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 518 "DATETIME": TokenType.TIMESTAMP, 519 "DECLARE": TokenType.COMMAND, 520 "ELSEIF": TokenType.COMMAND, 521 "EXCEPTION": TokenType.COMMAND, 522 "EXPORT": TokenType.EXPORT, 523 "FLOAT64": TokenType.DOUBLE, 524 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 525 "MODEL": TokenType.MODEL, 526 "NOT DETERMINISTIC": TokenType.VOLATILE, 527 "RECORD": TokenType.STRUCT, 528 "TIMESTAMP": TokenType.TIMESTAMPTZ, 529 } 530 KEYWORDS.pop("DIV") 531 KEYWORDS.pop("VALUES") 532 KEYWORDS.pop("/*+") 533 534 class Parser(parser.Parser): 535 PREFIXED_PIVOT_COLUMNS = True 536 LOG_DEFAULTS_TO_LN = True 537 SUPPORTS_IMPLICIT_UNNEST = True 538 JOINS_HAVE_EQUAL_PRECEDENCE = True 539 540 # BigQuery does not allow ASC/DESC to be used as an identifier 541 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 542 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 543 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 544 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 545 TokenType.ASC, 546 TokenType.DESC, 547 } 548 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 549 550 FUNCTIONS = { 551 **parser.Parser.FUNCTIONS, 552 "CONTAINS_SUBSTR": _build_contains_substring, 553 "DATE": _build_date, 554 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 555 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 556 "DATE_TRUNC": lambda args: exp.DateTrunc( 557 unit=seq_get(args, 1), 558 this=seq_get(args, 0), 559 zone=seq_get(args, 2), 560 ), 561 "DATETIME": _build_datetime, 562 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 563 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 564 "DIV": binary_from_function(exp.IntDiv), 565 "EDIT_DISTANCE": _build_levenshtein, 566 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 567 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 568 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 569 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 570 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 571 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 572 "JSON_VALUE": 
_build_extract_json_with_default_path(exp.JSONExtractScalar), 573 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 574 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 575 "MD5": exp.MD5Digest.from_arg_list, 576 "TO_HEX": _build_to_hex, 577 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 578 [seq_get(args, 1), seq_get(args, 0)] 579 ), 580 "PARSE_TIMESTAMP": _build_parse_timestamp, 581 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 582 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 583 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 584 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 585 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 586 ), 587 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 588 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 589 "SPLIT": lambda args: exp.Split( 590 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 591 this=seq_get(args, 0), 592 expression=seq_get(args, 1) or exp.Literal.string(","), 593 ), 594 "STRPOS": exp.StrPosition.from_arg_list, 595 "TIME": _build_time, 596 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 597 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 598 "TIMESTAMP": _build_timestamp, 599 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 600 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 601 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 602 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 603 ), 604 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 605 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 606 ), 607 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 608 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 609 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 610 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 611 } 612 613 FUNCTION_PARSERS = { 614 **parser.Parser.FUNCTION_PARSERS, 615 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 616 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 617 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 618 } 619 FUNCTION_PARSERS.pop("TRIM") 620 621 NO_PAREN_FUNCTIONS = { 622 **parser.Parser.NO_PAREN_FUNCTIONS, 623 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 624 } 625 626 NESTED_TYPE_TOKENS = { 627 *parser.Parser.NESTED_TYPE_TOKENS, 628 TokenType.TABLE, 629 } 630 631 PROPERTY_PARSERS = { 632 **parser.Parser.PROPERTY_PARSERS, 633 "NOT DETERMINISTIC": lambda self: self.expression( 634 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 635 ), 636 "OPTIONS": lambda self: self._parse_with_property(), 637 } 638 639 CONSTRAINT_PARSERS = { 640 **parser.Parser.CONSTRAINT_PARSERS, 641 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 642 } 643 644 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 645 RANGE_PARSERS.pop(TokenType.OVERLAPS) 646 647 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 648 649 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 650 651 STATEMENT_PARSERS = { 652 **parser.Parser.STATEMENT_PARSERS, 653 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 654 TokenType.END: lambda self: self._parse_as_command(self._prev), 655 TokenType.FOR: lambda self: self._parse_for_in(), 656 TokenType.EXPORT: lambda 
self: self._parse_export_data(), 657 } 658 659 BRACKET_OFFSETS = { 660 "OFFSET": (0, False), 661 "ORDINAL": (1, False), 662 "SAFE_OFFSET": (0, True), 663 "SAFE_ORDINAL": (1, True), 664 } 665 666 def _parse_for_in(self) -> exp.ForIn: 667 this = self._parse_range() 668 self._match_text_seq("DO") 669 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 670 671 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 672 this = super()._parse_table_part(schema=schema) or self._parse_number() 673 674 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 675 if isinstance(this, exp.Identifier): 676 table_name = this.name 677 while self._match(TokenType.DASH, advance=False) and self._next: 678 start = self._curr 679 while self._is_connected() and not self._match_set( 680 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 681 ): 682 self._advance() 683 684 if start == self._curr: 685 break 686 687 table_name += self._find_sql(start, self._prev) 688 689 this = exp.Identifier( 690 this=table_name, quoted=this.args.get("quoted") 691 ).update_positions(this) 692 elif isinstance(this, exp.Literal): 693 table_name = this.name 694 695 if self._is_connected() and self._parse_var(any_token=True): 696 table_name += self._prev.text 697 698 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 699 700 return this 701 702 def _parse_table_parts( 703 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 704 ) -> exp.Table: 705 table = super()._parse_table_parts( 706 schema=schema, is_db_reference=is_db_reference, wildcard=True 707 ) 708 709 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 710 if not table.catalog: 711 if table.db: 712 previous_db = table.args["db"] 713 parts = table.db.split(".") 714 if len(parts) == 2 and not table.args["db"].quoted: 715 table.set( 716 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 717 ) 718 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 719 else: 720 previous_this = table.this 721 parts = table.name.split(".") 722 if len(parts) == 2 and not table.this.quoted: 723 table.set( 724 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 725 ) 726 table.set( 727 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 728 ) 729 730 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 731 alias = table.this 732 catalog, db, this, *rest = ( 733 exp.to_identifier(p, quoted=True) 734 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 735 ) 736 737 for part in (catalog, db, this): 738 if part: 739 part.update_positions(table.this) 740 741 if rest and this: 742 this = exp.Dot.build([this, *rest]) # type: ignore 743 744 table = exp.Table( 745 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 746 ) 747 table.meta["quoted_table"] = True 748 else: 749 alias = None 750 751 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 752 # dataset, so if the project identifier is omitted we need to fix the ast so that 753 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 754 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 755 # views, because it would seem like the "catalog" part is set, when it'd actually 756 # be the region/dataset. 
Merging the two identifiers into a single one is done to 757 # avoid producing a 4-part Table reference, which would cause issues in the schema 758 # module, when there are 3-part table names mixed with information schema views. 759 # 760 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 761 table_parts = table.parts 762 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 763 # We need to alias the table here to avoid breaking existing qualified columns. 764 # This is expected to be safe, because if there's an actual alias coming up in 765 # the token stream, it will overwrite this one. If there isn't one, we are only 766 # exposing the name that can be used to reference the view explicitly (a no-op). 767 exp.alias_( 768 table, 769 t.cast(exp.Identifier, alias or table_parts[-1]), 770 table=True, 771 copy=False, 772 ) 773 774 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 775 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 776 line=table_parts[-2].meta.get("line"), 777 col=table_parts[-1].meta.get("col"), 778 start=table_parts[-2].meta.get("start"), 779 end=table_parts[-1].meta.get("end"), 780 ) 781 table.set("this", new_this) 782 table.set("db", seq_get(table_parts, -3)) 783 table.set("catalog", seq_get(table_parts, -4)) 784 785 return table 786 787 def _parse_column(self) -> t.Optional[exp.Expression]: 788 column = super()._parse_column() 789 if isinstance(column, exp.Column): 790 parts = column.parts 791 if any("." in p.name for p in parts): 792 catalog, db, table, this, *rest = ( 793 exp.to_identifier(p, quoted=True) 794 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 795 ) 796 797 if rest and this: 798 this = exp.Dot.build([this, *rest]) # type: ignore 799 800 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 801 column.meta["quoted_column"] = True 802 803 return column 804 805 @t.overload 806 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 807 808 @t.overload 809 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 
810 811 def _parse_json_object(self, agg=False): 812 json_object = super()._parse_json_object() 813 array_kv_pair = seq_get(json_object.expressions, 0) 814 815 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 816 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 817 if ( 818 array_kv_pair 819 and isinstance(array_kv_pair.this, exp.Array) 820 and isinstance(array_kv_pair.expression, exp.Array) 821 ): 822 keys = array_kv_pair.this.expressions 823 values = array_kv_pair.expression.expressions 824 825 json_object.set( 826 "expressions", 827 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 828 ) 829 830 return json_object 831 832 def _parse_bracket( 833 self, this: t.Optional[exp.Expression] = None 834 ) -> t.Optional[exp.Expression]: 835 bracket = super()._parse_bracket(this) 836 837 if this is bracket: 838 return bracket 839 840 if isinstance(bracket, exp.Bracket): 841 for expression in bracket.expressions: 842 name = expression.name.upper() 843 844 if name not in self.BRACKET_OFFSETS: 845 break 846 847 offset, safe = self.BRACKET_OFFSETS[name] 848 bracket.set("offset", offset) 849 bracket.set("safe", safe) 850 expression.replace(expression.expressions[0]) 851 852 return bracket 853 854 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 855 unnest = super()._parse_unnest(with_alias=with_alias) 856 857 if not unnest: 858 return None 859 860 unnest_expr = seq_get(unnest.expressions, 0) 861 if unnest_expr: 862 from sqlglot.optimizer.annotate_types import annotate_types 863 864 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 865 866 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 867 # in contrast to other dialects such as DuckDB which flattens only the array by default 868 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 869 array_elem.is_type(exp.DataType.Type.STRUCT) 870 for array_elem in unnest_expr._type.expressions 871 ): 872 unnest.set("explode_array", True) 873 874 return unnest 875 876 def _parse_make_interval(self) -> exp.MakeInterval: 877 expr = exp.MakeInterval() 878 879 for arg_key in expr.arg_types: 880 value = self._parse_lambda() 881 882 if not value: 883 break 884 885 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 886 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 887 if isinstance(value, exp.Kwarg): 888 arg_key = value.this.name 889 890 expr.set(arg_key, value) 891 892 self._match(TokenType.COMMA) 893 894 return expr 895 896 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 897 expr = self.expression( 898 exp.FeaturesAtTime, 899 this=(self._match(TokenType.TABLE) and self._parse_table()) 900 or self._parse_select(nested=True), 901 ) 902 903 while self._match(TokenType.COMMA): 904 arg = self._parse_lambda() 905 906 # Get the LHS of the Kwarg and set the arg to that value, e.g 907 # "num_rows => 1" sets the expr's `num_rows` arg 908 if arg: 909 expr.set(arg.this.name, arg) 910 911 return expr 912 913 def _parse_export_data(self) -> exp.Export: 914 self._match_text_seq("DATA") 915 916 return self.expression( 917 exp.Export, 918 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 919 options=self._parse_properties(), 920 this=self._match_text_seq("AS") and self._parse_select(), 921 ) 922 923 class Generator(generator.Generator): 924 INTERVAL_ALLOWS_PLURAL_FORM = False 925 
JOIN_HINTS = False 926 QUERY_HINTS = False 927 TABLE_HINTS = False 928 LIMIT_FETCH = "LIMIT" 929 RENAME_TABLE_WITH_DB = False 930 NVL2_SUPPORTED = False 931 UNNEST_WITH_ORDINALITY = False 932 COLLATE_IS_FUNC = True 933 LIMIT_ONLY_LITERALS = True 934 SUPPORTS_TABLE_ALIAS_COLUMNS = False 935 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 936 JSON_KEY_VALUE_PAIR_SEP = "," 937 NULL_ORDERING_SUPPORTED = False 938 IGNORE_NULLS_IN_FUNC = True 939 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 940 CAN_IMPLEMENT_ARRAY_ANY = True 941 SUPPORTS_TO_NUMBER = False 942 NAMED_PLACEHOLDER_TOKEN = "@" 943 HEX_FUNC = "TO_HEX" 944 WITH_PROPERTIES_PREFIX = "OPTIONS" 945 SUPPORTS_EXPLODING_PROJECTIONS = False 946 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 947 SUPPORTS_UNIX_SECONDS = True 948 949 TRANSFORMS = { 950 **generator.Generator.TRANSFORMS, 951 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 952 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 953 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 954 exp.Array: inline_array_unless_query, 955 exp.ArrayContains: _array_contains_sql, 956 exp.ArrayFilter: filter_array_using_unnest, 957 exp.ArrayRemove: filter_array_using_unnest, 958 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 959 exp.CollateProperty: lambda self, e: ( 960 f"DEFAULT COLLATE {self.sql(e, 'this')}" 961 if e.args.get("default") 962 else f"COLLATE {self.sql(e, 'this')}" 963 ), 964 exp.Commit: lambda *_: "COMMIT TRANSACTION", 965 exp.CountIf: rename_func("COUNTIF"), 966 exp.Create: _create_sql, 967 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 968 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 969 exp.DateDiff: lambda self, e: self.func( 970 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 971 ), 972 exp.DateFromParts: rename_func("DATE"), 973 exp.DateStrToDate: datestrtodate_sql, 974 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 975 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 976 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 977 exp.FromTimeZone: lambda self, e: self.func( 978 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 979 ), 980 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 981 exp.GroupConcat: lambda self, e: groupconcat_sql( 982 self, e, func_name="STRING_AGG", within_group=False 983 ), 984 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 985 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 986 exp.If: if_sql(false_value="NULL"), 987 exp.ILike: no_ilike_sql, 988 exp.IntDiv: rename_func("DIV"), 989 exp.Int64: rename_func("INT64"), 990 exp.JSONExtract: _json_extract_sql, 991 exp.JSONExtractArray: _json_extract_sql, 992 exp.JSONExtractScalar: _json_extract_sql, 993 exp.JSONFormat: rename_func("TO_JSON_STRING"), 994 exp.Levenshtein: _levenshtein_sql, 995 exp.Max: max_or_greatest, 996 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 997 exp.MD5Digest: rename_func("MD5"), 998 exp.Min: min_or_least, 999 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1000 exp.RegexpExtract: lambda self, e: self.func( 1001 "REGEXP_EXTRACT", 1002 e.this, 1003 e.expression, 1004 e.args.get("position"), 1005 e.args.get("occurrence"), 1006 ), 1007 exp.RegexpExtractAll: lambda self, e: self.func( 1008 "REGEXP_EXTRACT_ALL", e.this, e.expression 1009 ), 1010 exp.RegexpReplace: regexp_replace_sql, 1011 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1012 exp.ReturnsProperty: 
_returnsproperty_sql, 1013 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1014 exp.Select: transforms.preprocess( 1015 [ 1016 transforms.explode_projection_to_unnest(), 1017 transforms.unqualify_unnest, 1018 transforms.eliminate_distinct_on, 1019 _alias_ordered_group, 1020 transforms.eliminate_semi_and_anti_joins, 1021 ] 1022 ), 1023 exp.SHA: rename_func("SHA1"), 1024 exp.SHA2: sha256_sql, 1025 exp.Space: space_sql, 1026 exp.StabilityProperty: lambda self, e: ( 1027 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1028 ), 1029 exp.String: rename_func("STRING"), 1030 exp.StrPosition: lambda self, e: ( 1031 strposition_sql( 1032 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1033 ) 1034 ), 1035 exp.StrToDate: _str_to_datetime_sql, 1036 exp.StrToTime: _str_to_datetime_sql, 1037 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1038 exp.TimeFromParts: rename_func("TIME"), 1039 exp.TimestampFromParts: rename_func("DATETIME"), 1040 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1041 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1042 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1043 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1044 exp.TimeStrToTime: timestrtotime_sql, 1045 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1046 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1047 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1048 exp.TsOrDsToTime: rename_func("TIME"), 1049 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1050 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1051 exp.Unhex: rename_func("FROM_HEX"), 1052 exp.UnixDate: rename_func("UNIX_DATE"), 1053 exp.UnixToTime: _unix_to_time_sql, 1054 exp.Uuid: lambda *_: "GENERATE_UUID()", 1055 exp.Values: _derived_table_values_to_unnest, 1056 exp.VariancePop: rename_func("VAR_POP"), 1057 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1058 } 1059 1060 SUPPORTED_JSON_PATH_PARTS = { 1061 exp.JSONPathKey, 1062 exp.JSONPathRoot, 1063 exp.JSONPathSubscript, 1064 } 1065 1066 TYPE_MAPPING = { 1067 **generator.Generator.TYPE_MAPPING, 1068 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1069 exp.DataType.Type.BIGINT: "INT64", 1070 exp.DataType.Type.BINARY: "BYTES", 1071 exp.DataType.Type.BLOB: "BYTES", 1072 exp.DataType.Type.BOOLEAN: "BOOL", 1073 exp.DataType.Type.CHAR: "STRING", 1074 exp.DataType.Type.DECIMAL: "NUMERIC", 1075 exp.DataType.Type.DOUBLE: "FLOAT64", 1076 exp.DataType.Type.FLOAT: "FLOAT64", 1077 exp.DataType.Type.INT: "INT64", 1078 exp.DataType.Type.NCHAR: "STRING", 1079 exp.DataType.Type.NVARCHAR: "STRING", 1080 exp.DataType.Type.SMALLINT: "INT64", 1081 exp.DataType.Type.TEXT: "STRING", 1082 exp.DataType.Type.TIMESTAMP: "DATETIME", 1083 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1084 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1085 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1086 exp.DataType.Type.TINYINT: "INT64", 1087 exp.DataType.Type.ROWVERSION: "BYTES", 1088 exp.DataType.Type.UUID: "STRING", 1089 exp.DataType.Type.VARBINARY: "BYTES", 1090 exp.DataType.Type.VARCHAR: "STRING", 1091 exp.DataType.Type.VARIANT: "ANY TYPE", 1092 } 1093 1094 PROPERTIES_LOCATION = { 1095 **generator.Generator.PROPERTIES_LOCATION, 1096 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1097 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1098 } 1099 1100 # WINDOW comes after QUALIFY 1101 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1102 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1103 "qualify": 
generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1104 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1105 } 1106 1107 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1108 RESERVED_KEYWORDS = { 1109 "all", 1110 "and", 1111 "any", 1112 "array", 1113 "as", 1114 "asc", 1115 "assert_rows_modified", 1116 "at", 1117 "between", 1118 "by", 1119 "case", 1120 "cast", 1121 "collate", 1122 "contains", 1123 "create", 1124 "cross", 1125 "cube", 1126 "current", 1127 "default", 1128 "define", 1129 "desc", 1130 "distinct", 1131 "else", 1132 "end", 1133 "enum", 1134 "escape", 1135 "except", 1136 "exclude", 1137 "exists", 1138 "extract", 1139 "false", 1140 "fetch", 1141 "following", 1142 "for", 1143 "from", 1144 "full", 1145 "group", 1146 "grouping", 1147 "groups", 1148 "hash", 1149 "having", 1150 "if", 1151 "ignore", 1152 "in", 1153 "inner", 1154 "intersect", 1155 "interval", 1156 "into", 1157 "is", 1158 "join", 1159 "lateral", 1160 "left", 1161 "like", 1162 "limit", 1163 "lookup", 1164 "merge", 1165 "natural", 1166 "new", 1167 "no", 1168 "not", 1169 "null", 1170 "nulls", 1171 "of", 1172 "on", 1173 "or", 1174 "order", 1175 "outer", 1176 "over", 1177 "partition", 1178 "preceding", 1179 "proto", 1180 "qualify", 1181 "range", 1182 "recursive", 1183 "respect", 1184 "right", 1185 "rollup", 1186 "rows", 1187 "select", 1188 "set", 1189 "some", 1190 "struct", 1191 "tablesample", 1192 "then", 1193 "to", 1194 "treat", 1195 "true", 1196 "unbounded", 1197 "union", 1198 "unnest", 1199 "using", 1200 "when", 1201 "where", 1202 "window", 1203 "with", 1204 "within", 1205 } 1206 1207 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1208 unit = expression.unit 1209 unit_sql = unit.name if unit.is_string else self.sql(unit) 1210 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1211 1212 def mod_sql(self, expression: exp.Mod) -> str: 1213 this = expression.this 1214 expr = expression.expression 1215 return self.func( 1216 "MOD", 1217 this.unnest() if isinstance(this, exp.Paren) else this, 1218 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1219 ) 1220 1221 def column_parts(self, expression: exp.Column) -> str: 1222 if expression.meta.get("quoted_column"): 1223 # If a column reference is of the form `dataset.table`.name, we need 1224 # to preserve the quoted table path, otherwise the reference breaks 1225 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1226 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1227 return f"{table_path}.{self.sql(expression, 'this')}" 1228 1229 return super().column_parts(expression) 1230 1231 def table_parts(self, expression: exp.Table) -> str: 1232 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1233 # we need to make sure the correct quoting is used in each case. 
1234 # 1235 # For example, if there is a CTE x that clashes with a schema name, then the former will 1236 # return the table y in that schema, whereas the latter will return the CTE's y column: 1237 # 1238 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1239 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1240 if expression.meta.get("quoted_table"): 1241 table_parts = ".".join(p.name for p in expression.parts) 1242 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1243 1244 return super().table_parts(expression) 1245 1246 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1247 this = expression.this 1248 if isinstance(this, exp.TsOrDsToDatetime): 1249 func_name = "FORMAT_DATETIME" 1250 elif isinstance(this, exp.TsOrDsToTimestamp): 1251 func_name = "FORMAT_TIMESTAMP" 1252 else: 1253 func_name = "FORMAT_DATE" 1254 1255 time_expr = ( 1256 this 1257 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1258 else expression 1259 ) 1260 return self.func( 1261 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1262 ) 1263 1264 def eq_sql(self, expression: exp.EQ) -> str: 1265 # Operands of = cannot be NULL in BigQuery 1266 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1267 if not isinstance(expression.parent, exp.Update): 1268 return "NULL" 1269 1270 return self.binary(expression, "=") 1271 1272 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1273 parent = expression.parent 1274 1275 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1276 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1277 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1278 return self.func( 1279 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1280 ) 1281 1282 return super().attimezone_sql(expression) 1283 1284 def trycast_sql(self, expression: exp.TryCast) -> str: 1285 return self.cast_sql(expression, safe_prefix="SAFE_") 1286 1287 def bracket_sql(self, expression: exp.Bracket) -> str: 1288 this = expression.this 1289 expressions = expression.expressions 1290 1291 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1292 arg = expressions[0] 1293 if arg.type is None: 1294 from sqlglot.optimizer.annotate_types import annotate_types 1295 1296 arg = annotate_types(arg, dialect=self.dialect) 1297 1298 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1299 # BQ doesn't support bracket syntax with string values for structs 1300 return f"{self.sql(this)}.{arg.name}" 1301 1302 expressions_sql = self.expressions(expression, flat=True) 1303 offset = expression.args.get("offset") 1304 1305 if offset == 0: 1306 expressions_sql = f"OFFSET({expressions_sql})" 1307 elif offset == 1: 1308 expressions_sql = f"ORDINAL({expressions_sql})" 1309 elif offset is not None: 1310 self.unsupported(f"Unsupported array offset: {offset}") 1311 1312 if expression.args.get("safe"): 1313 expressions_sql = f"SAFE_{expressions_sql}" 1314 1315 return f"{self.sql(this)}[{expressions_sql}]" 1316 1317 def in_unnest_op(self, expression: exp.Unnest) -> str: 1318 return self.sql(expression) 1319 1320 def version_sql(self, expression: exp.Version) -> str: 1321 if expression.name == "TIMESTAMP": 1322 expression.set("this", "SYSTEM_TIME") 1323 return super().version_sql(expression) 1324 1325 def contains_sql(self, 
expression: exp.Contains) -> str: 1326 this = expression.this 1327 expr = expression.expression 1328 1329 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1330 this = this.this 1331 expr = expr.this 1332 1333 return self.func("CONTAINS_SUBSTR", this, expr) 1334 1335 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1336 this = expression.this 1337 1338 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1339 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1340 # because they aren't literals and so the above syntax is invalid BigQuery. 1341 if isinstance(this, exp.Array): 1342 elem = seq_get(this.expressions, 0) 1343 if not (elem and elem.find(exp.Query)): 1344 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1345 1346 return super().cast_sql(expression, safe_prefix=safe_prefix)
369class BigQuery(Dialect): 370 WEEK_OFFSET = -1 371 UNNEST_COLUMN_ONLY = True 372 SUPPORTS_USER_DEFINED_TYPES = False 373 SUPPORTS_SEMI_ANTI_JOIN = False 374 LOG_BASE_FIRST = False 375 HEX_LOWERCASE = True 376 FORCE_EARLY_ALIAS_REF_EXPANSION = True 377 PRESERVE_ORIGINAL_NAMES = True 378 HEX_STRING_IS_INTEGER_TYPE = True 379 380 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 381 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 382 383 # bigquery udfs are case sensitive 384 NORMALIZE_FUNCTIONS = False 385 386 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 387 TIME_MAPPING = { 388 "%D": "%m/%d/%y", 389 "%E6S": "%S.%f", 390 "%e": "%-d", 391 } 392 393 FORMAT_MAPPING = { 394 "DD": "%d", 395 "MM": "%m", 396 "MON": "%b", 397 "MONTH": "%B", 398 "YYYY": "%Y", 399 "YY": "%y", 400 "HH": "%I", 401 "HH12": "%I", 402 "HH24": "%H", 403 "MI": "%M", 404 "SS": "%S", 405 "SSSSS": "%f", 406 "TZH": "%z", 407 } 408 409 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 410 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 411 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"} 412 413 # All set operations require either a DISTINCT or ALL specifier 414 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 415 416 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 417 TYPE_TO_EXPRESSIONS = { 418 **Dialect.TYPE_TO_EXPRESSIONS, 419 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 420 } 421 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 422 423 ANNOTATORS = { 424 **Dialect.ANNOTATORS, 425 **{ 426 expr_type: annotate_with_type_lambda(data_type) 427 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 428 for expr_type in expressions 429 }, 430 **{ 431 expr_type: lambda self, e: _annotate_math_functions(self, e) 432 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 433 }, 434 **{ 435 expr_type: lambda self, e: self._annotate_by_args(e, "this") 436 for expr_type in ( 437 exp.Left, 438 exp.Right, 439 exp.Lower, 440 exp.Upper, 441 exp.Pad, 442 exp.Trim, 443 exp.RegexpExtract, 444 exp.RegexpReplace, 445 exp.Repeat, 446 exp.Substring, 447 ) 448 }, 449 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 450 exp.Concat: _annotate_concat, 451 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 452 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 453 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 454 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 455 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 456 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 457 exp.Sign: lambda self, e: self._annotate_by_args(e, "this"), 458 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 459 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 460 e, exp.DataType.Type.DATETIME 461 ), 462 } 463 464 def normalize_identifier(self, expression: E) -> E: 465 if ( 466 isinstance(expression, exp.Identifier) 467 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 468 ): 469 parent = expression.parent 470 while 
isinstance(parent, exp.Dot): 471 parent = parent.parent 472 473 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 474 # by default. The following check uses a heuristic to detect tables based on whether 475 # they are qualified. This should generally be correct, because tables in BigQuery 476 # must be qualified with at least a dataset, unless @@dataset_id is set. 477 case_sensitive = ( 478 isinstance(parent, exp.UserDefinedFunction) 479 or ( 480 isinstance(parent, exp.Table) 481 and parent.db 482 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 483 ) 484 or expression.meta.get("is_table") 485 ) 486 if not case_sensitive: 487 expression.set("this", expression.this.lower()) 488 489 return t.cast(E, expression) 490 491 return super().normalize_identifier(expression) 492 493 class Tokenizer(tokens.Tokenizer): 494 QUOTES = ["'", '"', '"""', "'''"] 495 COMMENTS = ["--", "#", ("/*", "*/")] 496 IDENTIFIERS = ["`"] 497 STRING_ESCAPES = ["\\"] 498 499 HEX_STRINGS = [("0x", ""), ("0X", "")] 500 501 BYTE_STRINGS = [ 502 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 503 ] 504 505 RAW_STRINGS = [ 506 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 507 ] 508 509 NESTED_COMMENTS = False 510 511 KEYWORDS = { 512 **tokens.Tokenizer.KEYWORDS, 513 "ANY TYPE": TokenType.VARIANT, 514 "BEGIN": TokenType.COMMAND, 515 "BEGIN TRANSACTION": TokenType.BEGIN, 516 "BYTEINT": TokenType.INT, 517 "BYTES": TokenType.BINARY, 518 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 519 "DATETIME": TokenType.TIMESTAMP, 520 "DECLARE": TokenType.COMMAND, 521 "ELSEIF": TokenType.COMMAND, 522 "EXCEPTION": TokenType.COMMAND, 523 "EXPORT": TokenType.EXPORT, 524 "FLOAT64": TokenType.DOUBLE, 525 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 526 "MODEL": TokenType.MODEL, 527 "NOT DETERMINISTIC": TokenType.VOLATILE, 528 "RECORD": TokenType.STRUCT, 529 "TIMESTAMP": TokenType.TIMESTAMPTZ, 530 } 531 KEYWORDS.pop("DIV") 532 KEYWORDS.pop("VALUES") 533 KEYWORDS.pop("/*+") 534 535 class Parser(parser.Parser): 536 PREFIXED_PIVOT_COLUMNS = True 537 LOG_DEFAULTS_TO_LN = True 538 SUPPORTS_IMPLICIT_UNNEST = True 539 JOINS_HAVE_EQUAL_PRECEDENCE = True 540 541 # BigQuery does not allow ASC/DESC to be used as an identifier 542 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 543 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 544 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 545 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 546 TokenType.ASC, 547 TokenType.DESC, 548 } 549 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 550 551 FUNCTIONS = { 552 **parser.Parser.FUNCTIONS, 553 "CONTAINS_SUBSTR": _build_contains_substring, 554 "DATE": _build_date, 555 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 556 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 557 "DATE_TRUNC": lambda args: exp.DateTrunc( 558 unit=seq_get(args, 1), 559 this=seq_get(args, 0), 560 zone=seq_get(args, 2), 561 ), 562 "DATETIME": _build_datetime, 563 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 564 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 565 "DIV": binary_from_function(exp.IntDiv), 566 "EDIT_DISTANCE": _build_levenshtein, 567 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 568 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 569 
"JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 570 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 571 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 572 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 573 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 574 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 575 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 576 "MD5": exp.MD5Digest.from_arg_list, 577 "TO_HEX": _build_to_hex, 578 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 579 [seq_get(args, 1), seq_get(args, 0)] 580 ), 581 "PARSE_TIMESTAMP": _build_parse_timestamp, 582 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 583 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 584 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 585 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 586 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 587 ), 588 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 589 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 590 "SPLIT": lambda args: exp.Split( 591 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 592 this=seq_get(args, 0), 593 expression=seq_get(args, 1) or exp.Literal.string(","), 594 ), 595 "STRPOS": exp.StrPosition.from_arg_list, 596 "TIME": _build_time, 597 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 598 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 599 "TIMESTAMP": _build_timestamp, 600 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 601 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 602 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 603 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 604 ), 605 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 606 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 607 ), 608 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 609 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 610 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 611 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 612 } 613 614 FUNCTION_PARSERS = { 615 **parser.Parser.FUNCTION_PARSERS, 616 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 617 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 618 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 619 } 620 FUNCTION_PARSERS.pop("TRIM") 621 622 NO_PAREN_FUNCTIONS = { 623 **parser.Parser.NO_PAREN_FUNCTIONS, 624 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 625 } 626 627 NESTED_TYPE_TOKENS = { 628 *parser.Parser.NESTED_TYPE_TOKENS, 629 TokenType.TABLE, 630 } 631 632 PROPERTY_PARSERS = { 633 **parser.Parser.PROPERTY_PARSERS, 634 "NOT DETERMINISTIC": lambda self: self.expression( 635 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 636 ), 637 "OPTIONS": lambda self: self._parse_with_property(), 638 } 639 640 CONSTRAINT_PARSERS = { 641 **parser.Parser.CONSTRAINT_PARSERS, 642 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 643 } 644 645 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 646 RANGE_PARSERS.pop(TokenType.OVERLAPS) 647 648 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 649 650 DASHED_TABLE_PART_FOLLOW_TOKENS 
= {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 651 652 STATEMENT_PARSERS = { 653 **parser.Parser.STATEMENT_PARSERS, 654 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 655 TokenType.END: lambda self: self._parse_as_command(self._prev), 656 TokenType.FOR: lambda self: self._parse_for_in(), 657 TokenType.EXPORT: lambda self: self._parse_export_data(), 658 } 659 660 BRACKET_OFFSETS = { 661 "OFFSET": (0, False), 662 "ORDINAL": (1, False), 663 "SAFE_OFFSET": (0, True), 664 "SAFE_ORDINAL": (1, True), 665 } 666 667 def _parse_for_in(self) -> exp.ForIn: 668 this = self._parse_range() 669 self._match_text_seq("DO") 670 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 671 672 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 673 this = super()._parse_table_part(schema=schema) or self._parse_number() 674 675 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 676 if isinstance(this, exp.Identifier): 677 table_name = this.name 678 while self._match(TokenType.DASH, advance=False) and self._next: 679 start = self._curr 680 while self._is_connected() and not self._match_set( 681 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 682 ): 683 self._advance() 684 685 if start == self._curr: 686 break 687 688 table_name += self._find_sql(start, self._prev) 689 690 this = exp.Identifier( 691 this=table_name, quoted=this.args.get("quoted") 692 ).update_positions(this) 693 elif isinstance(this, exp.Literal): 694 table_name = this.name 695 696 if self._is_connected() and self._parse_var(any_token=True): 697 table_name += self._prev.text 698 699 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 700 701 return this 702 703 def _parse_table_parts( 704 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 705 ) -> exp.Table: 706 table = super()._parse_table_parts( 707 schema=schema, is_db_reference=is_db_reference, wildcard=True 708 ) 709 710 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 711 if not table.catalog: 712 if table.db: 713 previous_db = table.args["db"] 714 parts = table.db.split(".") 715 if len(parts) == 2 and not table.args["db"].quoted: 716 table.set( 717 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 718 ) 719 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 720 else: 721 previous_this = table.this 722 parts = table.name.split(".") 723 if len(parts) == 2 and not table.this.quoted: 724 table.set( 725 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 726 ) 727 table.set( 728 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 729 ) 730 731 if isinstance(table.this, exp.Identifier) and any("." 
in p.name for p in table.parts): 732 alias = table.this 733 catalog, db, this, *rest = ( 734 exp.to_identifier(p, quoted=True) 735 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 736 ) 737 738 for part in (catalog, db, this): 739 if part: 740 part.update_positions(table.this) 741 742 if rest and this: 743 this = exp.Dot.build([this, *rest]) # type: ignore 744 745 table = exp.Table( 746 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 747 ) 748 table.meta["quoted_table"] = True 749 else: 750 alias = None 751 752 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 753 # dataset, so if the project identifier is omitted we need to fix the ast so that 754 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 755 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 756 # views, because it would seem like the "catalog" part is set, when it'd actually 757 # be the region/dataset. Merging the two identifiers into a single one is done to 758 # avoid producing a 4-part Table reference, which would cause issues in the schema 759 # module, when there are 3-part table names mixed with information schema views. 760 # 761 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 762 table_parts = table.parts 763 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 764 # We need to alias the table here to avoid breaking existing qualified columns. 765 # This is expected to be safe, because if there's an actual alias coming up in 766 # the token stream, it will overwrite this one. If there isn't one, we are only 767 # exposing the name that can be used to reference the view explicitly (a no-op). 768 exp.alias_( 769 table, 770 t.cast(exp.Identifier, alias or table_parts[-1]), 771 table=True, 772 copy=False, 773 ) 774 775 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 776 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 777 line=table_parts[-2].meta.get("line"), 778 col=table_parts[-1].meta.get("col"), 779 start=table_parts[-2].meta.get("start"), 780 end=table_parts[-1].meta.get("end"), 781 ) 782 table.set("this", new_this) 783 table.set("db", seq_get(table_parts, -3)) 784 table.set("catalog", seq_get(table_parts, -4)) 785 786 return table 787 788 def _parse_column(self) -> t.Optional[exp.Expression]: 789 column = super()._parse_column() 790 if isinstance(column, exp.Column): 791 parts = column.parts 792 if any("." in p.name for p in parts): 793 catalog, db, table, this, *rest = ( 794 exp.to_identifier(p, quoted=True) 795 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 796 ) 797 798 if rest and this: 799 this = exp.Dot.build([this, *rest]) # type: ignore 800 801 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 802 column.meta["quoted_column"] = True 803 804 return column 805 806 @t.overload 807 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 808 809 @t.overload 810 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 
811 812 def _parse_json_object(self, agg=False): 813 json_object = super()._parse_json_object() 814 array_kv_pair = seq_get(json_object.expressions, 0) 815 816 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 817 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 818 if ( 819 array_kv_pair 820 and isinstance(array_kv_pair.this, exp.Array) 821 and isinstance(array_kv_pair.expression, exp.Array) 822 ): 823 keys = array_kv_pair.this.expressions 824 values = array_kv_pair.expression.expressions 825 826 json_object.set( 827 "expressions", 828 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 829 ) 830 831 return json_object 832 833 def _parse_bracket( 834 self, this: t.Optional[exp.Expression] = None 835 ) -> t.Optional[exp.Expression]: 836 bracket = super()._parse_bracket(this) 837 838 if this is bracket: 839 return bracket 840 841 if isinstance(bracket, exp.Bracket): 842 for expression in bracket.expressions: 843 name = expression.name.upper() 844 845 if name not in self.BRACKET_OFFSETS: 846 break 847 848 offset, safe = self.BRACKET_OFFSETS[name] 849 bracket.set("offset", offset) 850 bracket.set("safe", safe) 851 expression.replace(expression.expressions[0]) 852 853 return bracket 854 855 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 856 unnest = super()._parse_unnest(with_alias=with_alias) 857 858 if not unnest: 859 return None 860 861 unnest_expr = seq_get(unnest.expressions, 0) 862 if unnest_expr: 863 from sqlglot.optimizer.annotate_types import annotate_types 864 865 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 866 867 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 868 # in contrast to other dialects such as DuckDB which flattens only the array by default 869 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 870 array_elem.is_type(exp.DataType.Type.STRUCT) 871 for array_elem in unnest_expr._type.expressions 872 ): 873 unnest.set("explode_array", True) 874 875 return unnest 876 877 def _parse_make_interval(self) -> exp.MakeInterval: 878 expr = exp.MakeInterval() 879 880 for arg_key in expr.arg_types: 881 value = self._parse_lambda() 882 883 if not value: 884 break 885 886 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 887 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 888 if isinstance(value, exp.Kwarg): 889 arg_key = value.this.name 890 891 expr.set(arg_key, value) 892 893 self._match(TokenType.COMMA) 894 895 return expr 896 897 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 898 expr = self.expression( 899 exp.FeaturesAtTime, 900 this=(self._match(TokenType.TABLE) and self._parse_table()) 901 or self._parse_select(nested=True), 902 ) 903 904 while self._match(TokenType.COMMA): 905 arg = self._parse_lambda() 906 907 # Get the LHS of the Kwarg and set the arg to that value, e.g 908 # "num_rows => 1" sets the expr's `num_rows` arg 909 if arg: 910 expr.set(arg.this.name, arg) 911 912 return expr 913 914 def _parse_export_data(self) -> exp.Export: 915 self._match_text_seq("DATA") 916 917 return self.expression( 918 exp.Export, 919 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 920 options=self._parse_properties(), 921 this=self._match_text_seq("AS") and self._parse_select(), 922 ) 923 924 class Generator(generator.Generator): 925 INTERVAL_ALLOWS_PLURAL_FORM = False 926 
JOIN_HINTS = False 927 QUERY_HINTS = False 928 TABLE_HINTS = False 929 LIMIT_FETCH = "LIMIT" 930 RENAME_TABLE_WITH_DB = False 931 NVL2_SUPPORTED = False 932 UNNEST_WITH_ORDINALITY = False 933 COLLATE_IS_FUNC = True 934 LIMIT_ONLY_LITERALS = True 935 SUPPORTS_TABLE_ALIAS_COLUMNS = False 936 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 937 JSON_KEY_VALUE_PAIR_SEP = "," 938 NULL_ORDERING_SUPPORTED = False 939 IGNORE_NULLS_IN_FUNC = True 940 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 941 CAN_IMPLEMENT_ARRAY_ANY = True 942 SUPPORTS_TO_NUMBER = False 943 NAMED_PLACEHOLDER_TOKEN = "@" 944 HEX_FUNC = "TO_HEX" 945 WITH_PROPERTIES_PREFIX = "OPTIONS" 946 SUPPORTS_EXPLODING_PROJECTIONS = False 947 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 948 SUPPORTS_UNIX_SECONDS = True 949 950 TRANSFORMS = { 951 **generator.Generator.TRANSFORMS, 952 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 953 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 954 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 955 exp.Array: inline_array_unless_query, 956 exp.ArrayContains: _array_contains_sql, 957 exp.ArrayFilter: filter_array_using_unnest, 958 exp.ArrayRemove: filter_array_using_unnest, 959 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 960 exp.CollateProperty: lambda self, e: ( 961 f"DEFAULT COLLATE {self.sql(e, 'this')}" 962 if e.args.get("default") 963 else f"COLLATE {self.sql(e, 'this')}" 964 ), 965 exp.Commit: lambda *_: "COMMIT TRANSACTION", 966 exp.CountIf: rename_func("COUNTIF"), 967 exp.Create: _create_sql, 968 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 969 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 970 exp.DateDiff: lambda self, e: self.func( 971 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 972 ), 973 exp.DateFromParts: rename_func("DATE"), 974 exp.DateStrToDate: datestrtodate_sql, 975 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 976 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 977 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 978 exp.FromTimeZone: lambda self, e: self.func( 979 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 980 ), 981 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 982 exp.GroupConcat: lambda self, e: groupconcat_sql( 983 self, e, func_name="STRING_AGG", within_group=False 984 ), 985 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 986 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 987 exp.If: if_sql(false_value="NULL"), 988 exp.ILike: no_ilike_sql, 989 exp.IntDiv: rename_func("DIV"), 990 exp.Int64: rename_func("INT64"), 991 exp.JSONExtract: _json_extract_sql, 992 exp.JSONExtractArray: _json_extract_sql, 993 exp.JSONExtractScalar: _json_extract_sql, 994 exp.JSONFormat: rename_func("TO_JSON_STRING"), 995 exp.Levenshtein: _levenshtein_sql, 996 exp.Max: max_or_greatest, 997 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 998 exp.MD5Digest: rename_func("MD5"), 999 exp.Min: min_or_least, 1000 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1001 exp.RegexpExtract: lambda self, e: self.func( 1002 "REGEXP_EXTRACT", 1003 e.this, 1004 e.expression, 1005 e.args.get("position"), 1006 e.args.get("occurrence"), 1007 ), 1008 exp.RegexpExtractAll: lambda self, e: self.func( 1009 "REGEXP_EXTRACT_ALL", e.this, e.expression 1010 ), 1011 exp.RegexpReplace: regexp_replace_sql, 1012 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1013 exp.ReturnsProperty: 
_returnsproperty_sql, 1014 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1015 exp.Select: transforms.preprocess( 1016 [ 1017 transforms.explode_projection_to_unnest(), 1018 transforms.unqualify_unnest, 1019 transforms.eliminate_distinct_on, 1020 _alias_ordered_group, 1021 transforms.eliminate_semi_and_anti_joins, 1022 ] 1023 ), 1024 exp.SHA: rename_func("SHA1"), 1025 exp.SHA2: sha256_sql, 1026 exp.Space: space_sql, 1027 exp.StabilityProperty: lambda self, e: ( 1028 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1029 ), 1030 exp.String: rename_func("STRING"), 1031 exp.StrPosition: lambda self, e: ( 1032 strposition_sql( 1033 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1034 ) 1035 ), 1036 exp.StrToDate: _str_to_datetime_sql, 1037 exp.StrToTime: _str_to_datetime_sql, 1038 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1039 exp.TimeFromParts: rename_func("TIME"), 1040 exp.TimestampFromParts: rename_func("DATETIME"), 1041 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1042 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1043 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1044 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1045 exp.TimeStrToTime: timestrtotime_sql, 1046 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1047 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1048 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1049 exp.TsOrDsToTime: rename_func("TIME"), 1050 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1051 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1052 exp.Unhex: rename_func("FROM_HEX"), 1053 exp.UnixDate: rename_func("UNIX_DATE"), 1054 exp.UnixToTime: _unix_to_time_sql, 1055 exp.Uuid: lambda *_: "GENERATE_UUID()", 1056 exp.Values: _derived_table_values_to_unnest, 1057 exp.VariancePop: rename_func("VAR_POP"), 1058 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1059 } 1060 1061 SUPPORTED_JSON_PATH_PARTS = { 1062 exp.JSONPathKey, 1063 exp.JSONPathRoot, 1064 exp.JSONPathSubscript, 1065 } 1066 1067 TYPE_MAPPING = { 1068 **generator.Generator.TYPE_MAPPING, 1069 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1070 exp.DataType.Type.BIGINT: "INT64", 1071 exp.DataType.Type.BINARY: "BYTES", 1072 exp.DataType.Type.BLOB: "BYTES", 1073 exp.DataType.Type.BOOLEAN: "BOOL", 1074 exp.DataType.Type.CHAR: "STRING", 1075 exp.DataType.Type.DECIMAL: "NUMERIC", 1076 exp.DataType.Type.DOUBLE: "FLOAT64", 1077 exp.DataType.Type.FLOAT: "FLOAT64", 1078 exp.DataType.Type.INT: "INT64", 1079 exp.DataType.Type.NCHAR: "STRING", 1080 exp.DataType.Type.NVARCHAR: "STRING", 1081 exp.DataType.Type.SMALLINT: "INT64", 1082 exp.DataType.Type.TEXT: "STRING", 1083 exp.DataType.Type.TIMESTAMP: "DATETIME", 1084 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1085 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1086 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1087 exp.DataType.Type.TINYINT: "INT64", 1088 exp.DataType.Type.ROWVERSION: "BYTES", 1089 exp.DataType.Type.UUID: "STRING", 1090 exp.DataType.Type.VARBINARY: "BYTES", 1091 exp.DataType.Type.VARCHAR: "STRING", 1092 exp.DataType.Type.VARIANT: "ANY TYPE", 1093 } 1094 1095 PROPERTIES_LOCATION = { 1096 **generator.Generator.PROPERTIES_LOCATION, 1097 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1098 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1099 } 1100 1101 # WINDOW comes after QUALIFY 1102 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1103 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1104 "qualify": 
generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1105 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1106 } 1107 1108 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1109 RESERVED_KEYWORDS = { 1110 "all", 1111 "and", 1112 "any", 1113 "array", 1114 "as", 1115 "asc", 1116 "assert_rows_modified", 1117 "at", 1118 "between", 1119 "by", 1120 "case", 1121 "cast", 1122 "collate", 1123 "contains", 1124 "create", 1125 "cross", 1126 "cube", 1127 "current", 1128 "default", 1129 "define", 1130 "desc", 1131 "distinct", 1132 "else", 1133 "end", 1134 "enum", 1135 "escape", 1136 "except", 1137 "exclude", 1138 "exists", 1139 "extract", 1140 "false", 1141 "fetch", 1142 "following", 1143 "for", 1144 "from", 1145 "full", 1146 "group", 1147 "grouping", 1148 "groups", 1149 "hash", 1150 "having", 1151 "if", 1152 "ignore", 1153 "in", 1154 "inner", 1155 "intersect", 1156 "interval", 1157 "into", 1158 "is", 1159 "join", 1160 "lateral", 1161 "left", 1162 "like", 1163 "limit", 1164 "lookup", 1165 "merge", 1166 "natural", 1167 "new", 1168 "no", 1169 "not", 1170 "null", 1171 "nulls", 1172 "of", 1173 "on", 1174 "or", 1175 "order", 1176 "outer", 1177 "over", 1178 "partition", 1179 "preceding", 1180 "proto", 1181 "qualify", 1182 "range", 1183 "recursive", 1184 "respect", 1185 "right", 1186 "rollup", 1187 "rows", 1188 "select", 1189 "set", 1190 "some", 1191 "struct", 1192 "tablesample", 1193 "then", 1194 "to", 1195 "treat", 1196 "true", 1197 "unbounded", 1198 "union", 1199 "unnest", 1200 "using", 1201 "when", 1202 "where", 1203 "window", 1204 "with", 1205 "within", 1206 } 1207 1208 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1209 unit = expression.unit 1210 unit_sql = unit.name if unit.is_string else self.sql(unit) 1211 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1212 1213 def mod_sql(self, expression: exp.Mod) -> str: 1214 this = expression.this 1215 expr = expression.expression 1216 return self.func( 1217 "MOD", 1218 this.unnest() if isinstance(this, exp.Paren) else this, 1219 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1220 ) 1221 1222 def column_parts(self, expression: exp.Column) -> str: 1223 if expression.meta.get("quoted_column"): 1224 # If a column reference is of the form `dataset.table`.name, we need 1225 # to preserve the quoted table path, otherwise the reference breaks 1226 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1227 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1228 return f"{table_path}.{self.sql(expression, 'this')}" 1229 1230 return super().column_parts(expression) 1231 1232 def table_parts(self, expression: exp.Table) -> str: 1233 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1234 # we need to make sure the correct quoting is used in each case. 
1235 # 1236 # For example, if there is a CTE x that clashes with a schema name, then the former will 1237 # return the table y in that schema, whereas the latter will return the CTE's y column: 1238 # 1239 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1240 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1241 if expression.meta.get("quoted_table"): 1242 table_parts = ".".join(p.name for p in expression.parts) 1243 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1244 1245 return super().table_parts(expression) 1246 1247 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1248 this = expression.this 1249 if isinstance(this, exp.TsOrDsToDatetime): 1250 func_name = "FORMAT_DATETIME" 1251 elif isinstance(this, exp.TsOrDsToTimestamp): 1252 func_name = "FORMAT_TIMESTAMP" 1253 else: 1254 func_name = "FORMAT_DATE" 1255 1256 time_expr = ( 1257 this 1258 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1259 else expression 1260 ) 1261 return self.func( 1262 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1263 ) 1264 1265 def eq_sql(self, expression: exp.EQ) -> str: 1266 # Operands of = cannot be NULL in BigQuery 1267 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1268 if not isinstance(expression.parent, exp.Update): 1269 return "NULL" 1270 1271 return self.binary(expression, "=") 1272 1273 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1274 parent = expression.parent 1275 1276 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1277 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1278 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1279 return self.func( 1280 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1281 ) 1282 1283 return super().attimezone_sql(expression) 1284 1285 def trycast_sql(self, expression: exp.TryCast) -> str: 1286 return self.cast_sql(expression, safe_prefix="SAFE_") 1287 1288 def bracket_sql(self, expression: exp.Bracket) -> str: 1289 this = expression.this 1290 expressions = expression.expressions 1291 1292 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1293 arg = expressions[0] 1294 if arg.type is None: 1295 from sqlglot.optimizer.annotate_types import annotate_types 1296 1297 arg = annotate_types(arg, dialect=self.dialect) 1298 1299 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1300 # BQ doesn't support bracket syntax with string values for structs 1301 return f"{self.sql(this)}.{arg.name}" 1302 1303 expressions_sql = self.expressions(expression, flat=True) 1304 offset = expression.args.get("offset") 1305 1306 if offset == 0: 1307 expressions_sql = f"OFFSET({expressions_sql})" 1308 elif offset == 1: 1309 expressions_sql = f"ORDINAL({expressions_sql})" 1310 elif offset is not None: 1311 self.unsupported(f"Unsupported array offset: {offset}") 1312 1313 if expression.args.get("safe"): 1314 expressions_sql = f"SAFE_{expressions_sql}" 1315 1316 return f"{self.sql(this)}[{expressions_sql}]" 1317 1318 def in_unnest_op(self, expression: exp.Unnest) -> str: 1319 return self.sql(expression) 1320 1321 def version_sql(self, expression: exp.Version) -> str: 1322 if expression.name == "TIMESTAMP": 1323 expression.set("this", "SYSTEM_TIME") 1324 return super().version_sql(expression) 1325 1326 def contains_sql(self, 
expression: exp.Contains) -> str: 1327 this = expression.this 1328 expr = expression.expression 1329 1330 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1331 this = this.this 1332 expr = expr.this 1333 1334 return self.func("CONTAINS_SUBSTR", this, expr) 1335 1336 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1337 this = expression.this 1338 1339 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1340 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1341 # because they aren't literals and so the above syntax is invalid BigQuery. 1342 if isinstance(this, exp.Array): 1343 elem = seq_get(this.expressions, 0) 1344 if not (elem and elem.find(exp.Query)): 1345 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1346 1347 return super().cast_sql(expression, safe_prefix=safe_prefix)
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
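As a quick, hedged illustration of where this setting matters, the snippet below round-trips a weekly DATE_TRUNC through the BigQuery dialect (the table and column names are made up):

    import sqlglot

    # Parse a BigQuery weekly truncation and print it back; consumers that reason
    # about the week boundary rely on WEEK_OFFSET to know which day the week starts on.
    sql = "SELECT DATE_TRUNC(order_date, WEEK) FROM orders"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])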
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG).
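A rough sketch of what this flag controls: parse a two-argument LOG call and inspect which operand the dialect treated as the base (the literal values are arbitrary):

    from sqlglot import exp, parse_one

    expr = parse_one("SELECT LOG(100, 10)", read="bigquery")
    log = expr.find(exp.Log)
    # Which value ends up as `this` vs `expression` depends on LOG_BASE_FIRST.
    print(log.this, log.expression)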
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which forwards the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which forwards the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
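The sketch below, reusing the example query above, is one way to observe this through the optimizer's qualification step; the exact qualified SQL depends on the sqlglot version:

    from sqlglot import parse_one
    from sqlglot.optimizer.qualify import qualify

    sql = (
        "WITH data AS (SELECT 1 AS id, 2 AS my_id) "
        "SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1"
    )
    # Per the note above, BigQuery is expected to resolve the GROUP BY/HAVING
    # references to the underlying column rather than the projection alias.
    print(qualify(parse_one(sql, read="bigquery"), dialect="bigquery").sql("bigquery"))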
Whether the name of the function should be preserved inside the node's metadata. This can be useful for round-tripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
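A small sketch of that behavior: both spellings below map to the same node (see the Parser FUNCTIONS mapping in this module), and the original name is recorded in the node's metadata so it can be reproduced on output:

    from sqlglot import exp, parse_one

    for sql in ("SELECT JSON_VALUE(j, '$.x')", "SELECT JSON_EXTRACT_SCALAR(j, '$.x')"):
        node = parse_one(sql, read="bigquery").find(exp.JSONExtractScalar)
        # The node type is shared; the metadata keeps track of the source spelling.
        print(type(node).__name__, node.meta)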
Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
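As a rough example of TIME_MAPPING at work, a BigQuery format string can be carried over to another dialect's format vocabulary (the target dialect and exact output are illustrative only):

    import sqlglot

    # PARSE_DATE's '%Y%m%d' tokens are looked up via the dialect's time mappings
    # and re-emitted using the target dialect's own format tokens.
    print(sqlglot.transpile("SELECT PARSE_DATE('%Y%m%d', '20240101')", read="bigquery", write="duckdb")[0])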
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
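A small sketch of the consequence for set operations: when writing BigQuery, which requires the quantifier to be spelled out, sqlglot is expected to add it explicitly (exact output may vary by version):

    import sqlglot

    # A bare UNION read from another dialect should gain an explicit quantifier here.
    print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="duckdb", write="bigquery")[0])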
464 def normalize_identifier(self, expression: E) -> E: 465 if ( 466 isinstance(expression, exp.Identifier) 467 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 468 ): 469 parent = expression.parent 470 while isinstance(parent, exp.Dot): 471 parent = parent.parent 472 473 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 474 # by default. The following check uses a heuristic to detect tables based on whether 475 # they are qualified. This should generally be correct, because tables in BigQuery 476 # must be qualified with at least a dataset, unless @@dataset_id is set. 477 case_sensitive = ( 478 isinstance(parent, exp.UserDefinedFunction) 479 or ( 480 isinstance(parent, exp.Table) 481 and parent.db 482 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 483 ) 484 or expression.meta.get("is_table") 485 ) 486 if not case_sensitive: 487 expression.set("this", expression.this.lower()) 488 489 return t.cast(E, expression) 490 491 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system, for example they may always be case-sensitive in Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
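A minimal sketch against this dialect, which treats unquoted, unqualified names as case-insensitive (the identifier below is arbitrary):

    from sqlglot import exp
    from sqlglot.dialects.bigquery import BigQuery

    dialect = BigQuery()
    ident = exp.to_identifier("FoO")  # unquoted and not attached to a table
    # Under the case-insensitive strategy this is expected to be lowercased;
    # quoted identifiers and qualified table names are left untouched.
    print(dialect.normalize_identifier(ident).name)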
Mapping of an escaped sequence (such as the two-character sequence \n) to its unescaped version (the corresponding literal character).
493 class Tokenizer(tokens.Tokenizer): 494 QUOTES = ["'", '"', '"""', "'''"] 495 COMMENTS = ["--", "#", ("/*", "*/")] 496 IDENTIFIERS = ["`"] 497 STRING_ESCAPES = ["\\"] 498 499 HEX_STRINGS = [("0x", ""), ("0X", "")] 500 501 BYTE_STRINGS = [ 502 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 503 ] 504 505 RAW_STRINGS = [ 506 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 507 ] 508 509 NESTED_COMMENTS = False 510 511 KEYWORDS = { 512 **tokens.Tokenizer.KEYWORDS, 513 "ANY TYPE": TokenType.VARIANT, 514 "BEGIN": TokenType.COMMAND, 515 "BEGIN TRANSACTION": TokenType.BEGIN, 516 "BYTEINT": TokenType.INT, 517 "BYTES": TokenType.BINARY, 518 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 519 "DATETIME": TokenType.TIMESTAMP, 520 "DECLARE": TokenType.COMMAND, 521 "ELSEIF": TokenType.COMMAND, 522 "EXCEPTION": TokenType.COMMAND, 523 "EXPORT": TokenType.EXPORT, 524 "FLOAT64": TokenType.DOUBLE, 525 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 526 "MODEL": TokenType.MODEL, 527 "NOT DETERMINISTIC": TokenType.VOLATILE, 528 "RECORD": TokenType.STRUCT, 529 "TIMESTAMP": TokenType.TIMESTAMPTZ, 530 } 531 KEYWORDS.pop("DIV") 532 KEYWORDS.pop("VALUES") 533 KEYWORDS.pop("/*+")
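A short usage sketch of the tokenizer settings above (byte/raw string prefixes, backtick identifiers); the query text is made up:

    from sqlglot.dialects.bigquery import BigQuery

    sql = "SELECT b'abc', r'\\d+' FROM `my-project.dataset.tbl`"
    for token in BigQuery().tokenize(sql)[:6]:
        # Each Token carries its type and the raw text it was built from.
        print(token.token_type, token.text)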
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
535 class Parser(parser.Parser): 536 PREFIXED_PIVOT_COLUMNS = True 537 LOG_DEFAULTS_TO_LN = True 538 SUPPORTS_IMPLICIT_UNNEST = True 539 JOINS_HAVE_EQUAL_PRECEDENCE = True 540 541 # BigQuery does not allow ASC/DESC to be used as an identifier 542 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 543 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 544 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 545 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 546 TokenType.ASC, 547 TokenType.DESC, 548 } 549 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 550 551 FUNCTIONS = { 552 **parser.Parser.FUNCTIONS, 553 "CONTAINS_SUBSTR": _build_contains_substring, 554 "DATE": _build_date, 555 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 556 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 557 "DATE_TRUNC": lambda args: exp.DateTrunc( 558 unit=seq_get(args, 1), 559 this=seq_get(args, 0), 560 zone=seq_get(args, 2), 561 ), 562 "DATETIME": _build_datetime, 563 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 564 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 565 "DIV": binary_from_function(exp.IntDiv), 566 "EDIT_DISTANCE": _build_levenshtein, 567 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 568 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 569 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 570 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 571 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 572 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 573 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 574 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 575 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 576 "MD5": exp.MD5Digest.from_arg_list, 577 "TO_HEX": _build_to_hex, 578 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 579 [seq_get(args, 1), seq_get(args, 0)] 580 ), 581 "PARSE_TIMESTAMP": _build_parse_timestamp, 582 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 583 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 584 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 585 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 586 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 587 ), 588 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 589 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 590 "SPLIT": lambda args: exp.Split( 591 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 592 this=seq_get(args, 0), 593 expression=seq_get(args, 1) or exp.Literal.string(","), 594 ), 595 "STRPOS": exp.StrPosition.from_arg_list, 596 "TIME": _build_time, 597 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 598 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 599 "TIMESTAMP": _build_timestamp, 600 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 601 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 602 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 603 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 604 ), 605 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 606 this=seq_get(args, 0), 
scale=exp.UnixToTime.MILLIS 607 ), 608 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 609 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 610 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 611 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 612 } 613 614 FUNCTION_PARSERS = { 615 **parser.Parser.FUNCTION_PARSERS, 616 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 617 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 618 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 619 } 620 FUNCTION_PARSERS.pop("TRIM") 621 622 NO_PAREN_FUNCTIONS = { 623 **parser.Parser.NO_PAREN_FUNCTIONS, 624 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 625 } 626 627 NESTED_TYPE_TOKENS = { 628 *parser.Parser.NESTED_TYPE_TOKENS, 629 TokenType.TABLE, 630 } 631 632 PROPERTY_PARSERS = { 633 **parser.Parser.PROPERTY_PARSERS, 634 "NOT DETERMINISTIC": lambda self: self.expression( 635 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 636 ), 637 "OPTIONS": lambda self: self._parse_with_property(), 638 } 639 640 CONSTRAINT_PARSERS = { 641 **parser.Parser.CONSTRAINT_PARSERS, 642 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 643 } 644 645 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 646 RANGE_PARSERS.pop(TokenType.OVERLAPS) 647 648 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 649 650 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 651 652 STATEMENT_PARSERS = { 653 **parser.Parser.STATEMENT_PARSERS, 654 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 655 TokenType.END: lambda self: self._parse_as_command(self._prev), 656 TokenType.FOR: lambda self: self._parse_for_in(), 657 TokenType.EXPORT: lambda self: self._parse_export_data(), 658 } 659 660 BRACKET_OFFSETS = { 661 "OFFSET": (0, False), 662 "ORDINAL": (1, False), 663 "SAFE_OFFSET": (0, True), 664 "SAFE_ORDINAL": (1, True), 665 } 666 667 def _parse_for_in(self) -> exp.ForIn: 668 this = self._parse_range() 669 self._match_text_seq("DO") 670 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 671 672 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 673 this = super()._parse_table_part(schema=schema) or self._parse_number() 674 675 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 676 if isinstance(this, exp.Identifier): 677 table_name = this.name 678 while self._match(TokenType.DASH, advance=False) and self._next: 679 start = self._curr 680 while self._is_connected() and not self._match_set( 681 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 682 ): 683 self._advance() 684 685 if start == self._curr: 686 break 687 688 table_name += self._find_sql(start, self._prev) 689 690 this = exp.Identifier( 691 this=table_name, quoted=this.args.get("quoted") 692 ).update_positions(this) 693 elif isinstance(this, exp.Literal): 694 table_name = this.name 695 696 if self._is_connected() and self._parse_var(any_token=True): 697 table_name += self._prev.text 698 699 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 700 701 return this 702 703 def _parse_table_parts( 704 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 705 ) -> exp.Table: 706 table = super()._parse_table_parts( 707 schema=schema, is_db_reference=is_db_reference, wildcard=True 708 ) 709 710 # proj-1.db.tbl -- `1.` is tokenized as a float so we 
need to unravel it here 711 if not table.catalog: 712 if table.db: 713 previous_db = table.args["db"] 714 parts = table.db.split(".") 715 if len(parts) == 2 and not table.args["db"].quoted: 716 table.set( 717 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 718 ) 719 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 720 else: 721 previous_this = table.this 722 parts = table.name.split(".") 723 if len(parts) == 2 and not table.this.quoted: 724 table.set( 725 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 726 ) 727 table.set( 728 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 729 ) 730 731 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 732 alias = table.this 733 catalog, db, this, *rest = ( 734 exp.to_identifier(p, quoted=True) 735 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 736 ) 737 738 for part in (catalog, db, this): 739 if part: 740 part.update_positions(table.this) 741 742 if rest and this: 743 this = exp.Dot.build([this, *rest]) # type: ignore 744 745 table = exp.Table( 746 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 747 ) 748 table.meta["quoted_table"] = True 749 else: 750 alias = None 751 752 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 753 # dataset, so if the project identifier is omitted we need to fix the ast so that 754 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 755 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 756 # views, because it would seem like the "catalog" part is set, when it'd actually 757 # be the region/dataset. Merging the two identifiers into a single one is done to 758 # avoid producing a 4-part Table reference, which would cause issues in the schema 759 # module, when there are 3-part table names mixed with information schema views. 760 # 761 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 762 table_parts = table.parts 763 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 764 # We need to alias the table here to avoid breaking existing qualified columns. 765 # This is expected to be safe, because if there's an actual alias coming up in 766 # the token stream, it will overwrite this one. If there isn't one, we are only 767 # exposing the name that can be used to reference the view explicitly (a no-op). 768 exp.alias_( 769 table, 770 t.cast(exp.Identifier, alias or table_parts[-1]), 771 table=True, 772 copy=False, 773 ) 774 775 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 776 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 777 line=table_parts[-2].meta.get("line"), 778 col=table_parts[-1].meta.get("col"), 779 start=table_parts[-2].meta.get("start"), 780 end=table_parts[-1].meta.get("end"), 781 ) 782 table.set("this", new_this) 783 table.set("db", seq_get(table_parts, -3)) 784 table.set("catalog", seq_get(table_parts, -4)) 785 786 return table 787 788 def _parse_column(self) -> t.Optional[exp.Expression]: 789 column = super()._parse_column() 790 if isinstance(column, exp.Column): 791 parts = column.parts 792 if any("." 
in p.name for p in parts): 793 catalog, db, table, this, *rest = ( 794 exp.to_identifier(p, quoted=True) 795 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 796 ) 797 798 if rest and this: 799 this = exp.Dot.build([this, *rest]) # type: ignore 800 801 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 802 column.meta["quoted_column"] = True 803 804 return column 805 806 @t.overload 807 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 808 809 @t.overload 810 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 811 812 def _parse_json_object(self, agg=False): 813 json_object = super()._parse_json_object() 814 array_kv_pair = seq_get(json_object.expressions, 0) 815 816 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 817 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 818 if ( 819 array_kv_pair 820 and isinstance(array_kv_pair.this, exp.Array) 821 and isinstance(array_kv_pair.expression, exp.Array) 822 ): 823 keys = array_kv_pair.this.expressions 824 values = array_kv_pair.expression.expressions 825 826 json_object.set( 827 "expressions", 828 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 829 ) 830 831 return json_object 832 833 def _parse_bracket( 834 self, this: t.Optional[exp.Expression] = None 835 ) -> t.Optional[exp.Expression]: 836 bracket = super()._parse_bracket(this) 837 838 if this is bracket: 839 return bracket 840 841 if isinstance(bracket, exp.Bracket): 842 for expression in bracket.expressions: 843 name = expression.name.upper() 844 845 if name not in self.BRACKET_OFFSETS: 846 break 847 848 offset, safe = self.BRACKET_OFFSETS[name] 849 bracket.set("offset", offset) 850 bracket.set("safe", safe) 851 expression.replace(expression.expressions[0]) 852 853 return bracket 854 855 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 856 unnest = super()._parse_unnest(with_alias=with_alias) 857 858 if not unnest: 859 return None 860 861 unnest_expr = seq_get(unnest.expressions, 0) 862 if unnest_expr: 863 from sqlglot.optimizer.annotate_types import annotate_types 864 865 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 866 867 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 868 # in contrast to other dialects such as DuckDB which flattens only the array by default 869 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 870 array_elem.is_type(exp.DataType.Type.STRUCT) 871 for array_elem in unnest_expr._type.expressions 872 ): 873 unnest.set("explode_array", True) 874 875 return unnest 876 877 def _parse_make_interval(self) -> exp.MakeInterval: 878 expr = exp.MakeInterval() 879 880 for arg_key in expr.arg_types: 881 value = self._parse_lambda() 882 883 if not value: 884 break 885 886 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 887 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 888 if isinstance(value, exp.Kwarg): 889 arg_key = value.this.name 890 891 expr.set(arg_key, value) 892 893 self._match(TokenType.COMMA) 894 895 return expr 896 897 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 898 expr = self.expression( 899 exp.FeaturesAtTime, 900 this=(self._match(TokenType.TABLE) and self._parse_table()) 901 or self._parse_select(nested=True), 902 ) 903 904 while self._match(TokenType.COMMA): 905 arg = self._parse_lambda() 906 907 # Get the LHS of 
the Kwarg and set the arg to that value, e.g 908 # "num_rows => 1" sets the expr's `num_rows` arg 909 if arg: 910 expr.set(arg.this.name, arg) 911 912 return expr 913 914 def _parse_export_data(self) -> exp.Export: 915 self._match_text_seq("DATA") 916 917 return self.expression( 918 exp.Export, 919 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 920 options=self._parse_properties(), 921 this=self._match_text_seq("AS") and self._parse_select(), 922 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
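A brief sketch of driving this parser through the public API, with errors raised eagerly (the query is illustrative):

    import sqlglot
    from sqlglot.errors import ErrorLevel

    # error_level and the other Parser arguments can be forwarded through parse_one.
    expr = sqlglot.parse_one(
        "SELECT TIMESTAMP_MILLIS(1700000000000)",
        read="bigquery",
        error_level=ErrorLevel.RAISE,
    )
    print(type(expr).__name__)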
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- errors
- sql
924 class Generator(generator.Generator): 925 INTERVAL_ALLOWS_PLURAL_FORM = False 926 JOIN_HINTS = False 927 QUERY_HINTS = False 928 TABLE_HINTS = False 929 LIMIT_FETCH = "LIMIT" 930 RENAME_TABLE_WITH_DB = False 931 NVL2_SUPPORTED = False 932 UNNEST_WITH_ORDINALITY = False 933 COLLATE_IS_FUNC = True 934 LIMIT_ONLY_LITERALS = True 935 SUPPORTS_TABLE_ALIAS_COLUMNS = False 936 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 937 JSON_KEY_VALUE_PAIR_SEP = "," 938 NULL_ORDERING_SUPPORTED = False 939 IGNORE_NULLS_IN_FUNC = True 940 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 941 CAN_IMPLEMENT_ARRAY_ANY = True 942 SUPPORTS_TO_NUMBER = False 943 NAMED_PLACEHOLDER_TOKEN = "@" 944 HEX_FUNC = "TO_HEX" 945 WITH_PROPERTIES_PREFIX = "OPTIONS" 946 SUPPORTS_EXPLODING_PROJECTIONS = False 947 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 948 SUPPORTS_UNIX_SECONDS = True 949 950 TRANSFORMS = { 951 **generator.Generator.TRANSFORMS, 952 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 953 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 954 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 955 exp.Array: inline_array_unless_query, 956 exp.ArrayContains: _array_contains_sql, 957 exp.ArrayFilter: filter_array_using_unnest, 958 exp.ArrayRemove: filter_array_using_unnest, 959 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 960 exp.CollateProperty: lambda self, e: ( 961 f"DEFAULT COLLATE {self.sql(e, 'this')}" 962 if e.args.get("default") 963 else f"COLLATE {self.sql(e, 'this')}" 964 ), 965 exp.Commit: lambda *_: "COMMIT TRANSACTION", 966 exp.CountIf: rename_func("COUNTIF"), 967 exp.Create: _create_sql, 968 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 969 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 970 exp.DateDiff: lambda self, e: self.func( 971 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 972 ), 973 exp.DateFromParts: rename_func("DATE"), 974 exp.DateStrToDate: datestrtodate_sql, 975 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 976 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 977 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 978 exp.FromTimeZone: lambda self, e: self.func( 979 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 980 ), 981 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 982 exp.GroupConcat: lambda self, e: groupconcat_sql( 983 self, e, func_name="STRING_AGG", within_group=False 984 ), 985 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 986 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 987 exp.If: if_sql(false_value="NULL"), 988 exp.ILike: no_ilike_sql, 989 exp.IntDiv: rename_func("DIV"), 990 exp.Int64: rename_func("INT64"), 991 exp.JSONExtract: _json_extract_sql, 992 exp.JSONExtractArray: _json_extract_sql, 993 exp.JSONExtractScalar: _json_extract_sql, 994 exp.JSONFormat: rename_func("TO_JSON_STRING"), 995 exp.Levenshtein: _levenshtein_sql, 996 exp.Max: max_or_greatest, 997 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 998 exp.MD5Digest: rename_func("MD5"), 999 exp.Min: min_or_least, 1000 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1001 exp.RegexpExtract: lambda self, e: self.func( 1002 "REGEXP_EXTRACT", 1003 e.this, 1004 e.expression, 1005 e.args.get("position"), 1006 e.args.get("occurrence"), 1007 ), 1008 exp.RegexpExtractAll: lambda self, e: self.func( 1009 "REGEXP_EXTRACT_ALL", e.this, e.expression 1010 ), 1011 exp.RegexpReplace: 
regexp_replace_sql, 1012 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1013 exp.ReturnsProperty: _returnsproperty_sql, 1014 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1015 exp.Select: transforms.preprocess( 1016 [ 1017 transforms.explode_projection_to_unnest(), 1018 transforms.unqualify_unnest, 1019 transforms.eliminate_distinct_on, 1020 _alias_ordered_group, 1021 transforms.eliminate_semi_and_anti_joins, 1022 ] 1023 ), 1024 exp.SHA: rename_func("SHA1"), 1025 exp.SHA2: sha256_sql, 1026 exp.Space: space_sql, 1027 exp.StabilityProperty: lambda self, e: ( 1028 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1029 ), 1030 exp.String: rename_func("STRING"), 1031 exp.StrPosition: lambda self, e: ( 1032 strposition_sql( 1033 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1034 ) 1035 ), 1036 exp.StrToDate: _str_to_datetime_sql, 1037 exp.StrToTime: _str_to_datetime_sql, 1038 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1039 exp.TimeFromParts: rename_func("TIME"), 1040 exp.TimestampFromParts: rename_func("DATETIME"), 1041 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1042 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1043 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1044 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1045 exp.TimeStrToTime: timestrtotime_sql, 1046 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1047 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1048 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1049 exp.TsOrDsToTime: rename_func("TIME"), 1050 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1051 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1052 exp.Unhex: rename_func("FROM_HEX"), 1053 exp.UnixDate: rename_func("UNIX_DATE"), 1054 exp.UnixToTime: _unix_to_time_sql, 1055 exp.Uuid: lambda *_: "GENERATE_UUID()", 1056 exp.Values: _derived_table_values_to_unnest, 1057 exp.VariancePop: rename_func("VAR_POP"), 1058 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1059 } 1060 1061 SUPPORTED_JSON_PATH_PARTS = { 1062 exp.JSONPathKey, 1063 exp.JSONPathRoot, 1064 exp.JSONPathSubscript, 1065 } 1066 1067 TYPE_MAPPING = { 1068 **generator.Generator.TYPE_MAPPING, 1069 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1070 exp.DataType.Type.BIGINT: "INT64", 1071 exp.DataType.Type.BINARY: "BYTES", 1072 exp.DataType.Type.BLOB: "BYTES", 1073 exp.DataType.Type.BOOLEAN: "BOOL", 1074 exp.DataType.Type.CHAR: "STRING", 1075 exp.DataType.Type.DECIMAL: "NUMERIC", 1076 exp.DataType.Type.DOUBLE: "FLOAT64", 1077 exp.DataType.Type.FLOAT: "FLOAT64", 1078 exp.DataType.Type.INT: "INT64", 1079 exp.DataType.Type.NCHAR: "STRING", 1080 exp.DataType.Type.NVARCHAR: "STRING", 1081 exp.DataType.Type.SMALLINT: "INT64", 1082 exp.DataType.Type.TEXT: "STRING", 1083 exp.DataType.Type.TIMESTAMP: "DATETIME", 1084 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1085 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1086 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1087 exp.DataType.Type.TINYINT: "INT64", 1088 exp.DataType.Type.ROWVERSION: "BYTES", 1089 exp.DataType.Type.UUID: "STRING", 1090 exp.DataType.Type.VARBINARY: "BYTES", 1091 exp.DataType.Type.VARCHAR: "STRING", 1092 exp.DataType.Type.VARIANT: "ANY TYPE", 1093 } 1094 1095 PROPERTIES_LOCATION = { 1096 **generator.Generator.PROPERTIES_LOCATION, 1097 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1098 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1099 } 1100 1101 # WINDOW comes after QUALIFY 1102 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1103 
AFTER_HAVING_MODIFIER_TRANSFORMS = { 1104 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1105 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1106 } 1107 1108 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1109 RESERVED_KEYWORDS = { 1110 "all", 1111 "and", 1112 "any", 1113 "array", 1114 "as", 1115 "asc", 1116 "assert_rows_modified", 1117 "at", 1118 "between", 1119 "by", 1120 "case", 1121 "cast", 1122 "collate", 1123 "contains", 1124 "create", 1125 "cross", 1126 "cube", 1127 "current", 1128 "default", 1129 "define", 1130 "desc", 1131 "distinct", 1132 "else", 1133 "end", 1134 "enum", 1135 "escape", 1136 "except", 1137 "exclude", 1138 "exists", 1139 "extract", 1140 "false", 1141 "fetch", 1142 "following", 1143 "for", 1144 "from", 1145 "full", 1146 "group", 1147 "grouping", 1148 "groups", 1149 "hash", 1150 "having", 1151 "if", 1152 "ignore", 1153 "in", 1154 "inner", 1155 "intersect", 1156 "interval", 1157 "into", 1158 "is", 1159 "join", 1160 "lateral", 1161 "left", 1162 "like", 1163 "limit", 1164 "lookup", 1165 "merge", 1166 "natural", 1167 "new", 1168 "no", 1169 "not", 1170 "null", 1171 "nulls", 1172 "of", 1173 "on", 1174 "or", 1175 "order", 1176 "outer", 1177 "over", 1178 "partition", 1179 "preceding", 1180 "proto", 1181 "qualify", 1182 "range", 1183 "recursive", 1184 "respect", 1185 "right", 1186 "rollup", 1187 "rows", 1188 "select", 1189 "set", 1190 "some", 1191 "struct", 1192 "tablesample", 1193 "then", 1194 "to", 1195 "treat", 1196 "true", 1197 "unbounded", 1198 "union", 1199 "unnest", 1200 "using", 1201 "when", 1202 "where", 1203 "window", 1204 "with", 1205 "within", 1206 } 1207 1208 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1209 unit = expression.unit 1210 unit_sql = unit.name if unit.is_string else self.sql(unit) 1211 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1212 1213 def mod_sql(self, expression: exp.Mod) -> str: 1214 this = expression.this 1215 expr = expression.expression 1216 return self.func( 1217 "MOD", 1218 this.unnest() if isinstance(this, exp.Paren) else this, 1219 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1220 ) 1221 1222 def column_parts(self, expression: exp.Column) -> str: 1223 if expression.meta.get("quoted_column"): 1224 # If a column reference is of the form `dataset.table`.name, we need 1225 # to preserve the quoted table path, otherwise the reference breaks 1226 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1227 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1228 return f"{table_path}.{self.sql(expression, 'this')}" 1229 1230 return super().column_parts(expression) 1231 1232 def table_parts(self, expression: exp.Table) -> str: 1233 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1234 # we need to make sure the correct quoting is used in each case. 
1235 # 1236 # For example, if there is a CTE x that clashes with a schema name, then the former will 1237 # return the table y in that schema, whereas the latter will return the CTE's y column: 1238 # 1239 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1240 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1241 if expression.meta.get("quoted_table"): 1242 table_parts = ".".join(p.name for p in expression.parts) 1243 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1244 1245 return super().table_parts(expression) 1246 1247 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1248 this = expression.this 1249 if isinstance(this, exp.TsOrDsToDatetime): 1250 func_name = "FORMAT_DATETIME" 1251 elif isinstance(this, exp.TsOrDsToTimestamp): 1252 func_name = "FORMAT_TIMESTAMP" 1253 else: 1254 func_name = "FORMAT_DATE" 1255 1256 time_expr = ( 1257 this 1258 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1259 else expression 1260 ) 1261 return self.func( 1262 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1263 ) 1264 1265 def eq_sql(self, expression: exp.EQ) -> str: 1266 # Operands of = cannot be NULL in BigQuery 1267 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1268 if not isinstance(expression.parent, exp.Update): 1269 return "NULL" 1270 1271 return self.binary(expression, "=") 1272 1273 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1274 parent = expression.parent 1275 1276 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1277 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1278 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1279 return self.func( 1280 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1281 ) 1282 1283 return super().attimezone_sql(expression) 1284 1285 def trycast_sql(self, expression: exp.TryCast) -> str: 1286 return self.cast_sql(expression, safe_prefix="SAFE_") 1287 1288 def bracket_sql(self, expression: exp.Bracket) -> str: 1289 this = expression.this 1290 expressions = expression.expressions 1291 1292 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1293 arg = expressions[0] 1294 if arg.type is None: 1295 from sqlglot.optimizer.annotate_types import annotate_types 1296 1297 arg = annotate_types(arg, dialect=self.dialect) 1298 1299 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1300 # BQ doesn't support bracket syntax with string values for structs 1301 return f"{self.sql(this)}.{arg.name}" 1302 1303 expressions_sql = self.expressions(expression, flat=True) 1304 offset = expression.args.get("offset") 1305 1306 if offset == 0: 1307 expressions_sql = f"OFFSET({expressions_sql})" 1308 elif offset == 1: 1309 expressions_sql = f"ORDINAL({expressions_sql})" 1310 elif offset is not None: 1311 self.unsupported(f"Unsupported array offset: {offset}") 1312 1313 if expression.args.get("safe"): 1314 expressions_sql = f"SAFE_{expressions_sql}" 1315 1316 return f"{self.sql(this)}[{expressions_sql}]" 1317 1318 def in_unnest_op(self, expression: exp.Unnest) -> str: 1319 return self.sql(expression) 1320 1321 def version_sql(self, expression: exp.Version) -> str: 1322 if expression.name == "TIMESTAMP": 1323 expression.set("this", "SYSTEM_TIME") 1324 return super().version_sql(expression) 1325 1326 def contains_sql(self, 
expression: exp.Contains) -> str: 1327 this = expression.this 1328 expr = expression.expression 1329 1330 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1331 this = this.this 1332 expr = expr.this 1333 1334 return self.func("CONTAINS_SUBSTR", this, expr) 1335 1336 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1337 this = expression.this 1338 1339 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1340 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1341 # because they aren't literals and so the above syntax is invalid BigQuery. 1342 if isinstance(this, exp.Array): 1343 elem = seq_get(this.expressions, 0) 1344 if not (elem and elem.find(exp.Query)): 1345 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1346 1347 return super().cast_sql(expression, safe_prefix=safe_prefix)
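The TRANSFORMS and TYPE_MAPPING tables above drive how expressions and data types are rendered when generating BigQuery SQL. A minimal, hedged sketch of that effect via sqlglot.transpile (the input SQL and identifiers are arbitrary; exact output may vary by sqlglot version):

    import sqlglot

    # DuckDB's DOUBLE and VARCHAR types are expected to be rewritten through
    # TYPE_MAPPING (FLOAT64, STRING), and MOD stays a function call per mod_sql.
    sql = "SELECT CAST(a AS DOUBLE), CAST(b AS VARCHAR), MOD(a, 2) FROM t"
    print(sqlglot.transpile(sql, read="duckdb", write="bigquery")[0])
    # Expected along the lines of:
    # SELECT CAST(a AS FLOAT64), CAST(b AS STRING), MOD(a, 2) FROM t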
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are:
  - False (default): Never quote, except in cases where it's mandatory by the dialect.
  - True or 'always': Always quote.
  - 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are:
  - "upper" or True (default): Convert names to uppercase.
  - "lower": Convert names to lowercase.
  - False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3.
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False.
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80.
- comments: Whether to preserve comments in the output SQL code. Default: True.
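For illustration, a minimal sketch of how these options can be passed through sqlglot.transpile when targeting BigQuery (the SQL snippet, identifiers, and option values are arbitrary examples, not part of the dialect itself):

    import sqlglot

    # Generator options such as `pretty` and `identify` are forwarded to the
    # BigQuery generator as keyword arguments.
    sql = "select col_a, col_b from dataset.tbl where col_a > 1"
    print(
        sqlglot.transpile(sql, read="bigquery", write="bigquery", pretty=True, identify=True)[0]
    )
    # Expected to produce a formatted query with quoted identifiers, along the lines of:
    # SELECT
    #   `col_a`,
    #   `col_b`
    # FROM `dataset`.`tbl`
    # WHERE
    #   `col_a` > 1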
def column_parts(self, expression: exp.Column) -> str:
    if expression.meta.get("quoted_column"):
        # If a column reference is of the form `dataset.table`.name, we need
        # to preserve the quoted table path, otherwise the reference breaks
        table_parts = ".".join(p.name for p in expression.parts[:-1])
        table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
        return f"{table_path}.{self.sql(expression, 'this')}"

    return super().column_parts(expression)
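A hedged usage sketch (the table and column names are made up): when the table path of a column reference is written as a single backticked identifier, the BigQuery dialect is expected to keep that quoting intact on a roundtrip rather than splitting it into separate parts.

    import sqlglot

    # The backticked `dataset.table` path before `.col` should survive a
    # BigQuery -> BigQuery roundtrip; exact output may vary by sqlglot version.
    sql = "SELECT `dataset.table`.col FROM `dataset.table`"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])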
def table_parts(self, expression: exp.Table) -> str:
    # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
    # we need to make sure the correct quoting is used in each case.
    #
    # For example, if there is a CTE x that clashes with a schema name, then the former will
    # return the table y in that schema, whereas the latter will return the CTE's y column:
    #
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
    if expression.meta.get("quoted_table"):
        table_parts = ".".join(p.name for p in expression.parts)
        return self.sql(exp.Identifier(this=table_parts, quoted=True))

    return super().table_parts(expression)
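The examples from the comment above can be checked directly; a minimal sketch (assuming both statements parse under the BigQuery dialect), where the single-identifier form should stay as one backticked path and the two-identifier form should stay split:

    import sqlglot

    # `x.y` (one quoted identifier) and `x`.`y` (two identifiers) are distinct
    # data sources in BigQuery, so the roundtrip should preserve each form as written.
    for sql in (
        "WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`",
        "WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y`",
    ):
        print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])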
def timetostr_sql(self, expression: exp.TimeToStr) -> str:
    this = expression.this
    if isinstance(this, exp.TsOrDsToDatetime):
        func_name = "FORMAT_DATETIME"
    elif isinstance(this, exp.TsOrDsToTimestamp):
        func_name = "FORMAT_TIMESTAMP"
    else:
        func_name = "FORMAT_DATE"

    time_expr = (
        this
        if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
        else expression
    )
    return self.func(
        func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
    )
def eq_sql(self, expression: exp.EQ) -> str:
    # Operands of = cannot be NULL in BigQuery
    if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
        if not isinstance(expression.parent, exp.Update):
            return "NULL"

    return self.binary(expression, "=")
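A small sketch of the behavior above (the column name is arbitrary): a comparison against a NULL literal is expected to be folded to NULL when generating BigQuery, except inside an UPDATE's SET clause.

    import sqlglot

    # `col = NULL` can never be TRUE in BigQuery, so the generator collapses it.
    print(sqlglot.transpile("SELECT col = NULL", write="bigquery")[0])
    # Expected: SELECT NULL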
def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
    parent = expression.parent

    # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
    # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
    if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
        return self.func(
            "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
        )

    return super().attimezone_sql(expression)
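A hedged sketch of the conversion above (column and zone names are placeholders): outside of the CAST ... FORMAT ... AT TIME ZONE form, an AT TIME ZONE expression is expected to be rewritten into nested DATETIME/TIMESTAMP calls for BigQuery.

    import sqlglot

    # AT TIME ZONE has no direct BigQuery equivalent outside CAST ... FORMAT,
    # so it is rendered as TIMESTAMP(DATETIME(<expr>, <zone>)).
    sql = "SELECT ts AT TIME ZONE 'America/New_York' FROM t"
    print(sqlglot.transpile(sql, read="postgres", write="bigquery")[0])
    # Expected along the lines of: SELECT TIMESTAMP(DATETIME(ts, 'America/New_York')) FROM t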
def bracket_sql(self, expression: exp.Bracket) -> str:
    this = expression.this
    expressions = expression.expressions

    if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
        arg = expressions[0]
        if arg.type is None:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
            # BQ doesn't support bracket syntax with string values for structs
            return f"{self.sql(this)}.{arg.name}"

    expressions_sql = self.expressions(expression, flat=True)
    offset = expression.args.get("offset")

    if offset == 0:
        expressions_sql = f"OFFSET({expressions_sql})"
    elif offset == 1:
        expressions_sql = f"ORDINAL({expressions_sql})"
    elif offset is not None:
        self.unsupported(f"Unsupported array offset: {offset}")

    if expression.args.get("safe"):
        expressions_sql = f"SAFE_{expressions_sql}"

    return f"{self.sql(this)}[{expressions_sql}]"
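A small roundtrip sketch (array and table names are arbitrary): BigQuery's OFFSET/ORDINAL/SAFE_OFFSET subscript forms map onto the offset and safe args handled above, so they are expected to roundtrip unchanged.

    import sqlglot

    # OFFSET (0-based), ORDINAL (1-based) and the SAFE_ variants should be preserved.
    for sql in (
        "SELECT arr[OFFSET(0)] FROM t",
        "SELECT arr[ORDINAL(1)] FROM t",
        "SELECT arr[SAFE_OFFSET(2)] FROM t",
    ):
        print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])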
def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
    this = expression.this

    # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
    # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
    # because they aren't literals and so the above syntax is invalid BigQuery.
    if isinstance(this, exp.Array):
        elem = seq_get(this.expressions, 0)
        if not (elem and elem.find(exp.Query)):
            return f"{self.sql(expression, 'to')}{self.sql(this)}"

    return super().cast_sql(expression, safe_prefix=safe_prefix)
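The comment above can be exercised directly; a minimal sketch: a typed inline array literal should come back out with the same ARRAY<...> prefix, while ARRAY(SELECT ...) falls through to a regular cast.

    import sqlglot

    # The typed literal keeps its inline annotation on a BigQuery roundtrip.
    print(sqlglot.transpile("SELECT ARRAY<INT64>[1, 2, 3]", read="bigquery", write="bigquery")[0])
    # Expected: SELECT ARRAY<INT64>[1, 2, 3]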
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- featuresattime_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql