sqlglot.dialects.bigquery
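A minimal usage sketch (illustrative, not part of the module source below): the dialect is selected by name through sqlglot's public API when parsing or transpiling BigQuery SQL. The target dialect "duckdb" is an arbitrary choice for the example, and the exact output depends on the sqlglot version.

import sqlglot

# Parse BigQuery SQL into a syntax tree using this dialect
ast = sqlglot.parse_one("SELECT TIMESTAMP_MILLIS(1640995200000) AS ts", read="bigquery")

# Transpile a BigQuery expression to another dialect
print(sqlglot.transpile("SELECT TO_HEX(MD5('abc'))", read="bigquery", write="duckdb")[0])
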
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
    space_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER by z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$' i.e all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention:
    +---------+---------+---------+------------+---------+
    |  INPUT  | INT64   | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    |  OUTPUT | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="CONTAINS_SUBSTR", expressions=args)

    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr)


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Left,
                exp.Right,
                exp.Lower,
                exp.Upper,
                exp.Pad,
                exp.Trim,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Substring,
            )
        },
        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
        exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Concat: _annotate_concat,
        exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONExtractScalar: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.JSONValueArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>")
        ),
        exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"),
        exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.TimestampFromParts: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DATETIME
        ),
        exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        # BigQuery does not allow ASC/DESC to be used as an identifier
        ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC}
        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - {
            TokenType.ASC,
            TokenType.DESC,
        }
        UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=seq_get(args, 1),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
            "JSON_ARRAY": lambda self: self.expression(
                exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise)
            ),
            "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
            "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(),
        }
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
            TokenType.EXPORT: lambda self: self._parse_export_data(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    start = self._curr
                    while self._is_connected() and not self._match_set(
                        self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False
                    ):
                        self._advance()

                    if start == self._curr:
                        break

                    table_name += self._find_sql(start, self._prev)

                this = exp.Identifier(
                    this=table_name, quoted=this.args.get("quoted")
                ).update_positions(this)
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True).update_positions(this)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    previous_db = table.args["db"]
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set(
                            "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                        )
                        table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
                else:
                    previous_this = table.this
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set(
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

                # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self) -> exp.MakeInterval:
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named arguments
                # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

        def _parse_features_at_time(self) -> exp.FeaturesAtTime:
            expr = self.expression(
                exp.FeaturesAtTime,
                this=(self._match(TokenType.TABLE) and self._parse_table())
                or self._parse_select(nested=True),
            )

            while self._match(TokenType.COMMA):
                arg = self._parse_lambda()

                # Get the LHS of the Kwarg and set the arg to that value, e.g
                # "num_rows => 1" sets the expr's `num_rows` arg
                if arg:
                    expr.set(arg.this.name, arg)

            return expr

        def _parse_export_data(self) -> exp.Export:
            self._match_text_seq("DATA")

            return self.expression(
                exp.Export,
                connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
                options=self._parse_properties(),
                this=self._match_text_seq("AS") and self._parse_select(),
            )

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_projection_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.Space: space_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: lambda self, e: (
                strposition_sql(
                    self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
                )
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
            exp.SafeDivide: rename_func("SAFE_DIVIDE"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BLOB: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPNTZ: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.UUID: "STRING",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
            unit = expression.unit
            unit_sql = unit.name if unit.is_string else self.sql(unit)
            return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone"))

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            this = expression.this
            if isinstance(this, exp.TsOrDsToDatetime):
                func_name = "FORMAT_DATETIME"
            elif isinstance(this, exp.TsOrDsToTimestamp):
                func_name = "FORMAT_TIMESTAMP"
            else:
                func_name = "FORMAT_DATE"

            time_expr = (
                this
                if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
                else expression
            )
            return self.func(
                func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
            )

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg, dialect=self.dialect)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)

        def contains_sql(self, expression: exp.Contains) -> str:
            this = expression.this
            expr = expression.expression

            if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
                this = this.this
                expr = expr.this

            return self.func("CONTAINS_SUBSTR", this, expr)

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
            # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
            # because they aren't literals and so the above syntax is invalid BigQuery.
            if isinstance(this, exp.Array):
                elem = seq_get(this.expressions, 0)
                if not (elem and elem.find(exp.Query)):
                    return f"{self.sql(expression, 'to')}{self.sql(this)}"

            return super().cast_sql(expression, safe_prefix=safe_prefix)
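
As a rough illustration of the table-name handling implemented in _parse_table_parts above (a sketch; the table name is a placeholder and the exact output may vary by sqlglot version), a single-backtick BigQuery table path with dots and a dashed project name is split into catalog/db/name parts and round-trips back to BigQuery SQL:

import sqlglot
from sqlglot import exp

query = "SELECT * FROM `my-project.my_dataset.my_table`"
table = sqlglot.parse_one(query, read="bigquery").find(exp.Table)

# Inspect the parts produced by the BigQuery-specific table parsing
print(table.catalog, table.db, table.name)

# Generate BigQuery SQL again from the parsed tree
print(sqlglot.parse_one(query, read="bigquery").sql(dialect="bigquery"))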
828 829 def _parse_json_object(self, agg=False): 830 json_object = super()._parse_json_object() 831 array_kv_pair = seq_get(json_object.expressions, 0) 832 833 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 834 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 835 if ( 836 array_kv_pair 837 and isinstance(array_kv_pair.this, exp.Array) 838 and isinstance(array_kv_pair.expression, exp.Array) 839 ): 840 keys = array_kv_pair.this.expressions 841 values = array_kv_pair.expression.expressions 842 843 json_object.set( 844 "expressions", 845 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 846 ) 847 848 return json_object 849 850 def _parse_bracket( 851 self, this: t.Optional[exp.Expression] = None 852 ) -> t.Optional[exp.Expression]: 853 bracket = super()._parse_bracket(this) 854 855 if this is bracket: 856 return bracket 857 858 if isinstance(bracket, exp.Bracket): 859 for expression in bracket.expressions: 860 name = expression.name.upper() 861 862 if name not in self.BRACKET_OFFSETS: 863 break 864 865 offset, safe = self.BRACKET_OFFSETS[name] 866 bracket.set("offset", offset) 867 bracket.set("safe", safe) 868 expression.replace(expression.expressions[0]) 869 870 return bracket 871 872 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 873 unnest = super()._parse_unnest(with_alias=with_alias) 874 875 if not unnest: 876 return None 877 878 unnest_expr = seq_get(unnest.expressions, 0) 879 if unnest_expr: 880 from sqlglot.optimizer.annotate_types import annotate_types 881 882 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 883 884 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 885 # in contrast to other dialects such as DuckDB which flattens only the array by default 886 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 887 array_elem.is_type(exp.DataType.Type.STRUCT) 888 for array_elem in unnest_expr._type.expressions 889 ): 890 unnest.set("explode_array", True) 891 892 return unnest 893 894 def _parse_make_interval(self) -> exp.MakeInterval: 895 expr = exp.MakeInterval() 896 897 for arg_key in expr.arg_types: 898 value = self._parse_lambda() 899 900 if not value: 901 break 902 903 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 904 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 905 if isinstance(value, exp.Kwarg): 906 arg_key = value.this.name 907 908 expr.set(arg_key, value) 909 910 self._match(TokenType.COMMA) 911 912 return expr 913 914 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 915 expr = self.expression( 916 exp.FeaturesAtTime, 917 this=(self._match(TokenType.TABLE) and self._parse_table()) 918 or self._parse_select(nested=True), 919 ) 920 921 while self._match(TokenType.COMMA): 922 arg = self._parse_lambda() 923 924 # Get the LHS of the Kwarg and set the arg to that value, e.g 925 # "num_rows => 1" sets the expr's `num_rows` arg 926 if arg: 927 expr.set(arg.this.name, arg) 928 929 return expr 930 931 def _parse_export_data(self) -> exp.Export: 932 self._match_text_seq("DATA") 933 934 return self.expression( 935 exp.Export, 936 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 937 options=self._parse_properties(), 938 this=self._match_text_seq("AS") and self._parse_select(), 939 ) 940 941 class Generator(generator.Generator): 942 INTERVAL_ALLOWS_PLURAL_FORM = False 943 
JOIN_HINTS = False 944 QUERY_HINTS = False 945 TABLE_HINTS = False 946 LIMIT_FETCH = "LIMIT" 947 RENAME_TABLE_WITH_DB = False 948 NVL2_SUPPORTED = False 949 UNNEST_WITH_ORDINALITY = False 950 COLLATE_IS_FUNC = True 951 LIMIT_ONLY_LITERALS = True 952 SUPPORTS_TABLE_ALIAS_COLUMNS = False 953 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 954 JSON_KEY_VALUE_PAIR_SEP = "," 955 NULL_ORDERING_SUPPORTED = False 956 IGNORE_NULLS_IN_FUNC = True 957 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 958 CAN_IMPLEMENT_ARRAY_ANY = True 959 SUPPORTS_TO_NUMBER = False 960 NAMED_PLACEHOLDER_TOKEN = "@" 961 HEX_FUNC = "TO_HEX" 962 WITH_PROPERTIES_PREFIX = "OPTIONS" 963 SUPPORTS_EXPLODING_PROJECTIONS = False 964 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 965 SUPPORTS_UNIX_SECONDS = True 966 967 TRANSFORMS = { 968 **generator.Generator.TRANSFORMS, 969 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 970 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 971 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 972 exp.Array: inline_array_unless_query, 973 exp.ArrayContains: _array_contains_sql, 974 exp.ArrayFilter: filter_array_using_unnest, 975 exp.ArrayRemove: filter_array_using_unnest, 976 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 977 exp.CollateProperty: lambda self, e: ( 978 f"DEFAULT COLLATE {self.sql(e, 'this')}" 979 if e.args.get("default") 980 else f"COLLATE {self.sql(e, 'this')}" 981 ), 982 exp.Commit: lambda *_: "COMMIT TRANSACTION", 983 exp.CountIf: rename_func("COUNTIF"), 984 exp.Create: _create_sql, 985 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 986 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 987 exp.DateDiff: lambda self, e: self.func( 988 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 989 ), 990 exp.DateFromParts: rename_func("DATE"), 991 exp.DateStrToDate: datestrtodate_sql, 992 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 993 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 994 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 995 exp.FromTimeZone: lambda self, e: self.func( 996 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 997 ), 998 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 999 exp.GroupConcat: lambda self, e: groupconcat_sql( 1000 self, e, func_name="STRING_AGG", within_group=False 1001 ), 1002 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1003 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1004 exp.If: if_sql(false_value="NULL"), 1005 exp.ILike: no_ilike_sql, 1006 exp.IntDiv: rename_func("DIV"), 1007 exp.Int64: rename_func("INT64"), 1008 exp.JSONExtract: _json_extract_sql, 1009 exp.JSONExtractArray: _json_extract_sql, 1010 exp.JSONExtractScalar: _json_extract_sql, 1011 exp.JSONFormat: rename_func("TO_JSON_STRING"), 1012 exp.Levenshtein: _levenshtein_sql, 1013 exp.Max: max_or_greatest, 1014 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1015 exp.MD5Digest: rename_func("MD5"), 1016 exp.Min: min_or_least, 1017 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1018 exp.RegexpExtract: lambda self, e: self.func( 1019 "REGEXP_EXTRACT", 1020 e.this, 1021 e.expression, 1022 e.args.get("position"), 1023 e.args.get("occurrence"), 1024 ), 1025 exp.RegexpExtractAll: lambda self, e: self.func( 1026 "REGEXP_EXTRACT_ALL", e.this, e.expression 1027 ), 1028 exp.RegexpReplace: regexp_replace_sql, 1029 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1030 
exp.ReturnsProperty: _returnsproperty_sql, 1031 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1032 exp.Select: transforms.preprocess( 1033 [ 1034 transforms.explode_projection_to_unnest(), 1035 transforms.unqualify_unnest, 1036 transforms.eliminate_distinct_on, 1037 _alias_ordered_group, 1038 transforms.eliminate_semi_and_anti_joins, 1039 ] 1040 ), 1041 exp.SHA: rename_func("SHA1"), 1042 exp.SHA2: sha256_sql, 1043 exp.Space: space_sql, 1044 exp.StabilityProperty: lambda self, e: ( 1045 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1046 ), 1047 exp.String: rename_func("STRING"), 1048 exp.StrPosition: lambda self, e: ( 1049 strposition_sql( 1050 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1051 ) 1052 ), 1053 exp.StrToDate: _str_to_datetime_sql, 1054 exp.StrToTime: _str_to_datetime_sql, 1055 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1056 exp.TimeFromParts: rename_func("TIME"), 1057 exp.TimestampFromParts: rename_func("DATETIME"), 1058 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1059 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1060 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1061 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1062 exp.TimeStrToTime: timestrtotime_sql, 1063 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1064 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1065 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1066 exp.TsOrDsToTime: rename_func("TIME"), 1067 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1068 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1069 exp.Unhex: rename_func("FROM_HEX"), 1070 exp.UnixDate: rename_func("UNIX_DATE"), 1071 exp.UnixToTime: _unix_to_time_sql, 1072 exp.Uuid: lambda *_: "GENERATE_UUID()", 1073 exp.Values: _derived_table_values_to_unnest, 1074 exp.VariancePop: rename_func("VAR_POP"), 1075 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1076 } 1077 1078 SUPPORTED_JSON_PATH_PARTS = { 1079 exp.JSONPathKey, 1080 exp.JSONPathRoot, 1081 exp.JSONPathSubscript, 1082 } 1083 1084 TYPE_MAPPING = { 1085 **generator.Generator.TYPE_MAPPING, 1086 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1087 exp.DataType.Type.BIGINT: "INT64", 1088 exp.DataType.Type.BINARY: "BYTES", 1089 exp.DataType.Type.BLOB: "BYTES", 1090 exp.DataType.Type.BOOLEAN: "BOOL", 1091 exp.DataType.Type.CHAR: "STRING", 1092 exp.DataType.Type.DECIMAL: "NUMERIC", 1093 exp.DataType.Type.DOUBLE: "FLOAT64", 1094 exp.DataType.Type.FLOAT: "FLOAT64", 1095 exp.DataType.Type.INT: "INT64", 1096 exp.DataType.Type.NCHAR: "STRING", 1097 exp.DataType.Type.NVARCHAR: "STRING", 1098 exp.DataType.Type.SMALLINT: "INT64", 1099 exp.DataType.Type.TEXT: "STRING", 1100 exp.DataType.Type.TIMESTAMP: "DATETIME", 1101 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1102 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1103 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1104 exp.DataType.Type.TINYINT: "INT64", 1105 exp.DataType.Type.ROWVERSION: "BYTES", 1106 exp.DataType.Type.UUID: "STRING", 1107 exp.DataType.Type.VARBINARY: "BYTES", 1108 exp.DataType.Type.VARCHAR: "STRING", 1109 exp.DataType.Type.VARIANT: "ANY TYPE", 1110 } 1111 1112 PROPERTIES_LOCATION = { 1113 **generator.Generator.PROPERTIES_LOCATION, 1114 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1115 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1116 } 1117 1118 # WINDOW comes after QUALIFY 1119 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1120 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1121 "qualify": 
generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1122 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1123 } 1124 1125 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1126 RESERVED_KEYWORDS = { 1127 "all", 1128 "and", 1129 "any", 1130 "array", 1131 "as", 1132 "asc", 1133 "assert_rows_modified", 1134 "at", 1135 "between", 1136 "by", 1137 "case", 1138 "cast", 1139 "collate", 1140 "contains", 1141 "create", 1142 "cross", 1143 "cube", 1144 "current", 1145 "default", 1146 "define", 1147 "desc", 1148 "distinct", 1149 "else", 1150 "end", 1151 "enum", 1152 "escape", 1153 "except", 1154 "exclude", 1155 "exists", 1156 "extract", 1157 "false", 1158 "fetch", 1159 "following", 1160 "for", 1161 "from", 1162 "full", 1163 "group", 1164 "grouping", 1165 "groups", 1166 "hash", 1167 "having", 1168 "if", 1169 "ignore", 1170 "in", 1171 "inner", 1172 "intersect", 1173 "interval", 1174 "into", 1175 "is", 1176 "join", 1177 "lateral", 1178 "left", 1179 "like", 1180 "limit", 1181 "lookup", 1182 "merge", 1183 "natural", 1184 "new", 1185 "no", 1186 "not", 1187 "null", 1188 "nulls", 1189 "of", 1190 "on", 1191 "or", 1192 "order", 1193 "outer", 1194 "over", 1195 "partition", 1196 "preceding", 1197 "proto", 1198 "qualify", 1199 "range", 1200 "recursive", 1201 "respect", 1202 "right", 1203 "rollup", 1204 "rows", 1205 "select", 1206 "set", 1207 "some", 1208 "struct", 1209 "tablesample", 1210 "then", 1211 "to", 1212 "treat", 1213 "true", 1214 "unbounded", 1215 "union", 1216 "unnest", 1217 "using", 1218 "when", 1219 "where", 1220 "window", 1221 "with", 1222 "within", 1223 } 1224 1225 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1226 unit = expression.unit 1227 unit_sql = unit.name if unit.is_string else self.sql(unit) 1228 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1229 1230 def mod_sql(self, expression: exp.Mod) -> str: 1231 this = expression.this 1232 expr = expression.expression 1233 return self.func( 1234 "MOD", 1235 this.unnest() if isinstance(this, exp.Paren) else this, 1236 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1237 ) 1238 1239 def column_parts(self, expression: exp.Column) -> str: 1240 if expression.meta.get("quoted_column"): 1241 # If a column reference is of the form `dataset.table`.name, we need 1242 # to preserve the quoted table path, otherwise the reference breaks 1243 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1244 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1245 return f"{table_path}.{self.sql(expression, 'this')}" 1246 1247 return super().column_parts(expression) 1248 1249 def table_parts(self, expression: exp.Table) -> str: 1250 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1251 # we need to make sure the correct quoting is used in each case. 
1252 # 1253 # For example, if there is a CTE x that clashes with a schema name, then the former will 1254 # return the table y in that schema, whereas the latter will return the CTE's y column: 1255 # 1256 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1257 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1258 if expression.meta.get("quoted_table"): 1259 table_parts = ".".join(p.name for p in expression.parts) 1260 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1261 1262 return super().table_parts(expression) 1263 1264 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1265 this = expression.this 1266 if isinstance(this, exp.TsOrDsToDatetime): 1267 func_name = "FORMAT_DATETIME" 1268 elif isinstance(this, exp.TsOrDsToTimestamp): 1269 func_name = "FORMAT_TIMESTAMP" 1270 else: 1271 func_name = "FORMAT_DATE" 1272 1273 time_expr = ( 1274 this 1275 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1276 else expression 1277 ) 1278 return self.func( 1279 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1280 ) 1281 1282 def eq_sql(self, expression: exp.EQ) -> str: 1283 # Operands of = cannot be NULL in BigQuery 1284 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1285 if not isinstance(expression.parent, exp.Update): 1286 return "NULL" 1287 1288 return self.binary(expression, "=") 1289 1290 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1291 parent = expression.parent 1292 1293 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1294 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1295 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1296 return self.func( 1297 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1298 ) 1299 1300 return super().attimezone_sql(expression) 1301 1302 def trycast_sql(self, expression: exp.TryCast) -> str: 1303 return self.cast_sql(expression, safe_prefix="SAFE_") 1304 1305 def bracket_sql(self, expression: exp.Bracket) -> str: 1306 this = expression.this 1307 expressions = expression.expressions 1308 1309 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1310 arg = expressions[0] 1311 if arg.type is None: 1312 from sqlglot.optimizer.annotate_types import annotate_types 1313 1314 arg = annotate_types(arg, dialect=self.dialect) 1315 1316 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1317 # BQ doesn't support bracket syntax with string values for structs 1318 return f"{self.sql(this)}.{arg.name}" 1319 1320 expressions_sql = self.expressions(expression, flat=True) 1321 offset = expression.args.get("offset") 1322 1323 if offset == 0: 1324 expressions_sql = f"OFFSET({expressions_sql})" 1325 elif offset == 1: 1326 expressions_sql = f"ORDINAL({expressions_sql})" 1327 elif offset is not None: 1328 self.unsupported(f"Unsupported array offset: {offset}") 1329 1330 if expression.args.get("safe"): 1331 expressions_sql = f"SAFE_{expressions_sql}" 1332 1333 return f"{self.sql(this)}[{expressions_sql}]" 1334 1335 def in_unnest_op(self, expression: exp.Unnest) -> str: 1336 return self.sql(expression) 1337 1338 def version_sql(self, expression: exp.Version) -> str: 1339 if expression.name == "TIMESTAMP": 1340 expression.set("this", "SYSTEM_TIME") 1341 return super().version_sql(expression) 1342 1343 def contains_sql(self, 
expression: exp.Contains) -> str: 1344 this = expression.this 1345 expr = expression.expression 1346 1347 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1348 this = this.this 1349 expr = expr.this 1350 1351 return self.func("CONTAINS_SUBSTR", this, expr) 1352 1353 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1354 this = expression.this 1355 1356 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1357 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1358 # because they aren't literals and so the above syntax is invalid BigQuery. 1359 if isinstance(this, exp.Array): 1360 elem = seq_get(this.expressions, 0) 1361 if not (elem and elem.find(exp.Query)): 1362 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1363 1364 return super().cast_sql(expression, safe_prefix=safe_prefix)
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
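A quick hedged sketch of where this setting lives (just reading the class attribute; how it influences generated SQL depends on which optimizer passes are applied):

    from sqlglot.dialects.bigquery import BigQuery

    # Dialect-level week offset: 0 means weeks start on Monday, -1 on Sunday.
    print(BigQuery.WEEK_OFFSET)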
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG).
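A small sketch that inspects how a two-argument LOG call is parsed; which operand ends up as the base is what this flag controls:

    import sqlglot
    from sqlglot import exp

    # Parse a two-argument LOG call with the BigQuery dialect and look at the AST node.
    log = sqlglot.parse_one("SELECT LOG(64, 2)", read="bigquery").find(exp.Log)
    print(log.this.sql(), log.expression.sql())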
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which will forward the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
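A hedged sketch of checking the BigQuery behavior above with the qualify pass (the exact qualified SQL may differ between sqlglot versions):

    import sqlglot
    from sqlglot.optimizer.qualify import qualify

    sql = """
    WITH data AS (SELECT 1 AS id, 2 AS my_id)
    SELECT id AS my_id FROM data
    WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
    """
    # Under the BigQuery rules, the alias is expected to be forwarded into
    # GROUP BY and HAVING, but not into WHERE.
    print(qualify(sqlglot.parse_one(sql, read="bigquery"), dialect="bigquery").sql("bigquery"))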
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
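For instance, a sketch of round-tripping the two BigQuery spellings that share an AST node (assuming name preservation is enabled for this dialect):

    import sqlglot

    # Both functions parse to the same node; with the original name kept in the
    # node's metadata, each spelling is expected to round-trip unchanged.
    for func in ("JSON_VALUE", "JSON_EXTRACT_SCALAR"):
        print(sqlglot.transpile(f"SELECT {func}(data, '$.name')", read="bigquery", write="bigquery")[0])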
Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
- "upper" or True: Convert names to uppercase.
- "lower": Convert names to lowercase.
- False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
If empty, the corresponding trie will be constructed off of TIME_MAPPING.
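As a hedged sketch of these format mappings in action, translating a BigQuery %-style format string into another dialect (the exact target spelling may vary by version):

    import sqlglot

    # The format tokens are rewritten through the dialect's time mapping.
    print(sqlglot.transpile("SELECT FORMAT_DATE('%Y-%m-%d', d)", read="bigquery", write="duckdb")[0])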
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
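A sketch of why this matters for BigQuery, which does not accept a bare UNION; the transpiled query is expected to carry an explicit DISTINCT (or ALL):

    import sqlglot

    # Transpile a bare UNION from a dialect where DISTINCT is implicit.
    print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="duckdb", write="bigquery")[0])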
478 def normalize_identifier(self, expression: E) -> E: 479 if ( 480 isinstance(expression, exp.Identifier) 481 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 482 ): 483 parent = expression.parent 484 while isinstance(parent, exp.Dot): 485 parent = parent.parent 486 487 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 488 # by default. The following check uses a heuristic to detect tables based on whether 489 # they are qualified. This should generally be correct, because tables in BigQuery 490 # must be qualified with at least a dataset, unless @@dataset_id is set. 491 case_sensitive = ( 492 isinstance(parent, exp.UserDefinedFunction) 493 or ( 494 isinstance(parent, exp.Table) 495 and parent.db 496 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 497 ) 498 or expression.meta.get("is_table") 499 ) 500 if not case_sensitive: 501 expression.set("this", expression.this.lower()) 502 503 return t.cast(E, expression) 504 505 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, they may always be case-sensitive on Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
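A minimal sketch of the BigQuery behavior described above: an unqualified, unquoted identifier is treated as case-insensitive and therefore lowercased:

    from sqlglot import exp
    from sqlglot.dialects.bigquery import BigQuery

    dialect = BigQuery()
    # Bare identifiers are case-insensitive in BigQuery, so this prints "foo".
    print(dialect.normalize_identifier(exp.to_identifier("FoO")).sql(dialect="bigquery"))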
Mapping of an escaped sequence (\n) to its unescaped version (e.g. a literal newline character).
507 class Tokenizer(tokens.Tokenizer): 508 QUOTES = ["'", '"', '"""', "'''"] 509 COMMENTS = ["--", "#", ("/*", "*/")] 510 IDENTIFIERS = ["`"] 511 STRING_ESCAPES = ["\\"] 512 513 HEX_STRINGS = [("0x", ""), ("0X", "")] 514 515 BYTE_STRINGS = [ 516 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 517 ] 518 519 RAW_STRINGS = [ 520 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 521 ] 522 523 NESTED_COMMENTS = False 524 525 KEYWORDS = { 526 **tokens.Tokenizer.KEYWORDS, 527 "ANY TYPE": TokenType.VARIANT, 528 "BEGIN": TokenType.COMMAND, 529 "BEGIN TRANSACTION": TokenType.BEGIN, 530 "BYTEINT": TokenType.INT, 531 "BYTES": TokenType.BINARY, 532 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 533 "DATETIME": TokenType.TIMESTAMP, 534 "DECLARE": TokenType.COMMAND, 535 "ELSEIF": TokenType.COMMAND, 536 "EXCEPTION": TokenType.COMMAND, 537 "EXPORT": TokenType.EXPORT, 538 "FLOAT64": TokenType.DOUBLE, 539 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 540 "MODEL": TokenType.MODEL, 541 "NOT DETERMINISTIC": TokenType.VOLATILE, 542 "RECORD": TokenType.STRUCT, 543 "TIMESTAMP": TokenType.TIMESTAMPTZ, 544 } 545 KEYWORDS.pop("DIV") 546 KEYWORDS.pop("VALUES") 547 KEYWORDS.pop("/*+")
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
549 class Parser(parser.Parser): 550 PREFIXED_PIVOT_COLUMNS = True 551 LOG_DEFAULTS_TO_LN = True 552 SUPPORTS_IMPLICIT_UNNEST = True 553 JOINS_HAVE_EQUAL_PRECEDENCE = True 554 555 # BigQuery does not allow ASC/DESC to be used as an identifier 556 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 557 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 558 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 559 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 560 TokenType.ASC, 561 TokenType.DESC, 562 } 563 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 564 565 FUNCTIONS = { 566 **parser.Parser.FUNCTIONS, 567 "CONTAINS_SUBSTR": _build_contains_substring, 568 "DATE": _build_date, 569 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 570 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 571 "DATE_TRUNC": lambda args: exp.DateTrunc( 572 unit=seq_get(args, 1), 573 this=seq_get(args, 0), 574 zone=seq_get(args, 2), 575 ), 576 "DATETIME": _build_datetime, 577 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 578 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 579 "DIV": binary_from_function(exp.IntDiv), 580 "EDIT_DISTANCE": _build_levenshtein, 581 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 582 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 583 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 584 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 585 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 586 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 587 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 588 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 589 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 590 "MD5": exp.MD5Digest.from_arg_list, 591 "TO_HEX": _build_to_hex, 592 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 593 [seq_get(args, 1), seq_get(args, 0)] 594 ), 595 "PARSE_TIMESTAMP": _build_parse_timestamp, 596 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 597 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 598 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 599 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 600 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 601 ), 602 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 603 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 604 "SPLIT": lambda args: exp.Split( 605 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 606 this=seq_get(args, 0), 607 expression=seq_get(args, 1) or exp.Literal.string(","), 608 ), 609 "STRPOS": exp.StrPosition.from_arg_list, 610 "TIME": _build_time, 611 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 612 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 613 "TIMESTAMP": _build_timestamp, 614 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 615 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 616 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 617 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 618 ), 619 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 620 this=seq_get(args, 0), 
scale=exp.UnixToTime.MILLIS 621 ), 622 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 623 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 624 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 625 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 626 } 627 628 FUNCTION_PARSERS = { 629 **parser.Parser.FUNCTION_PARSERS, 630 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 631 "JSON_ARRAY": lambda self: self.expression( 632 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 633 ), 634 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 635 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 636 } 637 FUNCTION_PARSERS.pop("TRIM") 638 639 NO_PAREN_FUNCTIONS = { 640 **parser.Parser.NO_PAREN_FUNCTIONS, 641 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 642 } 643 644 NESTED_TYPE_TOKENS = { 645 *parser.Parser.NESTED_TYPE_TOKENS, 646 TokenType.TABLE, 647 } 648 649 PROPERTY_PARSERS = { 650 **parser.Parser.PROPERTY_PARSERS, 651 "NOT DETERMINISTIC": lambda self: self.expression( 652 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 653 ), 654 "OPTIONS": lambda self: self._parse_with_property(), 655 } 656 657 CONSTRAINT_PARSERS = { 658 **parser.Parser.CONSTRAINT_PARSERS, 659 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 660 } 661 662 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 663 RANGE_PARSERS.pop(TokenType.OVERLAPS) 664 665 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 666 667 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 668 669 STATEMENT_PARSERS = { 670 **parser.Parser.STATEMENT_PARSERS, 671 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 672 TokenType.END: lambda self: self._parse_as_command(self._prev), 673 TokenType.FOR: lambda self: self._parse_for_in(), 674 TokenType.EXPORT: lambda self: self._parse_export_data(), 675 } 676 677 BRACKET_OFFSETS = { 678 "OFFSET": (0, False), 679 "ORDINAL": (1, False), 680 "SAFE_OFFSET": (0, True), 681 "SAFE_ORDINAL": (1, True), 682 } 683 684 def _parse_for_in(self) -> exp.ForIn: 685 this = self._parse_range() 686 self._match_text_seq("DO") 687 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 688 689 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 690 this = super()._parse_table_part(schema=schema) or self._parse_number() 691 692 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 693 if isinstance(this, exp.Identifier): 694 table_name = this.name 695 while self._match(TokenType.DASH, advance=False) and self._next: 696 start = self._curr 697 while self._is_connected() and not self._match_set( 698 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 699 ): 700 self._advance() 701 702 if start == self._curr: 703 break 704 705 table_name += self._find_sql(start, self._prev) 706 707 this = exp.Identifier( 708 this=table_name, quoted=this.args.get("quoted") 709 ).update_positions(this) 710 elif isinstance(this, exp.Literal): 711 table_name = this.name 712 713 if self._is_connected() and self._parse_var(any_token=True): 714 table_name += self._prev.text 715 716 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 717 718 return this 719 720 def _parse_table_parts( 721 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 722 ) -> exp.Table: 723 table = super()._parse_table_parts( 724 
schema=schema, is_db_reference=is_db_reference, wildcard=True 725 ) 726 727 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 728 if not table.catalog: 729 if table.db: 730 previous_db = table.args["db"] 731 parts = table.db.split(".") 732 if len(parts) == 2 and not table.args["db"].quoted: 733 table.set( 734 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 735 ) 736 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 737 else: 738 previous_this = table.this 739 parts = table.name.split(".") 740 if len(parts) == 2 and not table.this.quoted: 741 table.set( 742 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 743 ) 744 table.set( 745 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 746 ) 747 748 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 749 alias = table.this 750 catalog, db, this, *rest = ( 751 exp.to_identifier(p, quoted=True) 752 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 753 ) 754 755 for part in (catalog, db, this): 756 if part: 757 part.update_positions(table.this) 758 759 if rest and this: 760 this = exp.Dot.build([this, *rest]) # type: ignore 761 762 table = exp.Table( 763 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 764 ) 765 table.meta["quoted_table"] = True 766 else: 767 alias = None 768 769 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 770 # dataset, so if the project identifier is omitted we need to fix the ast so that 771 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 772 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 773 # views, because it would seem like the "catalog" part is set, when it'd actually 774 # be the region/dataset. Merging the two identifiers into a single one is done to 775 # avoid producing a 4-part Table reference, which would cause issues in the schema 776 # module, when there are 3-part table names mixed with information schema views. 777 # 778 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 779 table_parts = table.parts 780 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 781 # We need to alias the table here to avoid breaking existing qualified columns. 782 # This is expected to be safe, because if there's an actual alias coming up in 783 # the token stream, it will overwrite this one. If there isn't one, we are only 784 # exposing the name that can be used to reference the view explicitly (a no-op). 785 exp.alias_( 786 table, 787 t.cast(exp.Identifier, alias or table_parts[-1]), 788 table=True, 789 copy=False, 790 ) 791 792 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 793 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 794 line=table_parts[-2].meta.get("line"), 795 col=table_parts[-1].meta.get("col"), 796 start=table_parts[-2].meta.get("start"), 797 end=table_parts[-1].meta.get("end"), 798 ) 799 table.set("this", new_this) 800 table.set("db", seq_get(table_parts, -3)) 801 table.set("catalog", seq_get(table_parts, -4)) 802 803 return table 804 805 def _parse_column(self) -> t.Optional[exp.Expression]: 806 column = super()._parse_column() 807 if isinstance(column, exp.Column): 808 parts = column.parts 809 if any("." 
in p.name for p in parts): 810 catalog, db, table, this, *rest = ( 811 exp.to_identifier(p, quoted=True) 812 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 813 ) 814 815 if rest and this: 816 this = exp.Dot.build([this, *rest]) # type: ignore 817 818 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 819 column.meta["quoted_column"] = True 820 821 return column 822 823 @t.overload 824 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 825 826 @t.overload 827 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 828 829 def _parse_json_object(self, agg=False): 830 json_object = super()._parse_json_object() 831 array_kv_pair = seq_get(json_object.expressions, 0) 832 833 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 834 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 835 if ( 836 array_kv_pair 837 and isinstance(array_kv_pair.this, exp.Array) 838 and isinstance(array_kv_pair.expression, exp.Array) 839 ): 840 keys = array_kv_pair.this.expressions 841 values = array_kv_pair.expression.expressions 842 843 json_object.set( 844 "expressions", 845 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 846 ) 847 848 return json_object 849 850 def _parse_bracket( 851 self, this: t.Optional[exp.Expression] = None 852 ) -> t.Optional[exp.Expression]: 853 bracket = super()._parse_bracket(this) 854 855 if this is bracket: 856 return bracket 857 858 if isinstance(bracket, exp.Bracket): 859 for expression in bracket.expressions: 860 name = expression.name.upper() 861 862 if name not in self.BRACKET_OFFSETS: 863 break 864 865 offset, safe = self.BRACKET_OFFSETS[name] 866 bracket.set("offset", offset) 867 bracket.set("safe", safe) 868 expression.replace(expression.expressions[0]) 869 870 return bracket 871 872 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 873 unnest = super()._parse_unnest(with_alias=with_alias) 874 875 if not unnest: 876 return None 877 878 unnest_expr = seq_get(unnest.expressions, 0) 879 if unnest_expr: 880 from sqlglot.optimizer.annotate_types import annotate_types 881 882 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 883 884 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 885 # in contrast to other dialects such as DuckDB which flattens only the array by default 886 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 887 array_elem.is_type(exp.DataType.Type.STRUCT) 888 for array_elem in unnest_expr._type.expressions 889 ): 890 unnest.set("explode_array", True) 891 892 return unnest 893 894 def _parse_make_interval(self) -> exp.MakeInterval: 895 expr = exp.MakeInterval() 896 897 for arg_key in expr.arg_types: 898 value = self._parse_lambda() 899 900 if not value: 901 break 902 903 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 904 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 905 if isinstance(value, exp.Kwarg): 906 arg_key = value.this.name 907 908 expr.set(arg_key, value) 909 910 self._match(TokenType.COMMA) 911 912 return expr 913 914 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 915 expr = self.expression( 916 exp.FeaturesAtTime, 917 this=(self._match(TokenType.TABLE) and self._parse_table()) 918 or self._parse_select(nested=True), 919 ) 920 921 while self._match(TokenType.COMMA): 922 arg = self._parse_lambda() 923 924 # Get the LHS of 
the Kwarg and set the arg to that value, e.g 925 # "num_rows => 1" sets the expr's `num_rows` arg 926 if arg: 927 expr.set(arg.this.name, arg) 928 929 return expr 930 931 def _parse_export_data(self) -> exp.Export: 932 self._match_text_seq("DATA") 933 934 return self.expression( 935 exp.Export, 936 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 937 options=self._parse_properties(), 938 this=self._match_text_seq("AS") and self._parse_select(), 939 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
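A hedged usage sketch of these arguments; they can be passed through the top-level parsing helpers, which forward them to the Parser:

    import sqlglot
    from sqlglot.errors import ErrorLevel, ParseError

    try:
        # With RAISE, errors are collected (up to max_errors) and raised together.
        sqlglot.parse_one("SELECT 1)", read="bigquery", error_level=ErrorLevel.RAISE, max_errors=3)
    except ParseError as e:
        print(e.errors)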
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- errors
- sql
941 class Generator(generator.Generator): 942 INTERVAL_ALLOWS_PLURAL_FORM = False 943 JOIN_HINTS = False 944 QUERY_HINTS = False 945 TABLE_HINTS = False 946 LIMIT_FETCH = "LIMIT" 947 RENAME_TABLE_WITH_DB = False 948 NVL2_SUPPORTED = False 949 UNNEST_WITH_ORDINALITY = False 950 COLLATE_IS_FUNC = True 951 LIMIT_ONLY_LITERALS = True 952 SUPPORTS_TABLE_ALIAS_COLUMNS = False 953 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 954 JSON_KEY_VALUE_PAIR_SEP = "," 955 NULL_ORDERING_SUPPORTED = False 956 IGNORE_NULLS_IN_FUNC = True 957 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 958 CAN_IMPLEMENT_ARRAY_ANY = True 959 SUPPORTS_TO_NUMBER = False 960 NAMED_PLACEHOLDER_TOKEN = "@" 961 HEX_FUNC = "TO_HEX" 962 WITH_PROPERTIES_PREFIX = "OPTIONS" 963 SUPPORTS_EXPLODING_PROJECTIONS = False 964 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 965 SUPPORTS_UNIX_SECONDS = True 966 967 TRANSFORMS = { 968 **generator.Generator.TRANSFORMS, 969 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 970 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 971 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 972 exp.Array: inline_array_unless_query, 973 exp.ArrayContains: _array_contains_sql, 974 exp.ArrayFilter: filter_array_using_unnest, 975 exp.ArrayRemove: filter_array_using_unnest, 976 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 977 exp.CollateProperty: lambda self, e: ( 978 f"DEFAULT COLLATE {self.sql(e, 'this')}" 979 if e.args.get("default") 980 else f"COLLATE {self.sql(e, 'this')}" 981 ), 982 exp.Commit: lambda *_: "COMMIT TRANSACTION", 983 exp.CountIf: rename_func("COUNTIF"), 984 exp.Create: _create_sql, 985 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 986 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 987 exp.DateDiff: lambda self, e: self.func( 988 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 989 ), 990 exp.DateFromParts: rename_func("DATE"), 991 exp.DateStrToDate: datestrtodate_sql, 992 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 993 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 994 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 995 exp.FromTimeZone: lambda self, e: self.func( 996 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 997 ), 998 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 999 exp.GroupConcat: lambda self, e: groupconcat_sql( 1000 self, e, func_name="STRING_AGG", within_group=False 1001 ), 1002 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1003 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1004 exp.If: if_sql(false_value="NULL"), 1005 exp.ILike: no_ilike_sql, 1006 exp.IntDiv: rename_func("DIV"), 1007 exp.Int64: rename_func("INT64"), 1008 exp.JSONExtract: _json_extract_sql, 1009 exp.JSONExtractArray: _json_extract_sql, 1010 exp.JSONExtractScalar: _json_extract_sql, 1011 exp.JSONFormat: rename_func("TO_JSON_STRING"), 1012 exp.Levenshtein: _levenshtein_sql, 1013 exp.Max: max_or_greatest, 1014 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1015 exp.MD5Digest: rename_func("MD5"), 1016 exp.Min: min_or_least, 1017 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1018 exp.RegexpExtract: lambda self, e: self.func( 1019 "REGEXP_EXTRACT", 1020 e.this, 1021 e.expression, 1022 e.args.get("position"), 1023 e.args.get("occurrence"), 1024 ), 1025 exp.RegexpExtractAll: lambda self, e: self.func( 1026 "REGEXP_EXTRACT_ALL", e.this, e.expression 1027 ), 1028 exp.RegexpReplace: 
regexp_replace_sql, 1029 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1030 exp.ReturnsProperty: _returnsproperty_sql, 1031 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1032 exp.Select: transforms.preprocess( 1033 [ 1034 transforms.explode_projection_to_unnest(), 1035 transforms.unqualify_unnest, 1036 transforms.eliminate_distinct_on, 1037 _alias_ordered_group, 1038 transforms.eliminate_semi_and_anti_joins, 1039 ] 1040 ), 1041 exp.SHA: rename_func("SHA1"), 1042 exp.SHA2: sha256_sql, 1043 exp.Space: space_sql, 1044 exp.StabilityProperty: lambda self, e: ( 1045 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1046 ), 1047 exp.String: rename_func("STRING"), 1048 exp.StrPosition: lambda self, e: ( 1049 strposition_sql( 1050 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1051 ) 1052 ), 1053 exp.StrToDate: _str_to_datetime_sql, 1054 exp.StrToTime: _str_to_datetime_sql, 1055 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1056 exp.TimeFromParts: rename_func("TIME"), 1057 exp.TimestampFromParts: rename_func("DATETIME"), 1058 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1059 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1060 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1061 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1062 exp.TimeStrToTime: timestrtotime_sql, 1063 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1064 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1065 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1066 exp.TsOrDsToTime: rename_func("TIME"), 1067 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1068 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1069 exp.Unhex: rename_func("FROM_HEX"), 1070 exp.UnixDate: rename_func("UNIX_DATE"), 1071 exp.UnixToTime: _unix_to_time_sql, 1072 exp.Uuid: lambda *_: "GENERATE_UUID()", 1073 exp.Values: _derived_table_values_to_unnest, 1074 exp.VariancePop: rename_func("VAR_POP"), 1075 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1076 } 1077 1078 SUPPORTED_JSON_PATH_PARTS = { 1079 exp.JSONPathKey, 1080 exp.JSONPathRoot, 1081 exp.JSONPathSubscript, 1082 } 1083 1084 TYPE_MAPPING = { 1085 **generator.Generator.TYPE_MAPPING, 1086 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1087 exp.DataType.Type.BIGINT: "INT64", 1088 exp.DataType.Type.BINARY: "BYTES", 1089 exp.DataType.Type.BLOB: "BYTES", 1090 exp.DataType.Type.BOOLEAN: "BOOL", 1091 exp.DataType.Type.CHAR: "STRING", 1092 exp.DataType.Type.DECIMAL: "NUMERIC", 1093 exp.DataType.Type.DOUBLE: "FLOAT64", 1094 exp.DataType.Type.FLOAT: "FLOAT64", 1095 exp.DataType.Type.INT: "INT64", 1096 exp.DataType.Type.NCHAR: "STRING", 1097 exp.DataType.Type.NVARCHAR: "STRING", 1098 exp.DataType.Type.SMALLINT: "INT64", 1099 exp.DataType.Type.TEXT: "STRING", 1100 exp.DataType.Type.TIMESTAMP: "DATETIME", 1101 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1102 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1103 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1104 exp.DataType.Type.TINYINT: "INT64", 1105 exp.DataType.Type.ROWVERSION: "BYTES", 1106 exp.DataType.Type.UUID: "STRING", 1107 exp.DataType.Type.VARBINARY: "BYTES", 1108 exp.DataType.Type.VARCHAR: "STRING", 1109 exp.DataType.Type.VARIANT: "ANY TYPE", 1110 } 1111 1112 PROPERTIES_LOCATION = { 1113 **generator.Generator.PROPERTIES_LOCATION, 1114 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1115 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1116 } 1117 1118 # WINDOW comes after QUALIFY 1119 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1120 
AFTER_HAVING_MODIFIER_TRANSFORMS = { 1121 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1122 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1123 } 1124 1125 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1126 RESERVED_KEYWORDS = { 1127 "all", 1128 "and", 1129 "any", 1130 "array", 1131 "as", 1132 "asc", 1133 "assert_rows_modified", 1134 "at", 1135 "between", 1136 "by", 1137 "case", 1138 "cast", 1139 "collate", 1140 "contains", 1141 "create", 1142 "cross", 1143 "cube", 1144 "current", 1145 "default", 1146 "define", 1147 "desc", 1148 "distinct", 1149 "else", 1150 "end", 1151 "enum", 1152 "escape", 1153 "except", 1154 "exclude", 1155 "exists", 1156 "extract", 1157 "false", 1158 "fetch", 1159 "following", 1160 "for", 1161 "from", 1162 "full", 1163 "group", 1164 "grouping", 1165 "groups", 1166 "hash", 1167 "having", 1168 "if", 1169 "ignore", 1170 "in", 1171 "inner", 1172 "intersect", 1173 "interval", 1174 "into", 1175 "is", 1176 "join", 1177 "lateral", 1178 "left", 1179 "like", 1180 "limit", 1181 "lookup", 1182 "merge", 1183 "natural", 1184 "new", 1185 "no", 1186 "not", 1187 "null", 1188 "nulls", 1189 "of", 1190 "on", 1191 "or", 1192 "order", 1193 "outer", 1194 "over", 1195 "partition", 1196 "preceding", 1197 "proto", 1198 "qualify", 1199 "range", 1200 "recursive", 1201 "respect", 1202 "right", 1203 "rollup", 1204 "rows", 1205 "select", 1206 "set", 1207 "some", 1208 "struct", 1209 "tablesample", 1210 "then", 1211 "to", 1212 "treat", 1213 "true", 1214 "unbounded", 1215 "union", 1216 "unnest", 1217 "using", 1218 "when", 1219 "where", 1220 "window", 1221 "with", 1222 "within", 1223 } 1224 1225 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1226 unit = expression.unit 1227 unit_sql = unit.name if unit.is_string else self.sql(unit) 1228 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1229 1230 def mod_sql(self, expression: exp.Mod) -> str: 1231 this = expression.this 1232 expr = expression.expression 1233 return self.func( 1234 "MOD", 1235 this.unnest() if isinstance(this, exp.Paren) else this, 1236 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1237 ) 1238 1239 def column_parts(self, expression: exp.Column) -> str: 1240 if expression.meta.get("quoted_column"): 1241 # If a column reference is of the form `dataset.table`.name, we need 1242 # to preserve the quoted table path, otherwise the reference breaks 1243 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1244 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1245 return f"{table_path}.{self.sql(expression, 'this')}" 1246 1247 return super().column_parts(expression) 1248 1249 def table_parts(self, expression: exp.Table) -> str: 1250 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1251 # we need to make sure the correct quoting is used in each case. 
1252 # 1253 # For example, if there is a CTE x that clashes with a schema name, then the former will 1254 # return the table y in that schema, whereas the latter will return the CTE's y column: 1255 # 1256 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1257 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1258 if expression.meta.get("quoted_table"): 1259 table_parts = ".".join(p.name for p in expression.parts) 1260 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1261 1262 return super().table_parts(expression) 1263 1264 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1265 this = expression.this 1266 if isinstance(this, exp.TsOrDsToDatetime): 1267 func_name = "FORMAT_DATETIME" 1268 elif isinstance(this, exp.TsOrDsToTimestamp): 1269 func_name = "FORMAT_TIMESTAMP" 1270 else: 1271 func_name = "FORMAT_DATE" 1272 1273 time_expr = ( 1274 this 1275 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1276 else expression 1277 ) 1278 return self.func( 1279 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1280 ) 1281 1282 def eq_sql(self, expression: exp.EQ) -> str: 1283 # Operands of = cannot be NULL in BigQuery 1284 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1285 if not isinstance(expression.parent, exp.Update): 1286 return "NULL" 1287 1288 return self.binary(expression, "=") 1289 1290 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1291 parent = expression.parent 1292 1293 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1294 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1295 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1296 return self.func( 1297 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1298 ) 1299 1300 return super().attimezone_sql(expression) 1301 1302 def trycast_sql(self, expression: exp.TryCast) -> str: 1303 return self.cast_sql(expression, safe_prefix="SAFE_") 1304 1305 def bracket_sql(self, expression: exp.Bracket) -> str: 1306 this = expression.this 1307 expressions = expression.expressions 1308 1309 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1310 arg = expressions[0] 1311 if arg.type is None: 1312 from sqlglot.optimizer.annotate_types import annotate_types 1313 1314 arg = annotate_types(arg, dialect=self.dialect) 1315 1316 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1317 # BQ doesn't support bracket syntax with string values for structs 1318 return f"{self.sql(this)}.{arg.name}" 1319 1320 expressions_sql = self.expressions(expression, flat=True) 1321 offset = expression.args.get("offset") 1322 1323 if offset == 0: 1324 expressions_sql = f"OFFSET({expressions_sql})" 1325 elif offset == 1: 1326 expressions_sql = f"ORDINAL({expressions_sql})" 1327 elif offset is not None: 1328 self.unsupported(f"Unsupported array offset: {offset}") 1329 1330 if expression.args.get("safe"): 1331 expressions_sql = f"SAFE_{expressions_sql}" 1332 1333 return f"{self.sql(this)}[{expressions_sql}]" 1334 1335 def in_unnest_op(self, expression: exp.Unnest) -> str: 1336 return self.sql(expression) 1337 1338 def version_sql(self, expression: exp.Version) -> str: 1339 if expression.name == "TIMESTAMP": 1340 expression.set("this", "SYSTEM_TIME") 1341 return super().version_sql(expression) 1342 1343 def contains_sql(self, 
expression: exp.Contains) -> str: 1344 this = expression.this 1345 expr = expression.expression 1346 1347 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1348 this = this.this 1349 expr = expr.this 1350 1351 return self.func("CONTAINS_SUBSTR", this, expr) 1352 1353 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1354 this = expression.this 1355 1356 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1357 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1358 # because they aren't literals and so the above syntax is invalid BigQuery. 1359 if isinstance(this, exp.Array): 1360 elem = seq_get(this.expressions, 0) 1361 if not (elem and elem.find(exp.Query)): 1362 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1363 1364 return super().cast_sql(expression, safe_prefix=safe_prefix)
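The method bodies above carry their own explanatory comments (a SAFE_ prefix for TRY_CAST, NULL folding for `= NULL`, renames such as REGEXP_CONTAINS in the TRANSFORMS table). Below is a minimal sketch, assuming a recent sqlglot install, of how those rewrites surface through the public transpile API; the expected outputs are hedged and may differ slightly between versions.

```python
import sqlglot

# trycast_sql adds a SAFE_ prefix, and TYPE_MAPPING maps INT to INT64.
print(sqlglot.transpile("SELECT TRY_CAST(x AS INT)", read="duckdb", write="bigquery")[0])
# expected: SELECT SAFE_CAST(x AS INT64)

# eq_sql: an equality with a NULL operand folds to NULL outside of UPDATE statements.
print(sqlglot.transpile("SELECT a = NULL", read="bigquery", write="bigquery")[0])
# expected: SELECT NULL

# exp.RegexpLike is renamed to REGEXP_CONTAINS by the TRANSFORMS mapping.
print(sqlglot.transpile("SELECT REGEXP_LIKE(x, 'ab+')", read="snowflake", write="bigquery")[0])
# expected: SELECT REGEXP_CONTAINS(x, 'ab+')
```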
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3.
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False.
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80.
- comments: Whether to preserve comments in the output SQL code. Default: True.
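As a usage sketch (assuming the standard sqlglot entry points), these options do not need to be set on the Generator directly: extra keyword arguments to sqlglot.transpile are forwarded to the dialect's Generator.

```python
import sqlglot

# A minimal sketch: `pretty` and `identify` are example settings chosen from the
# options documented above; any other Generator option can be forwarded the same way.
sql = "select a as x, b from my_table where b > 1"

print(
    sqlglot.transpile(
        sql,
        read="bigquery",
        write="bigquery",
        pretty=True,    # format the output over multiple lines
        identify=True,  # always quote identifiers (backticks in BigQuery)
    )[0]
)
```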
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- featuresattime_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql