sqlglot.dialects.bigquery
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot.optimizer.annotate_types import TypeAnnotator

from sqlglot import exp, generator, jsonpath, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY y + 1
# ORDER BY z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _build_json_strip_nulls(args: t.List) -> exp.JSONStripNulls:
    expression = exp.JSONStripNulls(this=seq_get(args, 0))

    for arg in args[1:]:
        if isinstance(arg, exp.Kwarg):
            expression.set(arg.this.name.lower(), arg)
        else:
            expression.set("expression", arg)

    return expression


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$' i.e all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention:
    +---------+---------+---------+------------+---------+
    | INPUT   | INT64   | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    | OUTPUT  | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


def _annotate_by_args_with_coerce(self: TypeAnnotator, expression: E) -> E:
    """
    +------------+------------+------------+------------+---------+
    | INPUT      | INT64      | NUMERIC    | BIGNUMERIC | FLOAT64 |
    +------------+------------+------------+------------+---------+
    | INT64      | INT64      | NUMERIC    | BIGNUMERIC | FLOAT64 |
    | NUMERIC    | NUMERIC    | NUMERIC    | BIGNUMERIC | FLOAT64 |
    | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | BIGNUMERIC | FLOAT64 |
    | FLOAT64    | FLOAT64    | FLOAT64    | FLOAT64    | FLOAT64 |
    +------------+------------+------------+------------+---------+
    """
    self._annotate_args(expression)

    self._set_type(expression, self._maybe_coerce(expression.this.type, expression.expression.type))
    return expression


def _annotate_by_args_approx_top(self: TypeAnnotator, expression: exp.ApproxTopK) -> exp.ApproxTopK:
    self._annotate_args(expression)

    struct_type = exp.DataType(
        this=exp.DataType.Type.STRUCT,
        expressions=[expression.this.type, exp.DataType(this=exp.DataType.Type.BIGINT)],
        nested=True,
    )
    self._set_type(
        expression,
        exp.DataType(this=exp.DataType.Type.ARRAY, expressions=[struct_type], nested=True),
    )

    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains:
    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr, json_scope=seq_get(args, 2))


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated


def _annotate_array(self: TypeAnnotator, expression: exp.Array) -> exp.Array:
    array_args = expression.expressions

    # BigQuery behaves as follows:
    #
    # SELECT t, TYPEOF(t) FROM (SELECT 'foo') AS t             -- foo, STRUCT<STRING>
    # SELECT ARRAY(SELECT 'foo'), TYPEOF(ARRAY(SELECT 'foo'))  -- foo, ARRAY<STRING>
    if (
        len(array_args) == 1
        and isinstance(select := array_args[0].unnest(), exp.Select)
        and (query_type := select.meta.get("query_type")) is not None
        and query_type.is_type(exp.DataType.Type.STRUCT)
        and len(query_type.expressions) == 1
        and isinstance(col_def := query_type.expressions[0], exp.ColumnDef)
        and (projection_type := col_def.kind) is not None
        and not projection_type.is_type(exp.DataType.Type.UNKNOWN)
    ):
        array_type = exp.DataType(
            this=exp.DataType.Type.ARRAY,
            expressions=[projection_type.copy()],
            nested=True,
        )
        return self._annotate_with_type(expression, array_type)

    return self._annotate_by_args(expression, "expressions", array=True)


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix
    # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont
    COERCES_TO = {
        **TypeAnnotator.COERCES_TO,
        exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE},
    }
    COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL}
    COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL}

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Abs,
                exp.ArgMax,
                exp.ArgMin,
                exp.DateTrunc,
                exp.DatetimeTrunc,
                exp.FirstValue,
                exp.GroupConcat,
                exp.IgnoreNulls,
                exp.JSONExtract,
                exp.Lead,
                exp.Left,
                exp.Lower,
                exp.NthValue,
                exp.Pad,
                exp.PercentileDisc,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Replace,
                exp.RespectNulls,
                exp.Reverse,
                exp.Right,
                exp.SafeNegate,
                exp.Sign,
                exp.Substring,
                exp.TimestampTrunc,
                exp.Translate,
                exp.Trim,
                exp.Upper,
            )
        },
        exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Acosh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e),
        exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e),
        exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.Array: _annotate_array,
        exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"),
        exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CodePointsToBytes: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.BINARY
        ),
        exp.CodePointsToString: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.Concat: _annotate_concat,
        exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE),
        exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.EuclideanDistance: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DOUBLE
        ),
        exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery")
        ),
        exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.IsInf: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.JSONExtractScalar: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.JSONFormat: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR
        ),
        exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery")
        ),
        exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON),
        exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.JSONValueArray: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery")
        ),
        exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"),
        exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN),
        exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME),
        exp.ParseBignumeric: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.BIGDECIMAL
        ),
        exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL),
        exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.VARCHAR
        ),
        exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e),
        exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
        exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY),
        exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
        exp.TimestampFromParts: lambda self, e: self._annotate_with_type(
            e, exp.DataType.Type.DATETIME
        ),
        exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.ToCodePoints: lambda self, e: self._annotate_with_type(
            e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery")
        ),
        exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME),
        exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT),
        exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
        VAR_TOKENS = {
            TokenType.DASH,
            TokenType.VAR,
        }

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.DECLARE,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "LOOP": TokenType.COMMAND,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "REPEAT": TokenType.COMMAND,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
            "WHILE": TokenType.COMMAND,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        # BigQuery does not allow ASC/DESC to be used as identifiers, but it allows GRANT as one
        ID_VAR_TOKENS = {
            *parser.Parser.ID_VAR_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        ALIAS_TOKENS = {
            *parser.Parser.ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        TABLE_ALIAS_TOKENS = {
            *parser.Parser.TABLE_ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        COMMENT_TABLE_ALIAS_TOKENS = {
            *parser.Parser.COMMENT_TABLE_ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        UPDATE_ALIAS_TOKENS = {
            *parser.Parser.UPDATE_ALIAS_TOKENS,
            TokenType.GRANT,
        } - {TokenType.ASC, TokenType.DESC}

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list,
            "BIT_AND": exp.BitwiseAndAgg.from_arg_list,
            "BIT_OR": exp.BitwiseOrAgg.from_arg_list,
            "BIT_XOR": exp.BitwiseXorAgg.from_arg_list,
            "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list,
            "BOOL": exp.JSONBool.from_arg_list,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=seq_get(args, 1),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list,
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_STRIP_NULLS": _build_json_strip_nulls,
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize(
                this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True
            ),
            "OCTET_LENGTH": exp.ByteLength.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON": lambda args: exp.JSONFormat(
                this=seq_get(args, 0), options=seq_get(args, 1), to_json=True
            ),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
            "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime),
            "FROM_HEX": exp.Unhex.from_arg_list,
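            # A minimal usage sketch (assumed behavior, exact output may vary by
            # sqlglot version): with the mappings above, parsing
            # PARSE_DATE('%Y%m%d', '20240101') under read="bigquery" yields an
            # exp.StrToDate whose format string is normalized, so other dialects'
            # generators can re-render it.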
"WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 869 } 870 # Remove SEARCH to avoid parameter routing issues - let it fall back to Anonymous function 871 FUNCTIONS.pop("SEARCH") 872 873 FUNCTION_PARSERS = { 874 **parser.Parser.FUNCTION_PARSERS, 875 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 876 "JSON_ARRAY": lambda self: self.expression( 877 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 878 ), 879 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 880 "PREDICT": lambda self: self._parse_ml(exp.Predict), 881 "TRANSLATE": lambda self: self._parse_translate(), 882 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 883 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 884 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 885 exp.GenerateEmbedding, is_text=True 886 ), 887 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 888 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 889 } 890 FUNCTION_PARSERS.pop("TRIM") 891 892 NO_PAREN_FUNCTIONS = { 893 **parser.Parser.NO_PAREN_FUNCTIONS, 894 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 895 } 896 897 NESTED_TYPE_TOKENS = { 898 *parser.Parser.NESTED_TYPE_TOKENS, 899 TokenType.TABLE, 900 } 901 902 PROPERTY_PARSERS = { 903 **parser.Parser.PROPERTY_PARSERS, 904 "NOT DETERMINISTIC": lambda self: self.expression( 905 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 906 ), 907 "OPTIONS": lambda self: self._parse_with_property(), 908 } 909 910 CONSTRAINT_PARSERS = { 911 **parser.Parser.CONSTRAINT_PARSERS, 912 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 913 } 914 915 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 916 RANGE_PARSERS.pop(TokenType.OVERLAPS) 917 918 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 919 920 STATEMENT_PARSERS = { 921 **parser.Parser.STATEMENT_PARSERS, 922 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 923 TokenType.END: lambda self: self._parse_as_command(self._prev), 924 TokenType.FOR: lambda self: self._parse_for_in(), 925 TokenType.EXPORT: lambda self: self._parse_export_data(), 926 TokenType.DECLARE: lambda self: self._parse_declare(), 927 } 928 929 BRACKET_OFFSETS = { 930 "OFFSET": (0, False), 931 "ORDINAL": (1, False), 932 "SAFE_OFFSET": (0, True), 933 "SAFE_ORDINAL": (1, True), 934 } 935 936 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 937 index = self._index 938 this = self._parse_range() 939 self._match_text_seq("DO") 940 if self._match(TokenType.COMMAND): 941 self._retreat(index) 942 return self._parse_as_command(self._prev) 943 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 944 945 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 946 this = super()._parse_table_part(schema=schema) or self._parse_number() 947 948 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 949 if isinstance(this, exp.Identifier): 950 table_name = this.name 951 while self._match(TokenType.DASH, advance=False) and self._next: 952 start = self._curr 953 while self._is_connected() and not self._match_set( 954 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 955 ): 956 self._advance() 957 958 if start == self._curr: 959 break 960 961 table_name += self._find_sql(start, self._prev) 962 963 this = exp.Identifier( 964 this=table_name, quoted=this.args.get("quoted") 965 
                this = exp.Identifier(
                    this=table_name, quoted=this.args.get("quoted")
                ).update_positions(this)
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True).update_positions(this)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    previous_db = table.args["db"]
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set(
                            "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                        )
                        table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
                else:
                    previous_this = table.this
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set(
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
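                #
                # Sketch of the resulting shape (assumed): dataset.INFORMATION_SCHEMA.TABLES
                # ends up as Table(db=dataset, this=`INFORMATION_SCHEMA.TABLES`), rather than
                # a three-part name where `dataset` would land in the catalog slot.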
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

                # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
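                    # e.g. (sketch, assumed semantics): UNNEST([STRUCT(1 AS a)]) in
                    # BigQuery exposes the column `a` directly, so the unnest is
                    # marked here for struct explosion when transpiling elsewhere.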
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self) -> exp.MakeInterval:
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named arguments
                # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

        def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E:
            self._match_text_seq("MODEL")
            this = self._parse_table()

            self._match(TokenType.COMMA)
            self._match_text_seq("TABLE")

            # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one
            expression = (
                self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None
            )

            self._match(TokenType.COMMA)

            return self.expression(
                expr_type,
                this=this,
                expression=expression,
                params_struct=self._parse_bitwise(),
                **kwargs,
            )

        def _parse_translate(self) -> exp.Translate | exp.MLTranslate:
            # Check if this is ML.TRANSLATE by looking at previous tokens
            token = seq_get(self._tokens, self._index - 4)
            if token and token.text.upper() == "ML":
                return self._parse_ml(exp.MLTranslate)

            return exp.Translate.from_arg_list(self._parse_function_args())

        def _parse_features_at_time(self) -> exp.FeaturesAtTime:
            self._match(TokenType.TABLE)
            this = self._parse_table()

            expr = self.expression(exp.FeaturesAtTime, this=this)

            while self._match(TokenType.COMMA):
                arg = self._parse_lambda()

                # Get the LHS of the Kwarg and set the arg to that value, e.g
                # "num_rows => 1" sets the expr's `num_rows` arg
                if arg:
                    expr.set(arg.this.name, arg)

            return expr

        def _parse_vector_search(self) -> exp.VectorSearch:
            self._match(TokenType.TABLE)
            base_table = self._parse_table()

            self._match(TokenType.COMMA)

            column_to_search = self._parse_bitwise()
            self._match(TokenType.COMMA)

            self._match(TokenType.TABLE)
            query_table = self._parse_table()

            expr = self.expression(
                exp.VectorSearch,
                this=base_table,
                column_to_search=column_to_search,
                query_table=query_table,
            )

            while self._match(TokenType.COMMA):
                # query_column_to_search can be a named argument or a positional one
                if self._match(TokenType.STRING, advance=False):
                    query_column = self._parse_string()
                    expr.set("query_column_to_search", query_column)
                else:
                    arg = self._parse_lambda()
                    if arg:
                        expr.set(arg.this.name, arg)

            return expr

        def _parse_export_data(self) -> exp.Export:
            self._match_text_seq("DATA")

            return self.expression(
                exp.Export,
                connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
                options=self._parse_properties(),
                this=self._match_text_seq("AS") and self._parse_select(),
            )

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$")

        TS_OR_DS_TYPES = (
            exp.TsOrDsToDatetime,
            exp.TsOrDsToTimestamp,
            exp.TsOrDsToTime,
            exp.TsOrDsToDate,
        )

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"),
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.BitwiseAndAgg: rename_func("BIT_AND"),
            exp.BitwiseOrAgg: rename_func("BIT_OR"),
            exp.BitwiseXorAgg: rename_func("BIT_XOR"),
            exp.BitwiseCountAgg: rename_func("BIT_COUNT"),
            exp.ByteLength: rename_func("BYTE_LENGTH"),
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONBool: rename_func("BOOL"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: lambda self, e: self.func(
                "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING",
                e.this,
                e.args.get("options"),
            ),
            exp.JSONKeysAtDepth: rename_func("JSON_KEYS"),
            exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.Normalize: lambda self, e: self.func(
                "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE",
                e.this,
                e.args.get("form"),
            ),
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this),
            exp.ParseDatetime: lambda self, e: self.func(
                "PARSE_DATETIME", self.format_time(e), e.this
            ),
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_projection_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: lambda self, e: (
                strposition_sql(
                    self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
                )
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
            exp.SafeDivide: rename_func("SAFE_DIVIDE"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BLOB: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
"STRING", 1444 exp.DataType.Type.NVARCHAR: "STRING", 1445 exp.DataType.Type.SMALLINT: "INT64", 1446 exp.DataType.Type.TEXT: "STRING", 1447 exp.DataType.Type.TIMESTAMP: "DATETIME", 1448 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1449 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1450 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1451 exp.DataType.Type.TINYINT: "INT64", 1452 exp.DataType.Type.ROWVERSION: "BYTES", 1453 exp.DataType.Type.UUID: "STRING", 1454 exp.DataType.Type.VARBINARY: "BYTES", 1455 exp.DataType.Type.VARCHAR: "STRING", 1456 exp.DataType.Type.VARIANT: "ANY TYPE", 1457 } 1458 1459 PROPERTIES_LOCATION = { 1460 **generator.Generator.PROPERTIES_LOCATION, 1461 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1462 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1463 } 1464 1465 # WINDOW comes after QUALIFY 1466 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1467 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1468 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1469 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1470 } 1471 1472 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1473 RESERVED_KEYWORDS = { 1474 "all", 1475 "and", 1476 "any", 1477 "array", 1478 "as", 1479 "asc", 1480 "assert_rows_modified", 1481 "at", 1482 "between", 1483 "by", 1484 "case", 1485 "cast", 1486 "collate", 1487 "contains", 1488 "create", 1489 "cross", 1490 "cube", 1491 "current", 1492 "default", 1493 "define", 1494 "desc", 1495 "distinct", 1496 "else", 1497 "end", 1498 "enum", 1499 "escape", 1500 "except", 1501 "exclude", 1502 "exists", 1503 "extract", 1504 "false", 1505 "fetch", 1506 "following", 1507 "for", 1508 "from", 1509 "full", 1510 "group", 1511 "grouping", 1512 "groups", 1513 "hash", 1514 "having", 1515 "if", 1516 "ignore", 1517 "in", 1518 "inner", 1519 "intersect", 1520 "interval", 1521 "into", 1522 "is", 1523 "join", 1524 "lateral", 1525 "left", 1526 "like", 1527 "limit", 1528 "lookup", 1529 "merge", 1530 "natural", 1531 "new", 1532 "no", 1533 "not", 1534 "null", 1535 "nulls", 1536 "of", 1537 "on", 1538 "or", 1539 "order", 1540 "outer", 1541 "over", 1542 "partition", 1543 "preceding", 1544 "proto", 1545 "qualify", 1546 "range", 1547 "recursive", 1548 "respect", 1549 "right", 1550 "rollup", 1551 "rows", 1552 "select", 1553 "set", 1554 "some", 1555 "struct", 1556 "tablesample", 1557 "then", 1558 "to", 1559 "treat", 1560 "true", 1561 "unbounded", 1562 "union", 1563 "unnest", 1564 "using", 1565 "when", 1566 "where", 1567 "window", 1568 "with", 1569 "within", 1570 } 1571 1572 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1573 unit = expression.unit 1574 unit_sql = unit.name if unit.is_string else self.sql(unit) 1575 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1576 1577 def mod_sql(self, expression: exp.Mod) -> str: 1578 this = expression.this 1579 expr = expression.expression 1580 return self.func( 1581 "MOD", 1582 this.unnest() if isinstance(this, exp.Paren) else this, 1583 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1584 ) 1585 1586 def column_parts(self, expression: exp.Column) -> str: 1587 if expression.meta.get("quoted_column"): 1588 # If a column reference is of the form `dataset.table`.name, we need 1589 # to preserve the quoted table path, otherwise the reference breaks 1590 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1591 table_path = 
self.sql(exp.Identifier(this=table_parts, quoted=True)) 1592 return f"{table_path}.{self.sql(expression, 'this')}" 1593 1594 return super().column_parts(expression) 1595 1596 def table_parts(self, expression: exp.Table) -> str: 1597 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1598 # we need to make sure the correct quoting is used in each case. 1599 # 1600 # For example, if there is a CTE x that clashes with a schema name, then the former will 1601 # return the table y in that schema, whereas the latter will return the CTE's y column: 1602 # 1603 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1604 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1605 if expression.meta.get("quoted_table"): 1606 table_parts = ".".join(p.name for p in expression.parts) 1607 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1608 1609 return super().table_parts(expression) 1610 1611 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1612 this = expression.this 1613 if isinstance(this, exp.TsOrDsToDatetime): 1614 func_name = "FORMAT_DATETIME" 1615 elif isinstance(this, exp.TsOrDsToTimestamp): 1616 func_name = "FORMAT_TIMESTAMP" 1617 elif isinstance(this, exp.TsOrDsToTime): 1618 func_name = "FORMAT_TIME" 1619 else: 1620 func_name = "FORMAT_DATE" 1621 1622 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1623 return self.func( 1624 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1625 ) 1626 1627 def eq_sql(self, expression: exp.EQ) -> str: 1628 # Operands of = cannot be NULL in BigQuery 1629 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1630 if not isinstance(expression.parent, exp.Update): 1631 return "NULL" 1632 1633 return self.binary(expression, "=") 1634 1635 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1636 parent = expression.parent 1637 1638 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1639 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1640 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1641 return self.func( 1642 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1643 ) 1644 1645 return super().attimezone_sql(expression) 1646 1647 def trycast_sql(self, expression: exp.TryCast) -> str: 1648 return self.cast_sql(expression, safe_prefix="SAFE_") 1649 1650 def bracket_sql(self, expression: exp.Bracket) -> str: 1651 this = expression.this 1652 expressions = expression.expressions 1653 1654 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1655 arg = expressions[0] 1656 if arg.type is None: 1657 from sqlglot.optimizer.annotate_types import annotate_types 1658 1659 arg = annotate_types(arg, dialect=self.dialect) 1660 1661 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1662 # BQ doesn't support bracket syntax with string values for structs 1663 return f"{self.sql(this)}.{arg.name}" 1664 1665 expressions_sql = self.expressions(expression, flat=True) 1666 offset = expression.args.get("offset") 1667 1668 if offset == 0: 1669 expressions_sql = f"OFFSET({expressions_sql})" 1670 elif offset == 1: 1671 expressions_sql = f"ORDINAL({expressions_sql})" 1672 elif offset is not None: 1673 self.unsupported(f"Unsupported array offset: {offset}") 1674 1675 if expression.args.get("safe"): 1676 expressions_sql = f"SAFE_{expressions_sql}" 1677 1678 return f"{self.sql(this)}[{expressions_sql}]" 1679 1680 def in_unnest_op(self, expression: exp.Unnest) -> str: 1681 return self.sql(expression) 1682 1683 def version_sql(self, expression: exp.Version) -> str: 1684 if expression.name == "TIMESTAMP": 1685 expression.set("this", "SYSTEM_TIME") 1686 return super().version_sql(expression) 1687 1688 def contains_sql(self, expression: exp.Contains) -> str: 1689 this = expression.this 1690 expr = expression.expression 1691 1692 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1693 this = this.this 1694 expr = expr.this 1695 1696 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1697 1698 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1699 this = expression.this 1700 1701 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1702 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1703 # because they aren't literals and so the above syntax is invalid BigQuery. 1704 if isinstance(this, exp.Array): 1705 elem = seq_get(this.expressions, 0) 1706 if not (elem and elem.find(exp.Query)): 1707 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1708 1709 return super().cast_sql(expression, safe_prefix=safe_prefix) 1710 1711 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1712 variables = self.expressions(expression, "this") 1713 default = self.sql(expression, "default") 1714 default = f" DEFAULT {default}" if default else "" 1715 kind = self.sql(expression, "kind") 1716 kind = f" {kind}" if kind else "" 1717 1718 return f"{variables}{kind}{default}"
440class BigQuery(Dialect): 441 WEEK_OFFSET = -1 442 UNNEST_COLUMN_ONLY = True 443 SUPPORTS_USER_DEFINED_TYPES = False 444 SUPPORTS_SEMI_ANTI_JOIN = False 445 LOG_BASE_FIRST = False 446 HEX_LOWERCASE = True 447 FORCE_EARLY_ALIAS_REF_EXPANSION = True 448 PRESERVE_ORIGINAL_NAMES = True 449 HEX_STRING_IS_INTEGER_TYPE = True 450 451 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 452 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 453 454 # bigquery udfs are case sensitive 455 NORMALIZE_FUNCTIONS = False 456 457 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 458 TIME_MAPPING = { 459 "%D": "%m/%d/%y", 460 "%E6S": "%S.%f", 461 "%e": "%-d", 462 } 463 464 FORMAT_MAPPING = { 465 "DD": "%d", 466 "MM": "%m", 467 "MON": "%b", 468 "MONTH": "%B", 469 "YYYY": "%Y", 470 "YY": "%y", 471 "HH": "%I", 472 "HH12": "%I", 473 "HH24": "%H", 474 "MI": "%M", 475 "SS": "%S", 476 "SSSSS": "%f", 477 "TZH": "%z", 478 } 479 480 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 481 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 482 # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix 483 # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column 484 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE", "_TABLE_SUFFIX", "_FILE_NAME"} 485 486 # All set operations require either a DISTINCT or ALL specifier 487 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 488 489 # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont 490 COERCES_TO = { 491 **TypeAnnotator.COERCES_TO, 492 exp.DataType.Type.BIGDECIMAL: {exp.DataType.Type.DOUBLE}, 493 } 494 COERCES_TO[exp.DataType.Type.DECIMAL] |= {exp.DataType.Type.BIGDECIMAL} 495 COERCES_TO[exp.DataType.Type.BIGINT] |= {exp.DataType.Type.BIGDECIMAL} 496 497 # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types 498 TYPE_TO_EXPRESSIONS = { 499 **Dialect.TYPE_TO_EXPRESSIONS, 500 exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP], 501 } 502 TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP) 503 504 ANNOTATORS = { 505 **Dialect.ANNOTATORS, 506 **{ 507 expr_type: annotate_with_type_lambda(data_type) 508 for data_type, expressions in TYPE_TO_EXPRESSIONS.items() 509 for expr_type in expressions 510 }, 511 **{ 512 expr_type: lambda self, e: _annotate_math_functions(self, e) 513 for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round) 514 }, 515 **{ 516 expr_type: lambda self, e: self._annotate_by_args(e, "this") 517 for expr_type in ( 518 exp.Abs, 519 exp.ArgMax, 520 exp.ArgMin, 521 exp.DateTrunc, 522 exp.DatetimeTrunc, 523 exp.FirstValue, 524 exp.GroupConcat, 525 exp.IgnoreNulls, 526 exp.JSONExtract, 527 exp.Lead, 528 exp.Left, 529 exp.Lower, 530 exp.NthValue, 531 exp.Pad, 532 exp.PercentileDisc, 533 exp.RegexpExtract, 534 exp.RegexpReplace, 535 exp.Repeat, 536 exp.Replace, 537 exp.RespectNulls, 538 exp.Reverse, 539 exp.Right, 540 exp.SafeNegate, 541 exp.Sign, 542 exp.Substring, 543 exp.TimestampTrunc, 544 exp.Translate, 545 exp.Trim, 546 exp.Upper, 547 ) 548 }, 549 exp.Acos: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 550 exp.Acosh: lambda self, e: 
self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 551 exp.Asin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 552 exp.Asinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 553 exp.Atan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 554 exp.Atanh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 555 exp.Atan2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 556 exp.ApproxTopSum: lambda self, e: _annotate_by_args_approx_top(self, e), 557 exp.ApproxTopK: lambda self, e: _annotate_by_args_approx_top(self, e), 558 exp.ApproxQuantiles: lambda self, e: self._annotate_by_args(e, "this", array=True), 559 exp.Array: _annotate_array, 560 exp.ArrayConcat: lambda self, e: self._annotate_by_args(e, "this", "expressions"), 561 exp.Ascii: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 562 exp.BitwiseAndAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 563 exp.BitwiseOrAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 564 exp.BitwiseXorAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 565 exp.BitwiseCountAgg: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 566 exp.ByteLength: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 567 exp.ByteString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 568 exp.Cbrt: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 569 exp.CodePointsToBytes: lambda self, e: self._annotate_with_type( 570 e, exp.DataType.Type.BINARY 571 ), 572 exp.CodePointsToString: lambda self, e: self._annotate_with_type( 573 e, exp.DataType.Type.VARCHAR 574 ), 575 exp.Concat: _annotate_concat, 576 exp.Corr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 577 exp.Cot: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 578 exp.CosineDistance: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 579 exp.Coth: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 580 exp.CovarPop: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 581 exp.CovarSamp: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 582 exp.Csc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 583 exp.Csch: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 584 exp.CumeDist: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 585 exp.DateFromUnixDate: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATE), 586 exp.DenseRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 587 exp.EuclideanDistance: lambda self, e: self._annotate_with_type( 588 e, exp.DataType.Type.DOUBLE 589 ), 590 exp.FarmFingerprint: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 591 exp.Unhex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 592 exp.Float64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 593 exp.Format: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 594 exp.GenerateTimestampArray: lambda self, e: self._annotate_with_type( 595 e, exp.DataType.build("ARRAY<TIMESTAMP>", dialect="bigquery") 596 ), 597 exp.Grouping: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 598 exp.IsInf: lambda self, e: 
self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 599 exp.IsNan: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 600 exp.JSONArray: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 601 exp.JSONArrayAppend: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 602 exp.JSONArrayInsert: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 603 exp.JSONBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 604 exp.JSONExtractScalar: lambda self, e: self._annotate_with_type( 605 e, exp.DataType.Type.VARCHAR 606 ), 607 exp.JSONExtractArray: lambda self, e: self._annotate_by_args(e, "this", array=True), 608 exp.JSONFormat: lambda self, e: self._annotate_with_type( 609 e, exp.DataType.Type.JSON if e.args.get("to_json") else exp.DataType.Type.VARCHAR 610 ), 611 exp.JSONKeysAtDepth: lambda self, e: self._annotate_with_type( 612 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 613 ), 614 exp.JSONObject: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 615 exp.JSONRemove: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 616 exp.JSONSet: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 617 exp.JSONStripNulls: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.JSON), 618 exp.JSONType: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 619 exp.JSONValueArray: lambda self, e: self._annotate_with_type( 620 e, exp.DataType.build("ARRAY<VARCHAR>", dialect="bigquery") 621 ), 622 exp.Lag: lambda self, e: self._annotate_by_args(e, "this", "default"), 623 exp.LowerHex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 624 exp.LaxBool: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BOOLEAN), 625 exp.LaxFloat64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 626 exp.LaxInt64: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 627 exp.LaxString: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 628 exp.MD5Digest: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 629 exp.Normalize: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 630 exp.Ntile: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 631 exp.ParseTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 632 exp.ParseDatetime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DATETIME), 633 exp.ParseBignumeric: lambda self, e: self._annotate_with_type( 634 e, exp.DataType.Type.BIGDECIMAL 635 ), 636 exp.ParseNumeric: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DECIMAL), 637 exp.PercentileCont: lambda self, e: _annotate_by_args_with_coerce(self, e), 638 exp.PercentRank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 639 exp.Rank: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 640 exp.RangeBucket: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 641 exp.RegexpExtractAll: lambda self, e: self._annotate_by_args(e, "this", array=True), 642 exp.RegexpInstr: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 643 exp.RowNumber: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 644 exp.Rand: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 645 exp.SafeConvertBytesToString: lambda self, e: self._annotate_with_type( 646 e, 
exp.DataType.Type.VARCHAR 647 ), 648 exp.SafeAdd: lambda self, e: _annotate_by_args_with_coerce(self, e), 649 exp.SafeMultiply: lambda self, e: _annotate_by_args_with_coerce(self, e), 650 exp.SafeSubtract: lambda self, e: _annotate_by_args_with_coerce(self, e), 651 exp.Sec: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 652 exp.Sech: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 653 exp.Soundex: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 654 exp.SHA: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 655 exp.SHA2: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BINARY), 656 exp.Sin: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 657 exp.Sinh: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.DOUBLE), 658 exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True), 659 exp.TimestampFromParts: lambda self, e: self._annotate_with_type( 660 e, exp.DataType.Type.DATETIME 661 ), 662 exp.TimeFromParts: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 663 exp.TimeTrunc: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 664 exp.ToCodePoints: lambda self, e: self._annotate_with_type( 665 e, exp.DataType.build("ARRAY<BIGINT>", dialect="bigquery") 666 ), 667 exp.TsOrDsToTime: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.TIME), 668 exp.Unicode: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.BIGINT), 669 exp.Uuid: lambda self, e: self._annotate_with_type(e, exp.DataType.Type.VARCHAR), 670 } 671 672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 
685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression) 700 701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 } 706 707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+") 751 752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier, allows GRANT as an identifier 759 ID_VAR_TOKENS = { 760 *parser.Parser.ID_VAR_TOKENS, 761 TokenType.GRANT, 762 } - {TokenType.ASC, TokenType.DESC} 763 764 ALIAS_TOKENS = { 765 *parser.Parser.ALIAS_TOKENS, 766 TokenType.GRANT, 767 } - {TokenType.ASC, TokenType.DESC} 768 769 TABLE_ALIAS_TOKENS = { 770 *parser.Parser.TABLE_ALIAS_TOKENS, 771 TokenType.GRANT, 772 } - {TokenType.ASC, TokenType.DESC} 773 774 COMMENT_TABLE_ALIAS_TOKENS = { 775 *parser.Parser.COMMENT_TABLE_ALIAS_TOKENS, 776 TokenType.GRANT, 777 } - {TokenType.ASC, TokenType.DESC} 778 779 UPDATE_ALIAS_TOKENS = { 780 *parser.Parser.UPDATE_ALIAS_TOKENS, 781 TokenType.GRANT, 782 } - {TokenType.ASC, TokenType.DESC} 783 784 FUNCTIONS = { 785 **parser.Parser.FUNCTIONS, 786 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 787 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 788 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 789 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 790 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 791 "BOOL": exp.JSONBool.from_arg_list, 792 "CONTAINS_SUBSTR": _build_contains_substring, 793 "DATE": _build_date, 794 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 795 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 796 "DATE_TRUNC": lambda args: exp.DateTrunc( 797 unit=seq_get(args, 1), 798 this=seq_get(args, 0), 799 zone=seq_get(args, 2), 800 ), 801 "DATETIME": _build_datetime, 802 "DATETIME_ADD": 
build_date_delta_with_interval(exp.DatetimeAdd), 803 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 804 "DIV": binary_from_function(exp.IntDiv), 805 "EDIT_DISTANCE": _build_levenshtein, 806 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 807 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 808 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 809 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 810 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 811 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 812 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 813 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 814 "JSON_STRIP_NULLS": _build_json_strip_nulls, 815 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 816 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 817 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 818 "MD5": exp.MD5Digest.from_arg_list, 819 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 820 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 821 ), 822 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 823 "TO_HEX": _build_to_hex, 824 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 825 [seq_get(args, 1), seq_get(args, 0)] 826 ), 827 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 828 [seq_get(args, 1), seq_get(args, 0)] 829 ), 830 "PARSE_TIMESTAMP": _build_parse_timestamp, 831 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 832 [seq_get(args, 1), seq_get(args, 0)] 833 ), 834 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 835 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 836 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 837 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 838 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 839 ), 840 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 841 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 842 "SPLIT": lambda args: exp.Split( 843 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 844 this=seq_get(args, 0), 845 expression=seq_get(args, 1) or exp.Literal.string(","), 846 ), 847 "STRPOS": exp.StrPosition.from_arg_list, 848 "TIME": _build_time, 849 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 850 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 851 "TIMESTAMP": _build_timestamp, 852 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 853 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 854 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 855 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 856 ), 857 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 858 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 859 ), 860 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 861 "TO_JSON": lambda args: exp.JSONFormat( 862 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 863 ), 864 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 865 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 866 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 867 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 868 "FROM_HEX": exp.Unhex.from_arg_list, 869 
"WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 870 } 871 # Remove SEARCH to avoid parameter routing issues - let it fall back to Anonymous function 872 FUNCTIONS.pop("SEARCH") 873 874 FUNCTION_PARSERS = { 875 **parser.Parser.FUNCTION_PARSERS, 876 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 877 "JSON_ARRAY": lambda self: self.expression( 878 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 879 ), 880 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 881 "PREDICT": lambda self: self._parse_ml(exp.Predict), 882 "TRANSLATE": lambda self: self._parse_translate(), 883 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 884 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 885 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 886 exp.GenerateEmbedding, is_text=True 887 ), 888 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 889 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 890 } 891 FUNCTION_PARSERS.pop("TRIM") 892 893 NO_PAREN_FUNCTIONS = { 894 **parser.Parser.NO_PAREN_FUNCTIONS, 895 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 896 } 897 898 NESTED_TYPE_TOKENS = { 899 *parser.Parser.NESTED_TYPE_TOKENS, 900 TokenType.TABLE, 901 } 902 903 PROPERTY_PARSERS = { 904 **parser.Parser.PROPERTY_PARSERS, 905 "NOT DETERMINISTIC": lambda self: self.expression( 906 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 907 ), 908 "OPTIONS": lambda self: self._parse_with_property(), 909 } 910 911 CONSTRAINT_PARSERS = { 912 **parser.Parser.CONSTRAINT_PARSERS, 913 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 914 } 915 916 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 917 RANGE_PARSERS.pop(TokenType.OVERLAPS) 918 919 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 920 921 STATEMENT_PARSERS = { 922 **parser.Parser.STATEMENT_PARSERS, 923 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 924 TokenType.END: lambda self: self._parse_as_command(self._prev), 925 TokenType.FOR: lambda self: self._parse_for_in(), 926 TokenType.EXPORT: lambda self: self._parse_export_data(), 927 TokenType.DECLARE: lambda self: self._parse_declare(), 928 } 929 930 BRACKET_OFFSETS = { 931 "OFFSET": (0, False), 932 "ORDINAL": (1, False), 933 "SAFE_OFFSET": (0, True), 934 "SAFE_ORDINAL": (1, True), 935 } 936 937 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 938 index = self._index 939 this = self._parse_range() 940 self._match_text_seq("DO") 941 if self._match(TokenType.COMMAND): 942 self._retreat(index) 943 return self._parse_as_command(self._prev) 944 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 945 946 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 947 this = super()._parse_table_part(schema=schema) or self._parse_number() 948 949 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 950 if isinstance(this, exp.Identifier): 951 table_name = this.name 952 while self._match(TokenType.DASH, advance=False) and self._next: 953 start = self._curr 954 while self._is_connected() and not self._match_set( 955 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 956 ): 957 self._advance() 958 959 if start == self._curr: 960 break 961 962 table_name += self._find_sql(start, self._prev) 963 964 this = exp.Identifier( 965 this=table_name, quoted=this.args.get("quoted") 966 
).update_positions(this) 967 elif isinstance(this, exp.Literal): 968 table_name = this.name 969 970 if self._is_connected() and self._parse_var(any_token=True): 971 table_name += self._prev.text 972 973 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 974 975 return this 976 977 def _parse_table_parts( 978 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 979 ) -> exp.Table: 980 table = super()._parse_table_parts( 981 schema=schema, is_db_reference=is_db_reference, wildcard=True 982 ) 983 984 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 985 if not table.catalog: 986 if table.db: 987 previous_db = table.args["db"] 988 parts = table.db.split(".") 989 if len(parts) == 2 and not table.args["db"].quoted: 990 table.set( 991 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 992 ) 993 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 994 else: 995 previous_this = table.this 996 parts = table.name.split(".") 997 if len(parts) == 2 and not table.this.quoted: 998 table.set( 999 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 1000 ) 1001 table.set( 1002 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 1003 ) 1004 1005 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 1006 alias = table.this 1007 catalog, db, this, *rest = ( 1008 exp.to_identifier(p, quoted=True) 1009 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 1010 ) 1011 1012 for part in (catalog, db, this): 1013 if part: 1014 part.update_positions(table.this) 1015 1016 if rest and this: 1017 this = exp.Dot.build([this, *rest]) # type: ignore 1018 1019 table = exp.Table( 1020 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1021 ) 1022 table.meta["quoted_table"] = True 1023 else: 1024 alias = None 1025 1026 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1027 # dataset, so if the project identifier is omitted we need to fix the ast so that 1028 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1029 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1030 # views, because it would seem like the "catalog" part is set, when it'd actually 1031 # be the region/dataset. Merging the two identifiers into a single one is done to 1032 # avoid producing a 4-part Table reference, which would cause issues in the schema 1033 # module, when there are 3-part table names mixed with information schema views. 1034 # 1035 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1036 table_parts = table.parts 1037 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1038 # We need to alias the table here to avoid breaking existing qualified columns. 1039 # This is expected to be safe, because if there's an actual alias coming up in 1040 # the token stream, it will overwrite this one. If there isn't one, we are only 1041 # exposing the name that can be used to reference the view explicitly (a no-op). 
1042 exp.alias_( 1043 table, 1044 t.cast(exp.Identifier, alias or table_parts[-1]), 1045 table=True, 1046 copy=False, 1047 ) 1048 1049 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1050 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1051 line=table_parts[-2].meta.get("line"), 1052 col=table_parts[-1].meta.get("col"), 1053 start=table_parts[-2].meta.get("start"), 1054 end=table_parts[-1].meta.get("end"), 1055 ) 1056 table.set("this", new_this) 1057 table.set("db", seq_get(table_parts, -3)) 1058 table.set("catalog", seq_get(table_parts, -4)) 1059 1060 return table 1061 1062 def _parse_column(self) -> t.Optional[exp.Expression]: 1063 column = super()._parse_column() 1064 if isinstance(column, exp.Column): 1065 parts = column.parts 1066 if any("." in p.name for p in parts): 1067 catalog, db, table, this, *rest = ( 1068 exp.to_identifier(p, quoted=True) 1069 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1070 ) 1071 1072 if rest and this: 1073 this = exp.Dot.build([this, *rest]) # type: ignore 1074 1075 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1076 column.meta["quoted_column"] = True 1077 1078 return column 1079 1080 @t.overload 1081 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1082 1083 @t.overload 1084 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 1085 1086 def _parse_json_object(self, agg=False): 1087 json_object = super()._parse_json_object() 1088 array_kv_pair = seq_get(json_object.expressions, 0) 1089 1090 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1091 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1092 if ( 1093 array_kv_pair 1094 and isinstance(array_kv_pair.this, exp.Array) 1095 and isinstance(array_kv_pair.expression, exp.Array) 1096 ): 1097 keys = array_kv_pair.this.expressions 1098 values = array_kv_pair.expression.expressions 1099 1100 json_object.set( 1101 "expressions", 1102 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1103 ) 1104 1105 return json_object 1106 1107 def _parse_bracket( 1108 self, this: t.Optional[exp.Expression] = None 1109 ) -> t.Optional[exp.Expression]: 1110 bracket = super()._parse_bracket(this) 1111 1112 if this is bracket: 1113 return bracket 1114 1115 if isinstance(bracket, exp.Bracket): 1116 for expression in bracket.expressions: 1117 name = expression.name.upper() 1118 1119 if name not in self.BRACKET_OFFSETS: 1120 break 1121 1122 offset, safe = self.BRACKET_OFFSETS[name] 1123 bracket.set("offset", offset) 1124 bracket.set("safe", safe) 1125 expression.replace(expression.expressions[0]) 1126 1127 return bracket 1128 1129 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1130 unnest = super()._parse_unnest(with_alias=with_alias) 1131 1132 if not unnest: 1133 return None 1134 1135 unnest_expr = seq_get(unnest.expressions, 0) 1136 if unnest_expr: 1137 from sqlglot.optimizer.annotate_types import annotate_types 1138 1139 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1140 1141 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1142 # in contrast to other dialects such as DuckDB which flattens only the array by default 1143 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1144 array_elem.is_type(exp.DataType.Type.STRUCT) 1145 for array_elem in unnest_expr._type.expressions 1146 ): 1147 
unnest.set("explode_array", True) 1148 1149 return unnest 1150 1151 def _parse_make_interval(self) -> exp.MakeInterval: 1152 expr = exp.MakeInterval() 1153 1154 for arg_key in expr.arg_types: 1155 value = self._parse_lambda() 1156 1157 if not value: 1158 break 1159 1160 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1161 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1162 if isinstance(value, exp.Kwarg): 1163 arg_key = value.this.name 1164 1165 expr.set(arg_key, value) 1166 1167 self._match(TokenType.COMMA) 1168 1169 return expr 1170 1171 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1172 self._match_text_seq("MODEL") 1173 this = self._parse_table() 1174 1175 self._match(TokenType.COMMA) 1176 self._match_text_seq("TABLE") 1177 1178 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1179 expression = ( 1180 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1181 ) 1182 1183 self._match(TokenType.COMMA) 1184 1185 return self.expression( 1186 expr_type, 1187 this=this, 1188 expression=expression, 1189 params_struct=self._parse_bitwise(), 1190 **kwargs, 1191 ) 1192 1193 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1194 # Check if this is ML.TRANSLATE by looking at previous tokens 1195 token = seq_get(self._tokens, self._index - 4) 1196 if token and token.text.upper() == "ML": 1197 return self._parse_ml(exp.MLTranslate) 1198 1199 return exp.Translate.from_arg_list(self._parse_function_args()) 1200 1201 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1202 self._match(TokenType.TABLE) 1203 this = self._parse_table() 1204 1205 expr = self.expression(exp.FeaturesAtTime, this=this) 1206 1207 while self._match(TokenType.COMMA): 1208 arg = self._parse_lambda() 1209 1210 # Get the LHS of the Kwarg and set the arg to that value, e.g 1211 # "num_rows => 1" sets the expr's `num_rows` arg 1212 if arg: 1213 expr.set(arg.this.name, arg) 1214 1215 return expr 1216 1217 def _parse_vector_search(self) -> exp.VectorSearch: 1218 self._match(TokenType.TABLE) 1219 base_table = self._parse_table() 1220 1221 self._match(TokenType.COMMA) 1222 1223 column_to_search = self._parse_bitwise() 1224 self._match(TokenType.COMMA) 1225 1226 self._match(TokenType.TABLE) 1227 query_table = self._parse_table() 1228 1229 expr = self.expression( 1230 exp.VectorSearch, 1231 this=base_table, 1232 column_to_search=column_to_search, 1233 query_table=query_table, 1234 ) 1235 1236 while self._match(TokenType.COMMA): 1237 # query_column_to_search can be named argument or positional 1238 if self._match(TokenType.STRING, advance=False): 1239 query_column = self._parse_string() 1240 expr.set("query_column_to_search", query_column) 1241 else: 1242 arg = self._parse_lambda() 1243 if arg: 1244 expr.set(arg.this.name, arg) 1245 1246 return expr 1247 1248 def _parse_export_data(self) -> exp.Export: 1249 self._match_text_seq("DATA") 1250 1251 return self.expression( 1252 exp.Export, 1253 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1254 options=self._parse_properties(), 1255 this=self._match_text_seq("AS") and self._parse_select(), 1256 ) 1257 1258 class Generator(generator.Generator): 1259 INTERVAL_ALLOWS_PLURAL_FORM = False 1260 JOIN_HINTS = False 1261 QUERY_HINTS = False 1262 TABLE_HINTS = False 1263 LIMIT_FETCH = "LIMIT" 1264 RENAME_TABLE_WITH_DB = False 1265 NVL2_SUPPORTED = False 1266 UNNEST_WITH_ORDINALITY = False 1267 
COLLATE_IS_FUNC = True 1268 LIMIT_ONLY_LITERALS = True 1269 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1270 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1271 JSON_KEY_VALUE_PAIR_SEP = "," 1272 NULL_ORDERING_SUPPORTED = False 1273 IGNORE_NULLS_IN_FUNC = True 1274 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1275 CAN_IMPLEMENT_ARRAY_ANY = True 1276 SUPPORTS_TO_NUMBER = False 1277 NAMED_PLACEHOLDER_TOKEN = "@" 1278 HEX_FUNC = "TO_HEX" 1279 WITH_PROPERTIES_PREFIX = "OPTIONS" 1280 SUPPORTS_EXPLODING_PROJECTIONS = False 1281 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1282 SUPPORTS_UNIX_SECONDS = True 1283 1284 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1285 1286 TS_OR_DS_TYPES = ( 1287 exp.TsOrDsToDatetime, 1288 exp.TsOrDsToTimestamp, 1289 exp.TsOrDsToTime, 1290 exp.TsOrDsToDate, 1291 ) 1292 1293 TRANSFORMS = { 1294 **generator.Generator.TRANSFORMS, 1295 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1296 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1297 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1298 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1299 exp.Array: inline_array_unless_query, 1300 exp.ArrayContains: _array_contains_sql, 1301 exp.ArrayFilter: filter_array_using_unnest, 1302 exp.ArrayRemove: filter_array_using_unnest, 1303 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1304 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1305 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1306 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1307 exp.ByteLength: rename_func("BYTE_LENGTH"), 1308 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1309 exp.CollateProperty: lambda self, e: ( 1310 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1311 if e.args.get("default") 1312 else f"COLLATE {self.sql(e, 'this')}" 1313 ), 1314 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1315 exp.CountIf: rename_func("COUNTIF"), 1316 exp.Create: _create_sql, 1317 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1318 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1319 exp.DateDiff: lambda self, e: self.func( 1320 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1321 ), 1322 exp.DateFromParts: rename_func("DATE"), 1323 exp.DateStrToDate: datestrtodate_sql, 1324 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1325 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1326 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1327 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1328 exp.FromTimeZone: lambda self, e: self.func( 1329 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1330 ), 1331 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1332 exp.GroupConcat: lambda self, e: groupconcat_sql( 1333 self, e, func_name="STRING_AGG", within_group=False 1334 ), 1335 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1336 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1337 exp.If: if_sql(false_value="NULL"), 1338 exp.ILike: no_ilike_sql, 1339 exp.IntDiv: rename_func("DIV"), 1340 exp.Int64: rename_func("INT64"), 1341 exp.JSONBool: rename_func("BOOL"), 1342 exp.JSONExtract: _json_extract_sql, 1343 exp.JSONExtractArray: _json_extract_sql, 1344 exp.JSONExtractScalar: _json_extract_sql, 1345 exp.JSONFormat: lambda self, e: self.func( 1346 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1347 e.this, 1348 e.args.get("options"), 1349 ), 1350 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1351 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1352 
exp.Levenshtein: _levenshtein_sql, 1353 exp.Max: max_or_greatest, 1354 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1355 exp.MD5Digest: rename_func("MD5"), 1356 exp.Min: min_or_least, 1357 exp.Normalize: lambda self, e: self.func( 1358 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1359 e.this, 1360 e.args.get("form"), 1361 ), 1362 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1363 exp.RegexpExtract: lambda self, e: self.func( 1364 "REGEXP_EXTRACT", 1365 e.this, 1366 e.expression, 1367 e.args.get("position"), 1368 e.args.get("occurrence"), 1369 ), 1370 exp.RegexpExtractAll: lambda self, e: self.func( 1371 "REGEXP_EXTRACT_ALL", e.this, e.expression 1372 ), 1373 exp.RegexpReplace: regexp_replace_sql, 1374 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1375 exp.ReturnsProperty: _returnsproperty_sql, 1376 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1377 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1378 exp.ParseDatetime: lambda self, e: self.func( 1379 "PARSE_DATETIME", self.format_time(e), e.this 1380 ), 1381 exp.Select: transforms.preprocess( 1382 [ 1383 transforms.explode_projection_to_unnest(), 1384 transforms.unqualify_unnest, 1385 transforms.eliminate_distinct_on, 1386 _alias_ordered_group, 1387 transforms.eliminate_semi_and_anti_joins, 1388 ] 1389 ), 1390 exp.SHA: rename_func("SHA1"), 1391 exp.SHA2: sha256_sql, 1392 exp.StabilityProperty: lambda self, e: ( 1393 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1394 ), 1395 exp.String: rename_func("STRING"), 1396 exp.StrPosition: lambda self, e: ( 1397 strposition_sql( 1398 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1399 ) 1400 ), 1401 exp.StrToDate: _str_to_datetime_sql, 1402 exp.StrToTime: _str_to_datetime_sql, 1403 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1404 exp.TimeFromParts: rename_func("TIME"), 1405 exp.TimestampFromParts: rename_func("DATETIME"), 1406 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1407 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1408 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1409 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1410 exp.TimeStrToTime: timestrtotime_sql, 1411 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1412 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1413 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1414 exp.TsOrDsToTime: rename_func("TIME"), 1415 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1416 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1417 exp.Unhex: rename_func("FROM_HEX"), 1418 exp.UnixDate: rename_func("UNIX_DATE"), 1419 exp.UnixToTime: _unix_to_time_sql, 1420 exp.Uuid: lambda *_: "GENERATE_UUID()", 1421 exp.Values: _derived_table_values_to_unnest, 1422 exp.VariancePop: rename_func("VAR_POP"), 1423 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1424 } 1425 1426 SUPPORTED_JSON_PATH_PARTS = { 1427 exp.JSONPathKey, 1428 exp.JSONPathRoot, 1429 exp.JSONPathSubscript, 1430 } 1431 1432 TYPE_MAPPING = { 1433 **generator.Generator.TYPE_MAPPING, 1434 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1435 exp.DataType.Type.BIGINT: "INT64", 1436 exp.DataType.Type.BINARY: "BYTES", 1437 exp.DataType.Type.BLOB: "BYTES", 1438 exp.DataType.Type.BOOLEAN: "BOOL", 1439 exp.DataType.Type.CHAR: "STRING", 1440 exp.DataType.Type.DECIMAL: "NUMERIC", 1441 exp.DataType.Type.DOUBLE: "FLOAT64", 1442 exp.DataType.Type.FLOAT: "FLOAT64", 1443 exp.DataType.Type.INT: "INT64", 1444 exp.DataType.Type.NCHAR: 
"STRING", 1445 exp.DataType.Type.NVARCHAR: "STRING", 1446 exp.DataType.Type.SMALLINT: "INT64", 1447 exp.DataType.Type.TEXT: "STRING", 1448 exp.DataType.Type.TIMESTAMP: "DATETIME", 1449 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1450 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1451 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1452 exp.DataType.Type.TINYINT: "INT64", 1453 exp.DataType.Type.ROWVERSION: "BYTES", 1454 exp.DataType.Type.UUID: "STRING", 1455 exp.DataType.Type.VARBINARY: "BYTES", 1456 exp.DataType.Type.VARCHAR: "STRING", 1457 exp.DataType.Type.VARIANT: "ANY TYPE", 1458 } 1459 1460 PROPERTIES_LOCATION = { 1461 **generator.Generator.PROPERTIES_LOCATION, 1462 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1463 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1464 } 1465 1466 # WINDOW comes after QUALIFY 1467 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1468 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1469 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1470 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1471 } 1472 1473 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1474 RESERVED_KEYWORDS = { 1475 "all", 1476 "and", 1477 "any", 1478 "array", 1479 "as", 1480 "asc", 1481 "assert_rows_modified", 1482 "at", 1483 "between", 1484 "by", 1485 "case", 1486 "cast", 1487 "collate", 1488 "contains", 1489 "create", 1490 "cross", 1491 "cube", 1492 "current", 1493 "default", 1494 "define", 1495 "desc", 1496 "distinct", 1497 "else", 1498 "end", 1499 "enum", 1500 "escape", 1501 "except", 1502 "exclude", 1503 "exists", 1504 "extract", 1505 "false", 1506 "fetch", 1507 "following", 1508 "for", 1509 "from", 1510 "full", 1511 "group", 1512 "grouping", 1513 "groups", 1514 "hash", 1515 "having", 1516 "if", 1517 "ignore", 1518 "in", 1519 "inner", 1520 "intersect", 1521 "interval", 1522 "into", 1523 "is", 1524 "join", 1525 "lateral", 1526 "left", 1527 "like", 1528 "limit", 1529 "lookup", 1530 "merge", 1531 "natural", 1532 "new", 1533 "no", 1534 "not", 1535 "null", 1536 "nulls", 1537 "of", 1538 "on", 1539 "or", 1540 "order", 1541 "outer", 1542 "over", 1543 "partition", 1544 "preceding", 1545 "proto", 1546 "qualify", 1547 "range", 1548 "recursive", 1549 "respect", 1550 "right", 1551 "rollup", 1552 "rows", 1553 "select", 1554 "set", 1555 "some", 1556 "struct", 1557 "tablesample", 1558 "then", 1559 "to", 1560 "treat", 1561 "true", 1562 "unbounded", 1563 "union", 1564 "unnest", 1565 "using", 1566 "when", 1567 "where", 1568 "window", 1569 "with", 1570 "within", 1571 } 1572 1573 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1574 unit = expression.unit 1575 unit_sql = unit.name if unit.is_string else self.sql(unit) 1576 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1577 1578 def mod_sql(self, expression: exp.Mod) -> str: 1579 this = expression.this 1580 expr = expression.expression 1581 return self.func( 1582 "MOD", 1583 this.unnest() if isinstance(this, exp.Paren) else this, 1584 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1585 ) 1586 1587 def column_parts(self, expression: exp.Column) -> str: 1588 if expression.meta.get("quoted_column"): 1589 # If a column reference is of the form `dataset.table`.name, we need 1590 # to preserve the quoted table path, otherwise the reference breaks 1591 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1592 table_path = 
self.sql(exp.Identifier(this=table_parts, quoted=True)) 1593 return f"{table_path}.{self.sql(expression, 'this')}" 1594 1595 return super().column_parts(expression) 1596 1597 def table_parts(self, expression: exp.Table) -> str: 1598 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1599 # we need to make sure the correct quoting is used in each case. 1600 # 1601 # For example, if there is a CTE x that clashes with a schema name, then the former will 1602 # return the table y in that schema, whereas the latter will return the CTE's y column: 1603 # 1604 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1605 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1606 if expression.meta.get("quoted_table"): 1607 table_parts = ".".join(p.name for p in expression.parts) 1608 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1609 1610 return super().table_parts(expression) 1611 1612 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1613 this = expression.this 1614 if isinstance(this, exp.TsOrDsToDatetime): 1615 func_name = "FORMAT_DATETIME" 1616 elif isinstance(this, exp.TsOrDsToTimestamp): 1617 func_name = "FORMAT_TIMESTAMP" 1618 elif isinstance(this, exp.TsOrDsToTime): 1619 func_name = "FORMAT_TIME" 1620 else: 1621 func_name = "FORMAT_DATE" 1622 1623 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1624 return self.func( 1625 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1626 ) 1627 1628 def eq_sql(self, expression: exp.EQ) -> str: 1629 # Operands of = cannot be NULL in BigQuery 1630 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1631 if not isinstance(expression.parent, exp.Update): 1632 return "NULL" 1633 1634 return self.binary(expression, "=") 1635 1636 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1637 parent = expression.parent 1638 1639 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1640 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1641 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1642 return self.func( 1643 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1644 ) 1645 1646 return super().attimezone_sql(expression) 1647 1648 def trycast_sql(self, expression: exp.TryCast) -> str: 1649 return self.cast_sql(expression, safe_prefix="SAFE_") 1650 1651 def bracket_sql(self, expression: exp.Bracket) -> str: 1652 this = expression.this 1653 expressions = expression.expressions 1654 1655 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1656 arg = expressions[0] 1657 if arg.type is None: 1658 from sqlglot.optimizer.annotate_types import annotate_types 1659 1660 arg = annotate_types(arg, dialect=self.dialect) 1661 1662 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1663 # BQ doesn't support bracket syntax with string values for structs 1664 return f"{self.sql(this)}.{arg.name}" 1665 1666 expressions_sql = self.expressions(expression, flat=True) 1667 offset = expression.args.get("offset") 1668 1669 if offset == 0: 1670 expressions_sql = f"OFFSET({expressions_sql})" 1671 elif offset == 1: 1672 expressions_sql = f"ORDINAL({expressions_sql})" 1673 elif offset is not None: 1674 self.unsupported(f"Unsupported array offset: {offset}") 1675 1676 if expression.args.get("safe"): 1677 expressions_sql = f"SAFE_{expressions_sql}" 1678 1679 return f"{self.sql(this)}[{expressions_sql}]" 1680 1681 def in_unnest_op(self, expression: exp.Unnest) -> str: 1682 return self.sql(expression) 1683 1684 def version_sql(self, expression: exp.Version) -> str: 1685 if expression.name == "TIMESTAMP": 1686 expression.set("this", "SYSTEM_TIME") 1687 return super().version_sql(expression) 1688 1689 def contains_sql(self, expression: exp.Contains) -> str: 1690 this = expression.this 1691 expr = expression.expression 1692 1693 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1694 this = this.this 1695 expr = expr.this 1696 1697 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1698 1699 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1700 this = expression.this 1701 1702 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1703 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1704 # because they aren't literals and so the above syntax is invalid BigQuery. 1705 if isinstance(this, exp.Array): 1706 elem = seq_get(this.expressions, 0) 1707 if not (elem and elem.find(exp.Query)): 1708 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1709 1710 return super().cast_sql(expression, safe_prefix=safe_prefix) 1711 1712 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1713 variables = self.expressions(expression, "this") 1714 default = self.sql(expression, "default") 1715 default = f" DEFAULT {default}" if default else "" 1716 kind = self.sql(expression, "kind") 1717 kind = f" {kind}" if kind else "" 1718 1719 return f"{variables}{kind}{default}"
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function.

Possible values: True, False, None (two arguments are not supported by LOG).
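To see LOG_BASE_FIRST in action, here is a minimal sketch that transpiles a two-argument LOG call from BigQuery (value first) to Hive (base first); the commented result is the output I'd expect under these settings, not a captured test run:

import sqlglot

# BigQuery reads LOG(value, base); Hive reads LOG(base, value).
# LOG_BASE_FIRST = False tells the parser that the base is the second
# argument, so the arguments can be reordered on generation.
print(sqlglot.transpile("SELECT LOG(100, 10)", read="bigquery", write="hive")[0])
# expected: SELECT LOG(10, 100)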
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).

For example:

WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1

In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
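One way to observe this alias forwarding is to run sqlglot's qualification pass over the example; a hedged sketch, where the qualified output shape is indicative only:

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1
"""

# With FORCE_EARLY_ALIAS_REF_EXPANSION, BigQuery expands my_id back to id
# in GROUP BY / HAVING before columns are qualified, so those two clauses
# should end up referencing data.id rather than the alias.
print(qualify(sqlglot.parse_one(sql, read="bigquery"), dialect="bigquery").sql("bigquery"))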
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
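Since JSON_VALUE and JSON_EXTRACT_SCALAR both parse to the same node (see the Parser's FUNCTIONS mapping below), the preserved name is what lets each spelling survive a BigQuery-to-BigQuery roundtrip; a sketch whose printed lines are expectations rather than asserted results:

import sqlglot

for fn in ("JSON_VALUE", "JSON_EXTRACT_SCALAR"):
    # Both spellings map to the same AST node; the original name is kept
    # in the node's metadata, so each should roundtrip unchanged.
    print(sqlglot.transpile(f"SELECT {fn}(j, '$.name')", read="bigquery", write="bigquery")[0])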
Whether hex strings such as x'CC' evaluate to integer or binary/blob type.
Specifies the strategy according to which identifiers should be normalized.
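This strategy interacts with the normalize_identifier heuristics shown below: unqualified names are lowercased, while qualified table names, which BigQuery treats as case-sensitive, keep their casing. A small sketch using the optimizer's identifier-normalization pass (the commented output is an expectation, not a quoted result):

import sqlglot
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

e = sqlglot.parse_one("SELECT MyCol FROM my_project.MyDataset.MyTable", read="bigquery")

# Columns are case-insensitive and get lowercased; qualified table parts
# are case-sensitive in BigQuery and should keep their casing.
print(normalize_identifiers(e, dialect="bigquery").sql("bigquery"))
# expected: SELECT mycol FROM my_project.MyDataset.MyTable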
Determines how function names are going to be normalized.

Possible values:
- "upper" or True: Convert names to uppercase.
- "lower": Convert names to lowercase.
- False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.

Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
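Both mappings are plain dicts on the dialect, so they can be inspected directly; a trivial sketch:

from sqlglot.dialects.dialect import Dialect

bq = Dialect.get_or_raise("bigquery")

# %E6S (seconds with six fractional digits) maps to Python strftime's
# %S.%f, and the HH24 format element maps to %H, per the mappings above.
print(bq.TIME_MAPPING["%E6S"])    # %S.%f
print(bq.FORMAT_MAPPING["HH24"])  # %H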
Columns that are auto-generated by the engine corresponding to this dialect. For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
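One visible consequence: when generating BigQuery from a dialect where a bare UNION already implies DISTINCT, the specifier has to be spelled out. A hedged sketch (the commented result is an expectation, not a captured run):

import sqlglot

# DuckDB's bare UNION means UNION DISTINCT; BigQuery requires an explicit
# DISTINCT or ALL, so the keyword should appear in the generated SQL.
print(sqlglot.transpile("SELECT 1 UNION SELECT 2", read="duckdb", write="bigquery")[0])
# expected: SELECT 1 UNION DISTINCT SELECT 2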
672 def normalize_identifier(self, expression: E) -> E: 673 if ( 674 isinstance(expression, exp.Identifier) 675 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 676 ): 677 parent = expression.parent 678 while isinstance(parent, exp.Dot): 679 parent = parent.parent 680 681 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 682 # by default. The following check uses a heuristic to detect tables based on whether 683 # they are qualified. This should generally be correct, because tables in BigQuery 684 # must be qualified with at least a dataset, unless @@dataset_id is set. 685 case_sensitive = ( 686 isinstance(parent, exp.UserDefinedFunction) 687 or ( 688 isinstance(parent, exp.Table) 689 and parent.db 690 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 691 ) 692 or expression.meta.get("is_table") 693 ) 694 if not case_sensitive: 695 expression.set("this", expression.this.lower()) 696 697 return t.cast(E, expression) 698 699 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, identifiers may always be case-sensitive on Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
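A small sketch using sqlglot's normalize_identifiers helper (which applies this method across an expression tree) to show the behaviors described above:

    import sqlglot
    from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

    # Postgres lowercases unquoted identifiers; quoted ones are left intact.
    expr = sqlglot.parse_one('SELECT FoO, "Quoted" FROM BaR', read="postgres")
    print(normalize_identifiers(expr, dialect="postgres").sql("postgres"))

    # Snowflake uppercases unquoted identifiers instead.
    expr = sqlglot.parse_one("SELECT FoO FROM BaR", read="snowflake")
    print(normalize_identifiers(expr, dialect="snowflake").sql("snowflake"))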
Mapping of an escaped sequence (e.g. the two-character sequence \n) to its unescaped version (the corresponding literal character, such as a newline).
701 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 702 VAR_TOKENS = { 703 TokenType.DASH, 704 TokenType.VAR, 705 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
707 class Tokenizer(tokens.Tokenizer): 708 QUOTES = ["'", '"', '"""', "'''"] 709 COMMENTS = ["--", "#", ("/*", "*/")] 710 IDENTIFIERS = ["`"] 711 STRING_ESCAPES = ["\\"] 712 713 HEX_STRINGS = [("0x", ""), ("0X", "")] 714 715 BYTE_STRINGS = [ 716 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 717 ] 718 719 RAW_STRINGS = [ 720 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 721 ] 722 723 NESTED_COMMENTS = False 724 725 KEYWORDS = { 726 **tokens.Tokenizer.KEYWORDS, 727 "ANY TYPE": TokenType.VARIANT, 728 "BEGIN": TokenType.COMMAND, 729 "BEGIN TRANSACTION": TokenType.BEGIN, 730 "BYTEINT": TokenType.INT, 731 "BYTES": TokenType.BINARY, 732 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 733 "DATETIME": TokenType.TIMESTAMP, 734 "DECLARE": TokenType.DECLARE, 735 "ELSEIF": TokenType.COMMAND, 736 "EXCEPTION": TokenType.COMMAND, 737 "EXPORT": TokenType.EXPORT, 738 "FLOAT64": TokenType.DOUBLE, 739 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 740 "LOOP": TokenType.COMMAND, 741 "MODEL": TokenType.MODEL, 742 "NOT DETERMINISTIC": TokenType.VOLATILE, 743 "RECORD": TokenType.STRUCT, 744 "REPEAT": TokenType.COMMAND, 745 "TIMESTAMP": TokenType.TIMESTAMPTZ, 746 "WHILE": TokenType.COMMAND, 747 } 748 KEYWORDS.pop("DIV") 749 KEYWORDS.pop("VALUES") 750 KEYWORDS.pop("/*+")
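An illustrative snippet showing a few of the BigQuery-specific literal forms this Tokenizer recognizes (triple-quoted strings, r'' raw strings, b'' byte strings, and backtick identifiers):

    import sqlglot

    sql = "SELECT r'\\d+', b'abc', '''multi''' FROM `my_dataset.tbl`"
    for token in sqlglot.tokenize(sql, read="bigquery"):
        print(token.token_type, repr(token.text))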
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
752 class Parser(parser.Parser): 753 PREFIXED_PIVOT_COLUMNS = True 754 LOG_DEFAULTS_TO_LN = True 755 SUPPORTS_IMPLICIT_UNNEST = True 756 JOINS_HAVE_EQUAL_PRECEDENCE = True 757 758 # BigQuery does not allow ASC/DESC to be used as an identifier, allows GRANT as an identifier 759 ID_VAR_TOKENS = { 760 *parser.Parser.ID_VAR_TOKENS, 761 TokenType.GRANT, 762 } - {TokenType.ASC, TokenType.DESC} 763 764 ALIAS_TOKENS = { 765 *parser.Parser.ALIAS_TOKENS, 766 TokenType.GRANT, 767 } - {TokenType.ASC, TokenType.DESC} 768 769 TABLE_ALIAS_TOKENS = { 770 *parser.Parser.TABLE_ALIAS_TOKENS, 771 TokenType.GRANT, 772 } - {TokenType.ASC, TokenType.DESC} 773 774 COMMENT_TABLE_ALIAS_TOKENS = { 775 *parser.Parser.COMMENT_TABLE_ALIAS_TOKENS, 776 TokenType.GRANT, 777 } - {TokenType.ASC, TokenType.DESC} 778 779 UPDATE_ALIAS_TOKENS = { 780 *parser.Parser.UPDATE_ALIAS_TOKENS, 781 TokenType.GRANT, 782 } - {TokenType.ASC, TokenType.DESC} 783 784 FUNCTIONS = { 785 **parser.Parser.FUNCTIONS, 786 "APPROX_TOP_COUNT": exp.ApproxTopK.from_arg_list, 787 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 788 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 789 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 790 "BIT_COUNT": exp.BitwiseCountAgg.from_arg_list, 791 "BOOL": exp.JSONBool.from_arg_list, 792 "CONTAINS_SUBSTR": _build_contains_substring, 793 "DATE": _build_date, 794 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 795 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 796 "DATE_TRUNC": lambda args: exp.DateTrunc( 797 unit=seq_get(args, 1), 798 this=seq_get(args, 0), 799 zone=seq_get(args, 2), 800 ), 801 "DATETIME": _build_datetime, 802 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 803 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 804 "DIV": binary_from_function(exp.IntDiv), 805 "EDIT_DISTANCE": _build_levenshtein, 806 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 807 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 808 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 809 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 810 "JSON_EXTRACT_STRING_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 811 "JSON_KEYS": exp.JSONKeysAtDepth.from_arg_list, 812 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 813 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 814 "JSON_STRIP_NULLS": _build_json_strip_nulls, 815 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 816 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 817 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 818 "MD5": exp.MD5Digest.from_arg_list, 819 "NORMALIZE_AND_CASEFOLD": lambda args: exp.Normalize( 820 this=seq_get(args, 0), form=seq_get(args, 1), is_casefold=True 821 ), 822 "OCTET_LENGTH": exp.ByteLength.from_arg_list, 823 "TO_HEX": _build_to_hex, 824 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 825 [seq_get(args, 1), seq_get(args, 0)] 826 ), 827 "PARSE_TIME": lambda args: build_formatted_time(exp.ParseTime, "bigquery")( 828 [seq_get(args, 1), seq_get(args, 0)] 829 ), 830 "PARSE_TIMESTAMP": _build_parse_timestamp, 831 "PARSE_DATETIME": lambda args: build_formatted_time(exp.ParseDatetime, "bigquery")( 832 [seq_get(args, 1), seq_get(args, 0)] 833 ), 834 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 835 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 836 "REGEXP_SUBSTR": 
_build_regexp_extract(exp.RegexpExtract), 837 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 838 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 839 ), 840 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 841 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 842 "SPLIT": lambda args: exp.Split( 843 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 844 this=seq_get(args, 0), 845 expression=seq_get(args, 1) or exp.Literal.string(","), 846 ), 847 "STRPOS": exp.StrPosition.from_arg_list, 848 "TIME": _build_time, 849 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 850 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 851 "TIMESTAMP": _build_timestamp, 852 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 853 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 854 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 855 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 856 ), 857 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 858 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 859 ), 860 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 861 "TO_JSON": lambda args: exp.JSONFormat( 862 this=seq_get(args, 0), options=seq_get(args, 1), to_json=True 863 ), 864 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 865 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 866 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 867 "FORMAT_TIME": _build_format_time(exp.TsOrDsToTime), 868 "FROM_HEX": exp.Unhex.from_arg_list, 869 "WEEK": lambda args: exp.WeekStart(this=exp.var(seq_get(args, 0))), 870 } 871 # Remove SEARCH to avoid parameter routing issues - let it fall back to Anonymous function 872 FUNCTIONS.pop("SEARCH") 873 874 FUNCTION_PARSERS = { 875 **parser.Parser.FUNCTION_PARSERS, 876 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 877 "JSON_ARRAY": lambda self: self.expression( 878 exp.JSONArray, expressions=self._parse_csv(self._parse_bitwise) 879 ), 880 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 881 "PREDICT": lambda self: self._parse_ml(exp.Predict), 882 "TRANSLATE": lambda self: self._parse_translate(), 883 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 884 "GENERATE_EMBEDDING": lambda self: self._parse_ml(exp.GenerateEmbedding), 885 "GENERATE_TEXT_EMBEDDING": lambda self: self._parse_ml( 886 exp.GenerateEmbedding, is_text=True 887 ), 888 "VECTOR_SEARCH": lambda self: self._parse_vector_search(), 889 "FORECAST": lambda self: self._parse_ml(exp.MLForecast), 890 } 891 FUNCTION_PARSERS.pop("TRIM") 892 893 NO_PAREN_FUNCTIONS = { 894 **parser.Parser.NO_PAREN_FUNCTIONS, 895 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 896 } 897 898 NESTED_TYPE_TOKENS = { 899 *parser.Parser.NESTED_TYPE_TOKENS, 900 TokenType.TABLE, 901 } 902 903 PROPERTY_PARSERS = { 904 **parser.Parser.PROPERTY_PARSERS, 905 "NOT DETERMINISTIC": lambda self: self.expression( 906 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 907 ), 908 "OPTIONS": lambda self: self._parse_with_property(), 909 } 910 911 CONSTRAINT_PARSERS = { 912 **parser.Parser.CONSTRAINT_PARSERS, 913 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 914 } 915 916 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 917 RANGE_PARSERS.pop(TokenType.OVERLAPS) 918 919 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, 
TokenType.R_PAREN} 920 921 STATEMENT_PARSERS = { 922 **parser.Parser.STATEMENT_PARSERS, 923 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 924 TokenType.END: lambda self: self._parse_as_command(self._prev), 925 TokenType.FOR: lambda self: self._parse_for_in(), 926 TokenType.EXPORT: lambda self: self._parse_export_data(), 927 TokenType.DECLARE: lambda self: self._parse_declare(), 928 } 929 930 BRACKET_OFFSETS = { 931 "OFFSET": (0, False), 932 "ORDINAL": (1, False), 933 "SAFE_OFFSET": (0, True), 934 "SAFE_ORDINAL": (1, True), 935 } 936 937 def _parse_for_in(self) -> t.Union[exp.ForIn, exp.Command]: 938 index = self._index 939 this = self._parse_range() 940 self._match_text_seq("DO") 941 if self._match(TokenType.COMMAND): 942 self._retreat(index) 943 return self._parse_as_command(self._prev) 944 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 945 946 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 947 this = super()._parse_table_part(schema=schema) or self._parse_number() 948 949 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 950 if isinstance(this, exp.Identifier): 951 table_name = this.name 952 while self._match(TokenType.DASH, advance=False) and self._next: 953 start = self._curr 954 while self._is_connected() and not self._match_set( 955 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 956 ): 957 self._advance() 958 959 if start == self._curr: 960 break 961 962 table_name += self._find_sql(start, self._prev) 963 964 this = exp.Identifier( 965 this=table_name, quoted=this.args.get("quoted") 966 ).update_positions(this) 967 elif isinstance(this, exp.Literal): 968 table_name = this.name 969 970 if self._is_connected() and self._parse_var(any_token=True): 971 table_name += self._prev.text 972 973 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 974 975 return this 976 977 def _parse_table_parts( 978 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 979 ) -> exp.Table: 980 table = super()._parse_table_parts( 981 schema=schema, is_db_reference=is_db_reference, wildcard=True 982 ) 983 984 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 985 if not table.catalog: 986 if table.db: 987 previous_db = table.args["db"] 988 parts = table.db.split(".") 989 if len(parts) == 2 and not table.args["db"].quoted: 990 table.set( 991 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 992 ) 993 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 994 else: 995 previous_this = table.this 996 parts = table.name.split(".") 997 if len(parts) == 2 and not table.this.quoted: 998 table.set( 999 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 1000 ) 1001 table.set( 1002 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 1003 ) 1004 1005 if isinstance(table.this, exp.Identifier) and any("." 
in p.name for p in table.parts): 1006 alias = table.this 1007 catalog, db, this, *rest = ( 1008 exp.to_identifier(p, quoted=True) 1009 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 1010 ) 1011 1012 for part in (catalog, db, this): 1013 if part: 1014 part.update_positions(table.this) 1015 1016 if rest and this: 1017 this = exp.Dot.build([this, *rest]) # type: ignore 1018 1019 table = exp.Table( 1020 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 1021 ) 1022 table.meta["quoted_table"] = True 1023 else: 1024 alias = None 1025 1026 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 1027 # dataset, so if the project identifier is omitted we need to fix the ast so that 1028 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 1029 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 1030 # views, because it would seem like the "catalog" part is set, when it'd actually 1031 # be the region/dataset. Merging the two identifiers into a single one is done to 1032 # avoid producing a 4-part Table reference, which would cause issues in the schema 1033 # module, when there are 3-part table names mixed with information schema views. 1034 # 1035 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 1036 table_parts = table.parts 1037 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 1038 # We need to alias the table here to avoid breaking existing qualified columns. 1039 # This is expected to be safe, because if there's an actual alias coming up in 1040 # the token stream, it will overwrite this one. If there isn't one, we are only 1041 # exposing the name that can be used to reference the view explicitly (a no-op). 1042 exp.alias_( 1043 table, 1044 t.cast(exp.Identifier, alias or table_parts[-1]), 1045 table=True, 1046 copy=False, 1047 ) 1048 1049 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 1050 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 1051 line=table_parts[-2].meta.get("line"), 1052 col=table_parts[-1].meta.get("col"), 1053 start=table_parts[-2].meta.get("start"), 1054 end=table_parts[-1].meta.get("end"), 1055 ) 1056 table.set("this", new_this) 1057 table.set("db", seq_get(table_parts, -3)) 1058 table.set("catalog", seq_get(table_parts, -4)) 1059 1060 return table 1061 1062 def _parse_column(self) -> t.Optional[exp.Expression]: 1063 column = super()._parse_column() 1064 if isinstance(column, exp.Column): 1065 parts = column.parts 1066 if any("." in p.name for p in parts): 1067 catalog, db, table, this, *rest = ( 1068 exp.to_identifier(p, quoted=True) 1069 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 1070 ) 1071 1072 if rest and this: 1073 this = exp.Dot.build([this, *rest]) # type: ignore 1074 1075 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 1076 column.meta["quoted_column"] = True 1077 1078 return column 1079 1080 @t.overload 1081 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 1082 1083 @t.overload 1084 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 
1085 1086 def _parse_json_object(self, agg=False): 1087 json_object = super()._parse_json_object() 1088 array_kv_pair = seq_get(json_object.expressions, 0) 1089 1090 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 1091 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 1092 if ( 1093 array_kv_pair 1094 and isinstance(array_kv_pair.this, exp.Array) 1095 and isinstance(array_kv_pair.expression, exp.Array) 1096 ): 1097 keys = array_kv_pair.this.expressions 1098 values = array_kv_pair.expression.expressions 1099 1100 json_object.set( 1101 "expressions", 1102 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 1103 ) 1104 1105 return json_object 1106 1107 def _parse_bracket( 1108 self, this: t.Optional[exp.Expression] = None 1109 ) -> t.Optional[exp.Expression]: 1110 bracket = super()._parse_bracket(this) 1111 1112 if this is bracket: 1113 return bracket 1114 1115 if isinstance(bracket, exp.Bracket): 1116 for expression in bracket.expressions: 1117 name = expression.name.upper() 1118 1119 if name not in self.BRACKET_OFFSETS: 1120 break 1121 1122 offset, safe = self.BRACKET_OFFSETS[name] 1123 bracket.set("offset", offset) 1124 bracket.set("safe", safe) 1125 expression.replace(expression.expressions[0]) 1126 1127 return bracket 1128 1129 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 1130 unnest = super()._parse_unnest(with_alias=with_alias) 1131 1132 if not unnest: 1133 return None 1134 1135 unnest_expr = seq_get(unnest.expressions, 0) 1136 if unnest_expr: 1137 from sqlglot.optimizer.annotate_types import annotate_types 1138 1139 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 1140 1141 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 1142 # in contrast to other dialects such as DuckDB which flattens only the array by default 1143 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 1144 array_elem.is_type(exp.DataType.Type.STRUCT) 1145 for array_elem in unnest_expr._type.expressions 1146 ): 1147 unnest.set("explode_array", True) 1148 1149 return unnest 1150 1151 def _parse_make_interval(self) -> exp.MakeInterval: 1152 expr = exp.MakeInterval() 1153 1154 for arg_key in expr.arg_types: 1155 value = self._parse_lambda() 1156 1157 if not value: 1158 break 1159 1160 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 1161 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 1162 if isinstance(value, exp.Kwarg): 1163 arg_key = value.this.name 1164 1165 expr.set(arg_key, value) 1166 1167 self._match(TokenType.COMMA) 1168 1169 return expr 1170 1171 def _parse_ml(self, expr_type: t.Type[E], **kwargs) -> E: 1172 self._match_text_seq("MODEL") 1173 this = self._parse_table() 1174 1175 self._match(TokenType.COMMA) 1176 self._match_text_seq("TABLE") 1177 1178 # Certain functions like ML.FORECAST require a STRUCT argument but not a TABLE/SELECT one 1179 expression = ( 1180 self._parse_table() if not self._match(TokenType.STRUCT, advance=False) else None 1181 ) 1182 1183 self._match(TokenType.COMMA) 1184 1185 return self.expression( 1186 expr_type, 1187 this=this, 1188 expression=expression, 1189 params_struct=self._parse_bitwise(), 1190 **kwargs, 1191 ) 1192 1193 def _parse_translate(self) -> exp.Translate | exp.MLTranslate: 1194 # Check if this is ML.TRANSLATE by looking at previous tokens 1195 token = seq_get(self._tokens, self._index - 4) 1196 if token and 
token.text.upper() == "ML": 1197 return self._parse_ml(exp.MLTranslate) 1198 1199 return exp.Translate.from_arg_list(self._parse_function_args()) 1200 1201 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 1202 self._match(TokenType.TABLE) 1203 this = self._parse_table() 1204 1205 expr = self.expression(exp.FeaturesAtTime, this=this) 1206 1207 while self._match(TokenType.COMMA): 1208 arg = self._parse_lambda() 1209 1210 # Get the LHS of the Kwarg and set the arg to that value, e.g 1211 # "num_rows => 1" sets the expr's `num_rows` arg 1212 if arg: 1213 expr.set(arg.this.name, arg) 1214 1215 return expr 1216 1217 def _parse_vector_search(self) -> exp.VectorSearch: 1218 self._match(TokenType.TABLE) 1219 base_table = self._parse_table() 1220 1221 self._match(TokenType.COMMA) 1222 1223 column_to_search = self._parse_bitwise() 1224 self._match(TokenType.COMMA) 1225 1226 self._match(TokenType.TABLE) 1227 query_table = self._parse_table() 1228 1229 expr = self.expression( 1230 exp.VectorSearch, 1231 this=base_table, 1232 column_to_search=column_to_search, 1233 query_table=query_table, 1234 ) 1235 1236 while self._match(TokenType.COMMA): 1237 # query_column_to_search can be named argument or positional 1238 if self._match(TokenType.STRING, advance=False): 1239 query_column = self._parse_string() 1240 expr.set("query_column_to_search", query_column) 1241 else: 1242 arg = self._parse_lambda() 1243 if arg: 1244 expr.set(arg.this.name, arg) 1245 1246 return expr 1247 1248 def _parse_export_data(self) -> exp.Export: 1249 self._match_text_seq("DATA") 1250 1251 return self.expression( 1252 exp.Export, 1253 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 1254 options=self._parse_properties(), 1255 this=self._match_text_seq("AS") and self._parse_select(), 1256 )
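As a brief sketch, one of the BigQuery-specific behaviors implemented above can be exercised through parse_one: dash-separated ("dashed") table names are reassembled by _parse_table_part, so the reference should roundtrip (exact quoting may vary by sqlglot version):

    import sqlglot

    # Dashes in project names are valid BigQuery table references.
    expression = sqlglot.parse_one("SELECT * FROM my-project.dataset.tbl", read="bigquery")
    print(expression.sql("bigquery"))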
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
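These options are forwarded from the top-level parsing entry points; a hedged sketch:

    import sqlglot
    from sqlglot.errors import ErrorLevel

    # With ErrorLevel.IGNORE the parser returns whatever tree it could build
    # instead of raising on the trailing "+".
    expression = sqlglot.parse_one("SELECT 1 +", read="bigquery", error_level=ErrorLevel.IGNORE)
    print(repr(expression))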
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ALTER_TABLE_PARTITIONS
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- ADD_JOIN_ON_TRUE
- SUPPORTS_OMITTED_INTERVAL_SPAN_UNIT
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
1258 class Generator(generator.Generator): 1259 INTERVAL_ALLOWS_PLURAL_FORM = False 1260 JOIN_HINTS = False 1261 QUERY_HINTS = False 1262 TABLE_HINTS = False 1263 LIMIT_FETCH = "LIMIT" 1264 RENAME_TABLE_WITH_DB = False 1265 NVL2_SUPPORTED = False 1266 UNNEST_WITH_ORDINALITY = False 1267 COLLATE_IS_FUNC = True 1268 LIMIT_ONLY_LITERALS = True 1269 SUPPORTS_TABLE_ALIAS_COLUMNS = False 1270 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 1271 JSON_KEY_VALUE_PAIR_SEP = "," 1272 NULL_ORDERING_SUPPORTED = False 1273 IGNORE_NULLS_IN_FUNC = True 1274 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 1275 CAN_IMPLEMENT_ARRAY_ANY = True 1276 SUPPORTS_TO_NUMBER = False 1277 NAMED_PLACEHOLDER_TOKEN = "@" 1278 HEX_FUNC = "TO_HEX" 1279 WITH_PROPERTIES_PREFIX = "OPTIONS" 1280 SUPPORTS_EXPLODING_PROJECTIONS = False 1281 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 1282 SUPPORTS_UNIX_SECONDS = True 1283 1284 SAFE_JSON_PATH_KEY_RE = re.compile(r"^[_\-a-zA-Z][\-\w]*$") 1285 1286 TS_OR_DS_TYPES = ( 1287 exp.TsOrDsToDatetime, 1288 exp.TsOrDsToTimestamp, 1289 exp.TsOrDsToTime, 1290 exp.TsOrDsToDate, 1291 ) 1292 1293 TRANSFORMS = { 1294 **generator.Generator.TRANSFORMS, 1295 exp.ApproxTopK: rename_func("APPROX_TOP_COUNT"), 1296 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 1297 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 1298 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 1299 exp.Array: inline_array_unless_query, 1300 exp.ArrayContains: _array_contains_sql, 1301 exp.ArrayFilter: filter_array_using_unnest, 1302 exp.ArrayRemove: filter_array_using_unnest, 1303 exp.BitwiseAndAgg: rename_func("BIT_AND"), 1304 exp.BitwiseOrAgg: rename_func("BIT_OR"), 1305 exp.BitwiseXorAgg: rename_func("BIT_XOR"), 1306 exp.BitwiseCountAgg: rename_func("BIT_COUNT"), 1307 exp.ByteLength: rename_func("BYTE_LENGTH"), 1308 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 1309 exp.CollateProperty: lambda self, e: ( 1310 f"DEFAULT COLLATE {self.sql(e, 'this')}" 1311 if e.args.get("default") 1312 else f"COLLATE {self.sql(e, 'this')}" 1313 ), 1314 exp.Commit: lambda *_: "COMMIT TRANSACTION", 1315 exp.CountIf: rename_func("COUNTIF"), 1316 exp.Create: _create_sql, 1317 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 1318 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 1319 exp.DateDiff: lambda self, e: self.func( 1320 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 1321 ), 1322 exp.DateFromParts: rename_func("DATE"), 1323 exp.DateStrToDate: datestrtodate_sql, 1324 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 1325 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 1326 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 1327 exp.DateFromUnixDate: rename_func("DATE_FROM_UNIX_DATE"), 1328 exp.FromTimeZone: lambda self, e: self.func( 1329 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 1330 ), 1331 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 1332 exp.GroupConcat: lambda self, e: groupconcat_sql( 1333 self, e, func_name="STRING_AGG", within_group=False 1334 ), 1335 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 1336 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 1337 exp.If: if_sql(false_value="NULL"), 1338 exp.ILike: no_ilike_sql, 1339 exp.IntDiv: rename_func("DIV"), 1340 exp.Int64: rename_func("INT64"), 1341 exp.JSONBool: rename_func("BOOL"), 1342 exp.JSONExtract: _json_extract_sql, 1343 exp.JSONExtractArray: _json_extract_sql, 1344 exp.JSONExtractScalar: 
_json_extract_sql, 1345 exp.JSONFormat: lambda self, e: self.func( 1346 "TO_JSON" if e.args.get("to_json") else "TO_JSON_STRING", 1347 e.this, 1348 e.args.get("options"), 1349 ), 1350 exp.JSONKeysAtDepth: rename_func("JSON_KEYS"), 1351 exp.JSONValueArray: rename_func("JSON_VALUE_ARRAY"), 1352 exp.Levenshtein: _levenshtein_sql, 1353 exp.Max: max_or_greatest, 1354 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 1355 exp.MD5Digest: rename_func("MD5"), 1356 exp.Min: min_or_least, 1357 exp.Normalize: lambda self, e: self.func( 1358 "NORMALIZE_AND_CASEFOLD" if e.args.get("is_casefold") else "NORMALIZE", 1359 e.this, 1360 e.args.get("form"), 1361 ), 1362 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1363 exp.RegexpExtract: lambda self, e: self.func( 1364 "REGEXP_EXTRACT", 1365 e.this, 1366 e.expression, 1367 e.args.get("position"), 1368 e.args.get("occurrence"), 1369 ), 1370 exp.RegexpExtractAll: lambda self, e: self.func( 1371 "REGEXP_EXTRACT_ALL", e.this, e.expression 1372 ), 1373 exp.RegexpReplace: regexp_replace_sql, 1374 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1375 exp.ReturnsProperty: _returnsproperty_sql, 1376 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1377 exp.ParseTime: lambda self, e: self.func("PARSE_TIME", self.format_time(e), e.this), 1378 exp.ParseDatetime: lambda self, e: self.func( 1379 "PARSE_DATETIME", self.format_time(e), e.this 1380 ), 1381 exp.Select: transforms.preprocess( 1382 [ 1383 transforms.explode_projection_to_unnest(), 1384 transforms.unqualify_unnest, 1385 transforms.eliminate_distinct_on, 1386 _alias_ordered_group, 1387 transforms.eliminate_semi_and_anti_joins, 1388 ] 1389 ), 1390 exp.SHA: rename_func("SHA1"), 1391 exp.SHA2: sha256_sql, 1392 exp.StabilityProperty: lambda self, e: ( 1393 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1394 ), 1395 exp.String: rename_func("STRING"), 1396 exp.StrPosition: lambda self, e: ( 1397 strposition_sql( 1398 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1399 ) 1400 ), 1401 exp.StrToDate: _str_to_datetime_sql, 1402 exp.StrToTime: _str_to_datetime_sql, 1403 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1404 exp.TimeFromParts: rename_func("TIME"), 1405 exp.TimestampFromParts: rename_func("DATETIME"), 1406 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1407 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1408 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1409 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1410 exp.TimeStrToTime: timestrtotime_sql, 1411 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1412 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1413 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1414 exp.TsOrDsToTime: rename_func("TIME"), 1415 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1416 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1417 exp.Unhex: rename_func("FROM_HEX"), 1418 exp.UnixDate: rename_func("UNIX_DATE"), 1419 exp.UnixToTime: _unix_to_time_sql, 1420 exp.Uuid: lambda *_: "GENERATE_UUID()", 1421 exp.Values: _derived_table_values_to_unnest, 1422 exp.VariancePop: rename_func("VAR_POP"), 1423 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1424 } 1425 1426 SUPPORTED_JSON_PATH_PARTS = { 1427 exp.JSONPathKey, 1428 exp.JSONPathRoot, 1429 exp.JSONPathSubscript, 1430 } 1431 1432 TYPE_MAPPING = { 1433 **generator.Generator.TYPE_MAPPING, 1434 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1435 exp.DataType.Type.BIGINT: "INT64", 1436 exp.DataType.Type.BINARY: "BYTES", 1437 
exp.DataType.Type.BLOB: "BYTES", 1438 exp.DataType.Type.BOOLEAN: "BOOL", 1439 exp.DataType.Type.CHAR: "STRING", 1440 exp.DataType.Type.DECIMAL: "NUMERIC", 1441 exp.DataType.Type.DOUBLE: "FLOAT64", 1442 exp.DataType.Type.FLOAT: "FLOAT64", 1443 exp.DataType.Type.INT: "INT64", 1444 exp.DataType.Type.NCHAR: "STRING", 1445 exp.DataType.Type.NVARCHAR: "STRING", 1446 exp.DataType.Type.SMALLINT: "INT64", 1447 exp.DataType.Type.TEXT: "STRING", 1448 exp.DataType.Type.TIMESTAMP: "DATETIME", 1449 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1450 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1451 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1452 exp.DataType.Type.TINYINT: "INT64", 1453 exp.DataType.Type.ROWVERSION: "BYTES", 1454 exp.DataType.Type.UUID: "STRING", 1455 exp.DataType.Type.VARBINARY: "BYTES", 1456 exp.DataType.Type.VARCHAR: "STRING", 1457 exp.DataType.Type.VARIANT: "ANY TYPE", 1458 } 1459 1460 PROPERTIES_LOCATION = { 1461 **generator.Generator.PROPERTIES_LOCATION, 1462 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1463 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1464 } 1465 1466 # WINDOW comes after QUALIFY 1467 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1468 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1469 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1470 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1471 } 1472 1473 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1474 RESERVED_KEYWORDS = { 1475 "all", 1476 "and", 1477 "any", 1478 "array", 1479 "as", 1480 "asc", 1481 "assert_rows_modified", 1482 "at", 1483 "between", 1484 "by", 1485 "case", 1486 "cast", 1487 "collate", 1488 "contains", 1489 "create", 1490 "cross", 1491 "cube", 1492 "current", 1493 "default", 1494 "define", 1495 "desc", 1496 "distinct", 1497 "else", 1498 "end", 1499 "enum", 1500 "escape", 1501 "except", 1502 "exclude", 1503 "exists", 1504 "extract", 1505 "false", 1506 "fetch", 1507 "following", 1508 "for", 1509 "from", 1510 "full", 1511 "group", 1512 "grouping", 1513 "groups", 1514 "hash", 1515 "having", 1516 "if", 1517 "ignore", 1518 "in", 1519 "inner", 1520 "intersect", 1521 "interval", 1522 "into", 1523 "is", 1524 "join", 1525 "lateral", 1526 "left", 1527 "like", 1528 "limit", 1529 "lookup", 1530 "merge", 1531 "natural", 1532 "new", 1533 "no", 1534 "not", 1535 "null", 1536 "nulls", 1537 "of", 1538 "on", 1539 "or", 1540 "order", 1541 "outer", 1542 "over", 1543 "partition", 1544 "preceding", 1545 "proto", 1546 "qualify", 1547 "range", 1548 "recursive", 1549 "respect", 1550 "right", 1551 "rollup", 1552 "rows", 1553 "select", 1554 "set", 1555 "some", 1556 "struct", 1557 "tablesample", 1558 "then", 1559 "to", 1560 "treat", 1561 "true", 1562 "unbounded", 1563 "union", 1564 "unnest", 1565 "using", 1566 "when", 1567 "where", 1568 "window", 1569 "with", 1570 "within", 1571 } 1572 1573 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1574 unit = expression.unit 1575 unit_sql = unit.name if unit.is_string else self.sql(unit) 1576 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1577 1578 def mod_sql(self, expression: exp.Mod) -> str: 1579 this = expression.this 1580 expr = expression.expression 1581 return self.func( 1582 "MOD", 1583 this.unnest() if isinstance(this, exp.Paren) else this, 1584 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1585 ) 1586 1587 def column_parts(self, expression: 
exp.Column) -> str: 1588 if expression.meta.get("quoted_column"): 1589 # If a column reference is of the form `dataset.table`.name, we need 1590 # to preserve the quoted table path, otherwise the reference breaks 1591 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1592 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1593 return f"{table_path}.{self.sql(expression, 'this')}" 1594 1595 return super().column_parts(expression) 1596 1597 def table_parts(self, expression: exp.Table) -> str: 1598 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1599 # we need to make sure the correct quoting is used in each case. 1600 # 1601 # For example, if there is a CTE x that clashes with a schema name, then the former will 1602 # return the table y in that schema, whereas the latter will return the CTE's y column: 1603 # 1604 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1605 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1606 if expression.meta.get("quoted_table"): 1607 table_parts = ".".join(p.name for p in expression.parts) 1608 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1609 1610 return super().table_parts(expression) 1611 1612 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1613 this = expression.this 1614 if isinstance(this, exp.TsOrDsToDatetime): 1615 func_name = "FORMAT_DATETIME" 1616 elif isinstance(this, exp.TsOrDsToTimestamp): 1617 func_name = "FORMAT_TIMESTAMP" 1618 elif isinstance(this, exp.TsOrDsToTime): 1619 func_name = "FORMAT_TIME" 1620 else: 1621 func_name = "FORMAT_DATE" 1622 1623 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1624 return self.func( 1625 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1626 ) 1627 1628 def eq_sql(self, expression: exp.EQ) -> str: 1629 # Operands of = cannot be NULL in BigQuery 1630 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1631 if not isinstance(expression.parent, exp.Update): 1632 return "NULL" 1633 1634 return self.binary(expression, "=") 1635 1636 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1637 parent = expression.parent 1638 1639 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1640 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1641 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1642 return self.func( 1643 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1644 ) 1645 1646 return super().attimezone_sql(expression) 1647 1648 def trycast_sql(self, expression: exp.TryCast) -> str: 1649 return self.cast_sql(expression, safe_prefix="SAFE_") 1650 1651 def bracket_sql(self, expression: exp.Bracket) -> str: 1652 this = expression.this 1653 expressions = expression.expressions 1654 1655 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1656 arg = expressions[0] 1657 if arg.type is None: 1658 from sqlglot.optimizer.annotate_types import annotate_types 1659 1660 arg = annotate_types(arg, dialect=self.dialect) 1661 1662 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1663 # BQ doesn't support bracket syntax with string values for structs 1664 return f"{self.sql(this)}.{arg.name}" 1665 1666 expressions_sql = self.expressions(expression, flat=True) 1667 offset = expression.args.get("offset") 1668 1669 if offset == 0: 1670 expressions_sql = f"OFFSET({expressions_sql})" 1671 elif offset == 1: 1672 expressions_sql = f"ORDINAL({expressions_sql})" 1673 elif offset is not None: 1674 self.unsupported(f"Unsupported array offset: {offset}") 1675 1676 if expression.args.get("safe"): 1677 expressions_sql = f"SAFE_{expressions_sql}" 1678 1679 return f"{self.sql(this)}[{expressions_sql}]" 1680 1681 def in_unnest_op(self, expression: exp.Unnest) -> str: 1682 return self.sql(expression) 1683 1684 def version_sql(self, expression: exp.Version) -> str: 1685 if expression.name == "TIMESTAMP": 1686 expression.set("this", "SYSTEM_TIME") 1687 return super().version_sql(expression) 1688 1689 def contains_sql(self, expression: exp.Contains) -> str: 1690 this = expression.this 1691 expr = expression.expression 1692 1693 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1694 this = this.this 1695 expr = expr.this 1696 1697 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope")) 1698 1699 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1700 this = expression.this 1701 1702 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1703 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1704 # because they aren't literals and so the above syntax is invalid BigQuery. 1705 if isinstance(this, exp.Array): 1706 elem = seq_get(this.expressions, 0) 1707 if not (elem and elem.find(exp.Query)): 1708 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1709 1710 return super().cast_sql(expression, safe_prefix=safe_prefix) 1711 1712 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1713 variables = self.expressions(expression, "this") 1714 default = self.sql(expression, "default") 1715 default = f" DEFAULT {default}" if default else "" 1716 kind = self.sql(expression, "kind") 1717 kind = f" {kind}" if kind else "" 1718 1719 return f"{variables}{kind}{default}"
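An illustrative transpilation into BigQuery that exercises the TYPE_MAPPING and TRANSFORMS tables above (exact output can vary between sqlglot versions):

    import sqlglot

    # VARCHAR -> STRING and DOUBLE -> FLOAT64 come from TYPE_MAPPING.
    print(sqlglot.transpile("SELECT CAST(a AS VARCHAR), CAST(b AS DOUBLE)", write="bigquery")[0])

    # exp.CountIf is renamed to COUNTIF by the corresponding TRANSFORMS entry.
    print(sqlglot.transpile("SELECT COUNT_IF(x > 0)", write="bigquery")[0])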
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): never quote, except in cases where it's mandatory by the dialect; True or 'always': always quote; 'safe': only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
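For illustration, these options are plain keyword arguments to generation, e.g. through Expression.sql:

    import sqlglot

    expression = sqlglot.parse_one("SELECT a, b FROM t WHERE a > 1")
    # Pretty-print with all identifiers quoted (backticks in BigQuery).
    print(expression.sql(dialect="bigquery", pretty=True, identify=True))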
1587 def column_parts(self, expression: exp.Column) -> str: 1588 if expression.meta.get("quoted_column"): 1589 # If a column reference is of the form `dataset.table`.name, we need 1590 # to preserve the quoted table path, otherwise the reference breaks 1591 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1592 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1593 return f"{table_path}.{self.sql(expression, 'this')}" 1594 1595 return super().column_parts(expression)
1597 def table_parts(self, expression: exp.Table) -> str: 1598 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1599 # we need to make sure the correct quoting is used in each case. 1600 # 1601 # For example, if there is a CTE x that clashes with a schema name, then the former will 1602 # return the table y in that schema, whereas the latter will return the CTE's y column: 1603 # 1604 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1605 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1606 if expression.meta.get("quoted_table"): 1607 table_parts = ".".join(p.name for p in expression.parts) 1608 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1609 1610 return super().table_parts(expression)
1612 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1613 this = expression.this 1614 if isinstance(this, exp.TsOrDsToDatetime): 1615 func_name = "FORMAT_DATETIME" 1616 elif isinstance(this, exp.TsOrDsToTimestamp): 1617 func_name = "FORMAT_TIMESTAMP" 1618 elif isinstance(this, exp.TsOrDsToTime): 1619 func_name = "FORMAT_TIME" 1620 else: 1621 func_name = "FORMAT_DATE" 1622 1623 time_expr = this if isinstance(this, self.TS_OR_DS_TYPES) else expression 1624 return self.func( 1625 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1626 )
1628 def eq_sql(self, expression: exp.EQ) -> str: 1629 # Operands of = cannot be NULL in BigQuery 1630 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1631 if not isinstance(expression.parent, exp.Update): 1632 return "NULL" 1633 1634 return self.binary(expression, "=")
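A short sketch of the rule above: an = against NULL folds to NULL, except inside an UPDATE's SET clause:

    import sqlglot

    print(sqlglot.parse_one("SELECT a = NULL").sql("bigquery"))
    # -> SELECT NULL

    print(sqlglot.parse_one("UPDATE t SET a = NULL WHERE id = 1").sql("bigquery"))
    # -> UPDATE t SET a = NULL WHERE id = 1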
1636 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1637 parent = expression.parent 1638 1639 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1640 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1641 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1642 return self.func( 1643 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1644 ) 1645 1646 return super().attimezone_sql(expression)
1651 def bracket_sql(self, expression: exp.Bracket) -> str: 1652 this = expression.this 1653 expressions = expression.expressions 1654 1655 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1656 arg = expressions[0] 1657 if arg.type is None: 1658 from sqlglot.optimizer.annotate_types import annotate_types 1659 1660 arg = annotate_types(arg, dialect=self.dialect) 1661 1662 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1663 # BQ doesn't support bracket syntax with string values for structs 1664 return f"{self.sql(this)}.{arg.name}" 1665 1666 expressions_sql = self.expressions(expression, flat=True) 1667 offset = expression.args.get("offset") 1668 1669 if offset == 0: 1670 expressions_sql = f"OFFSET({expressions_sql})" 1671 elif offset == 1: 1672 expressions_sql = f"ORDINAL({expressions_sql})" 1673 elif offset is not None: 1674 self.unsupported(f"Unsupported array offset: {offset}") 1675 1676 if expression.args.get("safe"): 1677 expressions_sql = f"SAFE_{expressions_sql}" 1678 1679 return f"{self.sql(this)}[{expressions_sql}]"
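A quick roundtrip sketch of the OFFSET/ORDINAL/SAFE_* handling above:

    import sqlglot

    for sql in (
        "SELECT arr[OFFSET(0)]",
        "SELECT arr[ORDINAL(1)]",
        "SELECT arr[SAFE_OFFSET(2)]",
    ):
        # The wrapper is parsed into the Bracket's offset/safe args and re-emitted here.
        print(sqlglot.parse_one(sql, read="bigquery").sql("bigquery"))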
1689 def contains_sql(self, expression: exp.Contains) -> str: 1690 this = expression.this 1691 expr = expression.expression 1692 1693 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1694 this = this.this 1695 expr = expr.this 1696 1697 return self.func("CONTAINS_SUBSTR", this, expr, expression.args.get("json_scope"))
1699 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1700 this = expression.this 1701 1702 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1703 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1704 # because they aren't literals and so the above syntax is invalid BigQuery. 1705 if isinstance(this, exp.Array): 1706 elem = seq_get(this.expressions, 0) 1707 if not (elem and elem.find(exp.Query)): 1708 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1709 1710 return super().cast_sql(expression, safe_prefix=safe_prefix)
1712 def declareitem_sql(self, expression: exp.DeclareItem) -> str: 1713 variables = self.expressions(expression, "this") 1714 default = self.sql(expression, "default") 1715 default = f" DEFAULT {default}" if default else "" 1716 kind = self.sql(expression, "kind") 1717 kind = f" {kind}" if kind else "" 1718 1719 return f"{variables}{kind}{default}"
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- UNICODE_SUBSTITUTE
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- MATCH_AGAINST_TABLE_PREFIX
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- mltranslate_sql
- mlforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- install_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql
- directorystage_sql