sqlglot.dialects.duckdb
from __future__ import annotations

from decimal import Decimal
from itertools import groupby
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms

from sqlglot.dialects.dialect import (
    DATETIME_DELTA,
    Dialect,
    JSON_EXTRACT_TYPE,
    NormalizationStrategy,
    approx_count_distinct_sql,
    array_append_sql,
    array_compact_sql,
    array_concat_sql,
    arrow_json_extract_sql,
    binary_from_function,
    build_default_decimal_type,
    build_formatted_time,
    build_regexp_extract,
    count_if_to_sum,
    date_delta_to_binary_interval_op,
    date_trunc_to_time,
    datestrtodate_sql,
    encode_decode_sql,
    explode_to_unnest_sql,
    getbit_sql,
    groupconcat_sql,
    inline_array_unless_query,
    months_between_sql,
    no_datetime_sql,
    no_comment_column_constraint_sql,
    no_make_interval_sql,
    no_time_sql,
    no_timestamp_sql,
    pivot_column_names,
    regexp_replace_global_modifier,
    rename_func,
    remove_from_array_using_filter,
    sha2_digest_sql,
    sha256_sql,
    strposition_sql,
    str_to_time_sql,
    timestrtotime_sql,
    unit_to_str,
)
from sqlglot.generator import unsupported_args
from sqlglot.helper import is_date_unit, seq_get
from sqlglot.tokens import TokenType
from sqlglot.parser import binary_range_parser
from sqlglot.typing.duckdb import EXPRESSION_METADATA

# Regex to detect time zones in timestamps of the form [+|-]TT[:tt]
# The pattern matches timezone offsets that appear after the time portion
TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?")

# Characters that must be escaped when building regex expressions in INITCAP
REGEX_ESCAPE_REPLACEMENTS = {
    "\\": "\\\\",
    "-": r"\-",
    "^": r"\^",
    "[": r"\[",
    "]": r"\]",
}

# Used in RANDSTR transpilation
RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
RANDSTR_SEED = 123456

# Whitespace control characters that DuckDB must process with `CHR({val})` calls
WS_CONTROL_CHARS_TO_DUCK = {
    "\u000b": 11,
    "\u001c": 28,
    "\u001d": 29,
    "\u001e": 30,
    "\u001f": 31,
}

# Days of week to ISO 8601 day-of-week numbers
# ISO 8601 standard: Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6, Sunday=7
WEEK_START_DAY_TO_DOW = {
    "MONDAY": 1,
    "TUESDAY": 2,
    "WEDNESDAY": 3,
    "THURSDAY": 4,
    "FRIDAY": 5,
    "SATURDAY": 6,
    "SUNDAY": 7,
}

MAX_BIT_POSITION = exp.Literal.number(32768)

# SEQ function constants
_SEQ_BASE = "(ROW_NUMBER() OVER (ORDER BY 1) - 1)"
_SEQ_RESTRICTED = (exp.Where, exp.Having, exp.AggFunc, exp.Order, exp.Select)


def _apply_base64_alphabet_replacements(
    result: exp.Expression,
    alphabet: t.Optional[exp.Expression],
    reverse: bool = False,
) -> exp.Expression:
    """
    Apply base64 alphabet character replacements.

    Base64 alphabet can be 1-3 chars: 1st = index 62 ('+'), 2nd = index 63 ('/'), 3rd = padding ('=').
    zip truncates to the shorter string, so a 1-char alphabet only replaces '+', a 2-char one replaces '+/', etc.

    Args:
        result: The expression to apply replacements to
        alphabet: Custom alphabet literal (expected chars for +/=)
        reverse: If False, replace default with custom (encode)
                 If True, replace custom with default (decode)
    """
    if isinstance(alphabet, exp.Literal) and alphabet.is_string:
        for default_char, new_char in zip("+/=", alphabet.this):
            if new_char != default_char:
                find, replace = (new_char, default_char) if reverse else (default_char, new_char)
                result = exp.Replace(
                    this=result,
                    expression=exp.Literal.string(find),
                    replacement=exp.Literal.string(replace),
                )
    return result


def _base64_decode_sql(self: DuckDB.Generator, expression: exp.Expression, to_string: bool) -> str:
    """
    Transpile Snowflake BASE64_DECODE_STRING/BINARY to DuckDB.

    DuckDB uses FROM_BASE64() which returns BLOB. For string output, wrap with DECODE().
    Custom alphabets require REPLACE() calls to convert to standard base64.
    """
    input_expr = expression.this
    alphabet = expression.args.get("alphabet")

    # Handle custom alphabet by replacing non-standard chars with standard ones
    input_expr = _apply_base64_alphabet_replacements(input_expr, alphabet, reverse=True)

    # FROM_BASE64 returns BLOB
    input_expr = exp.FromBase64(this=input_expr)

    if to_string:
        input_expr = exp.Decode(this=input_expr)

    return self.sql(input_expr)
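# Illustrative usage (not part of this module; the exact output string is an
# assumption and may vary across sqlglot versions): the transform above should
# turn Snowflake's BASE64_DECODE_STRING into FROM_BASE64 wrapped in DECODE.
#
#   import sqlglot
#   sqlglot.transpile(
#       "SELECT BASE64_DECODE_STRING(b64)", read="snowflake", write="duckdb"
#   )[0]
#   # -> roughly: SELECT DECODE(FROM_BASE64(b64))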


def _last_day_sql(self: DuckDB.Generator, expression: exp.LastDay) -> str:
    """
    DuckDB's LAST_DAY only supports finding the last day of a month.
    For other date parts (year, quarter, week), we need to implement equivalent logic.
    """
    date_expr = expression.this
    unit = expression.text("unit")

    if not unit or unit.upper() == "MONTH":
        # Default behavior - use DuckDB's native LAST_DAY
        return self.func("LAST_DAY", date_expr)

    if unit.upper() == "YEAR":
        # Last day of year: December 31st of the same year
        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
        make_date_expr = exp.func(
            "MAKE_DATE", year_expr, exp.Literal.number(12), exp.Literal.number(31)
        )
        return self.sql(make_date_expr)

    if unit.upper() == "QUARTER":
        # Last day of quarter
        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
        quarter_expr = exp.func("EXTRACT", "QUARTER", date_expr)

        # Calculate last month of quarter: quarter * 3. Quarter can be 1 to 4
        last_month_expr = exp.Mul(this=quarter_expr, expression=exp.Literal.number(3))
        first_day_last_month_expr = exp.func(
            "MAKE_DATE", year_expr, last_month_expr, exp.Literal.number(1)
        )

        # Last day of the last month of the quarter
        last_day_expr = exp.func("LAST_DAY", first_day_last_month_expr)
        return self.sql(last_day_expr)

    if unit.upper() == "WEEK":
        # DuckDB DAYOFWEEK: Sunday=0, Monday=1, ..., Saturday=6
        dow = exp.func("EXTRACT", "DAYOFWEEK", date_expr)
        # Days to the last day of week: (7 - dayofweek) % 7, assuming the last day of week is Sunday (Snowflake)
        # Wrap in parentheses to ensure correct precedence
        days_to_sunday_expr = exp.Mod(
            this=exp.Paren(this=exp.Sub(this=exp.Literal.number(7), expression=dow)),
            expression=exp.Literal.number(7),
        )
        interval_expr = exp.Interval(this=days_to_sunday_expr, unit=exp.var("DAY"))
        add_expr = exp.Add(this=date_expr, expression=interval_expr)
        cast_expr = exp.cast(add_expr, exp.DataType.Type.DATE)
        return self.sql(cast_expr)

    self.unsupported(f"Unsupported date part '{unit}' in LAST_DAY function")
    return self.function_fallback_sql(expression)

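# Illustrative usage (not part of this module; output is approximate): a YEAR
# date part routes through the MAKE_DATE branch of _last_day_sql above.
#
#   import sqlglot
#   sqlglot.transpile("SELECT LAST_DAY(d, 'year')", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT MAKE_DATE(EXTRACT(YEAR FROM d), 12, 31)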

def _is_nanosecond_unit(unit: t.Optional[exp.Expression]) -> bool:
    return isinstance(unit, (exp.Var, exp.Literal)) and unit.name.upper() == "NANOSECOND"


def _handle_nanosecond_diff(
    self: DuckDB.Generator,
    end_time: exp.Expression,
    start_time: exp.Expression,
) -> str:
    """Generate NANOSECOND diff using EPOCH_NS since DATE_DIFF doesn't support it."""
    end_ns = exp.cast(end_time, exp.DataType.Type.TIMESTAMP_NS)
    start_ns = exp.cast(start_time, exp.DataType.Type.TIMESTAMP_NS)

    # Build expression tree: EPOCH_NS(end) - EPOCH_NS(start)
    return self.sql(
        exp.Sub(this=exp.func("EPOCH_NS", end_ns), expression=exp.func("EPOCH_NS", start_ns))
    )


def _to_boolean_sql(self: DuckDB.Generator, expression: exp.ToBoolean) -> str:
    """
    Transpile TO_BOOLEAN and TRY_TO_BOOLEAN functions from Snowflake to DuckDB equivalent.

    DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off'.
    We need to handle the 'on'/'off' cases explicitly.

    For TO_BOOLEAN (safe=False): NaN and INF values cause errors. We use DuckDB's native ERROR()
    function to replicate this behavior with a clear error message.

    For TRY_TO_BOOLEAN (safe=True): Use DuckDB's TRY_CAST for conversion, which returns NULL
    for invalid inputs instead of throwing errors.
    """
    arg = expression.this
    is_safe = expression.args.get("safe", False)

    base_case_expr = (
        exp.case()
        .when(
            # Handle 'on' -> TRUE (case insensitive)
            exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("ON")),
            exp.true(),
        )
        .when(
            # Handle 'off' -> FALSE (case insensitive)
            exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR)).eq(exp.Literal.string("OFF")),
            exp.false(),
        )
    )

    if is_safe:
        # TRY_TO_BOOLEAN: handle 'on'/'off' and use TRY_CAST for everything else
        case_expr = base_case_expr.else_(exp.func("TRY_CAST", arg, exp.DataType.build("BOOLEAN")))
    else:
        # TO_BOOLEAN: handle NaN/INF errors, 'on'/'off', and use regular CAST
        cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))

        # Check for NaN and INF values
        nan_inf_check = exp.Or(
            this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
        )

        case_expr = base_case_expr.when(
            nan_inf_check,
            exp.func(
                "ERROR",
                exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
            ),
        ).else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))

    return self.sql(case_expr)

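# Illustrative usage (not part of this module; output is approximate): TRY_TO_BOOLEAN
# takes the TRY_CAST branch of _to_boolean_sql above, while TO_BOOLEAN adds the
# NaN/INF ERROR() branch.
#
#   import sqlglot
#   sqlglot.transpile("SELECT TRY_TO_BOOLEAN(col)", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT CASE WHEN UPPER(CAST(col AS TEXT)) = 'ON' THEN TRUE
#   #             WHEN UPPER(CAST(col AS TEXT)) = 'OFF' THEN FALSE
#   #             ELSE TRY_CAST(col AS BOOLEAN) END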
# BigQuery -> DuckDB conversion for the DATE function
def _date_sql(self: DuckDB.Generator, expression: exp.Date) -> str:
    this = expression.this
    zone = self.sql(expression, "zone")

    if zone:
        # BigQuery considers "this" at UTC, converts it to the specified
        # time zone and then keeps only the DATE part
        # To mimic that, we:
        # (1) Cast to TIMESTAMP to remove DuckDB's local tz
        # (2) Apply consecutive AtTimeZone calls for UTC -> zone conversion
        this = exp.cast(this, exp.DataType.Type.TIMESTAMP)
        at_utc = exp.AtTimeZone(this=this, zone=exp.Literal.string("UTC"))
        this = exp.AtTimeZone(this=at_utc, zone=zone)

    return self.sql(exp.cast(expression=this, to=exp.DataType.Type.DATE))


# BigQuery -> DuckDB conversion for the TIME_DIFF function
def _timediff_sql(self: DuckDB.Generator, expression: exp.TimeDiff) -> str:
    unit = expression.unit

    if _is_nanosecond_unit(unit):
        return _handle_nanosecond_diff(self, expression.expression, expression.this)

    this = exp.cast(expression.this, exp.DataType.Type.TIME)
    expr = exp.cast(expression.expression, exp.DataType.Type.TIME)

    # Although the 2 dialects share similar signatures, BQ seems to inverse
    # the sign of the result so the start/end time operands are flipped
    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)


def _date_delta_to_binary_interval_op(
    cast: bool = True,
) -> t.Callable[[DuckDB.Generator, DATETIME_DELTA], str]:
    """
    DuckDB override to handle:
    1. NANOSECOND operations (DuckDB doesn't support INTERVAL ... NANOSECOND)
    2. Float/decimal interval values (DuckDB INTERVAL requires integers)
    """
    base_impl = date_delta_to_binary_interval_op(cast=cast)

    def _duckdb_date_delta_sql(self: DuckDB.Generator, expression: DATETIME_DELTA) -> str:
        unit = expression.unit
        interval_value = expression.expression

        # Handle NANOSECOND unit (DuckDB doesn't support INTERVAL ... NANOSECOND)
        if _is_nanosecond_unit(unit):
            if isinstance(interval_value, exp.Interval):
                interval_value = interval_value.this

            timestamp_ns = exp.cast(expression.this, exp.DataType.Type.TIMESTAMP_NS)

            return self.sql(
                exp.func(
                    "MAKE_TIMESTAMP_NS",
                    exp.Add(this=exp.func("EPOCH_NS", timestamp_ns), expression=interval_value),
                )
            )

        # Handle float/decimal interval values, as DuckDB INTERVAL requires integer expressions
        if not interval_value or isinstance(interval_value, exp.Interval):
            return base_impl(self, expression)

        if interval_value.is_type(*exp.DataType.REAL_TYPES):
            expression.set("expression", exp.cast(exp.func("ROUND", interval_value), "INT"))

        return base_impl(self, expression)

    return _duckdb_date_delta_sql

def _array_insert_sql(self: DuckDB.Generator, expression: exp.ArrayInsert) -> str:
    """
    Transpile ARRAY_INSERT to DuckDB using LIST_CONCAT and slicing.

    Handles:
    - 0-based and 1-based indexing (normalizes to 0-based for calculations)
    - Negative position conversion (requires array length)
    - NULL propagation (source dialects return NULL, DuckDB creates single-element array)
    - Assumes position is within bounds per user constraint

    Note: All dialects that support ARRAY_INSERT (Snowflake, Spark, Databricks) have
    ARRAY_FUNCS_PROPAGATES_NULLS=True, so we always assume source propagates NULLs.

    Args:
        expression: The ArrayInsert expression to transpile.

    Returns:
        SQL string implementing ARRAY_INSERT behavior.
    """
    this = expression.this
    position = expression.args.get("position")
    element = expression.expression
    element_array = exp.Array(expressions=[element])
    index_offset = expression.args.get("offset", 0)

    if not position or not position.is_int:
        self.unsupported("ARRAY_INSERT can only be transpiled with a literal position")
        return self.func("ARRAY_INSERT", this, position, element)

    pos_value = position.to_py()

    # Normalize one-based indexing to zero-based for slice calculations
    # Spark (1-based) → Snowflake (0-based):
    #   Positive: pos=1 → pos=0 (subtract 1)
    #   Negative: pos=-2 → pos=-1 (add 1)
    # Example: Spark array_insert([a,b,c], -2, d) → [a,b,d,c] is same as Snowflake pos=-1
    if pos_value > 0:
        pos_value = pos_value - index_offset
    elif pos_value < 0:
        pos_value = pos_value + index_offset

    # Build the appropriate list_concat expression based on position
    if pos_value == 0:
        # insert at beginning
        concat_exprs = [element_array, this]
    elif pos_value > 0:
        # Positive position: LIST_CONCAT(arr[1:pos], [elem], arr[pos+1:])
        # 0-based -> DuckDB 1-based slicing

        # left slice: arr[1:pos]
        slice_start = exp.Bracket(
            this=this,
            expressions=[
                exp.Slice(this=exp.Literal.number(1), expression=exp.Literal.number(pos_value))
            ],
        )

        # right slice: arr[pos+1:]
        slice_end = exp.Bracket(
            this=this, expressions=[exp.Slice(this=exp.Literal.number(pos_value + 1))]
        )

        concat_exprs = [slice_start, element_array, slice_end]
    else:
        # Negative position: arr[1:LEN(arr)+pos], [elem], arr[LEN(arr)+pos+1:]
        # pos=-1 means insert before last element
        arr_len = exp.Length(this=this)

        # Calculate slice position: LEN(arr) + pos (e.g., LEN(arr) + (-1) = LEN(arr) - 1)
        slice_end_pos = arr_len + exp.Literal.number(pos_value)
        slice_start_pos = slice_end_pos + exp.Literal.number(1)

        # left slice: arr[1:LEN(arr)+pos]
        slice_start = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end_pos)],
        )

        # right slice: arr[LEN(arr)+pos+1:]
        slice_end = exp.Bracket(this=this, expressions=[exp.Slice(this=slice_start_pos)])

        concat_exprs = [slice_start, element_array, slice_end]

    # All dialects that support ARRAY_INSERT propagate NULLs (Snowflake/Spark/Databricks)
    # Wrap in CASE WHEN array IS NULL THEN NULL ELSE func_expr END
    return self.sql(
        exp.If(
            this=exp.Is(this=this, expression=exp.Null()),
            true=exp.Null(),
            false=self.func("LIST_CONCAT", *concat_exprs),
        )
    )

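# Illustrative usage (not part of this module; output is approximate): with a
# literal position, _array_insert_sql above slices the list and rebuilds it with
# LIST_CONCAT, wrapping everything in a NULL-propagating CASE.
#
#   import sqlglot
#   sqlglot.transpile("SELECT ARRAY_INSERT(arr, 1, 'x')", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT CASE WHEN arr IS NULL THEN NULL
#   #             ELSE LIST_CONCAT(arr[1 : 1], ['x'], arr[2 : ]) END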

@unsupported_args(("expression", "DuckDB's ARRAY_SORT does not support a comparator."))
def _array_sort_sql(self: DuckDB.Generator, expression: exp.ArraySort) -> str:
    return self.func("ARRAY_SORT", expression.this)


def _sort_array_sql(self: DuckDB.Generator, expression: exp.SortArray) -> str:
    name = "ARRAY_REVERSE_SORT" if expression.args.get("asc") == exp.false() else "ARRAY_SORT"
    return self.func(name, expression.this)


def _build_sort_array_desc(args: t.List) -> exp.Expression:
    return exp.SortArray(this=seq_get(args, 0), asc=exp.false())


def _build_array_prepend(args: t.List) -> exp.Expression:
    return exp.ArrayPrepend(this=seq_get(args, 1), expression=seq_get(args, 0))


def _build_date_diff(args: t.List) -> exp.Expression:
    return exp.DateDiff(this=seq_get(args, 2), expression=seq_get(args, 1), unit=seq_get(args, 0))


def _build_generate_series(end_exclusive: bool = False) -> t.Callable[[t.List], exp.GenerateSeries]:
    def _builder(args: t.List) -> exp.GenerateSeries:
        # Check https://duckdb.org/docs/sql/functions/nested.html#range-functions
        if len(args) == 1:
            # DuckDB uses 0 as a default for the series' start when it's omitted
            args.insert(0, exp.Literal.number("0"))

        gen_series = exp.GenerateSeries.from_arg_list(args)
        gen_series.set("is_end_exclusive", end_exclusive)

        return gen_series

    return _builder


def _build_make_timestamp(args: t.List) -> exp.Expression:
    if len(args) == 1:
        return exp.UnixToTime(this=seq_get(args, 0), scale=exp.UnixToTime.MICROS)

    return exp.TimestampFromParts(
        year=seq_get(args, 0),
        month=seq_get(args, 1),
        day=seq_get(args, 2),
        hour=seq_get(args, 3),
        min=seq_get(args, 4),
        sec=seq_get(args, 5),
    )


def _show_parser(*args: t.Any, **kwargs: t.Any) -> t.Callable[[DuckDB.Parser], exp.Show]:
    def _parse(self: DuckDB.Parser) -> exp.Show:
        return self._parse_show_duckdb(*args, **kwargs)

    return _parse


def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
    ancestor_cast = expression.find_ancestor(exp.Cast, exp.Select)
    ancestor_cast = None if isinstance(ancestor_cast, exp.Select) else ancestor_cast

    # Empty struct cast works with MAP() since DuckDB can't parse {}
    if not expression.expressions:
        if isinstance(ancestor_cast, exp.Cast) and ancestor_cast.to.is_type(exp.DataType.Type.MAP):
            return "MAP()"

    args: t.List[str] = []

    # BigQuery allows inline construction such as "STRUCT<a STRING, b INTEGER>('str', 1)" which is
    # canonicalized to "ROW('str', 1) AS STRUCT(a TEXT, b INT)" in DuckDB
    # The transformation to ROW will take place if:
    # 1. The STRUCT itself does not have proper fields (key := value) as a "proper" STRUCT would
    # 2. A cast to STRUCT / ARRAY of STRUCTs is found
    is_bq_inline_struct = (
        (expression.find(exp.PropertyEQ) is None)
        and ancestor_cast
        and any(
            casted_type.is_type(exp.DataType.Type.STRUCT)
            for casted_type in ancestor_cast.find_all(exp.DataType)
        )
    )

    for i, expr in enumerate(expression.expressions):
        is_property_eq = isinstance(expr, exp.PropertyEQ)
        this = expr.this
        value = expr.expression if is_property_eq else expr

        if is_bq_inline_struct:
            args.append(self.sql(value))
        else:
            if isinstance(this, exp.Identifier):
                key = self.sql(exp.Literal.string(expr.name))
            elif is_property_eq:
                key = self.sql(this)
            else:
                key = self.sql(exp.Literal.string(f"_{i}"))

            args.append(f"{key}: {self.sql(value)}")

    csv_args = ", ".join(args)

    return f"ROW({csv_args})" if is_bq_inline_struct else f"{{{csv_args}}}"

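# Illustrative usage (not part of this module; output is approximate): a BigQuery
# named struct becomes DuckDB's brace syntax via _struct_sql above, while an
# inline-typed struct under a cast becomes ROW(...).
#
#   import sqlglot
#   sqlglot.transpile("SELECT STRUCT(1 AS a, 'b' AS b)", read="bigquery", write="duckdb")[0]
#   # -> roughly: SELECT {'a': 1, 'b': 'b'}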
def _datatype_sql(self: DuckDB.Generator, expression: exp.DataType) -> str:
    if expression.is_type("array"):
        return f"{self.expressions(expression, flat=True)}[{self.expressions(expression, key='values', flat=True)}]"

    # Modifiers are not supported for TIME, [TIME | TIMESTAMP] WITH TIME ZONE
    if expression.is_type(
        exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ, exp.DataType.Type.TIMESTAMPTZ
    ):
        return expression.this.value

    return self.datatype_sql(expression)


def _json_format_sql(self: DuckDB.Generator, expression: exp.JSONFormat) -> str:
    sql = self.func("TO_JSON", expression.this, expression.args.get("options"))
    return f"CAST({sql} AS TEXT)"


def _seq_sql(self: DuckDB.Generator, expression: exp.Func, byte_width: int) -> str:
    """
    Transpile Snowflake SEQ1/SEQ2/SEQ4/SEQ8 to DuckDB.

    Generates monotonically increasing integers starting from 0.
    The signed parameter (0 or 1) affects wrap-around behavior:
    - Unsigned (0): wraps at 2^(bits) - 1
    - Signed (1): wraps at 2^(bits-1) - 1, then goes negative

    Note: SEQ in WHERE, HAVING, aggregates, or window ORDER BY is not supported
    because these contexts don't allow window functions. Users should rewrite
    using CTEs or subqueries.

    Args:
        expression: The SEQ function expression (may have 'this' arg for signed param)
        byte_width: 1, 2, 4, or 8 bytes

    Returns:
        SQL string using ROW_NUMBER() with modulo for wrap-around
    """
    # Warn if SEQ is in a restricted context (Select stops search at current scope)
    ancestor = expression.find_ancestor(*_SEQ_RESTRICTED)
    if ancestor and (
        (not isinstance(ancestor, (exp.Order, exp.Select)))
        or (isinstance(ancestor, exp.Order) and isinstance(ancestor.parent, exp.Window))
    ):
        self.unsupported("SEQ in restricted context is not supported - use CTE or subquery")

    bits = byte_width * 8
    max_val = exp.Literal.number(2**bits)

    if expression.name == "1":
        half = exp.Literal.number(2 ** (bits - 1))
        result = exp.replace_placeholders(self.SEQ_SIGNED.copy(), max_val=max_val, half=half)
    else:
        result = exp.replace_placeholders(self.SEQ_UNSIGNED.copy(), max_val=max_val)

    return self.sql(result)

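# Illustrative usage (not part of this module; the output shape is an assumption
# based on the docstring above, since the SEQ_SIGNED/SEQ_UNSIGNED templates live
# elsewhere in the generator): SEQ4() becomes a ROW_NUMBER()-based expression with
# a modulo wrap-around at 2**32 (or 2**31 for the signed variant).
#
#   import sqlglot
#   sqlglot.transpile("SELECT SEQ4() FROM t", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT (ROW_NUMBER() OVER (ORDER BY 1) - 1) % 4294967296 FROM t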
def _unix_to_time_sql(self: DuckDB.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this
    target_type = expression.args.get("target_type")

    # Check if we need NTZ (naive timestamp in UTC)
    is_ntz = target_type and target_type.this in (
        exp.DataType.Type.TIMESTAMP,
        exp.DataType.Type.TIMESTAMPNTZ,
    )

    if scale == exp.UnixToTime.MILLIS:
        # EPOCH_MS already returns TIMESTAMP (naive, UTC)
        return self.func("EPOCH_MS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        # MAKE_TIMESTAMP already returns TIMESTAMP (naive, UTC)
        return self.func("MAKE_TIMESTAMP", timestamp)

    # Other scales: divide and use TO_TIMESTAMP
    if scale not in (None, exp.UnixToTime.SECONDS):
        timestamp = exp.Div(this=timestamp, expression=exp.func("POW", 10, scale))

    to_timestamp: exp.Expression = exp.Anonymous(this="TO_TIMESTAMP", expressions=[timestamp])

    if is_ntz:
        to_timestamp = exp.AtTimeZone(this=to_timestamp, zone=exp.Literal.string("UTC"))

    return self.sql(to_timestamp)


WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In, exp.Not)


def _arrow_json_extract_sql(self: DuckDB.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    arrow_sql = arrow_json_extract_sql(self, expression)
    if not expression.same_parent and isinstance(
        expression.parent, WRAPPED_JSON_EXTRACT_EXPRESSIONS
    ):
        arrow_sql = self.wrap(arrow_sql)
    return arrow_sql


def _implicit_datetime_cast(
    arg: t.Optional[exp.Expression], type: exp.DataType.Type = exp.DataType.Type.DATE
) -> t.Optional[exp.Expression]:
    if isinstance(arg, exp.Literal) and arg.is_string:
        ts = arg.name
        if type == exp.DataType.Type.DATE and ":" in ts:
            type = (
                exp.DataType.Type.TIMESTAMPTZ
                if TIMEZONE_PATTERN.search(ts)
                else exp.DataType.Type.TIMESTAMP
            )

        arg = exp.cast(arg, type)

    return arg


def _week_unit_to_dow(unit: t.Optional[exp.Expression]) -> t.Optional[int]:
    """
    Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming
    from other dialects, e.g. BigQuery's WEEK(<day>) or ISOWEEK unit parts.

    Args:
        unit: The unit expression (Var for ISOWEEK or WeekStart)

    Returns:
        The ISO 8601 day number (Monday=1, Sunday=7, etc.) or None if not a week unit or if the day is dynamic (not a constant).

    Examples:
        "WEEK(SUNDAY)" -> 7
        "WEEK(MONDAY)" -> 1
        "ISOWEEK" -> 1
    """
    # Handle plain Var expressions for ISOWEEK only
    if isinstance(unit, exp.Var) and unit.name.upper() in "ISOWEEK":
        return 1

    # Handle WeekStart expressions with explicit day
    if isinstance(unit, exp.WeekStart):
        return WEEK_START_DAY_TO_DOW.get(unit.name.upper())

    return None

def _build_week_trunc_expression(date_expr: exp.Expression, start_dow: int) -> exp.Expression:
    """
    Build DATE_TRUNC expression for week boundaries with custom start day.

    Args:
        date_expr: The date expression to truncate
        start_dow: ISO 8601 day-of-week number of the week's start day (Monday=1, ..., Sunday=7)

    DuckDB's DATE_TRUNC('WEEK', ...) aligns weeks to Monday (ISO standard).
    To align to a different start day, we shift the date before truncating.

    Shift formula: Sunday (7) gets +1, others get (1 - start_dow)
    Examples:
        Monday (1): shift = 0 (no shift needed)
        Tuesday (2): shift = -1 (shift back 1 day) ...
        Sunday (7): shift = +1 (shift forward 1 day, wraps to the next Monday-based week)
    """
    shift_days = 1 if start_dow == 7 else 1 - start_dow

    # Shift date to align week boundaries with the desired start day
    # No shift needed for Monday-based weeks (shift_days == 0)
    shifted_date = (
        exp.DateAdd(
            this=date_expr,
            expression=exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY")),
        )
        if shift_days != 0
        else date_expr
    )

    return exp.DateTrunc(unit=exp.var("WEEK"), this=shifted_date)


def _date_diff_sql(self: DuckDB.Generator, expression: exp.DateDiff) -> str:
    unit = expression.unit

    if _is_nanosecond_unit(unit):
        return _handle_nanosecond_diff(self, expression.this, expression.expression)

    this = _implicit_datetime_cast(expression.this)
    expr = _implicit_datetime_cast(expression.expression)

    # DuckDB's WEEK diff does not respect Monday crossing (week boundaries), it checks (end_day - start_day) / 7:
    #   SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
    #   SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference)
    # Whereas for other units such as MONTH it does respect month boundaries:
    #   SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (Month crossed)
    date_part_boundary = expression.args.get("date_part_boundary")

    # Extract week start day; returns None if day is dynamic (column/placeholder)
    week_start = _week_unit_to_dow(unit)
    if date_part_boundary and week_start and this and expr:
        expression.set("unit", exp.Literal.string("WEEK"))

        # Truncate both dates to week boundaries to respect input dialect semantics
        this = _build_week_trunc_expression(this, week_start)
        expr = _build_week_trunc_expression(expr, week_start)

    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)

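# Illustrative usage (not part of this module; output is approximate and assumes the
# source dialect marks the diff with date_part_boundary): BigQuery's WEEK(SUNDAY)
# date part makes _date_diff_sql above truncate both operands to shifted week
# boundaries before calling DATE_DIFF('WEEK', ...).
#
#   import sqlglot
#   sqlglot.transpile(
#       "SELECT DATE_DIFF(d1, d2, WEEK(SUNDAY))", read="bigquery", write="duckdb"
#   )[0]
#   # -> roughly: SELECT DATE_DIFF('WEEK',
#   #                 DATE_TRUNC('WEEK', d2 + INTERVAL '1' DAY),
#   #                 DATE_TRUNC('WEEK', d1 + INTERVAL '1' DAY))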
def _generate_datetime_array_sql(
    self: DuckDB.Generator, expression: t.Union[exp.GenerateDateArray, exp.GenerateTimestampArray]
) -> str:
    is_generate_date_array = isinstance(expression, exp.GenerateDateArray)

    type = exp.DataType.Type.DATE if is_generate_date_array else exp.DataType.Type.TIMESTAMP
    start = _implicit_datetime_cast(expression.args.get("start"), type=type)
    end = _implicit_datetime_cast(expression.args.get("end"), type=type)

    # BQ's GENERATE_DATE_ARRAY & GENERATE_TIMESTAMP_ARRAY are transformed to DuckDB's GENERATE_SERIES
    gen_series: t.Union[exp.GenerateSeries, exp.Cast] = exp.GenerateSeries(
        start=start, end=end, step=expression.args.get("step")
    )

    if is_generate_date_array:
        # The GENERATE_SERIES result type is TIMESTAMP array, so to match BQ's semantics for
        # GENERATE_DATE_ARRAY we must cast it back to DATE array
        gen_series = exp.cast(gen_series, exp.DataType.build("ARRAY<DATE>"))

    return self.sql(gen_series)


def _json_extract_value_array_sql(
    self: DuckDB.Generator, expression: exp.JSONValueArray | exp.JSONExtractArray
) -> str:
    json_extract = exp.JSONExtract(this=expression.this, expression=expression.expression)
    data_type = "ARRAY<STRING>" if isinstance(expression, exp.JSONValueArray) else "ARRAY<JSON>"
    return self.sql(exp.cast(json_extract, to=exp.DataType.build(data_type)))


def _cast_to_varchar(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    if arg and arg.type and not arg.is_type(*exp.DataType.TEXT_TYPES, exp.DataType.Type.UNKNOWN):
        return exp.cast(arg, exp.DataType.Type.VARCHAR)
    return arg


def _cast_to_boolean(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    if arg and not arg.is_type(exp.DataType.Type.BOOLEAN):
        return exp.cast(arg, exp.DataType.Type.BOOLEAN)
    return arg


def _is_binary(arg: exp.Expression) -> bool:
    return arg.is_type(
        exp.DataType.Type.BINARY,
        exp.DataType.Type.VARBINARY,
        exp.DataType.Type.BLOB,
    )


def _gen_with_cast_to_blob(
    self: DuckDB.Generator, expression: exp.Expression, result_sql: str
) -> str:
    if _is_binary(expression):
        blob = exp.DataType.build("BLOB", dialect="duckdb")
        result_sql = self.sql(exp.Cast(this=result_sql, to=blob))
    return result_sql


def _cast_to_bit(arg: exp.Expression) -> exp.Expression:
    if not _is_binary(arg):
        return arg

    if isinstance(arg, exp.HexString):
        arg = exp.Unhex(this=exp.Literal.string(arg.this))

    return exp.cast(arg, exp.DataType.Type.BIT)


def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
    if _is_binary(expression.this):
        expression.set("this", _cast_to_bit(expression.this))
    if _is_binary(expression.expression):
        expression.set("expression", _cast_to_bit(expression.expression))


def _day_navigation_sql(
    self: DuckDB.Generator, expression: t.Union[exp.NextDay, exp.PreviousDay]
) -> str:
    """
    Transpile Snowflake's NEXT_DAY / PREVIOUS_DAY to DuckDB using date arithmetic.

    Returns the DATE of the next/previous occurrence of the specified weekday.

    Formulas:
    - NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
    - PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1

    Supports both literal and non-literal day names:
    - Literal: Direct lookup (e.g., 'Monday' → 1)
    - Non-literal: CASE statement for runtime evaluation

    Examples:
        NEXT_DAY('2024-01-01' (Monday), 'Monday')
        → (1 - 1 + 6) % 7 + 1 = 6 % 7 + 1 = 7 days → 2024-01-08

        PREVIOUS_DAY('2024-01-15' (Monday), 'Friday')
        → (1 - 5 + 6) % 7 + 1 = 2 % 7 + 1 = 3 days → 2024-01-12
    """
    date_expr = expression.this
    day_name_expr = expression.expression

    # Build ISODOW call for current day of week
    isodow_call = exp.func("ISODOW", date_expr)

    # Determine target day of week
    if isinstance(day_name_expr, exp.Literal):
        # Literal day name: lookup target_dow directly
        day_name_str = day_name_expr.name.upper()
        matching_day = next(
            (day for day in WEEK_START_DAY_TO_DOW if day.startswith(day_name_str)), None
        )
        if matching_day:
            target_dow: exp.Expression = exp.Literal.number(WEEK_START_DAY_TO_DOW[matching_day])
        else:
            # Unrecognized day name, use fallback
            return self.function_fallback_sql(expression)
    else:
        # Non-literal day name: build CASE statement for runtime mapping
        upper_day_name = exp.Upper(this=day_name_expr)
        target_dow = exp.Case(
            ifs=[
                exp.If(
                    this=exp.func(
                        "STARTS_WITH", upper_day_name.copy(), exp.Literal.string(day[:2])
                    ),
                    true=exp.Literal.number(dow_num),
                )
                for day, dow_num in WEEK_START_DAY_TO_DOW.items()
            ]
        )

    # Calculate days offset and apply interval based on direction
    if isinstance(expression, exp.NextDay):
        # NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
        days_offset = exp.paren(target_dow - isodow_call + 6, copy=False) % 7 + 1
        date_with_offset = date_expr + exp.Interval(this=days_offset, unit=exp.var("DAY"))
    else:  # exp.PreviousDay
        # PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1
        days_offset = exp.paren(isodow_call - target_dow + 6, copy=False) % 7 + 1
        date_with_offset = date_expr - exp.Interval(this=days_offset, unit=exp.var("DAY"))

    # Build final: CAST(date_with_offset AS DATE)
    return self.sql(exp.cast(date_with_offset, exp.DataType.Type.DATE))

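# Illustrative usage (not part of this module; output is approximate): a literal
# weekday name is resolved at transpile time by _day_navigation_sql above, so only
# the ISODOW arithmetic remains in the generated SQL.
#
#   import sqlglot
#   sqlglot.transpile("SELECT NEXT_DAY(d, 'Friday')", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT CAST(d + INTERVAL ((5 - ISODOW(d) + 6) % 7 + 1) DAY AS DATE)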
def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
    # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL
    having = expression.this
    if isinstance(having, exp.HavingMax):
        func_name = "ARG_MAX_NULL" if having.args.get("max") else "ARG_MIN_NULL"
        return self.func(func_name, having.this, having.expression)
    return self.function_fallback_sql(expression)


def _bitwise_agg_sql(
    self: DuckDB.Generator,
    expression: t.Union[exp.BitwiseOrAgg, exp.BitwiseAndAgg, exp.BitwiseXorAgg],
) -> str:
    """
    DuckDB's bitwise aggregate functions only accept integer types. For other types:
    - DECIMAL/STRING: Use CAST(arg AS INT) to convert directly, which rounds to the nearest int
    - FLOAT/DOUBLE: Use ROUND(arg)::INT to round to the nearest integer, required due to float precision loss
    """
    if isinstance(expression, exp.BitwiseOrAgg):
        func_name = "BIT_OR"
    elif isinstance(expression, exp.BitwiseAndAgg):
        func_name = "BIT_AND"
    else:  # exp.BitwiseXorAgg
        func_name = "BIT_XOR"

    arg = expression.this

    if not arg.type:
        from sqlglot.optimizer.annotate_types import annotate_types

        arg = annotate_types(arg, dialect=self.dialect)

    if arg.is_type(*exp.DataType.REAL_TYPES, *exp.DataType.TEXT_TYPES):
        if arg.is_type(*exp.DataType.FLOAT_TYPES):
            # float types need to be rounded first due to precision loss
            arg = exp.func("ROUND", arg)

        arg = exp.cast(arg, exp.DataType.Type.INT)

    return self.func(func_name, arg)

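# Illustrative usage (not part of this module; output is approximate, and the
# mapping of Snowflake's BITOR_AGG to exp.BitwiseOrAgg is an assumption): a
# non-integer aggregate argument gets the CAST/ROUND treatment described in
# _bitwise_agg_sql above.
#
#   import sqlglot
#   sqlglot.transpile(
#       "SELECT BITOR_AGG(CAST(x AS DOUBLE)) FROM t", read="snowflake", write="duckdb"
#   )[0]
#   # -> roughly: SELECT BIT_OR(CAST(ROUND(CAST(x AS DOUBLE)) AS INT)) FROM t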
def _literal_sql_with_ws_chr(self: DuckDB.Generator, literal: str) -> str:
    # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
    if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
        return self.sql(exp.Literal.string(literal))

    sql_segments: t.List[str] = []
    for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK):
        if is_ws_control:
            for ch in group:
                duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch]
                sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code))))
        else:
            sql_segments.append(self.sql(exp.Literal.string("".join(group))))

    sql = " || ".join(sql_segments)
    return sql if len(sql_segments) == 1 else f"({sql})"


def _escape_regex_metachars(
    self: DuckDB.Generator, delimiters: t.Optional[exp.Expression], delimiters_sql: str
) -> str:
    r"""
    Escapes the regex metacharacters \ - ^ [ ] for use in character class regex expressions.

    Literal strings are escaped at transpile time; other expressions are handled with REPLACE() calls.
    """
    if not delimiters:
        return delimiters_sql

    if delimiters.is_string:
        literal_value = delimiters.this
        escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value)
        return _literal_sql_with_ws_chr(self, escaped_literal)

    escaped_sql = delimiters_sql
    for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items():
        escaped_sql = self.func(
            "REPLACE",
            escaped_sql,
            self.sql(exp.Literal.string(raw)),
            self.sql(exp.Literal.string(escaped)),
        )

    return escaped_sql


def _build_capitalization_sql(
    self: DuckDB.Generator,
    value_to_split: str,
    delimiters_sql: str,
) -> str:
    # empty string delimiter --> treat value as one word, no need to split
    if delimiters_sql == "''":
        return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))"

    delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')"
    split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')"

    # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments.
    # We do not know whether the first segment is a delimiter or not, so we check the first character of the string
    # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd.
    return self.func(
        "ARRAY_TO_STRING",
        exp.case()
        .when(
            f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})",
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        )
        .else_(
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        ),
        "''",
    )


def _initcap_sql(self: DuckDB.Generator, expression: exp.Initcap) -> str:
    this_sql = self.sql(expression, "this")
    delimiters = expression.args.get("expression")
    if delimiters is None:
        # fallback for manually created exp.Initcap w/o delimiters arg
        delimiters = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS)
    delimiters_sql = self.sql(delimiters)

    escaped_delimiters_sql = _escape_regex_metachars(self, delimiters, delimiters_sql)

    return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql)

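# Illustrative usage (not part of this module; the output shape is only sketched):
# INITCAP with explicit delimiters goes through _initcap_sql above, which builds the
# REGEXP_EXTRACT_ALL / LIST_TRANSFORM pipeline from _build_capitalization_sql.
#
#   import sqlglot
#   sqlglot.transpile("SELECT INITCAP(s, '-')", read="snowflake", write="duckdb")[0]
#   # -> an ARRAY_TO_STRING(... LIST_TRANSFORM(REGEXP_EXTRACT_ALL(s, ...), ...) ...)
#   #    expression that upper-cases the first letter of each '-'-separated segment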
def _boolxor_agg_sql(self: DuckDB.Generator, expression: exp.BoolxorAgg) -> str:
    """
    Snowflake's `BOOLXOR_AGG(col)` returns TRUE if exactly one input in `col` is TRUE, FALSE otherwise;
    since DuckDB does not have a matching function, we mimic the behavior by generating `COUNT_IF(col) = 1`.

    DuckDB's COUNT_IF strictly requires boolean inputs, so cast if not already boolean.
    """
    return self.sql(
        exp.EQ(
            this=exp.CountIf(this=_cast_to_boolean(expression.this)),
            expression=exp.Literal.number(1),
        )
    )


def _bitshift_sql(
    self: DuckDB.Generator, expression: exp.BitwiseLeftShift | exp.BitwiseRightShift
) -> str:
    """
    Transform bitshift expressions for DuckDB by injecting BIT/INT128 casts.

    DuckDB's bitwise shift operators don't work with BLOB/BINARY types, so we cast
    them to BIT for the operation, then cast the result back to the original type.

    Note: Assumes type annotation has been applied with the source dialect.
    """
    operator = "<<" if isinstance(expression, exp.BitwiseLeftShift) else ">>"
    result_is_blob = False
    this = expression.this

    if _is_binary(this):
        result_is_blob = True
        expression.set("this", exp.cast(this, exp.DataType.Type.BIT))
    elif expression.args.get("requires_int128"):
        this.replace(exp.cast(this, exp.DataType.Type.INT128))

    result_sql = self.binary(expression, operator)

    # Wrap in parentheses if parent is a bitwise operator to "fix" DuckDB precedence issue
    # DuckDB parses: a << b | c << d as (a << b | c) << d
    if isinstance(expression.parent, exp.Binary):
        result_sql = self.sql(exp.Paren(this=result_sql))

    if result_is_blob:
        result_sql = self.sql(
            exp.Cast(this=result_sql, to=exp.DataType.build("BLOB", dialect="duckdb"))
        )

    return result_sql


def _scale_rounding_sql(
    self: DuckDB.Generator,
    expression: exp.Expression,
    rounding_func: type[exp.Expression],
) -> str | None:
    """
    Handle scale parameter transformation for rounding functions.

    DuckDB doesn't support the scale parameter for certain functions (e.g., FLOOR, CEIL),
    so we transform: FUNC(x, n) to ROUND(FUNC(x * 10^n) / 10^n, n)

    Args:
        self: The DuckDB generator instance
        expression: The expression to transform (must have 'this', 'decimals', and 'to' args)
        rounding_func: The rounding function class to use in the transformation

    Returns:
        The transformed SQL string if the decimals parameter exists, None otherwise
    """
    decimals = expression.args.get("decimals")

    if decimals is None or expression.args.get("to") is not None:
        return None

    this = expression.this
    if isinstance(this, exp.Binary):
        this = exp.Paren(this=this)

    n_int = decimals
    if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
        n_int = exp.cast(decimals, exp.DataType.Type.INT)

    pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
    rounded = rounding_func(this=exp.Mul(this=this, expression=pow_))
    result = exp.Div(this=rounded, expression=pow_.copy())

    return self.round_sql(
        exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
    )


def _ceil_floor(self: DuckDB.Generator, expression: exp.Floor | exp.Ceil) -> str:
    scaled_sql = _scale_rounding_sql(self, expression, type(expression))
    if scaled_sql is not None:
        return scaled_sql
    return self.ceil_floor(expression)

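# Illustrative usage (not part of this module; output is approximate, and it assumes
# the source dialect parses the second FLOOR argument into the 'decimals' arg):
# FLOOR with a scale is rewritten by _ceil_floor/_scale_rounding_sql above, since
# DuckDB's FLOOR takes no scale.
#
#   import sqlglot
#   sqlglot.transpile("SELECT FLOOR(x, 2)", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT ROUND(FLOOR(x * POWER(10, 2)) / POWER(10, 2), 2)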
def _regr_val_sql(
    self: DuckDB.Generator,
    expression: exp.RegrValx | exp.RegrValy,
) -> str:
    """
    Transpile Snowflake's REGR_VALX/REGR_VALY to DuckDB equivalent.

    REGR_VALX(y, x) returns NULL if y is NULL; otherwise returns x.
    REGR_VALY(y, x) returns NULL if x is NULL; otherwise returns y.
    """
    from sqlglot.optimizer.annotate_types import annotate_types

    y = expression.this
    x = expression.expression

    # Determine which argument to check for NULL and which to return based on expression type
    if isinstance(expression, exp.RegrValx):
        # REGR_VALX: check y for NULL, return x
        check_for_null = y
        return_value = x
        return_value_attr = "expression"
    else:
        # REGR_VALY: check x for NULL, return y
        check_for_null = x
        return_value = y
        return_value_attr = "this"

    # Get the type from the return argument
    result_type = return_value.type

    # If no type info, annotate the expression to infer types
    if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
        try:
            annotated = annotate_types(expression.copy(), dialect=self.dialect)
            result_type = getattr(annotated, return_value_attr).type
        except Exception:
            pass

    # Default to DOUBLE for regression functions if type still unknown
    if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
        result_type = exp.DataType.build("DOUBLE")

    # Cast NULL to the same type as return_value to avoid DuckDB type inference issues
    typed_null = exp.Cast(this=exp.Null(), to=result_type)

    return self.sql(
        exp.If(
            this=exp.Is(this=check_for_null.copy(), expression=exp.Null()),
            true=typed_null,
            false=return_value.copy(),
        )
    )

def _maybe_corr_null_to_false(
    expression: t.Union[exp.Filter, exp.Window, exp.Corr],
) -> t.Optional[t.Union[exp.Filter, exp.Window, exp.Corr]]:
    corr = expression
    while isinstance(corr, (exp.Window, exp.Filter)):
        corr = corr.this

    if not isinstance(corr, exp.Corr) or not corr.args.get("null_on_zero_variance"):
        return None

    corr.set("null_on_zero_variance", False)
    return expression


def _date_from_parts_sql(self, expression: exp.DateFromParts) -> str:
    """
    Snowflake's DATE_FROM_PARTS allows out-of-range values for the month and day input,
    e.g. larger values (month=13, day=100), zero values (month=0, day=0), negative values (month=-13, day=-100).

    DuckDB's MAKE_DATE does not support out-of-range values, but DuckDB's INTERVAL type does.

    We convert to date arithmetic:
        DATE_FROM_PARTS(year, month, day)
        -> MAKE_DATE(year, 1, 1) + INTERVAL (month - 1) MONTH + INTERVAL (day - 1) DAY
    """
    year_expr = expression.args.get("year")
    month_expr = expression.args.get("month")
    day_expr = expression.args.get("day")

    if expression.args.get("allow_overflow"):
        base_date: exp.Expression = exp.func(
            "MAKE_DATE", year_expr, exp.Literal.number(1), exp.Literal.number(1)
        )

        if month_expr:
            base_date = base_date + exp.Interval(this=month_expr - 1, unit=exp.var("MONTH"))

        if day_expr:
            base_date = base_date + exp.Interval(this=day_expr - 1, unit=exp.var("DAY"))

        return self.sql(exp.cast(expression=base_date, to=exp.DataType.Type.DATE))

    return self.func("MAKE_DATE", year_expr, month_expr, day_expr)

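# Illustrative usage (not part of this module; output is approximate, and it assumes
# the source dialect sets allow_overflow on the expression): out-of-range month/day
# parts are handled via interval arithmetic in _date_from_parts_sql above.
#
#   import sqlglot
#   sqlglot.transpile("SELECT DATE_FROM_PARTS(2020, 13, 1)", read="snowflake", write="duckdb")[0]
#   # -> roughly: SELECT CAST(MAKE_DATE(2020, 1, 1)
#   #                 + INTERVAL (13 - 1) MONTH + INTERVAL (1 - 1) DAY AS DATE)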
def _round_arg(arg: exp.Expression, round_input: t.Optional[bool] = None) -> exp.Expression:
    if round_input:
        return exp.func("ROUND", arg, exp.Literal.number(0))
    return arg


def _boolnot_sql(self: DuckDB.Generator, expression: exp.Boolnot) -> str:
    arg = _round_arg(expression.this, expression.args.get("round_input"))
    return self.sql(exp.not_(exp.paren(arg)))


def _booland_sql(self: DuckDB.Generator, expression: exp.Booland) -> str:
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(exp.paren(exp.and_(exp.paren(left), exp.paren(right), wrap=False)))


def _boolor_sql(self: DuckDB.Generator, expression: exp.Boolor) -> str:
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(exp.paren(exp.or_(exp.paren(left), exp.paren(right), wrap=False)))


def _xor_sql(self: DuckDB.Generator, expression: exp.Xor) -> str:
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(
        exp.or_(
            exp.paren(exp.and_(left.copy(), exp.paren(right.not_()), wrap=False)),
            exp.paren(exp.and_(exp.paren(left.not_()), right.copy(), wrap=False)),
            wrap=False,
        )
    )


class DuckDB(Dialect):
    NULL_ORDERING = "nulls_are_last"
    SUPPORTS_USER_DEFINED_TYPES = True
    SAFE_DIVISION = True
    INDEX_OFFSET = 1
    CONCAT_COALESCE = True
    SUPPORTS_ORDER_BY_ALL = True
    SUPPORTS_FIXED_SIZE_ARRAYS = True
    STRICT_JSON_PATH_SYNTAX = False
    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True

    # https://duckdb.org/docs/sql/introduction.html#creating-a-new-table
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    DATE_PART_MAPPING = {
        **Dialect.DATE_PART_MAPPING,
        "DAYOFWEEKISO": "ISODOW",
    }

    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    DATE_PART_MAPPING.pop("WEEKDAY")

    INVERSE_TIME_MAPPING = {
        "%e": "%-d",  # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d)
        "%:z": "%z",  # In DuckDB %z can represent ±HH:MM, ±HHMM, or ±HH.
        "%-z": "%z",
        "%f_zero": "%n",
        "%f_one": "%n",
        "%f_two": "%n",
        "%f_three": "%g",
        "%f_four": "%n",
        "%f_five": "%n",
        "%f_seven": "%n",
        "%f_eight": "%n",
        "%f_nine": "%n",
    }

1300 "%-z": "%z", 1301 "%f_zero": "%n", 1302 "%f_one": "%n", 1303 "%f_two": "%n", 1304 "%f_three": "%g", 1305 "%f_four": "%n", 1306 "%f_five": "%n", 1307 "%f_seven": "%n", 1308 "%f_eight": "%n", 1309 "%f_nine": "%n", 1310 } 1311 1312 def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 1313 if isinstance(path, exp.Literal): 1314 # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`. 1315 # Additionally, it allows accessing the back of lists using the `[#-i]` syntax. 1316 # This check ensures we'll avoid trying to parse these as JSON paths, which can 1317 # either result in a noisy warning or in an invalid representation of the path. 1318 path_text = path.name 1319 if path_text.startswith("/") or "[#" in path_text: 1320 return path 1321 1322 return super().to_json_path(path) 1323 1324 class Tokenizer(tokens.Tokenizer): 1325 BYTE_STRINGS = [("e'", "'"), ("E'", "'")] 1326 BYTE_STRING_ESCAPES = ["'", "\\"] 1327 HEREDOC_STRINGS = ["$"] 1328 1329 HEREDOC_TAG_IS_IDENTIFIER = True 1330 HEREDOC_STRING_ALTERNATIVE = TokenType.PARAMETER 1331 1332 KEYWORDS = { 1333 **tokens.Tokenizer.KEYWORDS, 1334 "//": TokenType.DIV, 1335 "**": TokenType.DSTAR, 1336 "^@": TokenType.CARET_AT, 1337 "@>": TokenType.AT_GT, 1338 "<@": TokenType.LT_AT, 1339 "ATTACH": TokenType.ATTACH, 1340 "BINARY": TokenType.VARBINARY, 1341 "BITSTRING": TokenType.BIT, 1342 "BPCHAR": TokenType.TEXT, 1343 "CHAR": TokenType.TEXT, 1344 "DATETIME": TokenType.TIMESTAMPNTZ, 1345 "DETACH": TokenType.DETACH, 1346 "FORCE": TokenType.FORCE, 1347 "INSTALL": TokenType.INSTALL, 1348 "INT8": TokenType.BIGINT, 1349 "LOGICAL": TokenType.BOOLEAN, 1350 "MACRO": TokenType.FUNCTION, 1351 "ONLY": TokenType.ONLY, 1352 "PIVOT_WIDER": TokenType.PIVOT, 1353 "POSITIONAL": TokenType.POSITIONAL, 1354 "RESET": TokenType.COMMAND, 1355 "ROW": TokenType.STRUCT, 1356 "SIGNED": TokenType.INT, 1357 "STRING": TokenType.TEXT, 1358 "SUMMARIZE": TokenType.SUMMARIZE, 1359 "TIMESTAMP": TokenType.TIMESTAMPNTZ, 1360 "TIMESTAMP_S": TokenType.TIMESTAMP_S, 1361 "TIMESTAMP_MS": TokenType.TIMESTAMP_MS, 1362 "TIMESTAMP_NS": TokenType.TIMESTAMP_NS, 1363 "TIMESTAMP_US": TokenType.TIMESTAMP, 1364 "UBIGINT": TokenType.UBIGINT, 1365 "UINTEGER": TokenType.UINT, 1366 "USMALLINT": TokenType.USMALLINT, 1367 "UTINYINT": TokenType.UTINYINT, 1368 "VARCHAR": TokenType.TEXT, 1369 } 1370 KEYWORDS.pop("/*+") 1371 1372 SINGLE_TOKENS = { 1373 **tokens.Tokenizer.SINGLE_TOKENS, 1374 "$": TokenType.PARAMETER, 1375 } 1376 1377 COMMANDS = tokens.Tokenizer.COMMANDS - {TokenType.SHOW} 1378 1379 class Parser(parser.Parser): 1380 MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS = True 1381 1382 BITWISE = parser.Parser.BITWISE.copy() 1383 BITWISE.pop(TokenType.CARET) 1384 1385 RANGE_PARSERS = { 1386 **parser.Parser.RANGE_PARSERS, 1387 TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps), 1388 TokenType.CARET_AT: binary_range_parser(exp.StartsWith), 1389 TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch), 1390 } 1391 1392 EXPONENT = { 1393 **parser.Parser.EXPONENT, 1394 TokenType.CARET: exp.Pow, 1395 TokenType.DSTAR: exp.Pow, 1396 } 1397 1398 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "STRUCT_PACK"} 1399 1400 SHOW_PARSERS = { 1401 "TABLES": _show_parser("TABLES"), 1402 "ALL TABLES": _show_parser("ALL TABLES"), 1403 } 1404 1405 FUNCTIONS = { 1406 **parser.Parser.FUNCTIONS, 1407 "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)), 1408 "ARRAY_PREPEND": _build_array_prepend, 1409 
"ARRAY_REVERSE_SORT": _build_sort_array_desc, 1410 "ARRAY_SORT": exp.SortArray.from_arg_list, 1411 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 1412 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 1413 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 1414 "DATEDIFF": _build_date_diff, 1415 "DATE_DIFF": _build_date_diff, 1416 "DATE_TRUNC": date_trunc_to_time, 1417 "DATETRUNC": date_trunc_to_time, 1418 "DECODE": lambda args: exp.Decode( 1419 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1420 ), 1421 "EDITDIST3": exp.Levenshtein.from_arg_list, 1422 "ENCODE": lambda args: exp.Encode( 1423 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1424 ), 1425 "EPOCH": exp.TimeToUnix.from_arg_list, 1426 "EPOCH_MS": lambda args: exp.UnixToTime( 1427 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 1428 ), 1429 "GENERATE_SERIES": _build_generate_series(), 1430 "GET_BIT": lambda args: exp.Getbit( 1431 this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True 1432 ), 1433 "JSON": exp.ParseJSON.from_arg_list, 1434 "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract), 1435 "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar), 1436 "LIST_APPEND": exp.ArrayAppend.from_arg_list, 1437 "LIST_CONCAT": parser.build_array_concat, 1438 "LIST_CONTAINS": exp.ArrayContains.from_arg_list, 1439 "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list, 1440 "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list, 1441 "LIST_FILTER": exp.ArrayFilter.from_arg_list, 1442 "LIST_HAS": exp.ArrayContains.from_arg_list, 1443 "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list, 1444 "LIST_PREPEND": _build_array_prepend, 1445 "LIST_REVERSE_SORT": _build_sort_array_desc, 1446 "LIST_SORT": exp.SortArray.from_arg_list, 1447 "LIST_TRANSFORM": exp.Transform.from_arg_list, 1448 "LIST_VALUE": lambda args: exp.Array(expressions=args), 1449 "MAKE_DATE": exp.DateFromParts.from_arg_list, 1450 "MAKE_TIME": exp.TimeFromParts.from_arg_list, 1451 "MAKE_TIMESTAMP": _build_make_timestamp, 1452 "QUANTILE_CONT": exp.PercentileCont.from_arg_list, 1453 "QUANTILE_DISC": exp.PercentileDisc.from_arg_list, 1454 "RANGE": _build_generate_series(end_exclusive=True), 1455 "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract), 1456 "REGEXP_EXTRACT_ALL": build_regexp_extract(exp.RegexpExtractAll), 1457 "REGEXP_MATCHES": exp.RegexpLike.from_arg_list, 1458 "REGEXP_REPLACE": lambda args: exp.RegexpReplace( 1459 this=seq_get(args, 0), 1460 expression=seq_get(args, 1), 1461 replacement=seq_get(args, 2), 1462 modifiers=seq_get(args, 3), 1463 single_replace=True, 1464 ), 1465 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 1466 "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"), 1467 "STRING_SPLIT": exp.Split.from_arg_list, 1468 "STRING_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1469 "STRING_TO_ARRAY": exp.Split.from_arg_list, 1470 "STRPTIME": build_formatted_time(exp.StrToTime, "duckdb"), 1471 "STRUCT_PACK": exp.Struct.from_arg_list, 1472 "STR_SPLIT": exp.Split.from_arg_list, 1473 "STR_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1474 "TIME_BUCKET": exp.DateBin.from_arg_list, 1475 "TO_TIMESTAMP": exp.UnixToTime.from_arg_list, 1476 "UNNEST": exp.Explode.from_arg_list, 1477 "XOR": binary_from_function(exp.BitwiseXor), 1478 } 1479 1480 FUNCTIONS.pop("DATE_SUB") 1481 FUNCTIONS.pop("GLOB") 1482 1483 FUNCTION_PARSERS = { 1484 **parser.Parser.FUNCTION_PARSERS, 1485 **dict.fromkeys( 1486 ("GROUP_CONCAT", "LISTAGG", "STRINGAGG"), lambda self: 
            ),
        }
        FUNCTION_PARSERS.pop("DECODE")

        NO_PAREN_FUNCTION_PARSERS = {
            **parser.Parser.NO_PAREN_FUNCTION_PARSERS,
            "MAP": lambda self: self._parse_map(),
            "@": lambda self: exp.Abs(this=self._parse_bitwise()),
        }

        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
            TokenType.SEMI,
            TokenType.ANTI,
        }

        PLACEHOLDER_PARSERS = {
            **parser.Parser.PLACEHOLDER_PARSERS,
            TokenType.PARAMETER: lambda self: (
                self.expression(exp.Placeholder, this=self._prev.text)
                if self._match(TokenType.NUMBER) or self._match_set(self.ID_VAR_TOKENS)
                else None
            ),
        }

        TYPE_CONVERTERS = {
            # https://duckdb.org/docs/sql/data_types/numeric
            exp.DataType.Type.DECIMAL: build_default_decimal_type(precision=18, scale=3),
            # https://duckdb.org/docs/sql/data_types/text
            exp.DataType.Type.TEXT: lambda dtype: exp.DataType.build("TEXT"),
        }

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ATTACH: lambda self: self._parse_attach_detach(),
            TokenType.DETACH: lambda self: self._parse_attach_detach(is_attach=False),
            TokenType.FORCE: lambda self: self._parse_force(),
            TokenType.INSTALL: lambda self: self._parse_install(),
            TokenType.SHOW: lambda self: self._parse_show(),
        }

        SET_PARSERS = {
            **parser.Parser.SET_PARSERS,
            "VARIABLE": lambda self: self._parse_set_item_assignment("VARIABLE"),
        }

        def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]:
            index = self._index
            if not self._match_text_seq("LAMBDA"):
                return super()._parse_lambda(alias=alias)

            expressions = self._parse_csv(self._parse_lambda_arg)
            if not self._match(TokenType.COLON):
                self._retreat(index)
                return None

            this = self._replace_lambda(self._parse_assignment(), expressions)
            return self.expression(exp.Lambda, this=this, expressions=expressions, colon=True)

        def _parse_expression(self) -> t.Optional[exp.Expression]:
            # DuckDB supports prefix aliases, e.g. foo: 1
            if self._next and self._next.token_type == TokenType.COLON:
                alias = self._parse_id_var(tokens=self.ALIAS_TOKENS)
                self._match(TokenType.COLON)
                comments = self._prev_comments or []

                this = self._parse_assignment()
                if isinstance(this, exp.Expression):
                    # Moves the comment next to the alias in `alias: expr /* comment */`
                    comments += this.pop_comments() or []

                return self.expression(exp.Alias, comments=comments, this=this, alias=alias)

            return super()._parse_expression()

        def _parse_table(
            self,
            schema: bool = False,
            joins: bool = False,
            alias_tokens: t.Optional[t.Collection[TokenType]] = None,
            parse_bracket: bool = False,
            is_db_reference: bool = False,
            parse_partition: bool = False,
            consume_pipe: bool = False,
        ) -> t.Optional[exp.Expression]:
FROM foo: bar 1572 if self._next and self._next.token_type == TokenType.COLON: 1573 alias = self._parse_table_alias( 1574 alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS 1575 ) 1576 self._match(TokenType.COLON) 1577 comments = self._prev_comments or [] 1578 else: 1579 alias = None 1580 comments = [] 1581 1582 table = super()._parse_table( 1583 schema=schema, 1584 joins=joins, 1585 alias_tokens=alias_tokens, 1586 parse_bracket=parse_bracket, 1587 is_db_reference=is_db_reference, 1588 parse_partition=parse_partition, 1589 ) 1590 if isinstance(table, exp.Expression) and isinstance(alias, exp.TableAlias): 1591 # Moves the comment next to the alias in `alias: table /* comment */` 1592 comments += table.pop_comments() or [] 1593 alias.comments = alias.pop_comments() + comments 1594 table.set("alias", alias) 1595 1596 return table 1597 1598 def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]: 1599 # https://duckdb.org/docs/sql/samples.html 1600 sample = super()._parse_table_sample(as_modifier=as_modifier) 1601 if sample and not sample.args.get("method"): 1602 if sample.args.get("size"): 1603 sample.set("method", exp.var("RESERVOIR")) 1604 else: 1605 sample.set("method", exp.var("SYSTEM")) 1606 1607 return sample 1608 1609 def _parse_bracket( 1610 self, this: t.Optional[exp.Expression] = None 1611 ) -> t.Optional[exp.Expression]: 1612 bracket = super()._parse_bracket(this) 1613 1614 if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket): 1615 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 1616 bracket.set("returns_list_for_maps", True) 1617 1618 return bracket 1619 1620 def _parse_map(self) -> exp.ToMap | exp.Map: 1621 if self._match(TokenType.L_BRACE, advance=False): 1622 return self.expression(exp.ToMap, this=self._parse_bracket()) 1623 1624 args = self._parse_wrapped_csv(self._parse_assignment) 1625 return self.expression(exp.Map, keys=seq_get(args, 0), values=seq_get(args, 1)) 1626 1627 def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]: 1628 return self._parse_field_def() 1629 1630 def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]: 1631 if len(aggregations) == 1: 1632 return super()._pivot_column_names(aggregations) 1633 return pivot_column_names(aggregations, dialect="duckdb") 1634 1635 def _parse_attach_detach(self, is_attach=True) -> exp.Attach | exp.Detach: 1636 def _parse_attach_option() -> exp.AttachOption: 1637 return self.expression( 1638 exp.AttachOption, 1639 this=self._parse_var(any_token=True), 1640 expression=self._parse_field(any_token=True), 1641 ) 1642 1643 self._match(TokenType.DATABASE) 1644 exists = self._parse_exists(not_=is_attach) 1645 this = self._parse_alias(self._parse_primary_or_var(), explicit=True) 1646 1647 if self._match(TokenType.L_PAREN, advance=False): 1648 expressions = self._parse_wrapped_csv(_parse_attach_option) 1649 else: 1650 expressions = None 1651 1652 return ( 1653 self.expression(exp.Attach, this=this, exists=exists, expressions=expressions) 1654 if is_attach 1655 else self.expression(exp.Detach, this=this, exists=exists) 1656 ) 1657 1658 def _parse_show_duckdb(self, this: str) -> exp.Show: 1659 return self.expression(exp.Show, this=this) 1660 1661 def _parse_force(self) -> exp.Install | exp.Command: 1662 # FORCE can only be followed by INSTALL or CHECKPOINT 1663 # In the case of CHECKPOINT, we fallback 1664 if not self._match(TokenType.INSTALL): 1665 return self._parse_as_command(self._prev) 1666 
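# A minimal usage sketch (illustrative, not a verified doctest; "httpfs" is just an example
# extension name):
#   import sqlglot
#   sqlglot.parse_one("FORCE INSTALL httpfs", read="duckdb")   # parsed as exp.Install(..., force=True)
#   sqlglot.parse_one("FORCE CHECKPOINT", read="duckdb")       # falls back to exp.Command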
1667 return self._parse_install(force=True) 1668 1669 def _parse_install(self, force: bool = False) -> exp.Install: 1670 return self.expression( 1671 exp.Install, 1672 this=self._parse_id_var(), 1673 from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None, 1674 force=force, 1675 ) 1676 1677 def _parse_primary(self) -> t.Optional[exp.Expression]: 1678 if self._match_pair(TokenType.HASH, TokenType.NUMBER): 1679 return exp.PositionalColumn(this=exp.Literal.number(self._prev.text)) 1680 1681 return super()._parse_primary() 1682 1683 class Generator(generator.Generator): 1684 PARAMETER_TOKEN = "$" 1685 NAMED_PLACEHOLDER_TOKEN = "$" 1686 JOIN_HINTS = False 1687 TABLE_HINTS = False 1688 QUERY_HINTS = False 1689 LIMIT_FETCH = "LIMIT" 1690 STRUCT_DELIMITER = ("(", ")") 1691 RENAME_TABLE_WITH_DB = False 1692 NVL2_SUPPORTED = False 1693 SEMI_ANTI_JOIN_WITH_SIDE = False 1694 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1695 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1696 LAST_DAY_SUPPORTS_DATE_PART = False 1697 JSON_KEY_VALUE_PAIR_SEP = "," 1698 IGNORE_NULLS_IN_FUNC = True 1699 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1700 SUPPORTS_CREATE_TABLE_LIKE = False 1701 MULTI_ARG_DISTINCT = False 1702 CAN_IMPLEMENT_ARRAY_ANY = True 1703 SUPPORTS_TO_NUMBER = False 1704 SUPPORTS_WINDOW_EXCLUDE = True 1705 COPY_HAS_INTO_KEYWORD = False 1706 STAR_EXCEPT = "EXCLUDE" 1707 PAD_FILL_PATTERN_IS_REQUIRED = True 1708 ARRAY_SIZE_DIM_REQUIRED = False 1709 NORMALIZE_EXTRACT_DATE_PARTS = True 1710 SUPPORTS_LIKE_QUANTIFIERS = False 1711 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1712 1713 TRANSFORMS = { 1714 **generator.Generator.TRANSFORMS, 1715 exp.AnyValue: _anyvalue_sql, 1716 exp.ApproxDistinct: approx_count_distinct_sql, 1717 exp.Boolnot: _boolnot_sql, 1718 exp.Booland: _booland_sql, 1719 exp.Boolor: _boolor_sql, 1720 exp.Array: transforms.preprocess( 1721 [transforms.inherit_struct_field_names], 1722 generator=inline_array_unless_query, 1723 ), 1724 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1725 exp.ArrayCompact: array_compact_sql, 1726 exp.ArrayConstructCompact: lambda self, e: self.sql( 1727 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1728 ), 1729 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1730 exp.ArrayFilter: rename_func("LIST_FILTER"), 1731 exp.ArrayInsert: _array_insert_sql, 1732 exp.ArrayRemove: remove_from_array_using_filter, 1733 exp.ArraySort: _array_sort_sql, 1734 exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True), 1735 exp.ArraySum: rename_func("LIST_SUM"), 1736 exp.ArrayUniqueAgg: lambda self, e: self.func( 1737 "LIST", exp.Distinct(expressions=[e.this]) 1738 ), 1739 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1740 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1741 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1742 exp.BitwiseAndAgg: _bitwise_agg_sql, 1743 exp.BitwiseLeftShift: _bitshift_sql, 1744 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1745 exp.BitwiseOrAgg: _bitwise_agg_sql, 1746 exp.BitwiseRightShift: _bitshift_sql, 1747 exp.BitwiseXorAgg: _bitwise_agg_sql, 1748 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1749 exp.Corr: lambda self, e: self._corr_sql(e), 1750 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1751 exp.CurrentTime: lambda *_: "CURRENT_TIME", 1752 exp.CurrentTimestamp: lambda self, e: self.sql( 1753 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1754 ) 1755 
if e.args.get("sysdate") 1756 else "CURRENT_TIMESTAMP", 1757 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1758 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1759 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1760 exp.DayOfWeekIso: rename_func("ISODOW"), 1761 exp.DayOfYear: rename_func("DAYOFYEAR"), 1762 exp.Dayname: lambda self, e: ( 1763 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1764 if e.args.get("abbreviated") 1765 else self.func("DAYNAME", e.this) 1766 ), 1767 exp.Monthname: lambda self, e: ( 1768 self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1769 if e.args.get("abbreviated") 1770 else self.func("MONTHNAME", e.this) 1771 ), 1772 exp.DataType: _datatype_sql, 1773 exp.Date: _date_sql, 1774 exp.DateAdd: _date_delta_to_binary_interval_op(), 1775 exp.DateFromParts: _date_from_parts_sql, 1776 exp.DateSub: _date_delta_to_binary_interval_op(), 1777 exp.DateDiff: _date_diff_sql, 1778 exp.DateStrToDate: datestrtodate_sql, 1779 exp.Datetime: no_datetime_sql, 1780 exp.DatetimeDiff: _date_diff_sql, 1781 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1782 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1783 exp.DateToDi: lambda self, 1784 e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)", 1785 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1786 exp.DiToDate: lambda self, 1787 e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)", 1788 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1789 exp.EqualNull: lambda self, e: self.sql( 1790 exp.NullSafeEQ(this=e.this, expression=e.expression) 1791 ), 1792 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1793 exp.GenerateDateArray: _generate_datetime_array_sql, 1794 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1795 exp.Getbit: getbit_sql, 1796 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 1797 exp.Explode: rename_func("UNNEST"), 1798 exp.IntDiv: lambda self, e: self.binary(e, "//"), 1799 exp.IsInf: rename_func("ISINF"), 1800 exp.IsNan: rename_func("ISNAN"), 1801 exp.IsNullValue: lambda self, e: self.sql( 1802 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1803 ), 1804 exp.IsArray: lambda self, e: self.sql( 1805 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1806 ), 1807 exp.Ceil: _ceil_floor, 1808 exp.Floor: _ceil_floor, 1809 exp.JSONBExists: rename_func("JSON_EXISTS"), 1810 exp.JSONExtract: _arrow_json_extract_sql, 1811 exp.JSONExtractArray: _json_extract_value_array_sql, 1812 exp.JSONFormat: _json_format_sql, 1813 exp.JSONValueArray: _json_extract_value_array_sql, 1814 exp.Lateral: explode_to_unnest_sql, 1815 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1816 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1817 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1818 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1819 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1820 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1821 exp.BoolxorAgg: _boolxor_agg_sql, 1822 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1823 exp.Initcap: _initcap_sql, 1824 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1825 exp.SHA1Digest: lambda self, e: self.func("UNHEX", self.func("SHA1", e.this)), 1826 exp.SHA2Digest: lambda self, e: self.func("UNHEX", sha2_digest_sql(self, e)), 1827 exp.MonthsBetween: 
months_between_sql, 1828 exp.NextDay: _day_navigation_sql, 1829 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1830 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1831 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 1832 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1833 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1834 exp.PreviousDay: _day_navigation_sql, 1835 exp.RegexpReplace: lambda self, e: self.func( 1836 "REGEXP_REPLACE", 1837 e.this, 1838 e.expression, 1839 e.args.get("replacement"), 1840 regexp_replace_global_modifier(e), 1841 ), 1842 exp.RegexpLike: rename_func("REGEXP_MATCHES"), 1843 exp.RegexpILike: lambda self, e: self.func( 1844 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1845 ), 1846 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1847 exp.RegrValx: _regr_val_sql, 1848 exp.RegrValy: _regr_val_sql, 1849 exp.Return: lambda self, e: self.sql(e, "this"), 1850 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1851 exp.Rand: rename_func("RANDOM"), 1852 exp.SHA2: sha256_sql, 1853 exp.Split: rename_func("STR_SPLIT"), 1854 exp.SortArray: _sort_array_sql, 1855 exp.StrPosition: strposition_sql, 1856 exp.StrToUnix: lambda self, e: self.func( 1857 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1858 ), 1859 exp.Struct: _struct_sql, 1860 exp.Transform: rename_func("LIST_TRANSFORM"), 1861 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1862 exp.TimeSub: _date_delta_to_binary_interval_op(), 1863 exp.Time: no_time_sql, 1864 exp.TimeDiff: _timediff_sql, 1865 exp.Timestamp: no_timestamp_sql, 1866 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1867 exp.TimestampDiff: lambda self, e: self.func( 1868 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1869 ), 1870 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1871 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)), 1872 exp.TimeStrToTime: timestrtotime_sql, 1873 exp.TimeStrToUnix: lambda self, e: self.func( 1874 "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP) 1875 ), 1876 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1877 exp.ToBoolean: _to_boolean_sql, 1878 exp.TimeToUnix: rename_func("EPOCH"), 1879 exp.TsOrDiToDi: lambda self, 1880 e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)", 1881 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1882 exp.TsOrDsDiff: lambda self, e: self.func( 1883 "DATE_DIFF", 1884 f"'{e.args.get('unit') or 'DAY'}'", 1885 exp.cast(e.expression, exp.DataType.Type.TIMESTAMP), 1886 exp.cast(e.this, exp.DataType.Type.TIMESTAMP), 1887 ), 1888 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1889 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1890 exp.UnixSeconds: lambda self, e: self.sql( 1891 exp.cast( 1892 self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT 1893 ) 1894 ), 1895 exp.UnixToStr: lambda self, e: self.func( 1896 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 1897 ), 1898 exp.DatetimeTrunc: lambda self, e: self.func( 1899 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DataType.Type.DATETIME) 1900 ), 1901 exp.UnixToTime: _unix_to_time_sql, 1902 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 
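# Note on the diff-style entries above (illustrative; exact rendered output not verified):
# a TimestampDiff node such as TIMESTAMP_DIFF(end_ts, start_ts, DAY) is emitted with its
# operands swapped, i.e. roughly DATE_DIFF('DAY', start_ts, end_ts), to match DuckDB's
# (part, start, end) argument order.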
1903 exp.VariancePop: rename_func("VAR_POP"), 1904 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 1905 exp.YearOfWeek: lambda self, e: self.sql( 1906 exp.Extract( 1907 this=exp.Var(this="ISOYEAR"), 1908 expression=e.this, 1909 ) 1910 ), 1911 exp.YearOfWeekIso: lambda self, e: self.sql( 1912 exp.Extract( 1913 this=exp.Var(this="ISOYEAR"), 1914 expression=e.this, 1915 ) 1916 ), 1917 exp.Xor: _xor_sql, 1918 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1919 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1920 exp.DateBin: rename_func("TIME_BUCKET"), 1921 exp.LastDay: _last_day_sql, 1922 } 1923 1924 SUPPORTED_JSON_PATH_PARTS = { 1925 exp.JSONPathKey, 1926 exp.JSONPathRoot, 1927 exp.JSONPathSubscript, 1928 exp.JSONPathWildcard, 1929 } 1930 1931 TYPE_MAPPING = { 1932 **generator.Generator.TYPE_MAPPING, 1933 exp.DataType.Type.BINARY: "BLOB", 1934 exp.DataType.Type.BPCHAR: "TEXT", 1935 exp.DataType.Type.CHAR: "TEXT", 1936 exp.DataType.Type.DATETIME: "TIMESTAMP", 1937 exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)", 1938 exp.DataType.Type.FLOAT: "REAL", 1939 exp.DataType.Type.JSONB: "JSON", 1940 exp.DataType.Type.NCHAR: "TEXT", 1941 exp.DataType.Type.NVARCHAR: "TEXT", 1942 exp.DataType.Type.UINT: "UINTEGER", 1943 exp.DataType.Type.VARBINARY: "BLOB", 1944 exp.DataType.Type.ROWVERSION: "BLOB", 1945 exp.DataType.Type.VARCHAR: "TEXT", 1946 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ", 1947 exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP", 1948 exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S", 1949 exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS", 1950 exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS", 1951 exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)", 1952 } 1953 1954 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 1955 RESERVED_KEYWORDS = { 1956 "array", 1957 "analyse", 1958 "union", 1959 "all", 1960 "when", 1961 "in_p", 1962 "default", 1963 "create_p", 1964 "window", 1965 "asymmetric", 1966 "to", 1967 "else", 1968 "localtime", 1969 "from", 1970 "end_p", 1971 "select", 1972 "current_date", 1973 "foreign", 1974 "with", 1975 "grant", 1976 "session_user", 1977 "or", 1978 "except", 1979 "references", 1980 "fetch", 1981 "limit", 1982 "group_p", 1983 "leading", 1984 "into", 1985 "collate", 1986 "offset", 1987 "do", 1988 "then", 1989 "localtimestamp", 1990 "check_p", 1991 "lateral_p", 1992 "current_role", 1993 "where", 1994 "asc_p", 1995 "placing", 1996 "desc_p", 1997 "user", 1998 "unique", 1999 "initially", 2000 "column", 2001 "both", 2002 "some", 2003 "as", 2004 "any", 2005 "only", 2006 "deferrable", 2007 "null_p", 2008 "current_time", 2009 "true_p", 2010 "table", 2011 "case", 2012 "trailing", 2013 "variadic", 2014 "for", 2015 "on", 2016 "distinct", 2017 "false_p", 2018 "not", 2019 "constraint", 2020 "current_timestamp", 2021 "returning", 2022 "primary", 2023 "intersect", 2024 "having", 2025 "analyze", 2026 "current_user", 2027 "and", 2028 "cast", 2029 "symmetric", 2030 "using", 2031 "order", 2032 "current_catalog", 2033 } 2034 2035 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 2036 2037 # DuckDB doesn't generally support CREATE TABLE .. properties 2038 # https://duckdb.org/docs/sql/statements/create_table.html 2039 PROPERTIES_LOCATION = { 2040 prop: exp.Properties.Location.UNSUPPORTED 2041 for prop in generator.Generator.PROPERTIES_LOCATION 2042 } 2043 2044 # There are a few exceptions (e.g. 
temporary tables) which are supported or 2045 # can be transpiled to DuckDB, so we explicitly override them accordingly 2046 PROPERTIES_LOCATION[exp.LikeProperty] = exp.Properties.Location.POST_SCHEMA 2047 PROPERTIES_LOCATION[exp.TemporaryProperty] = exp.Properties.Location.POST_CREATE 2048 PROPERTIES_LOCATION[exp.ReturnsProperty] = exp.Properties.Location.POST_ALIAS 2049 PROPERTIES_LOCATION[exp.SequenceProperties] = exp.Properties.Location.POST_EXPRESSION 2050 2051 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = ( 2052 exp.FirstValue, 2053 exp.Lag, 2054 exp.LastValue, 2055 exp.Lead, 2056 exp.NthValue, 2057 ) 2058 2059 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 2060 ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse( 2061 """ 2062 WITH rand AS (SELECT :random_expr AS r), 2063 weights AS ( 2064 SELECT i, 1.0 / POWER(i, :s) AS w 2065 FROM RANGE(1, :n + 1) AS t(i) 2066 ), 2067 cdf AS ( 2068 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 2069 FROM weights 2070 ) 2071 SELECT MIN(i) 2072 FROM cdf 2073 WHERE p >= (SELECT r FROM rand) 2074 """ 2075 ) 2076 2077 # Template for NORMAL transpilation using Box-Muller transform 2078 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 2079 NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse( 2080 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 2081 ) 2082 2083 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 2084 SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse( 2085 "(ABS(HASH(:seed)) % 1000000) / 1000000.0" 2086 ) 2087 2088 # Template for generating signed and unsigned SEQ values within a specified range 2089 SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(f"{_SEQ_BASE} % :max_val") 2090 SEQ_SIGNED: exp.Expression = exp.maybe_parse( 2091 f"(CASE WHEN {_SEQ_BASE} % :max_val >= :half " 2092 f"THEN {_SEQ_BASE} % :max_val - :max_val " 2093 f"ELSE {_SEQ_BASE} % :max_val END)" 2094 ) 2095 2096 # Template for MAP_CAT transpilation - Snowflake semantics: 2097 # 1. Returns NULL if either input is NULL 2098 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 2099 # 3. Filters out entries with NULL values from the result 2100 MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse( 2101 """ 2102 CASE 2103 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 2104 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 2105 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 2106 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 2107 ), __x -> __x.value IS NOT NULL)) 2108 END 2109 """ 2110 ) 2111 2112 # Mappings for EXTRACT/DATE_PART transpilation 2113 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 2114 EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = { 2115 "WEEKISO": ("%V", "INTEGER"), 2116 "YEAROFWEEK": ("%G", "INTEGER"), 2117 "YEAROFWEEKISO": ("%G", "INTEGER"), 2118 "NANOSECOND": ("%n", "BIGINT"), 2119 } 2120 2121 # Maps epoch-based specifiers to DuckDB epoch functions 2122 EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = { 2123 "EPOCH_SECOND": "EPOCH", 2124 "EPOCH_MILLISECOND": "EPOCH_MS", 2125 "EPOCH_MICROSECOND": "EPOCH_US", 2126 "EPOCH_NANOSECOND": "EPOCH_NS", 2127 } 2128 2129 # Template for BITMAP_CONSTRUCT_AGG transpilation 2130 # 2131 # BACKGROUND: 2132 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 
2133 # Supports values in range 0-32767, this version returns NULL if any value is out of range 2134 # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg 2135 # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts 2136 # 2137 # Snowflake uses two different formats based on the number of unique values: 2138 # 2139 # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes 2140 # Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003) 2141 # Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes 2142 # Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex) 2143 # count v1 v2 v3 pad 2144 # 2145 # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes 2146 # Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes 2147 # Bytes 10+: Each value as 2-byte little-endian integer (no padding) 2148 # Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500 2149 # hdr ----9 zero bytes---- v1 v2 v3 v4 v5 2150 # 2151 # TEMPLATE STRUCTURE 2152 # 2153 # Phase 1 - Innermost subquery: Data preparation 2154 # SELECT LIST_SORT(...) AS l 2155 # - Aggregates all input values into a list, remove NULLs, duplicates and sorts 2156 # Result: Clean, sorted list of unique non-null integers stored as 'l' 2157 # 2158 # Phase 2 - Middle subquery: Hex string construction 2159 # LIST_TRANSFORM(...) 2160 # - Converts each integer to 2-byte little-endian hex representation 2161 # - & 255 extracts low byte, >> 8 extracts high byte 2162 # - LIST_REDUCE: Concatenates all hex pairs into single string 'h' 2163 # Result: Hex string of all values 2164 # 2165 # Phase 3 - Outer SELECT: Final bitmap assembly 2166 # LENGTH(l) < 5: 2167 # - Small format: 2-byte count (big-endian via %04X) + values + zero padding 2168 # LENGTH(l) >= 5: 2169 # - Large format: Fixed 10-byte header + values (no padding needed) 2170 # Result: Complete binary bitmap as BLOB 2171 # 2172 BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse( 2173 """ 2174 SELECT CASE 2175 WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL 2176 WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL 2177 WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) 2178 ELSE UNHEX('08000000000000000000' || h) 2179 END 2180 FROM ( 2181 SELECT l, COALESCE(LIST_REDUCE( 2182 LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), 2183 (__a, __b) -> __a || __b, '' 2184 ), '') AS h 2185 FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l) 2186 ) 2187 """ 2188 ) 2189 2190 # Template for RANDSTR transpilation - placeholders get replaced with actual parameters 2191 RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse( 2192 f""" 2193 SELECT LISTAGG( 2194 SUBSTRING( 2195 '{RANDSTR_CHAR_POOL}', 2196 1 + CAST(FLOOR(random_value * 62) AS INT), 2197 1 2198 ), 2199 '' 2200 ) 2201 FROM ( 2202 SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value 2203 FROM RANGE(:length) AS t(i) 2204 ) 2205 """, 2206 ) 2207 2208 # Template for MINHASH transpilation 2209 # Computes k minimum hash values across aggregated data using DuckDB list functions 2210 # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1} 2211 MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse( 2212 """ 2213 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 
'minhash', 'version', 1) 2214 FROM ( 2215 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 2216 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2217 ) 2218 """, 2219 ) 2220 2221 # Template for MINHASH_COMBINE transpilation 2222 # Combines multiple minhash signatures by taking element-wise minimum 2223 MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse( 2224 """ 2225 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2226 FROM ( 2227 SELECT 2228 pos AS idx, 2229 MIN(val) AS min_h 2230 FROM 2231 UNNEST(LIST(:expr)) AS _(sig), 2232 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2233 GROUP BY pos 2234 ) 2235 """, 2236 ) 2237 2238 # Template for APPROXIMATE_SIMILARITY transpilation 2239 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2240 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse( 2241 """ 2242 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2243 FROM ( 2244 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2245 FROM ( 2246 SELECT h, pos 2247 FROM UNNEST(LIST(:expr)) AS _(sig), 2248 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2249 ) 2250 GROUP BY pos 2251 ) 2252 """, 2253 ) 2254 2255 # Template for ARRAYS_ZIP transpilation 2256 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest 2257 # Uses RANGE + indexing to match Snowflake behavior 2258 ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse( 2259 """ 2260 CASE WHEN :null_check THEN NULL 2261 WHEN :all_empty_check THEN [:empty_struct] 2262 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2263 END 2264 """, 2265 ) 2266 2267 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2268 """ 2269 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2270 2271 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2272 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2273 2274 For 'END' kind, add the interval to get the end of the slice. 2275 For DATE type with 'END', cast result back to DATE to preserve type. 2276 """ 2277 date_expr = expression.this 2278 slice_length = expression.expression 2279 unit = expression.unit 2280 kind = expression.text("kind").upper() 2281 2282 # Create INTERVAL expression: INTERVAL 'N' UNIT 2283 interval_expr = exp.Interval(this=slice_length, unit=unit) 2284 2285 # Create base time_bucket expression 2286 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2287 2288 # Check if we need the end of the slice (default is start) 2289 if not kind == "END": 2290 # For 'START', return time_bucket directly 2291 return self.sql(time_bucket_expr) 2292 2293 # For 'END', add the interval to get end of slice 2294 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2295 2296 # If input is DATE type, cast result back to DATE to preserve type 2297 # DuckDB converts DATE to TIMESTAMP when adding intervals 2298 if date_expr.is_type(exp.DataType.Type.DATE): 2299 return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE)) 2300 2301 return self.sql(add_expr) 2302 2303 def bitmapbucketnumber_sql( 2304 self: DuckDB.Generator, expression: exp.BitmapBucketNumber 2305 ) -> str: 2306 """ 2307 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 
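Worked example (per the formula described below): BITMAP_BUCKET_NUMBER(1) and
BITMAP_BUCKET_NUMBER(32768) both fall into bucket 1, while BITMAP_BUCKET_NUMBER(32769)
falls into bucket 2.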
2308 2309 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2310 - Each bucket covers 32,768 values 2311 - Bucket numbering starts at 1 2312 - Formula: ((value - 1) // 32768) + 1 for positive values 2313 2314 For non-positive values (0 and negative), we use value // 32768 to avoid 2315 producing bucket 0 or positive bucket IDs for negative inputs. 2316 """ 2317 value = expression.this 2318 2319 positive_formula = ((value - 1) // 32768) + 1 2320 non_positive_formula = value // 32768 2321 2322 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2323 case_expr = ( 2324 exp.case() 2325 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2326 .else_(non_positive_formula) 2327 ) 2328 return self.sql(case_expr) 2329 2330 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2331 """ 2332 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2333 2334 Snowflake's BITMAP_BIT_POSITION behavior: 2335 - For n <= 0: returns ABS(n) % 32768 2336 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2337 """ 2338 this = expression.this 2339 2340 return self.sql( 2341 exp.Mod( 2342 this=exp.Paren( 2343 this=exp.If( 2344 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2345 true=this - exp.Literal.number(1), 2346 false=exp.Abs(this=this), 2347 ) 2348 ), 2349 expression=MAX_BIT_POSITION, 2350 ) 2351 ) 2352 2353 def bitmapconstructagg_sql( 2354 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2355 ) -> str: 2356 """ 2357 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2358 Uses a pre-parsed template with placeholders replaced by expression nodes. 2359 2360 Snowflake bitmap format: 2361 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2362 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2363 """ 2364 arg = expression.this 2365 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2366 2367 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2368 """ 2369 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2370 Uses a pre-parsed template with placeholders replaced by expression nodes. 2371 2372 RANDSTR(length, generator) generates a random string of specified length. 2373 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2374 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2375 - No generator: Use default seed value 2376 """ 2377 length = expression.this 2378 generator = expression.args.get("generator") 2379 2380 if generator: 2381 if isinstance(generator, exp.Rand): 2382 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2383 seed_value = generator.this or generator 2384 else: 2385 # Const/int or other expression - use as seed directly 2386 seed_value = generator 2387 else: 2388 # No generator specified, use default seed (arbitrary but deterministic) 2389 seed_value = exp.Literal.number(RANDSTR_SEED) 2390 2391 replacements = {"seed": seed_value, "length": length} 2392 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2393 2394 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2395 """ 2396 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 
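Tiny worked example of the CDF idea (assuming the template's weights 1 / POWER(i, s)):
with n = 3 and s = 1 the weights are 1, 1/2 and 1/3, so the cumulative probabilities are
roughly 0.545, 0.818 and 1.0; a uniform draw of 0.7 therefore selects element 2.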
2397 Uses a pre-parsed template with placeholders replaced by expression nodes. 2398 """ 2399 s = expression.this 2400 n = expression.args["elementcount"] 2401 gen = expression.args["gen"] 2402 2403 if not isinstance(gen, exp.Rand): 2404 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2405 random_expr: exp.Expression = exp.Div( 2406 this=exp.Paren( 2407 this=exp.Mod( 2408 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2409 expression=exp.Literal.number(1000000), 2410 ) 2411 ), 2412 expression=exp.Literal.number(1000000.0), 2413 ) 2414 else: 2415 # Use RANDOM() for non-deterministic output 2416 random_expr = exp.Rand() 2417 2418 replacements = {"s": s, "n": n, "random_expr": random_expr} 2419 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2420 2421 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2422 """ 2423 TO_BINARY and TRY_TO_BINARY transpilation: 2424 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2425 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2426 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2427 2428 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2429 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2430 """ 2431 value = expression.this 2432 format_arg = expression.args.get("format") 2433 is_safe = expression.args.get("safe") 2434 2435 fmt = "HEX" 2436 if format_arg: 2437 fmt = format_arg.name.upper() 2438 2439 if expression.is_type(exp.DataType.Type.BINARY): 2440 if fmt == "UTF-8": 2441 result = self.func("ENCODE", value) 2442 elif fmt == "BASE64": 2443 result = self.func("FROM_BASE64", value) 2444 elif fmt == "HEX": 2445 result = self.func("UNHEX", value) 2446 else: 2447 if is_safe: 2448 return self.sql(exp.null()) 2449 else: 2450 self.unsupported(f"format {fmt} is not supported") 2451 result = self.func("TO_BINARY", value) 2452 2453 # Wrap with TRY() for TRY_TO_BINARY 2454 if is_safe: 2455 result = self.func("TRY", result) 2456 2457 return result 2458 2459 # Fallback, which needs to be updated if want to support transpilation from other dialects than Snowflake 2460 return self.func("TO_BINARY", value) 2461 2462 def _greatest_least_sql( 2463 self: DuckDB.Generator, expression: exp.Greatest | exp.Least 2464 ) -> str: 2465 """ 2466 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 
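For example (see the cases below), a BigQuery-style GREATEST(1, NULL, 3) must return NULL,
whereas DuckDB's native GREATEST ignores the NULL and returns 3; the CASE expression built
here emulates the former when ignore_nulls is not set.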
2467 2468 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2469 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2470 """ 2471 # Get all arguments 2472 all_args = [expression.this, *expression.expressions] 2473 fallback_sql = self.function_fallback_sql(expression) 2474 2475 if expression.args.get("ignore_nulls"): 2476 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2477 return self.sql(fallback_sql) 2478 2479 # return NULL if any argument is NULL 2480 case_expr = exp.case().when( 2481 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2482 exp.null(), 2483 copy=False, 2484 ) 2485 case_expr.set("default", fallback_sql) 2486 return self.sql(case_expr) 2487 2488 def generator_sql(self, expression: exp.Generator) -> str: 2489 # Transpile Snowflake GENERATOR to DuckDB range() 2490 rowcount = expression.args.get("rowcount") 2491 time_limit = expression.args.get("time_limit") 2492 2493 if time_limit: 2494 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2495 2496 if not rowcount: 2497 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2498 return self.func("range", exp.Literal.number(0)) 2499 2500 return self.func("range", rowcount) 2501 2502 def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str: 2503 return self._greatest_least_sql(expression) 2504 2505 def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str: 2506 return self._greatest_least_sql(expression) 2507 2508 def lambda_sql( 2509 self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True 2510 ) -> str: 2511 if expression.args.get("colon"): 2512 prefix = "LAMBDA " 2513 arrow_sep = ":" 2514 wrap = False 2515 else: 2516 prefix = "" 2517 2518 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2519 return f"{prefix}{lambda_sql}" 2520 2521 def show_sql(self, expression: exp.Show) -> str: 2522 return f"SHOW {expression.name}" 2523 2524 def install_sql(self, expression: exp.Install) -> str: 2525 force = "FORCE " if expression.args.get("force") else "" 2526 this = self.sql(expression, "this") 2527 from_clause = expression.args.get("from_") 2528 from_clause = f" FROM {from_clause}" if from_clause else "" 2529 return f"{force}INSTALL {this}{from_clause}" 2530 2531 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2532 self.unsupported( 2533 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. 
" 2534 ) 2535 return self.function_fallback_sql(expression) 2536 2537 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2538 return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ)) 2539 2540 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2541 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2542 target_type = expression.args.get("target_type") 2543 needs_tz = target_type and target_type.this in ( 2544 exp.DataType.Type.TIMESTAMPLTZ, 2545 exp.DataType.Type.TIMESTAMPTZ, 2546 ) 2547 2548 if expression.args.get("safe"): 2549 formatted_time = self.format_time(expression) 2550 cast_type = ( 2551 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2552 ) 2553 return self.sql( 2554 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2555 ) 2556 2557 base_sql = str_to_time_sql(self, expression) 2558 if needs_tz: 2559 return self.sql( 2560 exp.cast( 2561 base_sql, 2562 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2563 ) 2564 ) 2565 return base_sql 2566 2567 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2568 formatted_time = self.format_time(expression) 2569 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2570 return self.sql( 2571 exp.cast( 2572 self.func(function_name, expression.this, formatted_time), 2573 exp.DataType(this=exp.DataType.Type.DATE), 2574 ) 2575 ) 2576 2577 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2578 this = expression.this 2579 time_format = self.format_time(expression) 2580 safe = expression.args.get("safe") 2581 time_type = exp.DataType.build("TIME", dialect="duckdb") 2582 cast_expr = exp.TryCast if safe else exp.Cast 2583 2584 if time_format: 2585 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2586 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2587 return self.sql(cast_expr(this=strptime, to=time_type)) 2588 2589 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME): 2590 return self.sql(this) 2591 2592 return self.sql(cast_expr(this=this, to=time_type)) 2593 2594 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2595 if not expression.this: 2596 return "CURRENT_DATE" 2597 2598 expr = exp.Cast( 2599 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2600 to=exp.DataType(this=exp.DataType.Type.DATE), 2601 ) 2602 return self.sql(expr) 2603 2604 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2605 arg = expression.this 2606 if expression.args.get("safe"): 2607 return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null())) 2608 return self.func("JSON", arg) 2609 2610 def normal_sql(self, expression: exp.Normal) -> str: 2611 """ 2612 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2613 2614 Uses the Box-Muller transform via NORMAL_TEMPLATE. 
2615 """ 2616 mean = expression.this 2617 stddev = expression.args["stddev"] 2618 gen: exp.Expression = expression.args["gen"] 2619 2620 # Build two uniform random values [0, 1) for Box-Muller transform 2621 if isinstance(gen, exp.Rand) and gen.this is None: 2622 u1: exp.Expression = exp.Rand() 2623 u2: exp.Expression = exp.Rand() 2624 else: 2625 # Seeded: derive two values using HASH with different inputs 2626 seed = gen.this if isinstance(gen, exp.Rand) else gen 2627 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2628 u2 = exp.replace_placeholders( 2629 self.SEEDED_RANDOM_TEMPLATE, 2630 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2631 ) 2632 2633 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2634 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2635 2636 def uniform_sql(self, expression: exp.Uniform) -> str: 2637 """ 2638 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2639 2640 UNIFORM returns a random value in [min, max]: 2641 - Integer result if both min and max are integers 2642 - Float result if either min or max is a float 2643 """ 2644 min_val = expression.this 2645 max_val = expression.expression 2646 gen = expression.args.get("gen") 2647 2648 # Determine if result should be integer (both bounds are integers). 2649 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2650 is_int_result = min_val.is_int and max_val.is_int 2651 2652 # Build the random value expression [0, 1) 2653 if not isinstance(gen, exp.Rand): 2654 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2655 random_expr: exp.Expression = exp.Div( 2656 this=exp.Paren( 2657 this=exp.Mod( 2658 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2659 expression=exp.Literal.number(1000000), 2660 ) 2661 ), 2662 expression=exp.Literal.number(1000000.0), 2663 ) 2664 else: 2665 random_expr = exp.Rand() 2666 2667 # Build: min + random * (max - min [+ 1 for int]) 2668 range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val) 2669 if is_int_result: 2670 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2671 2672 result: exp.Expression = exp.Add( 2673 this=min_val, 2674 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2675 ) 2676 2677 if is_int_result: 2678 result = exp.Cast( 2679 this=exp.Floor(this=result), 2680 to=exp.DataType.build("BIGINT"), 2681 ) 2682 2683 return self.sql(result) 2684 2685 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2686 nano = expression.args.get("nano") 2687 overflow = expression.args.get("overflow") 2688 2689 # Snowflake's TIME_FROM_PARTS supports overflow 2690 if overflow: 2691 hour = expression.args["hour"] 2692 minute = expression.args["min"] 2693 sec = expression.args["sec"] 2694 2695 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2696 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2697 try: 2698 h_val = hour.to_py() 2699 m_val = minute.to_py() 2700 s_val = sec.to_py() 2701 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2702 return rename_func("MAKE_TIME")(self, expression) 2703 except ValueError: 2704 pass 2705 2706 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2707 if nano: 2708 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2709 2710 total_seconds = ( 2711 hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2712 ) 2713 2714 return self.sql( 2715 exp.Add( 2716 
this=exp.Cast( 2717 this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME") 2718 ), 2719 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2720 ) 2721 ) 2722 2723 # Default: MAKE_TIME 2724 if nano: 2725 expression.set( 2726 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2727 ) 2728 2729 return rename_func("MAKE_TIME")(self, expression) 2730 2731 def extract_sql(self, expression: exp.Extract) -> str: 2732 """ 2733 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2734 2735 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2736 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2737 """ 2738 this = expression.this 2739 datetime_expr = expression.expression 2740 2741 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2742 # because Snowflake applies server timezone while DuckDB uses local timezone 2743 if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ): 2744 self.unsupported( 2745 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2746 ) 2747 2748 part_name = this.name.upper() 2749 2750 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2751 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2752 2753 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2754 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2755 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2756 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 2757 ) 2758 2759 if is_nano_time: 2760 self.unsupported( 2761 "Parameter NANOSECOND is not supported with TIME type in DuckDB" 2762 ) 2763 return self.sql( 2764 exp.cast( 2765 exp.Mul( 2766 this=exp.Extract( 2767 this=exp.var("MICROSECOND"), expression=datetime_expr 2768 ), 2769 expression=exp.Literal.number(1000), 2770 ), 2771 exp.DataType.build(cast_type, dialect="duckdb"), 2772 ) 2773 ) 2774 2775 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2776 strftime_input = datetime_expr 2777 if part_name == "NANOSECOND": 2778 strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS) 2779 2780 return self.sql( 2781 exp.cast( 2782 exp.Anonymous( 2783 this="STRFTIME", 2784 expressions=[strftime_input, exp.Literal.string(fmt)], 2785 ), 2786 exp.DataType.build(cast_type, dialect="duckdb"), 2787 ) 2788 ) 2789 2790 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2791 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2792 result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2793 # EPOCH returns float, cast to BIGINT for integer result 2794 if part_name == "EPOCH_SECOND": 2795 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2796 return self.sql(result) 2797 2798 return super().extract_sql(expression) 2799 2800 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2801 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2802 date_expr = expression.this 2803 time_expr = expression.expression 2804 2805 if date_expr is not None and time_expr is not None: 2806 # In DuckDB, DATE + TIME produces TIMESTAMP 2807 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2808 2809 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 
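# For instance (illustrative), a seconds value of 5 with a nano part of 123456789 is folded
# below into 5 + 123456789 / 1000000000.0 before MAKE_TIMESTAMP is emitted; a milli part,
# when present, is folded the same way with a 1000.0 divisor.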
2810 sec = expression.args.get("sec") 2811 if sec is None: 2812 # This shouldn't happen with valid input, but handle gracefully 2813 return rename_func("MAKE_TIMESTAMP")(self, expression) 2814 2815 milli = expression.args.get("milli") 2816 if milli is not None: 2817 sec += milli.pop() / exp.Literal.number(1000.0) 2818 2819 nano = expression.args.get("nano") 2820 if nano is not None: 2821 sec += nano.pop() / exp.Literal.number(1000000000.0) 2822 2823 if milli or nano: 2824 expression.set("sec", sec) 2825 2826 return rename_func("MAKE_TIMESTAMP")(self, expression) 2827 2828 @unsupported_args("nano") 2829 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2830 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2831 if nano := expression.args.get("nano"): 2832 nano.pop() 2833 2834 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2835 return f"CAST({timestamp} AS TIMESTAMPTZ)" 2836 2837 @unsupported_args("nano") 2838 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2839 # Extract zone before popping 2840 zone = expression.args.get("zone") 2841 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2842 if zone: 2843 zone = zone.pop() 2844 2845 if nano := expression.args.get("nano"): 2846 nano.pop() 2847 2848 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2849 2850 if zone: 2851 # Use AT TIME ZONE to apply the explicit timezone 2852 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2853 2854 return timestamp 2855 2856 def tablesample_sql( 2857 self, 2858 expression: exp.TableSample, 2859 tablesample_keyword: t.Optional[str] = None, 2860 ) -> str: 2861 if not isinstance(expression.parent, exp.Select): 2862 # This sample clause only applies to a single source, not the entire resulting relation 2863 tablesample_keyword = "TABLESAMPLE" 2864 2865 if expression.args.get("size"): 2866 method = expression.args.get("method") 2867 if method and method.name.upper() != "RESERVOIR": 2868 self.unsupported( 2869 f"Sampling method {method} is not supported with a discrete sample count, " 2870 "defaulting to reservoir sampling" 2871 ) 2872 expression.set("method", exp.var("RESERVOIR")) 2873 2874 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 2875 2876 def columndef_sql(self, expression: exp.ColumnDef, sep: str = " ") -> str: 2877 if isinstance(expression.parent, exp.UserDefinedFunction): 2878 return self.sql(expression, "this") 2879 return super().columndef_sql(expression, sep) 2880 2881 def join_sql(self, expression: exp.Join) -> str: 2882 if ( 2883 not expression.args.get("using") 2884 and not expression.args.get("on") 2885 and not expression.method 2886 and (expression.kind in ("", "INNER", "OUTER")) 2887 ): 2888 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2889 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2890 if isinstance(expression.this, exp.Unnest): 2891 return super().join_sql(expression.on(exp.true())) 2892 2893 expression.set("side", None) 2894 expression.set("kind", None) 2895 2896 return super().join_sql(expression) 2897 2898 def generateseries_sql(self, expression: exp.GenerateSeries) -> str: 2899 # GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b) 2900 if expression.args.get("is_end_exclusive"): 2901 return rename_func("RANGE")(self, expression) 2902 2903 return self.function_fallback_sql(expression) 2904 2905 def countif_sql(self, expression: 
exp.CountIf) -> str: 2906 if self.dialect.version >= (1, 2): 2907 return self.function_fallback_sql(expression) 2908 2909 # https://github.com/tobymao/sqlglot/pull/4749 2910 return count_if_to_sum(self, expression) 2911 2912 def bracket_sql(self, expression: exp.Bracket) -> str: 2913 if self.dialect.version >= (1, 2): 2914 return super().bracket_sql(expression) 2915 2916 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2917 this = expression.this 2918 if isinstance(this, exp.Array): 2919 this.replace(exp.paren(this)) 2920 2921 bracket = super().bracket_sql(expression) 2922 2923 if not expression.args.get("returns_list_for_maps"): 2924 if not this.type: 2925 from sqlglot.optimizer.annotate_types import annotate_types 2926 2927 this = annotate_types(this, dialect=self.dialect) 2928 2929 if this.is_type(exp.DataType.Type.MAP): 2930 bracket = f"({bracket})[1]" 2931 2932 return bracket 2933 2934 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2935 func = expression.this 2936 2937 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2938 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2939 if isinstance(func, exp.ArrayAgg): 2940 if not isinstance(order := expression.expression, exp.Order): 2941 return self.sql(func) 2942 2943 # Save the original column for FILTER clause (before wrapping with Order) 2944 original_this = func.this 2945 2946 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 2947 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 2948 func.set( 2949 "this", 2950 exp.Order( 2951 this=func.this.copy(), 2952 expressions=order.expressions, 2953 ), 2954 ) 2955 2956 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 2957 # Use original_this (not the Order-wrapped version) for the FILTER condition 2958 array_agg_sql = self.function_fallback_sql(func) 2959 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 2960 2961 # For other functions (like PERCENTILES), use existing logic 2962 expression_sql = self.sql(expression, "expression") 2963 2964 if isinstance(func, exp.PERCENTILES): 2965 # Make the order key the first arg and slide the fraction to the right 2966 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 2967 order_col = expression.find(exp.Ordered) 2968 if order_col: 2969 func.set("expression", func.this) 2970 func.set("this", order_col.this) 2971 2972 this = self.sql(expression, "this").rstrip(")") 2973 2974 return f"{this}{expression_sql})" 2975 2976 def length_sql(self, expression: exp.Length) -> str: 2977 arg = expression.this 2978 2979 # Dialects like BQ and Snowflake also accept binary values as args, so 2980 # DDB will attempt to infer the type or resort to case/when resolution 2981 if not expression.args.get("binary") or arg.is_string: 2982 return self.func("LENGTH", arg) 2983 2984 if not arg.type: 2985 from sqlglot.optimizer.annotate_types import annotate_types 2986 2987 arg = annotate_types(arg, dialect=self.dialect) 2988 2989 if arg.is_type(*exp.DataType.TEXT_TYPES): 2990 return self.func("LENGTH", arg) 2991 2992 # We need these casts to make duckdb's static type checker happy 2993 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 2994 varchar = exp.cast(arg, exp.DataType.Type.VARCHAR) 2995 2996 case = ( 2997 exp.case(self.func("TYPEOF", arg)) 2998 .when("'BLOB'", self.func("OCTET_LENGTH", blob)) 2999 .else_( 3000 exp.Anonymous(this="LENGTH", 
expressions=[varchar]) 3001 ) # anonymous to break length_sql recursion 3002 ) 3003 3004 return self.sql(case) 3005 3006 def sha_sql(self, expression: exp.SHA) -> str: 3007 arg = expression.this 3008 3009 # If type is compatible with DuckDB or is an unknown type, use directly 3010 if ( 3011 arg.type 3012 and arg.type.this != exp.DataType.Type.UNKNOWN 3013 and not arg.is_type(*exp.DataType.TEXT_TYPES) 3014 and not _is_binary(arg) 3015 ): 3016 arg = exp.cast(arg, exp.DataType.Type.VARCHAR) 3017 3018 return self.func("SHA1", arg) 3019 3020 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3021 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3022 this = expression.this 3023 expr = expression.expression 3024 max_dist = expression.args.get("max_dist") 3025 3026 if max_dist is None: 3027 return self.func("LEVENSHTEIN", this, expr) 3028 3029 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3030 levenshtein = exp.Levenshtein(this=this, expression=expr) 3031 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3032 3033 def minhash_sql(self, expression: exp.Minhash) -> str: 3034 k = expression.this 3035 exprs = expression.expressions 3036 3037 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3038 self.unsupported( 3039 "MINHASH with multiple expressions or * requires manual query restructuring" 3040 ) 3041 return self.func("MINHASH", k, *exprs) 3042 3043 expr = exprs[0] 3044 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3045 return f"({self.sql(result)})" 3046 3047 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3048 expr = expression.this 3049 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3050 return f"({self.sql(result)})" 3051 3052 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3053 expr = expression.this 3054 result = exp.replace_placeholders( 3055 self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr 3056 ) 3057 return f"({self.sql(result)})" 3058 3059 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3060 args = expression.expressions 3061 3062 if not args: 3063 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3064 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3065 3066 # Build placeholder values for template 3067 lengths = [exp.Length(this=arg) for arg in args] 3068 max_len = ( 3069 lengths[0] 3070 if len(lengths) == 1 3071 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3072 ) 3073 3074 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3075 empty_struct = exp.func( 3076 "STRUCT", 3077 *[ 3078 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3079 for i in range(len(args)) 3080 ], 3081 ) 3082 3083 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3084 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3085 index = exp.column("__i") + 1 3086 transform_struct = exp.func( 3087 "STRUCT", 3088 *[ 3089 exp.PropertyEQ( 3090 this=exp.Literal.string(f"${i + 1}"), 3091 expression=exp.func("COALESCE", arg, exp.array())[index], 3092 ) 3093 for i, arg in enumerate(args) 3094 ], 3095 ) 3096 3097 result = exp.replace_placeholders( 3098 self.ARRAYS_ZIP_TEMPLATE.copy(), 3099 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3100 all_empty_check=exp.and_( 3101 *[ 3102 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3103 for arg in args 3104 ] 
3105 ), 3106 empty_struct=empty_struct, 3107 max_len=max_len, 3108 transform_struct=transform_struct, 3109 ) 3110 return self.sql(result) 3111 3112 def lower_sql(self, expression: exp.Lower) -> str: 3113 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3114 return _gen_with_cast_to_blob(self, expression, result_sql) 3115 3116 def upper_sql(self, expression: exp.Upper) -> str: 3117 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3118 return _gen_with_cast_to_blob(self, expression, result_sql) 3119 3120 def reverse_sql(self, expression: exp.Reverse) -> str: 3121 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3122 return _gen_with_cast_to_blob(self, expression, result_sql) 3123 3124 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3125 # DuckDB TO_BASE64 requires BLOB input 3126 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3127 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3128 result = expression.this 3129 3130 # Check if input is a string type - ENCODE only accepts VARCHAR 3131 if result.is_type(*exp.DataType.TEXT_TYPES): 3132 result = exp.Encode(this=result) 3133 3134 result = exp.ToBase64(this=result) 3135 3136 max_line_length = expression.args.get("max_line_length") 3137 alphabet = expression.args.get("alphabet") 3138 3139 # Handle custom alphabet by replacing standard chars with custom ones 3140 result = _apply_base64_alphabet_replacements(result, alphabet) 3141 3142 # Handle max_line_length by inserting newlines every N characters 3143 line_length = ( 3144 t.cast(int, max_line_length.to_py()) 3145 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3146 else 0 3147 ) 3148 if line_length > 0: 3149 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3150 result = exp.Trim( 3151 this=exp.RegexpReplace( 3152 this=result, 3153 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3154 replacement=exp.Concat( 3155 expressions=[exp.Literal.string("\\1"), newline.copy()] 3156 ), 3157 ), 3158 expression=newline, 3159 position="TRAILING", 3160 ) 3161 3162 return self.sql(result) 3163 3164 def replace_sql(self, expression: exp.Replace) -> str: 3165 result_sql = self.func( 3166 "REPLACE", 3167 _cast_to_varchar(expression.this), 3168 _cast_to_varchar(expression.expression), 3169 _cast_to_varchar(expression.args.get("replacement")), 3170 ) 3171 return _gen_with_cast_to_blob(self, expression, result_sql) 3172 3173 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3174 _prepare_binary_bitwise_args(expression) 3175 result_sql = self.binary(expression, op) 3176 return _gen_with_cast_to_blob(self, expression, result_sql) 3177 3178 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3179 _prepare_binary_bitwise_args(expression) 3180 result_sql = self.func("XOR", expression.this, expression.expression) 3181 return _gen_with_cast_to_blob(self, expression, result_sql) 3182 3183 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3184 this = expression.this 3185 key = expression.args.get("key") 3186 key_sql = key.name if isinstance(key, exp.Expression) else "" 3187 value_sql = self.sql(expression, "value") 3188 3189 kv_sql = f"{key_sql} := {value_sql}" 3190 3191 # If the input struct is empty e.g. 
transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3192 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3193 if isinstance(this, exp.Struct) and not this.expressions: 3194 return self.func("STRUCT_PACK", kv_sql) 3195 3196 return self.func("STRUCT_INSERT", this, kv_sql) 3197 3198 def mapcat_sql(self, expression: exp.MapCat) -> str: 3199 result = exp.replace_placeholders( 3200 self.MAPCAT_TEMPLATE.copy(), 3201 map1=expression.this, 3202 map2=expression.expression, 3203 ) 3204 return self.sql(result) 3205 3206 def startswith_sql(self, expression: exp.StartsWith) -> str: 3207 return self.func( 3208 "STARTS_WITH", 3209 _cast_to_varchar(expression.this), 3210 _cast_to_varchar(expression.expression), 3211 ) 3212 3213 def space_sql(self, expression: exp.Space) -> str: 3214 # DuckDB's REPEAT requires BIGINT for the count parameter 3215 return self.sql( 3216 exp.Repeat( 3217 this=exp.Literal.string(" "), 3218 times=exp.cast(expression.this, exp.DataType.Type.BIGINT), 3219 ) 3220 ) 3221 3222 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3223 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3224 if isinstance(expression.this, exp.Generator): 3225 # Preserve alias, joins, and other table-level args 3226 table = exp.Table( 3227 this=expression.this, 3228 alias=expression.args.get("alias"), 3229 joins=expression.args.get("joins"), 3230 ) 3231 return self.sql(table) 3232 3233 return super().tablefromrows_sql(expression) 3234 3235 def unnest_sql(self, expression: exp.Unnest) -> str: 3236 explode_array = expression.args.get("explode_array") 3237 if explode_array: 3238 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3239 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3240 expression.expressions.append( 3241 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3242 ) 3243 3244 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3245 alias = expression.args.get("alias") 3246 if isinstance(alias, exp.TableAlias): 3247 expression.set("alias", None) 3248 if alias.columns: 3249 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3250 3251 unnest_sql = super().unnest_sql(expression) 3252 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3253 return self.sql(select) 3254 3255 return super().unnest_sql(expression) 3256 3257 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3258 this = expression.this 3259 3260 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3261 # DuckDB should render IGNORE NULLS only for the general-purpose 3262 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 3263 return super().ignorenulls_sql(expression) 3264 3265 if isinstance(this, exp.First): 3266 this = exp.AnyValue(this=this.this) 3267 3268 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3269 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3270 3271 return self.sql(this) 3272 3273 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3274 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3275 # DuckDB should render RESPECT NULLS only for the general-purpose 3276 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 
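# For any other argument (e.g. an aggregate like ARRAY_AGG(x RESPECT NULLS)), the fallback
# below drops the modifier and emits an "unsupported" warning instead.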
3277 return super().respectnulls_sql(expression) 3278 3279 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3280 return self.sql(expression, "this") 3281 3282 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3283 this = self.sql(expression, "this") 3284 null_text = self.sql(expression, "null") 3285 3286 if null_text: 3287 this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))" 3288 3289 return self.func("ARRAY_TO_STRING", this, expression.expression) 3290 3291 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 3292 this = expression.this 3293 group = expression.args.get("group") 3294 params = expression.args.get("parameters") 3295 position = expression.args.get("position") 3296 occurrence = expression.args.get("occurrence") 3297 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3298 3299 if position and (not position.is_int or position.to_py() > 1): 3300 this = exp.Substring(this=this, start=position) 3301 3302 if null_if_pos_overflow: 3303 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3304 3305 # Do not render group if there is no following argument, 3306 # and it's the default value for this dialect 3307 if ( 3308 not params 3309 and group 3310 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3311 ): 3312 group = None 3313 3314 if occurrence and (not occurrence.is_int or occurrence.to_py() > 1): 3315 return self.func( 3316 "ARRAY_EXTRACT", 3317 self.func("REGEXP_EXTRACT_ALL", this, expression.expression, group, params), 3318 exp.Literal.number(occurrence), 3319 ) 3320 3321 return self.func("REGEXP_EXTRACT", this, expression.expression, group, params) 3322 3323 @unsupported_args("culture") 3324 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 3325 fmt = expression.args.get("format") 3326 if fmt and fmt.is_int: 3327 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 3328 3329 self.unsupported("Only integer formats are supported by NumberToStr") 3330 return self.function_fallback_sql(expression) 3331 3332 def autoincrementcolumnconstraint_sql(self, _) -> str: 3333 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 3334 return "" 3335 3336 def aliases_sql(self, expression: exp.Aliases) -> str: 3337 this = expression.this 3338 if isinstance(this, exp.Posexplode): 3339 return self.posexplode_sql(this) 3340 3341 return super().aliases_sql(expression) 3342 3343 def posexplode_sql(self, expression: exp.Posexplode) -> str: 3344 this = expression.this 3345 parent = expression.parent 3346 3347 # The default Spark aliases are "pos" and "col", unless specified otherwise 3348 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 3349 3350 if isinstance(parent, exp.Aliases): 3351 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 3352 pos, col = parent.expressions 3353 elif isinstance(parent, exp.Table): 3354 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 3355 alias = parent.args.get("alias") 3356 if alias: 3357 pos, col = alias.columns or [pos, col] 3358 alias.pop() 3359 3360 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 3361 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 3362 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 3363 gen_subscripts = self.sql( 3364 exp.Alias( 3365 this=exp.Anonymous( 3366 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 3367 ) 3368 - exp.Literal.number(1), 3369 alias=pos, 
3370 ) 3371 ) 3372 3373 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 3374 3375 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 3376 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 3377 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 3378 3379 return posexplode_sql 3380 3381 def addmonths_sql(self, expression: exp.AddMonths) -> str: 3382 """ 3383 Handles three key issues: 3384 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 3385 2. End-of-month preservation: If input is last day of month, result is last day of result month 3386 3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 3387 """ 3388 from sqlglot.optimizer.annotate_types import annotate_types 3389 3390 this = expression.this 3391 if not this.type: 3392 this = annotate_types(this, dialect=self.dialect) 3393 3394 if this.is_type(*exp.DataType.TEXT_TYPES): 3395 this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP)) 3396 3397 # Detect float/decimal months to apply rounding (Snowflake behavior) 3398 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 3399 months_expr = expression.expression 3400 if not months_expr.type: 3401 months_expr = annotate_types(months_expr, dialect=self.dialect) 3402 3403 # Build interval or to_months expression based on type 3404 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 3405 interval_or_to_months = ( 3406 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 3407 if months_expr.is_type( 3408 exp.DataType.Type.FLOAT, 3409 exp.DataType.Type.DOUBLE, 3410 exp.DataType.Type.DECIMAL, 3411 ) 3412 # Integer case: standard INTERVAL N MONTH syntax 3413 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 3414 ) 3415 3416 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 3417 3418 # Apply end-of-month preservation if Snowflake flag is set 3419 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 3420 preserve_eom = expression.args.get("preserve_end_of_month") 3421 result_expr = ( 3422 exp.case() 3423 .when( 3424 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 3425 exp.func("LAST_DAY", date_add_expr), 3426 ) 3427 .else_(date_add_expr) 3428 if preserve_eom 3429 else date_add_expr 3430 ) 3431 3432 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 3433 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 3434 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 3435 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 3436 if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ): 3437 return self.sql(exp.Cast(this=result_expr, to=this.type)) 3438 return self.sql(result_expr) 3439 3440 def format_sql(self, expression: exp.Format) -> str: 3441 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 3442 return self.func("FORMAT", "'{}'", expression.expressions[0]) 3443 3444 return self.function_fallback_sql(expression) 3445 3446 def hexstring_sql( 3447 self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None 3448 ) -> str: 3449 # UNHEX('FF') correctly produces blob \xFF in DuckDB 3450 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 3451 3452 def 
datetrunc_sql(self, expression: exp.DateTrunc) -> str: 3453 unit = unit_to_str(expression) 3454 date = expression.this 3455 result = self.func("DATE_TRUNC", unit, date) 3456 3457 if ( 3458 expression.args.get("input_type_preserved") 3459 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 3460 and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE)) 3461 ): 3462 return self.sql(exp.Cast(this=result, to=date.type)) 3463 3464 return result 3465 3466 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 3467 unit = unit_to_str(expression) 3468 zone = expression.args.get("zone") 3469 timestamp = expression.this 3470 date_unit = is_date_unit(unit) 3471 3472 if date_unit and zone: 3473 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 3474 # Double AT TIME ZONE needed for BigQuery compatibility: 3475 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 3476 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 3477 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 3478 result_sql = self.func("DATE_TRUNC", unit, timestamp) 3479 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 3480 3481 result = self.func("DATE_TRUNC", unit, timestamp) 3482 if expression.args.get("input_type_preserved"): 3483 if timestamp.type and timestamp.is_type( 3484 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 3485 ): 3486 dummy_date = exp.Cast( 3487 this=exp.Literal.string("1970-01-01"), 3488 to=exp.DataType(this=exp.DataType.Type.DATE), 3489 ) 3490 date_time = exp.Add(this=dummy_date, expression=timestamp) 3491 result = self.func("DATE_TRUNC", unit, date_time) 3492 return self.sql(exp.Cast(this=result, to=timestamp.type)) 3493 3494 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 3495 date_unit and timestamp.is_type(exp.DataType.Type.DATE) 3496 ): 3497 return self.sql(exp.Cast(this=result, to=timestamp.type)) 3498 3499 return result 3500 3501 def trim_sql(self, expression: exp.Trim) -> str: 3502 expression.this.replace(_cast_to_varchar(expression.this)) 3503 if expression.expression: 3504 expression.expression.replace(_cast_to_varchar(expression.expression)) 3505 3506 result_sql = super().trim_sql(expression) 3507 return _gen_with_cast_to_blob(self, expression, result_sql) 3508 3509 def round_sql(self, expression: exp.Round) -> str: 3510 this = expression.this 3511 decimals = expression.args.get("decimals") 3512 truncate = expression.args.get("truncate") 3513 3514 # DuckDB requires the scale (decimals) argument to be an INT 3515 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 3516 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 3517 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 3518 decimals = exp.cast(decimals, exp.DataType.Type.INT) 3519 3520 func = "ROUND" 3521 if truncate: 3522 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 3523 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 3524 func = "ROUND_EVEN" 3525 truncate = None 3526 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 3527 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 3528 truncate = None 3529 3530 return self.func(func, this, decimals, truncate) 3531 3532 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 3533 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 
3534 3535 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 3536 if expression.is_type(*exp.DataType.REAL_TYPES): 3537 result = f"CAST({result} AS DOUBLE)" 3538 3539 return result 3540 3541 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 3542 """ 3543 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 3544 dividing the input distribution into n equal-sized buckets. 3545 3546 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 3547 does not document the specific algorithm used so results may differ. DuckDB does not 3548 support RESPECT NULLS. 3549 """ 3550 this = expression.this 3551 if isinstance(this, exp.Distinct): 3552 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 3553 if len(this.expressions) < 2: 3554 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 3555 return self.function_fallback_sql(expression) 3556 num_quantiles_expr = this.expressions[1].pop() 3557 else: 3558 num_quantiles_expr = expression.expression 3559 3560 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 3561 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 3562 return self.function_fallback_sql(expression) 3563 3564 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 3565 if num_quantiles <= 0: 3566 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 3567 return self.function_fallback_sql(expression) 3568 3569 quantiles = [ 3570 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 3571 for i in range(num_quantiles + 1) 3572 ] 3573 3574 return self.sql( 3575 exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles)) 3576 ) 3577 3578 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 3579 if expression.args.get("scalar_only"): 3580 expression = exp.JSONExtractScalar( 3581 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 3582 ) 3583 return _arrow_json_extract_sql(self, expression) 3584 3585 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 3586 this = expression.this 3587 3588 if _is_binary(this): 3589 expression.type = exp.DataType.build("BINARY") 3590 3591 arg = _cast_to_bit(this) 3592 3593 if isinstance(this, exp.Neg): 3594 arg = exp.Paren(this=arg) 3595 3596 expression.set("this", arg) 3597 3598 result_sql = f"~{self.sql(expression, 'this')}" 3599 3600 return _gen_with_cast_to_blob(self, expression, result_sql) 3601 3602 def window_sql(self, expression: exp.Window) -> str: 3603 this = expression.this 3604 if isinstance(this, exp.Corr) or ( 3605 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 3606 ): 3607 return self._corr_sql(expression) 3608 3609 return super().window_sql(expression) 3610 3611 def filter_sql(self, expression: exp.Filter) -> str: 3612 if isinstance(expression.this, exp.Corr): 3613 return self._corr_sql(expression) 3614 3615 return super().filter_sql(expression) 3616 3617 def _corr_sql( 3618 self, 3619 expression: t.Union[exp.Filter, exp.Window, exp.Corr], 3620 ) -> str: 3621 if isinstance(expression, exp.Corr) and not expression.args.get( 3622 "null_on_zero_variance" 3623 ): 3624 return self.func("CORR", expression.this, expression.expression) 3625 3626 corr_expr = _maybe_corr_null_to_false(expression) 3627 if corr_expr is None: 3628 if isinstance(expression, exp.Window): 3629 return super().window_sql(expression) 3630 if isinstance(expression, 
exp.Filter): 3631 return super().filter_sql(expression) 3632 corr_expr = expression # make mypy happy 3633 3634 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
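A quick way to sanity-check the Generator methods above is to run them end to end. The sketch below assumes only the public sqlglot.transpile entry point and that the Snowflake, BigQuery and Spark readers map EDITDISTANCE, APPROX_QUANTILES and POSEXPLODE onto the expressions handled above; the table and column names are placeholders, and exact output strings can differ between sqlglot versions, so the expected results are only indicated in comments.

import sqlglot

# levenshtein_sql: Snowflake's max_dist argument is emulated by capping the distance with LEAST
print(sqlglot.transpile("SELECT EDITDISTANCE(a, b, 3)", read="snowflake", write="duckdb")[0])
# expected to resemble: SELECT LEAST(LEVENSHTEIN(a, b), 3)

# approxquantiles_sql: APPROX_QUANTILES(x, 4) collapses into a single APPROX_QUANTILE call whose
# second argument is the list of n + 1 evenly spaced quantile fractions computed above
print(sqlglot.transpile("SELECT APPROX_QUANTILES(x, 4) FROM t", read="bigquery", write="duckdb")[0])
# expected to resemble: SELECT APPROX_QUANTILE(x, [0, 0.25, 0.5, 0.75, 1]) FROM t

# posexplode_sql: Spark's POSEXPLODE becomes GENERATE_SUBSCRIPTS(...) - 1 (0-indexed position)
# paired with UNNEST, using the default "pos"/"col" aliases when none are given
print(sqlglot.transpile("SELECT POSEXPLODE(arr) FROM t", read="spark", write="duckdb")[0])
# expected to resemble: SELECT GENERATE_SUBSCRIPTS(arr, 1) - 1 AS pos, UNNEST(arr) AS col FROM t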
1275class DuckDB(Dialect): 1276 NULL_ORDERING = "nulls_are_last" 1277 SUPPORTS_USER_DEFINED_TYPES = True 1278 SAFE_DIVISION = True 1279 INDEX_OFFSET = 1 1280 CONCAT_COALESCE = True 1281 SUPPORTS_ORDER_BY_ALL = True 1282 SUPPORTS_FIXED_SIZE_ARRAYS = True 1283 STRICT_JSON_PATH_SYNTAX = False 1284 NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True 1285 1286 # https://duckdb.org/docs/sql/introduction.html#creating-a-new-table 1287 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 1288 1289 DATE_PART_MAPPING = { 1290 **Dialect.DATE_PART_MAPPING, 1291 "DAYOFWEEKISO": "ISODOW", 1292 } 1293 1294 EXPRESSION_METADATA = EXPRESSION_METADATA.copy() 1295 1296 DATE_PART_MAPPING.pop("WEEKDAY") 1297 1298 INVERSE_TIME_MAPPING = { 1299 "%e": "%-d", # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d) 1300 "%:z": "%z", # In DuckDB %z can represent ±HH:MM, ±HHMM, or ±HH. 1301 "%-z": "%z", 1302 "%f_zero": "%n", 1303 "%f_one": "%n", 1304 "%f_two": "%n", 1305 "%f_three": "%g", 1306 "%f_four": "%n", 1307 "%f_five": "%n", 1308 "%f_seven": "%n", 1309 "%f_eight": "%n", 1310 "%f_nine": "%n", 1311 } 1312 1313 def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 1314 if isinstance(path, exp.Literal): 1315 # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`. 1316 # Additionally, it allows accessing the back of lists using the `[#-i]` syntax. 1317 # This check ensures we'll avoid trying to parse these as JSON paths, which can 1318 # either result in a noisy warning or in an invalid representation of the path. 1319 path_text = path.name 1320 if path_text.startswith("/") or "[#" in path_text: 1321 return path 1322 1323 return super().to_json_path(path) 1324 1325 class Tokenizer(tokens.Tokenizer): 1326 BYTE_STRINGS = [("e'", "'"), ("E'", "'")] 1327 BYTE_STRING_ESCAPES = ["'", "\\"] 1328 HEREDOC_STRINGS = ["$"] 1329 1330 HEREDOC_TAG_IS_IDENTIFIER = True 1331 HEREDOC_STRING_ALTERNATIVE = TokenType.PARAMETER 1332 1333 KEYWORDS = { 1334 **tokens.Tokenizer.KEYWORDS, 1335 "//": TokenType.DIV, 1336 "**": TokenType.DSTAR, 1337 "^@": TokenType.CARET_AT, 1338 "@>": TokenType.AT_GT, 1339 "<@": TokenType.LT_AT, 1340 "ATTACH": TokenType.ATTACH, 1341 "BINARY": TokenType.VARBINARY, 1342 "BITSTRING": TokenType.BIT, 1343 "BPCHAR": TokenType.TEXT, 1344 "CHAR": TokenType.TEXT, 1345 "DATETIME": TokenType.TIMESTAMPNTZ, 1346 "DETACH": TokenType.DETACH, 1347 "FORCE": TokenType.FORCE, 1348 "INSTALL": TokenType.INSTALL, 1349 "INT8": TokenType.BIGINT, 1350 "LOGICAL": TokenType.BOOLEAN, 1351 "MACRO": TokenType.FUNCTION, 1352 "ONLY": TokenType.ONLY, 1353 "PIVOT_WIDER": TokenType.PIVOT, 1354 "POSITIONAL": TokenType.POSITIONAL, 1355 "RESET": TokenType.COMMAND, 1356 "ROW": TokenType.STRUCT, 1357 "SIGNED": TokenType.INT, 1358 "STRING": TokenType.TEXT, 1359 "SUMMARIZE": TokenType.SUMMARIZE, 1360 "TIMESTAMP": TokenType.TIMESTAMPNTZ, 1361 "TIMESTAMP_S": TokenType.TIMESTAMP_S, 1362 "TIMESTAMP_MS": TokenType.TIMESTAMP_MS, 1363 "TIMESTAMP_NS": TokenType.TIMESTAMP_NS, 1364 "TIMESTAMP_US": TokenType.TIMESTAMP, 1365 "UBIGINT": TokenType.UBIGINT, 1366 "UINTEGER": TokenType.UINT, 1367 "USMALLINT": TokenType.USMALLINT, 1368 "UTINYINT": TokenType.UTINYINT, 1369 "VARCHAR": TokenType.TEXT, 1370 } 1371 KEYWORDS.pop("/*+") 1372 1373 SINGLE_TOKENS = { 1374 **tokens.Tokenizer.SINGLE_TOKENS, 1375 "$": TokenType.PARAMETER, 1376 } 1377 1378 COMMANDS = tokens.Tokenizer.COMMANDS - {TokenType.SHOW} 1379 1380 class Parser(parser.Parser): 1381 MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS = True 1382 1383 
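# DuckDB, like PostgreSQL, treats ^ as exponentiation rather than bitwise XOR, which is why
# CARET is removed from the BITWISE parsers below and mapped to exp.Pow through EXPONENT
# instead; bitwise XOR is parsed from the XOR() function further down in FUNCTIONS.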
BITWISE = parser.Parser.BITWISE.copy() 1384 BITWISE.pop(TokenType.CARET) 1385 1386 RANGE_PARSERS = { 1387 **parser.Parser.RANGE_PARSERS, 1388 TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps), 1389 TokenType.CARET_AT: binary_range_parser(exp.StartsWith), 1390 TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch), 1391 } 1392 1393 EXPONENT = { 1394 **parser.Parser.EXPONENT, 1395 TokenType.CARET: exp.Pow, 1396 TokenType.DSTAR: exp.Pow, 1397 } 1398 1399 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "STRUCT_PACK"} 1400 1401 SHOW_PARSERS = { 1402 "TABLES": _show_parser("TABLES"), 1403 "ALL TABLES": _show_parser("ALL TABLES"), 1404 } 1405 1406 FUNCTIONS = { 1407 **parser.Parser.FUNCTIONS, 1408 "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)), 1409 "ARRAY_PREPEND": _build_array_prepend, 1410 "ARRAY_REVERSE_SORT": _build_sort_array_desc, 1411 "ARRAY_SORT": exp.SortArray.from_arg_list, 1412 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 1413 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 1414 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 1415 "DATEDIFF": _build_date_diff, 1416 "DATE_DIFF": _build_date_diff, 1417 "DATE_TRUNC": date_trunc_to_time, 1418 "DATETRUNC": date_trunc_to_time, 1419 "DECODE": lambda args: exp.Decode( 1420 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1421 ), 1422 "EDITDIST3": exp.Levenshtein.from_arg_list, 1423 "ENCODE": lambda args: exp.Encode( 1424 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1425 ), 1426 "EPOCH": exp.TimeToUnix.from_arg_list, 1427 "EPOCH_MS": lambda args: exp.UnixToTime( 1428 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 1429 ), 1430 "GENERATE_SERIES": _build_generate_series(), 1431 "GET_BIT": lambda args: exp.Getbit( 1432 this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True 1433 ), 1434 "JSON": exp.ParseJSON.from_arg_list, 1435 "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract), 1436 "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar), 1437 "LIST_APPEND": exp.ArrayAppend.from_arg_list, 1438 "LIST_CONCAT": parser.build_array_concat, 1439 "LIST_CONTAINS": exp.ArrayContains.from_arg_list, 1440 "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list, 1441 "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list, 1442 "LIST_FILTER": exp.ArrayFilter.from_arg_list, 1443 "LIST_HAS": exp.ArrayContains.from_arg_list, 1444 "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list, 1445 "LIST_PREPEND": _build_array_prepend, 1446 "LIST_REVERSE_SORT": _build_sort_array_desc, 1447 "LIST_SORT": exp.SortArray.from_arg_list, 1448 "LIST_TRANSFORM": exp.Transform.from_arg_list, 1449 "LIST_VALUE": lambda args: exp.Array(expressions=args), 1450 "MAKE_DATE": exp.DateFromParts.from_arg_list, 1451 "MAKE_TIME": exp.TimeFromParts.from_arg_list, 1452 "MAKE_TIMESTAMP": _build_make_timestamp, 1453 "QUANTILE_CONT": exp.PercentileCont.from_arg_list, 1454 "QUANTILE_DISC": exp.PercentileDisc.from_arg_list, 1455 "RANGE": _build_generate_series(end_exclusive=True), 1456 "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract), 1457 "REGEXP_EXTRACT_ALL": build_regexp_extract(exp.RegexpExtractAll), 1458 "REGEXP_MATCHES": exp.RegexpLike.from_arg_list, 1459 "REGEXP_REPLACE": lambda args: exp.RegexpReplace( 1460 this=seq_get(args, 0), 1461 expression=seq_get(args, 1), 1462 replacement=seq_get(args, 2), 1463 modifiers=seq_get(args, 3), 1464 single_replace=True, 1465 ), 1466 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), 
length=exp.Literal.number(256)), 1467 "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"), 1468 "STRING_SPLIT": exp.Split.from_arg_list, 1469 "STRING_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1470 "STRING_TO_ARRAY": exp.Split.from_arg_list, 1471 "STRPTIME": build_formatted_time(exp.StrToTime, "duckdb"), 1472 "STRUCT_PACK": exp.Struct.from_arg_list, 1473 "STR_SPLIT": exp.Split.from_arg_list, 1474 "STR_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1475 "TIME_BUCKET": exp.DateBin.from_arg_list, 1476 "TO_TIMESTAMP": exp.UnixToTime.from_arg_list, 1477 "UNNEST": exp.Explode.from_arg_list, 1478 "XOR": binary_from_function(exp.BitwiseXor), 1479 } 1480 1481 FUNCTIONS.pop("DATE_SUB") 1482 FUNCTIONS.pop("GLOB") 1483 1484 FUNCTION_PARSERS = { 1485 **parser.Parser.FUNCTION_PARSERS, 1486 **dict.fromkeys( 1487 ("GROUP_CONCAT", "LISTAGG", "STRINGAGG"), lambda self: self._parse_string_agg() 1488 ), 1489 } 1490 FUNCTION_PARSERS.pop("DECODE") 1491 1492 NO_PAREN_FUNCTION_PARSERS = { 1493 **parser.Parser.NO_PAREN_FUNCTION_PARSERS, 1494 "MAP": lambda self: self._parse_map(), 1495 "@": lambda self: exp.Abs(this=self._parse_bitwise()), 1496 } 1497 1498 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 1499 TokenType.SEMI, 1500 TokenType.ANTI, 1501 } 1502 1503 PLACEHOLDER_PARSERS = { 1504 **parser.Parser.PLACEHOLDER_PARSERS, 1505 TokenType.PARAMETER: lambda self: ( 1506 self.expression(exp.Placeholder, this=self._prev.text) 1507 if self._match(TokenType.NUMBER) or self._match_set(self.ID_VAR_TOKENS) 1508 else None 1509 ), 1510 } 1511 1512 TYPE_CONVERTERS = { 1513 # https://duckdb.org/docs/sql/data_types/numeric 1514 exp.DataType.Type.DECIMAL: build_default_decimal_type(precision=18, scale=3), 1515 # https://duckdb.org/docs/sql/data_types/text 1516 exp.DataType.Type.TEXT: lambda dtype: exp.DataType.build("TEXT"), 1517 } 1518 1519 STATEMENT_PARSERS = { 1520 **parser.Parser.STATEMENT_PARSERS, 1521 TokenType.ATTACH: lambda self: self._parse_attach_detach(), 1522 TokenType.DETACH: lambda self: self._parse_attach_detach(is_attach=False), 1523 TokenType.FORCE: lambda self: self._parse_force(), 1524 TokenType.INSTALL: lambda self: self._parse_install(), 1525 TokenType.SHOW: lambda self: self._parse_show(), 1526 } 1527 1528 SET_PARSERS = { 1529 **parser.Parser.SET_PARSERS, 1530 "VARIABLE": lambda self: self._parse_set_item_assignment("VARIABLE"), 1531 } 1532 1533 def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]: 1534 index = self._index 1535 if not self._match_text_seq("LAMBDA"): 1536 return super()._parse_lambda(alias=alias) 1537 1538 expressions = self._parse_csv(self._parse_lambda_arg) 1539 if not self._match(TokenType.COLON): 1540 self._retreat(index) 1541 return None 1542 1543 this = self._replace_lambda(self._parse_assignment(), expressions) 1544 return self.expression(exp.Lambda, this=this, expressions=expressions, colon=True) 1545 1546 def _parse_expression(self) -> t.Optional[exp.Expression]: 1547 # DuckDB supports prefix aliases, e.g. 
foo: 1 1548 if self._next and self._next.token_type == TokenType.COLON: 1549 alias = self._parse_id_var(tokens=self.ALIAS_TOKENS) 1550 self._match(TokenType.COLON) 1551 comments = self._prev_comments or [] 1552 1553 this = self._parse_assignment() 1554 if isinstance(this, exp.Expression): 1555 # Moves the comment next to the alias in `alias: expr /* comment */` 1556 comments += this.pop_comments() or [] 1557 1558 return self.expression(exp.Alias, comments=comments, this=this, alias=alias) 1559 1560 return super()._parse_expression() 1561 1562 def _parse_table( 1563 self, 1564 schema: bool = False, 1565 joins: bool = False, 1566 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 1567 parse_bracket: bool = False, 1568 is_db_reference: bool = False, 1569 parse_partition: bool = False, 1570 consume_pipe: bool = False, 1571 ) -> t.Optional[exp.Expression]: 1572 # DuckDB supports prefix aliases, e.g. FROM foo: bar 1573 if self._next and self._next.token_type == TokenType.COLON: 1574 alias = self._parse_table_alias( 1575 alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS 1576 ) 1577 self._match(TokenType.COLON) 1578 comments = self._prev_comments or [] 1579 else: 1580 alias = None 1581 comments = [] 1582 1583 table = super()._parse_table( 1584 schema=schema, 1585 joins=joins, 1586 alias_tokens=alias_tokens, 1587 parse_bracket=parse_bracket, 1588 is_db_reference=is_db_reference, 1589 parse_partition=parse_partition, 1590 ) 1591 if isinstance(table, exp.Expression) and isinstance(alias, exp.TableAlias): 1592 # Moves the comment next to the alias in `alias: table /* comment */` 1593 comments += table.pop_comments() or [] 1594 alias.comments = alias.pop_comments() + comments 1595 table.set("alias", alias) 1596 1597 return table 1598 1599 def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]: 1600 # https://duckdb.org/docs/sql/samples.html 1601 sample = super()._parse_table_sample(as_modifier=as_modifier) 1602 if sample and not sample.args.get("method"): 1603 if sample.args.get("size"): 1604 sample.set("method", exp.var("RESERVOIR")) 1605 else: 1606 sample.set("method", exp.var("SYSTEM")) 1607 1608 return sample 1609 1610 def _parse_bracket( 1611 self, this: t.Optional[exp.Expression] = None 1612 ) -> t.Optional[exp.Expression]: 1613 bracket = super()._parse_bracket(this) 1614 1615 if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket): 1616 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 1617 bracket.set("returns_list_for_maps", True) 1618 1619 return bracket 1620 1621 def _parse_map(self) -> exp.ToMap | exp.Map: 1622 if self._match(TokenType.L_BRACE, advance=False): 1623 return self.expression(exp.ToMap, this=self._parse_bracket()) 1624 1625 args = self._parse_wrapped_csv(self._parse_assignment) 1626 return self.expression(exp.Map, keys=seq_get(args, 0), values=seq_get(args, 1)) 1627 1628 def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]: 1629 return self._parse_field_def() 1630 1631 def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]: 1632 if len(aggregations) == 1: 1633 return super()._pivot_column_names(aggregations) 1634 return pivot_column_names(aggregations, dialect="duckdb") 1635 1636 def _parse_attach_detach(self, is_attach=True) -> exp.Attach | exp.Detach: 1637 def _parse_attach_option() -> exp.AttachOption: 1638 return self.expression( 1639 exp.AttachOption, 1640 this=self._parse_var(any_token=True), 1641 
expression=self._parse_field(any_token=True), 1642 ) 1643 1644 self._match(TokenType.DATABASE) 1645 exists = self._parse_exists(not_=is_attach) 1646 this = self._parse_alias(self._parse_primary_or_var(), explicit=True) 1647 1648 if self._match(TokenType.L_PAREN, advance=False): 1649 expressions = self._parse_wrapped_csv(_parse_attach_option) 1650 else: 1651 expressions = None 1652 1653 return ( 1654 self.expression(exp.Attach, this=this, exists=exists, expressions=expressions) 1655 if is_attach 1656 else self.expression(exp.Detach, this=this, exists=exists) 1657 ) 1658 1659 def _parse_show_duckdb(self, this: str) -> exp.Show: 1660 return self.expression(exp.Show, this=this) 1661 1662 def _parse_force(self) -> exp.Install | exp.Command: 1663 # FORCE can only be followed by INSTALL or CHECKPOINT 1664 # In the case of CHECKPOINT, we fallback 1665 if not self._match(TokenType.INSTALL): 1666 return self._parse_as_command(self._prev) 1667 1668 return self._parse_install(force=True) 1669 1670 def _parse_install(self, force: bool = False) -> exp.Install: 1671 return self.expression( 1672 exp.Install, 1673 this=self._parse_id_var(), 1674 from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None, 1675 force=force, 1676 ) 1677 1678 def _parse_primary(self) -> t.Optional[exp.Expression]: 1679 if self._match_pair(TokenType.HASH, TokenType.NUMBER): 1680 return exp.PositionalColumn(this=exp.Literal.number(self._prev.text)) 1681 1682 return super()._parse_primary() 1683 1684 class Generator(generator.Generator): 1685 PARAMETER_TOKEN = "$" 1686 NAMED_PLACEHOLDER_TOKEN = "$" 1687 JOIN_HINTS = False 1688 TABLE_HINTS = False 1689 QUERY_HINTS = False 1690 LIMIT_FETCH = "LIMIT" 1691 STRUCT_DELIMITER = ("(", ")") 1692 RENAME_TABLE_WITH_DB = False 1693 NVL2_SUPPORTED = False 1694 SEMI_ANTI_JOIN_WITH_SIDE = False 1695 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1696 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1697 LAST_DAY_SUPPORTS_DATE_PART = False 1698 JSON_KEY_VALUE_PAIR_SEP = "," 1699 IGNORE_NULLS_IN_FUNC = True 1700 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1701 SUPPORTS_CREATE_TABLE_LIKE = False 1702 MULTI_ARG_DISTINCT = False 1703 CAN_IMPLEMENT_ARRAY_ANY = True 1704 SUPPORTS_TO_NUMBER = False 1705 SUPPORTS_WINDOW_EXCLUDE = True 1706 COPY_HAS_INTO_KEYWORD = False 1707 STAR_EXCEPT = "EXCLUDE" 1708 PAD_FILL_PATTERN_IS_REQUIRED = True 1709 ARRAY_SIZE_DIM_REQUIRED = False 1710 NORMALIZE_EXTRACT_DATE_PARTS = True 1711 SUPPORTS_LIKE_QUANTIFIERS = False 1712 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1713 1714 TRANSFORMS = { 1715 **generator.Generator.TRANSFORMS, 1716 exp.AnyValue: _anyvalue_sql, 1717 exp.ApproxDistinct: approx_count_distinct_sql, 1718 exp.Boolnot: _boolnot_sql, 1719 exp.Booland: _booland_sql, 1720 exp.Boolor: _boolor_sql, 1721 exp.Array: transforms.preprocess( 1722 [transforms.inherit_struct_field_names], 1723 generator=inline_array_unless_query, 1724 ), 1725 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1726 exp.ArrayCompact: array_compact_sql, 1727 exp.ArrayConstructCompact: lambda self, e: self.sql( 1728 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1729 ), 1730 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1731 exp.ArrayFilter: rename_func("LIST_FILTER"), 1732 exp.ArrayInsert: _array_insert_sql, 1733 exp.ArrayRemove: remove_from_array_using_filter, 1734 exp.ArraySort: _array_sort_sql, 1735 exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True), 1736 exp.ArraySum: rename_func("LIST_SUM"), 1737 exp.ArrayUniqueAgg: lambda self, e: self.func( 1738 
"LIST", exp.Distinct(expressions=[e.this]) 1739 ), 1740 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1741 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1742 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1743 exp.BitwiseAndAgg: _bitwise_agg_sql, 1744 exp.BitwiseLeftShift: _bitshift_sql, 1745 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1746 exp.BitwiseOrAgg: _bitwise_agg_sql, 1747 exp.BitwiseRightShift: _bitshift_sql, 1748 exp.BitwiseXorAgg: _bitwise_agg_sql, 1749 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1750 exp.Corr: lambda self, e: self._corr_sql(e), 1751 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1752 exp.CurrentTime: lambda *_: "CURRENT_TIME", 1753 exp.CurrentTimestamp: lambda self, e: self.sql( 1754 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1755 ) 1756 if e.args.get("sysdate") 1757 else "CURRENT_TIMESTAMP", 1758 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1759 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1760 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1761 exp.DayOfWeekIso: rename_func("ISODOW"), 1762 exp.DayOfYear: rename_func("DAYOFYEAR"), 1763 exp.Dayname: lambda self, e: ( 1764 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1765 if e.args.get("abbreviated") 1766 else self.func("DAYNAME", e.this) 1767 ), 1768 exp.Monthname: lambda self, e: ( 1769 self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1770 if e.args.get("abbreviated") 1771 else self.func("MONTHNAME", e.this) 1772 ), 1773 exp.DataType: _datatype_sql, 1774 exp.Date: _date_sql, 1775 exp.DateAdd: _date_delta_to_binary_interval_op(), 1776 exp.DateFromParts: _date_from_parts_sql, 1777 exp.DateSub: _date_delta_to_binary_interval_op(), 1778 exp.DateDiff: _date_diff_sql, 1779 exp.DateStrToDate: datestrtodate_sql, 1780 exp.Datetime: no_datetime_sql, 1781 exp.DatetimeDiff: _date_diff_sql, 1782 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1783 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1784 exp.DateToDi: lambda self, 1785 e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)", 1786 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1787 exp.DiToDate: lambda self, 1788 e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)", 1789 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1790 exp.EqualNull: lambda self, e: self.sql( 1791 exp.NullSafeEQ(this=e.this, expression=e.expression) 1792 ), 1793 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1794 exp.GenerateDateArray: _generate_datetime_array_sql, 1795 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1796 exp.Getbit: getbit_sql, 1797 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 1798 exp.Explode: rename_func("UNNEST"), 1799 exp.IntDiv: lambda self, e: self.binary(e, "//"), 1800 exp.IsInf: rename_func("ISINF"), 1801 exp.IsNan: rename_func("ISNAN"), 1802 exp.IsNullValue: lambda self, e: self.sql( 1803 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1804 ), 1805 exp.IsArray: lambda self, e: self.sql( 1806 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1807 ), 1808 exp.Ceil: _ceil_floor, 1809 exp.Floor: _ceil_floor, 1810 exp.JSONBExists: rename_func("JSON_EXISTS"), 1811 exp.JSONExtract: _arrow_json_extract_sql, 1812 exp.JSONExtractArray: _json_extract_value_array_sql, 
1813 exp.JSONFormat: _json_format_sql, 1814 exp.JSONValueArray: _json_extract_value_array_sql, 1815 exp.Lateral: explode_to_unnest_sql, 1816 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1817 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1818 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1819 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1820 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1821 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1822 exp.BoolxorAgg: _boolxor_agg_sql, 1823 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1824 exp.Initcap: _initcap_sql, 1825 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1826 exp.SHA1Digest: lambda self, e: self.func("UNHEX", self.func("SHA1", e.this)), 1827 exp.SHA2Digest: lambda self, e: self.func("UNHEX", sha2_digest_sql(self, e)), 1828 exp.MonthsBetween: months_between_sql, 1829 exp.NextDay: _day_navigation_sql, 1830 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1831 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1832 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 1833 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1834 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1835 exp.PreviousDay: _day_navigation_sql, 1836 exp.RegexpReplace: lambda self, e: self.func( 1837 "REGEXP_REPLACE", 1838 e.this, 1839 e.expression, 1840 e.args.get("replacement"), 1841 regexp_replace_global_modifier(e), 1842 ), 1843 exp.RegexpLike: rename_func("REGEXP_MATCHES"), 1844 exp.RegexpILike: lambda self, e: self.func( 1845 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1846 ), 1847 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1848 exp.RegrValx: _regr_val_sql, 1849 exp.RegrValy: _regr_val_sql, 1850 exp.Return: lambda self, e: self.sql(e, "this"), 1851 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1852 exp.Rand: rename_func("RANDOM"), 1853 exp.SHA2: sha256_sql, 1854 exp.Split: rename_func("STR_SPLIT"), 1855 exp.SortArray: _sort_array_sql, 1856 exp.StrPosition: strposition_sql, 1857 exp.StrToUnix: lambda self, e: self.func( 1858 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1859 ), 1860 exp.Struct: _struct_sql, 1861 exp.Transform: rename_func("LIST_TRANSFORM"), 1862 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1863 exp.TimeSub: _date_delta_to_binary_interval_op(), 1864 exp.Time: no_time_sql, 1865 exp.TimeDiff: _timediff_sql, 1866 exp.Timestamp: no_timestamp_sql, 1867 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1868 exp.TimestampDiff: lambda self, e: self.func( 1869 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1870 ), 1871 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1872 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)), 1873 exp.TimeStrToTime: timestrtotime_sql, 1874 exp.TimeStrToUnix: lambda self, e: self.func( 1875 "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP) 1876 ), 1877 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1878 exp.ToBoolean: _to_boolean_sql, 1879 exp.TimeToUnix: rename_func("EPOCH"), 1880 exp.TsOrDiToDi: lambda self, 1881 e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)", 1882 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1883 exp.TsOrDsDiff: lambda self, 
e: self.func( 1884 "DATE_DIFF", 1885 f"'{e.args.get('unit') or 'DAY'}'", 1886 exp.cast(e.expression, exp.DataType.Type.TIMESTAMP), 1887 exp.cast(e.this, exp.DataType.Type.TIMESTAMP), 1888 ), 1889 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1890 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1891 exp.UnixSeconds: lambda self, e: self.sql( 1892 exp.cast( 1893 self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT 1894 ) 1895 ), 1896 exp.UnixToStr: lambda self, e: self.func( 1897 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 1898 ), 1899 exp.DatetimeTrunc: lambda self, e: self.func( 1900 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DataType.Type.DATETIME) 1901 ), 1902 exp.UnixToTime: _unix_to_time_sql, 1903 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 1904 exp.VariancePop: rename_func("VAR_POP"), 1905 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 1906 exp.YearOfWeek: lambda self, e: self.sql( 1907 exp.Extract( 1908 this=exp.Var(this="ISOYEAR"), 1909 expression=e.this, 1910 ) 1911 ), 1912 exp.YearOfWeekIso: lambda self, e: self.sql( 1913 exp.Extract( 1914 this=exp.Var(this="ISOYEAR"), 1915 expression=e.this, 1916 ) 1917 ), 1918 exp.Xor: _xor_sql, 1919 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1920 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1921 exp.DateBin: rename_func("TIME_BUCKET"), 1922 exp.LastDay: _last_day_sql, 1923 } 1924 1925 SUPPORTED_JSON_PATH_PARTS = { 1926 exp.JSONPathKey, 1927 exp.JSONPathRoot, 1928 exp.JSONPathSubscript, 1929 exp.JSONPathWildcard, 1930 } 1931 1932 TYPE_MAPPING = { 1933 **generator.Generator.TYPE_MAPPING, 1934 exp.DataType.Type.BINARY: "BLOB", 1935 exp.DataType.Type.BPCHAR: "TEXT", 1936 exp.DataType.Type.CHAR: "TEXT", 1937 exp.DataType.Type.DATETIME: "TIMESTAMP", 1938 exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)", 1939 exp.DataType.Type.FLOAT: "REAL", 1940 exp.DataType.Type.JSONB: "JSON", 1941 exp.DataType.Type.NCHAR: "TEXT", 1942 exp.DataType.Type.NVARCHAR: "TEXT", 1943 exp.DataType.Type.UINT: "UINTEGER", 1944 exp.DataType.Type.VARBINARY: "BLOB", 1945 exp.DataType.Type.ROWVERSION: "BLOB", 1946 exp.DataType.Type.VARCHAR: "TEXT", 1947 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ", 1948 exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP", 1949 exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S", 1950 exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS", 1951 exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS", 1952 exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)", 1953 } 1954 1955 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 1956 RESERVED_KEYWORDS = { 1957 "array", 1958 "analyse", 1959 "union", 1960 "all", 1961 "when", 1962 "in_p", 1963 "default", 1964 "create_p", 1965 "window", 1966 "asymmetric", 1967 "to", 1968 "else", 1969 "localtime", 1970 "from", 1971 "end_p", 1972 "select", 1973 "current_date", 1974 "foreign", 1975 "with", 1976 "grant", 1977 "session_user", 1978 "or", 1979 "except", 1980 "references", 1981 "fetch", 1982 "limit", 1983 "group_p", 1984 "leading", 1985 "into", 1986 "collate", 1987 "offset", 1988 "do", 1989 "then", 1990 "localtimestamp", 1991 "check_p", 1992 "lateral_p", 1993 "current_role", 1994 "where", 1995 "asc_p", 1996 "placing", 1997 "desc_p", 1998 "user", 1999 "unique", 2000 "initially", 2001 "column", 2002 "both", 2003 "some", 2004 "as", 2005 "any", 2006 "only", 2007 
"deferrable", 2008 "null_p", 2009 "current_time", 2010 "true_p", 2011 "table", 2012 "case", 2013 "trailing", 2014 "variadic", 2015 "for", 2016 "on", 2017 "distinct", 2018 "false_p", 2019 "not", 2020 "constraint", 2021 "current_timestamp", 2022 "returning", 2023 "primary", 2024 "intersect", 2025 "having", 2026 "analyze", 2027 "current_user", 2028 "and", 2029 "cast", 2030 "symmetric", 2031 "using", 2032 "order", 2033 "current_catalog", 2034 } 2035 2036 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 2037 2038 # DuckDB doesn't generally support CREATE TABLE .. properties 2039 # https://duckdb.org/docs/sql/statements/create_table.html 2040 PROPERTIES_LOCATION = { 2041 prop: exp.Properties.Location.UNSUPPORTED 2042 for prop in generator.Generator.PROPERTIES_LOCATION 2043 } 2044 2045 # There are a few exceptions (e.g. temporary tables) which are supported or 2046 # can be transpiled to DuckDB, so we explicitly override them accordingly 2047 PROPERTIES_LOCATION[exp.LikeProperty] = exp.Properties.Location.POST_SCHEMA 2048 PROPERTIES_LOCATION[exp.TemporaryProperty] = exp.Properties.Location.POST_CREATE 2049 PROPERTIES_LOCATION[exp.ReturnsProperty] = exp.Properties.Location.POST_ALIAS 2050 PROPERTIES_LOCATION[exp.SequenceProperties] = exp.Properties.Location.POST_EXPRESSION 2051 2052 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = ( 2053 exp.FirstValue, 2054 exp.Lag, 2055 exp.LastValue, 2056 exp.Lead, 2057 exp.NthValue, 2058 ) 2059 2060 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 2061 ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse( 2062 """ 2063 WITH rand AS (SELECT :random_expr AS r), 2064 weights AS ( 2065 SELECT i, 1.0 / POWER(i, :s) AS w 2066 FROM RANGE(1, :n + 1) AS t(i) 2067 ), 2068 cdf AS ( 2069 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 2070 FROM weights 2071 ) 2072 SELECT MIN(i) 2073 FROM cdf 2074 WHERE p >= (SELECT r FROM rand) 2075 """ 2076 ) 2077 2078 # Template for NORMAL transpilation using Box-Muller transform 2079 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 2080 NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse( 2081 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 2082 ) 2083 2084 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 2085 SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse( 2086 "(ABS(HASH(:seed)) % 1000000) / 1000000.0" 2087 ) 2088 2089 # Template for generating signed and unsigned SEQ values within a specified range 2090 SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(f"{_SEQ_BASE} % :max_val") 2091 SEQ_SIGNED: exp.Expression = exp.maybe_parse( 2092 f"(CASE WHEN {_SEQ_BASE} % :max_val >= :half " 2093 f"THEN {_SEQ_BASE} % :max_val - :max_val " 2094 f"ELSE {_SEQ_BASE} % :max_val END)" 2095 ) 2096 2097 # Template for MAP_CAT transpilation - Snowflake semantics: 2098 # 1. Returns NULL if either input is NULL 2099 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 2100 # 3. 
Filters out entries with NULL values from the result 2101 MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse( 2102 """ 2103 CASE 2104 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 2105 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 2106 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 2107 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 2108 ), __x -> __x.value IS NOT NULL)) 2109 END 2110 """ 2111 ) 2112 2113 # Mappings for EXTRACT/DATE_PART transpilation 2114 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 2115 EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = { 2116 "WEEKISO": ("%V", "INTEGER"), 2117 "YEAROFWEEK": ("%G", "INTEGER"), 2118 "YEAROFWEEKISO": ("%G", "INTEGER"), 2119 "NANOSECOND": ("%n", "BIGINT"), 2120 } 2121 2122 # Maps epoch-based specifiers to DuckDB epoch functions 2123 EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = { 2124 "EPOCH_SECOND": "EPOCH", 2125 "EPOCH_MILLISECOND": "EPOCH_MS", 2126 "EPOCH_MICROSECOND": "EPOCH_US", 2127 "EPOCH_NANOSECOND": "EPOCH_NS", 2128 } 2129 2130 # Template for BITMAP_CONSTRUCT_AGG transpilation 2131 # 2132 # BACKGROUND: 2133 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 2134 # Supports values in range 0-32767, this version returns NULL if any value is out of range 2135 # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg 2136 # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts 2137 # 2138 # Snowflake uses two different formats based on the number of unique values: 2139 # 2140 # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes 2141 # Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003) 2142 # Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes 2143 # Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex) 2144 # count v1 v2 v3 pad 2145 # 2146 # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes 2147 # Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes 2148 # Bytes 10+: Each value as 2-byte little-endian integer (no padding) 2149 # Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500 2150 # hdr ----9 zero bytes---- v1 v2 v3 v4 v5 2151 # 2152 # TEMPLATE STRUCTURE 2153 # 2154 # Phase 1 - Innermost subquery: Data preparation 2155 # SELECT LIST_SORT(...) AS l 2156 # - Aggregates all input values into a list, remove NULLs, duplicates and sorts 2157 # Result: Clean, sorted list of unique non-null integers stored as 'l' 2158 # 2159 # Phase 2 - Middle subquery: Hex string construction 2160 # LIST_TRANSFORM(...) 
2161 # - Converts each integer to 2-byte little-endian hex representation 2162 # - & 255 extracts low byte, >> 8 extracts high byte 2163 # - LIST_REDUCE: Concatenates all hex pairs into single string 'h' 2164 # Result: Hex string of all values 2165 # 2166 # Phase 3 - Outer SELECT: Final bitmap assembly 2167 # LENGTH(l) < 5: 2168 # - Small format: 2-byte count (big-endian via %04X) + values + zero padding 2169 # LENGTH(l) >= 5: 2170 # - Large format: Fixed 10-byte header + values (no padding needed) 2171 # Result: Complete binary bitmap as BLOB 2172 # 2173 BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse( 2174 """ 2175 SELECT CASE 2176 WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL 2177 WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL 2178 WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) 2179 ELSE UNHEX('08000000000000000000' || h) 2180 END 2181 FROM ( 2182 SELECT l, COALESCE(LIST_REDUCE( 2183 LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), 2184 (__a, __b) -> __a || __b, '' 2185 ), '') AS h 2186 FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l) 2187 ) 2188 """ 2189 ) 2190 2191 # Template for RANDSTR transpilation - placeholders get replaced with actual parameters 2192 RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse( 2193 f""" 2194 SELECT LISTAGG( 2195 SUBSTRING( 2196 '{RANDSTR_CHAR_POOL}', 2197 1 + CAST(FLOOR(random_value * 62) AS INT), 2198 1 2199 ), 2200 '' 2201 ) 2202 FROM ( 2203 SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value 2204 FROM RANGE(:length) AS t(i) 2205 ) 2206 """, 2207 ) 2208 2209 # Template for MINHASH transpilation 2210 # Computes k minimum hash values across aggregated data using DuckDB list functions 2211 # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1} 2212 MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse( 2213 """ 2214 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1) 2215 FROM ( 2216 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 2217 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2218 ) 2219 """, 2220 ) 2221 2222 # Template for MINHASH_COMBINE transpilation 2223 # Combines multiple minhash signatures by taking element-wise minimum 2224 MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse( 2225 """ 2226 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2227 FROM ( 2228 SELECT 2229 pos AS idx, 2230 MIN(val) AS min_h 2231 FROM 2232 UNNEST(LIST(:expr)) AS _(sig), 2233 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2234 GROUP BY pos 2235 ) 2236 """, 2237 ) 2238 2239 # Template for APPROXIMATE_SIMILARITY transpilation 2240 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2241 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse( 2242 """ 2243 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2244 FROM ( 2245 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2246 FROM ( 2247 SELECT h, pos 2248 FROM UNNEST(LIST(:expr)) AS _(sig), 2249 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2250 ) 2251 GROUP BY pos 2252 ) 2253 """, 2254 ) 2255 2256 # Template for ARRAYS_ZIP transpilation 2257 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates 
to shortest 2258 # Uses RANGE + indexing to match Snowflake behavior 2259 ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse( 2260 """ 2261 CASE WHEN :null_check THEN NULL 2262 WHEN :all_empty_check THEN [:empty_struct] 2263 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2264 END 2265 """, 2266 ) 2267 2268 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2269 """ 2270 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2271 2272 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2273 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2274 2275 For 'END' kind, add the interval to get the end of the slice. 2276 For DATE type with 'END', cast result back to DATE to preserve type. 2277 """ 2278 date_expr = expression.this 2279 slice_length = expression.expression 2280 unit = expression.unit 2281 kind = expression.text("kind").upper() 2282 2283 # Create INTERVAL expression: INTERVAL 'N' UNIT 2284 interval_expr = exp.Interval(this=slice_length, unit=unit) 2285 2286 # Create base time_bucket expression 2287 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2288 2289 # Check if we need the end of the slice (default is start) 2290 if not kind == "END": 2291 # For 'START', return time_bucket directly 2292 return self.sql(time_bucket_expr) 2293 2294 # For 'END', add the interval to get end of slice 2295 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2296 2297 # If input is DATE type, cast result back to DATE to preserve type 2298 # DuckDB converts DATE to TIMESTAMP when adding intervals 2299 if date_expr.is_type(exp.DataType.Type.DATE): 2300 return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE)) 2301 2302 return self.sql(add_expr) 2303 2304 def bitmapbucketnumber_sql( 2305 self: DuckDB.Generator, expression: exp.BitmapBucketNumber 2306 ) -> str: 2307 """ 2308 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2309 2310 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2311 - Each bucket covers 32,768 values 2312 - Bucket numbering starts at 1 2313 - Formula: ((value - 1) // 32768) + 1 for positive values 2314 2315 For non-positive values (0 and negative), we use value // 32768 to avoid 2316 producing bucket 0 or positive bucket IDs for negative inputs. 2317 """ 2318 value = expression.this 2319 2320 positive_formula = ((value - 1) // 32768) + 1 2321 non_positive_formula = value // 32768 2322 2323 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2324 case_expr = ( 2325 exp.case() 2326 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2327 .else_(non_positive_formula) 2328 ) 2329 return self.sql(case_expr) 2330 2331 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2332 """ 2333 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 
2334 2335 Snowflake's BITMAP_BIT_POSITION behavior: 2336 - For n <= 0: returns ABS(n) % 32768 2337 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2338 """ 2339 this = expression.this 2340 2341 return self.sql( 2342 exp.Mod( 2343 this=exp.Paren( 2344 this=exp.If( 2345 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2346 true=this - exp.Literal.number(1), 2347 false=exp.Abs(this=this), 2348 ) 2349 ), 2350 expression=MAX_BIT_POSITION, 2351 ) 2352 ) 2353 2354 def bitmapconstructagg_sql( 2355 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2356 ) -> str: 2357 """ 2358 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2359 Uses a pre-parsed template with placeholders replaced by expression nodes. 2360 2361 Snowflake bitmap format: 2362 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2363 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2364 """ 2365 arg = expression.this 2366 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2367 2368 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2369 """ 2370 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2371 Uses a pre-parsed template with placeholders replaced by expression nodes. 2372 2373 RANDSTR(length, generator) generates a random string of specified length. 2374 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2375 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2376 - No generator: Use default seed value 2377 """ 2378 length = expression.this 2379 generator = expression.args.get("generator") 2380 2381 if generator: 2382 if isinstance(generator, exp.Rand): 2383 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2384 seed_value = generator.this or generator 2385 else: 2386 # Const/int or other expression - use as seed directly 2387 seed_value = generator 2388 else: 2389 # No generator specified, use default seed (arbitrary but deterministic) 2390 seed_value = exp.Literal.number(RANDSTR_SEED) 2391 2392 replacements = {"seed": seed_value, "length": length} 2393 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2394 2395 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2396 """ 2397 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2398 Uses a pre-parsed template with placeholders replaced by expression nodes. 
2399 """ 2400 s = expression.this 2401 n = expression.args["elementcount"] 2402 gen = expression.args["gen"] 2403 2404 if not isinstance(gen, exp.Rand): 2405 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2406 random_expr: exp.Expression = exp.Div( 2407 this=exp.Paren( 2408 this=exp.Mod( 2409 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2410 expression=exp.Literal.number(1000000), 2411 ) 2412 ), 2413 expression=exp.Literal.number(1000000.0), 2414 ) 2415 else: 2416 # Use RANDOM() for non-deterministic output 2417 random_expr = exp.Rand() 2418 2419 replacements = {"s": s, "n": n, "random_expr": random_expr} 2420 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2421 2422 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2423 """ 2424 TO_BINARY and TRY_TO_BINARY transpilation: 2425 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2426 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2427 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2428 2429 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2430 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2431 """ 2432 value = expression.this 2433 format_arg = expression.args.get("format") 2434 is_safe = expression.args.get("safe") 2435 2436 fmt = "HEX" 2437 if format_arg: 2438 fmt = format_arg.name.upper() 2439 2440 if expression.is_type(exp.DataType.Type.BINARY): 2441 if fmt == "UTF-8": 2442 result = self.func("ENCODE", value) 2443 elif fmt == "BASE64": 2444 result = self.func("FROM_BASE64", value) 2445 elif fmt == "HEX": 2446 result = self.func("UNHEX", value) 2447 else: 2448 if is_safe: 2449 return self.sql(exp.null()) 2450 else: 2451 self.unsupported(f"format {fmt} is not supported") 2452 result = self.func("TO_BINARY", value) 2453 2454 # Wrap with TRY() for TRY_TO_BINARY 2455 if is_safe: 2456 result = self.func("TRY", result) 2457 2458 return result 2459 2460 # Fallback, which needs to be updated if want to support transpilation from other dialects than Snowflake 2461 return self.func("TO_BINARY", value) 2462 2463 def _greatest_least_sql( 2464 self: DuckDB.Generator, expression: exp.Greatest | exp.Least 2465 ) -> str: 2466 """ 2467 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 
2468 2469 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2470 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2471 """ 2472 # Get all arguments 2473 all_args = [expression.this, *expression.expressions] 2474 fallback_sql = self.function_fallback_sql(expression) 2475 2476 if expression.args.get("ignore_nulls"): 2477 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2478 return self.sql(fallback_sql) 2479 2480 # return NULL if any argument is NULL 2481 case_expr = exp.case().when( 2482 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2483 exp.null(), 2484 copy=False, 2485 ) 2486 case_expr.set("default", fallback_sql) 2487 return self.sql(case_expr) 2488 2489 def generator_sql(self, expression: exp.Generator) -> str: 2490 # Transpile Snowflake GENERATOR to DuckDB range() 2491 rowcount = expression.args.get("rowcount") 2492 time_limit = expression.args.get("time_limit") 2493 2494 if time_limit: 2495 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2496 2497 if not rowcount: 2498 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2499 return self.func("range", exp.Literal.number(0)) 2500 2501 return self.func("range", rowcount) 2502 2503 def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str: 2504 return self._greatest_least_sql(expression) 2505 2506 def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str: 2507 return self._greatest_least_sql(expression) 2508 2509 def lambda_sql( 2510 self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True 2511 ) -> str: 2512 if expression.args.get("colon"): 2513 prefix = "LAMBDA " 2514 arrow_sep = ":" 2515 wrap = False 2516 else: 2517 prefix = "" 2518 2519 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2520 return f"{prefix}{lambda_sql}" 2521 2522 def show_sql(self, expression: exp.Show) -> str: 2523 return f"SHOW {expression.name}" 2524 2525 def install_sql(self, expression: exp.Install) -> str: 2526 force = "FORCE " if expression.args.get("force") else "" 2527 this = self.sql(expression, "this") 2528 from_clause = expression.args.get("from_") 2529 from_clause = f" FROM {from_clause}" if from_clause else "" 2530 return f"{force}INSTALL {this}{from_clause}" 2531 2532 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2533 self.unsupported( 2534 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. 
" 2535 ) 2536 return self.function_fallback_sql(expression) 2537 2538 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2539 return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ)) 2540 2541 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2542 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2543 target_type = expression.args.get("target_type") 2544 needs_tz = target_type and target_type.this in ( 2545 exp.DataType.Type.TIMESTAMPLTZ, 2546 exp.DataType.Type.TIMESTAMPTZ, 2547 ) 2548 2549 if expression.args.get("safe"): 2550 formatted_time = self.format_time(expression) 2551 cast_type = ( 2552 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2553 ) 2554 return self.sql( 2555 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2556 ) 2557 2558 base_sql = str_to_time_sql(self, expression) 2559 if needs_tz: 2560 return self.sql( 2561 exp.cast( 2562 base_sql, 2563 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2564 ) 2565 ) 2566 return base_sql 2567 2568 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2569 formatted_time = self.format_time(expression) 2570 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2571 return self.sql( 2572 exp.cast( 2573 self.func(function_name, expression.this, formatted_time), 2574 exp.DataType(this=exp.DataType.Type.DATE), 2575 ) 2576 ) 2577 2578 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2579 this = expression.this 2580 time_format = self.format_time(expression) 2581 safe = expression.args.get("safe") 2582 time_type = exp.DataType.build("TIME", dialect="duckdb") 2583 cast_expr = exp.TryCast if safe else exp.Cast 2584 2585 if time_format: 2586 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2587 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2588 return self.sql(cast_expr(this=strptime, to=time_type)) 2589 2590 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME): 2591 return self.sql(this) 2592 2593 return self.sql(cast_expr(this=this, to=time_type)) 2594 2595 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2596 if not expression.this: 2597 return "CURRENT_DATE" 2598 2599 expr = exp.Cast( 2600 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2601 to=exp.DataType(this=exp.DataType.Type.DATE), 2602 ) 2603 return self.sql(expr) 2604 2605 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2606 arg = expression.this 2607 if expression.args.get("safe"): 2608 return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null())) 2609 return self.func("JSON", arg) 2610 2611 def normal_sql(self, expression: exp.Normal) -> str: 2612 """ 2613 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2614 2615 Uses the Box-Muller transform via NORMAL_TEMPLATE. 
2616 """ 2617 mean = expression.this 2618 stddev = expression.args["stddev"] 2619 gen: exp.Expression = expression.args["gen"] 2620 2621 # Build two uniform random values [0, 1) for Box-Muller transform 2622 if isinstance(gen, exp.Rand) and gen.this is None: 2623 u1: exp.Expression = exp.Rand() 2624 u2: exp.Expression = exp.Rand() 2625 else: 2626 # Seeded: derive two values using HASH with different inputs 2627 seed = gen.this if isinstance(gen, exp.Rand) else gen 2628 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2629 u2 = exp.replace_placeholders( 2630 self.SEEDED_RANDOM_TEMPLATE, 2631 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2632 ) 2633 2634 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2635 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2636 2637 def uniform_sql(self, expression: exp.Uniform) -> str: 2638 """ 2639 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2640 2641 UNIFORM returns a random value in [min, max]: 2642 - Integer result if both min and max are integers 2643 - Float result if either min or max is a float 2644 """ 2645 min_val = expression.this 2646 max_val = expression.expression 2647 gen = expression.args.get("gen") 2648 2649 # Determine if result should be integer (both bounds are integers). 2650 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2651 is_int_result = min_val.is_int and max_val.is_int 2652 2653 # Build the random value expression [0, 1) 2654 if not isinstance(gen, exp.Rand): 2655 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2656 random_expr: exp.Expression = exp.Div( 2657 this=exp.Paren( 2658 this=exp.Mod( 2659 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2660 expression=exp.Literal.number(1000000), 2661 ) 2662 ), 2663 expression=exp.Literal.number(1000000.0), 2664 ) 2665 else: 2666 random_expr = exp.Rand() 2667 2668 # Build: min + random * (max - min [+ 1 for int]) 2669 range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val) 2670 if is_int_result: 2671 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2672 2673 result: exp.Expression = exp.Add( 2674 this=min_val, 2675 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2676 ) 2677 2678 if is_int_result: 2679 result = exp.Cast( 2680 this=exp.Floor(this=result), 2681 to=exp.DataType.build("BIGINT"), 2682 ) 2683 2684 return self.sql(result) 2685 2686 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2687 nano = expression.args.get("nano") 2688 overflow = expression.args.get("overflow") 2689 2690 # Snowflake's TIME_FROM_PARTS supports overflow 2691 if overflow: 2692 hour = expression.args["hour"] 2693 minute = expression.args["min"] 2694 sec = expression.args["sec"] 2695 2696 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2697 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2698 try: 2699 h_val = hour.to_py() 2700 m_val = minute.to_py() 2701 s_val = sec.to_py() 2702 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2703 return rename_func("MAKE_TIME")(self, expression) 2704 except ValueError: 2705 pass 2706 2707 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2708 if nano: 2709 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2710 2711 total_seconds = ( 2712 hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2713 ) 2714 2715 return self.sql( 2716 exp.Add( 2717 
this=exp.Cast( 2718 this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME") 2719 ), 2720 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2721 ) 2722 ) 2723 2724 # Default: MAKE_TIME 2725 if nano: 2726 expression.set( 2727 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2728 ) 2729 2730 return rename_func("MAKE_TIME")(self, expression) 2731 2732 def extract_sql(self, expression: exp.Extract) -> str: 2733 """ 2734 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2735 2736 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2737 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2738 """ 2739 this = expression.this 2740 datetime_expr = expression.expression 2741 2742 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2743 # because Snowflake applies server timezone while DuckDB uses local timezone 2744 if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ): 2745 self.unsupported( 2746 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2747 ) 2748 2749 part_name = this.name.upper() 2750 2751 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2752 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2753 2754 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2755 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2756 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2757 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 2758 ) 2759 2760 if is_nano_time: 2761 self.unsupported( 2762 "Parameter NANOSECOND is not supported with TIME type in DuckDB" 2763 ) 2764 return self.sql( 2765 exp.cast( 2766 exp.Mul( 2767 this=exp.Extract( 2768 this=exp.var("MICROSECOND"), expression=datetime_expr 2769 ), 2770 expression=exp.Literal.number(1000), 2771 ), 2772 exp.DataType.build(cast_type, dialect="duckdb"), 2773 ) 2774 ) 2775 2776 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2777 strftime_input = datetime_expr 2778 if part_name == "NANOSECOND": 2779 strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS) 2780 2781 return self.sql( 2782 exp.cast( 2783 exp.Anonymous( 2784 this="STRFTIME", 2785 expressions=[strftime_input, exp.Literal.string(fmt)], 2786 ), 2787 exp.DataType.build(cast_type, dialect="duckdb"), 2788 ) 2789 ) 2790 2791 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2792 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2793 result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2794 # EPOCH returns float, cast to BIGINT for integer result 2795 if part_name == "EPOCH_SECOND": 2796 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2797 return self.sql(result) 2798 2799 return super().extract_sql(expression) 2800 2801 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2802 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2803 date_expr = expression.this 2804 time_expr = expression.expression 2805 2806 if date_expr is not None and time_expr is not None: 2807 # In DuckDB, DATE + TIME produces TIMESTAMP 2808 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2809 2810 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 
2811 sec = expression.args.get("sec") 2812 if sec is None: 2813 # This shouldn't happen with valid input, but handle gracefully 2814 return rename_func("MAKE_TIMESTAMP")(self, expression) 2815 2816 milli = expression.args.get("milli") 2817 if milli is not None: 2818 sec += milli.pop() / exp.Literal.number(1000.0) 2819 2820 nano = expression.args.get("nano") 2821 if nano is not None: 2822 sec += nano.pop() / exp.Literal.number(1000000000.0) 2823 2824 if milli or nano: 2825 expression.set("sec", sec) 2826 2827 return rename_func("MAKE_TIMESTAMP")(self, expression) 2828 2829 @unsupported_args("nano") 2830 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2831 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2832 if nano := expression.args.get("nano"): 2833 nano.pop() 2834 2835 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2836 return f"CAST({timestamp} AS TIMESTAMPTZ)" 2837 2838 @unsupported_args("nano") 2839 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2840 # Extract zone before popping 2841 zone = expression.args.get("zone") 2842 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2843 if zone: 2844 zone = zone.pop() 2845 2846 if nano := expression.args.get("nano"): 2847 nano.pop() 2848 2849 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2850 2851 if zone: 2852 # Use AT TIME ZONE to apply the explicit timezone 2853 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2854 2855 return timestamp 2856 2857 def tablesample_sql( 2858 self, 2859 expression: exp.TableSample, 2860 tablesample_keyword: t.Optional[str] = None, 2861 ) -> str: 2862 if not isinstance(expression.parent, exp.Select): 2863 # This sample clause only applies to a single source, not the entire resulting relation 2864 tablesample_keyword = "TABLESAMPLE" 2865 2866 if expression.args.get("size"): 2867 method = expression.args.get("method") 2868 if method and method.name.upper() != "RESERVOIR": 2869 self.unsupported( 2870 f"Sampling method {method} is not supported with a discrete sample count, " 2871 "defaulting to reservoir sampling" 2872 ) 2873 expression.set("method", exp.var("RESERVOIR")) 2874 2875 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 2876 2877 def columndef_sql(self, expression: exp.ColumnDef, sep: str = " ") -> str: 2878 if isinstance(expression.parent, exp.UserDefinedFunction): 2879 return self.sql(expression, "this") 2880 return super().columndef_sql(expression, sep) 2881 2882 def join_sql(self, expression: exp.Join) -> str: 2883 if ( 2884 not expression.args.get("using") 2885 and not expression.args.get("on") 2886 and not expression.method 2887 and (expression.kind in ("", "INNER", "OUTER")) 2888 ): 2889 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2890 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2891 if isinstance(expression.this, exp.Unnest): 2892 return super().join_sql(expression.on(exp.true())) 2893 2894 expression.set("side", None) 2895 expression.set("kind", None) 2896 2897 return super().join_sql(expression) 2898 2899 def generateseries_sql(self, expression: exp.GenerateSeries) -> str: 2900 # GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b) 2901 if expression.args.get("is_end_exclusive"): 2902 return rename_func("RANGE")(self, expression) 2903 2904 return self.function_fallback_sql(expression) 2905 2906 def countif_sql(self, expression: 
exp.CountIf) -> str: 2907 if self.dialect.version >= (1, 2): 2908 return self.function_fallback_sql(expression) 2909 2910 # https://github.com/tobymao/sqlglot/pull/4749 2911 return count_if_to_sum(self, expression) 2912 2913 def bracket_sql(self, expression: exp.Bracket) -> str: 2914 if self.dialect.version >= (1, 2): 2915 return super().bracket_sql(expression) 2916 2917 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2918 this = expression.this 2919 if isinstance(this, exp.Array): 2920 this.replace(exp.paren(this)) 2921 2922 bracket = super().bracket_sql(expression) 2923 2924 if not expression.args.get("returns_list_for_maps"): 2925 if not this.type: 2926 from sqlglot.optimizer.annotate_types import annotate_types 2927 2928 this = annotate_types(this, dialect=self.dialect) 2929 2930 if this.is_type(exp.DataType.Type.MAP): 2931 bracket = f"({bracket})[1]" 2932 2933 return bracket 2934 2935 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2936 func = expression.this 2937 2938 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2939 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2940 if isinstance(func, exp.ArrayAgg): 2941 if not isinstance(order := expression.expression, exp.Order): 2942 return self.sql(func) 2943 2944 # Save the original column for FILTER clause (before wrapping with Order) 2945 original_this = func.this 2946 2947 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 2948 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 2949 func.set( 2950 "this", 2951 exp.Order( 2952 this=func.this.copy(), 2953 expressions=order.expressions, 2954 ), 2955 ) 2956 2957 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 2958 # Use original_this (not the Order-wrapped version) for the FILTER condition 2959 array_agg_sql = self.function_fallback_sql(func) 2960 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 2961 2962 # For other functions (like PERCENTILES), use existing logic 2963 expression_sql = self.sql(expression, "expression") 2964 2965 if isinstance(func, exp.PERCENTILES): 2966 # Make the order key the first arg and slide the fraction to the right 2967 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 2968 order_col = expression.find(exp.Ordered) 2969 if order_col: 2970 func.set("expression", func.this) 2971 func.set("this", order_col.this) 2972 2973 this = self.sql(expression, "this").rstrip(")") 2974 2975 return f"{this}{expression_sql})" 2976 2977 def length_sql(self, expression: exp.Length) -> str: 2978 arg = expression.this 2979 2980 # Dialects like BQ and Snowflake also accept binary values as args, so 2981 # DDB will attempt to infer the type or resort to case/when resolution 2982 if not expression.args.get("binary") or arg.is_string: 2983 return self.func("LENGTH", arg) 2984 2985 if not arg.type: 2986 from sqlglot.optimizer.annotate_types import annotate_types 2987 2988 arg = annotate_types(arg, dialect=self.dialect) 2989 2990 if arg.is_type(*exp.DataType.TEXT_TYPES): 2991 return self.func("LENGTH", arg) 2992 2993 # We need these casts to make duckdb's static type checker happy 2994 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 2995 varchar = exp.cast(arg, exp.DataType.Type.VARCHAR) 2996 2997 case = ( 2998 exp.case(self.func("TYPEOF", arg)) 2999 .when("'BLOB'", self.func("OCTET_LENGTH", blob)) 3000 .else_( 3001 exp.Anonymous(this="LENGTH", 
expressions=[varchar]) 3002 ) # anonymous to break length_sql recursion 3003 ) 3004 3005 return self.sql(case) 3006 3007 def sha_sql(self, expression: exp.SHA) -> str: 3008 arg = expression.this 3009 3010 # If type is compatible with DuckDB or is an unknown type, use directly 3011 if ( 3012 arg.type 3013 and arg.type.this != exp.DataType.Type.UNKNOWN 3014 and not arg.is_type(*exp.DataType.TEXT_TYPES) 3015 and not _is_binary(arg) 3016 ): 3017 arg = exp.cast(arg, exp.DataType.Type.VARCHAR) 3018 3019 return self.func("SHA1", arg) 3020 3021 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3022 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3023 this = expression.this 3024 expr = expression.expression 3025 max_dist = expression.args.get("max_dist") 3026 3027 if max_dist is None: 3028 return self.func("LEVENSHTEIN", this, expr) 3029 3030 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3031 levenshtein = exp.Levenshtein(this=this, expression=expr) 3032 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3033 3034 def minhash_sql(self, expression: exp.Minhash) -> str: 3035 k = expression.this 3036 exprs = expression.expressions 3037 3038 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3039 self.unsupported( 3040 "MINHASH with multiple expressions or * requires manual query restructuring" 3041 ) 3042 return self.func("MINHASH", k, *exprs) 3043 3044 expr = exprs[0] 3045 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3046 return f"({self.sql(result)})" 3047 3048 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3049 expr = expression.this 3050 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3051 return f"({self.sql(result)})" 3052 3053 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3054 expr = expression.this 3055 result = exp.replace_placeholders( 3056 self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr 3057 ) 3058 return f"({self.sql(result)})" 3059 3060 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3061 args = expression.expressions 3062 3063 if not args: 3064 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3065 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3066 3067 # Build placeholder values for template 3068 lengths = [exp.Length(this=arg) for arg in args] 3069 max_len = ( 3070 lengths[0] 3071 if len(lengths) == 1 3072 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3073 ) 3074 3075 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3076 empty_struct = exp.func( 3077 "STRUCT", 3078 *[ 3079 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3080 for i in range(len(args)) 3081 ], 3082 ) 3083 3084 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3085 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3086 index = exp.column("__i") + 1 3087 transform_struct = exp.func( 3088 "STRUCT", 3089 *[ 3090 exp.PropertyEQ( 3091 this=exp.Literal.string(f"${i + 1}"), 3092 expression=exp.func("COALESCE", arg, exp.array())[index], 3093 ) 3094 for i, arg in enumerate(args) 3095 ], 3096 ) 3097 3098 result = exp.replace_placeholders( 3099 self.ARRAYS_ZIP_TEMPLATE.copy(), 3100 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3101 all_empty_check=exp.and_( 3102 *[ 3103 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3104 for arg in args 3105 ] 
3106 ), 3107 empty_struct=empty_struct, 3108 max_len=max_len, 3109 transform_struct=transform_struct, 3110 ) 3111 return self.sql(result) 3112 3113 def lower_sql(self, expression: exp.Lower) -> str: 3114 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3115 return _gen_with_cast_to_blob(self, expression, result_sql) 3116 3117 def upper_sql(self, expression: exp.Upper) -> str: 3118 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3119 return _gen_with_cast_to_blob(self, expression, result_sql) 3120 3121 def reverse_sql(self, expression: exp.Reverse) -> str: 3122 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3123 return _gen_with_cast_to_blob(self, expression, result_sql) 3124 3125 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3126 # DuckDB TO_BASE64 requires BLOB input 3127 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3128 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3129 result = expression.this 3130 3131 # Check if input is a string type - ENCODE only accepts VARCHAR 3132 if result.is_type(*exp.DataType.TEXT_TYPES): 3133 result = exp.Encode(this=result) 3134 3135 result = exp.ToBase64(this=result) 3136 3137 max_line_length = expression.args.get("max_line_length") 3138 alphabet = expression.args.get("alphabet") 3139 3140 # Handle custom alphabet by replacing standard chars with custom ones 3141 result = _apply_base64_alphabet_replacements(result, alphabet) 3142 3143 # Handle max_line_length by inserting newlines every N characters 3144 line_length = ( 3145 t.cast(int, max_line_length.to_py()) 3146 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3147 else 0 3148 ) 3149 if line_length > 0: 3150 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3151 result = exp.Trim( 3152 this=exp.RegexpReplace( 3153 this=result, 3154 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3155 replacement=exp.Concat( 3156 expressions=[exp.Literal.string("\\1"), newline.copy()] 3157 ), 3158 ), 3159 expression=newline, 3160 position="TRAILING", 3161 ) 3162 3163 return self.sql(result) 3164 3165 def replace_sql(self, expression: exp.Replace) -> str: 3166 result_sql = self.func( 3167 "REPLACE", 3168 _cast_to_varchar(expression.this), 3169 _cast_to_varchar(expression.expression), 3170 _cast_to_varchar(expression.args.get("replacement")), 3171 ) 3172 return _gen_with_cast_to_blob(self, expression, result_sql) 3173 3174 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3175 _prepare_binary_bitwise_args(expression) 3176 result_sql = self.binary(expression, op) 3177 return _gen_with_cast_to_blob(self, expression, result_sql) 3178 3179 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3180 _prepare_binary_bitwise_args(expression) 3181 result_sql = self.func("XOR", expression.this, expression.expression) 3182 return _gen_with_cast_to_blob(self, expression, result_sql) 3183 3184 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3185 this = expression.this 3186 key = expression.args.get("key") 3187 key_sql = key.name if isinstance(key, exp.Expression) else "" 3188 value_sql = self.sql(expression, "value") 3189 3190 kv_sql = f"{key_sql} := {value_sql}" 3191 3192 # If the input struct is empty e.g. 
transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3193 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3194 if isinstance(this, exp.Struct) and not this.expressions: 3195 return self.func("STRUCT_PACK", kv_sql) 3196 3197 return self.func("STRUCT_INSERT", this, kv_sql) 3198 3199 def mapcat_sql(self, expression: exp.MapCat) -> str: 3200 result = exp.replace_placeholders( 3201 self.MAPCAT_TEMPLATE.copy(), 3202 map1=expression.this, 3203 map2=expression.expression, 3204 ) 3205 return self.sql(result) 3206 3207 def startswith_sql(self, expression: exp.StartsWith) -> str: 3208 return self.func( 3209 "STARTS_WITH", 3210 _cast_to_varchar(expression.this), 3211 _cast_to_varchar(expression.expression), 3212 ) 3213 3214 def space_sql(self, expression: exp.Space) -> str: 3215 # DuckDB's REPEAT requires BIGINT for the count parameter 3216 return self.sql( 3217 exp.Repeat( 3218 this=exp.Literal.string(" "), 3219 times=exp.cast(expression.this, exp.DataType.Type.BIGINT), 3220 ) 3221 ) 3222 3223 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3224 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3225 if isinstance(expression.this, exp.Generator): 3226 # Preserve alias, joins, and other table-level args 3227 table = exp.Table( 3228 this=expression.this, 3229 alias=expression.args.get("alias"), 3230 joins=expression.args.get("joins"), 3231 ) 3232 return self.sql(table) 3233 3234 return super().tablefromrows_sql(expression) 3235 3236 def unnest_sql(self, expression: exp.Unnest) -> str: 3237 explode_array = expression.args.get("explode_array") 3238 if explode_array: 3239 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3240 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3241 expression.expressions.append( 3242 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3243 ) 3244 3245 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3246 alias = expression.args.get("alias") 3247 if isinstance(alias, exp.TableAlias): 3248 expression.set("alias", None) 3249 if alias.columns: 3250 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3251 3252 unnest_sql = super().unnest_sql(expression) 3253 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3254 return self.sql(select) 3255 3256 return super().unnest_sql(expression) 3257 3258 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3259 this = expression.this 3260 3261 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3262 # DuckDB should render IGNORE NULLS only for the general-purpose 3263 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 3264 return super().ignorenulls_sql(expression) 3265 3266 if isinstance(this, exp.First): 3267 this = exp.AnyValue(this=this.this) 3268 3269 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3270 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3271 3272 return self.sql(this) 3273 3274 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3275 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3276 # DuckDB should render RESPECT NULLS only for the general-purpose 3277 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 
3278 return super().respectnulls_sql(expression) 3279 3280 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3281 return self.sql(expression, "this") 3282 3283 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3284 this = self.sql(expression, "this") 3285 null_text = self.sql(expression, "null") 3286 3287 if null_text: 3288 this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))" 3289 3290 return self.func("ARRAY_TO_STRING", this, expression.expression) 3291 3292 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 3293 this = expression.this 3294 group = expression.args.get("group") 3295 params = expression.args.get("parameters") 3296 position = expression.args.get("position") 3297 occurrence = expression.args.get("occurrence") 3298 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3299 3300 if position and (not position.is_int or position.to_py() > 1): 3301 this = exp.Substring(this=this, start=position) 3302 3303 if null_if_pos_overflow: 3304 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3305 3306 # Do not render group if there is no following argument, 3307 # and it's the default value for this dialect 3308 if ( 3309 not params 3310 and group 3311 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3312 ): 3313 group = None 3314 3315 if occurrence and (not occurrence.is_int or occurrence.to_py() > 1): 3316 return self.func( 3317 "ARRAY_EXTRACT", 3318 self.func("REGEXP_EXTRACT_ALL", this, expression.expression, group, params), 3319 exp.Literal.number(occurrence), 3320 ) 3321 3322 return self.func("REGEXP_EXTRACT", this, expression.expression, group, params) 3323 3324 @unsupported_args("culture") 3325 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 3326 fmt = expression.args.get("format") 3327 if fmt and fmt.is_int: 3328 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 3329 3330 self.unsupported("Only integer formats are supported by NumberToStr") 3331 return self.function_fallback_sql(expression) 3332 3333 def autoincrementcolumnconstraint_sql(self, _) -> str: 3334 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 3335 return "" 3336 3337 def aliases_sql(self, expression: exp.Aliases) -> str: 3338 this = expression.this 3339 if isinstance(this, exp.Posexplode): 3340 return self.posexplode_sql(this) 3341 3342 return super().aliases_sql(expression) 3343 3344 def posexplode_sql(self, expression: exp.Posexplode) -> str: 3345 this = expression.this 3346 parent = expression.parent 3347 3348 # The default Spark aliases are "pos" and "col", unless specified otherwise 3349 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 3350 3351 if isinstance(parent, exp.Aliases): 3352 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 3353 pos, col = parent.expressions 3354 elif isinstance(parent, exp.Table): 3355 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 3356 alias = parent.args.get("alias") 3357 if alias: 3358 pos, col = alias.columns or [pos, col] 3359 alias.pop() 3360 3361 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 3362 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 3363 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 3364 gen_subscripts = self.sql( 3365 exp.Alias( 3366 this=exp.Anonymous( 3367 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 3368 ) 3369 - exp.Literal.number(1), 3370 alias=pos, 
3371 ) 3372 ) 3373 3374 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 3375 3376 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 3377 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 3378 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 3379 3380 return posexplode_sql 3381 3382 def addmonths_sql(self, expression: exp.AddMonths) -> str: 3383 """ 3384 Handles three key issues: 3385 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 3386 2. End-of-month preservation: If input is last day of month, result is last day of result month 3387 3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 3388 """ 3389 from sqlglot.optimizer.annotate_types import annotate_types 3390 3391 this = expression.this 3392 if not this.type: 3393 this = annotate_types(this, dialect=self.dialect) 3394 3395 if this.is_type(*exp.DataType.TEXT_TYPES): 3396 this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP)) 3397 3398 # Detect float/decimal months to apply rounding (Snowflake behavior) 3399 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 3400 months_expr = expression.expression 3401 if not months_expr.type: 3402 months_expr = annotate_types(months_expr, dialect=self.dialect) 3403 3404 # Build interval or to_months expression based on type 3405 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 3406 interval_or_to_months = ( 3407 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 3408 if months_expr.is_type( 3409 exp.DataType.Type.FLOAT, 3410 exp.DataType.Type.DOUBLE, 3411 exp.DataType.Type.DECIMAL, 3412 ) 3413 # Integer case: standard INTERVAL N MONTH syntax 3414 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 3415 ) 3416 3417 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 3418 3419 # Apply end-of-month preservation if Snowflake flag is set 3420 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 3421 preserve_eom = expression.args.get("preserve_end_of_month") 3422 result_expr = ( 3423 exp.case() 3424 .when( 3425 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 3426 exp.func("LAST_DAY", date_add_expr), 3427 ) 3428 .else_(date_add_expr) 3429 if preserve_eom 3430 else date_add_expr 3431 ) 3432 3433 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 3434 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 3435 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 3436 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 3437 if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ): 3438 return self.sql(exp.Cast(this=result_expr, to=this.type)) 3439 return self.sql(result_expr) 3440 3441 def format_sql(self, expression: exp.Format) -> str: 3442 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 3443 return self.func("FORMAT", "'{}'", expression.expressions[0]) 3444 3445 return self.function_fallback_sql(expression) 3446 3447 def hexstring_sql( 3448 self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None 3449 ) -> str: 3450 # UNHEX('FF') correctly produces blob \xFF in DuckDB 3451 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 3452 3453 def 
datetrunc_sql(self, expression: exp.DateTrunc) -> str: 3454 unit = unit_to_str(expression) 3455 date = expression.this 3456 result = self.func("DATE_TRUNC", unit, date) 3457 3458 if ( 3459 expression.args.get("input_type_preserved") 3460 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 3461 and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE)) 3462 ): 3463 return self.sql(exp.Cast(this=result, to=date.type)) 3464 3465 return result 3466 3467 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 3468 unit = unit_to_str(expression) 3469 zone = expression.args.get("zone") 3470 timestamp = expression.this 3471 date_unit = is_date_unit(unit) 3472 3473 if date_unit and zone: 3474 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 3475 # Double AT TIME ZONE needed for BigQuery compatibility: 3476 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 3477 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 3478 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 3479 result_sql = self.func("DATE_TRUNC", unit, timestamp) 3480 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 3481 3482 result = self.func("DATE_TRUNC", unit, timestamp) 3483 if expression.args.get("input_type_preserved"): 3484 if timestamp.type and timestamp.is_type( 3485 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 3486 ): 3487 dummy_date = exp.Cast( 3488 this=exp.Literal.string("1970-01-01"), 3489 to=exp.DataType(this=exp.DataType.Type.DATE), 3490 ) 3491 date_time = exp.Add(this=dummy_date, expression=timestamp) 3492 result = self.func("DATE_TRUNC", unit, date_time) 3493 return self.sql(exp.Cast(this=result, to=timestamp.type)) 3494 3495 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 3496 date_unit and timestamp.is_type(exp.DataType.Type.DATE) 3497 ): 3498 return self.sql(exp.Cast(this=result, to=timestamp.type)) 3499 3500 return result 3501 3502 def trim_sql(self, expression: exp.Trim) -> str: 3503 expression.this.replace(_cast_to_varchar(expression.this)) 3504 if expression.expression: 3505 expression.expression.replace(_cast_to_varchar(expression.expression)) 3506 3507 result_sql = super().trim_sql(expression) 3508 return _gen_with_cast_to_blob(self, expression, result_sql) 3509 3510 def round_sql(self, expression: exp.Round) -> str: 3511 this = expression.this 3512 decimals = expression.args.get("decimals") 3513 truncate = expression.args.get("truncate") 3514 3515 # DuckDB requires the scale (decimals) argument to be an INT 3516 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 3517 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 3518 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 3519 decimals = exp.cast(decimals, exp.DataType.Type.INT) 3520 3521 func = "ROUND" 3522 if truncate: 3523 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 3524 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 3525 func = "ROUND_EVEN" 3526 truncate = None 3527 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 3528 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 3529 truncate = None 3530 3531 return self.func(func, this, decimals, truncate) 3532 3533 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 3534 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 
3535 3536 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 3537 if expression.is_type(*exp.DataType.REAL_TYPES): 3538 result = f"CAST({result} AS DOUBLE)" 3539 3540 return result 3541 3542 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 3543 """ 3544 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 3545 dividing the input distribution into n equal-sized buckets. 3546 3547 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 3548 does not document the specific algorithm used so results may differ. DuckDB does not 3549 support RESPECT NULLS. 3550 """ 3551 this = expression.this 3552 if isinstance(this, exp.Distinct): 3553 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 3554 if len(this.expressions) < 2: 3555 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 3556 return self.function_fallback_sql(expression) 3557 num_quantiles_expr = this.expressions[1].pop() 3558 else: 3559 num_quantiles_expr = expression.expression 3560 3561 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 3562 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 3563 return self.function_fallback_sql(expression) 3564 3565 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 3566 if num_quantiles <= 0: 3567 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 3568 return self.function_fallback_sql(expression) 3569 3570 quantiles = [ 3571 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 3572 for i in range(num_quantiles + 1) 3573 ] 3574 3575 return self.sql( 3576 exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles)) 3577 ) 3578 3579 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 3580 if expression.args.get("scalar_only"): 3581 expression = exp.JSONExtractScalar( 3582 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 3583 ) 3584 return _arrow_json_extract_sql(self, expression) 3585 3586 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 3587 this = expression.this 3588 3589 if _is_binary(this): 3590 expression.type = exp.DataType.build("BINARY") 3591 3592 arg = _cast_to_bit(this) 3593 3594 if isinstance(this, exp.Neg): 3595 arg = exp.Paren(this=arg) 3596 3597 expression.set("this", arg) 3598 3599 result_sql = f"~{self.sql(expression, 'this')}" 3600 3601 return _gen_with_cast_to_blob(self, expression, result_sql) 3602 3603 def window_sql(self, expression: exp.Window) -> str: 3604 this = expression.this 3605 if isinstance(this, exp.Corr) or ( 3606 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 3607 ): 3608 return self._corr_sql(expression) 3609 3610 return super().window_sql(expression) 3611 3612 def filter_sql(self, expression: exp.Filter) -> str: 3613 if isinstance(expression.this, exp.Corr): 3614 return self._corr_sql(expression) 3615 3616 return super().filter_sql(expression) 3617 3618 def _corr_sql( 3619 self, 3620 expression: t.Union[exp.Filter, exp.Window, exp.Corr], 3621 ) -> str: 3622 if isinstance(expression, exp.Corr) and not expression.args.get( 3623 "null_on_zero_variance" 3624 ): 3625 return self.func("CORR", expression.this, expression.expression) 3626 3627 corr_expr = _maybe_corr_null_to_false(expression) 3628 if corr_expr is None: 3629 if isinstance(expression, exp.Window): 3630 return super().window_sql(expression) 3631 if isinstance(expression, 
exp.Filter): 3632 return super().filter_sql(expression) 3633 corr_expr = expression # make mypy happy 3634 3635 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
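The generator methods in the listing above are normally reached through sqlglot's top-level transpile API rather than called directly. A minimal sketch of exercising a few of them from Snowflake input; the exact SQL emitted depends on the installed sqlglot version, so the snippet only prints whatever it produces:

import sqlglot

# Snowflake -> DuckDB statements that go through timeslice_sql, bitmapbitposition_sql
# and uniform_sql above. Outputs are printed rather than asserted because the
# generated SQL can change between releases.
for sql in (
    "SELECT TIME_SLICE(ts, 15, 'MINUTE', 'END') FROM t",
    "SELECT BITMAP_BIT_POSITION(n) FROM t",
    "SELECT UNIFORM(1, 10, RANDOM()) FROM t",
):
    print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])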
Default NULL ordering method to use if not explicitly set.
Possible values: "nulls_are_small", "nulls_are_large", "nulls_are_last"
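As an illustration of how the default NULL ordering surfaces during transpilation, the sketch below sends a DuckDB ORDER BY to dialects whose defaults may differ; whether explicit NULLS FIRST/LAST markers are added is left to the installed version, so the results are only printed:

import sqlglot

# Inspect whether an explicit NULLS FIRST/LAST is emitted when the target dialect's
# default NULL ordering differs from DuckDB's.
print(sqlglot.transpile("SELECT a FROM t ORDER BY a", read="duckdb", write="postgres")[0])
print(sqlglot.transpile("SELECT a FROM t ORDER BY a DESC", read="duckdb", write="spark")[0])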
A NULL arg in CONCAT yields NULL by default, but in some dialects it yields an empty string.
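One way to see this flag in practice is to transpile a CONCAT call between dialects with different NULL semantics and check whether sqlglot introduces any coalescing; the rewrite (if any) is version dependent, so the sketch only prints the output:

import sqlglot

# CONCAT moving between dialects with different NULL-handling defaults.
print(sqlglot.transpile("SELECT CONCAT(a, b, c)", read="snowflake", write="duckdb")[0])
print(sqlglot.transpile("SELECT CONCAT(a, b, c)", read="duckdb", write="postgres")[0])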
Whether ORDER BY ALL is supported (it expands to all the selected columns), as in DuckDB and Spark3/Databricks.
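A short sketch: ORDER BY ALL parses natively when read as DuckDB, and transpiling to a dialect without the shorthand may require the selected columns to be spelled out; the output is printed rather than assumed:

import sqlglot

sql = "SELECT a, b FROM t ORDER BY ALL"
print(sqlglot.parse_one(sql, read="duckdb").sql(dialect="duckdb"))
# Dialects lacking ORDER BY ALL may need the sort columns expanded explicitly.
print(sqlglot.transpile(sql, read="duckdb", write="mysql")[0])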
Whether expressions such as x::INT[5] should be parsed as fixed-size array definitions/casts, as in DuckDB. In dialects that don't support fixed-size arrays, such as Snowflake, the same syntax is interpreted as a subscript/index operator.
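For instance, the same x::INT[5] text yields different trees depending on the reading dialect; the sketch below only prints the parsed expressions for inspection:

import sqlglot

# DuckDB: INT[5] is a fixed-size array type, so the whole thing is a cast.
print(repr(sqlglot.parse_one("SELECT x::INT[5] FROM t", read="duckdb")))
# Snowflake has no fixed-size arrays, so the same text reads as a subscript
# applied to the result of x::INT.
print(repr(sqlglot.parse_one("SELECT x::INT[5] FROM t", read="snowflake")))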
Whether failing to parse a JSON path expression using the JSONPath dialect will log a warning.
Whether number literals can include underscores for better readability.
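A small example of the underscore-separated literal support; the exact formatting of the re-generated literal is left to the installed version:

import sqlglot

# 1_000_000 tokenizes as a single number literal under DuckDB's rules.
expr = sqlglot.parse_one("SELECT 1_000_000 AS n", read="duckdb")
print(repr(expr.find(sqlglot.exp.Literal)))
print(expr.sql(dialect="duckdb"))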
Specifies the strategy according to which identifiers should be normalized.
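To see the chosen strategy in action, one can run sqlglot's identifier normalization against a DuckDB expression; the sketch prints rather than asserts, since the concrete strategy lives in the dialect definition:

import sqlglot
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

expr = sqlglot.parse_one('SELECT "MyCol", other_col FROM MyTable', read="duckdb")
print(normalize_identifiers(expr, dialect="duckdb").sql(dialect="duckdb"))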
1313 def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 1314 if isinstance(path, exp.Literal): 1315 # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`. 1316 # Additionally, it allows accessing the back of lists using the `[#-i]` syntax. 1317 # This check ensures we'll avoid trying to parse these as JSON paths, which can 1318 # either result in a noisy warning or in an invalid representation of the path. 1319 path_text = path.name 1320 if path_text.startswith("/") or "[#" in path_text: 1321 return path 1322 1323 return super().to_json_path(path)
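A sketch of the behavior this method guards: JSON pointer style paths (leading "/") and back-of-list access ("[#-i]") are passed through untouched, while ordinary paths still go through JSONPath parsing. The outputs are printed for inspection only:

import sqlglot

# Standard path: parsed as a JSONPath expression.
print(sqlglot.transpile("SELECT j -> '$.a[0]' FROM t", read="duckdb", write="duckdb")[0])
# JSON pointer / back-of-list syntax: returned verbatim by to_json_path.
print(sqlglot.transpile("SELECT j -> '/a/0' FROM t", read="duckdb", write="duckdb")[0])
print(sqlglot.transpile("SELECT j -> '$.arr[#-1]' FROM t", read="duckdb", write="duckdb")[0])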
Mapping of an escaped sequence (the two-character text \n) to its unescaped version (a literal newline character).
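DuckDB's E'...' strings (registered as BYTE_STRINGS in the Tokenizer below) are one place where escaped sequences like \n come into play. A small, non-authoritative illustration that prints the parsed tree so the unescaping behavior can be inspected:

import sqlglot

# Inspect how (or whether) the two-character \n sequence is unescaped in the AST.
expr = sqlglot.parse_one(r"SELECT E'line1\nline2' AS s", read="duckdb")
print(repr(expr))
print(expr.sql(dialect="duckdb"))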
1325 class Tokenizer(tokens.Tokenizer): 1326 BYTE_STRINGS = [("e'", "'"), ("E'", "'")] 1327 BYTE_STRING_ESCAPES = ["'", "\\"] 1328 HEREDOC_STRINGS = ["$"] 1329 1330 HEREDOC_TAG_IS_IDENTIFIER = True 1331 HEREDOC_STRING_ALTERNATIVE = TokenType.PARAMETER 1332 1333 KEYWORDS = { 1334 **tokens.Tokenizer.KEYWORDS, 1335 "//": TokenType.DIV, 1336 "**": TokenType.DSTAR, 1337 "^@": TokenType.CARET_AT, 1338 "@>": TokenType.AT_GT, 1339 "<@": TokenType.LT_AT, 1340 "ATTACH": TokenType.ATTACH, 1341 "BINARY": TokenType.VARBINARY, 1342 "BITSTRING": TokenType.BIT, 1343 "BPCHAR": TokenType.TEXT, 1344 "CHAR": TokenType.TEXT, 1345 "DATETIME": TokenType.TIMESTAMPNTZ, 1346 "DETACH": TokenType.DETACH, 1347 "FORCE": TokenType.FORCE, 1348 "INSTALL": TokenType.INSTALL, 1349 "INT8": TokenType.BIGINT, 1350 "LOGICAL": TokenType.BOOLEAN, 1351 "MACRO": TokenType.FUNCTION, 1352 "ONLY": TokenType.ONLY, 1353 "PIVOT_WIDER": TokenType.PIVOT, 1354 "POSITIONAL": TokenType.POSITIONAL, 1355 "RESET": TokenType.COMMAND, 1356 "ROW": TokenType.STRUCT, 1357 "SIGNED": TokenType.INT, 1358 "STRING": TokenType.TEXT, 1359 "SUMMARIZE": TokenType.SUMMARIZE, 1360 "TIMESTAMP": TokenType.TIMESTAMPNTZ, 1361 "TIMESTAMP_S": TokenType.TIMESTAMP_S, 1362 "TIMESTAMP_MS": TokenType.TIMESTAMP_MS, 1363 "TIMESTAMP_NS": TokenType.TIMESTAMP_NS, 1364 "TIMESTAMP_US": TokenType.TIMESTAMP, 1365 "UBIGINT": TokenType.UBIGINT, 1366 "UINTEGER": TokenType.UINT, 1367 "USMALLINT": TokenType.USMALLINT, 1368 "UTINYINT": TokenType.UTINYINT, 1369 "VARCHAR": TokenType.TEXT, 1370 } 1371 KEYWORDS.pop("/*+") 1372 1373 SINGLE_TOKENS = { 1374 **tokens.Tokenizer.SINGLE_TOKENS, 1375 "$": TokenType.PARAMETER, 1376 } 1377 1378 COMMANDS = tokens.Tokenizer.COMMANDS - {TokenType.SHOW}
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- STRING_ESCAPES
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- IDENTIFIER_ESCAPES
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
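A few of the DuckDB-specific tokens registered in the Tokenizer above, exercised end to end; the exact rewrites are printed rather than assumed:

import sqlglot

# '**' maps to DSTAR (power), '^@' to CARET_AT (starts-with) and '//' to integer division.
for sql in ("SELECT 2 ** 10", "SELECT name ^@ 'Du' FROM t", "SELECT 7 // 2"):
    print(sqlglot.transpile(sql, read="duckdb", write="postgres")[0])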
1380 class Parser(parser.Parser): 1381 MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS = True 1382 1383 BITWISE = parser.Parser.BITWISE.copy() 1384 BITWISE.pop(TokenType.CARET) 1385 1386 RANGE_PARSERS = { 1387 **parser.Parser.RANGE_PARSERS, 1388 TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps), 1389 TokenType.CARET_AT: binary_range_parser(exp.StartsWith), 1390 TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch), 1391 } 1392 1393 EXPONENT = { 1394 **parser.Parser.EXPONENT, 1395 TokenType.CARET: exp.Pow, 1396 TokenType.DSTAR: exp.Pow, 1397 } 1398 1399 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "STRUCT_PACK"} 1400 1401 SHOW_PARSERS = { 1402 "TABLES": _show_parser("TABLES"), 1403 "ALL TABLES": _show_parser("ALL TABLES"), 1404 } 1405 1406 FUNCTIONS = { 1407 **parser.Parser.FUNCTIONS, 1408 "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)), 1409 "ARRAY_PREPEND": _build_array_prepend, 1410 "ARRAY_REVERSE_SORT": _build_sort_array_desc, 1411 "ARRAY_SORT": exp.SortArray.from_arg_list, 1412 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 1413 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 1414 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 1415 "DATEDIFF": _build_date_diff, 1416 "DATE_DIFF": _build_date_diff, 1417 "DATE_TRUNC": date_trunc_to_time, 1418 "DATETRUNC": date_trunc_to_time, 1419 "DECODE": lambda args: exp.Decode( 1420 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1421 ), 1422 "EDITDIST3": exp.Levenshtein.from_arg_list, 1423 "ENCODE": lambda args: exp.Encode( 1424 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1425 ), 1426 "EPOCH": exp.TimeToUnix.from_arg_list, 1427 "EPOCH_MS": lambda args: exp.UnixToTime( 1428 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 1429 ), 1430 "GENERATE_SERIES": _build_generate_series(), 1431 "GET_BIT": lambda args: exp.Getbit( 1432 this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True 1433 ), 1434 "JSON": exp.ParseJSON.from_arg_list, 1435 "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract), 1436 "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar), 1437 "LIST_APPEND": exp.ArrayAppend.from_arg_list, 1438 "LIST_CONCAT": parser.build_array_concat, 1439 "LIST_CONTAINS": exp.ArrayContains.from_arg_list, 1440 "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list, 1441 "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list, 1442 "LIST_FILTER": exp.ArrayFilter.from_arg_list, 1443 "LIST_HAS": exp.ArrayContains.from_arg_list, 1444 "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list, 1445 "LIST_PREPEND": _build_array_prepend, 1446 "LIST_REVERSE_SORT": _build_sort_array_desc, 1447 "LIST_SORT": exp.SortArray.from_arg_list, 1448 "LIST_TRANSFORM": exp.Transform.from_arg_list, 1449 "LIST_VALUE": lambda args: exp.Array(expressions=args), 1450 "MAKE_DATE": exp.DateFromParts.from_arg_list, 1451 "MAKE_TIME": exp.TimeFromParts.from_arg_list, 1452 "MAKE_TIMESTAMP": _build_make_timestamp, 1453 "QUANTILE_CONT": exp.PercentileCont.from_arg_list, 1454 "QUANTILE_DISC": exp.PercentileDisc.from_arg_list, 1455 "RANGE": _build_generate_series(end_exclusive=True), 1456 "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract), 1457 "REGEXP_EXTRACT_ALL": build_regexp_extract(exp.RegexpExtractAll), 1458 "REGEXP_MATCHES": exp.RegexpLike.from_arg_list, 1459 "REGEXP_REPLACE": lambda args: exp.RegexpReplace( 1460 this=seq_get(args, 0), 1461 expression=seq_get(args, 1), 1462 replacement=seq_get(args, 2), 1463 modifiers=seq_get(args, 3), 1464 single_replace=True, 
1465 ), 1466 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 1467 "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"), 1468 "STRING_SPLIT": exp.Split.from_arg_list, 1469 "STRING_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1470 "STRING_TO_ARRAY": exp.Split.from_arg_list, 1471 "STRPTIME": build_formatted_time(exp.StrToTime, "duckdb"), 1472 "STRUCT_PACK": exp.Struct.from_arg_list, 1473 "STR_SPLIT": exp.Split.from_arg_list, 1474 "STR_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1475 "TIME_BUCKET": exp.DateBin.from_arg_list, 1476 "TO_TIMESTAMP": exp.UnixToTime.from_arg_list, 1477 "UNNEST": exp.Explode.from_arg_list, 1478 "XOR": binary_from_function(exp.BitwiseXor), 1479 } 1480 1481 FUNCTIONS.pop("DATE_SUB") 1482 FUNCTIONS.pop("GLOB") 1483 1484 FUNCTION_PARSERS = { 1485 **parser.Parser.FUNCTION_PARSERS, 1486 **dict.fromkeys( 1487 ("GROUP_CONCAT", "LISTAGG", "STRINGAGG"), lambda self: self._parse_string_agg() 1488 ), 1489 } 1490 FUNCTION_PARSERS.pop("DECODE") 1491 1492 NO_PAREN_FUNCTION_PARSERS = { 1493 **parser.Parser.NO_PAREN_FUNCTION_PARSERS, 1494 "MAP": lambda self: self._parse_map(), 1495 "@": lambda self: exp.Abs(this=self._parse_bitwise()), 1496 } 1497 1498 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 1499 TokenType.SEMI, 1500 TokenType.ANTI, 1501 } 1502 1503 PLACEHOLDER_PARSERS = { 1504 **parser.Parser.PLACEHOLDER_PARSERS, 1505 TokenType.PARAMETER: lambda self: ( 1506 self.expression(exp.Placeholder, this=self._prev.text) 1507 if self._match(TokenType.NUMBER) or self._match_set(self.ID_VAR_TOKENS) 1508 else None 1509 ), 1510 } 1511 1512 TYPE_CONVERTERS = { 1513 # https://duckdb.org/docs/sql/data_types/numeric 1514 exp.DataType.Type.DECIMAL: build_default_decimal_type(precision=18, scale=3), 1515 # https://duckdb.org/docs/sql/data_types/text 1516 exp.DataType.Type.TEXT: lambda dtype: exp.DataType.build("TEXT"), 1517 } 1518 1519 STATEMENT_PARSERS = { 1520 **parser.Parser.STATEMENT_PARSERS, 1521 TokenType.ATTACH: lambda self: self._parse_attach_detach(), 1522 TokenType.DETACH: lambda self: self._parse_attach_detach(is_attach=False), 1523 TokenType.FORCE: lambda self: self._parse_force(), 1524 TokenType.INSTALL: lambda self: self._parse_install(), 1525 TokenType.SHOW: lambda self: self._parse_show(), 1526 } 1527 1528 SET_PARSERS = { 1529 **parser.Parser.SET_PARSERS, 1530 "VARIABLE": lambda self: self._parse_set_item_assignment("VARIABLE"), 1531 } 1532 1533 def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]: 1534 index = self._index 1535 if not self._match_text_seq("LAMBDA"): 1536 return super()._parse_lambda(alias=alias) 1537 1538 expressions = self._parse_csv(self._parse_lambda_arg) 1539 if not self._match(TokenType.COLON): 1540 self._retreat(index) 1541 return None 1542 1543 this = self._replace_lambda(self._parse_assignment(), expressions) 1544 return self.expression(exp.Lambda, this=this, expressions=expressions, colon=True) 1545 1546 def _parse_expression(self) -> t.Optional[exp.Expression]: 1547 # DuckDB supports prefix aliases, e.g. 
foo: 1 1548 if self._next and self._next.token_type == TokenType.COLON: 1549 alias = self._parse_id_var(tokens=self.ALIAS_TOKENS) 1550 self._match(TokenType.COLON) 1551 comments = self._prev_comments or [] 1552 1553 this = self._parse_assignment() 1554 if isinstance(this, exp.Expression): 1555 # Moves the comment next to the alias in `alias: expr /* comment */` 1556 comments += this.pop_comments() or [] 1557 1558 return self.expression(exp.Alias, comments=comments, this=this, alias=alias) 1559 1560 return super()._parse_expression() 1561 1562 def _parse_table( 1563 self, 1564 schema: bool = False, 1565 joins: bool = False, 1566 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 1567 parse_bracket: bool = False, 1568 is_db_reference: bool = False, 1569 parse_partition: bool = False, 1570 consume_pipe: bool = False, 1571 ) -> t.Optional[exp.Expression]: 1572 # DuckDB supports prefix aliases, e.g. FROM foo: bar 1573 if self._next and self._next.token_type == TokenType.COLON: 1574 alias = self._parse_table_alias( 1575 alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS 1576 ) 1577 self._match(TokenType.COLON) 1578 comments = self._prev_comments or [] 1579 else: 1580 alias = None 1581 comments = [] 1582 1583 table = super()._parse_table( 1584 schema=schema, 1585 joins=joins, 1586 alias_tokens=alias_tokens, 1587 parse_bracket=parse_bracket, 1588 is_db_reference=is_db_reference, 1589 parse_partition=parse_partition, 1590 ) 1591 if isinstance(table, exp.Expression) and isinstance(alias, exp.TableAlias): 1592 # Moves the comment next to the alias in `alias: table /* comment */` 1593 comments += table.pop_comments() or [] 1594 alias.comments = alias.pop_comments() + comments 1595 table.set("alias", alias) 1596 1597 return table 1598 1599 def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]: 1600 # https://duckdb.org/docs/sql/samples.html 1601 sample = super()._parse_table_sample(as_modifier=as_modifier) 1602 if sample and not sample.args.get("method"): 1603 if sample.args.get("size"): 1604 sample.set("method", exp.var("RESERVOIR")) 1605 else: 1606 sample.set("method", exp.var("SYSTEM")) 1607 1608 return sample 1609 1610 def _parse_bracket( 1611 self, this: t.Optional[exp.Expression] = None 1612 ) -> t.Optional[exp.Expression]: 1613 bracket = super()._parse_bracket(this) 1614 1615 if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket): 1616 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 1617 bracket.set("returns_list_for_maps", True) 1618 1619 return bracket 1620 1621 def _parse_map(self) -> exp.ToMap | exp.Map: 1622 if self._match(TokenType.L_BRACE, advance=False): 1623 return self.expression(exp.ToMap, this=self._parse_bracket()) 1624 1625 args = self._parse_wrapped_csv(self._parse_assignment) 1626 return self.expression(exp.Map, keys=seq_get(args, 0), values=seq_get(args, 1)) 1627 1628 def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]: 1629 return self._parse_field_def() 1630 1631 def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]: 1632 if len(aggregations) == 1: 1633 return super()._pivot_column_names(aggregations) 1634 return pivot_column_names(aggregations, dialect="duckdb") 1635 1636 def _parse_attach_detach(self, is_attach=True) -> exp.Attach | exp.Detach: 1637 def _parse_attach_option() -> exp.AttachOption: 1638 return self.expression( 1639 exp.AttachOption, 1640 this=self._parse_var(any_token=True), 1641 
expression=self._parse_field(any_token=True), 1642 ) 1643 1644 self._match(TokenType.DATABASE) 1645 exists = self._parse_exists(not_=is_attach) 1646 this = self._parse_alias(self._parse_primary_or_var(), explicit=True) 1647 1648 if self._match(TokenType.L_PAREN, advance=False): 1649 expressions = self._parse_wrapped_csv(_parse_attach_option) 1650 else: 1651 expressions = None 1652 1653 return ( 1654 self.expression(exp.Attach, this=this, exists=exists, expressions=expressions) 1655 if is_attach 1656 else self.expression(exp.Detach, this=this, exists=exists) 1657 ) 1658 1659 def _parse_show_duckdb(self, this: str) -> exp.Show: 1660 return self.expression(exp.Show, this=this) 1661 1662 def _parse_force(self) -> exp.Install | exp.Command: 1663 # FORCE can only be followed by INSTALL or CHECKPOINT 1664 # In the case of CHECKPOINT, we fallback 1665 if not self._match(TokenType.INSTALL): 1666 return self._parse_as_command(self._prev) 1667 1668 return self._parse_install(force=True) 1669 1670 def _parse_install(self, force: bool = False) -> exp.Install: 1671 return self.expression( 1672 exp.Install, 1673 this=self._parse_id_var(), 1674 from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None, 1675 force=force, 1676 ) 1677 1678 def _parse_primary(self) -> t.Optional[exp.Expression]: 1679 if self._match_pair(TokenType.HASH, TokenType.NUMBER): 1680 return exp.PositionalColumn(this=exp.Literal.number(self._prev.text)) 1681 1682 return super()._parse_primary()
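The overrides above add several DuckDB-specific parse entry points: prefix aliases (alias: expr), Python-style LAMBDA lambdas, MAP literals, positional #N columns, and ATTACH/INSTALL statements. A minimal usage sketch, assuming only the public sqlglot API; the AST shapes and round-tripped SQL noted in the comments are expectations based on the hooks above, not guaranteed output:

import sqlglot
from sqlglot import exp

# Prefix alias "foo: 1 + 2" is expected to parse into an exp.Alias (see _parse_expression).
tree = sqlglot.parse_one("SELECT foo: 1 + 2", read="duckdb")
print(isinstance(tree.selects[0], exp.Alias))

# Python-style lambda syntax is handled by _parse_lambda (colon=True); the Generator's
# lambda_sql is expected to keep the LAMBDA/colon form when writing DuckDB SQL.
print(sqlglot.parse_one("SELECT LIST_TRANSFORM([1, 2], LAMBDA x: x + 1)", read="duckdb").sql("duckdb"))

# FORCE INSTALL ... FROM ... is expected to be routed through _parse_force/_parse_install
# into an exp.Install node.
print(repr(sqlglot.parse_one("FORCE INSTALL httpfs FROM core_nightly", read="duckdb")))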
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
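A short, hedged sketch of how these arguments are typically supplied through the top-level API; the malformed query and the printed error payload are illustrative assumptions, not documented behavior:

import sqlglot
from sqlglot.errors import ErrorLevel, ParseError

try:
    sqlglot.parse_one(
        "SELECT FROM WHERE",            # malformed on purpose, intended to provoke parse errors
        read="duckdb",
        error_level=ErrorLevel.RAISE,   # collect errors, then raise at the end
        error_message_context=100,      # characters of query context per message
        max_errors=3,                    # cap on messages included in the ParseError
    )
except ParseError as err:
    print(err.errors)                   # structured details for the collected errors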
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- NESTED_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- ALIAS_TOKENS
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- TERM
- FACTOR
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- PROPERTY_PARSERS
- CONSTRAINT_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- INVALID_FUNC_NAME_TOKENS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- TYPE_LITERAL_PARSERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- SET_ASSIGNMENT_DELIMITERS
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- PREFIXED_PIVOT_COLUMNS
- IDENTIFY_PIVOT_STRINGS
- LOG_DEFAULTS_TO_LN
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- SUPPORTS_IMPLICIT_UNNEST
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ALTER_TABLE_PARTITIONS
- JOINS_HAVE_EQUAL_PRECEDENCE
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- ADD_JOIN_ON_TRUE
- SUPPORTS_OMITTED_INTERVAL_SPAN_UNIT
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
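Before the Generator class that follows, a brief illustrative sketch of how its settings and TRANSFORMS surface through transpilation. It assumes the public sqlglot.transpile API; the behaviors noted in the comments are expectations based on the settings defined below (for example STAR_EXCEPT = "EXCLUDE" and LAST_DAY_SUPPORTS_DATE_PART = False), not guaranteed output strings:

import sqlglot

# BigQuery's SELECT * EXCEPT (...) should come out in DuckDB's EXCLUDE form,
# per STAR_EXCEPT = "EXCLUDE" in the Generator below.
print(sqlglot.transpile("SELECT * EXCEPT (b) FROM t", read="bigquery", write="duckdb")[0])

# LAST_DAY with an explicit date part is expected to be rewritten via
# exp.LastDay -> _last_day_sql, since LAST_DAY_SUPPORTS_DATE_PART is False here.
print(sqlglot.transpile("SELECT LAST_DAY(d, 'year')", read="snowflake", write="duckdb")[0])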
1684 class Generator(generator.Generator): 1685 PARAMETER_TOKEN = "$" 1686 NAMED_PLACEHOLDER_TOKEN = "$" 1687 JOIN_HINTS = False 1688 TABLE_HINTS = False 1689 QUERY_HINTS = False 1690 LIMIT_FETCH = "LIMIT" 1691 STRUCT_DELIMITER = ("(", ")") 1692 RENAME_TABLE_WITH_DB = False 1693 NVL2_SUPPORTED = False 1694 SEMI_ANTI_JOIN_WITH_SIDE = False 1695 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1696 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1697 LAST_DAY_SUPPORTS_DATE_PART = False 1698 JSON_KEY_VALUE_PAIR_SEP = "," 1699 IGNORE_NULLS_IN_FUNC = True 1700 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1701 SUPPORTS_CREATE_TABLE_LIKE = False 1702 MULTI_ARG_DISTINCT = False 1703 CAN_IMPLEMENT_ARRAY_ANY = True 1704 SUPPORTS_TO_NUMBER = False 1705 SUPPORTS_WINDOW_EXCLUDE = True 1706 COPY_HAS_INTO_KEYWORD = False 1707 STAR_EXCEPT = "EXCLUDE" 1708 PAD_FILL_PATTERN_IS_REQUIRED = True 1709 ARRAY_SIZE_DIM_REQUIRED = False 1710 NORMALIZE_EXTRACT_DATE_PARTS = True 1711 SUPPORTS_LIKE_QUANTIFIERS = False 1712 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1713 1714 TRANSFORMS = { 1715 **generator.Generator.TRANSFORMS, 1716 exp.AnyValue: _anyvalue_sql, 1717 exp.ApproxDistinct: approx_count_distinct_sql, 1718 exp.Boolnot: _boolnot_sql, 1719 exp.Booland: _booland_sql, 1720 exp.Boolor: _boolor_sql, 1721 exp.Array: transforms.preprocess( 1722 [transforms.inherit_struct_field_names], 1723 generator=inline_array_unless_query, 1724 ), 1725 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1726 exp.ArrayCompact: array_compact_sql, 1727 exp.ArrayConstructCompact: lambda self, e: self.sql( 1728 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1729 ), 1730 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1731 exp.ArrayFilter: rename_func("LIST_FILTER"), 1732 exp.ArrayInsert: _array_insert_sql, 1733 exp.ArrayRemove: remove_from_array_using_filter, 1734 exp.ArraySort: _array_sort_sql, 1735 exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True), 1736 exp.ArraySum: rename_func("LIST_SUM"), 1737 exp.ArrayUniqueAgg: lambda self, e: self.func( 1738 "LIST", exp.Distinct(expressions=[e.this]) 1739 ), 1740 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1741 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1742 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1743 exp.BitwiseAndAgg: _bitwise_agg_sql, 1744 exp.BitwiseLeftShift: _bitshift_sql, 1745 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1746 exp.BitwiseOrAgg: _bitwise_agg_sql, 1747 exp.BitwiseRightShift: _bitshift_sql, 1748 exp.BitwiseXorAgg: _bitwise_agg_sql, 1749 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1750 exp.Corr: lambda self, e: self._corr_sql(e), 1751 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1752 exp.CurrentTime: lambda *_: "CURRENT_TIME", 1753 exp.CurrentTimestamp: lambda self, e: self.sql( 1754 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1755 ) 1756 if e.args.get("sysdate") 1757 else "CURRENT_TIMESTAMP", 1758 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1759 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1760 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1761 exp.DayOfWeekIso: rename_func("ISODOW"), 1762 exp.DayOfYear: rename_func("DAYOFYEAR"), 1763 exp.Dayname: lambda self, e: ( 1764 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1765 if e.args.get("abbreviated") 1766 else self.func("DAYNAME", e.this) 1767 ), 1768 exp.Monthname: lambda self, e: ( 1769 
self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1770 if e.args.get("abbreviated") 1771 else self.func("MONTHNAME", e.this) 1772 ), 1773 exp.DataType: _datatype_sql, 1774 exp.Date: _date_sql, 1775 exp.DateAdd: _date_delta_to_binary_interval_op(), 1776 exp.DateFromParts: _date_from_parts_sql, 1777 exp.DateSub: _date_delta_to_binary_interval_op(), 1778 exp.DateDiff: _date_diff_sql, 1779 exp.DateStrToDate: datestrtodate_sql, 1780 exp.Datetime: no_datetime_sql, 1781 exp.DatetimeDiff: _date_diff_sql, 1782 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1783 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1784 exp.DateToDi: lambda self, 1785 e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)", 1786 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1787 exp.DiToDate: lambda self, 1788 e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)", 1789 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1790 exp.EqualNull: lambda self, e: self.sql( 1791 exp.NullSafeEQ(this=e.this, expression=e.expression) 1792 ), 1793 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1794 exp.GenerateDateArray: _generate_datetime_array_sql, 1795 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1796 exp.Getbit: getbit_sql, 1797 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 1798 exp.Explode: rename_func("UNNEST"), 1799 exp.IntDiv: lambda self, e: self.binary(e, "//"), 1800 exp.IsInf: rename_func("ISINF"), 1801 exp.IsNan: rename_func("ISNAN"), 1802 exp.IsNullValue: lambda self, e: self.sql( 1803 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1804 ), 1805 exp.IsArray: lambda self, e: self.sql( 1806 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1807 ), 1808 exp.Ceil: _ceil_floor, 1809 exp.Floor: _ceil_floor, 1810 exp.JSONBExists: rename_func("JSON_EXISTS"), 1811 exp.JSONExtract: _arrow_json_extract_sql, 1812 exp.JSONExtractArray: _json_extract_value_array_sql, 1813 exp.JSONFormat: _json_format_sql, 1814 exp.JSONValueArray: _json_extract_value_array_sql, 1815 exp.Lateral: explode_to_unnest_sql, 1816 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1817 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1818 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1819 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1820 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1821 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1822 exp.BoolxorAgg: _boolxor_agg_sql, 1823 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1824 exp.Initcap: _initcap_sql, 1825 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1826 exp.SHA1Digest: lambda self, e: self.func("UNHEX", self.func("SHA1", e.this)), 1827 exp.SHA2Digest: lambda self, e: self.func("UNHEX", sha2_digest_sql(self, e)), 1828 exp.MonthsBetween: months_between_sql, 1829 exp.NextDay: _day_navigation_sql, 1830 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1831 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1832 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 
1833 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1834 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1835 exp.PreviousDay: _day_navigation_sql, 1836 exp.RegexpReplace: lambda self, e: self.func( 1837 "REGEXP_REPLACE", 1838 e.this, 1839 e.expression, 1840 e.args.get("replacement"), 1841 regexp_replace_global_modifier(e), 1842 ), 1843 exp.RegexpLike: rename_func("REGEXP_MATCHES"), 1844 exp.RegexpILike: lambda self, e: self.func( 1845 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1846 ), 1847 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1848 exp.RegrValx: _regr_val_sql, 1849 exp.RegrValy: _regr_val_sql, 1850 exp.Return: lambda self, e: self.sql(e, "this"), 1851 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1852 exp.Rand: rename_func("RANDOM"), 1853 exp.SHA2: sha256_sql, 1854 exp.Split: rename_func("STR_SPLIT"), 1855 exp.SortArray: _sort_array_sql, 1856 exp.StrPosition: strposition_sql, 1857 exp.StrToUnix: lambda self, e: self.func( 1858 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1859 ), 1860 exp.Struct: _struct_sql, 1861 exp.Transform: rename_func("LIST_TRANSFORM"), 1862 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1863 exp.TimeSub: _date_delta_to_binary_interval_op(), 1864 exp.Time: no_time_sql, 1865 exp.TimeDiff: _timediff_sql, 1866 exp.Timestamp: no_timestamp_sql, 1867 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1868 exp.TimestampDiff: lambda self, e: self.func( 1869 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1870 ), 1871 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1872 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)), 1873 exp.TimeStrToTime: timestrtotime_sql, 1874 exp.TimeStrToUnix: lambda self, e: self.func( 1875 "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP) 1876 ), 1877 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1878 exp.ToBoolean: _to_boolean_sql, 1879 exp.TimeToUnix: rename_func("EPOCH"), 1880 exp.TsOrDiToDi: lambda self, 1881 e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)", 1882 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1883 exp.TsOrDsDiff: lambda self, e: self.func( 1884 "DATE_DIFF", 1885 f"'{e.args.get('unit') or 'DAY'}'", 1886 exp.cast(e.expression, exp.DataType.Type.TIMESTAMP), 1887 exp.cast(e.this, exp.DataType.Type.TIMESTAMP), 1888 ), 1889 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1890 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1891 exp.UnixSeconds: lambda self, e: self.sql( 1892 exp.cast( 1893 self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT 1894 ) 1895 ), 1896 exp.UnixToStr: lambda self, e: self.func( 1897 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 1898 ), 1899 exp.DatetimeTrunc: lambda self, e: self.func( 1900 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DataType.Type.DATETIME) 1901 ), 1902 exp.UnixToTime: _unix_to_time_sql, 1903 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 1904 exp.VariancePop: rename_func("VAR_POP"), 1905 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 1906 exp.YearOfWeek: lambda self, e: self.sql( 1907 exp.Extract( 1908 this=exp.Var(this="ISOYEAR"), 1909 expression=e.this, 1910 ) 1911 ), 1912 
exp.YearOfWeekIso: lambda self, e: self.sql( 1913 exp.Extract( 1914 this=exp.Var(this="ISOYEAR"), 1915 expression=e.this, 1916 ) 1917 ), 1918 exp.Xor: _xor_sql, 1919 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1920 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1921 exp.DateBin: rename_func("TIME_BUCKET"), 1922 exp.LastDay: _last_day_sql, 1923 } 1924 1925 SUPPORTED_JSON_PATH_PARTS = { 1926 exp.JSONPathKey, 1927 exp.JSONPathRoot, 1928 exp.JSONPathSubscript, 1929 exp.JSONPathWildcard, 1930 } 1931 1932 TYPE_MAPPING = { 1933 **generator.Generator.TYPE_MAPPING, 1934 exp.DataType.Type.BINARY: "BLOB", 1935 exp.DataType.Type.BPCHAR: "TEXT", 1936 exp.DataType.Type.CHAR: "TEXT", 1937 exp.DataType.Type.DATETIME: "TIMESTAMP", 1938 exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)", 1939 exp.DataType.Type.FLOAT: "REAL", 1940 exp.DataType.Type.JSONB: "JSON", 1941 exp.DataType.Type.NCHAR: "TEXT", 1942 exp.DataType.Type.NVARCHAR: "TEXT", 1943 exp.DataType.Type.UINT: "UINTEGER", 1944 exp.DataType.Type.VARBINARY: "BLOB", 1945 exp.DataType.Type.ROWVERSION: "BLOB", 1946 exp.DataType.Type.VARCHAR: "TEXT", 1947 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ", 1948 exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP", 1949 exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S", 1950 exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS", 1951 exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS", 1952 exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)", 1953 } 1954 1955 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 1956 RESERVED_KEYWORDS = { 1957 "array", 1958 "analyse", 1959 "union", 1960 "all", 1961 "when", 1962 "in_p", 1963 "default", 1964 "create_p", 1965 "window", 1966 "asymmetric", 1967 "to", 1968 "else", 1969 "localtime", 1970 "from", 1971 "end_p", 1972 "select", 1973 "current_date", 1974 "foreign", 1975 "with", 1976 "grant", 1977 "session_user", 1978 "or", 1979 "except", 1980 "references", 1981 "fetch", 1982 "limit", 1983 "group_p", 1984 "leading", 1985 "into", 1986 "collate", 1987 "offset", 1988 "do", 1989 "then", 1990 "localtimestamp", 1991 "check_p", 1992 "lateral_p", 1993 "current_role", 1994 "where", 1995 "asc_p", 1996 "placing", 1997 "desc_p", 1998 "user", 1999 "unique", 2000 "initially", 2001 "column", 2002 "both", 2003 "some", 2004 "as", 2005 "any", 2006 "only", 2007 "deferrable", 2008 "null_p", 2009 "current_time", 2010 "true_p", 2011 "table", 2012 "case", 2013 "trailing", 2014 "variadic", 2015 "for", 2016 "on", 2017 "distinct", 2018 "false_p", 2019 "not", 2020 "constraint", 2021 "current_timestamp", 2022 "returning", 2023 "primary", 2024 "intersect", 2025 "having", 2026 "analyze", 2027 "current_user", 2028 "and", 2029 "cast", 2030 "symmetric", 2031 "using", 2032 "order", 2033 "current_catalog", 2034 } 2035 2036 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 2037 2038 # DuckDB doesn't generally support CREATE TABLE .. properties 2039 # https://duckdb.org/docs/sql/statements/create_table.html 2040 PROPERTIES_LOCATION = { 2041 prop: exp.Properties.Location.UNSUPPORTED 2042 for prop in generator.Generator.PROPERTIES_LOCATION 2043 } 2044 2045 # There are a few exceptions (e.g. 
temporary tables) which are supported or 2046 # can be transpiled to DuckDB, so we explicitly override them accordingly 2047 PROPERTIES_LOCATION[exp.LikeProperty] = exp.Properties.Location.POST_SCHEMA 2048 PROPERTIES_LOCATION[exp.TemporaryProperty] = exp.Properties.Location.POST_CREATE 2049 PROPERTIES_LOCATION[exp.ReturnsProperty] = exp.Properties.Location.POST_ALIAS 2050 PROPERTIES_LOCATION[exp.SequenceProperties] = exp.Properties.Location.POST_EXPRESSION 2051 2052 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = ( 2053 exp.FirstValue, 2054 exp.Lag, 2055 exp.LastValue, 2056 exp.Lead, 2057 exp.NthValue, 2058 ) 2059 2060 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 2061 ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse( 2062 """ 2063 WITH rand AS (SELECT :random_expr AS r), 2064 weights AS ( 2065 SELECT i, 1.0 / POWER(i, :s) AS w 2066 FROM RANGE(1, :n + 1) AS t(i) 2067 ), 2068 cdf AS ( 2069 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 2070 FROM weights 2071 ) 2072 SELECT MIN(i) 2073 FROM cdf 2074 WHERE p >= (SELECT r FROM rand) 2075 """ 2076 ) 2077 2078 # Template for NORMAL transpilation using Box-Muller transform 2079 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 2080 NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse( 2081 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 2082 ) 2083 2084 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 2085 SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse( 2086 "(ABS(HASH(:seed)) % 1000000) / 1000000.0" 2087 ) 2088 2089 # Template for generating signed and unsigned SEQ values within a specified range 2090 SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(f"{_SEQ_BASE} % :max_val") 2091 SEQ_SIGNED: exp.Expression = exp.maybe_parse( 2092 f"(CASE WHEN {_SEQ_BASE} % :max_val >= :half " 2093 f"THEN {_SEQ_BASE} % :max_val - :max_val " 2094 f"ELSE {_SEQ_BASE} % :max_val END)" 2095 ) 2096 2097 # Template for MAP_CAT transpilation - Snowflake semantics: 2098 # 1. Returns NULL if either input is NULL 2099 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 2100 # 3. Filters out entries with NULL values from the result 2101 MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse( 2102 """ 2103 CASE 2104 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 2105 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 2106 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 2107 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 2108 ), __x -> __x.value IS NOT NULL)) 2109 END 2110 """ 2111 ) 2112 2113 # Mappings for EXTRACT/DATE_PART transpilation 2114 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 2115 EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = { 2116 "WEEKISO": ("%V", "INTEGER"), 2117 "YEAROFWEEK": ("%G", "INTEGER"), 2118 "YEAROFWEEKISO": ("%G", "INTEGER"), 2119 "NANOSECOND": ("%n", "BIGINT"), 2120 } 2121 2122 # Maps epoch-based specifiers to DuckDB epoch functions 2123 EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = { 2124 "EPOCH_SECOND": "EPOCH", 2125 "EPOCH_MILLISECOND": "EPOCH_MS", 2126 "EPOCH_MICROSECOND": "EPOCH_US", 2127 "EPOCH_NANOSECOND": "EPOCH_NS", 2128 } 2129 2130 # Template for BITMAP_CONSTRUCT_AGG transpilation 2131 # 2132 # BACKGROUND: 2133 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 
2134 # Supports values in range 0-32767, this version returns NULL if any value is out of range 2135 # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg 2136 # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts 2137 # 2138 # Snowflake uses two different formats based on the number of unique values: 2139 # 2140 # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes 2141 # Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003) 2142 # Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes 2143 # Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex) 2144 # count v1 v2 v3 pad 2145 # 2146 # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes 2147 # Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes 2148 # Bytes 10+: Each value as 2-byte little-endian integer (no padding) 2149 # Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500 2150 # hdr ----9 zero bytes---- v1 v2 v3 v4 v5 2151 # 2152 # TEMPLATE STRUCTURE 2153 # 2154 # Phase 1 - Innermost subquery: Data preparation 2155 # SELECT LIST_SORT(...) AS l 2156 # - Aggregates all input values into a list, remove NULLs, duplicates and sorts 2157 # Result: Clean, sorted list of unique non-null integers stored as 'l' 2158 # 2159 # Phase 2 - Middle subquery: Hex string construction 2160 # LIST_TRANSFORM(...) 2161 # - Converts each integer to 2-byte little-endian hex representation 2162 # - & 255 extracts low byte, >> 8 extracts high byte 2163 # - LIST_REDUCE: Concatenates all hex pairs into single string 'h' 2164 # Result: Hex string of all values 2165 # 2166 # Phase 3 - Outer SELECT: Final bitmap assembly 2167 # LENGTH(l) < 5: 2168 # - Small format: 2-byte count (big-endian via %04X) + values + zero padding 2169 # LENGTH(l) >= 5: 2170 # - Large format: Fixed 10-byte header + values (no padding needed) 2171 # Result: Complete binary bitmap as BLOB 2172 # 2173 BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse( 2174 """ 2175 SELECT CASE 2176 WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL 2177 WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL 2178 WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) 2179 ELSE UNHEX('08000000000000000000' || h) 2180 END 2181 FROM ( 2182 SELECT l, COALESCE(LIST_REDUCE( 2183 LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), 2184 (__a, __b) -> __a || __b, '' 2185 ), '') AS h 2186 FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l) 2187 ) 2188 """ 2189 ) 2190 2191 # Template for RANDSTR transpilation - placeholders get replaced with actual parameters 2192 RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse( 2193 f""" 2194 SELECT LISTAGG( 2195 SUBSTRING( 2196 '{RANDSTR_CHAR_POOL}', 2197 1 + CAST(FLOOR(random_value * 62) AS INT), 2198 1 2199 ), 2200 '' 2201 ) 2202 FROM ( 2203 SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value 2204 FROM RANGE(:length) AS t(i) 2205 ) 2206 """, 2207 ) 2208 2209 # Template for MINHASH transpilation 2210 # Computes k minimum hash values across aggregated data using DuckDB list functions 2211 # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1} 2212 MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse( 2213 """ 2214 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 
'minhash', 'version', 1) 2215 FROM ( 2216 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 2217 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2218 ) 2219 """, 2220 ) 2221 2222 # Template for MINHASH_COMBINE transpilation 2223 # Combines multiple minhash signatures by taking element-wise minimum 2224 MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse( 2225 """ 2226 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2227 FROM ( 2228 SELECT 2229 pos AS idx, 2230 MIN(val) AS min_h 2231 FROM 2232 UNNEST(LIST(:expr)) AS _(sig), 2233 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2234 GROUP BY pos 2235 ) 2236 """, 2237 ) 2238 2239 # Template for APPROXIMATE_SIMILARITY transpilation 2240 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2241 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse( 2242 """ 2243 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2244 FROM ( 2245 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2246 FROM ( 2247 SELECT h, pos 2248 FROM UNNEST(LIST(:expr)) AS _(sig), 2249 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2250 ) 2251 GROUP BY pos 2252 ) 2253 """, 2254 ) 2255 2256 # Template for ARRAYS_ZIP transpilation 2257 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest 2258 # Uses RANGE + indexing to match Snowflake behavior 2259 ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse( 2260 """ 2261 CASE WHEN :null_check THEN NULL 2262 WHEN :all_empty_check THEN [:empty_struct] 2263 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2264 END 2265 """, 2266 ) 2267 2268 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2269 """ 2270 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2271 2272 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2273 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2274 2275 For 'END' kind, add the interval to get the end of the slice. 2276 For DATE type with 'END', cast result back to DATE to preserve type. 2277 """ 2278 date_expr = expression.this 2279 slice_length = expression.expression 2280 unit = expression.unit 2281 kind = expression.text("kind").upper() 2282 2283 # Create INTERVAL expression: INTERVAL 'N' UNIT 2284 interval_expr = exp.Interval(this=slice_length, unit=unit) 2285 2286 # Create base time_bucket expression 2287 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2288 2289 # Check if we need the end of the slice (default is start) 2290 if not kind == "END": 2291 # For 'START', return time_bucket directly 2292 return self.sql(time_bucket_expr) 2293 2294 # For 'END', add the interval to get end of slice 2295 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2296 2297 # If input is DATE type, cast result back to DATE to preserve type 2298 # DuckDB converts DATE to TIMESTAMP when adding intervals 2299 if date_expr.is_type(exp.DataType.Type.DATE): 2300 return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE)) 2301 2302 return self.sql(add_expr) 2303 2304 def bitmapbucketnumber_sql( 2305 self: DuckDB.Generator, expression: exp.BitmapBucketNumber 2306 ) -> str: 2307 """ 2308 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 
2309 2310 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2311 - Each bucket covers 32,768 values 2312 - Bucket numbering starts at 1 2313 - Formula: ((value - 1) // 32768) + 1 for positive values 2314 2315 For non-positive values (0 and negative), we use value // 32768 to avoid 2316 producing bucket 0 or positive bucket IDs for negative inputs. 2317 """ 2318 value = expression.this 2319 2320 positive_formula = ((value - 1) // 32768) + 1 2321 non_positive_formula = value // 32768 2322 2323 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2324 case_expr = ( 2325 exp.case() 2326 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2327 .else_(non_positive_formula) 2328 ) 2329 return self.sql(case_expr) 2330 2331 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2332 """ 2333 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2334 2335 Snowflake's BITMAP_BIT_POSITION behavior: 2336 - For n <= 0: returns ABS(n) % 32768 2337 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2338 """ 2339 this = expression.this 2340 2341 return self.sql( 2342 exp.Mod( 2343 this=exp.Paren( 2344 this=exp.If( 2345 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2346 true=this - exp.Literal.number(1), 2347 false=exp.Abs(this=this), 2348 ) 2349 ), 2350 expression=MAX_BIT_POSITION, 2351 ) 2352 ) 2353 2354 def bitmapconstructagg_sql( 2355 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2356 ) -> str: 2357 """ 2358 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2359 Uses a pre-parsed template with placeholders replaced by expression nodes. 2360 2361 Snowflake bitmap format: 2362 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2363 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2364 """ 2365 arg = expression.this 2366 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2367 2368 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2369 """ 2370 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2371 Uses a pre-parsed template with placeholders replaced by expression nodes. 2372 2373 RANDSTR(length, generator) generates a random string of specified length. 2374 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2375 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2376 - No generator: Use default seed value 2377 """ 2378 length = expression.this 2379 generator = expression.args.get("generator") 2380 2381 if generator: 2382 if isinstance(generator, exp.Rand): 2383 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2384 seed_value = generator.this or generator 2385 else: 2386 # Const/int or other expression - use as seed directly 2387 seed_value = generator 2388 else: 2389 # No generator specified, use default seed (arbitrary but deterministic) 2390 seed_value = exp.Literal.number(RANDSTR_SEED) 2391 2392 replacements = {"seed": seed_value, "length": length} 2393 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2394 2395 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2396 """ 2397 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 
2398 Uses a pre-parsed template with placeholders replaced by expression nodes. 2399 """ 2400 s = expression.this 2401 n = expression.args["elementcount"] 2402 gen = expression.args["gen"] 2403 2404 if not isinstance(gen, exp.Rand): 2405 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2406 random_expr: exp.Expression = exp.Div( 2407 this=exp.Paren( 2408 this=exp.Mod( 2409 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2410 expression=exp.Literal.number(1000000), 2411 ) 2412 ), 2413 expression=exp.Literal.number(1000000.0), 2414 ) 2415 else: 2416 # Use RANDOM() for non-deterministic output 2417 random_expr = exp.Rand() 2418 2419 replacements = {"s": s, "n": n, "random_expr": random_expr} 2420 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2421 2422 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2423 """ 2424 TO_BINARY and TRY_TO_BINARY transpilation: 2425 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2426 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2427 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2428 2429 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2430 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2431 """ 2432 value = expression.this 2433 format_arg = expression.args.get("format") 2434 is_safe = expression.args.get("safe") 2435 2436 fmt = "HEX" 2437 if format_arg: 2438 fmt = format_arg.name.upper() 2439 2440 if expression.is_type(exp.DataType.Type.BINARY): 2441 if fmt == "UTF-8": 2442 result = self.func("ENCODE", value) 2443 elif fmt == "BASE64": 2444 result = self.func("FROM_BASE64", value) 2445 elif fmt == "HEX": 2446 result = self.func("UNHEX", value) 2447 else: 2448 if is_safe: 2449 return self.sql(exp.null()) 2450 else: 2451 self.unsupported(f"format {fmt} is not supported") 2452 result = self.func("TO_BINARY", value) 2453 2454 # Wrap with TRY() for TRY_TO_BINARY 2455 if is_safe: 2456 result = self.func("TRY", result) 2457 2458 return result 2459 2460 # Fallback, which needs to be updated if want to support transpilation from other dialects than Snowflake 2461 return self.func("TO_BINARY", value) 2462 2463 def _greatest_least_sql( 2464 self: DuckDB.Generator, expression: exp.Greatest | exp.Least 2465 ) -> str: 2466 """ 2467 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 
2468 2469 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2470 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2471 """ 2472 # Get all arguments 2473 all_args = [expression.this, *expression.expressions] 2474 fallback_sql = self.function_fallback_sql(expression) 2475 2476 if expression.args.get("ignore_nulls"): 2477 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2478 return self.sql(fallback_sql) 2479 2480 # return NULL if any argument is NULL 2481 case_expr = exp.case().when( 2482 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2483 exp.null(), 2484 copy=False, 2485 ) 2486 case_expr.set("default", fallback_sql) 2487 return self.sql(case_expr) 2488 2489 def generator_sql(self, expression: exp.Generator) -> str: 2490 # Transpile Snowflake GENERATOR to DuckDB range() 2491 rowcount = expression.args.get("rowcount") 2492 time_limit = expression.args.get("time_limit") 2493 2494 if time_limit: 2495 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2496 2497 if not rowcount: 2498 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2499 return self.func("range", exp.Literal.number(0)) 2500 2501 return self.func("range", rowcount) 2502 2503 def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str: 2504 return self._greatest_least_sql(expression) 2505 2506 def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str: 2507 return self._greatest_least_sql(expression) 2508 2509 def lambda_sql( 2510 self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True 2511 ) -> str: 2512 if expression.args.get("colon"): 2513 prefix = "LAMBDA " 2514 arrow_sep = ":" 2515 wrap = False 2516 else: 2517 prefix = "" 2518 2519 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2520 return f"{prefix}{lambda_sql}" 2521 2522 def show_sql(self, expression: exp.Show) -> str: 2523 return f"SHOW {expression.name}" 2524 2525 def install_sql(self, expression: exp.Install) -> str: 2526 force = "FORCE " if expression.args.get("force") else "" 2527 this = self.sql(expression, "this") 2528 from_clause = expression.args.get("from_") 2529 from_clause = f" FROM {from_clause}" if from_clause else "" 2530 return f"{force}INSTALL {this}{from_clause}" 2531 2532 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2533 self.unsupported( 2534 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. 
" 2535 ) 2536 return self.function_fallback_sql(expression) 2537 2538 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2539 return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ)) 2540 2541 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2542 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2543 target_type = expression.args.get("target_type") 2544 needs_tz = target_type and target_type.this in ( 2545 exp.DataType.Type.TIMESTAMPLTZ, 2546 exp.DataType.Type.TIMESTAMPTZ, 2547 ) 2548 2549 if expression.args.get("safe"): 2550 formatted_time = self.format_time(expression) 2551 cast_type = ( 2552 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2553 ) 2554 return self.sql( 2555 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2556 ) 2557 2558 base_sql = str_to_time_sql(self, expression) 2559 if needs_tz: 2560 return self.sql( 2561 exp.cast( 2562 base_sql, 2563 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2564 ) 2565 ) 2566 return base_sql 2567 2568 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2569 formatted_time = self.format_time(expression) 2570 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2571 return self.sql( 2572 exp.cast( 2573 self.func(function_name, expression.this, formatted_time), 2574 exp.DataType(this=exp.DataType.Type.DATE), 2575 ) 2576 ) 2577 2578 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2579 this = expression.this 2580 time_format = self.format_time(expression) 2581 safe = expression.args.get("safe") 2582 time_type = exp.DataType.build("TIME", dialect="duckdb") 2583 cast_expr = exp.TryCast if safe else exp.Cast 2584 2585 if time_format: 2586 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2587 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2588 return self.sql(cast_expr(this=strptime, to=time_type)) 2589 2590 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME): 2591 return self.sql(this) 2592 2593 return self.sql(cast_expr(this=this, to=time_type)) 2594 2595 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2596 if not expression.this: 2597 return "CURRENT_DATE" 2598 2599 expr = exp.Cast( 2600 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2601 to=exp.DataType(this=exp.DataType.Type.DATE), 2602 ) 2603 return self.sql(expr) 2604 2605 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2606 arg = expression.this 2607 if expression.args.get("safe"): 2608 return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null())) 2609 return self.func("JSON", arg) 2610 2611 def normal_sql(self, expression: exp.Normal) -> str: 2612 """ 2613 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2614 2615 Uses the Box-Muller transform via NORMAL_TEMPLATE. 
2616 """ 2617 mean = expression.this 2618 stddev = expression.args["stddev"] 2619 gen: exp.Expression = expression.args["gen"] 2620 2621 # Build two uniform random values [0, 1) for Box-Muller transform 2622 if isinstance(gen, exp.Rand) and gen.this is None: 2623 u1: exp.Expression = exp.Rand() 2624 u2: exp.Expression = exp.Rand() 2625 else: 2626 # Seeded: derive two values using HASH with different inputs 2627 seed = gen.this if isinstance(gen, exp.Rand) else gen 2628 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2629 u2 = exp.replace_placeholders( 2630 self.SEEDED_RANDOM_TEMPLATE, 2631 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2632 ) 2633 2634 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2635 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2636 2637 def uniform_sql(self, expression: exp.Uniform) -> str: 2638 """ 2639 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2640 2641 UNIFORM returns a random value in [min, max]: 2642 - Integer result if both min and max are integers 2643 - Float result if either min or max is a float 2644 """ 2645 min_val = expression.this 2646 max_val = expression.expression 2647 gen = expression.args.get("gen") 2648 2649 # Determine if result should be integer (both bounds are integers). 2650 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2651 is_int_result = min_val.is_int and max_val.is_int 2652 2653 # Build the random value expression [0, 1) 2654 if not isinstance(gen, exp.Rand): 2655 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2656 random_expr: exp.Expression = exp.Div( 2657 this=exp.Paren( 2658 this=exp.Mod( 2659 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2660 expression=exp.Literal.number(1000000), 2661 ) 2662 ), 2663 expression=exp.Literal.number(1000000.0), 2664 ) 2665 else: 2666 random_expr = exp.Rand() 2667 2668 # Build: min + random * (max - min [+ 1 for int]) 2669 range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val) 2670 if is_int_result: 2671 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2672 2673 result: exp.Expression = exp.Add( 2674 this=min_val, 2675 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2676 ) 2677 2678 if is_int_result: 2679 result = exp.Cast( 2680 this=exp.Floor(this=result), 2681 to=exp.DataType.build("BIGINT"), 2682 ) 2683 2684 return self.sql(result) 2685 2686 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2687 nano = expression.args.get("nano") 2688 overflow = expression.args.get("overflow") 2689 2690 # Snowflake's TIME_FROM_PARTS supports overflow 2691 if overflow: 2692 hour = expression.args["hour"] 2693 minute = expression.args["min"] 2694 sec = expression.args["sec"] 2695 2696 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2697 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2698 try: 2699 h_val = hour.to_py() 2700 m_val = minute.to_py() 2701 s_val = sec.to_py() 2702 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2703 return rename_func("MAKE_TIME")(self, expression) 2704 except ValueError: 2705 pass 2706 2707 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2708 if nano: 2709 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2710 2711 total_seconds = ( 2712 hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2713 ) 2714 2715 return self.sql( 2716 exp.Add( 2717 
this=exp.Cast( 2718 this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME") 2719 ), 2720 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2721 ) 2722 ) 2723 2724 # Default: MAKE_TIME 2725 if nano: 2726 expression.set( 2727 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2728 ) 2729 2730 return rename_func("MAKE_TIME")(self, expression) 2731 2732 def extract_sql(self, expression: exp.Extract) -> str: 2733 """ 2734 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2735 2736 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2737 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2738 """ 2739 this = expression.this 2740 datetime_expr = expression.expression 2741 2742 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2743 # because Snowflake applies server timezone while DuckDB uses local timezone 2744 if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ): 2745 self.unsupported( 2746 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2747 ) 2748 2749 part_name = this.name.upper() 2750 2751 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2752 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2753 2754 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2755 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2756 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2757 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 2758 ) 2759 2760 if is_nano_time: 2761 self.unsupported( 2762 "Parameter NANOSECOND is not supported with TIME type in DuckDB" 2763 ) 2764 return self.sql( 2765 exp.cast( 2766 exp.Mul( 2767 this=exp.Extract( 2768 this=exp.var("MICROSECOND"), expression=datetime_expr 2769 ), 2770 expression=exp.Literal.number(1000), 2771 ), 2772 exp.DataType.build(cast_type, dialect="duckdb"), 2773 ) 2774 ) 2775 2776 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2777 strftime_input = datetime_expr 2778 if part_name == "NANOSECOND": 2779 strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS) 2780 2781 return self.sql( 2782 exp.cast( 2783 exp.Anonymous( 2784 this="STRFTIME", 2785 expressions=[strftime_input, exp.Literal.string(fmt)], 2786 ), 2787 exp.DataType.build(cast_type, dialect="duckdb"), 2788 ) 2789 ) 2790 2791 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2792 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2793 result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2794 # EPOCH returns float, cast to BIGINT for integer result 2795 if part_name == "EPOCH_SECOND": 2796 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2797 return self.sql(result) 2798 2799 return super().extract_sql(expression) 2800 2801 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2802 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2803 date_expr = expression.this 2804 time_expr = expression.expression 2805 2806 if date_expr is not None and time_expr is not None: 2807 # In DuckDB, DATE + TIME produces TIMESTAMP 2808 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2809 2810 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 
2811 sec = expression.args.get("sec") 2812 if sec is None: 2813 # This shouldn't happen with valid input, but handle gracefully 2814 return rename_func("MAKE_TIMESTAMP")(self, expression) 2815 2816 milli = expression.args.get("milli") 2817 if milli is not None: 2818 sec += milli.pop() / exp.Literal.number(1000.0) 2819 2820 nano = expression.args.get("nano") 2821 if nano is not None: 2822 sec += nano.pop() / exp.Literal.number(1000000000.0) 2823 2824 if milli or nano: 2825 expression.set("sec", sec) 2826 2827 return rename_func("MAKE_TIMESTAMP")(self, expression) 2828 2829 @unsupported_args("nano") 2830 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2831 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2832 if nano := expression.args.get("nano"): 2833 nano.pop() 2834 2835 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2836 return f"CAST({timestamp} AS TIMESTAMPTZ)" 2837 2838 @unsupported_args("nano") 2839 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2840 # Extract zone before popping 2841 zone = expression.args.get("zone") 2842 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2843 if zone: 2844 zone = zone.pop() 2845 2846 if nano := expression.args.get("nano"): 2847 nano.pop() 2848 2849 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2850 2851 if zone: 2852 # Use AT TIME ZONE to apply the explicit timezone 2853 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2854 2855 return timestamp 2856 2857 def tablesample_sql( 2858 self, 2859 expression: exp.TableSample, 2860 tablesample_keyword: t.Optional[str] = None, 2861 ) -> str: 2862 if not isinstance(expression.parent, exp.Select): 2863 # This sample clause only applies to a single source, not the entire resulting relation 2864 tablesample_keyword = "TABLESAMPLE" 2865 2866 if expression.args.get("size"): 2867 method = expression.args.get("method") 2868 if method and method.name.upper() != "RESERVOIR": 2869 self.unsupported( 2870 f"Sampling method {method} is not supported with a discrete sample count, " 2871 "defaulting to reservoir sampling" 2872 ) 2873 expression.set("method", exp.var("RESERVOIR")) 2874 2875 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 2876 2877 def columndef_sql(self, expression: exp.ColumnDef, sep: str = " ") -> str: 2878 if isinstance(expression.parent, exp.UserDefinedFunction): 2879 return self.sql(expression, "this") 2880 return super().columndef_sql(expression, sep) 2881 2882 def join_sql(self, expression: exp.Join) -> str: 2883 if ( 2884 not expression.args.get("using") 2885 and not expression.args.get("on") 2886 and not expression.method 2887 and (expression.kind in ("", "INNER", "OUTER")) 2888 ): 2889 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2890 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2891 if isinstance(expression.this, exp.Unnest): 2892 return super().join_sql(expression.on(exp.true())) 2893 2894 expression.set("side", None) 2895 expression.set("kind", None) 2896 2897 return super().join_sql(expression) 2898 2899 def generateseries_sql(self, expression: exp.GenerateSeries) -> str: 2900 # GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b) 2901 if expression.args.get("is_end_exclusive"): 2902 return rename_func("RANGE")(self, expression) 2903 2904 return self.function_fallback_sql(expression) 2905 2906 def countif_sql(self, expression: 
exp.CountIf) -> str: 2907 if self.dialect.version >= (1, 2): 2908 return self.function_fallback_sql(expression) 2909 2910 # https://github.com/tobymao/sqlglot/pull/4749 2911 return count_if_to_sum(self, expression) 2912 2913 def bracket_sql(self, expression: exp.Bracket) -> str: 2914 if self.dialect.version >= (1, 2): 2915 return super().bracket_sql(expression) 2916 2917 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2918 this = expression.this 2919 if isinstance(this, exp.Array): 2920 this.replace(exp.paren(this)) 2921 2922 bracket = super().bracket_sql(expression) 2923 2924 if not expression.args.get("returns_list_for_maps"): 2925 if not this.type: 2926 from sqlglot.optimizer.annotate_types import annotate_types 2927 2928 this = annotate_types(this, dialect=self.dialect) 2929 2930 if this.is_type(exp.DataType.Type.MAP): 2931 bracket = f"({bracket})[1]" 2932 2933 return bracket 2934 2935 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2936 func = expression.this 2937 2938 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2939 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2940 if isinstance(func, exp.ArrayAgg): 2941 if not isinstance(order := expression.expression, exp.Order): 2942 return self.sql(func) 2943 2944 # Save the original column for FILTER clause (before wrapping with Order) 2945 original_this = func.this 2946 2947 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 2948 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 2949 func.set( 2950 "this", 2951 exp.Order( 2952 this=func.this.copy(), 2953 expressions=order.expressions, 2954 ), 2955 ) 2956 2957 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 2958 # Use original_this (not the Order-wrapped version) for the FILTER condition 2959 array_agg_sql = self.function_fallback_sql(func) 2960 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 2961 2962 # For other functions (like PERCENTILES), use existing logic 2963 expression_sql = self.sql(expression, "expression") 2964 2965 if isinstance(func, exp.PERCENTILES): 2966 # Make the order key the first arg and slide the fraction to the right 2967 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 2968 order_col = expression.find(exp.Ordered) 2969 if order_col: 2970 func.set("expression", func.this) 2971 func.set("this", order_col.this) 2972 2973 this = self.sql(expression, "this").rstrip(")") 2974 2975 return f"{this}{expression_sql})" 2976 2977 def length_sql(self, expression: exp.Length) -> str: 2978 arg = expression.this 2979 2980 # Dialects like BQ and Snowflake also accept binary values as args, so 2981 # DDB will attempt to infer the type or resort to case/when resolution 2982 if not expression.args.get("binary") or arg.is_string: 2983 return self.func("LENGTH", arg) 2984 2985 if not arg.type: 2986 from sqlglot.optimizer.annotate_types import annotate_types 2987 2988 arg = annotate_types(arg, dialect=self.dialect) 2989 2990 if arg.is_type(*exp.DataType.TEXT_TYPES): 2991 return self.func("LENGTH", arg) 2992 2993 # We need these casts to make duckdb's static type checker happy 2994 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 2995 varchar = exp.cast(arg, exp.DataType.Type.VARCHAR) 2996 2997 case = ( 2998 exp.case(self.func("TYPEOF", arg)) 2999 .when("'BLOB'", self.func("OCTET_LENGTH", blob)) 3000 .else_( 3001 exp.Anonymous(this="LENGTH", 
expressions=[varchar]) 3002 ) # anonymous to break length_sql recursion 3003 ) 3004 3005 return self.sql(case) 3006 3007 def sha_sql(self, expression: exp.SHA) -> str: 3008 arg = expression.this 3009 3010 # If type is compatible with DuckDB or is an unknown type, use directly 3011 if ( 3012 arg.type 3013 and arg.type.this != exp.DataType.Type.UNKNOWN 3014 and not arg.is_type(*exp.DataType.TEXT_TYPES) 3015 and not _is_binary(arg) 3016 ): 3017 arg = exp.cast(arg, exp.DataType.Type.VARCHAR) 3018 3019 return self.func("SHA1", arg) 3020 3021 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3022 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3023 this = expression.this 3024 expr = expression.expression 3025 max_dist = expression.args.get("max_dist") 3026 3027 if max_dist is None: 3028 return self.func("LEVENSHTEIN", this, expr) 3029 3030 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3031 levenshtein = exp.Levenshtein(this=this, expression=expr) 3032 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3033 3034 def minhash_sql(self, expression: exp.Minhash) -> str: 3035 k = expression.this 3036 exprs = expression.expressions 3037 3038 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3039 self.unsupported( 3040 "MINHASH with multiple expressions or * requires manual query restructuring" 3041 ) 3042 return self.func("MINHASH", k, *exprs) 3043 3044 expr = exprs[0] 3045 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3046 return f"({self.sql(result)})" 3047 3048 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3049 expr = expression.this 3050 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3051 return f"({self.sql(result)})" 3052 3053 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3054 expr = expression.this 3055 result = exp.replace_placeholders( 3056 self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr 3057 ) 3058 return f"({self.sql(result)})" 3059 3060 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3061 args = expression.expressions 3062 3063 if not args: 3064 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3065 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3066 3067 # Build placeholder values for template 3068 lengths = [exp.Length(this=arg) for arg in args] 3069 max_len = ( 3070 lengths[0] 3071 if len(lengths) == 1 3072 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3073 ) 3074 3075 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3076 empty_struct = exp.func( 3077 "STRUCT", 3078 *[ 3079 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3080 for i in range(len(args)) 3081 ], 3082 ) 3083 3084 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3085 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3086 index = exp.column("__i") + 1 3087 transform_struct = exp.func( 3088 "STRUCT", 3089 *[ 3090 exp.PropertyEQ( 3091 this=exp.Literal.string(f"${i + 1}"), 3092 expression=exp.func("COALESCE", arg, exp.array())[index], 3093 ) 3094 for i, arg in enumerate(args) 3095 ], 3096 ) 3097 3098 result = exp.replace_placeholders( 3099 self.ARRAYS_ZIP_TEMPLATE.copy(), 3100 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3101 all_empty_check=exp.and_( 3102 *[ 3103 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3104 for arg in args 3105 ] 
3106 ), 3107 empty_struct=empty_struct, 3108 max_len=max_len, 3109 transform_struct=transform_struct, 3110 ) 3111 return self.sql(result) 3112 3113 def lower_sql(self, expression: exp.Lower) -> str: 3114 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3115 return _gen_with_cast_to_blob(self, expression, result_sql) 3116 3117 def upper_sql(self, expression: exp.Upper) -> str: 3118 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3119 return _gen_with_cast_to_blob(self, expression, result_sql) 3120 3121 def reverse_sql(self, expression: exp.Reverse) -> str: 3122 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3123 return _gen_with_cast_to_blob(self, expression, result_sql) 3124 3125 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3126 # DuckDB TO_BASE64 requires BLOB input 3127 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3128 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3129 result = expression.this 3130 3131 # Check if input is a string type - ENCODE only accepts VARCHAR 3132 if result.is_type(*exp.DataType.TEXT_TYPES): 3133 result = exp.Encode(this=result) 3134 3135 result = exp.ToBase64(this=result) 3136 3137 max_line_length = expression.args.get("max_line_length") 3138 alphabet = expression.args.get("alphabet") 3139 3140 # Handle custom alphabet by replacing standard chars with custom ones 3141 result = _apply_base64_alphabet_replacements(result, alphabet) 3142 3143 # Handle max_line_length by inserting newlines every N characters 3144 line_length = ( 3145 t.cast(int, max_line_length.to_py()) 3146 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3147 else 0 3148 ) 3149 if line_length > 0: 3150 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3151 result = exp.Trim( 3152 this=exp.RegexpReplace( 3153 this=result, 3154 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3155 replacement=exp.Concat( 3156 expressions=[exp.Literal.string("\\1"), newline.copy()] 3157 ), 3158 ), 3159 expression=newline, 3160 position="TRAILING", 3161 ) 3162 3163 return self.sql(result) 3164 3165 def replace_sql(self, expression: exp.Replace) -> str: 3166 result_sql = self.func( 3167 "REPLACE", 3168 _cast_to_varchar(expression.this), 3169 _cast_to_varchar(expression.expression), 3170 _cast_to_varchar(expression.args.get("replacement")), 3171 ) 3172 return _gen_with_cast_to_blob(self, expression, result_sql) 3173 3174 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3175 _prepare_binary_bitwise_args(expression) 3176 result_sql = self.binary(expression, op) 3177 return _gen_with_cast_to_blob(self, expression, result_sql) 3178 3179 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3180 _prepare_binary_bitwise_args(expression) 3181 result_sql = self.func("XOR", expression.this, expression.expression) 3182 return _gen_with_cast_to_blob(self, expression, result_sql) 3183 3184 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3185 this = expression.this 3186 key = expression.args.get("key") 3187 key_sql = key.name if isinstance(key, exp.Expression) else "" 3188 value_sql = self.sql(expression, "value") 3189 3190 kv_sql = f"{key_sql} := {value_sql}" 3191 3192 # If the input struct is empty e.g. 
transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3193 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3194 if isinstance(this, exp.Struct) and not this.expressions: 3195 return self.func("STRUCT_PACK", kv_sql) 3196 3197 return self.func("STRUCT_INSERT", this, kv_sql) 3198 3199 def mapcat_sql(self, expression: exp.MapCat) -> str: 3200 result = exp.replace_placeholders( 3201 self.MAPCAT_TEMPLATE.copy(), 3202 map1=expression.this, 3203 map2=expression.expression, 3204 ) 3205 return self.sql(result) 3206 3207 def startswith_sql(self, expression: exp.StartsWith) -> str: 3208 return self.func( 3209 "STARTS_WITH", 3210 _cast_to_varchar(expression.this), 3211 _cast_to_varchar(expression.expression), 3212 ) 3213 3214 def space_sql(self, expression: exp.Space) -> str: 3215 # DuckDB's REPEAT requires BIGINT for the count parameter 3216 return self.sql( 3217 exp.Repeat( 3218 this=exp.Literal.string(" "), 3219 times=exp.cast(expression.this, exp.DataType.Type.BIGINT), 3220 ) 3221 ) 3222 3223 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3224 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3225 if isinstance(expression.this, exp.Generator): 3226 # Preserve alias, joins, and other table-level args 3227 table = exp.Table( 3228 this=expression.this, 3229 alias=expression.args.get("alias"), 3230 joins=expression.args.get("joins"), 3231 ) 3232 return self.sql(table) 3233 3234 return super().tablefromrows_sql(expression) 3235 3236 def unnest_sql(self, expression: exp.Unnest) -> str: 3237 explode_array = expression.args.get("explode_array") 3238 if explode_array: 3239 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3240 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3241 expression.expressions.append( 3242 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3243 ) 3244 3245 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3246 alias = expression.args.get("alias") 3247 if isinstance(alias, exp.TableAlias): 3248 expression.set("alias", None) 3249 if alias.columns: 3250 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3251 3252 unnest_sql = super().unnest_sql(expression) 3253 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3254 return self.sql(select) 3255 3256 return super().unnest_sql(expression) 3257 3258 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3259 this = expression.this 3260 3261 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3262 # DuckDB should render IGNORE NULLS only for the general-purpose 3263 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 3264 return super().ignorenulls_sql(expression) 3265 3266 if isinstance(this, exp.First): 3267 this = exp.AnyValue(this=this.this) 3268 3269 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3270 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3271 3272 return self.sql(this) 3273 3274 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3275 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3276 # DuckDB should render RESPECT NULLS only for the general-purpose 3277 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 
3278 return super().respectnulls_sql(expression) 3279 3280 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3281 return self.sql(expression, "this") 3282 3283 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3284 this = self.sql(expression, "this") 3285 null_text = self.sql(expression, "null") 3286 3287 if null_text: 3288 this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))" 3289 3290 return self.func("ARRAY_TO_STRING", this, expression.expression) 3291 3292 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 3293 this = expression.this 3294 group = expression.args.get("group") 3295 params = expression.args.get("parameters") 3296 position = expression.args.get("position") 3297 occurrence = expression.args.get("occurrence") 3298 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3299 3300 if position and (not position.is_int or position.to_py() > 1): 3301 this = exp.Substring(this=this, start=position) 3302 3303 if null_if_pos_overflow: 3304 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3305 3306 # Do not render group if there is no following argument, 3307 # and it's the default value for this dialect 3308 if ( 3309 not params 3310 and group 3311 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3312 ): 3313 group = None 3314 3315 if occurrence and (not occurrence.is_int or occurrence.to_py() > 1): 3316 return self.func( 3317 "ARRAY_EXTRACT", 3318 self.func("REGEXP_EXTRACT_ALL", this, expression.expression, group, params), 3319 exp.Literal.number(occurrence), 3320 ) 3321 3322 return self.func("REGEXP_EXTRACT", this, expression.expression, group, params) 3323 3324 @unsupported_args("culture") 3325 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 3326 fmt = expression.args.get("format") 3327 if fmt and fmt.is_int: 3328 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 3329 3330 self.unsupported("Only integer formats are supported by NumberToStr") 3331 return self.function_fallback_sql(expression) 3332 3333 def autoincrementcolumnconstraint_sql(self, _) -> str: 3334 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 3335 return "" 3336 3337 def aliases_sql(self, expression: exp.Aliases) -> str: 3338 this = expression.this 3339 if isinstance(this, exp.Posexplode): 3340 return self.posexplode_sql(this) 3341 3342 return super().aliases_sql(expression) 3343 3344 def posexplode_sql(self, expression: exp.Posexplode) -> str: 3345 this = expression.this 3346 parent = expression.parent 3347 3348 # The default Spark aliases are "pos" and "col", unless specified otherwise 3349 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 3350 3351 if isinstance(parent, exp.Aliases): 3352 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 3353 pos, col = parent.expressions 3354 elif isinstance(parent, exp.Table): 3355 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 3356 alias = parent.args.get("alias") 3357 if alias: 3358 pos, col = alias.columns or [pos, col] 3359 alias.pop() 3360 3361 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 3362 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 3363 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 3364 gen_subscripts = self.sql( 3365 exp.Alias( 3366 this=exp.Anonymous( 3367 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 3368 ) 3369 - exp.Literal.number(1), 3370 alias=pos, 
3371 ) 3372 ) 3373 3374 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 3375 3376 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 3377 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 3378 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 3379 3380 return posexplode_sql 3381 3382 def addmonths_sql(self, expression: exp.AddMonths) -> str: 3383 """ 3384 Handles three key issues: 3385 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 3386 2. End-of-month preservation: If input is last day of month, result is last day of result month 3387 3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 3388 """ 3389 from sqlglot.optimizer.annotate_types import annotate_types 3390 3391 this = expression.this 3392 if not this.type: 3393 this = annotate_types(this, dialect=self.dialect) 3394 3395 if this.is_type(*exp.DataType.TEXT_TYPES): 3396 this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP)) 3397 3398 # Detect float/decimal months to apply rounding (Snowflake behavior) 3399 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 3400 months_expr = expression.expression 3401 if not months_expr.type: 3402 months_expr = annotate_types(months_expr, dialect=self.dialect) 3403 3404 # Build interval or to_months expression based on type 3405 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 3406 interval_or_to_months = ( 3407 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 3408 if months_expr.is_type( 3409 exp.DataType.Type.FLOAT, 3410 exp.DataType.Type.DOUBLE, 3411 exp.DataType.Type.DECIMAL, 3412 ) 3413 # Integer case: standard INTERVAL N MONTH syntax 3414 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 3415 ) 3416 3417 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 3418 3419 # Apply end-of-month preservation if Snowflake flag is set 3420 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 3421 preserve_eom = expression.args.get("preserve_end_of_month") 3422 result_expr = ( 3423 exp.case() 3424 .when( 3425 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 3426 exp.func("LAST_DAY", date_add_expr), 3427 ) 3428 .else_(date_add_expr) 3429 if preserve_eom 3430 else date_add_expr 3431 ) 3432 3433 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 3434 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 3435 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 3436 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 3437 if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ): 3438 return self.sql(exp.Cast(this=result_expr, to=this.type)) 3439 return self.sql(result_expr) 3440 3441 def format_sql(self, expression: exp.Format) -> str: 3442 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 3443 return self.func("FORMAT", "'{}'", expression.expressions[0]) 3444 3445 return self.function_fallback_sql(expression) 3446 3447 def hexstring_sql( 3448 self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None 3449 ) -> str: 3450 # UNHEX('FF') correctly produces blob \xFF in DuckDB 3451 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 3452 3453 def 
datetrunc_sql(self, expression: exp.DateTrunc) -> str: 3454 unit = unit_to_str(expression) 3455 date = expression.this 3456 result = self.func("DATE_TRUNC", unit, date) 3457 3458 if ( 3459 expression.args.get("input_type_preserved") 3460 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 3461 and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE)) 3462 ): 3463 return self.sql(exp.Cast(this=result, to=date.type)) 3464 3465 return result 3466 3467 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 3468 unit = unit_to_str(expression) 3469 zone = expression.args.get("zone") 3470 timestamp = expression.this 3471 date_unit = is_date_unit(unit) 3472 3473 if date_unit and zone: 3474 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 3475 # Double AT TIME ZONE needed for BigQuery compatibility: 3476 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 3477 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 3478 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 3479 result_sql = self.func("DATE_TRUNC", unit, timestamp) 3480 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 3481 3482 result = self.func("DATE_TRUNC", unit, timestamp) 3483 if expression.args.get("input_type_preserved"): 3484 if timestamp.type and timestamp.is_type( 3485 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 3486 ): 3487 dummy_date = exp.Cast( 3488 this=exp.Literal.string("1970-01-01"), 3489 to=exp.DataType(this=exp.DataType.Type.DATE), 3490 ) 3491 date_time = exp.Add(this=dummy_date, expression=timestamp) 3492 result = self.func("DATE_TRUNC", unit, date_time) 3493 return self.sql(exp.Cast(this=result, to=timestamp.type)) 3494 3495 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 3496 date_unit and timestamp.is_type(exp.DataType.Type.DATE) 3497 ): 3498 return self.sql(exp.Cast(this=result, to=timestamp.type)) 3499 3500 return result 3501 3502 def trim_sql(self, expression: exp.Trim) -> str: 3503 expression.this.replace(_cast_to_varchar(expression.this)) 3504 if expression.expression: 3505 expression.expression.replace(_cast_to_varchar(expression.expression)) 3506 3507 result_sql = super().trim_sql(expression) 3508 return _gen_with_cast_to_blob(self, expression, result_sql) 3509 3510 def round_sql(self, expression: exp.Round) -> str: 3511 this = expression.this 3512 decimals = expression.args.get("decimals") 3513 truncate = expression.args.get("truncate") 3514 3515 # DuckDB requires the scale (decimals) argument to be an INT 3516 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 3517 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 3518 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 3519 decimals = exp.cast(decimals, exp.DataType.Type.INT) 3520 3521 func = "ROUND" 3522 if truncate: 3523 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 3524 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 3525 func = "ROUND_EVEN" 3526 truncate = None 3527 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 3528 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 3529 truncate = None 3530 3531 return self.func(func, this, decimals, truncate) 3532 3533 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 3534 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 
3535 3536 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 3537 if expression.is_type(*exp.DataType.REAL_TYPES): 3538 result = f"CAST({result} AS DOUBLE)" 3539 3540 return result 3541 3542 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 3543 """ 3544 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 3545 dividing the input distribution into n equal-sized buckets. 3546 3547 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 3548 does not document the specific algorithm used so results may differ. DuckDB does not 3549 support RESPECT NULLS. 3550 """ 3551 this = expression.this 3552 if isinstance(this, exp.Distinct): 3553 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 3554 if len(this.expressions) < 2: 3555 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 3556 return self.function_fallback_sql(expression) 3557 num_quantiles_expr = this.expressions[1].pop() 3558 else: 3559 num_quantiles_expr = expression.expression 3560 3561 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 3562 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 3563 return self.function_fallback_sql(expression) 3564 3565 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 3566 if num_quantiles <= 0: 3567 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 3568 return self.function_fallback_sql(expression) 3569 3570 quantiles = [ 3571 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 3572 for i in range(num_quantiles + 1) 3573 ] 3574 3575 return self.sql( 3576 exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles)) 3577 ) 3578 3579 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 3580 if expression.args.get("scalar_only"): 3581 expression = exp.JSONExtractScalar( 3582 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 3583 ) 3584 return _arrow_json_extract_sql(self, expression) 3585 3586 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 3587 this = expression.this 3588 3589 if _is_binary(this): 3590 expression.type = exp.DataType.build("BINARY") 3591 3592 arg = _cast_to_bit(this) 3593 3594 if isinstance(this, exp.Neg): 3595 arg = exp.Paren(this=arg) 3596 3597 expression.set("this", arg) 3598 3599 result_sql = f"~{self.sql(expression, 'this')}" 3600 3601 return _gen_with_cast_to_blob(self, expression, result_sql) 3602 3603 def window_sql(self, expression: exp.Window) -> str: 3604 this = expression.this 3605 if isinstance(this, exp.Corr) or ( 3606 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 3607 ): 3608 return self._corr_sql(expression) 3609 3610 return super().window_sql(expression) 3611 3612 def filter_sql(self, expression: exp.Filter) -> str: 3613 if isinstance(expression.this, exp.Corr): 3614 return self._corr_sql(expression) 3615 3616 return super().filter_sql(expression) 3617 3618 def _corr_sql( 3619 self, 3620 expression: t.Union[exp.Filter, exp.Window, exp.Corr], 3621 ) -> str: 3622 if isinstance(expression, exp.Corr) and not expression.args.get( 3623 "null_on_zero_variance" 3624 ): 3625 return self.func("CORR", expression.this, expression.expression) 3626 3627 corr_expr = _maybe_corr_null_to_false(expression) 3628 if corr_expr is None: 3629 if isinstance(expression, exp.Window): 3630 return super().window_sql(expression) 3631 if isinstance(expression, 
exp.Filter): 3632 return super().filter_sql(expression) 3633 corr_expr = expression # make mypy happy 3634 3635 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True: Always quote except for special cases. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
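These options are normally passed through the top-level transpile/generate entry points, which forward them to the generator. A minimal usage sketch (the option values are illustrative, not recommendations):

    import sqlglot

    sql = "SELECT a, b FROM t WHERE a > 1"
    # pretty, identify, normalize_functions, etc. are forwarded to the DuckDB generator
    print(sqlglot.transpile(sql, read="snowflake", write="duckdb", pretty=True, identify=True)[0])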
2268 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2269 """ 2270 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2271 2272 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2273 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2274 2275 For 'END' kind, add the interval to get the end of the slice. 2276 For DATE type with 'END', cast result back to DATE to preserve type. 2277 """ 2278 date_expr = expression.this 2279 slice_length = expression.expression 2280 unit = expression.unit 2281 kind = expression.text("kind").upper() 2282 2283 # Create INTERVAL expression: INTERVAL 'N' UNIT 2284 interval_expr = exp.Interval(this=slice_length, unit=unit) 2285 2286 # Create base time_bucket expression 2287 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2288 2289 # Check if we need the end of the slice (default is start) 2290 if not kind == "END": 2291 # For 'START', return time_bucket directly 2292 return self.sql(time_bucket_expr) 2293 2294 # For 'END', add the interval to get end of slice 2295 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2296 2297 # If input is DATE type, cast result back to DATE to preserve type 2298 # DuckDB converts DATE to TIMESTAMP when adding intervals 2299 if date_expr.is_type(exp.DataType.Type.DATE): 2300 return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE)) 2301 2302 return self.sql(add_expr)
Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.
Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)
For 'END' kind, add the interval to get the end of the slice. For DATE type with 'END', cast result back to DATE to preserve type.
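A minimal usage sketch (the exact output may differ slightly from the approximation in the comment):

    import sqlglot

    # Snowflake TIME_SLICE with the 'END' kind; expected to come out roughly as
    # TIME_BUCKET(INTERVAL '15' MINUTE, ts) + INTERVAL '15' MINUTE
    print(sqlglot.transpile("SELECT TIME_SLICE(ts, 15, 'MINUTE', 'END')", read="snowflake", write="duckdb")[0])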
2304 def bitmapbucketnumber_sql( 2305 self: DuckDB.Generator, expression: exp.BitmapBucketNumber 2306 ) -> str: 2307 """ 2308 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2309 2310 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2311 - Each bucket covers 32,768 values 2312 - Bucket numbering starts at 1 2313 - Formula: ((value - 1) // 32768) + 1 for positive values 2314 2315 For non-positive values (0 and negative), we use value // 32768 to avoid 2316 producing bucket 0 or positive bucket IDs for negative inputs. 2317 """ 2318 value = expression.this 2319 2320 positive_formula = ((value - 1) // 32768) + 1 2321 non_positive_formula = value // 32768 2322 2323 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2324 case_expr = ( 2325 exp.case() 2326 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2327 .else_(non_positive_formula) 2328 ) 2329 return self.sql(case_expr)
Transpile Snowflake's BITMAP_BUCKET_NUMBER function to a DuckDB equivalent.
Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
- Each bucket covers 32,768 values
- Bucket numbering starts at 1
- Formula: ((value - 1) // 32768) + 1 for positive values
For non-positive values (0 and negative), we use value // 32768 to avoid producing bucket 0 or positive bucket IDs for negative inputs.
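Worked examples of the formula: value 1 gives ((1 - 1) // 32768) + 1 = 1, value 32768 gives ((32768 - 1) // 32768) + 1 = 1, value 32769 gives bucket 2, and value 0 takes the ELSE branch, 0 // 32768 = 0.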
2331 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2332 """ 2333 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2334 2335 Snowflake's BITMAP_BIT_POSITION behavior: 2336 - For n <= 0: returns ABS(n) % 32768 2337 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2338 """ 2339 this = expression.this 2340 2341 return self.sql( 2342 exp.Mod( 2343 this=exp.Paren( 2344 this=exp.If( 2345 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2346 true=this - exp.Literal.number(1), 2347 false=exp.Abs(this=this), 2348 ) 2349 ), 2350 expression=MAX_BIT_POSITION, 2351 ) 2352 )
Transpile Snowflake's BITMAP_BIT_POSITION to a DuckDB CASE expression.
Snowflake's BITMAP_BIT_POSITION behavior:
- For n <= 0: returns ABS(n) % 32768
- For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
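For example, an input of 1 yields (1 - 1) % 32768 = 0, an input of 32768 yields 32767, and an input of -3 yields ABS(-3) % 32768 = 3.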
2354 def bitmapconstructagg_sql( 2355 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2356 ) -> str: 2357 """ 2358 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2359 Uses a pre-parsed template with placeholders replaced by expression nodes. 2360 2361 Snowflake bitmap format: 2362 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2363 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2364 """ 2365 arg = expression.this 2366 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
Transpile Snowflake's BITMAP_CONSTRUCT_AGG to a DuckDB equivalent. Uses a pre-parsed template with placeholders replaced by expression nodes.
Snowflake bitmap format:
- Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
- Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
2368 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2369 """ 2370 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2371 Uses a pre-parsed template with placeholders replaced by expression nodes. 2372 2373 RANDSTR(length, generator) generates a random string of specified length. 2374 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2375 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2376 - No generator: Use default seed value 2377 """ 2378 length = expression.this 2379 generator = expression.args.get("generator") 2380 2381 if generator: 2382 if isinstance(generator, exp.Rand): 2383 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2384 seed_value = generator.this or generator 2385 else: 2386 # Const/int or other expression - use as seed directly 2387 seed_value = generator 2388 else: 2389 # No generator specified, use default seed (arbitrary but deterministic) 2390 seed_value = exp.Literal.number(RANDSTR_SEED) 2391 2392 replacements = {"seed": seed_value, "length": length} 2393 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"
Transpile Snowflake's RANDSTR to a DuckDB equivalent using a deterministic, hash-based random value. Uses a pre-parsed template with placeholders replaced by expression nodes.
RANDSTR(length, generator) generates a random string of specified length.
- With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
- With RANDOM(): Use RANDOM() in the hash for non-deterministic output
- No generator: Use default seed value
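A minimal usage sketch (the generated SQL depends on RANDSTR_TEMPLATE and is not reproduced here):

    import sqlglot

    # Seeded form: the same seed should yield the same 10-character string on every call
    print(sqlglot.transpile("SELECT RANDSTR(10, 1234)", read="snowflake", write="duckdb")[0])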
2395 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2396 """ 2397 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2398 Uses a pre-parsed template with placeholders replaced by expression nodes. 2399 """ 2400 s = expression.this 2401 n = expression.args["elementcount"] 2402 gen = expression.args["gen"] 2403 2404 if not isinstance(gen, exp.Rand): 2405 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2406 random_expr: exp.Expression = exp.Div( 2407 this=exp.Paren( 2408 this=exp.Mod( 2409 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2410 expression=exp.Literal.number(1000000), 2411 ) 2412 ), 2413 expression=exp.Literal.number(1000000.0), 2414 ) 2415 else: 2416 # Use RANDOM() for non-deterministic output 2417 random_expr = exp.Rand() 2418 2419 replacements = {"s": s, "n": n, "random_expr": random_expr} 2420 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})"
Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. Uses a pre-parsed template with placeholders replaced by expression nodes.
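The underlying idea, sketched in Python for illustration (this shows the general CDF-inversion technique, not the SQL produced by ZIPF_TEMPLATE):

    def zipf_inverse_cdf(s: float, n: int, u: float) -> int:
        """Map a uniform value u in [0, 1) to a Zipf(s, n) sample via its CDF."""
        norm = sum(1.0 / k**s for k in range(1, n + 1))  # normalization constant
        cumulative = 0.0
        for k in range(1, n + 1):
            cumulative += (1.0 / k**s) / norm
            if u < cumulative:
                return k
        return n  # guard against floating-point rounding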
2422 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2423 """ 2424 TO_BINARY and TRY_TO_BINARY transpilation: 2425 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2426 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2427 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2428 2429 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2430 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2431 """ 2432 value = expression.this 2433 format_arg = expression.args.get("format") 2434 is_safe = expression.args.get("safe") 2435 2436 fmt = "HEX" 2437 if format_arg: 2438 fmt = format_arg.name.upper() 2439 2440 if expression.is_type(exp.DataType.Type.BINARY): 2441 if fmt == "UTF-8": 2442 result = self.func("ENCODE", value) 2443 elif fmt == "BASE64": 2444 result = self.func("FROM_BASE64", value) 2445 elif fmt == "HEX": 2446 result = self.func("UNHEX", value) 2447 else: 2448 if is_safe: 2449 return self.sql(exp.null()) 2450 else: 2451 self.unsupported(f"format {fmt} is not supported") 2452 result = self.func("TO_BINARY", value) 2453 2454 # Wrap with TRY() for TRY_TO_BINARY 2455 if is_safe: 2456 result = self.func("TRY", result) 2457 2458 return result 2459 2460 # Fallback, which needs to be updated if want to support transpilation from other dialects than Snowflake 2461 return self.func("TO_BINARY", value)
TO_BINARY and TRY_TO_BINARY transpilation:
- 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50')
- 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST')
- 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==')
For TRY_TO_BINARY (safe=True), wrap with TRY():
- 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid'))
2489 def generator_sql(self, expression: exp.Generator) -> str: 2490 # Transpile Snowflake GENERATOR to DuckDB range() 2491 rowcount = expression.args.get("rowcount") 2492 time_limit = expression.args.get("time_limit") 2493 2494 if time_limit: 2495 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2496 2497 if not rowcount: 2498 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2499 return self.func("range", exp.Literal.number(0)) 2500 2501 return self.func("range", rowcount)
2509 def lambda_sql( 2510 self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True 2511 ) -> str: 2512 if expression.args.get("colon"): 2513 prefix = "LAMBDA " 2514 arrow_sep = ":" 2515 wrap = False 2516 else: 2517 prefix = "" 2518 2519 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2520 return f"{prefix}{lambda_sql}"
2525 def install_sql(self, expression: exp.Install) -> str: 2526 force = "FORCE " if expression.args.get("force") else "" 2527 this = self.sql(expression, "this") 2528 from_clause = expression.args.get("from_") 2529 from_clause = f" FROM {from_clause}" if from_clause else "" 2530 return f"{force}INSTALL {this}{from_clause}"
2541 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2542 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2543 target_type = expression.args.get("target_type") 2544 needs_tz = target_type and target_type.this in ( 2545 exp.DataType.Type.TIMESTAMPLTZ, 2546 exp.DataType.Type.TIMESTAMPTZ, 2547 ) 2548 2549 if expression.args.get("safe"): 2550 formatted_time = self.format_time(expression) 2551 cast_type = ( 2552 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2553 ) 2554 return self.sql( 2555 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2556 ) 2557 2558 base_sql = str_to_time_sql(self, expression) 2559 if needs_tz: 2560 return self.sql( 2561 exp.cast( 2562 base_sql, 2563 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2564 ) 2565 ) 2566 return base_sql
2568 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2569 formatted_time = self.format_time(expression) 2570 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2571 return self.sql( 2572 exp.cast( 2573 self.func(function_name, expression.this, formatted_time), 2574 exp.DataType(this=exp.DataType.Type.DATE), 2575 ) 2576 )
2578 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2579 this = expression.this 2580 time_format = self.format_time(expression) 2581 safe = expression.args.get("safe") 2582 time_type = exp.DataType.build("TIME", dialect="duckdb") 2583 cast_expr = exp.TryCast if safe else exp.Cast 2584 2585 if time_format: 2586 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2587 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2588 return self.sql(cast_expr(this=strptime, to=time_type)) 2589 2590 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME): 2591 return self.sql(this) 2592 2593 return self.sql(cast_expr(this=this, to=time_type))
2595 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2596 if not expression.this: 2597 return "CURRENT_DATE" 2598 2599 expr = exp.Cast( 2600 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2601 to=exp.DataType(this=exp.DataType.Type.DATE), 2602 ) 2603 return self.sql(expr)
2611 def normal_sql(self, expression: exp.Normal) -> str: 2612 """ 2613 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2614 2615 Uses the Box-Muller transform via NORMAL_TEMPLATE. 2616 """ 2617 mean = expression.this 2618 stddev = expression.args["stddev"] 2619 gen: exp.Expression = expression.args["gen"] 2620 2621 # Build two uniform random values [0, 1) for Box-Muller transform 2622 if isinstance(gen, exp.Rand) and gen.this is None: 2623 u1: exp.Expression = exp.Rand() 2624 u2: exp.Expression = exp.Rand() 2625 else: 2626 # Seeded: derive two values using HASH with different inputs 2627 seed = gen.this if isinstance(gen, exp.Rand) else gen 2628 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2629 u2 = exp.replace_placeholders( 2630 self.SEEDED_RANDOM_TEMPLATE, 2631 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2632 ) 2633 2634 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2635 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))
Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.
Uses the Box-Muller transform via NORMAL_TEMPLATE.
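In the standard Box-Muller transform, two independent uniform values u1, u2 in (0, 1) give a standard normal deviate z = sqrt(-2 * ln(u1)) * cos(2 * pi * u2); the template presumably scales this to mean + stddev * z.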
2637 def uniform_sql(self, expression: exp.Uniform) -> str: 2638 """ 2639 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2640 2641 UNIFORM returns a random value in [min, max]: 2642 - Integer result if both min and max are integers 2643 - Float result if either min or max is a float 2644 """ 2645 min_val = expression.this 2646 max_val = expression.expression 2647 gen = expression.args.get("gen") 2648 2649 # Determine if result should be integer (both bounds are integers). 2650 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2651 is_int_result = min_val.is_int and max_val.is_int 2652 2653 # Build the random value expression [0, 1) 2654 if not isinstance(gen, exp.Rand): 2655 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2656 random_expr: exp.Expression = exp.Div( 2657 this=exp.Paren( 2658 this=exp.Mod( 2659 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2660 expression=exp.Literal.number(1000000), 2661 ) 2662 ), 2663 expression=exp.Literal.number(1000000.0), 2664 ) 2665 else: 2666 random_expr = exp.Rand() 2667 2668 # Build: min + random * (max - min [+ 1 for int]) 2669 range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val) 2670 if is_int_result: 2671 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2672 2673 result: exp.Expression = exp.Add( 2674 this=min_val, 2675 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2676 ) 2677 2678 if is_int_result: 2679 result = exp.Cast( 2680 this=exp.Floor(this=result), 2681 to=exp.DataType.build("BIGINT"), 2682 ) 2683 2684 return self.sql(result)
Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.
UNIFORM returns a random value in [min, max]:
- Integer result if both min and max are integers
- Float result if either min or max is a float
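For example, UNIFORM(1, 10, RANDOM()) comes out roughly as CAST(FLOOR(1 + RANDOM() * (10 - 1 + 1)) AS BIGINT), whereas UNIFORM(0.0, 1.0, RANDOM()) keeps the float form 0.0 + RANDOM() * (1.0 - 0.0).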
2686 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2687 nano = expression.args.get("nano") 2688 overflow = expression.args.get("overflow") 2689 2690 # Snowflake's TIME_FROM_PARTS supports overflow 2691 if overflow: 2692 hour = expression.args["hour"] 2693 minute = expression.args["min"] 2694 sec = expression.args["sec"] 2695 2696 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2697 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2698 try: 2699 h_val = hour.to_py() 2700 m_val = minute.to_py() 2701 s_val = sec.to_py() 2702 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2703 return rename_func("MAKE_TIME")(self, expression) 2704 except ValueError: 2705 pass 2706 2707 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2708 if nano: 2709 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2710 2711 total_seconds = ( 2712 hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2713 ) 2714 2715 return self.sql( 2716 exp.Add( 2717 this=exp.Cast( 2718 this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME") 2719 ), 2720 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2721 ) 2722 ) 2723 2724 # Default: MAKE_TIME 2725 if nano: 2726 expression.set( 2727 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2728 ) 2729 2730 return rename_func("MAKE_TIME")(self, expression)
2732 def extract_sql(self, expression: exp.Extract) -> str: 2733 """ 2734 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2735 2736 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2737 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2738 """ 2739 this = expression.this 2740 datetime_expr = expression.expression 2741 2742 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2743 # because Snowflake applies server timezone while DuckDB uses local timezone 2744 if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ): 2745 self.unsupported( 2746 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2747 ) 2748 2749 part_name = this.name.upper() 2750 2751 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2752 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2753 2754 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2755 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2756 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2757 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 2758 ) 2759 2760 if is_nano_time: 2761 self.unsupported( 2762 "Parameter NANOSECOND is not supported with TIME type in DuckDB" 2763 ) 2764 return self.sql( 2765 exp.cast( 2766 exp.Mul( 2767 this=exp.Extract( 2768 this=exp.var("MICROSECOND"), expression=datetime_expr 2769 ), 2770 expression=exp.Literal.number(1000), 2771 ), 2772 exp.DataType.build(cast_type, dialect="duckdb"), 2773 ) 2774 ) 2775 2776 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2777 strftime_input = datetime_expr 2778 if part_name == "NANOSECOND": 2779 strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS) 2780 2781 return self.sql( 2782 exp.cast( 2783 exp.Anonymous( 2784 this="STRFTIME", 2785 expressions=[strftime_input, exp.Literal.string(fmt)], 2786 ), 2787 exp.DataType.build(cast_type, dialect="duckdb"), 2788 ) 2789 ) 2790 2791 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2792 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2793 result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2794 # EPOCH returns float, cast to BIGINT for integer result 2795 if part_name == "EPOCH_SECOND": 2796 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2797 return self.sql(result) 2798 2799 return super().extract_sql(expression)
Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.
DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
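For instance, assuming EXTRACT_EPOCH_MAPPINGS routes EPOCH_SECOND to DuckDB's EPOCH function, EXTRACT(EPOCH_SECOND FROM ts) would come out roughly as CAST(EPOCH(ts) AS BIGINT), matching the integer semantics of the Snowflake specifier.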
2801 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2802 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2803 date_expr = expression.this 2804 time_expr = expression.expression 2805 2806 if date_expr is not None and time_expr is not None: 2807 # In DuckDB, DATE + TIME produces TIMESTAMP 2808 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2809 2810 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 2811 sec = expression.args.get("sec") 2812 if sec is None: 2813 # This shouldn't happen with valid input, but handle gracefully 2814 return rename_func("MAKE_TIMESTAMP")(self, expression) 2815 2816 milli = expression.args.get("milli") 2817 if milli is not None: 2818 sec += milli.pop() / exp.Literal.number(1000.0) 2819 2820 nano = expression.args.get("nano") 2821 if nano is not None: 2822 sec += nano.pop() / exp.Literal.number(1000000000.0) 2823 2824 if milli or nano: 2825 expression.set("sec", sec) 2826 2827 return rename_func("MAKE_TIMESTAMP")(self, expression)
2829 @unsupported_args("nano") 2830 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2831 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2832 if nano := expression.args.get("nano"): 2833 nano.pop() 2834 2835 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2836 return f"CAST({timestamp} AS TIMESTAMPTZ)"
2838 @unsupported_args("nano") 2839 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2840 # Extract zone before popping 2841 zone = expression.args.get("zone") 2842 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2843 if zone: 2844 zone = zone.pop() 2845 2846 if nano := expression.args.get("nano"): 2847 nano.pop() 2848 2849 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2850 2851 if zone: 2852 # Use AT TIME ZONE to apply the explicit timezone 2853 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2854 2855 return timestamp
2857 def tablesample_sql( 2858 self, 2859 expression: exp.TableSample, 2860 tablesample_keyword: t.Optional[str] = None, 2861 ) -> str: 2862 if not isinstance(expression.parent, exp.Select): 2863 # This sample clause only applies to a single source, not the entire resulting relation 2864 tablesample_keyword = "TABLESAMPLE" 2865 2866 if expression.args.get("size"): 2867 method = expression.args.get("method") 2868 if method and method.name.upper() != "RESERVOIR": 2869 self.unsupported( 2870 f"Sampling method {method} is not supported with a discrete sample count, " 2871 "defaulting to reservoir sampling" 2872 ) 2873 expression.set("method", exp.var("RESERVOIR")) 2874 2875 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)
2882 def join_sql(self, expression: exp.Join) -> str: 2883 if ( 2884 not expression.args.get("using") 2885 and not expression.args.get("on") 2886 and not expression.method 2887 and (expression.kind in ("", "INNER", "OUTER")) 2888 ): 2889 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2890 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2891 if isinstance(expression.this, exp.Unnest): 2892 return super().join_sql(expression.on(exp.true())) 2893 2894 expression.set("side", None) 2895 expression.set("kind", None) 2896 2897 return super().join_sql(expression)
2913 def bracket_sql(self, expression: exp.Bracket) -> str: 2914 if self.dialect.version >= (1, 2): 2915 return super().bracket_sql(expression) 2916 2917 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2918 this = expression.this 2919 if isinstance(this, exp.Array): 2920 this.replace(exp.paren(this)) 2921 2922 bracket = super().bracket_sql(expression) 2923 2924 if not expression.args.get("returns_list_for_maps"): 2925 if not this.type: 2926 from sqlglot.optimizer.annotate_types import annotate_types 2927 2928 this = annotate_types(this, dialect=self.dialect) 2929 2930 if this.is_type(exp.DataType.Type.MAP): 2931 bracket = f"({bracket})[1]" 2932 2933 return bracket
2935 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2936 func = expression.this 2937 2938 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2939 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2940 if isinstance(func, exp.ArrayAgg): 2941 if not isinstance(order := expression.expression, exp.Order): 2942 return self.sql(func) 2943 2944 # Save the original column for FILTER clause (before wrapping with Order) 2945 original_this = func.this 2946 2947 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 2948 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 2949 func.set( 2950 "this", 2951 exp.Order( 2952 this=func.this.copy(), 2953 expressions=order.expressions, 2954 ), 2955 ) 2956 2957 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 2958 # Use original_this (not the Order-wrapped version) for the FILTER condition 2959 array_agg_sql = self.function_fallback_sql(func) 2960 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 2961 2962 # For other functions (like PERCENTILES), use existing logic 2963 expression_sql = self.sql(expression, "expression") 2964 2965 if isinstance(func, exp.PERCENTILES): 2966 # Make the order key the first arg and slide the fraction to the right 2967 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 2968 order_col = expression.find(exp.Ordered) 2969 if order_col: 2970 func.set("expression", func.this) 2971 func.set("this", order_col.this) 2972 2973 this = self.sql(expression, "this").rstrip(")") 2974 2975 return f"{this}{expression_sql})"
2977 def length_sql(self, expression: exp.Length) -> str: 2978 arg = expression.this 2979 2980 # Dialects like BQ and Snowflake also accept binary values as args, so 2981 # DDB will attempt to infer the type or resort to case/when resolution 2982 if not expression.args.get("binary") or arg.is_string: 2983 return self.func("LENGTH", arg) 2984 2985 if not arg.type: 2986 from sqlglot.optimizer.annotate_types import annotate_types 2987 2988 arg = annotate_types(arg, dialect=self.dialect) 2989 2990 if arg.is_type(*exp.DataType.TEXT_TYPES): 2991 return self.func("LENGTH", arg) 2992 2993 # We need these casts to make duckdb's static type checker happy 2994 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 2995 varchar = exp.cast(arg, exp.DataType.Type.VARCHAR) 2996 2997 case = ( 2998 exp.case(self.func("TYPEOF", arg)) 2999 .when("'BLOB'", self.func("OCTET_LENGTH", blob)) 3000 .else_( 3001 exp.Anonymous(this="LENGTH", expressions=[varchar]) 3002 ) # anonymous to break length_sql recursion 3003 ) 3004 3005 return self.sql(case)
def sha_sql(self, expression: exp.SHA) -> str:
    arg = expression.this

    # If type is compatible with DuckDB or is an unknown type, use directly
    if (
        arg.type
        and arg.type.this != exp.DataType.Type.UNKNOWN
        and not arg.is_type(*exp.DataType.TEXT_TYPES)
        and not _is_binary(arg)
    ):
        arg = exp.cast(arg, exp.DataType.Type.VARCHAR)

    return self.func("SHA1", arg)
@unsupported_args("ins_cost", "del_cost", "sub_cost")
def levenshtein_sql(self, expression: exp.Levenshtein) -> str:
    this = expression.this
    expr = expression.expression
    max_dist = expression.args.get("max_dist")

    if max_dist is None:
        return self.func("LEVENSHTEIN", this, expr)

    # Emulate Snowflake semantics: if distance > max_dist, return max_dist
    levenshtein = exp.Levenshtein(this=this, expression=expr)
    return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
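Illustrative sketch, assuming the Snowflake reader parses EDITDISTANCE's optional third argument into exp.Levenshtein's max_dist as this method expects; the output is printed rather than asserted because formatting may differ by version.

import sqlglot

# A max-distance cap has no LEVENSHTEIN counterpart in DuckDB, so the transform above
# should emulate it with LEAST(LEVENSHTEIN(a, b), max_dist).
sql = "SELECT EDITDISTANCE('kitten', 'sitting', 3)"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])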
def minhash_sql(self, expression: exp.Minhash) -> str:
    k = expression.this
    exprs = expression.expressions

    if len(exprs) != 1 or isinstance(exprs[0], exp.Star):
        self.unsupported(
            "MINHASH with multiple expressions or * requires manual query restructuring"
        )
        return self.func("MINHASH", k, *exprs)

    expr = exprs[0]
    result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k)
    return f"({self.sql(result)})"
def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
    args = expression.expressions

    if not args:
        # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
        return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))

    # Build placeholder values for template
    lengths = [exp.Length(this=arg) for arg in args]
    max_len = (
        lengths[0]
        if len(lengths) == 1
        else exp.Greatest(this=lengths[0], expressions=lengths[1:])
    )

    # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...}
    empty_struct = exp.func(
        "STRUCT",
        *[
            exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
            for i in range(len(args))
        ],
    )

    # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
    # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax
    index = exp.column("__i") + 1
    transform_struct = exp.func(
        "STRUCT",
        *[
            exp.PropertyEQ(
                this=exp.Literal.string(f"${i + 1}"),
                expression=exp.func("COALESCE", arg, exp.array())[index],
            )
            for i, arg in enumerate(args)
        ],
    )

    result = exp.replace_placeholders(
        self.ARRAYS_ZIP_TEMPLATE.copy(),
        null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
        all_empty_check=exp.and_(
            *[
                exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
                for arg in args
            ]
        ),
        empty_struct=empty_struct,
        max_len=max_len,
        transform_struct=transform_struct,
    )
    return self.sql(result)
def base64encode_sql(self, expression: exp.Base64Encode) -> str:
    # DuckDB TO_BASE64 requires BLOB input
    # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
    # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
    result = expression.this

    # Check if input is a string type - ENCODE only accepts VARCHAR
    if result.is_type(*exp.DataType.TEXT_TYPES):
        result = exp.Encode(this=result)

    result = exp.ToBase64(this=result)

    max_line_length = expression.args.get("max_line_length")
    alphabet = expression.args.get("alphabet")

    # Handle custom alphabet by replacing standard chars with custom ones
    result = _apply_base64_alphabet_replacements(result, alphabet)

    # Handle max_line_length by inserting newlines every N characters
    line_length = (
        t.cast(int, max_line_length.to_py())
        if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
        else 0
    )
    if line_length > 0:
        newline = exp.Chr(expressions=[exp.Literal.number(10)])
        result = exp.Trim(
            this=exp.RegexpReplace(
                this=result,
                expression=exp.Literal.string(f"(.{{{line_length}}})"),
                replacement=exp.Concat(
                    expressions=[exp.Literal.string("\\1"), newline.copy()]
                ),
            ),
            expression=newline,
            position="TRAILING",
        )

    return self.sql(result)
def replace_sql(self, expression: exp.Replace) -> str:
    result_sql = self.func(
        "REPLACE",
        _cast_to_varchar(expression.this),
        _cast_to_varchar(expression.expression),
        _cast_to_varchar(expression.args.get("replacement")),
    )
    return _gen_with_cast_to_blob(self, expression, result_sql)
def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
    this = expression.this
    key = expression.args.get("key")
    key_sql = key.name if isinstance(key, exp.Expression) else ""
    value_sql = self.sql(expression, "value")

    kv_sql = f"{key_sql} := {value_sql}"

    # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake
    # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB
    if isinstance(this, exp.Struct) and not this.expressions:
        return self.func("STRUCT_PACK", kv_sql)

    return self.func("STRUCT_INSERT", this, kv_sql)
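Illustrative sketch of the empty-struct branch above, via sqlglot's transpile API; the key and value literals are placeholders, and the exact output may differ across versions.

import sqlglot

# Inserting into an empty OBJECT_CONSTRUCT() should produce STRUCT_PACK(k := v), while a
# non-empty input struct should go through STRUCT_INSERT instead, per objectinsert_sql.
sql = "SELECT OBJECT_INSERT(OBJECT_CONSTRUCT(), 'k', 'v')"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])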
def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
    # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE)
    if isinstance(expression.this, exp.Generator):
        # Preserve alias, joins, and other table-level args
        table = exp.Table(
            this=expression.this,
            alias=expression.args.get("alias"),
            joins=expression.args.get("joins"),
        )
        return self.sql(table)

    return super().tablefromrows_sql(expression)
def unnest_sql(self, expression: exp.Unnest) -> str:
    explode_array = expression.args.get("explode_array")
    if explode_array:
        # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct
        # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))"
        expression.expressions.append(
            exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2))
        )

        # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB
        alias = expression.args.get("alias")
        if isinstance(alias, exp.TableAlias):
            expression.set("alias", None)
            if alias.columns:
                alias = exp.TableAlias(this=seq_get(alias.columns, 0))

        unnest_sql = super().unnest_sql(expression)
        select = exp.Select(expressions=[unnest_sql]).subquery(alias)
        return self.sql(select)

    return super().unnest_sql(expression)
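Illustrative sketch, assuming the BigQuery reader flags FROM-clause UNNESTs with explode_array as this method expects; the struct fields are placeholders and the output is printed, not asserted.

import sqlglot

# A BigQuery FROM UNNEST(...) over an array of structs should be rewritten to a subquery
# that SELECTs UNNEST(..., max_depth => 2), per the explode_array branch above.
sql = "SELECT * FROM UNNEST([STRUCT(1 AS a), STRUCT(2 AS a)]) AS t"
print(sqlglot.transpile(sql, read="bigquery", write="duckdb")[0])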
def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
    this = expression.this

    if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
        # DuckDB should render IGNORE NULLS only for the general-purpose
        # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...)
        return super().ignorenulls_sql(expression)

    if isinstance(this, exp.First):
        this = exp.AnyValue(this=this.this)

    if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
        self.unsupported("IGNORE NULLS is not supported for non-window functions.")

    return self.sql(this)
def respectnulls_sql(self, expression: exp.RespectNulls) -> str:
    if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
        # DuckDB should render RESPECT NULLS only for the general-purpose
        # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...)
        return super().respectnulls_sql(expression)

    self.unsupported("RESPECT NULLS is not supported for non-window functions.")
    return self.sql(expression, "this")
def arraytostring_sql(self, expression: exp.ArrayToString) -> str:
    this = self.sql(expression, "this")
    null_text = self.sql(expression, "null")

    if null_text:
        this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))"

    return self.func("ARRAY_TO_STRING", this, expression.expression)
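Illustrative sketch of the NULL-replacement branch, using a BigQuery-style ARRAY_TO_STRING call with a third argument; output is printed, not asserted.

import sqlglot

# The third (NULL-replacement) argument has no direct DuckDB equivalent, so the transform
# above substitutes NULL elements up front with LIST_TRANSFORM + COALESCE.
sql = "SELECT ARRAY_TO_STRING(['a', NULL, 'c'], ',', 'missing')"
print(sqlglot.transpile(sql, read="bigquery", write="duckdb")[0])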
def regexpextract_sql(self, expression: exp.RegexpExtract) -> str:
    this = expression.this
    group = expression.args.get("group")
    params = expression.args.get("parameters")
    position = expression.args.get("position")
    occurrence = expression.args.get("occurrence")
    null_if_pos_overflow = expression.args.get("null_if_pos_overflow")

    if position and (not position.is_int or position.to_py() > 1):
        this = exp.Substring(this=this, start=position)

        if null_if_pos_overflow:
            this = exp.Nullif(this=this, expression=exp.Literal.string(""))

    # Do not render group if there is no following argument,
    # and it's the default value for this dialect
    if (
        not params
        and group
        and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP)
    ):
        group = None

    if occurrence and (not occurrence.is_int or occurrence.to_py() > 1):
        return self.func(
            "ARRAY_EXTRACT",
            self.func("REGEXP_EXTRACT_ALL", this, expression.expression, group, params),
            exp.Literal.number(occurrence),
        )

    return self.func("REGEXP_EXTRACT", this, expression.expression, group, params)
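Illustrative sketch, assuming the Snowflake reader maps REGEXP_SUBSTR's position and occurrence arguments onto exp.RegexpExtract as this method expects; the column and table names are placeholders.

import sqlglot

# A non-default occurrence (here: the 2nd match) cannot be expressed with DuckDB's
# REGEXP_EXTRACT alone, so the transform above indexes into REGEXP_EXTRACT_ALL instead.
sql = r"SELECT REGEXP_SUBSTR(col, '\d+', 1, 2) FROM t"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])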
@unsupported_args("culture")
def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
    fmt = expression.args.get("format")
    if fmt and fmt.is_int:
        return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this)

    self.unsupported("Only integer formats are supported by NumberToStr")
    return self.function_fallback_sql(expression)
def posexplode_sql(self, expression: exp.Posexplode) -> str:
    this = expression.this
    parent = expression.parent

    # The default Spark aliases are "pos" and "col", unless specified otherwise
    pos, col = exp.to_identifier("pos"), exp.to_identifier("col")

    if isinstance(parent, exp.Aliases):
        # Column case: SELECT POSEXPLODE(col) [AS (a, b)]
        pos, col = parent.expressions
    elif isinstance(parent, exp.Table):
        # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)]
        alias = parent.args.get("alias")
        if alias:
            pos, col = alias.columns or [pos, col]
            alias.pop()

    # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS
    # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS
    unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col))
    gen_subscripts = self.sql(
        exp.Alias(
            this=exp.Anonymous(
                this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)]
            )
            - exp.Literal.number(1),
            alias=pos,
        )
    )

    posexplode_sql = self.format_args(gen_subscripts, unnest_sql)

    if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)):
        # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...))
        return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql])))

    return posexplode_sql
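Illustrative sketch of the column-projection case via sqlglot's transpile API, reading Spark syntax; output is printed rather than asserted.

import sqlglot

# Spark's POSEXPLODE has no direct DuckDB counterpart; per posexplode_sql it becomes a
# pair of projections: GENERATE_SUBSCRIPTS(arr, 1) - 1 AS pos and UNNEST(arr) AS col.
sql = "SELECT POSEXPLODE(ARRAY(10, 20, 30))"
print(sqlglot.transpile(sql, read="spark", write="duckdb")[0])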
def addmonths_sql(self, expression: exp.AddMonths) -> str:
    """
    Handles three key issues:
    1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
    2. End-of-month preservation: If input is last day of month, result is last day of result month
    3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
    """
    from sqlglot.optimizer.annotate_types import annotate_types

    this = expression.this
    if not this.type:
        this = annotate_types(this, dialect=self.dialect)

    if this.is_type(*exp.DataType.TEXT_TYPES):
        this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP))

    # Detect float/decimal months to apply rounding (Snowflake behavior)
    # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
    months_expr = expression.expression
    if not months_expr.type:
        months_expr = annotate_types(months_expr, dialect=self.dialect)

    # Build interval or to_months expression based on type
    # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT))
    interval_or_to_months = (
        exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
        if months_expr.is_type(
            exp.DataType.Type.FLOAT,
            exp.DataType.Type.DOUBLE,
            exp.DataType.Type.DECIMAL,
        )
        # Integer case: standard INTERVAL N MONTH syntax
        else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
    )

    date_add_expr = exp.Add(this=this, expression=interval_or_to_months)

    # Apply end-of-month preservation if Snowflake flag is set
    # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
    preserve_eom = expression.args.get("preserve_end_of_month")
    result_expr = (
        exp.case()
        .when(
            exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
            exp.func("LAST_DAY", date_add_expr),
        )
        .else_(date_add_expr)
        if preserve_eom
        else date_add_expr
    )

    # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE
    # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type)
    # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ
    # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
    if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ):
        return self.sql(exp.Cast(this=result_expr, to=this.type))
    return self.sql(result_expr)
Handles three key issues:
- Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
- End-of-month preservation: If input is last day of month, result is last day of result month
- Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
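Illustrative sketch, assuming the Snowflake reader sets preserve_end_of_month on exp.AddMonths as described above; output is printed, not asserted.

import sqlglot

# Snowflake's ADD_MONTHS preserves the end of month and the input's DATE type; the
# transform above reproduces that with a LAST_DAY CASE expression and a final CAST.
sql = "SELECT ADD_MONTHS(CAST('2023-01-31' AS DATE), 1)"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])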
def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
    unit = unit_to_str(expression)
    date = expression.this
    result = self.func("DATE_TRUNC", unit, date)

    if (
        expression.args.get("input_type_preserved")
        and date.is_type(*exp.DataType.TEMPORAL_TYPES)
        and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE))
    ):
        return self.sql(exp.Cast(this=result, to=date.type))

    return result
def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
    unit = unit_to_str(expression)
    zone = expression.args.get("zone")
    timestamp = expression.this
    date_unit = is_date_unit(unit)

    if date_unit and zone:
        # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
        # Double AT TIME ZONE needed for BigQuery compatibility:
        # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
        # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component)
        timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
        result_sql = self.func("DATE_TRUNC", unit, timestamp)
        return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))

    result = self.func("DATE_TRUNC", unit, timestamp)
    if expression.args.get("input_type_preserved"):
        if timestamp.type and timestamp.is_type(
            exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
        ):
            dummy_date = exp.Cast(
                this=exp.Literal.string("1970-01-01"),
                to=exp.DataType(this=exp.DataType.Type.DATE),
            )
            date_time = exp.Add(this=dummy_date, expression=timestamp)
            result = self.func("DATE_TRUNC", unit, date_time)
            return self.sql(exp.Cast(this=result, to=timestamp.type))

        if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
            date_unit and timestamp.is_type(exp.DataType.Type.DATE)
        ):
            return self.sql(exp.Cast(this=result, to=timestamp.type))

    return result
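Illustrative sketch of the timezone branch, reading BigQuery syntax; ts and t are placeholders and output is printed, not asserted.

import sqlglot

# BigQuery's TIMESTAMP_TRUNC with a timezone truncates in that zone and returns UTC;
# the transform above wraps DATE_TRUNC in two AT TIME ZONE conversions to match it.
sql = "SELECT TIMESTAMP_TRUNC(ts, DAY, 'America/New_York') FROM t"
print(sqlglot.transpile(sql, read="bigquery", write="duckdb")[0])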
def trim_sql(self, expression: exp.Trim) -> str:
    expression.this.replace(_cast_to_varchar(expression.this))
    if expression.expression:
        expression.expression.replace(_cast_to_varchar(expression.expression))

    result_sql = super().trim_sql(expression)
    return _gen_with_cast_to_blob(self, expression, result_sql)
def round_sql(self, expression: exp.Round) -> str:
    this = expression.this
    decimals = expression.args.get("decimals")
    truncate = expression.args.get("truncate")

    # DuckDB requires the scale (decimals) argument to be an INT
    # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
    if decimals is not None and expression.args.get("casts_non_integer_decimals"):
        if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
            decimals = exp.cast(decimals, exp.DataType.Type.INT)

    func = "ROUND"
    if truncate:
        # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
        if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
            func = "ROUND_EVEN"
            truncate = None
        # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
        elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
            truncate = None

    return self.func(func, this, decimals, truncate)
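Illustrative sketch, assuming the Snowflake reader stores the rounding-mode argument in exp.Round's truncate arg as the branch above checks; output is printed, not asserted.

import sqlglot

# A HALF_TO_EVEN rounding mode should map to DuckDB's ROUND_EVEN function, with the mode
# argument dropped, per round_sql above.
sql = "SELECT ROUND(2.5, 0, 'HALF_TO_EVEN')"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])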
def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
    result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile"))

    # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type
    if expression.is_type(*exp.DataType.REAL_TYPES):
        result = f"CAST({result} AS DOUBLE)"

    return result
def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
    """
    BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
    dividing the input distribution into n equal-sized buckets.

    Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
    does not document the specific algorithm used so results may differ. DuckDB does not
    support RESPECT NULLS.
    """
    this = expression.this
    if isinstance(this, exp.Distinct):
        # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
        if len(this.expressions) < 2:
            self.unsupported("APPROX_QUANTILES requires a bucket count argument")
            return self.function_fallback_sql(expression)
        num_quantiles_expr = this.expressions[1].pop()
    else:
        num_quantiles_expr = expression.expression

    if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    num_quantiles = t.cast(int, num_quantiles_expr.to_py())
    if num_quantiles <= 0:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    quantiles = [
        exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
        for i in range(num_quantiles + 1)
    ]

    return self.sql(
        exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
    )
BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values dividing the input distribution into n equal-sized buckets.
Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery does not document the specific algorithm used so results may differ. DuckDB does not support RESPECT NULLS.
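Illustrative sketch via sqlglot's transpile API, reading BigQuery syntax; x and t are placeholders, and the rendered fractions and formatting may differ by version.

import sqlglot

# APPROX_QUANTILES(x, 4) should become APPROX_QUANTILE(x, [0, 0.25, 0.5, 0.75, 1]),
# i.e. n + 1 evenly spaced quantile fractions, per approxquantiles_sql above.
sql = "SELECT APPROX_QUANTILES(x, 4) FROM t"
print(sqlglot.transpile(sql, read="bigquery", write="duckdb")[0])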
def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
    this = expression.this

    if _is_binary(this):
        expression.type = exp.DataType.build("BINARY")

        arg = _cast_to_bit(this)

        if isinstance(this, exp.Neg):
            arg = exp.Paren(this=arg)

        expression.set("this", arg)

    result_sql = f"~{self.sql(expression, 'this')}"

    return _gen_with_cast_to_blob(self, expression, result_sql)
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_ONLY_LITERALS
- GROUPINGS_SEP
- INDEX_ON
- INOUT_SEPARATOR
- DIRECTED_JOINS
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_WITH_METHOD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- LIKE_PROPERTY_INSIDE_SCHEMA
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- TRY_SUPPORTED
- SUPPORTS_UESCAPE
- UNICODE_SUBSTITUTE
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- SUPPORTS_BETWEEN_FLAGS
- MATCH_AGAINST_TABLE_PREFIX
- UPDATE_STATEMENT_SUPPORTS_FROM
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- pseudocolumn_sql
- columnposition_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- inoutcolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- rollupindex_sql
- rollupproperty_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- booland_sql
- boolor_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- prewhere_sql
- where_sql
- partition_by_sql
- windowspec_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwiseor_sql
- bitwiserightshift_sql
- cast_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- eq_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- match_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- sub_sql
- trycast_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- mltranslate_sql
- mlforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- slice_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql
- directorystage_sql
- uuid_sql
- initcap_sql
- localtime_sql
- localtimestamp_sql
- weekstart_sql
- chr_sql