sqlglot.generators.duckdb
1from __future__ import annotations 2 3from decimal import Decimal 4from itertools import groupby 5import re 6import typing as t 7 8from sqlglot import exp, generator, transforms 9 10from sqlglot.dialects.dialect import ( 11 DATETIME_DELTA, 12 JSON_EXTRACT_TYPE, 13 approx_count_distinct_sql, 14 array_append_sql, 15 array_compact_sql, 16 array_concat_sql, 17 arrow_json_extract_sql, 18 count_if_to_sum, 19 date_delta_to_binary_interval_op, 20 datestrtodate_sql, 21 encode_decode_sql, 22 explode_to_unnest_sql, 23 generate_series_sql, 24 getbit_sql, 25 groupconcat_sql, 26 inline_array_unless_query, 27 months_between_sql, 28 no_datetime_sql, 29 no_comment_column_constraint_sql, 30 no_make_interval_sql, 31 no_time_sql, 32 no_timestamp_sql, 33 rename_func, 34 remove_from_array_using_filter, 35 strposition_sql, 36 str_to_time_sql, 37 timestrtotime_sql, 38 unit_to_str, 39) 40from sqlglot.generator import unsupported_args 41from sqlglot.helper import is_date_unit, seq_get 42from builtins import type as Type 43 44# Regex to detect time zones in timestamps of the form [+|-]TT[:tt] 45# The pattern matches timezone offsets that appear after the time portion 46TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?") 47 48# Characters that must be escaped when building regex expressions in INITCAP 49REGEX_ESCAPE_REPLACEMENTS = { 50 "\\": "\\\\", 51 "-": r"\-", 52 "^": r"\^", 53 "[": r"\[", 54 "]": r"\]", 55} 56 57# Used to in RANDSTR transpilation 58RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 59RANDSTR_SEED = 123456 60 61# Whitespace control characters that DuckDB must process with `CHR({val})` calls 62WS_CONTROL_CHARS_TO_DUCK = { 63 "\u000b": 11, 64 "\u001c": 28, 65 "\u001d": 29, 66 "\u001e": 30, 67 "\u001f": 31, 68} 69 70# Days of week to ISO 8601 day-of-week numbers 71# ISO 8601 standard: Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6, Sunday=7 72WEEK_START_DAY_TO_DOW = { 73 "MONDAY": 1, 74 "TUESDAY": 2, 75 "WEDNESDAY": 3, 76 "THURSDAY": 4, 77 "FRIDAY": 5, 78 "SATURDAY": 6, 79 "SUNDAY": 7, 80} 81 82MAX_BIT_POSITION = exp.Literal.number(32768) 83 84# cs/as/ps are Snowflake defaults; DuckDB already behaves the same way, so they are safe to drop. 85# Note: "as" is also a reserved keyword in DuckDB, making it impossible to pass through. 86_SNOWFLAKE_COLLATION_DEFAULTS = frozenset({"cs", "as", "ps"}) 87_SNOWFLAKE_COLLATION_UNSUPPORTED = frozenset( 88 {"ci", "ai", "upper", "lower", "utf8", "bin", "pi", "fl", "fu", "trim", "ltrim", "rtrim"} 89) 90 91# Window functions that support IGNORE/RESPECT NULLS in DuckDB 92_IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = ( 93 exp.FirstValue, 94 exp.Lag, 95 exp.LastValue, 96 exp.Lead, 97 exp.NthValue, 98) 99 100# SEQ function constants 101_SEQ_BASE: exp.Expr = exp.maybe_parse("(ROW_NUMBER() OVER (ORDER BY 1) - 1)") 102_SEQ_RESTRICTED = (exp.Where, exp.Having, exp.AggFunc, exp.Order, exp.Select) 103# Maps SEQ expression types to their byte width (suffix indicates bytes: SEQ1=1, SEQ2=2, etc.) 104_SEQ_BYTE_WIDTH = {exp.Seq1: 1, exp.Seq2: 2, exp.Seq4: 4, exp.Seq8: 8} 105 106# Template for generating signed and unsigned SEQ values within a specified range 107_SEQ_UNSIGNED: exp.Expr = exp.maybe_parse(":base % :max_val") 108_SEQ_SIGNED: exp.Expr = exp.maybe_parse( 109 "(CASE WHEN :base % :max_val >= :half " 110 "THEN :base % :max_val - :max_val " 111 "ELSE :base % :max_val END)" 112) 113 114 115def _apply_base64_alphabet_replacements( 116 result: exp.Expr, 117 alphabet: exp.Expr | None, 118 reverse: bool = False, 119) -> exp.Expr: 120 """ 121 Apply base64 alphabet character replacements. 122 123 Base64 alphabet can be 1-3 chars: 1st = index 62 ('+'), 2nd = index 63 ('/'), 3rd = padding ('='). 124 zip truncates to the shorter string, so 1-char alphabet only replaces '+', 2-char replaces '+/', etc. 125 126 Args: 127 result: The expression to apply replacements to 128 alphabet: Custom alphabet literal (expected chars for +/=) 129 reverse: If False, replace default with custom (encode) 130 If True, replace custom with default (decode) 131 """ 132 if isinstance(alphabet, exp.Literal) and alphabet.is_string: 133 for default_char, new_char in zip("+/=", alphabet.this): 134 if new_char != default_char: 135 find, replace = (new_char, default_char) if reverse else (default_char, new_char) 136 result = exp.Replace( 137 this=result, 138 expression=exp.Literal.string(find), 139 replacement=exp.Literal.string(replace), 140 ) 141 return result 142 143 144def _base64_decode_sql(self: DuckDBGenerator, expression: exp.Expr, to_string: bool) -> str: 145 """ 146 Transpile Snowflake BASE64_DECODE_STRING/BINARY to DuckDB. 147 148 DuckDB uses FROM_BASE64() which returns BLOB. For string output, wrap with DECODE(). 149 Custom alphabets require REPLACE() calls to convert to standard base64. 150 """ 151 input_expr = expression.this 152 alphabet = expression.args.get("alphabet") 153 154 # Handle custom alphabet by replacing non-standard chars with standard ones 155 input_expr = _apply_base64_alphabet_replacements(input_expr, alphabet, reverse=True) 156 157 # FROM_BASE64 returns BLOB 158 input_expr = exp.FromBase64(this=input_expr) 159 160 if to_string: 161 input_expr = exp.Decode(this=input_expr) 162 163 return self.sql(input_expr) 164 165 166def _last_day_sql(self: DuckDBGenerator, expression: exp.LastDay) -> str: 167 """ 168 DuckDB's LAST_DAY only supports finding the last day of a month. 169 For other date parts (year, quarter, week), we need to implement equivalent logic. 170 """ 171 date_expr = expression.this 172 unit = expression.text("unit") 173 174 if not unit or unit.upper() == "MONTH": 175 # Default behavior - use DuckDB's native LAST_DAY 176 return self.func("LAST_DAY", date_expr) 177 178 if unit.upper() == "YEAR": 179 # Last day of year: December 31st of the same year 180 year_expr = exp.func("EXTRACT", "YEAR", date_expr) 181 make_date_expr = exp.func( 182 "MAKE_DATE", year_expr, exp.Literal.number(12), exp.Literal.number(31) 183 ) 184 return self.sql(make_date_expr) 185 186 if unit.upper() == "QUARTER": 187 # Last day of quarter 188 year_expr = exp.func("EXTRACT", "YEAR", date_expr) 189 quarter_expr = exp.func("EXTRACT", "QUARTER", date_expr) 190 191 # Calculate last month of quarter: quarter * 3. Quarter can be 1 to 4 192 last_month_expr = exp.Mul(this=quarter_expr, expression=exp.Literal.number(3)) 193 first_day_last_month_expr = exp.func( 194 "MAKE_DATE", year_expr, last_month_expr, exp.Literal.number(1) 195 ) 196 197 # Last day of the last month of the quarter 198 last_day_expr = exp.func("LAST_DAY", first_day_last_month_expr) 199 return self.sql(last_day_expr) 200 201 if unit.upper() == "WEEK": 202 # DuckDB DAYOFWEEK: Sunday=0, Monday=1, ..., Saturday=6 203 dow = exp.func("EXTRACT", "DAYOFWEEK", date_expr) 204 # Days to the last day of week: (7 - dayofweek) % 7, assuming the last day of week is Sunday (Snowflake) 205 # Wrap in parentheses to ensure correct precedence 206 days_to_sunday_expr = exp.Mod( 207 this=exp.Paren(this=exp.Sub(this=exp.Literal.number(7), expression=dow)), 208 expression=exp.Literal.number(7), 209 ) 210 interval_expr = exp.Interval(this=days_to_sunday_expr, unit=exp.var("DAY")) 211 add_expr = exp.Add(this=date_expr, expression=interval_expr) 212 cast_expr = exp.cast(add_expr, exp.DType.DATE) 213 return self.sql(cast_expr) 214 215 self.unsupported(f"Unsupported date part '{unit}' in LAST_DAY function") 216 return self.function_fallback_sql(expression) 217 218 219def _is_nanosecond_unit(unit: exp.Expr | None) -> bool: 220 return isinstance(unit, (exp.Var, exp.Literal)) and unit.name.upper() == "NANOSECOND" 221 222 223def _handle_nanosecond_diff( 224 self: DuckDBGenerator, 225 end_time: exp.Expr, 226 start_time: exp.Expr, 227) -> str: 228 """Generate NANOSECOND diff using EPOCH_NS since DATE_DIFF doesn't support it.""" 229 end_ns = exp.cast(end_time, exp.DType.TIMESTAMP_NS) 230 start_ns = exp.cast(start_time, exp.DType.TIMESTAMP_NS) 231 232 # Build expression tree: EPOCH_NS(end) - EPOCH_NS(start) 233 return self.sql( 234 exp.Sub(this=exp.func("EPOCH_NS", end_ns), expression=exp.func("EPOCH_NS", start_ns)) 235 ) 236 237 238def _to_boolean_sql(self: DuckDBGenerator, expression: exp.ToBoolean) -> str: 239 """ 240 Transpile TO_BOOLEAN and TRY_TO_BOOLEAN functions from Snowflake to DuckDB equivalent. 241 242 DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off'. 243 We need to handle the 'on'/'off' cases explicitly. 244 245 For TO_BOOLEAN (safe=False): NaN and INF values cause errors. We use DuckDB's native ERROR() 246 function to replicate this behavior with a clear error message. 247 248 For TRY_TO_BOOLEAN (safe=True): Use DuckDB's TRY_CAST for conversion, which returns NULL 249 for invalid inputs instead of throwing errors. 250 """ 251 arg = expression.this 252 is_safe = expression.args.get("safe", False) 253 254 base_case_expr = ( 255 exp.case() 256 .when( 257 # Handle 'on' -> TRUE (case insensitive) 258 exp.Upper(this=exp.cast(arg, exp.DType.VARCHAR)).eq(exp.Literal.string("ON")), 259 exp.true(), 260 ) 261 .when( 262 # Handle 'off' -> FALSE (case insensitive) 263 exp.Upper(this=exp.cast(arg, exp.DType.VARCHAR)).eq(exp.Literal.string("OFF")), 264 exp.false(), 265 ) 266 ) 267 268 if is_safe: 269 # TRY_TO_BOOLEAN: handle 'on'/'off' and use TRY_CAST for everything else 270 case_expr = base_case_expr.else_(exp.func("TRY_CAST", arg, exp.DType.BOOLEAN.into_expr())) 271 else: 272 # TO_BOOLEAN: handle NaN/INF errors, 'on'/'off', and use regular CAST 273 cast_to_real = exp.func("TRY_CAST", arg, exp.DataType.build(exp.DType.FLOAT)) 274 275 # Check for NaN and INF values 276 nan_inf_check = exp.Or( 277 this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real) 278 ) 279 280 case_expr = base_case_expr.when( 281 nan_inf_check, 282 exp.func( 283 "ERROR", 284 exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"), 285 ), 286 ).else_(exp.cast(arg, exp.DType.BOOLEAN)) 287 288 return self.sql(case_expr) 289 290 291# BigQuery -> DuckDB conversion for the DATE function 292def _date_sql(self: DuckDBGenerator, expression: exp.Date) -> str: 293 this = expression.this 294 zone = self.sql(expression, "zone") 295 296 if zone: 297 # BigQuery considers "this" at UTC, converts it to the specified 298 # time zone and then keeps only the DATE part 299 # To micmic that, we: 300 # (1) Cast to TIMESTAMP to remove DuckDB's local tz 301 # (2) Apply consecutive AtTimeZone calls for UTC -> zone conversion 302 this = exp.cast(this, exp.DType.TIMESTAMP) 303 at_utc = exp.AtTimeZone(this=this, zone=exp.Literal.string("UTC")) 304 this = exp.AtTimeZone(this=at_utc, zone=zone) 305 306 return self.sql(exp.cast(expression=this, to=exp.DType.DATE)) 307 308 309# BigQuery -> DuckDB conversion for the TIME_DIFF function 310def _timediff_sql(self: DuckDBGenerator, expression: exp.TimeDiff) -> str: 311 unit = expression.unit 312 313 if _is_nanosecond_unit(unit): 314 return _handle_nanosecond_diff(self, expression.expression, expression.this) 315 316 this = exp.cast(expression.this, exp.DType.TIME) 317 expr = exp.cast(expression.expression, exp.DType.TIME) 318 319 # Although the 2 dialects share similar signatures, BQ seems to inverse 320 # the sign of the result so the start/end time operands are flipped 321 return self.func("DATE_DIFF", unit_to_str(expression), expr, this) 322 323 324def _date_delta_to_binary_interval_op( 325 cast: bool = True, 326) -> t.Callable[[DuckDBGenerator, DATETIME_DELTA], str]: 327 """ 328 DuckDB override to handle: 329 1. NANOSECOND operations (DuckDB doesn't support INTERVAL ... NANOSECOND) 330 2. Float/decimal interval values (DuckDB INTERVAL requires integers) 331 """ 332 base_impl = date_delta_to_binary_interval_op(cast=cast) 333 334 def _duckdb_date_delta_sql(self: DuckDBGenerator, expression: DATETIME_DELTA) -> str: 335 unit = expression.unit 336 interval_value = expression.expression 337 338 # Handle NANOSECOND unit (DuckDB doesn't support INTERVAL ... NANOSECOND) 339 if _is_nanosecond_unit(unit): 340 if isinstance(interval_value, exp.Interval): 341 interval_value = interval_value.this 342 343 timestamp_ns = exp.cast(expression.this, exp.DType.TIMESTAMP_NS) 344 345 return self.sql( 346 exp.func( 347 "MAKE_TIMESTAMP_NS", 348 exp.Add(this=exp.func("EPOCH_NS", timestamp_ns), expression=interval_value), 349 ) 350 ) 351 352 # Handle float/decimal interval values as duckDB INTERVAL requires integer expressions 353 if not interval_value or isinstance(interval_value, exp.Interval): 354 return base_impl(self, expression) 355 356 if interval_value.is_type(*exp.DataType.REAL_TYPES): 357 expression.set("expression", exp.cast(exp.func("ROUND", interval_value), "INT")) 358 359 return base_impl(self, expression) 360 361 return _duckdb_date_delta_sql 362 363 364def _array_insert_sql(self: DuckDBGenerator, expression: exp.ArrayInsert) -> str: 365 """ 366 Transpile ARRAY_INSERT to DuckDB using LIST_CONCAT and slicing. 367 368 Handles: 369 - 0-based and 1-based indexing (normalizes to 0-based for calculations) 370 - Negative position conversion (requires array length) 371 - NULL propagation (source dialects return NULL, DuckDB creates single-element array) 372 - Assumes position is within bounds per user constraint 373 374 Note: All dialects that support ARRAY_INSERT (Snowflake, Spark, Databricks) have 375 ARRAY_FUNCS_PROPAGATES_NULLS=True, so we always assume source propagates NULLs. 376 377 Args: 378 expression: The ArrayInsert expression to transpile. 379 380 Returns: 381 SQL string implementing ARRAY_INSERT behavior. 382 """ 383 this = expression.this 384 position = expression.args.get("position") 385 element = expression.expression 386 element_array = exp.Array(expressions=[element]) 387 index_offset = expression.args.get("offset", 0) 388 389 if not position or not position.is_int: 390 self.unsupported("ARRAY_INSERT can only be transpiled with a literal position") 391 return self.func("ARRAY_INSERT", this, position, element) 392 393 pos_value = position.to_py() 394 395 # Normalize one-based indexing to zero-based for slice calculations 396 # Spark (1-based) -> Snowflake (0-based): 397 # Positive: pos=1 -> pos=0 (subtract 1) 398 # Negative: pos=-2 -> pos=-1 (add 1) 399 # Example: Spark array_insert([a,b,c], -2, d) -> [a,b,d,c] is same as Snowflake pos=-1 400 if pos_value > 0: 401 pos_value = pos_value - index_offset 402 elif pos_value < 0: 403 pos_value = pos_value + index_offset 404 405 # Build the appropriate list_concat expression based on position 406 if pos_value == 0: 407 # insert at beginning 408 concat_exprs = [element_array, this] 409 elif pos_value > 0: 410 # Positive position: LIST_CONCAT(arr[1:pos], [elem], arr[pos+1:]) 411 # 0-based -> DuckDB 1-based slicing 412 413 # left slice: arr[1:pos] 414 slice_start = exp.Bracket( 415 this=this, 416 expressions=[ 417 exp.Slice(this=exp.Literal.number(1), expression=exp.Literal.number(pos_value)) 418 ], 419 ) 420 421 # right slice: arr[pos+1:] 422 slice_end = exp.Bracket( 423 this=this, expressions=[exp.Slice(this=exp.Literal.number(pos_value + 1))] 424 ) 425 426 concat_exprs = [slice_start, element_array, slice_end] 427 else: 428 # Negative position: arr[1:LEN(arr)+pos], [elem], arr[LEN(arr)+pos+1:] 429 # pos=-1 means insert before last element 430 arr_len = exp.Length(this=this) 431 432 # Calculate slice position: LEN(arr) + pos (e.g., LEN(arr) + (-1) = LEN(arr) - 1) 433 slice_end_pos = arr_len + exp.Literal.number(pos_value) 434 slice_start_pos = slice_end_pos + exp.Literal.number(1) 435 436 # left slice: arr[1:LEN(arr)+pos] 437 slice_start = exp.Bracket( 438 this=this, 439 expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end_pos)], 440 ) 441 442 # right slice: arr[LEN(arr)+pos+1:] 443 slice_end = exp.Bracket(this=this, expressions=[exp.Slice(this=slice_start_pos)]) 444 445 concat_exprs = [slice_start, element_array, slice_end] 446 447 # All dialects that support ARRAY_INSERT propagate NULLs (Snowflake/Spark/Databricks) 448 # Wrap in CASE WHEN array IS NULL THEN NULL ELSE func_expr END 449 return self.sql( 450 exp.If( 451 this=exp.Is(this=this, expression=exp.Null()), 452 true=exp.Null(), 453 false=self.func("LIST_CONCAT", *concat_exprs), 454 ) 455 ) 456 457 458def _array_remove_at_sql(self: DuckDBGenerator, expression: exp.ArrayRemoveAt) -> str: 459 """ 460 Transpile ARRAY_REMOVE_AT to DuckDB using LIST_CONCAT and slicing. 461 462 Handles: 463 - Positive positions (0-based indexing) 464 - Negative positions (from end of array) 465 - NULL propagation (Snowflake returns NULL for NULL array, DuckDB doesn't auto-propagate) 466 - Only supports literal integer positions (non-literals remain untranspiled) 467 468 Transpilation patterns: 469 - pos=0 (first): arr[2:] 470 - pos>0 (middle): LIST_CONCAT(arr[1:p], arr[p+2:]) 471 - pos=-1 (last): arr[1:LEN(arr)-1] 472 - pos<-1: LIST_CONCAT(arr[1:LEN(arr)+p], arr[LEN(arr)+p+2:]) 473 474 All wrapped in: CASE WHEN arr IS NULL THEN NULL ELSE ... END 475 476 Args: 477 expression: The ArrayRemoveAt expression to transpile. 478 479 Returns: 480 SQL string implementing ARRAY_REMOVE_AT behavior. 481 """ 482 this = expression.this 483 position = expression.args.get("position") 484 485 if not position or not position.is_int: 486 self.unsupported("ARRAY_REMOVE_AT can only be transpiled with a literal position") 487 return self.func("ARRAY_REMOVE_AT", this, position) 488 489 pos_value = position.to_py() 490 491 # Build the appropriate expression based on position 492 if pos_value == 0: 493 # Remove first element: arr[2:] 494 result_expr: exp.Expr | str = exp.Bracket( 495 this=this, 496 expressions=[exp.Slice(this=exp.Literal.number(2))], 497 ) 498 elif pos_value > 0: 499 # Remove at positive position: LIST_CONCAT(arr[1:pos], arr[pos+2:]) 500 # DuckDB uses 1-based slicing 501 left_slice = exp.Bracket( 502 this=this, 503 expressions=[ 504 exp.Slice(this=exp.Literal.number(1), expression=exp.Literal.number(pos_value)) 505 ], 506 ) 507 right_slice = exp.Bracket( 508 this=this, 509 expressions=[exp.Slice(this=exp.Literal.number(pos_value + 2))], 510 ) 511 result_expr = self.func("LIST_CONCAT", left_slice, right_slice) 512 elif pos_value == -1: 513 # Remove last element: arr[1:LEN(arr)-1] 514 # Optimization: simpler than general negative case 515 arr_len = exp.Length(this=this) 516 slice_end = arr_len + exp.Literal.number(-1) 517 result_expr = exp.Bracket( 518 this=this, 519 expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end)], 520 ) 521 else: 522 # Remove at negative position: LIST_CONCAT(arr[1:LEN(arr)+pos], arr[LEN(arr)+pos+2:]) 523 arr_len = exp.Length(this=this) 524 slice_end_pos = arr_len + exp.Literal.number(pos_value) 525 slice_start_pos = slice_end_pos + exp.Literal.number(2) 526 527 left_slice = exp.Bracket( 528 this=this, 529 expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end_pos)], 530 ) 531 right_slice = exp.Bracket( 532 this=this, 533 expressions=[exp.Slice(this=slice_start_pos)], 534 ) 535 result_expr = self.func("LIST_CONCAT", left_slice, right_slice) 536 537 # Snowflake ARRAY_FUNCS_PROPAGATES_NULLS=True, so wrap in NULL check 538 # CASE WHEN array IS NULL THEN NULL ELSE result_expr END 539 return self.sql( 540 exp.If( 541 this=exp.Is(this=this, expression=exp.Null()), 542 true=exp.Null(), 543 false=result_expr, 544 ) 545 ) 546 547 548@unsupported_args(("expression", "DuckDB's ARRAY_SORT does not support a comparator.")) 549def _array_sort_sql(self: DuckDBGenerator, expression: exp.ArraySort) -> str: 550 return self.func("ARRAY_SORT", expression.this) 551 552 553def _array_contains_sql(self: DuckDBGenerator, expression: exp.ArrayContains) -> str: 554 this = expression.this 555 expr = expression.expression 556 557 func = self.func("ARRAY_CONTAINS", this, expr) 558 559 if expression.args.get("check_null"): 560 check_null_in_array = exp.Nullif( 561 this=exp.NEQ(this=exp.ArraySize(this=this), expression=exp.func("LIST_COUNT", this)), 562 expression=exp.false(), 563 ) 564 return self.sql(exp.If(this=expr.is_(exp.Null()), true=check_null_in_array, false=func)) 565 566 return func 567 568 569def _array_overlaps_sql(self: DuckDBGenerator, expression: exp.ArrayOverlaps) -> str: 570 """ 571 Translates Snowflake's NULL-safe ARRAYS_OVERLAP to DuckDB. 572 573 DuckDB's native && operator is not NULL-safe: [1,NULL,3] && [NULL,4,5] returns FALSE. 574 Snowflake returns TRUE when both arrays contain NULL (NULLs are treated as known values). 575 576 Generated SQL: (arr1 && arr2) OR (ARRAY_LENGTH(arr1) <> LIST_COUNT(arr1) AND ARRAY_LENGTH(arr2) <> LIST_COUNT(arr2)) 577 578 ARRAY_LENGTH counts all elements (including NULLs); LIST_COUNT counts only non-NULLs. 579 When they differ, the array contains at least one NULL, matching Snowflake's NULL-safe semantics. 580 """ 581 if not expression.args.get("null_safe"): 582 return self.binary(expression, "&&") 583 584 arr1 = expression.this 585 arr2 = expression.expression 586 587 check_nulls = exp.and_( 588 exp.NEQ( 589 this=exp.ArraySize(this=arr1.copy()), 590 expression=exp.func("LIST_COUNT", arr1.copy()), 591 ), 592 exp.NEQ( 593 this=exp.ArraySize(this=arr2.copy()), 594 expression=exp.func("LIST_COUNT", arr2.copy()), 595 ), 596 copy=False, 597 ) 598 599 overlap = exp.ArrayOverlaps(this=arr1.copy(), expression=arr2.copy()) 600 601 return self.sql( 602 exp.or_( 603 exp.paren(overlap, copy=False), 604 exp.paren(check_nulls, copy=False), 605 copy=False, 606 wrap=False, 607 ) 608 ) 609 610 611def _struct_sql(self: DuckDBGenerator, expression: exp.Struct) -> str: 612 ancestor_cast = expression.find_ancestor(exp.Cast, exp.Select) 613 ancestor_cast = None if isinstance(ancestor_cast, exp.Select) else ancestor_cast 614 615 # Empty struct cast works with MAP() since DuckDB can't parse {} 616 if not expression.expressions: 617 if isinstance(ancestor_cast, exp.Cast) and ancestor_cast.to.is_type(exp.DType.MAP): 618 return "MAP()" 619 620 args: list[str] = [] 621 622 # BigQuery allows inline construction such as "STRUCT<a STRING, b INTEGER>('str', 1)" which is 623 # canonicalized to "ROW('str', 1) AS STRUCT(a TEXT, b INT)" in DuckDB 624 # The transformation to ROW will take place if: 625 # 1. The STRUCT itself does not have proper fields (key := value) as a "proper" STRUCT would 626 # 2. A cast to STRUCT / ARRAY of STRUCTs is found 627 is_bq_inline_struct = ( 628 (expression.find(exp.PropertyEQ) is None) 629 and ancestor_cast 630 and any( 631 casted_type.is_type(exp.DType.STRUCT) 632 for casted_type in ancestor_cast.find_all(exp.DataType) 633 ) 634 ) 635 636 for i, expr in enumerate(expression.expressions): 637 is_property_eq = isinstance(expr, exp.PropertyEQ) 638 this = expr.this 639 value = expr.expression if is_property_eq else expr 640 641 if is_bq_inline_struct: 642 args.append(self.sql(value)) 643 else: 644 if isinstance(this, exp.Identifier): 645 key = self.sql(exp.Literal.string(expr.name)) 646 elif is_property_eq: 647 key = self.sql(this) 648 else: 649 key = self.sql(exp.Literal.string(f"_{i}")) 650 651 args.append(f"{key}: {self.sql(value)}") 652 653 csv_args = ", ".join(args) 654 655 return f"ROW({csv_args})" if is_bq_inline_struct else f"{{{csv_args}}}" 656 657 658def _datatype_sql(self: DuckDBGenerator, expression: exp.DataType) -> str: 659 if expression.is_type("array"): 660 return f"{self.expressions(expression, flat=True)}[{self.expressions(expression, key='values', flat=True)}]" 661 662 # Modifiers are not supported for TIME, [TIME | TIMESTAMP] WITH TIME ZONE 663 if expression.is_type(exp.DType.TIME, exp.DType.TIMETZ, exp.DType.TIMESTAMPTZ): 664 return expression.this.value 665 666 return self.datatype_sql(expression) 667 668 669def _json_format_sql(self: DuckDBGenerator, expression: exp.JSONFormat) -> str: 670 sql = self.func("TO_JSON", expression.this, expression.args.get("options")) 671 return f"CAST({sql} AS TEXT)" 672 673 674def _build_seq_expression(base: exp.Expr, byte_width: int, signed: bool) -> exp.Expr: 675 """Build a SEQ expression with the given base, byte width, and signedness.""" 676 bits = byte_width * 8 677 max_val = exp.Literal.number(2**bits) 678 679 if signed: 680 half = exp.Literal.number(2 ** (bits - 1)) 681 return exp.replace_placeholders(_SEQ_SIGNED.copy(), base=base, max_val=max_val, half=half) 682 return exp.replace_placeholders(_SEQ_UNSIGNED.copy(), base=base, max_val=max_val) 683 684 685def _seq_to_range_in_generator(expression: exp.Expr) -> exp.Expr: 686 """ 687 Transform SEQ functions to `range` column references when inside a GENERATOR context. 688 689 When GENERATOR(ROWCOUNT => N) becomes RANGE(N) in DuckDB, it produces a column 690 named `range` with values 0, 1, ..., N-1. SEQ functions produce the same sequence, 691 so we replace them with `range % max_val` to avoid nested window function issues. 692 """ 693 if not isinstance(expression, exp.Select): 694 return expression 695 696 from_ = expression.args.get("from_") 697 if not ( 698 from_ 699 and isinstance(from_.this, exp.TableFromRows) 700 and isinstance(from_.this.this, exp.Generator) 701 ): 702 return expression 703 704 def replace_seq(node: exp.Expr) -> exp.Expr: 705 if isinstance(node, (exp.Seq1, exp.Seq2, exp.Seq4, exp.Seq8)): 706 byte_width = _SEQ_BYTE_WIDTH[type(node)] 707 return _build_seq_expression(exp.column("range"), byte_width, signed=node.name == "1") 708 return node 709 710 return expression.transform(replace_seq, copy=False) 711 712 713def _seq_sql(self: DuckDBGenerator, expression: exp.Func, byte_width: int) -> str: 714 """ 715 Transpile Snowflake SEQ1/SEQ2/SEQ4/SEQ8 to DuckDB. 716 717 Generates monotonically increasing integers starting from 0. 718 The signed parameter (0 or 1) affects wrap-around behavior: 719 - Unsigned (0): wraps at 2^(bits) - 1 720 - Signed (1): wraps at 2^(bits-1) - 1, then goes negative 721 """ 722 # Warn if SEQ is in a restricted context (Select stops search at current scope) 723 ancestor = expression.find_ancestor(*_SEQ_RESTRICTED) 724 if ancestor and ( 725 (not isinstance(ancestor, (exp.Order, exp.Select))) 726 or (isinstance(ancestor, exp.Order) and isinstance(ancestor.parent, exp.Window)) 727 ): 728 self.unsupported("SEQ in restricted context is not supported - use CTE or subquery") 729 730 result = _build_seq_expression(_SEQ_BASE.copy(), byte_width, signed=expression.name == "1") 731 return self.sql(result) 732 733 734def _unix_to_time_sql(self: DuckDBGenerator, expression: exp.UnixToTime) -> str: 735 scale = expression.args.get("scale") 736 timestamp = expression.this 737 target_type = expression.args.get("target_type") 738 739 # Check if we need NTZ (naive timestamp in UTC) 740 is_ntz = target_type and target_type.this in ( 741 exp.DType.TIMESTAMP, 742 exp.DType.TIMESTAMPNTZ, 743 ) 744 745 if scale == exp.UnixToTime.MILLIS: 746 # EPOCH_MS already returns TIMESTAMP (naive, UTC) 747 return self.func("EPOCH_MS", timestamp) 748 if scale == exp.UnixToTime.MICROS: 749 # MAKE_TIMESTAMP already returns TIMESTAMP (naive, UTC) 750 return self.func("MAKE_TIMESTAMP", timestamp) 751 752 # Other scales: divide and use TO_TIMESTAMP 753 if scale not in (None, exp.UnixToTime.SECONDS): 754 timestamp = exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)) 755 756 to_timestamp: exp.Expr = exp.Anonymous(this="TO_TIMESTAMP", expressions=[timestamp]) 757 758 if is_ntz: 759 to_timestamp = exp.AtTimeZone(this=to_timestamp, zone=exp.Literal.string("UTC")) 760 761 return self.sql(to_timestamp) 762 763 764WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In, exp.Not) 765 766 767def _arrow_json_extract_sql(self: DuckDBGenerator, expression: JSON_EXTRACT_TYPE) -> str: 768 arrow_sql = arrow_json_extract_sql(self, expression) 769 if not expression.same_parent and isinstance( 770 expression.parent, WRAPPED_JSON_EXTRACT_EXPRESSIONS 771 ): 772 arrow_sql = self.wrap(arrow_sql) 773 return arrow_sql 774 775 776def _implicit_datetime_cast( 777 arg: exp.Expr | None, type: exp.DType = exp.DType.DATE 778) -> exp.Expr | None: 779 if isinstance(arg, exp.Literal) and arg.is_string: 780 ts = arg.name 781 if type == exp.DType.DATE and ":" in ts: 782 type = exp.DType.TIMESTAMPTZ if TIMEZONE_PATTERN.search(ts) else exp.DType.TIMESTAMP 783 784 arg = exp.cast(arg, type) 785 786 return arg 787 788 789def _week_unit_to_dow(unit: exp.Expr | None) -> int | None: 790 """ 791 Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming 792 from other dialects, e.g BigQuery's WEEK(<day>) or ISOWEEK unit parts. 793 794 Args: 795 unit: The unit expression (Var for ISOWEEK or WeekStart) 796 797 Returns: 798 The ISO 8601 day number (Monday=1, Sunday=7 etc) or None if not a week unit or if day is dynamic (not a constant). 799 800 Examples: 801 "WEEK(SUNDAY)" -> 7 802 "WEEK(MONDAY)" -> 1 803 "ISOWEEK" -> 1 804 """ 805 # Handle plain Var expressions for ISOWEEK only 806 if isinstance(unit, exp.Var) and unit.name.upper() in "ISOWEEK": 807 return 1 808 809 # Handle WeekStart expressions with explicit day 810 if isinstance(unit, exp.WeekStart): 811 return WEEK_START_DAY_TO_DOW.get(unit.name.upper()) 812 813 return None 814 815 816def _build_week_trunc_expression( 817 date_expr: exp.Expr, 818 start_dow: int, 819 preserve_start_day: bool = False, 820) -> exp.Expr: 821 """ 822 Build DATE_TRUNC expression for week boundaries with custom start day. 823 824 DuckDB's DATE_TRUNC('WEEK', ...) always returns Monday. To align to a different 825 start day, we shift the date before truncating. 826 827 Args: 828 date_expr: The date expression to truncate. 829 start_dow: ISO 8601 day-of-week number (Monday=1, ..., Sunday=7). 830 preserve_start_day: If True, reverse the shift after truncating so the result lands on the 831 correct week start day. Needed for DATE_TRUNC (absolute result matters) but 832 not for DATE_DIFF (only relative alignment matters). 833 834 Shift formula: Sunday (7) gets +1, others get (1 - start_dow). 835 """ 836 shift_days = 1 if start_dow == 7 else 1 - start_dow 837 truncated = exp.func("DATE_TRUNC", unit=exp.var("WEEK"), this=date_expr) 838 839 if shift_days == 0: 840 return truncated 841 842 shift = exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY")) 843 shifted_date = exp.DateAdd(this=date_expr, expression=shift) 844 truncated.set("this", shifted_date) 845 846 if preserve_start_day: 847 interval = exp.Interval(this=exp.Literal.string(str(-shift_days)), unit=exp.var("DAY")) 848 return exp.cast( 849 exp.DateAdd(this=truncated, expression=interval), to=exp.DType.DATE, copy=False 850 ) 851 852 return truncated 853 854 855def _date_diff_sql(self: DuckDBGenerator, expression: exp.DateDiff | exp.DatetimeDiff) -> str: 856 unit = expression.unit 857 858 if _is_nanosecond_unit(unit): 859 return _handle_nanosecond_diff(self, expression.this, expression.expression) 860 861 this = _implicit_datetime_cast(expression.this) 862 expr = _implicit_datetime_cast(expression.expression) 863 864 # DuckDB's WEEK diff does not respect Monday crossing (week boundaries), it checks (end_day - start_day) / 7: 865 # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed) 866 # SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference) 867 # Whereas for other units such as MONTH it does respect month boundaries: 868 # SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (Month crossed) 869 date_part_boundary = expression.args.get("date_part_boundary") 870 871 # Extract week start day; returns None if day is dynamic (column/placeholder) 872 week_start = _week_unit_to_dow(unit) 873 if date_part_boundary and week_start and this and expr: 874 expression.set("unit", exp.Literal.string("WEEK")) 875 876 # Truncate both dates to week boundaries to respect input dialect semantics 877 this = _build_week_trunc_expression(this, week_start) 878 expr = _build_week_trunc_expression(expr, week_start) 879 880 return self.func("DATE_DIFF", unit_to_str(expression), expr, this) 881 882 883def _generate_datetime_array_sql( 884 self: DuckDBGenerator, expression: exp.GenerateDateArray | exp.GenerateTimestampArray 885) -> str: 886 is_generate_date_array = isinstance(expression, exp.GenerateDateArray) 887 888 type = exp.DType.DATE if is_generate_date_array else exp.DType.TIMESTAMP 889 start = _implicit_datetime_cast(expression.args.get("start"), type=type) 890 end = _implicit_datetime_cast(expression.args.get("end"), type=type) 891 892 # BQ's GENERATE_DATE_ARRAY & GENERATE_TIMESTAMP_ARRAY are transformed to DuckDB'S GENERATE_SERIES 893 gen_series: exp.GenerateSeries | exp.Cast = exp.GenerateSeries( 894 start=start, end=end, step=expression.args.get("step") 895 ) 896 897 if is_generate_date_array: 898 # The GENERATE_SERIES result type is TIMESTAMP array, so to match BQ's semantics for 899 # GENERATE_DATE_ARRAY we must cast it back to DATE array 900 gen_series = exp.cast(gen_series, exp.DataType.build("ARRAY<DATE>")) 901 902 return self.sql(gen_series) 903 904 905def _json_extract_value_array_sql( 906 self: DuckDBGenerator, expression: exp.JSONValueArray | exp.JSONExtractArray 907) -> str: 908 json_extract = exp.JSONExtract(this=expression.this, expression=expression.expression) 909 data_type = "ARRAY<STRING>" if isinstance(expression, exp.JSONValueArray) else "ARRAY<JSON>" 910 return self.sql(exp.cast(json_extract, to=exp.DataType.build(data_type))) 911 912 913def _cast_to_varchar(arg: exp.Expr | None) -> exp.Expr | None: 914 if arg and arg.type and not arg.is_type(*exp.DataType.TEXT_TYPES, exp.DType.UNKNOWN): 915 return exp.cast(arg, exp.DType.VARCHAR) 916 return arg 917 918 919def _cast_to_boolean(arg: exp.Expr | None) -> exp.Expr | None: 920 if arg and not arg.is_type(exp.DType.BOOLEAN): 921 return exp.cast(arg, exp.DType.BOOLEAN) 922 return arg 923 924 925def _is_binary(arg: exp.Expr) -> bool: 926 return arg.is_type( 927 exp.DType.BINARY, 928 exp.DType.VARBINARY, 929 exp.DType.BLOB, 930 ) 931 932 933def _gen_with_cast_to_blob(self: DuckDBGenerator, expression: exp.Expr, result_sql: str) -> str: 934 if _is_binary(expression): 935 blob = exp.DataType.build("BLOB", dialect="duckdb") 936 result_sql = self.sql(exp.Cast(this=result_sql, to=blob)) 937 return result_sql 938 939 940def _cast_to_bit(arg: exp.Expr) -> exp.Expr: 941 if not _is_binary(arg): 942 return arg 943 944 if isinstance(arg, exp.HexString): 945 arg = exp.Unhex(this=exp.Literal.string(arg.this)) 946 947 return exp.cast(arg, exp.DType.BIT) 948 949 950def _prepare_binary_bitwise_args(expression: exp.Binary) -> None: 951 if _is_binary(expression.this): 952 expression.set("this", _cast_to_bit(expression.this)) 953 if _is_binary(expression.expression): 954 expression.set("expression", _cast_to_bit(expression.expression)) 955 956 957def _day_navigation_sql(self: DuckDBGenerator, expression: exp.NextDay | exp.PreviousDay) -> str: 958 """ 959 Transpile Snowflake's NEXT_DAY / PREVIOUS_DAY to DuckDB using date arithmetic. 960 961 Returns the DATE of the next/previous occurrence of the specified weekday. 962 963 Formulas: 964 - NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1 965 - PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1 966 967 Supports both literal and non-literal day names: 968 - Literal: Direct lookup (e.g., 'Monday' -> 1) 969 - Non-literal: CASE statement for runtime evaluation 970 971 Examples: 972 NEXT_DAY('2024-01-01' (Monday), 'Monday') 973 -> (1 - 1 + 6) % 7 + 1 = 6 % 7 + 1 = 7 days -> 2024-01-08 974 975 PREVIOUS_DAY('2024-01-15' (Monday), 'Friday') 976 -> (1 - 5 + 6) % 7 + 1 = 2 % 7 + 1 = 3 days -> 2024-01-12 977 """ 978 date_expr = expression.this 979 day_name_expr = expression.expression 980 981 # Build ISODOW call for current day of week 982 isodow_call = exp.func("ISODOW", date_expr) 983 984 # Determine target day of week 985 if isinstance(day_name_expr, exp.Literal): 986 # Literal day name: lookup target_dow directly 987 day_name_str = day_name_expr.name.upper() 988 matching_day = next( 989 (day for day in WEEK_START_DAY_TO_DOW if day.startswith(day_name_str)), None 990 ) 991 if matching_day: 992 target_dow: exp.Expr = exp.Literal.number(WEEK_START_DAY_TO_DOW[matching_day]) 993 else: 994 # Unrecognized day name, use fallback 995 return self.function_fallback_sql(expression) 996 else: 997 # Non-literal day name: build CASE statement for runtime mapping 998 upper_day_name = exp.Upper(this=day_name_expr) 999 target_dow = exp.Case( 1000 ifs=[ 1001 exp.If( 1002 this=exp.func( 1003 "STARTS_WITH", upper_day_name.copy(), exp.Literal.string(day[:2]) 1004 ), 1005 true=exp.Literal.number(dow_num), 1006 ) 1007 for day, dow_num in WEEK_START_DAY_TO_DOW.items() 1008 ] 1009 ) 1010 1011 # Calculate days offset and apply interval based on direction 1012 if isinstance(expression, exp.NextDay): 1013 # NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1 1014 days_offset = exp.paren(target_dow - isodow_call + 6, copy=False) % 7 + 1 1015 date_with_offset = date_expr + exp.Interval(this=days_offset, unit=exp.var("DAY")) 1016 else: # exp.PreviousDay 1017 # PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1 1018 days_offset = exp.paren(isodow_call - target_dow + 6, copy=False) % 7 + 1 1019 date_with_offset = date_expr - exp.Interval(this=days_offset, unit=exp.var("DAY")) 1020 1021 # Build final: CAST(date_with_offset AS DATE) 1022 return self.sql(exp.cast(date_with_offset, exp.DType.DATE)) 1023 1024 1025def _anyvalue_sql(self: DuckDBGenerator, expression: exp.AnyValue) -> str: 1026 # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL 1027 having = expression.this 1028 if isinstance(having, exp.HavingMax): 1029 func_name = "ARG_MAX_NULL" if having.args.get("max") else "ARG_MIN_NULL" 1030 return self.func(func_name, having.this, having.expression) 1031 return self.function_fallback_sql(expression) 1032 1033 1034def _bitwise_agg_sql( 1035 self: DuckDBGenerator, 1036 expression: exp.BitwiseOrAgg | exp.BitwiseAndAgg | exp.BitwiseXorAgg, 1037) -> str: 1038 """ 1039 DuckDB's bitwise aggregate functions only accept integer types. For other types: 1040 - DECIMAL/STRING: Use CAST(arg AS INT) to convert directly, will round to nearest int 1041 - FLOAT/DOUBLE: Use ROUND(arg)::INT to round to nearest integer, required due to float precision loss 1042 """ 1043 if isinstance(expression, exp.BitwiseOrAgg): 1044 func_name = "BIT_OR" 1045 elif isinstance(expression, exp.BitwiseAndAgg): 1046 func_name = "BIT_AND" 1047 else: # exp.BitwiseXorAgg 1048 func_name = "BIT_XOR" 1049 1050 arg = expression.this 1051 1052 if not arg.type: 1053 from sqlglot.optimizer.annotate_types import annotate_types 1054 1055 arg = annotate_types(arg, dialect=self.dialect) 1056 1057 if arg.is_type(*exp.DataType.REAL_TYPES, *exp.DataType.TEXT_TYPES): 1058 if arg.is_type(*exp.DataType.FLOAT_TYPES): 1059 # float types need to be rounded first due to precision loss 1060 arg = exp.func("ROUND", arg) 1061 1062 arg = exp.cast(arg, exp.DType.INT) 1063 1064 return self.func(func_name, arg) 1065 1066 1067def _literal_sql_with_ws_chr(self: DuckDBGenerator, literal: str) -> str: 1068 # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly 1069 if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal): 1070 return self.sql(exp.Literal.string(literal)) 1071 1072 sql_segments: list[str] = [] 1073 for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK): 1074 if is_ws_control: 1075 for ch in group: 1076 duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch] 1077 sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code)))) 1078 else: 1079 sql_segments.append(self.sql(exp.Literal.string("".join(group)))) 1080 1081 sql = " || ".join(sql_segments) 1082 return sql if len(sql_segments) == 1 else f"({sql})" 1083 1084 1085def _escape_regex_metachars( 1086 self: DuckDBGenerator, delimiters: exp.Expr | None, delimiters_sql: str 1087) -> str: 1088 r""" 1089 Escapes regex metacharacters \ - ^ [ ] for use in character classes regex expressions. 1090 1091 Literal strings are escaped at transpile time, expressions handled with REPLACE() calls. 1092 """ 1093 if not delimiters: 1094 return delimiters_sql 1095 1096 if delimiters.is_string: 1097 literal_value = delimiters.this 1098 escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value) 1099 return _literal_sql_with_ws_chr(self, escaped_literal) 1100 1101 escaped_sql = delimiters_sql 1102 for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items(): 1103 escaped_sql = self.func( 1104 "REPLACE", 1105 escaped_sql, 1106 self.sql(exp.Literal.string(raw)), 1107 self.sql(exp.Literal.string(escaped)), 1108 ) 1109 1110 return escaped_sql 1111 1112 1113def _build_capitalization_sql( 1114 self: DuckDBGenerator, 1115 value_to_split: str, 1116 delimiters_sql: str, 1117) -> str: 1118 # empty string delimiter --> treat value as one word, no need to split 1119 if delimiters_sql == "''": 1120 return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))" 1121 1122 delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')" 1123 split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')" 1124 1125 # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments. 1126 # We do not know whether the first segment is a delimiter or not, so we check the first character of the string 1127 # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd. 1128 return self.func( 1129 "ARRAY_TO_STRING", 1130 exp.case() 1131 .when( 1132 f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})", 1133 self.func( 1134 "LIST_TRANSFORM", 1135 self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql), 1136 "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END", 1137 ), 1138 ) 1139 .else_( 1140 self.func( 1141 "LIST_TRANSFORM", 1142 self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql), 1143 "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END", 1144 ), 1145 ), 1146 "''", 1147 ) 1148 1149 1150def _initcap_sql(self: DuckDBGenerator, expression: exp.Initcap) -> str: 1151 this_sql = self.sql(expression, "this") 1152 delimiters = expression.args.get("expression") 1153 if delimiters is None: 1154 # fallback for manually created exp.Initcap w/o delimiters arg 1155 delimiters = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS) 1156 delimiters_sql = self.sql(delimiters) 1157 1158 escaped_delimiters_sql = _escape_regex_metachars(self, delimiters, delimiters_sql) 1159 1160 return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql) 1161 1162 1163def _boolxor_agg_sql(self: DuckDBGenerator, expression: exp.BoolxorAgg) -> str: 1164 """ 1165 Snowflake's `BOOLXOR_AGG(col)` returns TRUE if exactly one input in `col` is TRUE, FALSE otherwise; 1166 Since DuckDB does not have a mapping function, we mimic the behavior by generating `COUNT_IF(col) = 1`. 1167 1168 DuckDB's COUNT_IF strictly requires boolean inputs, so cast if not already boolean. 1169 """ 1170 return self.sql( 1171 exp.EQ( 1172 this=exp.CountIf(this=_cast_to_boolean(expression.this)), 1173 expression=exp.Literal.number(1), 1174 ) 1175 ) 1176 1177 1178def _bitshift_sql( 1179 self: DuckDBGenerator, expression: exp.BitwiseLeftShift | exp.BitwiseRightShift 1180) -> str: 1181 """ 1182 Transform bitshift expressions for DuckDB by injecting BIT/INT128 casts. 1183 1184 DuckDB's bitwise shift operators don't work with BLOB/BINARY types, so we cast 1185 them to BIT for the operation, then cast the result back to the original type. 1186 1187 Note: Assumes type annotation has been applied with the source dialect. 1188 """ 1189 operator = "<<" if isinstance(expression, exp.BitwiseLeftShift) else ">>" 1190 result_is_blob = False 1191 this = expression.this 1192 1193 if _is_binary(this): 1194 result_is_blob = True 1195 expression.set("this", exp.cast(this, exp.DType.BIT)) 1196 elif expression.args.get("requires_int128"): 1197 this.replace(exp.cast(this, exp.DType.INT128)) 1198 1199 result_sql = self.binary(expression, operator) 1200 1201 # Wrap in parentheses if parent is a bitwise operator to "fix" DuckDB precedence issue 1202 # DuckDB parses: a << b | c << d as (a << b | c) << d 1203 if isinstance(expression.parent, exp.Binary): 1204 result_sql = self.sql(exp.Paren(this=result_sql)) 1205 1206 if result_is_blob: 1207 result_sql = self.sql( 1208 exp.Cast(this=result_sql, to=exp.DataType.build("BLOB", dialect="duckdb")) 1209 ) 1210 1211 return result_sql 1212 1213 1214def _scale_rounding_sql( 1215 self: DuckDBGenerator, 1216 expression: exp.Expr, 1217 rounding_func: Type[exp.Expr], 1218) -> str | None: 1219 """ 1220 Handle scale parameter transformation for rounding functions. 1221 1222 DuckDB doesn't support the scale parameter for certain functions (e.g., FLOOR, CEIL), 1223 so we transform: FUNC(x, n) to ROUND(FUNC(x * 10^n) / 10^n, n) 1224 1225 Args: 1226 self: The DuckDB generator instance 1227 expression: The expression to transform (must have 'this', 'decimals', and 'to' args) 1228 rounding_func: The rounding function class to use in the transformation 1229 1230 Returns: 1231 The transformed SQL string if decimals parameter exists, None otherwise 1232 """ 1233 decimals = expression.args.get("decimals") 1234 1235 if decimals is None or expression.args.get("to") is not None: 1236 return None 1237 1238 this = expression.this 1239 if isinstance(this, exp.Binary): 1240 this = exp.Paren(this=this) 1241 1242 n_int = decimals 1243 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 1244 n_int = exp.cast(decimals, exp.DType.INT) 1245 1246 pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int) 1247 rounded = rounding_func(this=exp.Mul(this=this, expression=pow_)) 1248 result = exp.Div(this=rounded, expression=pow_.copy()) 1249 1250 return self.round_sql( 1251 exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True) 1252 ) 1253 1254 1255def _ceil_floor(self: DuckDBGenerator, expression: exp.Floor | exp.Ceil) -> str: 1256 scaled_sql = _scale_rounding_sql(self, expression, type(expression)) 1257 if scaled_sql is not None: 1258 return scaled_sql 1259 return self.ceil_floor(expression) 1260 1261 1262def _regr_val_sql( 1263 self: DuckDBGenerator, 1264 expression: exp.RegrValx | exp.RegrValy, 1265) -> str: 1266 """ 1267 Transpile Snowflake's REGR_VALX/REGR_VALY to DuckDB equivalent. 1268 1269 REGR_VALX(y, x) returns NULL if y is NULL; otherwise returns x. 1270 REGR_VALY(y, x) returns NULL if x is NULL; otherwise returns y. 1271 """ 1272 from sqlglot.optimizer.annotate_types import annotate_types 1273 1274 y = expression.this 1275 x = expression.expression 1276 1277 # Determine which argument to check for NULL and which to return based on expression type 1278 if isinstance(expression, exp.RegrValx): 1279 # REGR_VALX: check y for NULL, return x 1280 check_for_null = y 1281 return_value = x 1282 return_value_attr = "expression" 1283 else: 1284 # REGR_VALY: check x for NULL, return y 1285 check_for_null = x 1286 return_value = y 1287 return_value_attr = "this" 1288 1289 # Get the type from the return argument 1290 result_type = return_value.type 1291 1292 # If no type info, annotate the expression to infer types 1293 if not result_type or result_type.this == exp.DType.UNKNOWN: 1294 try: 1295 annotated = annotate_types(expression.copy(), dialect=self.dialect) 1296 result_type = getattr(annotated, return_value_attr).type 1297 except Exception: 1298 pass 1299 1300 # Default to DOUBLE for regression functions if type still unknown 1301 if not result_type or result_type.this == exp.DType.UNKNOWN: 1302 result_type = exp.DType.DOUBLE.into_expr() 1303 1304 # Cast NULL to the same type as return_value to avoid DuckDB type inference issues 1305 typed_null = exp.Cast(this=exp.Null(), to=result_type) 1306 1307 return self.sql( 1308 exp.If( 1309 this=exp.Is(this=check_for_null.copy(), expression=exp.Null()), 1310 true=typed_null, 1311 false=return_value.copy(), 1312 ) 1313 ) 1314 1315 1316def _maybe_corr_null_to_false( 1317 expression: exp.Filter | exp.Window | exp.Corr, 1318) -> exp.Filter | exp.Window | exp.Corr | None: 1319 corr = expression 1320 while isinstance(corr, (exp.Window, exp.Filter)): 1321 corr = corr.this 1322 1323 if not isinstance(corr, exp.Corr) or not corr.args.get("null_on_zero_variance"): 1324 return None 1325 1326 corr.set("null_on_zero_variance", False) 1327 return expression 1328 1329 1330def _date_from_parts_sql(self, expression: exp.DateFromParts) -> str: 1331 """ 1332 Snowflake's DATE_FROM_PARTS allows out-of-range values for the month and day input. 1333 E.g., larger values (month=13, day=100), zero-values (month=0, day=0), negative values (month=-13, day=-100). 1334 1335 DuckDB's MAKE_DATE does not support out-of-range values, but DuckDB's INTERVAL type does. 1336 1337 We convert to date arithmetic: 1338 DATE_FROM_PARTS(year, month, day) 1339 - MAKE_DATE(year, 1, 1) + INTERVAL (month-1) MONTH + INTERVAL (day-1) DAY 1340 """ 1341 year_expr = expression.args.get("year") 1342 month_expr = expression.args.get("month") 1343 day_expr = expression.args.get("day") 1344 1345 if expression.args.get("allow_overflow"): 1346 base_date: exp.Expr = exp.func( 1347 "MAKE_DATE", year_expr, exp.Literal.number(1), exp.Literal.number(1) 1348 ) 1349 1350 if month_expr: 1351 base_date = base_date + exp.Interval(this=month_expr - 1, unit=exp.var("MONTH")) 1352 1353 if day_expr: 1354 base_date = base_date + exp.Interval(this=day_expr - 1, unit=exp.var("DAY")) 1355 1356 return self.sql(exp.cast(expression=base_date, to=exp.DType.DATE)) 1357 1358 return self.func("MAKE_DATE", year_expr, month_expr, day_expr) 1359 1360 1361def _round_arg(arg: exp.Expr, round_input: bool | None = None) -> exp.Expr: 1362 if round_input: 1363 return exp.func("ROUND", arg, exp.Literal.number(0)) 1364 return arg 1365 1366 1367def _boolnot_sql(self: DuckDBGenerator, expression: exp.Boolnot) -> str: 1368 arg = _round_arg(expression.this, expression.args.get("round_input")) 1369 return self.sql(exp.not_(exp.paren(arg))) 1370 1371 1372def _booland_sql(self: DuckDBGenerator, expression: exp.Booland) -> str: 1373 round_input = expression.args.get("round_input") 1374 left = _round_arg(expression.this, round_input) 1375 right = _round_arg(expression.expression, round_input) 1376 return self.sql(exp.paren(exp.and_(exp.paren(left), exp.paren(right), wrap=False))) 1377 1378 1379def _boolor_sql(self: DuckDBGenerator, expression: exp.Boolor) -> str: 1380 round_input = expression.args.get("round_input") 1381 left = _round_arg(expression.this, round_input) 1382 right = _round_arg(expression.expression, round_input) 1383 return self.sql(exp.paren(exp.or_(exp.paren(left), exp.paren(right), wrap=False))) 1384 1385 1386def _xor_sql(self: DuckDBGenerator, expression: exp.Xor) -> str: 1387 round_input = expression.args.get("round_input") 1388 left = _round_arg(expression.this, round_input) 1389 right = _round_arg(expression.expression, round_input) 1390 return self.sql( 1391 exp.or_( 1392 exp.paren(exp.and_(left.copy(), exp.paren(right.not_()), wrap=False)), 1393 exp.paren(exp.and_(exp.paren(left.not_()), right.copy(), wrap=False)), 1394 wrap=False, 1395 ) 1396 ) 1397 1398 1399def _explode_to_unnest_sql(self: DuckDBGenerator, expression: exp.Lateral) -> str: 1400 """Handle LATERAL VIEW EXPLODE/INLINE conversion to UNNEST for DuckDB.""" 1401 explode = expression.this 1402 1403 if isinstance(explode, exp.Inline): 1404 # For INLINE, create CROSS JOIN LATERAL (SELECT UNNEST(..., max_depth => 2)) 1405 # Build the UNNEST call with DuckDB-style named parameter 1406 unnest_expr = exp.Unnest( 1407 expressions=[ 1408 explode.this, 1409 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)), 1410 ] 1411 ) 1412 select_expr = exp.Select(expressions=[unnest_expr]).subquery() 1413 1414 alias_expr = expression.args.get("alias") 1415 if alias_expr and not alias_expr.this: 1416 # we need to provide a table name if not present 1417 alias_expr.set("this", exp.to_identifier(f"_u_{expression.index}")) 1418 1419 transformed_lateral_expr = exp.Lateral(this=select_expr, alias=alias_expr) 1420 cross_join_lateral_expr = exp.Join(this=transformed_lateral_expr, kind="CROSS") 1421 1422 return self.sql(cross_join_lateral_expr) 1423 1424 # For other cases, use the standard conversion 1425 return explode_to_unnest_sql(self, expression) 1426 1427 1428def _sha_sql( 1429 self: DuckDBGenerator, 1430 expression: exp.Expr, 1431 hash_func: str, 1432 is_binary: bool = False, 1433) -> str: 1434 arg = expression.this 1435 1436 # For SHA2 variants, check digest length (DuckDB only supports SHA256) 1437 if hash_func == "SHA256": 1438 length = expression.text("length") or "256" 1439 if length != "256": 1440 self.unsupported("DuckDB only supports SHA256 hashing algorithm.") 1441 1442 # Cast if type is incompatible with DuckDB 1443 if ( 1444 arg.type 1445 and arg.type.this != exp.DType.UNKNOWN 1446 and not arg.is_type(*exp.DataType.TEXT_TYPES) 1447 and not _is_binary(arg) 1448 ): 1449 arg = exp.cast(arg, exp.DType.VARCHAR) 1450 1451 result = self.func(hash_func, arg) 1452 return self.func("UNHEX", result) if is_binary else result 1453 1454 1455class DuckDBGenerator(generator.Generator): 1456 PARAMETER_TOKEN = "$" 1457 NAMED_PLACEHOLDER_TOKEN = "$" 1458 JOIN_HINTS = False 1459 TABLE_HINTS = False 1460 QUERY_HINTS = False 1461 LIMIT_FETCH = "LIMIT" 1462 STRUCT_DELIMITER = ("(", ")") 1463 RENAME_TABLE_WITH_DB = False 1464 NVL2_SUPPORTED = False 1465 SEMI_ANTI_JOIN_WITH_SIDE = False 1466 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1467 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1468 LAST_DAY_SUPPORTS_DATE_PART = False 1469 JSON_KEY_VALUE_PAIR_SEP = "," 1470 IGNORE_NULLS_IN_FUNC = True 1471 IGNORE_NULLS_BEFORE_ORDER = False 1472 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1473 SUPPORTS_CREATE_TABLE_LIKE = False 1474 MULTI_ARG_DISTINCT = False 1475 CAN_IMPLEMENT_ARRAY_ANY = True 1476 SUPPORTS_TO_NUMBER = False 1477 SELECT_KINDS: tuple[str, ...] = () 1478 SUPPORTS_DECODE_CASE = False 1479 SUPPORTS_DROP_ALTER_ICEBERG_PROPERTY = False 1480 1481 AFTER_HAVING_MODIFIER_TRANSFORMS = generator.AFTER_HAVING_MODIFIER_TRANSFORMS 1482 SUPPORTS_WINDOW_EXCLUDE = True 1483 COPY_HAS_INTO_KEYWORD = False 1484 STAR_EXCEPT = "EXCLUDE" 1485 PAD_FILL_PATTERN_IS_REQUIRED = True 1486 ARRAY_SIZE_DIM_REQUIRED: bool | None = False 1487 NORMALIZE_EXTRACT_DATE_PARTS = True 1488 SUPPORTS_LIKE_QUANTIFIERS = False 1489 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1490 1491 TRANSFORMS = { 1492 **generator.Generator.TRANSFORMS, 1493 exp.AnyValue: _anyvalue_sql, 1494 exp.ApproxDistinct: approx_count_distinct_sql, 1495 exp.Boolnot: _boolnot_sql, 1496 exp.Booland: _booland_sql, 1497 exp.Boolor: _boolor_sql, 1498 exp.Array: transforms.preprocess( 1499 [transforms.inherit_struct_field_names], 1500 generator=inline_array_unless_query, 1501 ), 1502 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1503 exp.ArrayCompact: array_compact_sql, 1504 exp.ArrayConstructCompact: lambda self, e: self.sql( 1505 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1506 ), 1507 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1508 exp.ArrayContains: _array_contains_sql, 1509 exp.ArrayOverlaps: _array_overlaps_sql, 1510 exp.ArrayFilter: rename_func("LIST_FILTER"), 1511 exp.ArrayInsert: _array_insert_sql, 1512 exp.ArrayPosition: lambda self, e: ( 1513 self.sql( 1514 exp.Sub( 1515 this=exp.ArrayPosition(this=e.this, expression=e.expression), 1516 expression=exp.Literal.number(1), 1517 ) 1518 ) 1519 if e.args.get("zero_based") 1520 else self.func("ARRAY_POSITION", e.this, e.expression) 1521 ), 1522 exp.ArrayRemoveAt: _array_remove_at_sql, 1523 exp.ArrayRemove: remove_from_array_using_filter, 1524 exp.ArraySort: _array_sort_sql, 1525 exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True), 1526 exp.ArraySum: rename_func("LIST_SUM"), 1527 exp.ArrayMax: rename_func("LIST_MAX"), 1528 exp.ArrayMin: rename_func("LIST_MIN"), 1529 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1530 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1531 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1532 exp.BitwiseAndAgg: _bitwise_agg_sql, 1533 exp.BitwiseCount: rename_func("BIT_COUNT"), 1534 exp.BitwiseLeftShift: _bitshift_sql, 1535 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1536 exp.BitwiseOrAgg: _bitwise_agg_sql, 1537 exp.BitwiseRightShift: _bitshift_sql, 1538 exp.BitwiseXorAgg: _bitwise_agg_sql, 1539 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1540 exp.Corr: lambda self, e: self._corr_sql(e), 1541 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1542 exp.CurrentTime: lambda *_: "CURRENT_TIME", 1543 exp.CurrentSchemas: lambda self, e: self.func( 1544 "current_schemas", e.this if e.this else exp.true() 1545 ), 1546 exp.CurrentTimestamp: lambda self, e: ( 1547 self.sql( 1548 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1549 ) 1550 if e.args.get("sysdate") 1551 else "CURRENT_TIMESTAMP" 1552 ), 1553 exp.CurrentVersion: rename_func("version"), 1554 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1555 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1556 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1557 exp.DayOfWeekIso: rename_func("ISODOW"), 1558 exp.DayOfYear: rename_func("DAYOFYEAR"), 1559 exp.Dayname: lambda self, e: ( 1560 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1561 if e.args.get("abbreviated") 1562 else self.func("DAYNAME", e.this) 1563 ), 1564 exp.Monthname: lambda self, e: ( 1565 self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1566 if e.args.get("abbreviated") 1567 else self.func("MONTHNAME", e.this) 1568 ), 1569 exp.DataType: _datatype_sql, 1570 exp.Date: _date_sql, 1571 exp.DateAdd: _date_delta_to_binary_interval_op(), 1572 exp.DateFromParts: _date_from_parts_sql, 1573 exp.DateSub: _date_delta_to_binary_interval_op(), 1574 exp.DateDiff: _date_diff_sql, 1575 exp.DateStrToDate: datestrtodate_sql, 1576 exp.Datetime: no_datetime_sql, 1577 exp.DatetimeDiff: _date_diff_sql, 1578 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1579 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1580 exp.DateToDi: lambda self, e: ( 1581 f"CAST(STRFTIME({self.sql(e, 'this')}, {self.dialect.DATEINT_FORMAT}) AS INT)" 1582 ), 1583 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1584 exp.DiToDate: lambda self, e: ( 1585 f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {self.dialect.DATEINT_FORMAT}) AS DATE)" 1586 ), 1587 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1588 exp.EqualNull: lambda self, e: self.sql( 1589 exp.NullSafeEQ(this=e.this, expression=e.expression) 1590 ), 1591 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1592 exp.GenerateDateArray: _generate_datetime_array_sql, 1593 exp.GenerateSeries: generate_series_sql("GENERATE_SERIES", "RANGE"), 1594 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1595 exp.Getbit: getbit_sql, 1596 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 1597 exp.Explode: rename_func("UNNEST"), 1598 exp.IcebergProperty: lambda *_: "", 1599 exp.IntDiv: lambda self, e: self.binary(e, "//"), 1600 exp.IsInf: rename_func("ISINF"), 1601 exp.IsNan: rename_func("ISNAN"), 1602 exp.IsNullValue: lambda self, e: self.sql( 1603 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1604 ), 1605 exp.IsArray: lambda self, e: self.sql( 1606 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1607 ), 1608 exp.Ceil: _ceil_floor, 1609 exp.Floor: _ceil_floor, 1610 exp.JSONBExists: rename_func("JSON_EXISTS"), 1611 exp.JSONExtract: _arrow_json_extract_sql, 1612 exp.JSONExtractArray: _json_extract_value_array_sql, 1613 exp.JSONFormat: _json_format_sql, 1614 exp.JSONValueArray: _json_extract_value_array_sql, 1615 exp.Lateral: _explode_to_unnest_sql, 1616 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1617 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1618 exp.Select: transforms.preprocess([_seq_to_range_in_generator]), 1619 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1620 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1621 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1622 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1623 exp.BoolxorAgg: _boolxor_agg_sql, 1624 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1625 exp.Initcap: _initcap_sql, 1626 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1627 exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"), 1628 exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True), 1629 exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"), 1630 exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True), 1631 exp.MonthsBetween: months_between_sql, 1632 exp.NextDay: _day_navigation_sql, 1633 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1634 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1635 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 1636 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1637 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1638 exp.PreviousDay: _day_navigation_sql, 1639 exp.RegexpILike: lambda self, e: self.func( 1640 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1641 ), 1642 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1643 exp.RegrValx: _regr_val_sql, 1644 exp.RegrValy: _regr_val_sql, 1645 exp.Return: lambda self, e: self.sql(e, "this"), 1646 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1647 exp.StrToUnix: lambda self, e: self.func( 1648 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1649 ), 1650 exp.Struct: _struct_sql, 1651 exp.Transform: rename_func("LIST_TRANSFORM"), 1652 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1653 exp.TimeSub: _date_delta_to_binary_interval_op(), 1654 exp.Time: no_time_sql, 1655 exp.TimeDiff: _timediff_sql, 1656 exp.Timestamp: no_timestamp_sql, 1657 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1658 exp.TimestampDiff: lambda self, e: self.func( 1659 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1660 ), 1661 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1662 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DType.DATE)), 1663 exp.TimeStrToTime: timestrtotime_sql, 1664 exp.TimeStrToUnix: lambda self, e: self.func( 1665 "EPOCH", exp.cast(e.this, exp.DType.TIMESTAMP) 1666 ), 1667 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1668 exp.ToBoolean: _to_boolean_sql, 1669 exp.ToVariant: lambda self, e: self.sql( 1670 exp.cast(e.this, exp.DataType.build("VARIANT", dialect="duckdb")) 1671 ), 1672 exp.TimeToUnix: rename_func("EPOCH"), 1673 exp.TsOrDiToDi: lambda self, e: ( 1674 f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)" 1675 ), 1676 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1677 exp.TsOrDsDiff: lambda self, e: self.func( 1678 "DATE_DIFF", 1679 f"'{e.args.get('unit') or 'DAY'}'", 1680 exp.cast(e.expression, exp.DType.TIMESTAMP), 1681 exp.cast(e.this, exp.DType.TIMESTAMP), 1682 ), 1683 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1684 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1685 exp.UnixSeconds: lambda self, e: self.sql( 1686 exp.cast(self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DType.BIGINT) 1687 ), 1688 exp.UnixToStr: lambda self, e: self.func( 1689 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 1690 ), 1691 exp.DatetimeTrunc: lambda self, e: self.func( 1692 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DType.DATETIME) 1693 ), 1694 exp.UnixToTime: _unix_to_time_sql, 1695 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 1696 exp.VariancePop: rename_func("VAR_POP"), 1697 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 1698 exp.YearOfWeek: lambda self, e: self.sql( 1699 exp.Extract( 1700 this=exp.Var(this="ISOYEAR"), 1701 expression=e.this, 1702 ) 1703 ), 1704 exp.YearOfWeekIso: lambda self, e: self.sql( 1705 exp.Extract( 1706 this=exp.Var(this="ISOYEAR"), 1707 expression=e.this, 1708 ) 1709 ), 1710 exp.Xor: _xor_sql, 1711 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1712 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1713 exp.DateBin: rename_func("TIME_BUCKET"), 1714 exp.LastDay: _last_day_sql, 1715 } 1716 1717 SUPPORTED_JSON_PATH_PARTS = { 1718 exp.JSONPathKey, 1719 exp.JSONPathRoot, 1720 exp.JSONPathSubscript, 1721 exp.JSONPathWildcard, 1722 } 1723 1724 TYPE_MAPPING = { 1725 **generator.Generator.TYPE_MAPPING, 1726 exp.DType.BINARY: "BLOB", 1727 exp.DType.BPCHAR: "TEXT", 1728 exp.DType.CHAR: "TEXT", 1729 exp.DType.DATETIME: "TIMESTAMP", 1730 exp.DType.DECFLOAT: "DECIMAL(38, 5)", 1731 exp.DType.FLOAT: "REAL", 1732 exp.DType.JSONB: "JSON", 1733 exp.DType.NCHAR: "TEXT", 1734 exp.DType.NVARCHAR: "TEXT", 1735 exp.DType.UINT: "UINTEGER", 1736 exp.DType.VARBINARY: "BLOB", 1737 exp.DType.ROWVERSION: "BLOB", 1738 exp.DType.VARCHAR: "TEXT", 1739 exp.DType.TIMESTAMPLTZ: "TIMESTAMPTZ", 1740 exp.DType.TIMESTAMPNTZ: "TIMESTAMP", 1741 exp.DType.TIMESTAMP_S: "TIMESTAMP_S", 1742 exp.DType.TIMESTAMP_MS: "TIMESTAMP_MS", 1743 exp.DType.TIMESTAMP_NS: "TIMESTAMP_NS", 1744 exp.DType.BIGDECIMAL: "DECIMAL(38, 5)", 1745 } 1746 1747 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 1748 RESERVED_KEYWORDS = { 1749 "array", 1750 "analyse", 1751 "union", 1752 "all", 1753 "when", 1754 "in_p", 1755 "default", 1756 "create_p", 1757 "window", 1758 "asymmetric", 1759 "to", 1760 "else", 1761 "localtime", 1762 "from", 1763 "end_p", 1764 "select", 1765 "current_date", 1766 "foreign", 1767 "with", 1768 "grant", 1769 "session_user", 1770 "or", 1771 "except", 1772 "references", 1773 "fetch", 1774 "limit", 1775 "group_p", 1776 "leading", 1777 "into", 1778 "collate", 1779 "offset", 1780 "do", 1781 "then", 1782 "localtimestamp", 1783 "check_p", 1784 "lateral_p", 1785 "current_role", 1786 "where", 1787 "asc_p", 1788 "placing", 1789 "desc_p", 1790 "user", 1791 "unique", 1792 "initially", 1793 "column", 1794 "both", 1795 "some", 1796 "as", 1797 "any", 1798 "only", 1799 "deferrable", 1800 "null_p", 1801 "current_time", 1802 "true_p", 1803 "table", 1804 "case", 1805 "trailing", 1806 "variadic", 1807 "for", 1808 "on", 1809 "distinct", 1810 "false_p", 1811 "not", 1812 "constraint", 1813 "current_timestamp", 1814 "returning", 1815 "primary", 1816 "intersect", 1817 "having", 1818 "analyze", 1819 "current_user", 1820 "and", 1821 "cast", 1822 "symmetric", 1823 "using", 1824 "order", 1825 "current_catalog", 1826 } 1827 1828 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 1829 1830 # DuckDB doesn't generally support CREATE TABLE .. properties 1831 # https://duckdb.org/docs/sql/statements/create_table.html 1832 # There are a few exceptions (e.g. temporary tables) which are supported or 1833 # can be transpiled to DuckDB, so we explicitly override them accordingly 1834 PROPERTIES_LOCATION = { 1835 **{ 1836 prop: exp.Properties.Location.UNSUPPORTED 1837 for prop in generator.Generator.PROPERTIES_LOCATION 1838 }, 1839 exp.LikeProperty: exp.Properties.Location.POST_SCHEMA, 1840 exp.TemporaryProperty: exp.Properties.Location.POST_CREATE, 1841 exp.ReturnsProperty: exp.Properties.Location.POST_ALIAS, 1842 exp.SequenceProperties: exp.Properties.Location.POST_EXPRESSION, 1843 exp.IcebergProperty: exp.Properties.Location.POST_CREATE, 1844 } 1845 1846 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS: t.ClassVar = _IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS 1847 1848 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 1849 ZIPF_TEMPLATE: exp.Expr = exp.maybe_parse( 1850 """ 1851 WITH rand AS (SELECT :random_expr AS r), 1852 weights AS ( 1853 SELECT i, 1.0 / POWER(i, :s) AS w 1854 FROM RANGE(1, :n + 1) AS t(i) 1855 ), 1856 cdf AS ( 1857 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 1858 FROM weights 1859 ) 1860 SELECT MIN(i) 1861 FROM cdf 1862 WHERE p >= (SELECT r FROM rand) 1863 """ 1864 ) 1865 1866 # Template for NORMAL transpilation using Box-Muller transform 1867 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 1868 NORMAL_TEMPLATE: exp.Expr = exp.maybe_parse( 1869 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 1870 ) 1871 1872 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 1873 SEEDED_RANDOM_TEMPLATE: exp.Expr = exp.maybe_parse("(ABS(HASH(:seed)) % 1000000) / 1000000.0") 1874 1875 # Template for generating signed and unsigned SEQ values within a specified range 1876 SEQ_UNSIGNED: exp.Expr = _SEQ_UNSIGNED 1877 SEQ_SIGNED: exp.Expr = _SEQ_SIGNED 1878 1879 # Template for MAP_CAT transpilation - Snowflake semantics: 1880 # 1. Returns NULL if either input is NULL 1881 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 1882 # 3. Filters out entries with NULL values from the result 1883 MAPCAT_TEMPLATE: exp.Expr = exp.maybe_parse( 1884 """ 1885 CASE 1886 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 1887 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 1888 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 1889 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 1890 ), __x -> __x.value IS NOT NULL)) 1891 END 1892 """ 1893 ) 1894 1895 # Mappings for EXTRACT/DATE_PART transpilation 1896 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 1897 EXTRACT_STRFTIME_MAPPINGS: dict[str, tuple[str, str]] = { 1898 "WEEKISO": ("%V", "INTEGER"), 1899 "YEAROFWEEK": ("%G", "INTEGER"), 1900 "YEAROFWEEKISO": ("%G", "INTEGER"), 1901 "NANOSECOND": ("%n", "BIGINT"), 1902 } 1903 1904 # Maps epoch-based specifiers to DuckDB epoch functions 1905 EXTRACT_EPOCH_MAPPINGS: dict[str, str] = { 1906 "EPOCH_SECOND": "EPOCH", 1907 "EPOCH_MILLISECOND": "EPOCH_MS", 1908 "EPOCH_MICROSECOND": "EPOCH_US", 1909 "EPOCH_NANOSECOND": "EPOCH_NS", 1910 } 1911 1912 # Template for BITMAP_CONSTRUCT_AGG transpilation 1913 # 1914 # BACKGROUND: 1915 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 1916 # Supports values in range 0-32767, this version returns NULL if any value is out of range 1917 # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg 1918 # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts 1919 # 1920 # Snowflake uses two different formats based on the number of unique values: 1921 # 1922 # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes 1923 # Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003) 1924 # Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes 1925 # Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex) 1926 # count v1 v2 v3 pad 1927 # 1928 # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes 1929 # Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes 1930 # Bytes 10+: Each value as 2-byte little-endian integer (no padding) 1931 # Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500 1932 # hdr ----9 zero bytes---- v1 v2 v3 v4 v5 1933 # 1934 # TEMPLATE STRUCTURE 1935 # 1936 # Phase 1 - Innermost subquery: Data preparation 1937 # SELECT LIST_SORT(...) AS l 1938 # - Aggregates all input values into a list, remove NULLs, duplicates and sorts 1939 # Result: Clean, sorted list of unique non-null integers stored as 'l' 1940 # 1941 # Phase 2 - Middle subquery: Hex string construction 1942 # LIST_TRANSFORM(...) 1943 # - Converts each integer to 2-byte little-endian hex representation 1944 # - & 255 extracts low byte, >> 8 extracts high byte 1945 # - LIST_REDUCE: Concatenates all hex pairs into single string 'h' 1946 # Result: Hex string of all values 1947 # 1948 # Phase 3 - Outer SELECT: Final bitmap assembly 1949 # LENGTH(l) < 5: 1950 # - Small format: 2-byte count (big-endian via %04X) + values + zero padding 1951 # LENGTH(l) >= 5: 1952 # - Large format: Fixed 10-byte header + values (no padding needed) 1953 # Result: Complete binary bitmap as BLOB 1954 # 1955 BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expr = exp.maybe_parse( 1956 """ 1957 SELECT CASE 1958 WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL 1959 WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL 1960 WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) 1961 ELSE UNHEX('08000000000000000000' || h) 1962 END 1963 FROM ( 1964 SELECT l, COALESCE(LIST_REDUCE( 1965 LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), 1966 (__a, __b) -> __a || __b, '' 1967 ), '') AS h 1968 FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l) 1969 ) 1970 """ 1971 ) 1972 1973 # Template for RANDSTR transpilation - placeholders get replaced with actual parameters 1974 RANDSTR_TEMPLATE: exp.Expr = exp.maybe_parse( 1975 f""" 1976 SELECT LISTAGG( 1977 SUBSTRING( 1978 '{RANDSTR_CHAR_POOL}', 1979 1 + CAST(FLOOR(random_value * 62) AS INT), 1980 1 1981 ), 1982 '' 1983 ) 1984 FROM ( 1985 SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value 1986 FROM RANGE(:length) AS t(i) 1987 ) 1988 """, 1989 ) 1990 1991 # Template for MINHASH transpilation 1992 # Computes k minimum hash values across aggregated data using DuckDB list functions 1993 # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1} 1994 MINHASH_TEMPLATE: exp.Expr = exp.maybe_parse( 1995 """ 1996 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1) 1997 FROM ( 1998 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 1999 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2000 ) 2001 """, 2002 ) 2003 2004 # Template for MINHASH_COMBINE transpilation 2005 # Combines multiple minhash signatures by taking element-wise minimum 2006 MINHASH_COMBINE_TEMPLATE: exp.Expr = exp.maybe_parse( 2007 """ 2008 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2009 FROM ( 2010 SELECT 2011 pos AS idx, 2012 MIN(val) AS min_h 2013 FROM 2014 UNNEST(LIST(:expr)) AS _(sig), 2015 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2016 GROUP BY pos 2017 ) 2018 """, 2019 ) 2020 2021 # Template for APPROXIMATE_SIMILARITY transpilation 2022 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2023 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expr = exp.maybe_parse( 2024 """ 2025 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2026 FROM ( 2027 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2028 FROM ( 2029 SELECT h, pos 2030 FROM UNNEST(LIST(:expr)) AS _(sig), 2031 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2032 ) 2033 GROUP BY pos 2034 ) 2035 """, 2036 ) 2037 2038 # Template for ARRAYS_ZIP transpilation 2039 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest 2040 # Uses RANGE + indexing to match Snowflake behavior 2041 ARRAYS_ZIP_TEMPLATE: exp.Expr = exp.maybe_parse( 2042 """ 2043 CASE WHEN :null_check THEN NULL 2044 WHEN :all_empty_check THEN [:empty_struct] 2045 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2046 END 2047 """, 2048 ) 2049 2050 # Shared bag semantics outer frame for ARRAY_EXCEPT and ARRAY_INTERSECTION. 2051 # Each element is paired with its 1-based position via LIST_ZIP, then filtered 2052 # by a comparison operator (supplied via :cond) that determines the operation: 2053 # EXCEPT (>): keep the N-th occurrence only if N > count in arr2 2054 # e.g. [2,2,2] EXCEPT [2,2] -> [2] 2055 # INTERSECTION (<=): keep the N-th occurrence only if N <= count in arr2 2056 # e.g. [2,2,2] INTERSECT [2,2] -> [2,2] 2057 # IS NOT DISTINCT FROM is used for NULL-safe element comparison. 2058 ARRAY_BAG_TEMPLATE: exp.Expr = exp.maybe_parse( 2059 """ 2060 CASE 2061 WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL 2062 ELSE LIST_TRANSFORM( 2063 LIST_FILTER( 2064 LIST_ZIP(:arr1, GENERATE_SERIES(1, LEN(:arr1))), 2065 pair -> :cond 2066 ), 2067 pair -> pair[0] 2068 ) 2069 END 2070 """ 2071 ) 2072 2073 ARRAY_EXCEPT_CONDITION: exp.Expr = exp.maybe_parse( 2074 "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))" 2075 " > LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))" 2076 ) 2077 2078 ARRAY_INTERSECTION_CONDITION: exp.Expr = exp.maybe_parse( 2079 "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))" 2080 " <= LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))" 2081 ) 2082 2083 # Set semantics for ARRAY_EXCEPT. Deduplicates arr1 via LIST_DISTINCT, then 2084 # filters out any element that appears at least once in arr2. 2085 # e.g. [1,1,2,3] EXCEPT [1] -> [2,3] 2086 # IS NOT DISTINCT FROM is used for NULL-safe element comparison. 2087 ARRAY_EXCEPT_SET_TEMPLATE: exp.Expr = exp.maybe_parse( 2088 """ 2089 CASE 2090 WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL 2091 ELSE LIST_FILTER( 2092 LIST_DISTINCT(:arr1), 2093 e -> LEN(LIST_FILTER(:arr2, x -> x IS NOT DISTINCT FROM e)) = 0 2094 ) 2095 END 2096 """ 2097 ) 2098 2099 # Template for STRTOK function transpilation 2100 # 2101 # DuckDB itself doesn't have a strtok function. This handles the transpilation from Snowflake to DuckDB. 2102 # We may need to adjust this if we want to support transpilation from other dialects 2103 # 2104 # CASE 2105 # -- Snowflake: empty delimiter + empty input string -> NULL 2106 # WHEN delimiter = '' AND input_str = '' THEN NULL 2107 # 2108 # -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return input string if index is 1 2109 # WHEN delimiter = '' AND index = 1 THEN input_str 2110 # 2111 # -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return NULL if index is not 1 2112 # WHEN delimiter = '' THEN NULL 2113 # 2114 # -- Snowflake: negative indices return NULL 2115 # WHEN index < 0 THEN NULL 2116 # 2117 # -- Snowflake: return NULL if any argument is NULL 2118 # WHEN input_str IS NULL OR delimiter IS NULL OR index IS NULL THEN NULL 2119 # 2120 # 2121 # ELSE LIST_FILTER( 2122 # REGEXP_SPLIT_TO_ARRAY( 2123 # input_str, 2124 # CASE 2125 # -- if delimiter is '', we don't want to surround it with '[' and ']' as '[]' is invalid for DuckDB 2126 # WHEN delimiter = '' THEN '' 2127 # 2128 # -- handle problematic regex characters in delimiter with REGEXP_REPLACE 2129 # -- turn delimiter into a regex char set, otherwise DuckDB will match in order, which we don't want 2130 # ELSE '[' || REGEXP_REPLACE(delimiter, problematic_char_set, '\\\1', 'g') || ']' 2131 # END 2132 # ), 2133 # 2134 # -- Snowflake: don't return empty strings 2135 # x -> NOT x = '' 2136 # )[index] 2137 # END 2138 STRTOK_TEMPLATE: exp.Expr = exp.maybe_parse( 2139 """ 2140 CASE 2141 WHEN :delimiter = '' AND :string = '' THEN NULL 2142 WHEN :delimiter = '' AND :part_index = 1 THEN :string 2143 WHEN :delimiter = '' THEN NULL 2144 WHEN :part_index < 0 THEN NULL 2145 WHEN :string IS NULL OR :delimiter IS NULL OR :part_index IS NULL THEN NULL 2146 ELSE :base_func 2147 END 2148 """ 2149 ) 2150 2151 def _array_bag_sql(self, condition: exp.Expr, arr1: exp.Expr, arr2: exp.Expr) -> str: 2152 cond = exp.Paren(this=exp.replace_placeholders(condition, arr1=arr1, arr2=arr2)) 2153 return self.sql( 2154 exp.replace_placeholders(self.ARRAY_BAG_TEMPLATE, arr1=arr1, arr2=arr2, cond=cond) 2155 ) 2156 2157 def timeslice_sql(self, expression: exp.TimeSlice) -> str: 2158 """ 2159 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2160 2161 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2162 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2163 2164 For 'END' kind, add the interval to get the end of the slice. 2165 For DATE type with 'END', cast result back to DATE to preserve type. 2166 """ 2167 date_expr = expression.this 2168 slice_length = expression.expression 2169 unit = expression.unit 2170 kind = expression.text("kind").upper() 2171 2172 # Create INTERVAL expression: INTERVAL 'N' UNIT 2173 interval_expr = exp.Interval(this=slice_length, unit=unit) 2174 2175 # Create base time_bucket expression 2176 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2177 2178 # Check if we need the end of the slice (default is start) 2179 if not kind == "END": 2180 # For 'START', return time_bucket directly 2181 return self.sql(time_bucket_expr) 2182 2183 # For 'END', add the interval to get end of slice 2184 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2185 2186 # If input is DATE type, cast result back to DATE to preserve type 2187 # DuckDB converts DATE to TIMESTAMP when adding intervals 2188 if date_expr.is_type(exp.DType.DATE): 2189 return self.sql(exp.cast(add_expr, exp.DType.DATE)) 2190 2191 return self.sql(add_expr) 2192 2193 def bitmapbucketnumber_sql(self, expression: exp.BitmapBucketNumber) -> str: 2194 """ 2195 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2196 2197 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2198 - Each bucket covers 32,768 values 2199 - Bucket numbering starts at 1 2200 - Formula: ((value - 1) // 32768) + 1 for positive values 2201 2202 For non-positive values (0 and negative), we use value // 32768 to avoid 2203 producing bucket 0 or positive bucket IDs for negative inputs. 2204 """ 2205 value = expression.this 2206 2207 positive_formula = ((value - 1) // 32768) + 1 2208 non_positive_formula = value // 32768 2209 2210 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2211 case_expr = ( 2212 exp.case() 2213 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2214 .else_(non_positive_formula) 2215 ) 2216 return self.sql(case_expr) 2217 2218 def bitmapbitposition_sql(self, expression: exp.BitmapBitPosition) -> str: 2219 """ 2220 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2221 2222 Snowflake's BITMAP_BIT_POSITION behavior: 2223 - For n <= 0: returns ABS(n) % 32768 2224 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2225 """ 2226 this = expression.this 2227 2228 return self.sql( 2229 exp.Mod( 2230 this=exp.Paren( 2231 this=exp.If( 2232 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2233 true=this - exp.Literal.number(1), 2234 false=exp.Abs(this=this), 2235 ) 2236 ), 2237 expression=MAX_BIT_POSITION, 2238 ) 2239 ) 2240 2241 def bitmapconstructagg_sql(self, expression: exp.BitmapConstructAgg) -> str: 2242 """ 2243 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2244 Uses a pre-parsed template with placeholders replaced by expression nodes. 2245 2246 Snowflake bitmap format: 2247 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2248 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2249 """ 2250 arg = expression.this 2251 return ( 2252 f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2253 ) 2254 2255 def compress_sql(self, expression: exp.Compress) -> str: 2256 self.unsupported("DuckDB does not support the COMPRESS() function") 2257 return self.function_fallback_sql(expression) 2258 2259 def encrypt_sql(self, expression: exp.Encrypt) -> str: 2260 self.unsupported("ENCRYPT is not supported in DuckDB") 2261 return self.function_fallback_sql(expression) 2262 2263 def decrypt_sql(self, expression: exp.Decrypt) -> str: 2264 func_name = "TRY_DECRYPT" if expression.args.get("safe") else "DECRYPT" 2265 self.unsupported(f"{func_name} is not supported in DuckDB") 2266 return self.function_fallback_sql(expression) 2267 2268 def decryptraw_sql(self, expression: exp.DecryptRaw) -> str: 2269 func_name = "TRY_DECRYPT_RAW" if expression.args.get("safe") else "DECRYPT_RAW" 2270 self.unsupported(f"{func_name} is not supported in DuckDB") 2271 return self.function_fallback_sql(expression) 2272 2273 def encryptraw_sql(self, expression: exp.EncryptRaw) -> str: 2274 self.unsupported("ENCRYPT_RAW is not supported in DuckDB") 2275 return self.function_fallback_sql(expression) 2276 2277 def parseurl_sql(self, expression: exp.ParseUrl) -> str: 2278 self.unsupported("PARSE_URL is not supported in DuckDB") 2279 return self.function_fallback_sql(expression) 2280 2281 def parseip_sql(self, expression: exp.ParseIp) -> str: 2282 self.unsupported("PARSE_IP is not supported in DuckDB") 2283 return self.function_fallback_sql(expression) 2284 2285 def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str: 2286 this = expression.this 2287 expr = expression.expression 2288 2289 if expression.args.get("case_insensitive"): 2290 this = exp.Upper(this=this) 2291 expr = exp.Upper(this=expr) 2292 2293 result = exp.func("JARO_WINKLER_SIMILARITY", this, expr) 2294 2295 if expression.args.get("integer_scale"): 2296 result = exp.cast(result * 100, "INTEGER") 2297 2298 return self.sql(result) 2299 2300 def nthvalue_sql(self, expression: exp.NthValue) -> str: 2301 from_first = expression.args.get("from_first", True) 2302 if not from_first: 2303 self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end ") 2304 2305 return self.function_fallback_sql(expression) 2306 2307 def randstr_sql(self, expression: exp.Randstr) -> str: 2308 """ 2309 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2310 Uses a pre-parsed template with placeholders replaced by expression nodes. 2311 2312 RANDSTR(length, generator) generates a random string of specified length. 2313 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2314 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2315 - No generator: Use default seed value 2316 """ 2317 length = expression.this 2318 generator = expression.args.get("generator") 2319 2320 if generator: 2321 if isinstance(generator, exp.Rand): 2322 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2323 seed_value = generator.this or generator 2324 else: 2325 # Const/int or other expression - use as seed directly 2326 seed_value = generator 2327 else: 2328 # No generator specified, use default seed (arbitrary but deterministic) 2329 seed_value = exp.Literal.number(RANDSTR_SEED) 2330 2331 replacements = {"seed": seed_value, "length": length} 2332 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2333 2334 @unsupported_args("finish") 2335 def reduce_sql(self, expression: exp.Reduce) -> str: 2336 array_arg = expression.this 2337 initial_value = expression.args.get("initial") 2338 merge_lambda = expression.args.get("merge") 2339 2340 if merge_lambda: 2341 merge_lambda.set("colon", True) 2342 2343 return self.func("list_reduce", array_arg, merge_lambda, initial_value) 2344 2345 def zipf_sql(self, expression: exp.Zipf) -> str: 2346 """ 2347 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2348 Uses a pre-parsed template with placeholders replaced by expression nodes. 2349 """ 2350 s = expression.this 2351 n = expression.args["elementcount"] 2352 gen = expression.args["gen"] 2353 2354 if not isinstance(gen, exp.Rand): 2355 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2356 random_expr: exp.Expr = exp.Div( 2357 this=exp.Paren( 2358 this=exp.Mod( 2359 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2360 expression=exp.Literal.number(1000000), 2361 ) 2362 ), 2363 expression=exp.Literal.number(1000000.0), 2364 ) 2365 else: 2366 # Use RANDOM() for non-deterministic output 2367 random_expr = exp.Rand() 2368 2369 replacements = {"s": s, "n": n, "random_expr": random_expr} 2370 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2371 2372 def tobinary_sql(self, expression: exp.ToBinary) -> str: 2373 """ 2374 TO_BINARY and TRY_TO_BINARY transpilation: 2375 - 'HEX': TO_BINARY('48454C50', 'HEX') -> UNHEX('48454C50') 2376 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') -> ENCODE('TEST') 2377 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') -> FROM_BASE64('SEVMUA==') 2378 2379 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2380 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') -> TRY(UNHEX('invalid')) 2381 """ 2382 value = expression.this 2383 format_arg = expression.args.get("format") 2384 is_safe = expression.args.get("safe") 2385 is_binary = _is_binary(expression) 2386 2387 if not format_arg and not is_binary: 2388 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2389 return self.func(func_name, value) 2390 2391 # Snowflake defaults to HEX encoding when no format is specified 2392 fmt = format_arg.name.upper() if format_arg else "HEX" 2393 2394 if fmt in ("UTF-8", "UTF8"): 2395 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2396 result = self.func("ENCODE", value) 2397 elif fmt == "BASE64": 2398 result = self.func("FROM_BASE64", value) 2399 elif fmt == "HEX": 2400 result = self.func("UNHEX", value) 2401 else: 2402 if is_safe: 2403 return self.sql(exp.null()) 2404 else: 2405 self.unsupported(f"format {fmt} is not supported") 2406 result = self.func("TO_BINARY", value) 2407 return f"TRY({result})" if is_safe else result 2408 2409 def tonumber_sql(self, expression: exp.ToNumber) -> str: 2410 fmt = expression.args.get("format") 2411 precision = expression.args.get("precision") 2412 scale = expression.args.get("scale") 2413 2414 if not fmt and precision and scale: 2415 return self.sql( 2416 exp.cast( 2417 expression.this, f"DECIMAL({precision.name}, {scale.name})", dialect="duckdb" 2418 ) 2419 ) 2420 2421 return super().tonumber_sql(expression) 2422 2423 def _greatest_least_sql(self, expression: exp.Greatest | exp.Least) -> str: 2424 """ 2425 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 2426 2427 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2428 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2429 """ 2430 # Get all arguments 2431 all_args = [expression.this, *expression.expressions] 2432 fallback_sql = self.function_fallback_sql(expression) 2433 2434 if expression.args.get("ignore_nulls"): 2435 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2436 return self.sql(fallback_sql) 2437 2438 # return NULL if any argument is NULL 2439 case_expr = exp.case().when( 2440 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2441 exp.null(), 2442 copy=False, 2443 ) 2444 case_expr.set("default", fallback_sql) 2445 return self.sql(case_expr) 2446 2447 def generator_sql(self, expression: exp.Generator) -> str: 2448 # Transpile Snowflake GENERATOR to DuckDB range() 2449 rowcount = expression.args.get("rowcount") 2450 time_limit = expression.args.get("time_limit") 2451 2452 if time_limit: 2453 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2454 2455 if not rowcount: 2456 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2457 return self.func("range", exp.Literal.number(0)) 2458 2459 return self.func("range", rowcount) 2460 2461 def greatest_sql(self, expression: exp.Greatest) -> str: 2462 return self._greatest_least_sql(expression) 2463 2464 def least_sql(self, expression: exp.Least) -> str: 2465 return self._greatest_least_sql(expression) 2466 2467 def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str: 2468 if expression.args.get("colon"): 2469 prefix = "LAMBDA " 2470 arrow_sep = ":" 2471 wrap = False 2472 else: 2473 prefix = "" 2474 2475 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2476 return f"{prefix}{lambda_sql}" 2477 2478 def show_sql(self, expression: exp.Show) -> str: 2479 from_ = self.sql(expression, "from_") 2480 from_ = f" FROM {from_}" if from_ else "" 2481 return f"SHOW {expression.name}{from_}" 2482 2483 def soundex_sql(self, expression: exp.Soundex) -> str: 2484 self.unsupported("SOUNDEX is not supported in DuckDB") 2485 return self.func("SOUNDEX", expression.this) 2486 2487 def sortarray_sql(self, expression: exp.SortArray) -> str: 2488 arr = expression.this 2489 asc = expression.args.get("asc") 2490 nulls_first = expression.args.get("nulls_first") 2491 2492 if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean): 2493 return self.func("LIST_SORT", arr, asc, nulls_first) 2494 2495 nulls_are_first = nulls_first == exp.true() 2496 nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None 2497 2498 if not isinstance(asc, exp.Boolean): 2499 return self.func("LIST_SORT", arr, asc, nulls_first_sql) 2500 2501 descending = asc == exp.false() 2502 2503 if not descending and not nulls_are_first: 2504 return self.func("LIST_SORT", arr) 2505 if not nulls_are_first: 2506 return self.func("ARRAY_REVERSE_SORT", arr) 2507 return self.func( 2508 "LIST_SORT", 2509 arr, 2510 exp.Literal.string("DESC" if descending else "ASC"), 2511 exp.Literal.string("NULLS FIRST"), 2512 ) 2513 2514 def install_sql(self, expression: exp.Install) -> str: 2515 force = "FORCE " if expression.args.get("force") else "" 2516 this = self.sql(expression, "this") 2517 from_clause = expression.args.get("from_") 2518 from_clause = f" FROM {from_clause}" if from_clause else "" 2519 return f"{force}INSTALL {this}{from_clause}" 2520 2521 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2522 self.unsupported( 2523 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. " 2524 ) 2525 return self.function_fallback_sql(expression) 2526 2527 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2528 return self.sql(exp.cast(expression.this, exp.DType.TIMESTAMPTZ)) 2529 2530 def strposition_sql(self, expression: exp.StrPosition) -> str: 2531 this = expression.this 2532 substr = expression.args.get("substr") 2533 position = expression.args.get("position") 2534 2535 # For BINARY/BLOB: DuckDB's STRPOS doesn't support BLOB types 2536 # Convert to HEX strings, use STRPOS, then convert hex position to byte position 2537 if _is_binary(this): 2538 # Build expression: STRPOS(HEX(haystack), HEX(needle)) 2539 hex_strpos = exp.StrPosition( 2540 this=exp.Hex(this=this), 2541 substr=exp.Hex(this=substr), 2542 ) 2543 2544 return self.sql(exp.cast((hex_strpos + 1) / 2, exp.DType.INT)) 2545 2546 # For VARCHAR: handle clamp_position 2547 if expression.args.get("clamp_position") and position: 2548 expression = expression.copy() 2549 expression.set( 2550 "position", 2551 exp.If( 2552 this=exp.LTE(this=position, expression=exp.Literal.number(0)), 2553 true=exp.Literal.number(1), 2554 false=position.copy(), 2555 ), 2556 ) 2557 2558 return strposition_sql(self, expression) 2559 2560 def substring_sql(self, expression: exp.Substring) -> str: 2561 if expression.args.get("zero_start"): 2562 start = expression.args.get("start") 2563 length = expression.args.get("length") 2564 2565 if start := expression.args.get("start"): 2566 start = exp.If(this=start.eq(0), true=exp.Literal.number(1), false=start) 2567 if length := expression.args.get("length"): 2568 length = exp.If(this=length < 0, true=exp.Literal.number(0), false=length) 2569 2570 return self.func("SUBSTRING", expression.this, start, length) 2571 2572 return self.function_fallback_sql(expression) 2573 2574 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2575 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2576 target_type = expression.args.get("target_type") 2577 needs_tz = target_type and target_type.this in ( 2578 exp.DType.TIMESTAMPLTZ, 2579 exp.DType.TIMESTAMPTZ, 2580 ) 2581 2582 if expression.args.get("safe"): 2583 formatted_time = self.format_time(expression) 2584 cast_type = exp.DType.TIMESTAMPTZ if needs_tz else exp.DType.TIMESTAMP 2585 return self.sql( 2586 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2587 ) 2588 2589 base_sql = str_to_time_sql(self, expression) 2590 if needs_tz: 2591 return self.sql( 2592 exp.cast( 2593 base_sql, 2594 exp.DataType(this=exp.DType.TIMESTAMPTZ), 2595 ) 2596 ) 2597 return base_sql 2598 2599 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2600 formatted_time = self.format_time(expression) 2601 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2602 return self.sql( 2603 exp.cast( 2604 self.func(function_name, expression.this, formatted_time), 2605 exp.DataType(this=exp.DType.DATE), 2606 ) 2607 ) 2608 2609 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2610 this = expression.this 2611 time_format = self.format_time(expression) 2612 safe = expression.args.get("safe") 2613 time_type = exp.DataType.build("TIME", dialect="duckdb") 2614 cast_expr = exp.TryCast if safe else exp.Cast 2615 2616 if time_format: 2617 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2618 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2619 return self.sql(cast_expr(this=strptime, to=time_type)) 2620 2621 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DType.TIME): 2622 return self.sql(this) 2623 2624 return self.sql(cast_expr(this=this, to=time_type)) 2625 2626 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2627 if not expression.this: 2628 return "CURRENT_DATE" 2629 2630 expr = exp.Cast( 2631 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2632 to=exp.DataType(this=exp.DType.DATE), 2633 ) 2634 return self.sql(expr) 2635 2636 def checkjson_sql(self, expression: exp.CheckJson) -> str: 2637 arg = expression.this 2638 return self.sql( 2639 exp.case() 2640 .when( 2641 exp.or_(arg.is_(exp.Null()), arg.eq(""), exp.func("json_valid", arg)), 2642 exp.null(), 2643 ) 2644 .else_(exp.Literal.string("Invalid JSON")) 2645 ) 2646 2647 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2648 arg = expression.this 2649 if expression.args.get("safe"): 2650 return self.sql( 2651 exp.case() 2652 .when(exp.func("json_valid", arg), exp.cast(arg.copy(), "JSON")) 2653 .else_(exp.null()) 2654 ) 2655 return self.func("JSON", arg) 2656 2657 def unicode_sql(self, expression: exp.Unicode) -> str: 2658 if expression.args.get("empty_is_zero"): 2659 return self.sql( 2660 exp.case() 2661 .when(expression.this.eq(exp.Literal.string("")), exp.Literal.number(0)) 2662 .else_(exp.Anonymous(this="UNICODE", expressions=[expression.this])) 2663 ) 2664 2665 return self.func("UNICODE", expression.this) 2666 2667 def stripnullvalue_sql(self, expression: exp.StripNullValue) -> str: 2668 return self.sql( 2669 exp.case() 2670 .when(exp.func("json_type", expression.this).eq("NULL"), exp.null()) 2671 .else_(expression.this) 2672 ) 2673 2674 def trunc_sql(self, expression: exp.Trunc) -> str: 2675 decimals = expression.args.get("decimals") 2676 if ( 2677 expression.args.get("fractions_supported") 2678 and decimals 2679 and not decimals.is_type(exp.DType.INT) 2680 ): 2681 decimals = exp.cast(decimals, exp.DType.INT, dialect="duckdb") 2682 2683 return self.func("TRUNC", expression.this, decimals) 2684 2685 def normal_sql(self, expression: exp.Normal) -> str: 2686 """ 2687 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2688 2689 Uses the Box-Muller transform via NORMAL_TEMPLATE. 2690 """ 2691 mean = expression.this 2692 stddev = expression.args["stddev"] 2693 gen: exp.Expr = expression.args["gen"] 2694 2695 # Build two uniform random values [0, 1) for Box-Muller transform 2696 if isinstance(gen, exp.Rand) and gen.this is None: 2697 u1: exp.Expr = exp.Rand() 2698 u2: exp.Expr = exp.Rand() 2699 else: 2700 # Seeded: derive two values using HASH with different inputs 2701 seed = gen.this if isinstance(gen, exp.Rand) else gen 2702 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2703 u2 = exp.replace_placeholders( 2704 self.SEEDED_RANDOM_TEMPLATE, 2705 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2706 ) 2707 2708 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2709 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2710 2711 def uniform_sql(self, expression: exp.Uniform) -> str: 2712 """ 2713 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2714 2715 UNIFORM returns a random value in [min, max]: 2716 - Integer result if both min and max are integers 2717 - Float result if either min or max is a float 2718 """ 2719 min_val = expression.this 2720 max_val = expression.expression 2721 gen = expression.args.get("gen") 2722 2723 # Determine if result should be integer (both bounds are integers). 2724 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2725 is_int_result = min_val.is_int and max_val.is_int 2726 2727 # Build the random value expression [0, 1) 2728 if not isinstance(gen, exp.Rand): 2729 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2730 random_expr: exp.Expr = exp.Div( 2731 this=exp.Paren( 2732 this=exp.Mod( 2733 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2734 expression=exp.Literal.number(1000000), 2735 ) 2736 ), 2737 expression=exp.Literal.number(1000000.0), 2738 ) 2739 else: 2740 random_expr = exp.Rand() 2741 2742 # Build: min + random * (max - min [+ 1 for int]) 2743 range_expr: exp.Expr = exp.Sub(this=max_val, expression=min_val) 2744 if is_int_result: 2745 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2746 2747 result: exp.Expr = exp.Add( 2748 this=min_val, 2749 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2750 ) 2751 2752 if is_int_result: 2753 result = exp.Cast(this=exp.Floor(this=result), to=exp.DType.BIGINT.into_expr()) 2754 2755 return self.sql(result) 2756 2757 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2758 nano = expression.args.get("nano") 2759 overflow = expression.args.get("overflow") 2760 2761 # Snowflake's TIME_FROM_PARTS supports overflow 2762 if overflow: 2763 hour = expression.args["hour"] 2764 minute = expression.args["min"] 2765 sec = expression.args["sec"] 2766 2767 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2768 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2769 try: 2770 h_val = hour.to_py() 2771 m_val = minute.to_py() 2772 s_val = sec.to_py() 2773 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2774 return rename_func("MAKE_TIME")(self, expression) 2775 except ValueError: 2776 pass 2777 2778 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2779 if nano: 2780 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2781 2782 total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2783 2784 return self.sql( 2785 exp.Add( 2786 this=exp.Cast( 2787 this=exp.Literal.string("00:00:00"), to=exp.DType.TIME.into_expr() 2788 ), 2789 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2790 ) 2791 ) 2792 2793 # Default: MAKE_TIME 2794 if nano: 2795 expression.set( 2796 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2797 ) 2798 2799 return rename_func("MAKE_TIME")(self, expression) 2800 2801 def extract_sql(self, expression: exp.Extract) -> str: 2802 """ 2803 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2804 2805 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2806 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2807 """ 2808 this = expression.this 2809 datetime_expr = expression.expression 2810 2811 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2812 # because Snowflake applies server timezone while DuckDB uses local timezone 2813 if datetime_expr.is_type(exp.DType.TIMESTAMPTZ, exp.DType.TIMESTAMPLTZ): 2814 self.unsupported( 2815 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2816 ) 2817 2818 part_name = this.name.upper() 2819 2820 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2821 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2822 2823 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2824 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2825 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2826 exp.DType.TIME, exp.DType.TIMETZ 2827 ) 2828 2829 if is_nano_time: 2830 self.unsupported("Parameter NANOSECOND is not supported with TIME type in DuckDB") 2831 return self.sql( 2832 exp.cast( 2833 exp.Mul( 2834 this=exp.Extract(this=exp.var("MICROSECOND"), expression=datetime_expr), 2835 expression=exp.Literal.number(1000), 2836 ), 2837 exp.DataType.build(cast_type, dialect="duckdb"), 2838 ) 2839 ) 2840 2841 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2842 strftime_input = datetime_expr 2843 if part_name == "NANOSECOND": 2844 strftime_input = exp.cast(datetime_expr, exp.DType.TIMESTAMP_NS) 2845 2846 return self.sql( 2847 exp.cast( 2848 exp.Anonymous( 2849 this="STRFTIME", 2850 expressions=[strftime_input, exp.Literal.string(fmt)], 2851 ), 2852 exp.DataType.build(cast_type, dialect="duckdb"), 2853 ) 2854 ) 2855 2856 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2857 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2858 result: exp.Expr = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2859 # EPOCH returns float, cast to BIGINT for integer result 2860 if part_name == "EPOCH_SECOND": 2861 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2862 return self.sql(result) 2863 2864 return super().extract_sql(expression) 2865 2866 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2867 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2868 date_expr = expression.this 2869 time_expr = expression.expression 2870 2871 if date_expr is not None and time_expr is not None: 2872 # In DuckDB, DATE + TIME produces TIMESTAMP 2873 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2874 2875 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 2876 sec = expression.args.get("sec") 2877 if sec is None: 2878 # This shouldn't happen with valid input, but handle gracefully 2879 return rename_func("MAKE_TIMESTAMP")(self, expression) 2880 2881 milli = expression.args.get("milli") 2882 if milli is not None: 2883 sec += milli.pop() / exp.Literal.number(1000.0) 2884 2885 nano = expression.args.get("nano") 2886 if nano is not None: 2887 sec += nano.pop() / exp.Literal.number(1000000000.0) 2888 2889 if milli or nano: 2890 expression.set("sec", sec) 2891 2892 return rename_func("MAKE_TIMESTAMP")(self, expression) 2893 2894 @unsupported_args("nano") 2895 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2896 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2897 if nano := expression.args.get("nano"): 2898 nano.pop() 2899 2900 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2901 return f"CAST({timestamp} AS TIMESTAMPTZ)" 2902 2903 @unsupported_args("nano") 2904 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2905 # Extract zone before popping 2906 zone = expression.args.get("zone") 2907 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2908 if zone: 2909 zone = zone.pop() 2910 2911 if nano := expression.args.get("nano"): 2912 nano.pop() 2913 2914 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2915 2916 if zone: 2917 # Use AT TIME ZONE to apply the explicit timezone 2918 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2919 2920 return timestamp 2921 2922 def tablesample_sql( 2923 self, 2924 expression: exp.TableSample, 2925 tablesample_keyword: str | None = None, 2926 ) -> str: 2927 if not isinstance(expression.parent, exp.Select): 2928 # This sample clause only applies to a single source, not the entire resulting relation 2929 tablesample_keyword = "TABLESAMPLE" 2930 2931 if expression.args.get("size"): 2932 method = expression.args.get("method") 2933 if method and method.name.upper() != "RESERVOIR": 2934 self.unsupported( 2935 f"Sampling method {method} is not supported with a discrete sample count, " 2936 "defaulting to reservoir sampling" 2937 ) 2938 expression.set("method", exp.var("RESERVOIR")) 2939 2940 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 2941 2942 def join_sql(self, expression: exp.Join) -> str: 2943 if ( 2944 not expression.args.get("using") 2945 and not expression.args.get("on") 2946 and not expression.method 2947 and (expression.kind in ("", "INNER", "OUTER")) 2948 ): 2949 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2950 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2951 if isinstance(expression.this, exp.Unnest): 2952 return super().join_sql(expression.on(exp.true())) 2953 2954 expression.set("side", None) 2955 expression.set("kind", None) 2956 2957 return super().join_sql(expression) 2958 2959 def countif_sql(self, expression: exp.CountIf) -> str: 2960 if self.dialect.version >= (1, 2): 2961 return self.function_fallback_sql(expression) 2962 2963 # https://github.com/tobymao/sqlglot/pull/4749 2964 return count_if_to_sum(self, expression) 2965 2966 def bracket_sql(self, expression: exp.Bracket) -> str: 2967 if self.dialect.version >= (1, 2): 2968 return super().bracket_sql(expression) 2969 2970 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2971 this = expression.this 2972 if isinstance(this, exp.Array): 2973 this.replace(exp.paren(this)) 2974 2975 bracket = super().bracket_sql(expression) 2976 2977 if not expression.args.get("returns_list_for_maps"): 2978 if not this.type: 2979 from sqlglot.optimizer.annotate_types import annotate_types 2980 2981 this = annotate_types(this, dialect=self.dialect) 2982 2983 if this.is_type(exp.DType.MAP): 2984 bracket = f"({bracket})[1]" 2985 2986 return bracket 2987 2988 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2989 func = expression.this 2990 2991 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2992 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2993 if isinstance(func, exp.ArrayAgg): 2994 if not isinstance(order := expression.expression, exp.Order): 2995 return self.sql(func) 2996 2997 # Save the original column for FILTER clause (before wrapping with Order) 2998 original_this = func.this 2999 3000 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 3001 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 3002 func.set( 3003 "this", 3004 exp.Order( 3005 this=func.this.copy(), 3006 expressions=order.expressions, 3007 ), 3008 ) 3009 3010 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 3011 # Use original_this (not the Order-wrapped version) for the FILTER condition 3012 array_agg_sql = self.function_fallback_sql(func) 3013 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 3014 3015 # For other functions (like PERCENTILES), use existing logic 3016 expression_sql = self.sql(expression, "expression") 3017 3018 if isinstance(func, exp.PERCENTILES): 3019 # Make the order key the first arg and slide the fraction to the right 3020 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 3021 order_col = expression.find(exp.Ordered) 3022 if order_col: 3023 func.set("expression", func.this) 3024 func.set("this", order_col.this) 3025 3026 this = self.sql(expression, "this").rstrip(")") 3027 3028 return f"{this}{expression_sql})" 3029 3030 def length_sql(self, expression: exp.Length) -> str: 3031 arg = expression.this 3032 3033 # Dialects like BQ and Snowflake also accept binary values as args, so 3034 # DDB will attempt to infer the type or resort to case/when resolution 3035 if not expression.args.get("binary") or arg.is_string: 3036 return self.func("LENGTH", arg) 3037 3038 if not arg.type: 3039 from sqlglot.optimizer.annotate_types import annotate_types 3040 3041 arg = annotate_types(arg, dialect=self.dialect) 3042 3043 if arg.is_type(*exp.DataType.TEXT_TYPES): 3044 return self.func("LENGTH", arg) 3045 3046 # We need these casts to make duckdb's static type checker happy 3047 blob = exp.cast(arg, exp.DType.VARBINARY) 3048 varchar = exp.cast(arg, exp.DType.VARCHAR) 3049 3050 case = ( 3051 exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg])) 3052 .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob)) 3053 .else_(exp.Anonymous(this="LENGTH", expressions=[varchar])) 3054 ) 3055 return self.sql(case) 3056 3057 def bitlength_sql(self, expression: exp.BitLength) -> str: 3058 if not _is_binary(arg := expression.this): 3059 return self.func("BIT_LENGTH", arg) 3060 3061 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 3062 return self.sql(exp.ByteLength(this=blob) * exp.Literal.number(8)) 3063 3064 def chr_sql(self, expression: exp.Chr, name: str = "CHR") -> str: 3065 arg = expression.expressions[0] 3066 if arg.is_type(*exp.DataType.REAL_TYPES): 3067 arg = exp.cast(arg, exp.DType.INT) 3068 return self.func("CHR", arg) 3069 3070 def collation_sql(self, expression: exp.Collation) -> str: 3071 self.unsupported("COLLATION function is not supported by DuckDB") 3072 return self.function_fallback_sql(expression) 3073 3074 def collate_sql(self, expression: exp.Collate) -> str: 3075 if not expression.expression.is_string: 3076 return super().collate_sql(expression) 3077 3078 raw = expression.expression.name 3079 if not raw: 3080 return self.sql(expression.this) 3081 3082 parts = [] 3083 for part in raw.split("-"): 3084 lower = part.lower() 3085 if lower not in _SNOWFLAKE_COLLATION_DEFAULTS: 3086 if lower in _SNOWFLAKE_COLLATION_UNSUPPORTED: 3087 self.unsupported( 3088 f"Snowflake collation specifier '{part}' has no DuckDB equivalent" 3089 ) 3090 parts.append(lower) 3091 3092 if not parts: 3093 return self.sql(expression.this) 3094 return super().collate_sql( 3095 exp.Collate(this=expression.this, expression=exp.var(".".join(parts))) 3096 ) 3097 3098 def _validate_regexp_flags(self, flags: exp.Expr | None, supported_flags: str) -> str | None: 3099 """ 3100 Validate and filter regexp flags for DuckDB compatibility. 3101 3102 Args: 3103 flags: The flags expression to validate 3104 supported_flags: String of supported flags (e.g., "ims", "cims"). 3105 Only these flags will be returned. 3106 3107 Returns: 3108 Validated/filtered flag string, or None if no valid flags remain 3109 """ 3110 if not isinstance(flags, exp.Expr): 3111 return None 3112 3113 if not flags.is_string: 3114 self.unsupported("Non-literal regexp flags are not fully supported in DuckDB") 3115 return None 3116 3117 flag_str = flags.this 3118 unsupported = set(flag_str) - set(supported_flags) 3119 3120 if unsupported: 3121 self.unsupported( 3122 f"Regexp flags {sorted(unsupported)} are not supported in this context" 3123 ) 3124 3125 flag_str = "".join(f for f in flag_str if f in supported_flags) 3126 return flag_str if flag_str else None 3127 3128 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3129 this = expression.this 3130 pattern = expression.expression 3131 position = expression.args.get("position") 3132 parameters = expression.args.get("parameters") 3133 3134 # Validate flags - only "ims" flags are supported for embedded patterns 3135 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3136 3137 if position: 3138 this = exp.Substring(this=this, start=position) 3139 3140 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3141 if validated_flags: 3142 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3143 3144 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3145 result = ( 3146 exp.case() 3147 .when( 3148 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3149 exp.Literal.number(0), 3150 ) 3151 .else_( 3152 exp.Length( 3153 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3154 ) 3155 ) 3156 ) 3157 3158 return self.sql(result) 3159 3160 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3161 subject = expression.this 3162 pattern = expression.expression 3163 replacement = expression.args.get("replacement") or exp.Literal.string("") 3164 position = expression.args.get("position") 3165 occurrence = expression.args.get("occurrence") 3166 modifiers = expression.args.get("modifiers") 3167 3168 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3169 3170 # Handle occurrence (only literals supported) 3171 if occurrence and not occurrence.is_int: 3172 self.unsupported("REGEXP_REPLACE with non-literal occurrence") 3173 else: 3174 occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0 3175 if occurrence > 1: 3176 self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported") 3177 # flag duckdb to do either all or none, single_replace check is for duckdb round trip 3178 elif ( 3179 occurrence == 0 3180 and "g" not in validated_flags 3181 and not expression.args.get("single_replace") 3182 ): 3183 validated_flags += "g" 3184 3185 # Handle position (only literals supported) 3186 prefix = None 3187 if position and not position.is_int: 3188 self.unsupported("REGEXP_REPLACE with non-literal position") 3189 elif position and position.is_int and position.to_py() > 1: 3190 pos = position.to_py() 3191 prefix = exp.Substring( 3192 this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1) 3193 ) 3194 subject = exp.Substring(this=subject, start=exp.Literal.number(pos)) 3195 3196 result: exp.Expr = exp.Anonymous( 3197 this="REGEXP_REPLACE", 3198 expressions=[ 3199 subject, 3200 pattern, 3201 replacement, 3202 exp.Literal.string(validated_flags) if validated_flags else None, 3203 ], 3204 ) 3205 3206 if prefix: 3207 result = exp.Concat(expressions=[prefix, result]) 3208 3209 return self.sql(result) 3210 3211 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3212 this = expression.this 3213 pattern = expression.expression 3214 flag = expression.args.get("flag") 3215 3216 if expression.args.get("full_match"): 3217 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3218 flag = exp.Literal.string(validated_flags) if validated_flags else None 3219 return self.func("REGEXP_FULL_MATCH", this, pattern, flag) 3220 3221 return self.func("REGEXP_MATCHES", this, pattern, flag) 3222 3223 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3224 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3225 this = expression.this 3226 expr = expression.expression 3227 max_dist = expression.args.get("max_dist") 3228 3229 if max_dist is None: 3230 return self.func("LEVENSHTEIN", this, expr) 3231 3232 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3233 levenshtein = exp.Levenshtein(this=this, expression=expr) 3234 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3235 3236 def pad_sql(self, expression: exp.Pad) -> str: 3237 """ 3238 Handle RPAD/LPAD for VARCHAR and BINARY types. 3239 3240 For VARCHAR: Delegate to parent class 3241 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3242 """ 3243 string_arg = expression.this 3244 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3245 3246 if _is_binary(string_arg) or _is_binary(fill_arg): 3247 length_arg = expression.expression 3248 is_left = expression.args.get("is_left") 3249 3250 input_len = exp.ByteLength(this=string_arg) 3251 chars_needed = length_arg - input_len 3252 pad_count = exp.Greatest( 3253 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3254 ) 3255 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3256 3257 left, right = string_arg, repeat_expr 3258 if is_left: 3259 left, right = right, left 3260 3261 result = exp.DPipe(this=left, expression=right) 3262 return self.sql(result) 3263 3264 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3265 return super().pad_sql(expression) 3266 3267 def minhash_sql(self, expression: exp.Minhash) -> str: 3268 k = expression.this 3269 exprs = expression.expressions 3270 3271 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3272 self.unsupported( 3273 "MINHASH with multiple expressions or * requires manual query restructuring" 3274 ) 3275 return self.func("MINHASH", k, *exprs) 3276 3277 expr = exprs[0] 3278 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3279 return f"({self.sql(result)})" 3280 3281 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3282 expr = expression.this 3283 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3284 return f"({self.sql(result)})" 3285 3286 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3287 expr = expression.this 3288 result = exp.replace_placeholders(self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr) 3289 return f"({self.sql(result)})" 3290 3291 def arrayuniqueagg_sql(self, expression: exp.ArrayUniqueAgg) -> str: 3292 return self.sql( 3293 exp.Filter( 3294 this=exp.func("LIST", exp.Distinct(expressions=[expression.this])), 3295 expression=exp.Where(this=expression.this.copy().is_(exp.null()).not_()), 3296 ) 3297 ) 3298 3299 def arrayunionagg_sql(self, expression: exp.ArrayUnionAgg) -> str: 3300 self.unsupported("ARRAY_UNION_AGG is not supported in DuckDB") 3301 return self.function_fallback_sql(expression) 3302 3303 def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str: 3304 arr = expression.this 3305 func = self.func("LIST_DISTINCT", arr) 3306 3307 if expression.args.get("check_null"): 3308 add_null_to_array = exp.func( 3309 "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null() 3310 ) 3311 return self.sql( 3312 exp.If( 3313 this=exp.NEQ( 3314 this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr) 3315 ), 3316 true=add_null_to_array, 3317 false=func, 3318 ) 3319 ) 3320 3321 return func 3322 3323 def arrayintersect_sql(self, expression: exp.ArrayIntersect) -> str: 3324 if expression.args.get("is_multiset") and len(expression.expressions) == 2: 3325 return self._array_bag_sql( 3326 self.ARRAY_INTERSECTION_CONDITION, 3327 expression.expressions[0], 3328 expression.expressions[1], 3329 ) 3330 return self.function_fallback_sql(expression) 3331 3332 def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str: 3333 arr1, arr2 = expression.this, expression.expression 3334 if expression.args.get("is_multiset"): 3335 return self._array_bag_sql(self.ARRAY_EXCEPT_CONDITION, arr1, arr2) 3336 return self.sql( 3337 exp.replace_placeholders(self.ARRAY_EXCEPT_SET_TEMPLATE, arr1=arr1, arr2=arr2) 3338 ) 3339 3340 def arrayslice_sql(self, expression: exp.ArraySlice) -> str: 3341 """ 3342 Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's 3343 ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE 3344 expressions that adjust the index at query time: 3345 - start: CASE WHEN start >= 0 THEN start + 1 ELSE start END 3346 - end: CASE WHEN end < 0 THEN end - 1 ELSE end END 3347 """ 3348 start, end = expression.args.get("start"), expression.args.get("end") 3349 3350 if expression.args.get("zero_based"): 3351 if start is not None: 3352 start = ( 3353 exp.case() 3354 .when( 3355 exp.GTE(this=start.copy(), expression=exp.Literal.number(0)), 3356 exp.Add(this=start.copy(), expression=exp.Literal.number(1)), 3357 ) 3358 .else_(start) 3359 ) 3360 if end is not None: 3361 end = ( 3362 exp.case() 3363 .when( 3364 exp.LT(this=end.copy(), expression=exp.Literal.number(0)), 3365 exp.Sub(this=end.copy(), expression=exp.Literal.number(1)), 3366 ) 3367 .else_(end) 3368 ) 3369 3370 return self.func("ARRAY_SLICE", expression.this, start, end, expression.args.get("step")) 3371 3372 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3373 args = expression.expressions 3374 3375 if not args: 3376 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3377 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3378 3379 # Build placeholder values for template 3380 lengths = [exp.Length(this=arg) for arg in args] 3381 max_len = ( 3382 lengths[0] 3383 if len(lengths) == 1 3384 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3385 ) 3386 3387 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3388 empty_struct = exp.func( 3389 "STRUCT", 3390 *[ 3391 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3392 for i in range(len(args)) 3393 ], 3394 ) 3395 3396 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3397 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3398 index = exp.column("__i") + 1 3399 transform_struct = exp.func( 3400 "STRUCT", 3401 *[ 3402 exp.PropertyEQ( 3403 this=exp.Literal.string(f"${i + 1}"), 3404 expression=exp.func("COALESCE", arg, exp.array())[index], 3405 ) 3406 for i, arg in enumerate(args) 3407 ], 3408 ) 3409 3410 result = exp.replace_placeholders( 3411 self.ARRAYS_ZIP_TEMPLATE.copy(), 3412 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3413 all_empty_check=exp.and_( 3414 *[ 3415 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3416 for arg in args 3417 ] 3418 ), 3419 empty_struct=empty_struct, 3420 max_len=max_len, 3421 transform_struct=transform_struct, 3422 ) 3423 return self.sql(result) 3424 3425 def lower_sql(self, expression: exp.Lower) -> str: 3426 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3427 return _gen_with_cast_to_blob(self, expression, result_sql) 3428 3429 def upper_sql(self, expression: exp.Upper) -> str: 3430 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3431 return _gen_with_cast_to_blob(self, expression, result_sql) 3432 3433 def reverse_sql(self, expression: exp.Reverse) -> str: 3434 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3435 return _gen_with_cast_to_blob(self, expression, result_sql) 3436 3437 def _left_right_sql(self, expression: exp.Left | exp.Right, func_name: str) -> str: 3438 arg = expression.this 3439 length = expression.expression 3440 is_binary = _is_binary(arg) 3441 3442 if is_binary: 3443 # LEFT/RIGHT(blob, n) becomes UNHEX(LEFT/RIGHT(HEX(blob), n * 2)) 3444 # Each byte becomes 2 hex chars, so multiply length by 2 3445 hex_arg = exp.Hex(this=arg) 3446 hex_length = exp.Mul(this=length, expression=exp.Literal.number(2)) 3447 result: exp.Expression = exp.Unhex( 3448 this=exp.Anonymous(this=func_name, expressions=[hex_arg, hex_length]) 3449 ) 3450 else: 3451 result = exp.Anonymous(this=func_name, expressions=[arg, length]) 3452 3453 if expression.args.get("negative_length_returns_empty"): 3454 empty: exp.Expression = exp.Literal.string("") 3455 if is_binary: 3456 empty = exp.Unhex(this=empty) 3457 result = exp.case().when(length < exp.Literal.number(0), empty).else_(result) 3458 3459 return self.sql(result) 3460 3461 def left_sql(self, expression: exp.Left) -> str: 3462 return self._left_right_sql(expression, "LEFT") 3463 3464 def right_sql(self, expression: exp.Right) -> str: 3465 return self._left_right_sql(expression, "RIGHT") 3466 3467 def rtrimmedlength_sql(self, expression: exp.RtrimmedLength) -> str: 3468 return self.func("LENGTH", exp.Trim(this=expression.this, position="TRAILING")) 3469 3470 def stuff_sql(self, expression: exp.Stuff) -> str: 3471 base = expression.this 3472 start = expression.args["start"] 3473 length = expression.args["length"] 3474 insertion = expression.expression 3475 is_binary = _is_binary(base) 3476 3477 if is_binary: 3478 # DuckDB's SUBSTRING doesn't accept BLOB; operate on the HEX string instead 3479 # (each byte = 2 hex chars), then UNHEX back to BLOB 3480 base = exp.Hex(this=base) 3481 insertion = exp.Hex(this=insertion) 3482 left = exp.Substring( 3483 this=base.copy(), 3484 start=exp.Literal.number(1), 3485 length=(start.copy() - exp.Literal.number(1)) * exp.Literal.number(2), 3486 ) 3487 right = exp.Substring( 3488 this=base.copy(), 3489 start=((start + length) - exp.Literal.number(1)) * exp.Literal.number(2) 3490 + exp.Literal.number(1), 3491 ) 3492 else: 3493 left = exp.Substring( 3494 this=base.copy(), 3495 start=exp.Literal.number(1), 3496 length=start.copy() - exp.Literal.number(1), 3497 ) 3498 right = exp.Substring(this=base.copy(), start=start + length) 3499 result: exp.Expr = exp.DPipe( 3500 this=exp.DPipe(this=left, expression=insertion), expression=right 3501 ) 3502 3503 if is_binary: 3504 result = exp.Unhex(this=result) 3505 3506 return self.sql(result) 3507 3508 def rand_sql(self, expression: exp.Rand) -> str: 3509 seed = expression.this 3510 if seed is not None: 3511 self.unsupported("RANDOM with seed is not supported in DuckDB") 3512 3513 lower = expression.args.get("lower") 3514 upper = expression.args.get("upper") 3515 3516 if lower and upper: 3517 # scale DuckDB's [0,1) to the specified range 3518 range_size = exp.paren(upper - lower) 3519 scaled = exp.Add(this=lower, expression=exp.func("random") * range_size) 3520 3521 # For now we assume that if bounds are set, return type is BIGINT. Snowflake/Teradata 3522 result = exp.cast(scaled, exp.DType.BIGINT) 3523 return self.sql(result) 3524 3525 # Default DuckDB behavior - just return RANDOM() as float 3526 return "RANDOM()" 3527 3528 def bytelength_sql(self, expression: exp.ByteLength) -> str: 3529 arg = expression.this 3530 3531 # Check if it's a text type (handles both literals and annotated expressions) 3532 if arg.is_type(*exp.DataType.TEXT_TYPES): 3533 return self.func("OCTET_LENGTH", exp.Encode(this=arg)) 3534 3535 # Default: pass through as-is (conservative for DuckDB, handles binary and unannotated) 3536 return self.func("OCTET_LENGTH", arg) 3537 3538 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3539 # DuckDB TO_BASE64 requires BLOB input 3540 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3541 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3542 result = expression.this 3543 3544 # Check if input is a string type - ENCODE only accepts VARCHAR 3545 if result.is_type(*exp.DataType.TEXT_TYPES): 3546 result = exp.Encode(this=result) 3547 3548 result = exp.ToBase64(this=result) 3549 3550 max_line_length = expression.args.get("max_line_length") 3551 alphabet = expression.args.get("alphabet") 3552 3553 # Handle custom alphabet by replacing standard chars with custom ones 3554 result = _apply_base64_alphabet_replacements(result, alphabet) 3555 3556 # Handle max_line_length by inserting newlines every N characters 3557 line_length = ( 3558 t.cast(int, max_line_length.to_py()) 3559 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3560 else 0 3561 ) 3562 if line_length > 0: 3563 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3564 result = exp.Trim( 3565 this=exp.RegexpReplace( 3566 this=result, 3567 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3568 replacement=exp.Concat(expressions=[exp.Literal.string("\\1"), newline.copy()]), 3569 ), 3570 expression=newline, 3571 position="TRAILING", 3572 ) 3573 3574 return self.sql(result) 3575 3576 def replace_sql(self, expression: exp.Replace) -> str: 3577 result_sql = self.func( 3578 "REPLACE", 3579 _cast_to_varchar(expression.this), 3580 _cast_to_varchar(expression.expression), 3581 _cast_to_varchar(expression.args.get("replacement")), 3582 ) 3583 return _gen_with_cast_to_blob(self, expression, result_sql) 3584 3585 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3586 _prepare_binary_bitwise_args(expression) 3587 result_sql = self.binary(expression, op) 3588 return _gen_with_cast_to_blob(self, expression, result_sql) 3589 3590 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3591 _prepare_binary_bitwise_args(expression) 3592 result_sql = self.func("XOR", expression.this, expression.expression) 3593 return _gen_with_cast_to_blob(self, expression, result_sql) 3594 3595 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3596 this = expression.this 3597 key = expression.args.get("key") 3598 key_sql = key.name if isinstance(key, exp.Expr) else "" 3599 value_sql = self.sql(expression, "value") 3600 3601 kv_sql = f"{key_sql} := {value_sql}" 3602 3603 # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3604 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3605 if isinstance(this, exp.Struct) and not this.expressions: 3606 return self.func("STRUCT_PACK", kv_sql) 3607 3608 return self.func("STRUCT_INSERT", this, kv_sql) 3609 3610 def mapcat_sql(self, expression: exp.MapCat) -> str: 3611 result = exp.replace_placeholders( 3612 self.MAPCAT_TEMPLATE.copy(), 3613 map1=expression.this, 3614 map2=expression.expression, 3615 ) 3616 return self.sql(result) 3617 3618 def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str: 3619 return self.func( 3620 "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this 3621 ) 3622 3623 def mapdelete_sql(self, expression: exp.MapDelete) -> str: 3624 map_arg = expression.this 3625 keys_to_delete = expression.expressions 3626 3627 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3628 3629 lambda_expr = exp.Lambda( 3630 this=exp.In(this=x_dot_key, expressions=keys_to_delete).not_(), 3631 expressions=[exp.to_identifier("x")], 3632 ) 3633 result = exp.func( 3634 "MAP_FROM_ENTRIES", 3635 exp.ArrayFilter(this=exp.func("MAP_ENTRIES", map_arg), expression=lambda_expr), 3636 ) 3637 return self.sql(result) 3638 3639 def mappick_sql(self, expression: exp.MapPick) -> str: 3640 map_arg = expression.this 3641 keys_to_pick = expression.expressions 3642 3643 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3644 3645 if len(keys_to_pick) == 1 and keys_to_pick[0].is_type(exp.DType.ARRAY): 3646 lambda_expr = exp.Lambda( 3647 this=exp.func("ARRAY_CONTAINS", keys_to_pick[0], x_dot_key), 3648 expressions=[exp.to_identifier("x")], 3649 ) 3650 else: 3651 lambda_expr = exp.Lambda( 3652 this=exp.In(this=x_dot_key, expressions=keys_to_pick), 3653 expressions=[exp.to_identifier("x")], 3654 ) 3655 3656 result = exp.func( 3657 "MAP_FROM_ENTRIES", 3658 exp.func("LIST_FILTER", exp.func("MAP_ENTRIES", map_arg), lambda_expr), 3659 ) 3660 return self.sql(result) 3661 3662 def mapsize_sql(self, expression: exp.MapSize) -> str: 3663 return self.func("CARDINALITY", expression.this) 3664 3665 @unsupported_args("update_flag") 3666 def mapinsert_sql(self, expression: exp.MapInsert) -> str: 3667 map_arg = expression.this 3668 key = expression.args.get("key") 3669 value = expression.args.get("value") 3670 3671 map_type = map_arg.type 3672 3673 if value is not None: 3674 if map_type and map_type.expressions and len(map_type.expressions) > 1: 3675 # Extract the value type from MAP(key_type, value_type) 3676 value_type = map_type.expressions[1] 3677 # Cast value to match the map's value type to avoid type conflicts 3678 value = exp.cast(value, value_type) 3679 # else: polymorphic MAP case - no type parameters available, use value as-is 3680 3681 # Create a single-entry map for the new key-value pair 3682 new_entry_struct = exp.Struct(expressions=[exp.PropertyEQ(this=key, expression=value)]) 3683 new_entry: exp.Expression = exp.ToMap(this=new_entry_struct) 3684 3685 # Use MAP_CONCAT to merge the original map with the new entry 3686 # This automatically handles both insert and update cases 3687 result = exp.func("MAP_CONCAT", map_arg, new_entry) 3688 3689 return self.sql(result) 3690 3691 def startswith_sql(self, expression: exp.StartsWith) -> str: 3692 return self.func( 3693 "STARTS_WITH", 3694 _cast_to_varchar(expression.this), 3695 _cast_to_varchar(expression.expression), 3696 ) 3697 3698 def space_sql(self, expression: exp.Space) -> str: 3699 # DuckDB's REPEAT requires BIGINT for the count parameter 3700 return self.sql( 3701 exp.Repeat( 3702 this=exp.Literal.string(" "), 3703 times=exp.cast(expression.this, exp.DType.BIGINT), 3704 ) 3705 ) 3706 3707 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3708 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3709 if isinstance(expression.this, exp.Generator): 3710 # Preserve alias, joins, and other table-level args 3711 table = exp.Table( 3712 this=expression.this, 3713 alias=expression.args.get("alias"), 3714 joins=expression.args.get("joins"), 3715 ) 3716 return self.sql(table) 3717 3718 return super().tablefromrows_sql(expression) 3719 3720 def unnest_sql(self, expression: exp.Unnest) -> str: 3721 explode_array = expression.args.get("explode_array") 3722 if explode_array: 3723 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3724 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3725 expression.expressions.append( 3726 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3727 ) 3728 3729 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3730 alias = expression.args.get("alias") 3731 if isinstance(alias, exp.TableAlias): 3732 expression.set("alias", None) 3733 if alias.columns: 3734 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3735 3736 unnest_sql = super().unnest_sql(expression) 3737 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3738 return self.sql(select) 3739 3740 return super().unnest_sql(expression) 3741 3742 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3743 this = expression.this 3744 3745 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3746 # DuckDB should render IGNORE NULLS only for the general-purpose 3747 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 3748 return super().ignorenulls_sql(expression) 3749 3750 if isinstance(this, exp.First): 3751 this = exp.AnyValue(this=this.this) 3752 3753 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3754 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3755 3756 return self.sql(this) 3757 3758 def split_sql(self, expression: exp.Split) -> str: 3759 base_func = exp.func("STR_SPLIT", expression.this, expression.expression) 3760 3761 case_expr = exp.case().else_(base_func) 3762 needs_case = False 3763 3764 if expression.args.get("null_returns_null"): 3765 case_expr = case_expr.when(expression.expression.is_(exp.null()), exp.null()) 3766 needs_case = True 3767 3768 if expression.args.get("empty_delimiter_returns_whole"): 3769 # When delimiter is empty string, return input string as single array element 3770 array_with_input = exp.array(expression.this) 3771 case_expr = case_expr.when( 3772 expression.expression.eq(exp.Literal.string("")), array_with_input 3773 ) 3774 needs_case = True 3775 3776 return self.sql(case_expr if needs_case else base_func) 3777 3778 def splitpart_sql(self, expression: exp.SplitPart) -> str: 3779 string_arg = expression.this 3780 delimiter_arg = expression.args.get("delimiter") 3781 part_index_arg = expression.args.get("part_index") 3782 3783 if delimiter_arg and part_index_arg: 3784 # Handle Snowflake's "index 0 and 1 both return first element" behavior 3785 if expression.args.get("part_index_zero_as_one"): 3786 # Convert 0 to 1 for compatibility 3787 3788 part_index_arg = exp.Paren( 3789 this=exp.case() 3790 .when(part_index_arg.eq(exp.Literal.number("0")), exp.Literal.number("1")) 3791 .else_(part_index_arg) 3792 ) 3793 3794 # Use Anonymous to avoid recursion 3795 base_func_expr: exp.Expr = exp.Anonymous( 3796 this="SPLIT_PART", expressions=[string_arg, delimiter_arg, part_index_arg] 3797 ) 3798 needs_case_transform = False 3799 case_expr = exp.case().else_(base_func_expr) 3800 3801 if expression.args.get("empty_delimiter_returns_whole"): 3802 # When delimiter is empty string: 3803 # - Return whole string if part_index is 1 or -1 3804 # - Return empty string otherwise 3805 empty_case = exp.Paren( 3806 this=exp.case() 3807 .when( 3808 exp.or_( 3809 part_index_arg.eq(exp.Literal.number("1")), 3810 part_index_arg.eq(exp.Literal.number("-1")), 3811 ), 3812 string_arg, 3813 ) 3814 .else_(exp.Literal.string("")) 3815 ) 3816 3817 case_expr = case_expr.when(delimiter_arg.eq(exp.Literal.string("")), empty_case) 3818 needs_case_transform = True 3819 3820 """ 3821 Output looks something like this: 3822 3823 CASE 3824 WHEN delimiter is '' THEN 3825 ( 3826 CASE 3827 WHEN adjusted_part_index = 1 OR adjusted_part_index = -1 THEN input 3828 ELSE '' END 3829 ) 3830 ELSE SPLIT_PART(input, delimiter, adjusted_part_index) 3831 END 3832 3833 """ 3834 return self.sql(case_expr if needs_case_transform else base_func_expr) 3835 3836 return self.function_fallback_sql(expression) 3837 3838 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3839 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3840 # DuckDB should render RESPECT NULLS only for the general-purpose 3841 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 3842 return super().respectnulls_sql(expression) 3843 3844 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3845 return self.sql(expression, "this") 3846 3847 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3848 null = expression.args.get("null") 3849 3850 if expression.args.get("null_is_empty"): 3851 x = exp.to_identifier("x") 3852 list_transform = exp.Transform( 3853 this=expression.this.copy(), 3854 expression=exp.Lambda( 3855 this=exp.Coalesce( 3856 this=exp.cast(x, "TEXT"), expressions=[exp.Literal.string("")] 3857 ), 3858 expressions=[x], 3859 ), 3860 ) 3861 array_to_string = exp.ArrayToString( 3862 this=list_transform, expression=expression.expression 3863 ) 3864 if expression.args.get("null_delim_is_null"): 3865 return self.sql( 3866 exp.case() 3867 .when(expression.expression.copy().is_(exp.null()), exp.null()) 3868 .else_(array_to_string) 3869 ) 3870 return self.sql(array_to_string) 3871 3872 if null: 3873 x = exp.to_identifier("x") 3874 return self.sql( 3875 exp.ArrayToString( 3876 this=exp.Transform( 3877 this=expression.this, 3878 expression=exp.Lambda( 3879 this=exp.Coalesce(this=x, expressions=[null]), 3880 expressions=[x], 3881 ), 3882 ), 3883 expression=expression.expression, 3884 ) 3885 ) 3886 3887 return self.func("ARRAY_TO_STRING", expression.this, expression.expression) 3888 3889 def concatws_sql(self, expression: exp.ConcatWs) -> str: 3890 # DuckDB-specific: handle binary types using DPipe (||) operator 3891 separator = seq_get(expression.expressions, 0) 3892 args = expression.expressions[1:] 3893 3894 if any(_is_binary(arg) for arg in [separator, *args]): 3895 result = args[0] 3896 for arg in args[1:]: 3897 result = exp.DPipe( 3898 this=exp.DPipe(this=result, expression=separator), expression=arg 3899 ) 3900 return self.sql(result) 3901 3902 return super().concatws_sql(expression) 3903 3904 def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str: 3905 this = expression.this 3906 group = expression.args.get("group") 3907 params = expression.args.get("parameters") 3908 position = expression.args.get("position") 3909 occurrence = expression.args.get("occurrence") 3910 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3911 3912 # Handle Snowflake's 'e' flag: it enables capture group extraction 3913 # In DuckDB, this is controlled by the group parameter directly 3914 if params and params.is_string and "e" in params.name: 3915 params = exp.Literal.string(params.name.replace("e", "")) 3916 3917 validated_flags = self._validate_regexp_flags(params, supported_flags="cims") 3918 3919 # Strip default group when no following params (DuckDB default is same as group=0) 3920 if ( 3921 not validated_flags 3922 and group 3923 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3924 ): 3925 group = None 3926 3927 flags_expr = exp.Literal.string(validated_flags) if validated_flags else None 3928 3929 # use substring to handle position argument 3930 if position and (not position.is_int or position.to_py() > 1): 3931 this = exp.Substring(this=this, start=position) 3932 3933 if null_if_pos_overflow: 3934 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3935 3936 is_extract_all = isinstance(expression, exp.RegexpExtractAll) 3937 non_single_occurrence = occurrence and (not occurrence.is_int or occurrence.to_py() > 1) 3938 3939 if is_extract_all or non_single_occurrence: 3940 name = "REGEXP_EXTRACT_ALL" 3941 else: 3942 name = "REGEXP_EXTRACT" 3943 3944 result: exp.Expr = exp.Anonymous( 3945 this=name, expressions=[this, expression.expression, group, flags_expr] 3946 ) 3947 3948 # Array slicing for REGEXP_EXTRACT_ALL with occurrence 3949 if is_extract_all and non_single_occurrence: 3950 result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)]) 3951 # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1 3952 elif non_single_occurrence: 3953 result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence]) 3954 3955 return self.sql(result) 3956 3957 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 3958 return self._regexp_extract_sql(expression) 3959 3960 def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str: 3961 return self._regexp_extract_sql(expression) 3962 3963 def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str: 3964 this = expression.this 3965 pattern = expression.expression 3966 position = expression.args.get("position") 3967 orig_occ = expression.args.get("occurrence") 3968 occurrence = orig_occ or exp.Literal.number(1) 3969 option = expression.args.get("option") 3970 parameters = expression.args.get("parameters") 3971 3972 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3973 if validated_flags: 3974 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3975 3976 # Handle starting position offset 3977 pos_offset: exp.Expr = exp.Literal.number(0) 3978 if position and (not position.is_int or position.to_py() > 1): 3979 this = exp.Substring(this=this, start=position) 3980 pos_offset = position - exp.Literal.number(1) 3981 3982 # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x))) 3983 def sum_lengths(func_name: str, end: exp.Expr) -> exp.Expr: 3984 lst = exp.Bracket( 3985 this=exp.Anonymous(this=func_name, expressions=[this, pattern]), 3986 expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)], 3987 offset=1, 3988 ) 3989 transform = exp.Anonymous( 3990 this="LIST_TRANSFORM", 3991 expressions=[ 3992 lst, 3993 exp.Lambda( 3994 this=exp.Length(this=exp.to_identifier("x")), 3995 expressions=[exp.to_identifier("x")], 3996 ), 3997 ], 3998 ) 3999 return exp.Coalesce( 4000 this=exp.Anonymous(this="LIST_SUM", expressions=[transform]), 4001 expressions=[exp.Literal.number(0)], 4002 ) 4003 4004 # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset 4005 base_pos: exp.Expr = ( 4006 exp.Literal.number(1) 4007 + sum_lengths("STRING_SPLIT_REGEX", occurrence) 4008 + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1)) 4009 + pos_offset 4010 ) 4011 4012 # option=1: add match length for end position 4013 if option and option.is_int and option.to_py() == 1: 4014 match_at_occ = exp.Bracket( 4015 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]), 4016 expressions=[occurrence], 4017 offset=1, 4018 ) 4019 base_pos = base_pos + exp.Coalesce( 4020 this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)] 4021 ) 4022 4023 # NULL checks for all provided arguments 4024 # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST 4025 null_args = [ 4026 expression.this, 4027 expression.expression, 4028 position, 4029 orig_occ, 4030 option, 4031 parameters, 4032 ] 4033 null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg] 4034 4035 matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 4036 4037 return self.sql( 4038 exp.case() 4039 .when(exp.or_(*null_checks), exp.Null()) 4040 .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0)) 4041 .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0)) 4042 .else_(base_pos) 4043 ) 4044 4045 @unsupported_args("culture") 4046 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 4047 fmt = expression.args.get("format") 4048 if fmt and fmt.is_int: 4049 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 4050 4051 self.unsupported("Only integer formats are supported by NumberToStr") 4052 return self.function_fallback_sql(expression) 4053 4054 def autoincrementcolumnconstraint_sql(self, _) -> str: 4055 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 4056 return "" 4057 4058 def aliases_sql(self, expression: exp.Aliases) -> str: 4059 this = expression.this 4060 if isinstance(this, exp.Posexplode): 4061 return self.posexplode_sql(this) 4062 4063 return super().aliases_sql(expression) 4064 4065 def posexplode_sql(self, expression: exp.Posexplode) -> str: 4066 this = expression.this 4067 parent = expression.parent 4068 4069 # The default Spark aliases are "pos" and "col", unless specified otherwise 4070 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 4071 4072 if isinstance(parent, exp.Aliases): 4073 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 4074 pos, col = parent.expressions 4075 elif isinstance(parent, exp.Table): 4076 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 4077 alias = parent.args.get("alias") 4078 if alias: 4079 pos, col = alias.columns or [pos, col] 4080 alias.pop() 4081 4082 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 4083 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 4084 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 4085 gen_subscripts = self.sql( 4086 exp.Alias( 4087 this=exp.Anonymous( 4088 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 4089 ) 4090 - exp.Literal.number(1), 4091 alias=pos, 4092 ) 4093 ) 4094 4095 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 4096 4097 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 4098 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 4099 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 4100 4101 return posexplode_sql 4102 4103 def addmonths_sql(self, expression: exp.AddMonths) -> str: 4104 """ 4105 Handles three key issues: 4106 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 4107 2. End-of-month preservation: If input is last day of month, result is last day of result month 4108 3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 4109 """ 4110 from sqlglot.optimizer.annotate_types import annotate_types 4111 4112 this = expression.this 4113 if not this.type: 4114 this = annotate_types(this, dialect=self.dialect) 4115 4116 if this.is_type(*exp.DataType.TEXT_TYPES): 4117 this = exp.Cast(this=this, to=exp.DataType(this=exp.DType.TIMESTAMP)) 4118 4119 # Detect float/decimal months to apply rounding (Snowflake behavior) 4120 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 4121 months_expr = expression.expression 4122 if not months_expr.type: 4123 months_expr = annotate_types(months_expr, dialect=self.dialect) 4124 4125 # Build interval or to_months expression based on type 4126 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 4127 interval_or_to_months = ( 4128 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 4129 if months_expr.is_type( 4130 exp.DType.FLOAT, 4131 exp.DType.DOUBLE, 4132 exp.DType.DECIMAL, 4133 ) 4134 # Integer case: standard INTERVAL N MONTH syntax 4135 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 4136 ) 4137 4138 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 4139 4140 # Apply end-of-month preservation if Snowflake flag is set 4141 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 4142 preserve_eom = expression.args.get("preserve_end_of_month") 4143 result_expr = ( 4144 exp.case() 4145 .when( 4146 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 4147 exp.func("LAST_DAY", date_add_expr), 4148 ) 4149 .else_(date_add_expr) 4150 if preserve_eom 4151 else date_add_expr 4152 ) 4153 4154 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 4155 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 4156 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 4157 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 4158 if this.is_type(exp.DType.DATE, exp.DType.TIMESTAMPTZ): 4159 return self.sql(exp.Cast(this=result_expr, to=this.type)) 4160 return self.sql(result_expr) 4161 4162 def format_sql(self, expression: exp.Format) -> str: 4163 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 4164 return self.func("FORMAT", "'{}'", expression.expressions[0]) 4165 4166 return self.function_fallback_sql(expression) 4167 4168 def hexstring_sql( 4169 self, expression: exp.HexString, binary_function_repr: str | None = None 4170 ) -> str: 4171 # UNHEX('FF') correctly produces blob \xFF in DuckDB 4172 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 4173 4174 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 4175 unit = expression.args.get("unit") 4176 date = expression.this 4177 4178 week_start = _week_unit_to_dow(unit) 4179 unit = unit_to_str(expression) 4180 4181 if week_start: 4182 result = self.sql( 4183 _build_week_trunc_expression(date, week_start, preserve_start_day=True) 4184 ) 4185 else: 4186 result = self.func("DATE_TRUNC", unit, date) 4187 4188 if ( 4189 expression.args.get("input_type_preserved") 4190 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 4191 and not (is_date_unit(unit) and date.is_type(exp.DType.DATE)) 4192 ): 4193 return self.sql(exp.Cast(this=result, to=date.type)) 4194 4195 return result 4196 4197 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 4198 unit = unit_to_str(expression) 4199 zone = expression.args.get("zone") 4200 timestamp = expression.this 4201 date_unit = is_date_unit(unit) 4202 4203 if date_unit and zone: 4204 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 4205 # Double AT TIME ZONE needed for BigQuery compatibility: 4206 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 4207 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 4208 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 4209 result_sql = self.func("DATE_TRUNC", unit, timestamp) 4210 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 4211 4212 result = self.func("DATE_TRUNC", unit, timestamp) 4213 if expression.args.get("input_type_preserved"): 4214 if timestamp.type and timestamp.is_type(exp.DType.TIME, exp.DType.TIMETZ): 4215 dummy_date = exp.Cast( 4216 this=exp.Literal.string("1970-01-01"), 4217 to=exp.DataType(this=exp.DType.DATE), 4218 ) 4219 date_time = exp.Add(this=dummy_date, expression=timestamp) 4220 result = self.func("DATE_TRUNC", unit, date_time) 4221 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4222 4223 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 4224 date_unit and timestamp.is_type(exp.DType.DATE) 4225 ): 4226 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4227 4228 return result 4229 4230 def trim_sql(self, expression: exp.Trim) -> str: 4231 expression.this.replace(_cast_to_varchar(expression.this)) 4232 if expression.expression: 4233 expression.expression.replace(_cast_to_varchar(expression.expression)) 4234 4235 result_sql = super().trim_sql(expression) 4236 return _gen_with_cast_to_blob(self, expression, result_sql) 4237 4238 def round_sql(self, expression: exp.Round) -> str: 4239 this = expression.this 4240 decimals = expression.args.get("decimals") 4241 truncate = expression.args.get("truncate") 4242 4243 # DuckDB requires the scale (decimals) argument to be an INT 4244 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 4245 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 4246 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 4247 decimals = exp.cast(decimals, exp.DType.INT) 4248 4249 func = "ROUND" 4250 if truncate: 4251 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 4252 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 4253 func = "ROUND_EVEN" 4254 truncate = None 4255 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 4256 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 4257 truncate = None 4258 4259 return self.func(func, this, decimals, truncate) 4260 4261 def strtok_sql(self, expression: exp.Strtok) -> str: 4262 string_arg = expression.this 4263 delimiter_arg = expression.args.get("delimiter") 4264 part_index_arg = expression.args.get("part_index") 4265 4266 if delimiter_arg and part_index_arg: 4267 # Escape regex chars and build character class at runtime using REGEXP_REPLACE 4268 escaped_delimiter = exp.Anonymous( 4269 this="REGEXP_REPLACE", 4270 expressions=[ 4271 delimiter_arg, 4272 exp.Literal.string( 4273 r"([\[\]^.\-*+?(){}|$\\])" 4274 ), # Escape problematic regex chars 4275 exp.Literal.string( 4276 r"\\\1" 4277 ), # Replace with escaped version using $1 backreference 4278 exp.Literal.string("g"), # Global flag 4279 ], 4280 ) 4281 # CASE WHEN delimiter = '' THEN '' ELSE CONCAT('[', escaped_delimiter, ']') END 4282 regex_pattern = ( 4283 exp.case() 4284 .when(delimiter_arg.eq(exp.Literal.string("")), exp.Literal.string("")) 4285 .else_( 4286 exp.func( 4287 "CONCAT", 4288 exp.Literal.string("["), 4289 escaped_delimiter, 4290 exp.Literal.string("]"), 4291 ) 4292 ) 4293 ) 4294 4295 # STRTOK skips empty strings, so we need to filter them out 4296 # LIST_FILTER(REGEXP_SPLIT_TO_ARRAY(string, pattern), x -> x != '')[index] 4297 split_array = exp.func("REGEXP_SPLIT_TO_ARRAY", string_arg, regex_pattern) 4298 x = exp.to_identifier("x") 4299 is_empty = x.eq(exp.Literal.string("")) 4300 filtered_array = exp.func( 4301 "LIST_FILTER", 4302 split_array, 4303 exp.Lambda(this=exp.not_(is_empty.copy()), expressions=[x.copy()]), 4304 ) 4305 base_func = exp.Bracket( 4306 this=filtered_array, 4307 expressions=[part_index_arg], 4308 offset=1, 4309 ) 4310 4311 # Use template with the built regex pattern 4312 result = exp.replace_placeholders( 4313 self.STRTOK_TEMPLATE.copy(), 4314 string=string_arg, 4315 delimiter=delimiter_arg, 4316 part_index=part_index_arg, 4317 base_func=base_func, 4318 ) 4319 4320 return self.sql(result) 4321 4322 return self.function_fallback_sql(expression) 4323 4324 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 4325 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 4326 4327 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 4328 if expression.is_type(*exp.DataType.REAL_TYPES): 4329 result = f"CAST({result} AS DOUBLE)" 4330 4331 return result 4332 4333 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 4334 """ 4335 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 4336 dividing the input distribution into n equal-sized buckets. 4337 4338 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 4339 does not document the specific algorithm used so results may differ. DuckDB does not 4340 support RESPECT NULLS. 4341 """ 4342 this = expression.this 4343 if isinstance(this, exp.Distinct): 4344 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 4345 if len(this.expressions) < 2: 4346 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 4347 return self.function_fallback_sql(expression) 4348 num_quantiles_expr = this.expressions[1].pop() 4349 else: 4350 num_quantiles_expr = expression.expression 4351 4352 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 4353 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4354 return self.function_fallback_sql(expression) 4355 4356 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 4357 if num_quantiles <= 0: 4358 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4359 return self.function_fallback_sql(expression) 4360 4361 quantiles = [ 4362 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 4363 for i in range(num_quantiles + 1) 4364 ] 4365 4366 return self.sql(exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))) 4367 4368 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 4369 if expression.args.get("scalar_only"): 4370 expression = exp.JSONExtractScalar( 4371 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 4372 ) 4373 return _arrow_json_extract_sql(self, expression) 4374 4375 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 4376 this = expression.this 4377 4378 if _is_binary(this): 4379 expression.type = exp.DType.BINARY.into_expr() 4380 4381 arg = _cast_to_bit(this) 4382 4383 if isinstance(this, exp.Neg): 4384 arg = exp.Paren(this=arg) 4385 4386 expression.set("this", arg) 4387 4388 result_sql = f"~{self.sql(expression, 'this')}" 4389 4390 return _gen_with_cast_to_blob(self, expression, result_sql) 4391 4392 def window_sql(self, expression: exp.Window) -> str: 4393 this = expression.this 4394 if isinstance(this, exp.Corr) or ( 4395 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 4396 ): 4397 return self._corr_sql(expression) 4398 4399 return super().window_sql(expression) 4400 4401 def filter_sql(self, expression: exp.Filter) -> str: 4402 if isinstance(expression.this, exp.Corr): 4403 return self._corr_sql(expression) 4404 4405 return super().filter_sql(expression) 4406 4407 def _corr_sql( 4408 self, 4409 expression: exp.Filter | exp.Window | exp.Corr, 4410 ) -> str: 4411 if isinstance(expression, exp.Corr) and not expression.args.get("null_on_zero_variance"): 4412 return self.func("CORR", expression.this, expression.expression) 4413 4414 corr_expr = _maybe_corr_null_to_false(expression) 4415 if corr_expr is None: 4416 if isinstance(expression, exp.Window): 4417 return super().window_sql(expression) 4418 if isinstance(expression, exp.Filter): 4419 return super().filter_sql(expression) 4420 corr_expr = expression # make mypy happy 4421 4422 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
1456class DuckDBGenerator(generator.Generator): 1457 PARAMETER_TOKEN = "$" 1458 NAMED_PLACEHOLDER_TOKEN = "$" 1459 JOIN_HINTS = False 1460 TABLE_HINTS = False 1461 QUERY_HINTS = False 1462 LIMIT_FETCH = "LIMIT" 1463 STRUCT_DELIMITER = ("(", ")") 1464 RENAME_TABLE_WITH_DB = False 1465 NVL2_SUPPORTED = False 1466 SEMI_ANTI_JOIN_WITH_SIDE = False 1467 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1468 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1469 LAST_DAY_SUPPORTS_DATE_PART = False 1470 JSON_KEY_VALUE_PAIR_SEP = "," 1471 IGNORE_NULLS_IN_FUNC = True 1472 IGNORE_NULLS_BEFORE_ORDER = False 1473 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1474 SUPPORTS_CREATE_TABLE_LIKE = False 1475 MULTI_ARG_DISTINCT = False 1476 CAN_IMPLEMENT_ARRAY_ANY = True 1477 SUPPORTS_TO_NUMBER = False 1478 SELECT_KINDS: tuple[str, ...] = () 1479 SUPPORTS_DECODE_CASE = False 1480 SUPPORTS_DROP_ALTER_ICEBERG_PROPERTY = False 1481 1482 AFTER_HAVING_MODIFIER_TRANSFORMS = generator.AFTER_HAVING_MODIFIER_TRANSFORMS 1483 SUPPORTS_WINDOW_EXCLUDE = True 1484 COPY_HAS_INTO_KEYWORD = False 1485 STAR_EXCEPT = "EXCLUDE" 1486 PAD_FILL_PATTERN_IS_REQUIRED = True 1487 ARRAY_SIZE_DIM_REQUIRED: bool | None = False 1488 NORMALIZE_EXTRACT_DATE_PARTS = True 1489 SUPPORTS_LIKE_QUANTIFIERS = False 1490 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1491 1492 TRANSFORMS = { 1493 **generator.Generator.TRANSFORMS, 1494 exp.AnyValue: _anyvalue_sql, 1495 exp.ApproxDistinct: approx_count_distinct_sql, 1496 exp.Boolnot: _boolnot_sql, 1497 exp.Booland: _booland_sql, 1498 exp.Boolor: _boolor_sql, 1499 exp.Array: transforms.preprocess( 1500 [transforms.inherit_struct_field_names], 1501 generator=inline_array_unless_query, 1502 ), 1503 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1504 exp.ArrayCompact: array_compact_sql, 1505 exp.ArrayConstructCompact: lambda self, e: self.sql( 1506 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1507 ), 1508 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1509 exp.ArrayContains: _array_contains_sql, 1510 exp.ArrayOverlaps: _array_overlaps_sql, 1511 exp.ArrayFilter: rename_func("LIST_FILTER"), 1512 exp.ArrayInsert: _array_insert_sql, 1513 exp.ArrayPosition: lambda self, e: ( 1514 self.sql( 1515 exp.Sub( 1516 this=exp.ArrayPosition(this=e.this, expression=e.expression), 1517 expression=exp.Literal.number(1), 1518 ) 1519 ) 1520 if e.args.get("zero_based") 1521 else self.func("ARRAY_POSITION", e.this, e.expression) 1522 ), 1523 exp.ArrayRemoveAt: _array_remove_at_sql, 1524 exp.ArrayRemove: remove_from_array_using_filter, 1525 exp.ArraySort: _array_sort_sql, 1526 exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True), 1527 exp.ArraySum: rename_func("LIST_SUM"), 1528 exp.ArrayMax: rename_func("LIST_MAX"), 1529 exp.ArrayMin: rename_func("LIST_MIN"), 1530 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1531 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1532 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1533 exp.BitwiseAndAgg: _bitwise_agg_sql, 1534 exp.BitwiseCount: rename_func("BIT_COUNT"), 1535 exp.BitwiseLeftShift: _bitshift_sql, 1536 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1537 exp.BitwiseOrAgg: _bitwise_agg_sql, 1538 exp.BitwiseRightShift: _bitshift_sql, 1539 exp.BitwiseXorAgg: _bitwise_agg_sql, 1540 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1541 exp.Corr: lambda self, e: self._corr_sql(e), 1542 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1543 exp.CurrentTime: lambda *_: "CURRENT_TIME", 1544 exp.CurrentSchemas: lambda self, e: self.func( 1545 "current_schemas", e.this if e.this else exp.true() 1546 ), 1547 exp.CurrentTimestamp: lambda self, e: ( 1548 self.sql( 1549 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1550 ) 1551 if e.args.get("sysdate") 1552 else "CURRENT_TIMESTAMP" 1553 ), 1554 exp.CurrentVersion: rename_func("version"), 1555 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1556 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1557 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1558 exp.DayOfWeekIso: rename_func("ISODOW"), 1559 exp.DayOfYear: rename_func("DAYOFYEAR"), 1560 exp.Dayname: lambda self, e: ( 1561 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1562 if e.args.get("abbreviated") 1563 else self.func("DAYNAME", e.this) 1564 ), 1565 exp.Monthname: lambda self, e: ( 1566 self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1567 if e.args.get("abbreviated") 1568 else self.func("MONTHNAME", e.this) 1569 ), 1570 exp.DataType: _datatype_sql, 1571 exp.Date: _date_sql, 1572 exp.DateAdd: _date_delta_to_binary_interval_op(), 1573 exp.DateFromParts: _date_from_parts_sql, 1574 exp.DateSub: _date_delta_to_binary_interval_op(), 1575 exp.DateDiff: _date_diff_sql, 1576 exp.DateStrToDate: datestrtodate_sql, 1577 exp.Datetime: no_datetime_sql, 1578 exp.DatetimeDiff: _date_diff_sql, 1579 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1580 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1581 exp.DateToDi: lambda self, e: ( 1582 f"CAST(STRFTIME({self.sql(e, 'this')}, {self.dialect.DATEINT_FORMAT}) AS INT)" 1583 ), 1584 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1585 exp.DiToDate: lambda self, e: ( 1586 f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {self.dialect.DATEINT_FORMAT}) AS DATE)" 1587 ), 1588 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1589 exp.EqualNull: lambda self, e: self.sql( 1590 exp.NullSafeEQ(this=e.this, expression=e.expression) 1591 ), 1592 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1593 exp.GenerateDateArray: _generate_datetime_array_sql, 1594 exp.GenerateSeries: generate_series_sql("GENERATE_SERIES", "RANGE"), 1595 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1596 exp.Getbit: getbit_sql, 1597 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 1598 exp.Explode: rename_func("UNNEST"), 1599 exp.IcebergProperty: lambda *_: "", 1600 exp.IntDiv: lambda self, e: self.binary(e, "//"), 1601 exp.IsInf: rename_func("ISINF"), 1602 exp.IsNan: rename_func("ISNAN"), 1603 exp.IsNullValue: lambda self, e: self.sql( 1604 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1605 ), 1606 exp.IsArray: lambda self, e: self.sql( 1607 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1608 ), 1609 exp.Ceil: _ceil_floor, 1610 exp.Floor: _ceil_floor, 1611 exp.JSONBExists: rename_func("JSON_EXISTS"), 1612 exp.JSONExtract: _arrow_json_extract_sql, 1613 exp.JSONExtractArray: _json_extract_value_array_sql, 1614 exp.JSONFormat: _json_format_sql, 1615 exp.JSONValueArray: _json_extract_value_array_sql, 1616 exp.Lateral: _explode_to_unnest_sql, 1617 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1618 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1619 exp.Select: transforms.preprocess([_seq_to_range_in_generator]), 1620 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1621 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1622 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1623 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1624 exp.BoolxorAgg: _boolxor_agg_sql, 1625 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1626 exp.Initcap: _initcap_sql, 1627 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1628 exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"), 1629 exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True), 1630 exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"), 1631 exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True), 1632 exp.MonthsBetween: months_between_sql, 1633 exp.NextDay: _day_navigation_sql, 1634 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1635 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1636 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 1637 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1638 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1639 exp.PreviousDay: _day_navigation_sql, 1640 exp.RegexpILike: lambda self, e: self.func( 1641 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1642 ), 1643 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1644 exp.RegrValx: _regr_val_sql, 1645 exp.RegrValy: _regr_val_sql, 1646 exp.Return: lambda self, e: self.sql(e, "this"), 1647 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1648 exp.StrToUnix: lambda self, e: self.func( 1649 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1650 ), 1651 exp.Struct: _struct_sql, 1652 exp.Transform: rename_func("LIST_TRANSFORM"), 1653 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1654 exp.TimeSub: _date_delta_to_binary_interval_op(), 1655 exp.Time: no_time_sql, 1656 exp.TimeDiff: _timediff_sql, 1657 exp.Timestamp: no_timestamp_sql, 1658 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1659 exp.TimestampDiff: lambda self, e: self.func( 1660 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1661 ), 1662 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1663 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DType.DATE)), 1664 exp.TimeStrToTime: timestrtotime_sql, 1665 exp.TimeStrToUnix: lambda self, e: self.func( 1666 "EPOCH", exp.cast(e.this, exp.DType.TIMESTAMP) 1667 ), 1668 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1669 exp.ToBoolean: _to_boolean_sql, 1670 exp.ToVariant: lambda self, e: self.sql( 1671 exp.cast(e.this, exp.DataType.build("VARIANT", dialect="duckdb")) 1672 ), 1673 exp.TimeToUnix: rename_func("EPOCH"), 1674 exp.TsOrDiToDi: lambda self, e: ( 1675 f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)" 1676 ), 1677 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1678 exp.TsOrDsDiff: lambda self, e: self.func( 1679 "DATE_DIFF", 1680 f"'{e.args.get('unit') or 'DAY'}'", 1681 exp.cast(e.expression, exp.DType.TIMESTAMP), 1682 exp.cast(e.this, exp.DType.TIMESTAMP), 1683 ), 1684 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1685 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1686 exp.UnixSeconds: lambda self, e: self.sql( 1687 exp.cast(self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DType.BIGINT) 1688 ), 1689 exp.UnixToStr: lambda self, e: self.func( 1690 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 1691 ), 1692 exp.DatetimeTrunc: lambda self, e: self.func( 1693 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DType.DATETIME) 1694 ), 1695 exp.UnixToTime: _unix_to_time_sql, 1696 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 1697 exp.VariancePop: rename_func("VAR_POP"), 1698 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 1699 exp.YearOfWeek: lambda self, e: self.sql( 1700 exp.Extract( 1701 this=exp.Var(this="ISOYEAR"), 1702 expression=e.this, 1703 ) 1704 ), 1705 exp.YearOfWeekIso: lambda self, e: self.sql( 1706 exp.Extract( 1707 this=exp.Var(this="ISOYEAR"), 1708 expression=e.this, 1709 ) 1710 ), 1711 exp.Xor: _xor_sql, 1712 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1713 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1714 exp.DateBin: rename_func("TIME_BUCKET"), 1715 exp.LastDay: _last_day_sql, 1716 } 1717 1718 SUPPORTED_JSON_PATH_PARTS = { 1719 exp.JSONPathKey, 1720 exp.JSONPathRoot, 1721 exp.JSONPathSubscript, 1722 exp.JSONPathWildcard, 1723 } 1724 1725 TYPE_MAPPING = { 1726 **generator.Generator.TYPE_MAPPING, 1727 exp.DType.BINARY: "BLOB", 1728 exp.DType.BPCHAR: "TEXT", 1729 exp.DType.CHAR: "TEXT", 1730 exp.DType.DATETIME: "TIMESTAMP", 1731 exp.DType.DECFLOAT: "DECIMAL(38, 5)", 1732 exp.DType.FLOAT: "REAL", 1733 exp.DType.JSONB: "JSON", 1734 exp.DType.NCHAR: "TEXT", 1735 exp.DType.NVARCHAR: "TEXT", 1736 exp.DType.UINT: "UINTEGER", 1737 exp.DType.VARBINARY: "BLOB", 1738 exp.DType.ROWVERSION: "BLOB", 1739 exp.DType.VARCHAR: "TEXT", 1740 exp.DType.TIMESTAMPLTZ: "TIMESTAMPTZ", 1741 exp.DType.TIMESTAMPNTZ: "TIMESTAMP", 1742 exp.DType.TIMESTAMP_S: "TIMESTAMP_S", 1743 exp.DType.TIMESTAMP_MS: "TIMESTAMP_MS", 1744 exp.DType.TIMESTAMP_NS: "TIMESTAMP_NS", 1745 exp.DType.BIGDECIMAL: "DECIMAL(38, 5)", 1746 } 1747 1748 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 1749 RESERVED_KEYWORDS = { 1750 "array", 1751 "analyse", 1752 "union", 1753 "all", 1754 "when", 1755 "in_p", 1756 "default", 1757 "create_p", 1758 "window", 1759 "asymmetric", 1760 "to", 1761 "else", 1762 "localtime", 1763 "from", 1764 "end_p", 1765 "select", 1766 "current_date", 1767 "foreign", 1768 "with", 1769 "grant", 1770 "session_user", 1771 "or", 1772 "except", 1773 "references", 1774 "fetch", 1775 "limit", 1776 "group_p", 1777 "leading", 1778 "into", 1779 "collate", 1780 "offset", 1781 "do", 1782 "then", 1783 "localtimestamp", 1784 "check_p", 1785 "lateral_p", 1786 "current_role", 1787 "where", 1788 "asc_p", 1789 "placing", 1790 "desc_p", 1791 "user", 1792 "unique", 1793 "initially", 1794 "column", 1795 "both", 1796 "some", 1797 "as", 1798 "any", 1799 "only", 1800 "deferrable", 1801 "null_p", 1802 "current_time", 1803 "true_p", 1804 "table", 1805 "case", 1806 "trailing", 1807 "variadic", 1808 "for", 1809 "on", 1810 "distinct", 1811 "false_p", 1812 "not", 1813 "constraint", 1814 "current_timestamp", 1815 "returning", 1816 "primary", 1817 "intersect", 1818 "having", 1819 "analyze", 1820 "current_user", 1821 "and", 1822 "cast", 1823 "symmetric", 1824 "using", 1825 "order", 1826 "current_catalog", 1827 } 1828 1829 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 1830 1831 # DuckDB doesn't generally support CREATE TABLE .. properties 1832 # https://duckdb.org/docs/sql/statements/create_table.html 1833 # There are a few exceptions (e.g. temporary tables) which are supported or 1834 # can be transpiled to DuckDB, so we explicitly override them accordingly 1835 PROPERTIES_LOCATION = { 1836 **{ 1837 prop: exp.Properties.Location.UNSUPPORTED 1838 for prop in generator.Generator.PROPERTIES_LOCATION 1839 }, 1840 exp.LikeProperty: exp.Properties.Location.POST_SCHEMA, 1841 exp.TemporaryProperty: exp.Properties.Location.POST_CREATE, 1842 exp.ReturnsProperty: exp.Properties.Location.POST_ALIAS, 1843 exp.SequenceProperties: exp.Properties.Location.POST_EXPRESSION, 1844 exp.IcebergProperty: exp.Properties.Location.POST_CREATE, 1845 } 1846 1847 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS: t.ClassVar = _IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS 1848 1849 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 1850 ZIPF_TEMPLATE: exp.Expr = exp.maybe_parse( 1851 """ 1852 WITH rand AS (SELECT :random_expr AS r), 1853 weights AS ( 1854 SELECT i, 1.0 / POWER(i, :s) AS w 1855 FROM RANGE(1, :n + 1) AS t(i) 1856 ), 1857 cdf AS ( 1858 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 1859 FROM weights 1860 ) 1861 SELECT MIN(i) 1862 FROM cdf 1863 WHERE p >= (SELECT r FROM rand) 1864 """ 1865 ) 1866 1867 # Template for NORMAL transpilation using Box-Muller transform 1868 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 1869 NORMAL_TEMPLATE: exp.Expr = exp.maybe_parse( 1870 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 1871 ) 1872 1873 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 1874 SEEDED_RANDOM_TEMPLATE: exp.Expr = exp.maybe_parse("(ABS(HASH(:seed)) % 1000000) / 1000000.0") 1875 1876 # Template for generating signed and unsigned SEQ values within a specified range 1877 SEQ_UNSIGNED: exp.Expr = _SEQ_UNSIGNED 1878 SEQ_SIGNED: exp.Expr = _SEQ_SIGNED 1879 1880 # Template for MAP_CAT transpilation - Snowflake semantics: 1881 # 1. Returns NULL if either input is NULL 1882 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 1883 # 3. Filters out entries with NULL values from the result 1884 MAPCAT_TEMPLATE: exp.Expr = exp.maybe_parse( 1885 """ 1886 CASE 1887 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 1888 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 1889 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 1890 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 1891 ), __x -> __x.value IS NOT NULL)) 1892 END 1893 """ 1894 ) 1895 1896 # Mappings for EXTRACT/DATE_PART transpilation 1897 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 1898 EXTRACT_STRFTIME_MAPPINGS: dict[str, tuple[str, str]] = { 1899 "WEEKISO": ("%V", "INTEGER"), 1900 "YEAROFWEEK": ("%G", "INTEGER"), 1901 "YEAROFWEEKISO": ("%G", "INTEGER"), 1902 "NANOSECOND": ("%n", "BIGINT"), 1903 } 1904 1905 # Maps epoch-based specifiers to DuckDB epoch functions 1906 EXTRACT_EPOCH_MAPPINGS: dict[str, str] = { 1907 "EPOCH_SECOND": "EPOCH", 1908 "EPOCH_MILLISECOND": "EPOCH_MS", 1909 "EPOCH_MICROSECOND": "EPOCH_US", 1910 "EPOCH_NANOSECOND": "EPOCH_NS", 1911 } 1912 1913 # Template for BITMAP_CONSTRUCT_AGG transpilation 1914 # 1915 # BACKGROUND: 1916 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 1917 # Supports values in range 0-32767, this version returns NULL if any value is out of range 1918 # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg 1919 # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts 1920 # 1921 # Snowflake uses two different formats based on the number of unique values: 1922 # 1923 # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes 1924 # Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003) 1925 # Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes 1926 # Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex) 1927 # count v1 v2 v3 pad 1928 # 1929 # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes 1930 # Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes 1931 # Bytes 10+: Each value as 2-byte little-endian integer (no padding) 1932 # Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500 1933 # hdr ----9 zero bytes---- v1 v2 v3 v4 v5 1934 # 1935 # TEMPLATE STRUCTURE 1936 # 1937 # Phase 1 - Innermost subquery: Data preparation 1938 # SELECT LIST_SORT(...) AS l 1939 # - Aggregates all input values into a list, remove NULLs, duplicates and sorts 1940 # Result: Clean, sorted list of unique non-null integers stored as 'l' 1941 # 1942 # Phase 2 - Middle subquery: Hex string construction 1943 # LIST_TRANSFORM(...) 1944 # - Converts each integer to 2-byte little-endian hex representation 1945 # - & 255 extracts low byte, >> 8 extracts high byte 1946 # - LIST_REDUCE: Concatenates all hex pairs into single string 'h' 1947 # Result: Hex string of all values 1948 # 1949 # Phase 3 - Outer SELECT: Final bitmap assembly 1950 # LENGTH(l) < 5: 1951 # - Small format: 2-byte count (big-endian via %04X) + values + zero padding 1952 # LENGTH(l) >= 5: 1953 # - Large format: Fixed 10-byte header + values (no padding needed) 1954 # Result: Complete binary bitmap as BLOB 1955 # 1956 BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expr = exp.maybe_parse( 1957 """ 1958 SELECT CASE 1959 WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL 1960 WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL 1961 WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) 1962 ELSE UNHEX('08000000000000000000' || h) 1963 END 1964 FROM ( 1965 SELECT l, COALESCE(LIST_REDUCE( 1966 LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), 1967 (__a, __b) -> __a || __b, '' 1968 ), '') AS h 1969 FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l) 1970 ) 1971 """ 1972 ) 1973 1974 # Template for RANDSTR transpilation - placeholders get replaced with actual parameters 1975 RANDSTR_TEMPLATE: exp.Expr = exp.maybe_parse( 1976 f""" 1977 SELECT LISTAGG( 1978 SUBSTRING( 1979 '{RANDSTR_CHAR_POOL}', 1980 1 + CAST(FLOOR(random_value * 62) AS INT), 1981 1 1982 ), 1983 '' 1984 ) 1985 FROM ( 1986 SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value 1987 FROM RANGE(:length) AS t(i) 1988 ) 1989 """, 1990 ) 1991 1992 # Template for MINHASH transpilation 1993 # Computes k minimum hash values across aggregated data using DuckDB list functions 1994 # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1} 1995 MINHASH_TEMPLATE: exp.Expr = exp.maybe_parse( 1996 """ 1997 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1) 1998 FROM ( 1999 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 2000 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2001 ) 2002 """, 2003 ) 2004 2005 # Template for MINHASH_COMBINE transpilation 2006 # Combines multiple minhash signatures by taking element-wise minimum 2007 MINHASH_COMBINE_TEMPLATE: exp.Expr = exp.maybe_parse( 2008 """ 2009 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2010 FROM ( 2011 SELECT 2012 pos AS idx, 2013 MIN(val) AS min_h 2014 FROM 2015 UNNEST(LIST(:expr)) AS _(sig), 2016 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2017 GROUP BY pos 2018 ) 2019 """, 2020 ) 2021 2022 # Template for APPROXIMATE_SIMILARITY transpilation 2023 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2024 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expr = exp.maybe_parse( 2025 """ 2026 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2027 FROM ( 2028 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2029 FROM ( 2030 SELECT h, pos 2031 FROM UNNEST(LIST(:expr)) AS _(sig), 2032 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2033 ) 2034 GROUP BY pos 2035 ) 2036 """, 2037 ) 2038 2039 # Template for ARRAYS_ZIP transpilation 2040 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest 2041 # Uses RANGE + indexing to match Snowflake behavior 2042 ARRAYS_ZIP_TEMPLATE: exp.Expr = exp.maybe_parse( 2043 """ 2044 CASE WHEN :null_check THEN NULL 2045 WHEN :all_empty_check THEN [:empty_struct] 2046 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2047 END 2048 """, 2049 ) 2050 2051 # Shared bag semantics outer frame for ARRAY_EXCEPT and ARRAY_INTERSECTION. 2052 # Each element is paired with its 1-based position via LIST_ZIP, then filtered 2053 # by a comparison operator (supplied via :cond) that determines the operation: 2054 # EXCEPT (>): keep the N-th occurrence only if N > count in arr2 2055 # e.g. [2,2,2] EXCEPT [2,2] -> [2] 2056 # INTERSECTION (<=): keep the N-th occurrence only if N <= count in arr2 2057 # e.g. [2,2,2] INTERSECT [2,2] -> [2,2] 2058 # IS NOT DISTINCT FROM is used for NULL-safe element comparison. 2059 ARRAY_BAG_TEMPLATE: exp.Expr = exp.maybe_parse( 2060 """ 2061 CASE 2062 WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL 2063 ELSE LIST_TRANSFORM( 2064 LIST_FILTER( 2065 LIST_ZIP(:arr1, GENERATE_SERIES(1, LEN(:arr1))), 2066 pair -> :cond 2067 ), 2068 pair -> pair[0] 2069 ) 2070 END 2071 """ 2072 ) 2073 2074 ARRAY_EXCEPT_CONDITION: exp.Expr = exp.maybe_parse( 2075 "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))" 2076 " > LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))" 2077 ) 2078 2079 ARRAY_INTERSECTION_CONDITION: exp.Expr = exp.maybe_parse( 2080 "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))" 2081 " <= LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))" 2082 ) 2083 2084 # Set semantics for ARRAY_EXCEPT. Deduplicates arr1 via LIST_DISTINCT, then 2085 # filters out any element that appears at least once in arr2. 2086 # e.g. [1,1,2,3] EXCEPT [1] -> [2,3] 2087 # IS NOT DISTINCT FROM is used for NULL-safe element comparison. 2088 ARRAY_EXCEPT_SET_TEMPLATE: exp.Expr = exp.maybe_parse( 2089 """ 2090 CASE 2091 WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL 2092 ELSE LIST_FILTER( 2093 LIST_DISTINCT(:arr1), 2094 e -> LEN(LIST_FILTER(:arr2, x -> x IS NOT DISTINCT FROM e)) = 0 2095 ) 2096 END 2097 """ 2098 ) 2099 2100 # Template for STRTOK function transpilation 2101 # 2102 # DuckDB itself doesn't have a strtok function. This handles the transpilation from Snowflake to DuckDB. 2103 # We may need to adjust this if we want to support transpilation from other dialects 2104 # 2105 # CASE 2106 # -- Snowflake: empty delimiter + empty input string -> NULL 2107 # WHEN delimiter = '' AND input_str = '' THEN NULL 2108 # 2109 # -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return input string if index is 1 2110 # WHEN delimiter = '' AND index = 1 THEN input_str 2111 # 2112 # -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return NULL if index is not 1 2113 # WHEN delimiter = '' THEN NULL 2114 # 2115 # -- Snowflake: negative indices return NULL 2116 # WHEN index < 0 THEN NULL 2117 # 2118 # -- Snowflake: return NULL if any argument is NULL 2119 # WHEN input_str IS NULL OR delimiter IS NULL OR index IS NULL THEN NULL 2120 # 2121 # 2122 # ELSE LIST_FILTER( 2123 # REGEXP_SPLIT_TO_ARRAY( 2124 # input_str, 2125 # CASE 2126 # -- if delimiter is '', we don't want to surround it with '[' and ']' as '[]' is invalid for DuckDB 2127 # WHEN delimiter = '' THEN '' 2128 # 2129 # -- handle problematic regex characters in delimiter with REGEXP_REPLACE 2130 # -- turn delimiter into a regex char set, otherwise DuckDB will match in order, which we don't want 2131 # ELSE '[' || REGEXP_REPLACE(delimiter, problematic_char_set, '\\\1', 'g') || ']' 2132 # END 2133 # ), 2134 # 2135 # -- Snowflake: don't return empty strings 2136 # x -> NOT x = '' 2137 # )[index] 2138 # END 2139 STRTOK_TEMPLATE: exp.Expr = exp.maybe_parse( 2140 """ 2141 CASE 2142 WHEN :delimiter = '' AND :string = '' THEN NULL 2143 WHEN :delimiter = '' AND :part_index = 1 THEN :string 2144 WHEN :delimiter = '' THEN NULL 2145 WHEN :part_index < 0 THEN NULL 2146 WHEN :string IS NULL OR :delimiter IS NULL OR :part_index IS NULL THEN NULL 2147 ELSE :base_func 2148 END 2149 """ 2150 ) 2151 2152 def _array_bag_sql(self, condition: exp.Expr, arr1: exp.Expr, arr2: exp.Expr) -> str: 2153 cond = exp.Paren(this=exp.replace_placeholders(condition, arr1=arr1, arr2=arr2)) 2154 return self.sql( 2155 exp.replace_placeholders(self.ARRAY_BAG_TEMPLATE, arr1=arr1, arr2=arr2, cond=cond) 2156 ) 2157 2158 def timeslice_sql(self, expression: exp.TimeSlice) -> str: 2159 """ 2160 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2161 2162 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2163 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2164 2165 For 'END' kind, add the interval to get the end of the slice. 2166 For DATE type with 'END', cast result back to DATE to preserve type. 2167 """ 2168 date_expr = expression.this 2169 slice_length = expression.expression 2170 unit = expression.unit 2171 kind = expression.text("kind").upper() 2172 2173 # Create INTERVAL expression: INTERVAL 'N' UNIT 2174 interval_expr = exp.Interval(this=slice_length, unit=unit) 2175 2176 # Create base time_bucket expression 2177 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2178 2179 # Check if we need the end of the slice (default is start) 2180 if not kind == "END": 2181 # For 'START', return time_bucket directly 2182 return self.sql(time_bucket_expr) 2183 2184 # For 'END', add the interval to get end of slice 2185 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2186 2187 # If input is DATE type, cast result back to DATE to preserve type 2188 # DuckDB converts DATE to TIMESTAMP when adding intervals 2189 if date_expr.is_type(exp.DType.DATE): 2190 return self.sql(exp.cast(add_expr, exp.DType.DATE)) 2191 2192 return self.sql(add_expr) 2193 2194 def bitmapbucketnumber_sql(self, expression: exp.BitmapBucketNumber) -> str: 2195 """ 2196 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2197 2198 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2199 - Each bucket covers 32,768 values 2200 - Bucket numbering starts at 1 2201 - Formula: ((value - 1) // 32768) + 1 for positive values 2202 2203 For non-positive values (0 and negative), we use value // 32768 to avoid 2204 producing bucket 0 or positive bucket IDs for negative inputs. 2205 """ 2206 value = expression.this 2207 2208 positive_formula = ((value - 1) // 32768) + 1 2209 non_positive_formula = value // 32768 2210 2211 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2212 case_expr = ( 2213 exp.case() 2214 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2215 .else_(non_positive_formula) 2216 ) 2217 return self.sql(case_expr) 2218 2219 def bitmapbitposition_sql(self, expression: exp.BitmapBitPosition) -> str: 2220 """ 2221 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2222 2223 Snowflake's BITMAP_BIT_POSITION behavior: 2224 - For n <= 0: returns ABS(n) % 32768 2225 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2226 """ 2227 this = expression.this 2228 2229 return self.sql( 2230 exp.Mod( 2231 this=exp.Paren( 2232 this=exp.If( 2233 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2234 true=this - exp.Literal.number(1), 2235 false=exp.Abs(this=this), 2236 ) 2237 ), 2238 expression=MAX_BIT_POSITION, 2239 ) 2240 ) 2241 2242 def bitmapconstructagg_sql(self, expression: exp.BitmapConstructAgg) -> str: 2243 """ 2244 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2245 Uses a pre-parsed template with placeholders replaced by expression nodes. 2246 2247 Snowflake bitmap format: 2248 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2249 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2250 """ 2251 arg = expression.this 2252 return ( 2253 f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2254 ) 2255 2256 def compress_sql(self, expression: exp.Compress) -> str: 2257 self.unsupported("DuckDB does not support the COMPRESS() function") 2258 return self.function_fallback_sql(expression) 2259 2260 def encrypt_sql(self, expression: exp.Encrypt) -> str: 2261 self.unsupported("ENCRYPT is not supported in DuckDB") 2262 return self.function_fallback_sql(expression) 2263 2264 def decrypt_sql(self, expression: exp.Decrypt) -> str: 2265 func_name = "TRY_DECRYPT" if expression.args.get("safe") else "DECRYPT" 2266 self.unsupported(f"{func_name} is not supported in DuckDB") 2267 return self.function_fallback_sql(expression) 2268 2269 def decryptraw_sql(self, expression: exp.DecryptRaw) -> str: 2270 func_name = "TRY_DECRYPT_RAW" if expression.args.get("safe") else "DECRYPT_RAW" 2271 self.unsupported(f"{func_name} is not supported in DuckDB") 2272 return self.function_fallback_sql(expression) 2273 2274 def encryptraw_sql(self, expression: exp.EncryptRaw) -> str: 2275 self.unsupported("ENCRYPT_RAW is not supported in DuckDB") 2276 return self.function_fallback_sql(expression) 2277 2278 def parseurl_sql(self, expression: exp.ParseUrl) -> str: 2279 self.unsupported("PARSE_URL is not supported in DuckDB") 2280 return self.function_fallback_sql(expression) 2281 2282 def parseip_sql(self, expression: exp.ParseIp) -> str: 2283 self.unsupported("PARSE_IP is not supported in DuckDB") 2284 return self.function_fallback_sql(expression) 2285 2286 def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str: 2287 this = expression.this 2288 expr = expression.expression 2289 2290 if expression.args.get("case_insensitive"): 2291 this = exp.Upper(this=this) 2292 expr = exp.Upper(this=expr) 2293 2294 result = exp.func("JARO_WINKLER_SIMILARITY", this, expr) 2295 2296 if expression.args.get("integer_scale"): 2297 result = exp.cast(result * 100, "INTEGER") 2298 2299 return self.sql(result) 2300 2301 def nthvalue_sql(self, expression: exp.NthValue) -> str: 2302 from_first = expression.args.get("from_first", True) 2303 if not from_first: 2304 self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end ") 2305 2306 return self.function_fallback_sql(expression) 2307 2308 def randstr_sql(self, expression: exp.Randstr) -> str: 2309 """ 2310 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2311 Uses a pre-parsed template with placeholders replaced by expression nodes. 2312 2313 RANDSTR(length, generator) generates a random string of specified length. 2314 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2315 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2316 - No generator: Use default seed value 2317 """ 2318 length = expression.this 2319 generator = expression.args.get("generator") 2320 2321 if generator: 2322 if isinstance(generator, exp.Rand): 2323 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2324 seed_value = generator.this or generator 2325 else: 2326 # Const/int or other expression - use as seed directly 2327 seed_value = generator 2328 else: 2329 # No generator specified, use default seed (arbitrary but deterministic) 2330 seed_value = exp.Literal.number(RANDSTR_SEED) 2331 2332 replacements = {"seed": seed_value, "length": length} 2333 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2334 2335 @unsupported_args("finish") 2336 def reduce_sql(self, expression: exp.Reduce) -> str: 2337 array_arg = expression.this 2338 initial_value = expression.args.get("initial") 2339 merge_lambda = expression.args.get("merge") 2340 2341 if merge_lambda: 2342 merge_lambda.set("colon", True) 2343 2344 return self.func("list_reduce", array_arg, merge_lambda, initial_value) 2345 2346 def zipf_sql(self, expression: exp.Zipf) -> str: 2347 """ 2348 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2349 Uses a pre-parsed template with placeholders replaced by expression nodes. 2350 """ 2351 s = expression.this 2352 n = expression.args["elementcount"] 2353 gen = expression.args["gen"] 2354 2355 if not isinstance(gen, exp.Rand): 2356 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2357 random_expr: exp.Expr = exp.Div( 2358 this=exp.Paren( 2359 this=exp.Mod( 2360 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2361 expression=exp.Literal.number(1000000), 2362 ) 2363 ), 2364 expression=exp.Literal.number(1000000.0), 2365 ) 2366 else: 2367 # Use RANDOM() for non-deterministic output 2368 random_expr = exp.Rand() 2369 2370 replacements = {"s": s, "n": n, "random_expr": random_expr} 2371 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2372 2373 def tobinary_sql(self, expression: exp.ToBinary) -> str: 2374 """ 2375 TO_BINARY and TRY_TO_BINARY transpilation: 2376 - 'HEX': TO_BINARY('48454C50', 'HEX') -> UNHEX('48454C50') 2377 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') -> ENCODE('TEST') 2378 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') -> FROM_BASE64('SEVMUA==') 2379 2380 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2381 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') -> TRY(UNHEX('invalid')) 2382 """ 2383 value = expression.this 2384 format_arg = expression.args.get("format") 2385 is_safe = expression.args.get("safe") 2386 is_binary = _is_binary(expression) 2387 2388 if not format_arg and not is_binary: 2389 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2390 return self.func(func_name, value) 2391 2392 # Snowflake defaults to HEX encoding when no format is specified 2393 fmt = format_arg.name.upper() if format_arg else "HEX" 2394 2395 if fmt in ("UTF-8", "UTF8"): 2396 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2397 result = self.func("ENCODE", value) 2398 elif fmt == "BASE64": 2399 result = self.func("FROM_BASE64", value) 2400 elif fmt == "HEX": 2401 result = self.func("UNHEX", value) 2402 else: 2403 if is_safe: 2404 return self.sql(exp.null()) 2405 else: 2406 self.unsupported(f"format {fmt} is not supported") 2407 result = self.func("TO_BINARY", value) 2408 return f"TRY({result})" if is_safe else result 2409 2410 def tonumber_sql(self, expression: exp.ToNumber) -> str: 2411 fmt = expression.args.get("format") 2412 precision = expression.args.get("precision") 2413 scale = expression.args.get("scale") 2414 2415 if not fmt and precision and scale: 2416 return self.sql( 2417 exp.cast( 2418 expression.this, f"DECIMAL({precision.name}, {scale.name})", dialect="duckdb" 2419 ) 2420 ) 2421 2422 return super().tonumber_sql(expression) 2423 2424 def _greatest_least_sql(self, expression: exp.Greatest | exp.Least) -> str: 2425 """ 2426 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 2427 2428 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2429 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2430 """ 2431 # Get all arguments 2432 all_args = [expression.this, *expression.expressions] 2433 fallback_sql = self.function_fallback_sql(expression) 2434 2435 if expression.args.get("ignore_nulls"): 2436 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2437 return self.sql(fallback_sql) 2438 2439 # return NULL if any argument is NULL 2440 case_expr = exp.case().when( 2441 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2442 exp.null(), 2443 copy=False, 2444 ) 2445 case_expr.set("default", fallback_sql) 2446 return self.sql(case_expr) 2447 2448 def generator_sql(self, expression: exp.Generator) -> str: 2449 # Transpile Snowflake GENERATOR to DuckDB range() 2450 rowcount = expression.args.get("rowcount") 2451 time_limit = expression.args.get("time_limit") 2452 2453 if time_limit: 2454 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2455 2456 if not rowcount: 2457 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2458 return self.func("range", exp.Literal.number(0)) 2459 2460 return self.func("range", rowcount) 2461 2462 def greatest_sql(self, expression: exp.Greatest) -> str: 2463 return self._greatest_least_sql(expression) 2464 2465 def least_sql(self, expression: exp.Least) -> str: 2466 return self._greatest_least_sql(expression) 2467 2468 def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str: 2469 if expression.args.get("colon"): 2470 prefix = "LAMBDA " 2471 arrow_sep = ":" 2472 wrap = False 2473 else: 2474 prefix = "" 2475 2476 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2477 return f"{prefix}{lambda_sql}" 2478 2479 def show_sql(self, expression: exp.Show) -> str: 2480 from_ = self.sql(expression, "from_") 2481 from_ = f" FROM {from_}" if from_ else "" 2482 return f"SHOW {expression.name}{from_}" 2483 2484 def soundex_sql(self, expression: exp.Soundex) -> str: 2485 self.unsupported("SOUNDEX is not supported in DuckDB") 2486 return self.func("SOUNDEX", expression.this) 2487 2488 def sortarray_sql(self, expression: exp.SortArray) -> str: 2489 arr = expression.this 2490 asc = expression.args.get("asc") 2491 nulls_first = expression.args.get("nulls_first") 2492 2493 if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean): 2494 return self.func("LIST_SORT", arr, asc, nulls_first) 2495 2496 nulls_are_first = nulls_first == exp.true() 2497 nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None 2498 2499 if not isinstance(asc, exp.Boolean): 2500 return self.func("LIST_SORT", arr, asc, nulls_first_sql) 2501 2502 descending = asc == exp.false() 2503 2504 if not descending and not nulls_are_first: 2505 return self.func("LIST_SORT", arr) 2506 if not nulls_are_first: 2507 return self.func("ARRAY_REVERSE_SORT", arr) 2508 return self.func( 2509 "LIST_SORT", 2510 arr, 2511 exp.Literal.string("DESC" if descending else "ASC"), 2512 exp.Literal.string("NULLS FIRST"), 2513 ) 2514 2515 def install_sql(self, expression: exp.Install) -> str: 2516 force = "FORCE " if expression.args.get("force") else "" 2517 this = self.sql(expression, "this") 2518 from_clause = expression.args.get("from_") 2519 from_clause = f" FROM {from_clause}" if from_clause else "" 2520 return f"{force}INSTALL {this}{from_clause}" 2521 2522 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2523 self.unsupported( 2524 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. " 2525 ) 2526 return self.function_fallback_sql(expression) 2527 2528 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2529 return self.sql(exp.cast(expression.this, exp.DType.TIMESTAMPTZ)) 2530 2531 def strposition_sql(self, expression: exp.StrPosition) -> str: 2532 this = expression.this 2533 substr = expression.args.get("substr") 2534 position = expression.args.get("position") 2535 2536 # For BINARY/BLOB: DuckDB's STRPOS doesn't support BLOB types 2537 # Convert to HEX strings, use STRPOS, then convert hex position to byte position 2538 if _is_binary(this): 2539 # Build expression: STRPOS(HEX(haystack), HEX(needle)) 2540 hex_strpos = exp.StrPosition( 2541 this=exp.Hex(this=this), 2542 substr=exp.Hex(this=substr), 2543 ) 2544 2545 return self.sql(exp.cast((hex_strpos + 1) / 2, exp.DType.INT)) 2546 2547 # For VARCHAR: handle clamp_position 2548 if expression.args.get("clamp_position") and position: 2549 expression = expression.copy() 2550 expression.set( 2551 "position", 2552 exp.If( 2553 this=exp.LTE(this=position, expression=exp.Literal.number(0)), 2554 true=exp.Literal.number(1), 2555 false=position.copy(), 2556 ), 2557 ) 2558 2559 return strposition_sql(self, expression) 2560 2561 def substring_sql(self, expression: exp.Substring) -> str: 2562 if expression.args.get("zero_start"): 2563 start = expression.args.get("start") 2564 length = expression.args.get("length") 2565 2566 if start := expression.args.get("start"): 2567 start = exp.If(this=start.eq(0), true=exp.Literal.number(1), false=start) 2568 if length := expression.args.get("length"): 2569 length = exp.If(this=length < 0, true=exp.Literal.number(0), false=length) 2570 2571 return self.func("SUBSTRING", expression.this, start, length) 2572 2573 return self.function_fallback_sql(expression) 2574 2575 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2576 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2577 target_type = expression.args.get("target_type") 2578 needs_tz = target_type and target_type.this in ( 2579 exp.DType.TIMESTAMPLTZ, 2580 exp.DType.TIMESTAMPTZ, 2581 ) 2582 2583 if expression.args.get("safe"): 2584 formatted_time = self.format_time(expression) 2585 cast_type = exp.DType.TIMESTAMPTZ if needs_tz else exp.DType.TIMESTAMP 2586 return self.sql( 2587 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2588 ) 2589 2590 base_sql = str_to_time_sql(self, expression) 2591 if needs_tz: 2592 return self.sql( 2593 exp.cast( 2594 base_sql, 2595 exp.DataType(this=exp.DType.TIMESTAMPTZ), 2596 ) 2597 ) 2598 return base_sql 2599 2600 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2601 formatted_time = self.format_time(expression) 2602 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2603 return self.sql( 2604 exp.cast( 2605 self.func(function_name, expression.this, formatted_time), 2606 exp.DataType(this=exp.DType.DATE), 2607 ) 2608 ) 2609 2610 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2611 this = expression.this 2612 time_format = self.format_time(expression) 2613 safe = expression.args.get("safe") 2614 time_type = exp.DataType.build("TIME", dialect="duckdb") 2615 cast_expr = exp.TryCast if safe else exp.Cast 2616 2617 if time_format: 2618 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2619 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2620 return self.sql(cast_expr(this=strptime, to=time_type)) 2621 2622 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DType.TIME): 2623 return self.sql(this) 2624 2625 return self.sql(cast_expr(this=this, to=time_type)) 2626 2627 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2628 if not expression.this: 2629 return "CURRENT_DATE" 2630 2631 expr = exp.Cast( 2632 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2633 to=exp.DataType(this=exp.DType.DATE), 2634 ) 2635 return self.sql(expr) 2636 2637 def checkjson_sql(self, expression: exp.CheckJson) -> str: 2638 arg = expression.this 2639 return self.sql( 2640 exp.case() 2641 .when( 2642 exp.or_(arg.is_(exp.Null()), arg.eq(""), exp.func("json_valid", arg)), 2643 exp.null(), 2644 ) 2645 .else_(exp.Literal.string("Invalid JSON")) 2646 ) 2647 2648 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2649 arg = expression.this 2650 if expression.args.get("safe"): 2651 return self.sql( 2652 exp.case() 2653 .when(exp.func("json_valid", arg), exp.cast(arg.copy(), "JSON")) 2654 .else_(exp.null()) 2655 ) 2656 return self.func("JSON", arg) 2657 2658 def unicode_sql(self, expression: exp.Unicode) -> str: 2659 if expression.args.get("empty_is_zero"): 2660 return self.sql( 2661 exp.case() 2662 .when(expression.this.eq(exp.Literal.string("")), exp.Literal.number(0)) 2663 .else_(exp.Anonymous(this="UNICODE", expressions=[expression.this])) 2664 ) 2665 2666 return self.func("UNICODE", expression.this) 2667 2668 def stripnullvalue_sql(self, expression: exp.StripNullValue) -> str: 2669 return self.sql( 2670 exp.case() 2671 .when(exp.func("json_type", expression.this).eq("NULL"), exp.null()) 2672 .else_(expression.this) 2673 ) 2674 2675 def trunc_sql(self, expression: exp.Trunc) -> str: 2676 decimals = expression.args.get("decimals") 2677 if ( 2678 expression.args.get("fractions_supported") 2679 and decimals 2680 and not decimals.is_type(exp.DType.INT) 2681 ): 2682 decimals = exp.cast(decimals, exp.DType.INT, dialect="duckdb") 2683 2684 return self.func("TRUNC", expression.this, decimals) 2685 2686 def normal_sql(self, expression: exp.Normal) -> str: 2687 """ 2688 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2689 2690 Uses the Box-Muller transform via NORMAL_TEMPLATE. 2691 """ 2692 mean = expression.this 2693 stddev = expression.args["stddev"] 2694 gen: exp.Expr = expression.args["gen"] 2695 2696 # Build two uniform random values [0, 1) for Box-Muller transform 2697 if isinstance(gen, exp.Rand) and gen.this is None: 2698 u1: exp.Expr = exp.Rand() 2699 u2: exp.Expr = exp.Rand() 2700 else: 2701 # Seeded: derive two values using HASH with different inputs 2702 seed = gen.this if isinstance(gen, exp.Rand) else gen 2703 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2704 u2 = exp.replace_placeholders( 2705 self.SEEDED_RANDOM_TEMPLATE, 2706 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2707 ) 2708 2709 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2710 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2711 2712 def uniform_sql(self, expression: exp.Uniform) -> str: 2713 """ 2714 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2715 2716 UNIFORM returns a random value in [min, max]: 2717 - Integer result if both min and max are integers 2718 - Float result if either min or max is a float 2719 """ 2720 min_val = expression.this 2721 max_val = expression.expression 2722 gen = expression.args.get("gen") 2723 2724 # Determine if result should be integer (both bounds are integers). 2725 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2726 is_int_result = min_val.is_int and max_val.is_int 2727 2728 # Build the random value expression [0, 1) 2729 if not isinstance(gen, exp.Rand): 2730 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2731 random_expr: exp.Expr = exp.Div( 2732 this=exp.Paren( 2733 this=exp.Mod( 2734 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2735 expression=exp.Literal.number(1000000), 2736 ) 2737 ), 2738 expression=exp.Literal.number(1000000.0), 2739 ) 2740 else: 2741 random_expr = exp.Rand() 2742 2743 # Build: min + random * (max - min [+ 1 for int]) 2744 range_expr: exp.Expr = exp.Sub(this=max_val, expression=min_val) 2745 if is_int_result: 2746 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2747 2748 result: exp.Expr = exp.Add( 2749 this=min_val, 2750 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2751 ) 2752 2753 if is_int_result: 2754 result = exp.Cast(this=exp.Floor(this=result), to=exp.DType.BIGINT.into_expr()) 2755 2756 return self.sql(result) 2757 2758 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2759 nano = expression.args.get("nano") 2760 overflow = expression.args.get("overflow") 2761 2762 # Snowflake's TIME_FROM_PARTS supports overflow 2763 if overflow: 2764 hour = expression.args["hour"] 2765 minute = expression.args["min"] 2766 sec = expression.args["sec"] 2767 2768 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2769 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2770 try: 2771 h_val = hour.to_py() 2772 m_val = minute.to_py() 2773 s_val = sec.to_py() 2774 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2775 return rename_func("MAKE_TIME")(self, expression) 2776 except ValueError: 2777 pass 2778 2779 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2780 if nano: 2781 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2782 2783 total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2784 2785 return self.sql( 2786 exp.Add( 2787 this=exp.Cast( 2788 this=exp.Literal.string("00:00:00"), to=exp.DType.TIME.into_expr() 2789 ), 2790 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2791 ) 2792 ) 2793 2794 # Default: MAKE_TIME 2795 if nano: 2796 expression.set( 2797 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2798 ) 2799 2800 return rename_func("MAKE_TIME")(self, expression) 2801 2802 def extract_sql(self, expression: exp.Extract) -> str: 2803 """ 2804 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2805 2806 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2807 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2808 """ 2809 this = expression.this 2810 datetime_expr = expression.expression 2811 2812 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2813 # because Snowflake applies server timezone while DuckDB uses local timezone 2814 if datetime_expr.is_type(exp.DType.TIMESTAMPTZ, exp.DType.TIMESTAMPLTZ): 2815 self.unsupported( 2816 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2817 ) 2818 2819 part_name = this.name.upper() 2820 2821 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2822 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2823 2824 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2825 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2826 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2827 exp.DType.TIME, exp.DType.TIMETZ 2828 ) 2829 2830 if is_nano_time: 2831 self.unsupported("Parameter NANOSECOND is not supported with TIME type in DuckDB") 2832 return self.sql( 2833 exp.cast( 2834 exp.Mul( 2835 this=exp.Extract(this=exp.var("MICROSECOND"), expression=datetime_expr), 2836 expression=exp.Literal.number(1000), 2837 ), 2838 exp.DataType.build(cast_type, dialect="duckdb"), 2839 ) 2840 ) 2841 2842 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2843 strftime_input = datetime_expr 2844 if part_name == "NANOSECOND": 2845 strftime_input = exp.cast(datetime_expr, exp.DType.TIMESTAMP_NS) 2846 2847 return self.sql( 2848 exp.cast( 2849 exp.Anonymous( 2850 this="STRFTIME", 2851 expressions=[strftime_input, exp.Literal.string(fmt)], 2852 ), 2853 exp.DataType.build(cast_type, dialect="duckdb"), 2854 ) 2855 ) 2856 2857 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2858 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2859 result: exp.Expr = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2860 # EPOCH returns float, cast to BIGINT for integer result 2861 if part_name == "EPOCH_SECOND": 2862 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2863 return self.sql(result) 2864 2865 return super().extract_sql(expression) 2866 2867 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2868 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2869 date_expr = expression.this 2870 time_expr = expression.expression 2871 2872 if date_expr is not None and time_expr is not None: 2873 # In DuckDB, DATE + TIME produces TIMESTAMP 2874 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2875 2876 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 2877 sec = expression.args.get("sec") 2878 if sec is None: 2879 # This shouldn't happen with valid input, but handle gracefully 2880 return rename_func("MAKE_TIMESTAMP")(self, expression) 2881 2882 milli = expression.args.get("milli") 2883 if milli is not None: 2884 sec += milli.pop() / exp.Literal.number(1000.0) 2885 2886 nano = expression.args.get("nano") 2887 if nano is not None: 2888 sec += nano.pop() / exp.Literal.number(1000000000.0) 2889 2890 if milli or nano: 2891 expression.set("sec", sec) 2892 2893 return rename_func("MAKE_TIMESTAMP")(self, expression) 2894 2895 @unsupported_args("nano") 2896 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2897 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2898 if nano := expression.args.get("nano"): 2899 nano.pop() 2900 2901 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2902 return f"CAST({timestamp} AS TIMESTAMPTZ)" 2903 2904 @unsupported_args("nano") 2905 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2906 # Extract zone before popping 2907 zone = expression.args.get("zone") 2908 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2909 if zone: 2910 zone = zone.pop() 2911 2912 if nano := expression.args.get("nano"): 2913 nano.pop() 2914 2915 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2916 2917 if zone: 2918 # Use AT TIME ZONE to apply the explicit timezone 2919 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2920 2921 return timestamp 2922 2923 def tablesample_sql( 2924 self, 2925 expression: exp.TableSample, 2926 tablesample_keyword: str | None = None, 2927 ) -> str: 2928 if not isinstance(expression.parent, exp.Select): 2929 # This sample clause only applies to a single source, not the entire resulting relation 2930 tablesample_keyword = "TABLESAMPLE" 2931 2932 if expression.args.get("size"): 2933 method = expression.args.get("method") 2934 if method and method.name.upper() != "RESERVOIR": 2935 self.unsupported( 2936 f"Sampling method {method} is not supported with a discrete sample count, " 2937 "defaulting to reservoir sampling" 2938 ) 2939 expression.set("method", exp.var("RESERVOIR")) 2940 2941 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 2942 2943 def join_sql(self, expression: exp.Join) -> str: 2944 if ( 2945 not expression.args.get("using") 2946 and not expression.args.get("on") 2947 and not expression.method 2948 and (expression.kind in ("", "INNER", "OUTER")) 2949 ): 2950 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2951 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2952 if isinstance(expression.this, exp.Unnest): 2953 return super().join_sql(expression.on(exp.true())) 2954 2955 expression.set("side", None) 2956 expression.set("kind", None) 2957 2958 return super().join_sql(expression) 2959 2960 def countif_sql(self, expression: exp.CountIf) -> str: 2961 if self.dialect.version >= (1, 2): 2962 return self.function_fallback_sql(expression) 2963 2964 # https://github.com/tobymao/sqlglot/pull/4749 2965 return count_if_to_sum(self, expression) 2966 2967 def bracket_sql(self, expression: exp.Bracket) -> str: 2968 if self.dialect.version >= (1, 2): 2969 return super().bracket_sql(expression) 2970 2971 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2972 this = expression.this 2973 if isinstance(this, exp.Array): 2974 this.replace(exp.paren(this)) 2975 2976 bracket = super().bracket_sql(expression) 2977 2978 if not expression.args.get("returns_list_for_maps"): 2979 if not this.type: 2980 from sqlglot.optimizer.annotate_types import annotate_types 2981 2982 this = annotate_types(this, dialect=self.dialect) 2983 2984 if this.is_type(exp.DType.MAP): 2985 bracket = f"({bracket})[1]" 2986 2987 return bracket 2988 2989 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2990 func = expression.this 2991 2992 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2993 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2994 if isinstance(func, exp.ArrayAgg): 2995 if not isinstance(order := expression.expression, exp.Order): 2996 return self.sql(func) 2997 2998 # Save the original column for FILTER clause (before wrapping with Order) 2999 original_this = func.this 3000 3001 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 3002 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 3003 func.set( 3004 "this", 3005 exp.Order( 3006 this=func.this.copy(), 3007 expressions=order.expressions, 3008 ), 3009 ) 3010 3011 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 3012 # Use original_this (not the Order-wrapped version) for the FILTER condition 3013 array_agg_sql = self.function_fallback_sql(func) 3014 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 3015 3016 # For other functions (like PERCENTILES), use existing logic 3017 expression_sql = self.sql(expression, "expression") 3018 3019 if isinstance(func, exp.PERCENTILES): 3020 # Make the order key the first arg and slide the fraction to the right 3021 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 3022 order_col = expression.find(exp.Ordered) 3023 if order_col: 3024 func.set("expression", func.this) 3025 func.set("this", order_col.this) 3026 3027 this = self.sql(expression, "this").rstrip(")") 3028 3029 return f"{this}{expression_sql})" 3030 3031 def length_sql(self, expression: exp.Length) -> str: 3032 arg = expression.this 3033 3034 # Dialects like BQ and Snowflake also accept binary values as args, so 3035 # DDB will attempt to infer the type or resort to case/when resolution 3036 if not expression.args.get("binary") or arg.is_string: 3037 return self.func("LENGTH", arg) 3038 3039 if not arg.type: 3040 from sqlglot.optimizer.annotate_types import annotate_types 3041 3042 arg = annotate_types(arg, dialect=self.dialect) 3043 3044 if arg.is_type(*exp.DataType.TEXT_TYPES): 3045 return self.func("LENGTH", arg) 3046 3047 # We need these casts to make duckdb's static type checker happy 3048 blob = exp.cast(arg, exp.DType.VARBINARY) 3049 varchar = exp.cast(arg, exp.DType.VARCHAR) 3050 3051 case = ( 3052 exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg])) 3053 .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob)) 3054 .else_(exp.Anonymous(this="LENGTH", expressions=[varchar])) 3055 ) 3056 return self.sql(case) 3057 3058 def bitlength_sql(self, expression: exp.BitLength) -> str: 3059 if not _is_binary(arg := expression.this): 3060 return self.func("BIT_LENGTH", arg) 3061 3062 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 3063 return self.sql(exp.ByteLength(this=blob) * exp.Literal.number(8)) 3064 3065 def chr_sql(self, expression: exp.Chr, name: str = "CHR") -> str: 3066 arg = expression.expressions[0] 3067 if arg.is_type(*exp.DataType.REAL_TYPES): 3068 arg = exp.cast(arg, exp.DType.INT) 3069 return self.func("CHR", arg) 3070 3071 def collation_sql(self, expression: exp.Collation) -> str: 3072 self.unsupported("COLLATION function is not supported by DuckDB") 3073 return self.function_fallback_sql(expression) 3074 3075 def collate_sql(self, expression: exp.Collate) -> str: 3076 if not expression.expression.is_string: 3077 return super().collate_sql(expression) 3078 3079 raw = expression.expression.name 3080 if not raw: 3081 return self.sql(expression.this) 3082 3083 parts = [] 3084 for part in raw.split("-"): 3085 lower = part.lower() 3086 if lower not in _SNOWFLAKE_COLLATION_DEFAULTS: 3087 if lower in _SNOWFLAKE_COLLATION_UNSUPPORTED: 3088 self.unsupported( 3089 f"Snowflake collation specifier '{part}' has no DuckDB equivalent" 3090 ) 3091 parts.append(lower) 3092 3093 if not parts: 3094 return self.sql(expression.this) 3095 return super().collate_sql( 3096 exp.Collate(this=expression.this, expression=exp.var(".".join(parts))) 3097 ) 3098 3099 def _validate_regexp_flags(self, flags: exp.Expr | None, supported_flags: str) -> str | None: 3100 """ 3101 Validate and filter regexp flags for DuckDB compatibility. 3102 3103 Args: 3104 flags: The flags expression to validate 3105 supported_flags: String of supported flags (e.g., "ims", "cims"). 3106 Only these flags will be returned. 3107 3108 Returns: 3109 Validated/filtered flag string, or None if no valid flags remain 3110 """ 3111 if not isinstance(flags, exp.Expr): 3112 return None 3113 3114 if not flags.is_string: 3115 self.unsupported("Non-literal regexp flags are not fully supported in DuckDB") 3116 return None 3117 3118 flag_str = flags.this 3119 unsupported = set(flag_str) - set(supported_flags) 3120 3121 if unsupported: 3122 self.unsupported( 3123 f"Regexp flags {sorted(unsupported)} are not supported in this context" 3124 ) 3125 3126 flag_str = "".join(f for f in flag_str if f in supported_flags) 3127 return flag_str if flag_str else None 3128 3129 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3130 this = expression.this 3131 pattern = expression.expression 3132 position = expression.args.get("position") 3133 parameters = expression.args.get("parameters") 3134 3135 # Validate flags - only "ims" flags are supported for embedded patterns 3136 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3137 3138 if position: 3139 this = exp.Substring(this=this, start=position) 3140 3141 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3142 if validated_flags: 3143 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3144 3145 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3146 result = ( 3147 exp.case() 3148 .when( 3149 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3150 exp.Literal.number(0), 3151 ) 3152 .else_( 3153 exp.Length( 3154 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3155 ) 3156 ) 3157 ) 3158 3159 return self.sql(result) 3160 3161 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3162 subject = expression.this 3163 pattern = expression.expression 3164 replacement = expression.args.get("replacement") or exp.Literal.string("") 3165 position = expression.args.get("position") 3166 occurrence = expression.args.get("occurrence") 3167 modifiers = expression.args.get("modifiers") 3168 3169 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3170 3171 # Handle occurrence (only literals supported) 3172 if occurrence and not occurrence.is_int: 3173 self.unsupported("REGEXP_REPLACE with non-literal occurrence") 3174 else: 3175 occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0 3176 if occurrence > 1: 3177 self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported") 3178 # flag duckdb to do either all or none, single_replace check is for duckdb round trip 3179 elif ( 3180 occurrence == 0 3181 and "g" not in validated_flags 3182 and not expression.args.get("single_replace") 3183 ): 3184 validated_flags += "g" 3185 3186 # Handle position (only literals supported) 3187 prefix = None 3188 if position and not position.is_int: 3189 self.unsupported("REGEXP_REPLACE with non-literal position") 3190 elif position and position.is_int and position.to_py() > 1: 3191 pos = position.to_py() 3192 prefix = exp.Substring( 3193 this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1) 3194 ) 3195 subject = exp.Substring(this=subject, start=exp.Literal.number(pos)) 3196 3197 result: exp.Expr = exp.Anonymous( 3198 this="REGEXP_REPLACE", 3199 expressions=[ 3200 subject, 3201 pattern, 3202 replacement, 3203 exp.Literal.string(validated_flags) if validated_flags else None, 3204 ], 3205 ) 3206 3207 if prefix: 3208 result = exp.Concat(expressions=[prefix, result]) 3209 3210 return self.sql(result) 3211 3212 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3213 this = expression.this 3214 pattern = expression.expression 3215 flag = expression.args.get("flag") 3216 3217 if expression.args.get("full_match"): 3218 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3219 flag = exp.Literal.string(validated_flags) if validated_flags else None 3220 return self.func("REGEXP_FULL_MATCH", this, pattern, flag) 3221 3222 return self.func("REGEXP_MATCHES", this, pattern, flag) 3223 3224 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3225 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3226 this = expression.this 3227 expr = expression.expression 3228 max_dist = expression.args.get("max_dist") 3229 3230 if max_dist is None: 3231 return self.func("LEVENSHTEIN", this, expr) 3232 3233 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3234 levenshtein = exp.Levenshtein(this=this, expression=expr) 3235 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3236 3237 def pad_sql(self, expression: exp.Pad) -> str: 3238 """ 3239 Handle RPAD/LPAD for VARCHAR and BINARY types. 3240 3241 For VARCHAR: Delegate to parent class 3242 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3243 """ 3244 string_arg = expression.this 3245 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3246 3247 if _is_binary(string_arg) or _is_binary(fill_arg): 3248 length_arg = expression.expression 3249 is_left = expression.args.get("is_left") 3250 3251 input_len = exp.ByteLength(this=string_arg) 3252 chars_needed = length_arg - input_len 3253 pad_count = exp.Greatest( 3254 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3255 ) 3256 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3257 3258 left, right = string_arg, repeat_expr 3259 if is_left: 3260 left, right = right, left 3261 3262 result = exp.DPipe(this=left, expression=right) 3263 return self.sql(result) 3264 3265 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3266 return super().pad_sql(expression) 3267 3268 def minhash_sql(self, expression: exp.Minhash) -> str: 3269 k = expression.this 3270 exprs = expression.expressions 3271 3272 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3273 self.unsupported( 3274 "MINHASH with multiple expressions or * requires manual query restructuring" 3275 ) 3276 return self.func("MINHASH", k, *exprs) 3277 3278 expr = exprs[0] 3279 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3280 return f"({self.sql(result)})" 3281 3282 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3283 expr = expression.this 3284 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3285 return f"({self.sql(result)})" 3286 3287 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3288 expr = expression.this 3289 result = exp.replace_placeholders(self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr) 3290 return f"({self.sql(result)})" 3291 3292 def arrayuniqueagg_sql(self, expression: exp.ArrayUniqueAgg) -> str: 3293 return self.sql( 3294 exp.Filter( 3295 this=exp.func("LIST", exp.Distinct(expressions=[expression.this])), 3296 expression=exp.Where(this=expression.this.copy().is_(exp.null()).not_()), 3297 ) 3298 ) 3299 3300 def arrayunionagg_sql(self, expression: exp.ArrayUnionAgg) -> str: 3301 self.unsupported("ARRAY_UNION_AGG is not supported in DuckDB") 3302 return self.function_fallback_sql(expression) 3303 3304 def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str: 3305 arr = expression.this 3306 func = self.func("LIST_DISTINCT", arr) 3307 3308 if expression.args.get("check_null"): 3309 add_null_to_array = exp.func( 3310 "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null() 3311 ) 3312 return self.sql( 3313 exp.If( 3314 this=exp.NEQ( 3315 this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr) 3316 ), 3317 true=add_null_to_array, 3318 false=func, 3319 ) 3320 ) 3321 3322 return func 3323 3324 def arrayintersect_sql(self, expression: exp.ArrayIntersect) -> str: 3325 if expression.args.get("is_multiset") and len(expression.expressions) == 2: 3326 return self._array_bag_sql( 3327 self.ARRAY_INTERSECTION_CONDITION, 3328 expression.expressions[0], 3329 expression.expressions[1], 3330 ) 3331 return self.function_fallback_sql(expression) 3332 3333 def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str: 3334 arr1, arr2 = expression.this, expression.expression 3335 if expression.args.get("is_multiset"): 3336 return self._array_bag_sql(self.ARRAY_EXCEPT_CONDITION, arr1, arr2) 3337 return self.sql( 3338 exp.replace_placeholders(self.ARRAY_EXCEPT_SET_TEMPLATE, arr1=arr1, arr2=arr2) 3339 ) 3340 3341 def arrayslice_sql(self, expression: exp.ArraySlice) -> str: 3342 """ 3343 Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's 3344 ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE 3345 expressions that adjust the index at query time: 3346 - start: CASE WHEN start >= 0 THEN start + 1 ELSE start END 3347 - end: CASE WHEN end < 0 THEN end - 1 ELSE end END 3348 """ 3349 start, end = expression.args.get("start"), expression.args.get("end") 3350 3351 if expression.args.get("zero_based"): 3352 if start is not None: 3353 start = ( 3354 exp.case() 3355 .when( 3356 exp.GTE(this=start.copy(), expression=exp.Literal.number(0)), 3357 exp.Add(this=start.copy(), expression=exp.Literal.number(1)), 3358 ) 3359 .else_(start) 3360 ) 3361 if end is not None: 3362 end = ( 3363 exp.case() 3364 .when( 3365 exp.LT(this=end.copy(), expression=exp.Literal.number(0)), 3366 exp.Sub(this=end.copy(), expression=exp.Literal.number(1)), 3367 ) 3368 .else_(end) 3369 ) 3370 3371 return self.func("ARRAY_SLICE", expression.this, start, end, expression.args.get("step")) 3372 3373 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3374 args = expression.expressions 3375 3376 if not args: 3377 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3378 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3379 3380 # Build placeholder values for template 3381 lengths = [exp.Length(this=arg) for arg in args] 3382 max_len = ( 3383 lengths[0] 3384 if len(lengths) == 1 3385 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3386 ) 3387 3388 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3389 empty_struct = exp.func( 3390 "STRUCT", 3391 *[ 3392 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3393 for i in range(len(args)) 3394 ], 3395 ) 3396 3397 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3398 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3399 index = exp.column("__i") + 1 3400 transform_struct = exp.func( 3401 "STRUCT", 3402 *[ 3403 exp.PropertyEQ( 3404 this=exp.Literal.string(f"${i + 1}"), 3405 expression=exp.func("COALESCE", arg, exp.array())[index], 3406 ) 3407 for i, arg in enumerate(args) 3408 ], 3409 ) 3410 3411 result = exp.replace_placeholders( 3412 self.ARRAYS_ZIP_TEMPLATE.copy(), 3413 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3414 all_empty_check=exp.and_( 3415 *[ 3416 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3417 for arg in args 3418 ] 3419 ), 3420 empty_struct=empty_struct, 3421 max_len=max_len, 3422 transform_struct=transform_struct, 3423 ) 3424 return self.sql(result) 3425 3426 def lower_sql(self, expression: exp.Lower) -> str: 3427 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3428 return _gen_with_cast_to_blob(self, expression, result_sql) 3429 3430 def upper_sql(self, expression: exp.Upper) -> str: 3431 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3432 return _gen_with_cast_to_blob(self, expression, result_sql) 3433 3434 def reverse_sql(self, expression: exp.Reverse) -> str: 3435 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3436 return _gen_with_cast_to_blob(self, expression, result_sql) 3437 3438 def _left_right_sql(self, expression: exp.Left | exp.Right, func_name: str) -> str: 3439 arg = expression.this 3440 length = expression.expression 3441 is_binary = _is_binary(arg) 3442 3443 if is_binary: 3444 # LEFT/RIGHT(blob, n) becomes UNHEX(LEFT/RIGHT(HEX(blob), n * 2)) 3445 # Each byte becomes 2 hex chars, so multiply length by 2 3446 hex_arg = exp.Hex(this=arg) 3447 hex_length = exp.Mul(this=length, expression=exp.Literal.number(2)) 3448 result: exp.Expression = exp.Unhex( 3449 this=exp.Anonymous(this=func_name, expressions=[hex_arg, hex_length]) 3450 ) 3451 else: 3452 result = exp.Anonymous(this=func_name, expressions=[arg, length]) 3453 3454 if expression.args.get("negative_length_returns_empty"): 3455 empty: exp.Expression = exp.Literal.string("") 3456 if is_binary: 3457 empty = exp.Unhex(this=empty) 3458 result = exp.case().when(length < exp.Literal.number(0), empty).else_(result) 3459 3460 return self.sql(result) 3461 3462 def left_sql(self, expression: exp.Left) -> str: 3463 return self._left_right_sql(expression, "LEFT") 3464 3465 def right_sql(self, expression: exp.Right) -> str: 3466 return self._left_right_sql(expression, "RIGHT") 3467 3468 def rtrimmedlength_sql(self, expression: exp.RtrimmedLength) -> str: 3469 return self.func("LENGTH", exp.Trim(this=expression.this, position="TRAILING")) 3470 3471 def stuff_sql(self, expression: exp.Stuff) -> str: 3472 base = expression.this 3473 start = expression.args["start"] 3474 length = expression.args["length"] 3475 insertion = expression.expression 3476 is_binary = _is_binary(base) 3477 3478 if is_binary: 3479 # DuckDB's SUBSTRING doesn't accept BLOB; operate on the HEX string instead 3480 # (each byte = 2 hex chars), then UNHEX back to BLOB 3481 base = exp.Hex(this=base) 3482 insertion = exp.Hex(this=insertion) 3483 left = exp.Substring( 3484 this=base.copy(), 3485 start=exp.Literal.number(1), 3486 length=(start.copy() - exp.Literal.number(1)) * exp.Literal.number(2), 3487 ) 3488 right = exp.Substring( 3489 this=base.copy(), 3490 start=((start + length) - exp.Literal.number(1)) * exp.Literal.number(2) 3491 + exp.Literal.number(1), 3492 ) 3493 else: 3494 left = exp.Substring( 3495 this=base.copy(), 3496 start=exp.Literal.number(1), 3497 length=start.copy() - exp.Literal.number(1), 3498 ) 3499 right = exp.Substring(this=base.copy(), start=start + length) 3500 result: exp.Expr = exp.DPipe( 3501 this=exp.DPipe(this=left, expression=insertion), expression=right 3502 ) 3503 3504 if is_binary: 3505 result = exp.Unhex(this=result) 3506 3507 return self.sql(result) 3508 3509 def rand_sql(self, expression: exp.Rand) -> str: 3510 seed = expression.this 3511 if seed is not None: 3512 self.unsupported("RANDOM with seed is not supported in DuckDB") 3513 3514 lower = expression.args.get("lower") 3515 upper = expression.args.get("upper") 3516 3517 if lower and upper: 3518 # scale DuckDB's [0,1) to the specified range 3519 range_size = exp.paren(upper - lower) 3520 scaled = exp.Add(this=lower, expression=exp.func("random") * range_size) 3521 3522 # For now we assume that if bounds are set, return type is BIGINT. Snowflake/Teradata 3523 result = exp.cast(scaled, exp.DType.BIGINT) 3524 return self.sql(result) 3525 3526 # Default DuckDB behavior - just return RANDOM() as float 3527 return "RANDOM()" 3528 3529 def bytelength_sql(self, expression: exp.ByteLength) -> str: 3530 arg = expression.this 3531 3532 # Check if it's a text type (handles both literals and annotated expressions) 3533 if arg.is_type(*exp.DataType.TEXT_TYPES): 3534 return self.func("OCTET_LENGTH", exp.Encode(this=arg)) 3535 3536 # Default: pass through as-is (conservative for DuckDB, handles binary and unannotated) 3537 return self.func("OCTET_LENGTH", arg) 3538 3539 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3540 # DuckDB TO_BASE64 requires BLOB input 3541 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3542 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3543 result = expression.this 3544 3545 # Check if input is a string type - ENCODE only accepts VARCHAR 3546 if result.is_type(*exp.DataType.TEXT_TYPES): 3547 result = exp.Encode(this=result) 3548 3549 result = exp.ToBase64(this=result) 3550 3551 max_line_length = expression.args.get("max_line_length") 3552 alphabet = expression.args.get("alphabet") 3553 3554 # Handle custom alphabet by replacing standard chars with custom ones 3555 result = _apply_base64_alphabet_replacements(result, alphabet) 3556 3557 # Handle max_line_length by inserting newlines every N characters 3558 line_length = ( 3559 t.cast(int, max_line_length.to_py()) 3560 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3561 else 0 3562 ) 3563 if line_length > 0: 3564 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3565 result = exp.Trim( 3566 this=exp.RegexpReplace( 3567 this=result, 3568 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3569 replacement=exp.Concat(expressions=[exp.Literal.string("\\1"), newline.copy()]), 3570 ), 3571 expression=newline, 3572 position="TRAILING", 3573 ) 3574 3575 return self.sql(result) 3576 3577 def replace_sql(self, expression: exp.Replace) -> str: 3578 result_sql = self.func( 3579 "REPLACE", 3580 _cast_to_varchar(expression.this), 3581 _cast_to_varchar(expression.expression), 3582 _cast_to_varchar(expression.args.get("replacement")), 3583 ) 3584 return _gen_with_cast_to_blob(self, expression, result_sql) 3585 3586 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3587 _prepare_binary_bitwise_args(expression) 3588 result_sql = self.binary(expression, op) 3589 return _gen_with_cast_to_blob(self, expression, result_sql) 3590 3591 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3592 _prepare_binary_bitwise_args(expression) 3593 result_sql = self.func("XOR", expression.this, expression.expression) 3594 return _gen_with_cast_to_blob(self, expression, result_sql) 3595 3596 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3597 this = expression.this 3598 key = expression.args.get("key") 3599 key_sql = key.name if isinstance(key, exp.Expr) else "" 3600 value_sql = self.sql(expression, "value") 3601 3602 kv_sql = f"{key_sql} := {value_sql}" 3603 3604 # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3605 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3606 if isinstance(this, exp.Struct) and not this.expressions: 3607 return self.func("STRUCT_PACK", kv_sql) 3608 3609 return self.func("STRUCT_INSERT", this, kv_sql) 3610 3611 def mapcat_sql(self, expression: exp.MapCat) -> str: 3612 result = exp.replace_placeholders( 3613 self.MAPCAT_TEMPLATE.copy(), 3614 map1=expression.this, 3615 map2=expression.expression, 3616 ) 3617 return self.sql(result) 3618 3619 def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str: 3620 return self.func( 3621 "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this 3622 ) 3623 3624 def mapdelete_sql(self, expression: exp.MapDelete) -> str: 3625 map_arg = expression.this 3626 keys_to_delete = expression.expressions 3627 3628 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3629 3630 lambda_expr = exp.Lambda( 3631 this=exp.In(this=x_dot_key, expressions=keys_to_delete).not_(), 3632 expressions=[exp.to_identifier("x")], 3633 ) 3634 result = exp.func( 3635 "MAP_FROM_ENTRIES", 3636 exp.ArrayFilter(this=exp.func("MAP_ENTRIES", map_arg), expression=lambda_expr), 3637 ) 3638 return self.sql(result) 3639 3640 def mappick_sql(self, expression: exp.MapPick) -> str: 3641 map_arg = expression.this 3642 keys_to_pick = expression.expressions 3643 3644 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3645 3646 if len(keys_to_pick) == 1 and keys_to_pick[0].is_type(exp.DType.ARRAY): 3647 lambda_expr = exp.Lambda( 3648 this=exp.func("ARRAY_CONTAINS", keys_to_pick[0], x_dot_key), 3649 expressions=[exp.to_identifier("x")], 3650 ) 3651 else: 3652 lambda_expr = exp.Lambda( 3653 this=exp.In(this=x_dot_key, expressions=keys_to_pick), 3654 expressions=[exp.to_identifier("x")], 3655 ) 3656 3657 result = exp.func( 3658 "MAP_FROM_ENTRIES", 3659 exp.func("LIST_FILTER", exp.func("MAP_ENTRIES", map_arg), lambda_expr), 3660 ) 3661 return self.sql(result) 3662 3663 def mapsize_sql(self, expression: exp.MapSize) -> str: 3664 return self.func("CARDINALITY", expression.this) 3665 3666 @unsupported_args("update_flag") 3667 def mapinsert_sql(self, expression: exp.MapInsert) -> str: 3668 map_arg = expression.this 3669 key = expression.args.get("key") 3670 value = expression.args.get("value") 3671 3672 map_type = map_arg.type 3673 3674 if value is not None: 3675 if map_type and map_type.expressions and len(map_type.expressions) > 1: 3676 # Extract the value type from MAP(key_type, value_type) 3677 value_type = map_type.expressions[1] 3678 # Cast value to match the map's value type to avoid type conflicts 3679 value = exp.cast(value, value_type) 3680 # else: polymorphic MAP case - no type parameters available, use value as-is 3681 3682 # Create a single-entry map for the new key-value pair 3683 new_entry_struct = exp.Struct(expressions=[exp.PropertyEQ(this=key, expression=value)]) 3684 new_entry: exp.Expression = exp.ToMap(this=new_entry_struct) 3685 3686 # Use MAP_CONCAT to merge the original map with the new entry 3687 # This automatically handles both insert and update cases 3688 result = exp.func("MAP_CONCAT", map_arg, new_entry) 3689 3690 return self.sql(result) 3691 3692 def startswith_sql(self, expression: exp.StartsWith) -> str: 3693 return self.func( 3694 "STARTS_WITH", 3695 _cast_to_varchar(expression.this), 3696 _cast_to_varchar(expression.expression), 3697 ) 3698 3699 def space_sql(self, expression: exp.Space) -> str: 3700 # DuckDB's REPEAT requires BIGINT for the count parameter 3701 return self.sql( 3702 exp.Repeat( 3703 this=exp.Literal.string(" "), 3704 times=exp.cast(expression.this, exp.DType.BIGINT), 3705 ) 3706 ) 3707 3708 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3709 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3710 if isinstance(expression.this, exp.Generator): 3711 # Preserve alias, joins, and other table-level args 3712 table = exp.Table( 3713 this=expression.this, 3714 alias=expression.args.get("alias"), 3715 joins=expression.args.get("joins"), 3716 ) 3717 return self.sql(table) 3718 3719 return super().tablefromrows_sql(expression) 3720 3721 def unnest_sql(self, expression: exp.Unnest) -> str: 3722 explode_array = expression.args.get("explode_array") 3723 if explode_array: 3724 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3725 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3726 expression.expressions.append( 3727 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3728 ) 3729 3730 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3731 alias = expression.args.get("alias") 3732 if isinstance(alias, exp.TableAlias): 3733 expression.set("alias", None) 3734 if alias.columns: 3735 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3736 3737 unnest_sql = super().unnest_sql(expression) 3738 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3739 return self.sql(select) 3740 3741 return super().unnest_sql(expression) 3742 3743 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3744 this = expression.this 3745 3746 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3747 # DuckDB should render IGNORE NULLS only for the general-purpose 3748 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 3749 return super().ignorenulls_sql(expression) 3750 3751 if isinstance(this, exp.First): 3752 this = exp.AnyValue(this=this.this) 3753 3754 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3755 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3756 3757 return self.sql(this) 3758 3759 def split_sql(self, expression: exp.Split) -> str: 3760 base_func = exp.func("STR_SPLIT", expression.this, expression.expression) 3761 3762 case_expr = exp.case().else_(base_func) 3763 needs_case = False 3764 3765 if expression.args.get("null_returns_null"): 3766 case_expr = case_expr.when(expression.expression.is_(exp.null()), exp.null()) 3767 needs_case = True 3768 3769 if expression.args.get("empty_delimiter_returns_whole"): 3770 # When delimiter is empty string, return input string as single array element 3771 array_with_input = exp.array(expression.this) 3772 case_expr = case_expr.when( 3773 expression.expression.eq(exp.Literal.string("")), array_with_input 3774 ) 3775 needs_case = True 3776 3777 return self.sql(case_expr if needs_case else base_func) 3778 3779 def splitpart_sql(self, expression: exp.SplitPart) -> str: 3780 string_arg = expression.this 3781 delimiter_arg = expression.args.get("delimiter") 3782 part_index_arg = expression.args.get("part_index") 3783 3784 if delimiter_arg and part_index_arg: 3785 # Handle Snowflake's "index 0 and 1 both return first element" behavior 3786 if expression.args.get("part_index_zero_as_one"): 3787 # Convert 0 to 1 for compatibility 3788 3789 part_index_arg = exp.Paren( 3790 this=exp.case() 3791 .when(part_index_arg.eq(exp.Literal.number("0")), exp.Literal.number("1")) 3792 .else_(part_index_arg) 3793 ) 3794 3795 # Use Anonymous to avoid recursion 3796 base_func_expr: exp.Expr = exp.Anonymous( 3797 this="SPLIT_PART", expressions=[string_arg, delimiter_arg, part_index_arg] 3798 ) 3799 needs_case_transform = False 3800 case_expr = exp.case().else_(base_func_expr) 3801 3802 if expression.args.get("empty_delimiter_returns_whole"): 3803 # When delimiter is empty string: 3804 # - Return whole string if part_index is 1 or -1 3805 # - Return empty string otherwise 3806 empty_case = exp.Paren( 3807 this=exp.case() 3808 .when( 3809 exp.or_( 3810 part_index_arg.eq(exp.Literal.number("1")), 3811 part_index_arg.eq(exp.Literal.number("-1")), 3812 ), 3813 string_arg, 3814 ) 3815 .else_(exp.Literal.string("")) 3816 ) 3817 3818 case_expr = case_expr.when(delimiter_arg.eq(exp.Literal.string("")), empty_case) 3819 needs_case_transform = True 3820 3821 """ 3822 Output looks something like this: 3823 3824 CASE 3825 WHEN delimiter is '' THEN 3826 ( 3827 CASE 3828 WHEN adjusted_part_index = 1 OR adjusted_part_index = -1 THEN input 3829 ELSE '' END 3830 ) 3831 ELSE SPLIT_PART(input, delimiter, adjusted_part_index) 3832 END 3833 3834 """ 3835 return self.sql(case_expr if needs_case_transform else base_func_expr) 3836 3837 return self.function_fallback_sql(expression) 3838 3839 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3840 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3841 # DuckDB should render RESPECT NULLS only for the general-purpose 3842 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 3843 return super().respectnulls_sql(expression) 3844 3845 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3846 return self.sql(expression, "this") 3847 3848 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3849 null = expression.args.get("null") 3850 3851 if expression.args.get("null_is_empty"): 3852 x = exp.to_identifier("x") 3853 list_transform = exp.Transform( 3854 this=expression.this.copy(), 3855 expression=exp.Lambda( 3856 this=exp.Coalesce( 3857 this=exp.cast(x, "TEXT"), expressions=[exp.Literal.string("")] 3858 ), 3859 expressions=[x], 3860 ), 3861 ) 3862 array_to_string = exp.ArrayToString( 3863 this=list_transform, expression=expression.expression 3864 ) 3865 if expression.args.get("null_delim_is_null"): 3866 return self.sql( 3867 exp.case() 3868 .when(expression.expression.copy().is_(exp.null()), exp.null()) 3869 .else_(array_to_string) 3870 ) 3871 return self.sql(array_to_string) 3872 3873 if null: 3874 x = exp.to_identifier("x") 3875 return self.sql( 3876 exp.ArrayToString( 3877 this=exp.Transform( 3878 this=expression.this, 3879 expression=exp.Lambda( 3880 this=exp.Coalesce(this=x, expressions=[null]), 3881 expressions=[x], 3882 ), 3883 ), 3884 expression=expression.expression, 3885 ) 3886 ) 3887 3888 return self.func("ARRAY_TO_STRING", expression.this, expression.expression) 3889 3890 def concatws_sql(self, expression: exp.ConcatWs) -> str: 3891 # DuckDB-specific: handle binary types using DPipe (||) operator 3892 separator = seq_get(expression.expressions, 0) 3893 args = expression.expressions[1:] 3894 3895 if any(_is_binary(arg) for arg in [separator, *args]): 3896 result = args[0] 3897 for arg in args[1:]: 3898 result = exp.DPipe( 3899 this=exp.DPipe(this=result, expression=separator), expression=arg 3900 ) 3901 return self.sql(result) 3902 3903 return super().concatws_sql(expression) 3904 3905 def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str: 3906 this = expression.this 3907 group = expression.args.get("group") 3908 params = expression.args.get("parameters") 3909 position = expression.args.get("position") 3910 occurrence = expression.args.get("occurrence") 3911 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3912 3913 # Handle Snowflake's 'e' flag: it enables capture group extraction 3914 # In DuckDB, this is controlled by the group parameter directly 3915 if params and params.is_string and "e" in params.name: 3916 params = exp.Literal.string(params.name.replace("e", "")) 3917 3918 validated_flags = self._validate_regexp_flags(params, supported_flags="cims") 3919 3920 # Strip default group when no following params (DuckDB default is same as group=0) 3921 if ( 3922 not validated_flags 3923 and group 3924 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3925 ): 3926 group = None 3927 3928 flags_expr = exp.Literal.string(validated_flags) if validated_flags else None 3929 3930 # use substring to handle position argument 3931 if position and (not position.is_int or position.to_py() > 1): 3932 this = exp.Substring(this=this, start=position) 3933 3934 if null_if_pos_overflow: 3935 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3936 3937 is_extract_all = isinstance(expression, exp.RegexpExtractAll) 3938 non_single_occurrence = occurrence and (not occurrence.is_int or occurrence.to_py() > 1) 3939 3940 if is_extract_all or non_single_occurrence: 3941 name = "REGEXP_EXTRACT_ALL" 3942 else: 3943 name = "REGEXP_EXTRACT" 3944 3945 result: exp.Expr = exp.Anonymous( 3946 this=name, expressions=[this, expression.expression, group, flags_expr] 3947 ) 3948 3949 # Array slicing for REGEXP_EXTRACT_ALL with occurrence 3950 if is_extract_all and non_single_occurrence: 3951 result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)]) 3952 # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1 3953 elif non_single_occurrence: 3954 result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence]) 3955 3956 return self.sql(result) 3957 3958 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 3959 return self._regexp_extract_sql(expression) 3960 3961 def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str: 3962 return self._regexp_extract_sql(expression) 3963 3964 def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str: 3965 this = expression.this 3966 pattern = expression.expression 3967 position = expression.args.get("position") 3968 orig_occ = expression.args.get("occurrence") 3969 occurrence = orig_occ or exp.Literal.number(1) 3970 option = expression.args.get("option") 3971 parameters = expression.args.get("parameters") 3972 3973 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3974 if validated_flags: 3975 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3976 3977 # Handle starting position offset 3978 pos_offset: exp.Expr = exp.Literal.number(0) 3979 if position and (not position.is_int or position.to_py() > 1): 3980 this = exp.Substring(this=this, start=position) 3981 pos_offset = position - exp.Literal.number(1) 3982 3983 # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x))) 3984 def sum_lengths(func_name: str, end: exp.Expr) -> exp.Expr: 3985 lst = exp.Bracket( 3986 this=exp.Anonymous(this=func_name, expressions=[this, pattern]), 3987 expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)], 3988 offset=1, 3989 ) 3990 transform = exp.Anonymous( 3991 this="LIST_TRANSFORM", 3992 expressions=[ 3993 lst, 3994 exp.Lambda( 3995 this=exp.Length(this=exp.to_identifier("x")), 3996 expressions=[exp.to_identifier("x")], 3997 ), 3998 ], 3999 ) 4000 return exp.Coalesce( 4001 this=exp.Anonymous(this="LIST_SUM", expressions=[transform]), 4002 expressions=[exp.Literal.number(0)], 4003 ) 4004 4005 # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset 4006 base_pos: exp.Expr = ( 4007 exp.Literal.number(1) 4008 + sum_lengths("STRING_SPLIT_REGEX", occurrence) 4009 + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1)) 4010 + pos_offset 4011 ) 4012 4013 # option=1: add match length for end position 4014 if option and option.is_int and option.to_py() == 1: 4015 match_at_occ = exp.Bracket( 4016 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]), 4017 expressions=[occurrence], 4018 offset=1, 4019 ) 4020 base_pos = base_pos + exp.Coalesce( 4021 this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)] 4022 ) 4023 4024 # NULL checks for all provided arguments 4025 # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST 4026 null_args = [ 4027 expression.this, 4028 expression.expression, 4029 position, 4030 orig_occ, 4031 option, 4032 parameters, 4033 ] 4034 null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg] 4035 4036 matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 4037 4038 return self.sql( 4039 exp.case() 4040 .when(exp.or_(*null_checks), exp.Null()) 4041 .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0)) 4042 .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0)) 4043 .else_(base_pos) 4044 ) 4045 4046 @unsupported_args("culture") 4047 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 4048 fmt = expression.args.get("format") 4049 if fmt and fmt.is_int: 4050 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 4051 4052 self.unsupported("Only integer formats are supported by NumberToStr") 4053 return self.function_fallback_sql(expression) 4054 4055 def autoincrementcolumnconstraint_sql(self, _) -> str: 4056 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 4057 return "" 4058 4059 def aliases_sql(self, expression: exp.Aliases) -> str: 4060 this = expression.this 4061 if isinstance(this, exp.Posexplode): 4062 return self.posexplode_sql(this) 4063 4064 return super().aliases_sql(expression) 4065 4066 def posexplode_sql(self, expression: exp.Posexplode) -> str: 4067 this = expression.this 4068 parent = expression.parent 4069 4070 # The default Spark aliases are "pos" and "col", unless specified otherwise 4071 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 4072 4073 if isinstance(parent, exp.Aliases): 4074 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 4075 pos, col = parent.expressions 4076 elif isinstance(parent, exp.Table): 4077 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 4078 alias = parent.args.get("alias") 4079 if alias: 4080 pos, col = alias.columns or [pos, col] 4081 alias.pop() 4082 4083 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 4084 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 4085 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 4086 gen_subscripts = self.sql( 4087 exp.Alias( 4088 this=exp.Anonymous( 4089 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 4090 ) 4091 - exp.Literal.number(1), 4092 alias=pos, 4093 ) 4094 ) 4095 4096 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 4097 4098 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 4099 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 4100 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 4101 4102 return posexplode_sql 4103 4104 def addmonths_sql(self, expression: exp.AddMonths) -> str: 4105 """ 4106 Handles three key issues: 4107 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 4108 2. End-of-month preservation: If input is last day of month, result is last day of result month 4109 3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 4110 """ 4111 from sqlglot.optimizer.annotate_types import annotate_types 4112 4113 this = expression.this 4114 if not this.type: 4115 this = annotate_types(this, dialect=self.dialect) 4116 4117 if this.is_type(*exp.DataType.TEXT_TYPES): 4118 this = exp.Cast(this=this, to=exp.DataType(this=exp.DType.TIMESTAMP)) 4119 4120 # Detect float/decimal months to apply rounding (Snowflake behavior) 4121 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 4122 months_expr = expression.expression 4123 if not months_expr.type: 4124 months_expr = annotate_types(months_expr, dialect=self.dialect) 4125 4126 # Build interval or to_months expression based on type 4127 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 4128 interval_or_to_months = ( 4129 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 4130 if months_expr.is_type( 4131 exp.DType.FLOAT, 4132 exp.DType.DOUBLE, 4133 exp.DType.DECIMAL, 4134 ) 4135 # Integer case: standard INTERVAL N MONTH syntax 4136 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 4137 ) 4138 4139 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 4140 4141 # Apply end-of-month preservation if Snowflake flag is set 4142 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 4143 preserve_eom = expression.args.get("preserve_end_of_month") 4144 result_expr = ( 4145 exp.case() 4146 .when( 4147 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 4148 exp.func("LAST_DAY", date_add_expr), 4149 ) 4150 .else_(date_add_expr) 4151 if preserve_eom 4152 else date_add_expr 4153 ) 4154 4155 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 4156 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 4157 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 4158 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 4159 if this.is_type(exp.DType.DATE, exp.DType.TIMESTAMPTZ): 4160 return self.sql(exp.Cast(this=result_expr, to=this.type)) 4161 return self.sql(result_expr) 4162 4163 def format_sql(self, expression: exp.Format) -> str: 4164 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 4165 return self.func("FORMAT", "'{}'", expression.expressions[0]) 4166 4167 return self.function_fallback_sql(expression) 4168 4169 def hexstring_sql( 4170 self, expression: exp.HexString, binary_function_repr: str | None = None 4171 ) -> str: 4172 # UNHEX('FF') correctly produces blob \xFF in DuckDB 4173 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 4174 4175 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 4176 unit = expression.args.get("unit") 4177 date = expression.this 4178 4179 week_start = _week_unit_to_dow(unit) 4180 unit = unit_to_str(expression) 4181 4182 if week_start: 4183 result = self.sql( 4184 _build_week_trunc_expression(date, week_start, preserve_start_day=True) 4185 ) 4186 else: 4187 result = self.func("DATE_TRUNC", unit, date) 4188 4189 if ( 4190 expression.args.get("input_type_preserved") 4191 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 4192 and not (is_date_unit(unit) and date.is_type(exp.DType.DATE)) 4193 ): 4194 return self.sql(exp.Cast(this=result, to=date.type)) 4195 4196 return result 4197 4198 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 4199 unit = unit_to_str(expression) 4200 zone = expression.args.get("zone") 4201 timestamp = expression.this 4202 date_unit = is_date_unit(unit) 4203 4204 if date_unit and zone: 4205 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 4206 # Double AT TIME ZONE needed for BigQuery compatibility: 4207 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 4208 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 4209 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 4210 result_sql = self.func("DATE_TRUNC", unit, timestamp) 4211 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 4212 4213 result = self.func("DATE_TRUNC", unit, timestamp) 4214 if expression.args.get("input_type_preserved"): 4215 if timestamp.type and timestamp.is_type(exp.DType.TIME, exp.DType.TIMETZ): 4216 dummy_date = exp.Cast( 4217 this=exp.Literal.string("1970-01-01"), 4218 to=exp.DataType(this=exp.DType.DATE), 4219 ) 4220 date_time = exp.Add(this=dummy_date, expression=timestamp) 4221 result = self.func("DATE_TRUNC", unit, date_time) 4222 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4223 4224 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 4225 date_unit and timestamp.is_type(exp.DType.DATE) 4226 ): 4227 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4228 4229 return result 4230 4231 def trim_sql(self, expression: exp.Trim) -> str: 4232 expression.this.replace(_cast_to_varchar(expression.this)) 4233 if expression.expression: 4234 expression.expression.replace(_cast_to_varchar(expression.expression)) 4235 4236 result_sql = super().trim_sql(expression) 4237 return _gen_with_cast_to_blob(self, expression, result_sql) 4238 4239 def round_sql(self, expression: exp.Round) -> str: 4240 this = expression.this 4241 decimals = expression.args.get("decimals") 4242 truncate = expression.args.get("truncate") 4243 4244 # DuckDB requires the scale (decimals) argument to be an INT 4245 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 4246 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 4247 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 4248 decimals = exp.cast(decimals, exp.DType.INT) 4249 4250 func = "ROUND" 4251 if truncate: 4252 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 4253 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 4254 func = "ROUND_EVEN" 4255 truncate = None 4256 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 4257 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 4258 truncate = None 4259 4260 return self.func(func, this, decimals, truncate) 4261 4262 def strtok_sql(self, expression: exp.Strtok) -> str: 4263 string_arg = expression.this 4264 delimiter_arg = expression.args.get("delimiter") 4265 part_index_arg = expression.args.get("part_index") 4266 4267 if delimiter_arg and part_index_arg: 4268 # Escape regex chars and build character class at runtime using REGEXP_REPLACE 4269 escaped_delimiter = exp.Anonymous( 4270 this="REGEXP_REPLACE", 4271 expressions=[ 4272 delimiter_arg, 4273 exp.Literal.string( 4274 r"([\[\]^.\-*+?(){}|$\\])" 4275 ), # Escape problematic regex chars 4276 exp.Literal.string( 4277 r"\\\1" 4278 ), # Replace with escaped version using $1 backreference 4279 exp.Literal.string("g"), # Global flag 4280 ], 4281 ) 4282 # CASE WHEN delimiter = '' THEN '' ELSE CONCAT('[', escaped_delimiter, ']') END 4283 regex_pattern = ( 4284 exp.case() 4285 .when(delimiter_arg.eq(exp.Literal.string("")), exp.Literal.string("")) 4286 .else_( 4287 exp.func( 4288 "CONCAT", 4289 exp.Literal.string("["), 4290 escaped_delimiter, 4291 exp.Literal.string("]"), 4292 ) 4293 ) 4294 ) 4295 4296 # STRTOK skips empty strings, so we need to filter them out 4297 # LIST_FILTER(REGEXP_SPLIT_TO_ARRAY(string, pattern), x -> x != '')[index] 4298 split_array = exp.func("REGEXP_SPLIT_TO_ARRAY", string_arg, regex_pattern) 4299 x = exp.to_identifier("x") 4300 is_empty = x.eq(exp.Literal.string("")) 4301 filtered_array = exp.func( 4302 "LIST_FILTER", 4303 split_array, 4304 exp.Lambda(this=exp.not_(is_empty.copy()), expressions=[x.copy()]), 4305 ) 4306 base_func = exp.Bracket( 4307 this=filtered_array, 4308 expressions=[part_index_arg], 4309 offset=1, 4310 ) 4311 4312 # Use template with the built regex pattern 4313 result = exp.replace_placeholders( 4314 self.STRTOK_TEMPLATE.copy(), 4315 string=string_arg, 4316 delimiter=delimiter_arg, 4317 part_index=part_index_arg, 4318 base_func=base_func, 4319 ) 4320 4321 return self.sql(result) 4322 4323 return self.function_fallback_sql(expression) 4324 4325 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 4326 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 4327 4328 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 4329 if expression.is_type(*exp.DataType.REAL_TYPES): 4330 result = f"CAST({result} AS DOUBLE)" 4331 4332 return result 4333 4334 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 4335 """ 4336 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 4337 dividing the input distribution into n equal-sized buckets. 4338 4339 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 4340 does not document the specific algorithm used so results may differ. DuckDB does not 4341 support RESPECT NULLS. 4342 """ 4343 this = expression.this 4344 if isinstance(this, exp.Distinct): 4345 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 4346 if len(this.expressions) < 2: 4347 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 4348 return self.function_fallback_sql(expression) 4349 num_quantiles_expr = this.expressions[1].pop() 4350 else: 4351 num_quantiles_expr = expression.expression 4352 4353 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 4354 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4355 return self.function_fallback_sql(expression) 4356 4357 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 4358 if num_quantiles <= 0: 4359 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4360 return self.function_fallback_sql(expression) 4361 4362 quantiles = [ 4363 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 4364 for i in range(num_quantiles + 1) 4365 ] 4366 4367 return self.sql(exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))) 4368 4369 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 4370 if expression.args.get("scalar_only"): 4371 expression = exp.JSONExtractScalar( 4372 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 4373 ) 4374 return _arrow_json_extract_sql(self, expression) 4375 4376 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 4377 this = expression.this 4378 4379 if _is_binary(this): 4380 expression.type = exp.DType.BINARY.into_expr() 4381 4382 arg = _cast_to_bit(this) 4383 4384 if isinstance(this, exp.Neg): 4385 arg = exp.Paren(this=arg) 4386 4387 expression.set("this", arg) 4388 4389 result_sql = f"~{self.sql(expression, 'this')}" 4390 4391 return _gen_with_cast_to_blob(self, expression, result_sql) 4392 4393 def window_sql(self, expression: exp.Window) -> str: 4394 this = expression.this 4395 if isinstance(this, exp.Corr) or ( 4396 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 4397 ): 4398 return self._corr_sql(expression) 4399 4400 return super().window_sql(expression) 4401 4402 def filter_sql(self, expression: exp.Filter) -> str: 4403 if isinstance(expression.this, exp.Corr): 4404 return self._corr_sql(expression) 4405 4406 return super().filter_sql(expression) 4407 4408 def _corr_sql( 4409 self, 4410 expression: exp.Filter | exp.Window | exp.Corr, 4411 ) -> str: 4412 if isinstance(expression, exp.Corr) and not expression.args.get("null_on_zero_variance"): 4413 return self.func("CORR", expression.this, expression.expression) 4414 4415 corr_expr = _maybe_corr_null_to_false(expression) 4416 if corr_expr is None: 4417 if isinstance(expression, exp.Window): 4418 return super().window_sql(expression) 4419 if isinstance(expression, exp.Filter): 4420 return super().filter_sql(expression) 4421 corr_expr = expression # make mypy happy 4422 4423 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True: Always quote except for specials cases. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the
indentation of subqueries and filters under a
WHEREclause. Default: 2. - normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
2158 def timeslice_sql(self, expression: exp.TimeSlice) -> str: 2159 """ 2160 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2161 2162 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2163 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2164 2165 For 'END' kind, add the interval to get the end of the slice. 2166 For DATE type with 'END', cast result back to DATE to preserve type. 2167 """ 2168 date_expr = expression.this 2169 slice_length = expression.expression 2170 unit = expression.unit 2171 kind = expression.text("kind").upper() 2172 2173 # Create INTERVAL expression: INTERVAL 'N' UNIT 2174 interval_expr = exp.Interval(this=slice_length, unit=unit) 2175 2176 # Create base time_bucket expression 2177 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2178 2179 # Check if we need the end of the slice (default is start) 2180 if not kind == "END": 2181 # For 'START', return time_bucket directly 2182 return self.sql(time_bucket_expr) 2183 2184 # For 'END', add the interval to get end of slice 2185 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2186 2187 # If input is DATE type, cast result back to DATE to preserve type 2188 # DuckDB converts DATE to TIMESTAMP when adding intervals 2189 if date_expr.is_type(exp.DType.DATE): 2190 return self.sql(exp.cast(add_expr, exp.DType.DATE)) 2191 2192 return self.sql(add_expr)
Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.
Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)
For 'END' kind, add the interval to get the end of the slice. For DATE type with 'END', cast result back to DATE to preserve type.
2194 def bitmapbucketnumber_sql(self, expression: exp.BitmapBucketNumber) -> str: 2195 """ 2196 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2197 2198 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2199 - Each bucket covers 32,768 values 2200 - Bucket numbering starts at 1 2201 - Formula: ((value - 1) // 32768) + 1 for positive values 2202 2203 For non-positive values (0 and negative), we use value // 32768 to avoid 2204 producing bucket 0 or positive bucket IDs for negative inputs. 2205 """ 2206 value = expression.this 2207 2208 positive_formula = ((value - 1) // 32768) + 1 2209 non_positive_formula = value // 32768 2210 2211 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2212 case_expr = ( 2213 exp.case() 2214 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2215 .else_(non_positive_formula) 2216 ) 2217 return self.sql(case_expr)
Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.
Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
- Each bucket covers 32,768 values
- Bucket numbering starts at 1
- Formula: ((value - 1) // 32768) + 1 for positive values
For non-positive values (0 and negative), we use value // 32768 to avoid producing bucket 0 or positive bucket IDs for negative inputs.
2219 def bitmapbitposition_sql(self, expression: exp.BitmapBitPosition) -> str: 2220 """ 2221 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2222 2223 Snowflake's BITMAP_BIT_POSITION behavior: 2224 - For n <= 0: returns ABS(n) % 32768 2225 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2226 """ 2227 this = expression.this 2228 2229 return self.sql( 2230 exp.Mod( 2231 this=exp.Paren( 2232 this=exp.If( 2233 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2234 true=this - exp.Literal.number(1), 2235 false=exp.Abs(this=this), 2236 ) 2237 ), 2238 expression=MAX_BIT_POSITION, 2239 ) 2240 )
Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.
Snowflake's BITMAP_BIT_POSITION behavior:
- For n <= 0: returns ABS(n) % 32768
- For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
2242 def bitmapconstructagg_sql(self, expression: exp.BitmapConstructAgg) -> str: 2243 """ 2244 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2245 Uses a pre-parsed template with placeholders replaced by expression nodes. 2246 2247 Snowflake bitmap format: 2248 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2249 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2250 """ 2251 arg = expression.this 2252 return ( 2253 f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2254 )
Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. Uses a pre-parsed template with placeholders replaced by expression nodes.
Snowflake bitmap format:
- Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
- Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
2286 def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str: 2287 this = expression.this 2288 expr = expression.expression 2289 2290 if expression.args.get("case_insensitive"): 2291 this = exp.Upper(this=this) 2292 expr = exp.Upper(this=expr) 2293 2294 result = exp.func("JARO_WINKLER_SIMILARITY", this, expr) 2295 2296 if expression.args.get("integer_scale"): 2297 result = exp.cast(result * 100, "INTEGER") 2298 2299 return self.sql(result)
2308 def randstr_sql(self, expression: exp.Randstr) -> str: 2309 """ 2310 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2311 Uses a pre-parsed template with placeholders replaced by expression nodes. 2312 2313 RANDSTR(length, generator) generates a random string of specified length. 2314 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2315 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2316 - No generator: Use default seed value 2317 """ 2318 length = expression.this 2319 generator = expression.args.get("generator") 2320 2321 if generator: 2322 if isinstance(generator, exp.Rand): 2323 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2324 seed_value = generator.this or generator 2325 else: 2326 # Const/int or other expression - use as seed directly 2327 seed_value = generator 2328 else: 2329 # No generator specified, use default seed (arbitrary but deterministic) 2330 seed_value = exp.Literal.number(RANDSTR_SEED) 2331 2332 replacements = {"seed": seed_value, "length": length} 2333 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"
Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. Uses a pre-parsed template with placeholders replaced by expression nodes.
RANDSTR(length, generator) generates a random string of specified length.
- With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
- With RANDOM(): Use RANDOM() in the hash for non-deterministic output
- No generator: Use default seed value
2335 @unsupported_args("finish") 2336 def reduce_sql(self, expression: exp.Reduce) -> str: 2337 array_arg = expression.this 2338 initial_value = expression.args.get("initial") 2339 merge_lambda = expression.args.get("merge") 2340 2341 if merge_lambda: 2342 merge_lambda.set("colon", True) 2343 2344 return self.func("list_reduce", array_arg, merge_lambda, initial_value)
2346 def zipf_sql(self, expression: exp.Zipf) -> str: 2347 """ 2348 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2349 Uses a pre-parsed template with placeholders replaced by expression nodes. 2350 """ 2351 s = expression.this 2352 n = expression.args["elementcount"] 2353 gen = expression.args["gen"] 2354 2355 if not isinstance(gen, exp.Rand): 2356 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2357 random_expr: exp.Expr = exp.Div( 2358 this=exp.Paren( 2359 this=exp.Mod( 2360 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2361 expression=exp.Literal.number(1000000), 2362 ) 2363 ), 2364 expression=exp.Literal.number(1000000.0), 2365 ) 2366 else: 2367 # Use RANDOM() for non-deterministic output 2368 random_expr = exp.Rand() 2369 2370 replacements = {"s": s, "n": n, "random_expr": random_expr} 2371 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})"
Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. Uses a pre-parsed template with placeholders replaced by expression nodes.
2373 def tobinary_sql(self, expression: exp.ToBinary) -> str: 2374 """ 2375 TO_BINARY and TRY_TO_BINARY transpilation: 2376 - 'HEX': TO_BINARY('48454C50', 'HEX') -> UNHEX('48454C50') 2377 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') -> ENCODE('TEST') 2378 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') -> FROM_BASE64('SEVMUA==') 2379 2380 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2381 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') -> TRY(UNHEX('invalid')) 2382 """ 2383 value = expression.this 2384 format_arg = expression.args.get("format") 2385 is_safe = expression.args.get("safe") 2386 is_binary = _is_binary(expression) 2387 2388 if not format_arg and not is_binary: 2389 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2390 return self.func(func_name, value) 2391 2392 # Snowflake defaults to HEX encoding when no format is specified 2393 fmt = format_arg.name.upper() if format_arg else "HEX" 2394 2395 if fmt in ("UTF-8", "UTF8"): 2396 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2397 result = self.func("ENCODE", value) 2398 elif fmt == "BASE64": 2399 result = self.func("FROM_BASE64", value) 2400 elif fmt == "HEX": 2401 result = self.func("UNHEX", value) 2402 else: 2403 if is_safe: 2404 return self.sql(exp.null()) 2405 else: 2406 self.unsupported(f"format {fmt} is not supported") 2407 result = self.func("TO_BINARY", value) 2408 return f"TRY({result})" if is_safe else result
TO_BINARY and TRY_TO_BINARY transpilation:
- 'HEX': TO_BINARY('48454C50', 'HEX') -> UNHEX('48454C50')
- 'UTF-8': TO_BINARY('TEST', 'UTF-8') -> ENCODE('TEST')
- 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') -> FROM_BASE64('SEVMUA==')
For TRY_TO_BINARY (safe=True), wrap with TRY():
- 'HEX': TRY_TO_BINARY('invalid', 'HEX') -> TRY(UNHEX('invalid'))
2410 def tonumber_sql(self, expression: exp.ToNumber) -> str: 2411 fmt = expression.args.get("format") 2412 precision = expression.args.get("precision") 2413 scale = expression.args.get("scale") 2414 2415 if not fmt and precision and scale: 2416 return self.sql( 2417 exp.cast( 2418 expression.this, f"DECIMAL({precision.name}, {scale.name})", dialect="duckdb" 2419 ) 2420 ) 2421 2422 return super().tonumber_sql(expression)
2448 def generator_sql(self, expression: exp.Generator) -> str: 2449 # Transpile Snowflake GENERATOR to DuckDB range() 2450 rowcount = expression.args.get("rowcount") 2451 time_limit = expression.args.get("time_limit") 2452 2453 if time_limit: 2454 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2455 2456 if not rowcount: 2457 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2458 return self.func("range", exp.Literal.number(0)) 2459 2460 return self.func("range", rowcount)
2468 def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str: 2469 if expression.args.get("colon"): 2470 prefix = "LAMBDA " 2471 arrow_sep = ":" 2472 wrap = False 2473 else: 2474 prefix = "" 2475 2476 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2477 return f"{prefix}{lambda_sql}"
2488 def sortarray_sql(self, expression: exp.SortArray) -> str: 2489 arr = expression.this 2490 asc = expression.args.get("asc") 2491 nulls_first = expression.args.get("nulls_first") 2492 2493 if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean): 2494 return self.func("LIST_SORT", arr, asc, nulls_first) 2495 2496 nulls_are_first = nulls_first == exp.true() 2497 nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None 2498 2499 if not isinstance(asc, exp.Boolean): 2500 return self.func("LIST_SORT", arr, asc, nulls_first_sql) 2501 2502 descending = asc == exp.false() 2503 2504 if not descending and not nulls_are_first: 2505 return self.func("LIST_SORT", arr) 2506 if not nulls_are_first: 2507 return self.func("ARRAY_REVERSE_SORT", arr) 2508 return self.func( 2509 "LIST_SORT", 2510 arr, 2511 exp.Literal.string("DESC" if descending else "ASC"), 2512 exp.Literal.string("NULLS FIRST"), 2513 )
2515 def install_sql(self, expression: exp.Install) -> str: 2516 force = "FORCE " if expression.args.get("force") else "" 2517 this = self.sql(expression, "this") 2518 from_clause = expression.args.get("from_") 2519 from_clause = f" FROM {from_clause}" if from_clause else "" 2520 return f"{force}INSTALL {this}{from_clause}"
2531 def strposition_sql(self, expression: exp.StrPosition) -> str: 2532 this = expression.this 2533 substr = expression.args.get("substr") 2534 position = expression.args.get("position") 2535 2536 # For BINARY/BLOB: DuckDB's STRPOS doesn't support BLOB types 2537 # Convert to HEX strings, use STRPOS, then convert hex position to byte position 2538 if _is_binary(this): 2539 # Build expression: STRPOS(HEX(haystack), HEX(needle)) 2540 hex_strpos = exp.StrPosition( 2541 this=exp.Hex(this=this), 2542 substr=exp.Hex(this=substr), 2543 ) 2544 2545 return self.sql(exp.cast((hex_strpos + 1) / 2, exp.DType.INT)) 2546 2547 # For VARCHAR: handle clamp_position 2548 if expression.args.get("clamp_position") and position: 2549 expression = expression.copy() 2550 expression.set( 2551 "position", 2552 exp.If( 2553 this=exp.LTE(this=position, expression=exp.Literal.number(0)), 2554 true=exp.Literal.number(1), 2555 false=position.copy(), 2556 ), 2557 ) 2558 2559 return strposition_sql(self, expression)
2561 def substring_sql(self, expression: exp.Substring) -> str: 2562 if expression.args.get("zero_start"): 2563 start = expression.args.get("start") 2564 length = expression.args.get("length") 2565 2566 if start := expression.args.get("start"): 2567 start = exp.If(this=start.eq(0), true=exp.Literal.number(1), false=start) 2568 if length := expression.args.get("length"): 2569 length = exp.If(this=length < 0, true=exp.Literal.number(0), false=length) 2570 2571 return self.func("SUBSTRING", expression.this, start, length) 2572 2573 return self.function_fallback_sql(expression)
2575 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2576 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2577 target_type = expression.args.get("target_type") 2578 needs_tz = target_type and target_type.this in ( 2579 exp.DType.TIMESTAMPLTZ, 2580 exp.DType.TIMESTAMPTZ, 2581 ) 2582 2583 if expression.args.get("safe"): 2584 formatted_time = self.format_time(expression) 2585 cast_type = exp.DType.TIMESTAMPTZ if needs_tz else exp.DType.TIMESTAMP 2586 return self.sql( 2587 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2588 ) 2589 2590 base_sql = str_to_time_sql(self, expression) 2591 if needs_tz: 2592 return self.sql( 2593 exp.cast( 2594 base_sql, 2595 exp.DataType(this=exp.DType.TIMESTAMPTZ), 2596 ) 2597 ) 2598 return base_sql
2600 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2601 formatted_time = self.format_time(expression) 2602 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2603 return self.sql( 2604 exp.cast( 2605 self.func(function_name, expression.this, formatted_time), 2606 exp.DataType(this=exp.DType.DATE), 2607 ) 2608 )
2610 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2611 this = expression.this 2612 time_format = self.format_time(expression) 2613 safe = expression.args.get("safe") 2614 time_type = exp.DataType.build("TIME", dialect="duckdb") 2615 cast_expr = exp.TryCast if safe else exp.Cast 2616 2617 if time_format: 2618 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2619 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2620 return self.sql(cast_expr(this=strptime, to=time_type)) 2621 2622 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DType.TIME): 2623 return self.sql(this) 2624 2625 return self.sql(cast_expr(this=this, to=time_type))
2627 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2628 if not expression.this: 2629 return "CURRENT_DATE" 2630 2631 expr = exp.Cast( 2632 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2633 to=exp.DataType(this=exp.DType.DATE), 2634 ) 2635 return self.sql(expr)
2648 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2649 arg = expression.this 2650 if expression.args.get("safe"): 2651 return self.sql( 2652 exp.case() 2653 .when(exp.func("json_valid", arg), exp.cast(arg.copy(), "JSON")) 2654 .else_(exp.null()) 2655 ) 2656 return self.func("JSON", arg)
2658 def unicode_sql(self, expression: exp.Unicode) -> str: 2659 if expression.args.get("empty_is_zero"): 2660 return self.sql( 2661 exp.case() 2662 .when(expression.this.eq(exp.Literal.string("")), exp.Literal.number(0)) 2663 .else_(exp.Anonymous(this="UNICODE", expressions=[expression.this])) 2664 ) 2665 2666 return self.func("UNICODE", expression.this)
2675 def trunc_sql(self, expression: exp.Trunc) -> str: 2676 decimals = expression.args.get("decimals") 2677 if ( 2678 expression.args.get("fractions_supported") 2679 and decimals 2680 and not decimals.is_type(exp.DType.INT) 2681 ): 2682 decimals = exp.cast(decimals, exp.DType.INT, dialect="duckdb") 2683 2684 return self.func("TRUNC", expression.this, decimals)
2686 def normal_sql(self, expression: exp.Normal) -> str: 2687 """ 2688 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2689 2690 Uses the Box-Muller transform via NORMAL_TEMPLATE. 2691 """ 2692 mean = expression.this 2693 stddev = expression.args["stddev"] 2694 gen: exp.Expr = expression.args["gen"] 2695 2696 # Build two uniform random values [0, 1) for Box-Muller transform 2697 if isinstance(gen, exp.Rand) and gen.this is None: 2698 u1: exp.Expr = exp.Rand() 2699 u2: exp.Expr = exp.Rand() 2700 else: 2701 # Seeded: derive two values using HASH with different inputs 2702 seed = gen.this if isinstance(gen, exp.Rand) else gen 2703 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2704 u2 = exp.replace_placeholders( 2705 self.SEEDED_RANDOM_TEMPLATE, 2706 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2707 ) 2708 2709 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2710 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))
Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.
Uses the Box-Muller transform via NORMAL_TEMPLATE.
2712 def uniform_sql(self, expression: exp.Uniform) -> str: 2713 """ 2714 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2715 2716 UNIFORM returns a random value in [min, max]: 2717 - Integer result if both min and max are integers 2718 - Float result if either min or max is a float 2719 """ 2720 min_val = expression.this 2721 max_val = expression.expression 2722 gen = expression.args.get("gen") 2723 2724 # Determine if result should be integer (both bounds are integers). 2725 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2726 is_int_result = min_val.is_int and max_val.is_int 2727 2728 # Build the random value expression [0, 1) 2729 if not isinstance(gen, exp.Rand): 2730 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2731 random_expr: exp.Expr = exp.Div( 2732 this=exp.Paren( 2733 this=exp.Mod( 2734 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2735 expression=exp.Literal.number(1000000), 2736 ) 2737 ), 2738 expression=exp.Literal.number(1000000.0), 2739 ) 2740 else: 2741 random_expr = exp.Rand() 2742 2743 # Build: min + random * (max - min [+ 1 for int]) 2744 range_expr: exp.Expr = exp.Sub(this=max_val, expression=min_val) 2745 if is_int_result: 2746 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2747 2748 result: exp.Expr = exp.Add( 2749 this=min_val, 2750 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2751 ) 2752 2753 if is_int_result: 2754 result = exp.Cast(this=exp.Floor(this=result), to=exp.DType.BIGINT.into_expr()) 2755 2756 return self.sql(result)
Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.
UNIFORM returns a random value in [min, max]:
- Integer result if both min and max are integers
- Float result if either min or max is a float
2758 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2759 nano = expression.args.get("nano") 2760 overflow = expression.args.get("overflow") 2761 2762 # Snowflake's TIME_FROM_PARTS supports overflow 2763 if overflow: 2764 hour = expression.args["hour"] 2765 minute = expression.args["min"] 2766 sec = expression.args["sec"] 2767 2768 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2769 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2770 try: 2771 h_val = hour.to_py() 2772 m_val = minute.to_py() 2773 s_val = sec.to_py() 2774 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2775 return rename_func("MAKE_TIME")(self, expression) 2776 except ValueError: 2777 pass 2778 2779 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2780 if nano: 2781 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2782 2783 total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2784 2785 return self.sql( 2786 exp.Add( 2787 this=exp.Cast( 2788 this=exp.Literal.string("00:00:00"), to=exp.DType.TIME.into_expr() 2789 ), 2790 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2791 ) 2792 ) 2793 2794 # Default: MAKE_TIME 2795 if nano: 2796 expression.set( 2797 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2798 ) 2799 2800 return rename_func("MAKE_TIME")(self, expression)
2802 def extract_sql(self, expression: exp.Extract) -> str: 2803 """ 2804 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2805 2806 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2807 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2808 """ 2809 this = expression.this 2810 datetime_expr = expression.expression 2811 2812 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2813 # because Snowflake applies server timezone while DuckDB uses local timezone 2814 if datetime_expr.is_type(exp.DType.TIMESTAMPTZ, exp.DType.TIMESTAMPLTZ): 2815 self.unsupported( 2816 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2817 ) 2818 2819 part_name = this.name.upper() 2820 2821 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2822 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2823 2824 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2825 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2826 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2827 exp.DType.TIME, exp.DType.TIMETZ 2828 ) 2829 2830 if is_nano_time: 2831 self.unsupported("Parameter NANOSECOND is not supported with TIME type in DuckDB") 2832 return self.sql( 2833 exp.cast( 2834 exp.Mul( 2835 this=exp.Extract(this=exp.var("MICROSECOND"), expression=datetime_expr), 2836 expression=exp.Literal.number(1000), 2837 ), 2838 exp.DataType.build(cast_type, dialect="duckdb"), 2839 ) 2840 ) 2841 2842 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2843 strftime_input = datetime_expr 2844 if part_name == "NANOSECOND": 2845 strftime_input = exp.cast(datetime_expr, exp.DType.TIMESTAMP_NS) 2846 2847 return self.sql( 2848 exp.cast( 2849 exp.Anonymous( 2850 this="STRFTIME", 2851 expressions=[strftime_input, exp.Literal.string(fmt)], 2852 ), 2853 exp.DataType.build(cast_type, dialect="duckdb"), 2854 ) 2855 ) 2856 2857 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2858 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2859 result: exp.Expr = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2860 # EPOCH returns float, cast to BIGINT for integer result 2861 if part_name == "EPOCH_SECOND": 2862 result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb")) 2863 return self.sql(result) 2864 2865 return super().extract_sql(expression)
Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.
DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
2867 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2868 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2869 date_expr = expression.this 2870 time_expr = expression.expression 2871 2872 if date_expr is not None and time_expr is not None: 2873 # In DuckDB, DATE + TIME produces TIMESTAMP 2874 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2875 2876 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 2877 sec = expression.args.get("sec") 2878 if sec is None: 2879 # This shouldn't happen with valid input, but handle gracefully 2880 return rename_func("MAKE_TIMESTAMP")(self, expression) 2881 2882 milli = expression.args.get("milli") 2883 if milli is not None: 2884 sec += milli.pop() / exp.Literal.number(1000.0) 2885 2886 nano = expression.args.get("nano") 2887 if nano is not None: 2888 sec += nano.pop() / exp.Literal.number(1000000000.0) 2889 2890 if milli or nano: 2891 expression.set("sec", sec) 2892 2893 return rename_func("MAKE_TIMESTAMP")(self, expression)
2895 @unsupported_args("nano") 2896 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2897 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2898 if nano := expression.args.get("nano"): 2899 nano.pop() 2900 2901 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2902 return f"CAST({timestamp} AS TIMESTAMPTZ)"
2904 @unsupported_args("nano") 2905 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2906 # Extract zone before popping 2907 zone = expression.args.get("zone") 2908 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2909 if zone: 2910 zone = zone.pop() 2911 2912 if nano := expression.args.get("nano"): 2913 nano.pop() 2914 2915 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2916 2917 if zone: 2918 # Use AT TIME ZONE to apply the explicit timezone 2919 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2920 2921 return timestamp
2923 def tablesample_sql( 2924 self, 2925 expression: exp.TableSample, 2926 tablesample_keyword: str | None = None, 2927 ) -> str: 2928 if not isinstance(expression.parent, exp.Select): 2929 # This sample clause only applies to a single source, not the entire resulting relation 2930 tablesample_keyword = "TABLESAMPLE" 2931 2932 if expression.args.get("size"): 2933 method = expression.args.get("method") 2934 if method and method.name.upper() != "RESERVOIR": 2935 self.unsupported( 2936 f"Sampling method {method} is not supported with a discrete sample count, " 2937 "defaulting to reservoir sampling" 2938 ) 2939 expression.set("method", exp.var("RESERVOIR")) 2940 2941 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)
2943 def join_sql(self, expression: exp.Join) -> str: 2944 if ( 2945 not expression.args.get("using") 2946 and not expression.args.get("on") 2947 and not expression.method 2948 and (expression.kind in ("", "INNER", "OUTER")) 2949 ): 2950 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2951 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2952 if isinstance(expression.this, exp.Unnest): 2953 return super().join_sql(expression.on(exp.true())) 2954 2955 expression.set("side", None) 2956 expression.set("kind", None) 2957 2958 return super().join_sql(expression)
2967 def bracket_sql(self, expression: exp.Bracket) -> str: 2968 if self.dialect.version >= (1, 2): 2969 return super().bracket_sql(expression) 2970 2971 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 2972 this = expression.this 2973 if isinstance(this, exp.Array): 2974 this.replace(exp.paren(this)) 2975 2976 bracket = super().bracket_sql(expression) 2977 2978 if not expression.args.get("returns_list_for_maps"): 2979 if not this.type: 2980 from sqlglot.optimizer.annotate_types import annotate_types 2981 2982 this = annotate_types(this, dialect=self.dialect) 2983 2984 if this.is_type(exp.DType.MAP): 2985 bracket = f"({bracket})[1]" 2986 2987 return bracket
2989 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 2990 func = expression.this 2991 2992 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 2993 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 2994 if isinstance(func, exp.ArrayAgg): 2995 if not isinstance(order := expression.expression, exp.Order): 2996 return self.sql(func) 2997 2998 # Save the original column for FILTER clause (before wrapping with Order) 2999 original_this = func.this 3000 3001 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 3002 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 3003 func.set( 3004 "this", 3005 exp.Order( 3006 this=func.this.copy(), 3007 expressions=order.expressions, 3008 ), 3009 ) 3010 3011 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 3012 # Use original_this (not the Order-wrapped version) for the FILTER condition 3013 array_agg_sql = self.function_fallback_sql(func) 3014 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 3015 3016 # For other functions (like PERCENTILES), use existing logic 3017 expression_sql = self.sql(expression, "expression") 3018 3019 if isinstance(func, exp.PERCENTILES): 3020 # Make the order key the first arg and slide the fraction to the right 3021 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 3022 order_col = expression.find(exp.Ordered) 3023 if order_col: 3024 func.set("expression", func.this) 3025 func.set("this", order_col.this) 3026 3027 this = self.sql(expression, "this").rstrip(")") 3028 3029 return f"{this}{expression_sql})"
3031 def length_sql(self, expression: exp.Length) -> str: 3032 arg = expression.this 3033 3034 # Dialects like BQ and Snowflake also accept binary values as args, so 3035 # DDB will attempt to infer the type or resort to case/when resolution 3036 if not expression.args.get("binary") or arg.is_string: 3037 return self.func("LENGTH", arg) 3038 3039 if not arg.type: 3040 from sqlglot.optimizer.annotate_types import annotate_types 3041 3042 arg = annotate_types(arg, dialect=self.dialect) 3043 3044 if arg.is_type(*exp.DataType.TEXT_TYPES): 3045 return self.func("LENGTH", arg) 3046 3047 # We need these casts to make duckdb's static type checker happy 3048 blob = exp.cast(arg, exp.DType.VARBINARY) 3049 varchar = exp.cast(arg, exp.DType.VARCHAR) 3050 3051 case = ( 3052 exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg])) 3053 .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob)) 3054 .else_(exp.Anonymous(this="LENGTH", expressions=[varchar])) 3055 ) 3056 return self.sql(case)
3075 def collate_sql(self, expression: exp.Collate) -> str: 3076 if not expression.expression.is_string: 3077 return super().collate_sql(expression) 3078 3079 raw = expression.expression.name 3080 if not raw: 3081 return self.sql(expression.this) 3082 3083 parts = [] 3084 for part in raw.split("-"): 3085 lower = part.lower() 3086 if lower not in _SNOWFLAKE_COLLATION_DEFAULTS: 3087 if lower in _SNOWFLAKE_COLLATION_UNSUPPORTED: 3088 self.unsupported( 3089 f"Snowflake collation specifier '{part}' has no DuckDB equivalent" 3090 ) 3091 parts.append(lower) 3092 3093 if not parts: 3094 return self.sql(expression.this) 3095 return super().collate_sql( 3096 exp.Collate(this=expression.this, expression=exp.var(".".join(parts))) 3097 )
3129 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3130 this = expression.this 3131 pattern = expression.expression 3132 position = expression.args.get("position") 3133 parameters = expression.args.get("parameters") 3134 3135 # Validate flags - only "ims" flags are supported for embedded patterns 3136 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3137 3138 if position: 3139 this = exp.Substring(this=this, start=position) 3140 3141 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3142 if validated_flags: 3143 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3144 3145 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3146 result = ( 3147 exp.case() 3148 .when( 3149 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3150 exp.Literal.number(0), 3151 ) 3152 .else_( 3153 exp.Length( 3154 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3155 ) 3156 ) 3157 ) 3158 3159 return self.sql(result)
3161 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3162 subject = expression.this 3163 pattern = expression.expression 3164 replacement = expression.args.get("replacement") or exp.Literal.string("") 3165 position = expression.args.get("position") 3166 occurrence = expression.args.get("occurrence") 3167 modifiers = expression.args.get("modifiers") 3168 3169 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3170 3171 # Handle occurrence (only literals supported) 3172 if occurrence and not occurrence.is_int: 3173 self.unsupported("REGEXP_REPLACE with non-literal occurrence") 3174 else: 3175 occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0 3176 if occurrence > 1: 3177 self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported") 3178 # flag duckdb to do either all or none, single_replace check is for duckdb round trip 3179 elif ( 3180 occurrence == 0 3181 and "g" not in validated_flags 3182 and not expression.args.get("single_replace") 3183 ): 3184 validated_flags += "g" 3185 3186 # Handle position (only literals supported) 3187 prefix = None 3188 if position and not position.is_int: 3189 self.unsupported("REGEXP_REPLACE with non-literal position") 3190 elif position and position.is_int and position.to_py() > 1: 3191 pos = position.to_py() 3192 prefix = exp.Substring( 3193 this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1) 3194 ) 3195 subject = exp.Substring(this=subject, start=exp.Literal.number(pos)) 3196 3197 result: exp.Expr = exp.Anonymous( 3198 this="REGEXP_REPLACE", 3199 expressions=[ 3200 subject, 3201 pattern, 3202 replacement, 3203 exp.Literal.string(validated_flags) if validated_flags else None, 3204 ], 3205 ) 3206 3207 if prefix: 3208 result = exp.Concat(expressions=[prefix, result]) 3209 3210 return self.sql(result)
3212 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3213 this = expression.this 3214 pattern = expression.expression 3215 flag = expression.args.get("flag") 3216 3217 if expression.args.get("full_match"): 3218 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3219 flag = exp.Literal.string(validated_flags) if validated_flags else None 3220 return self.func("REGEXP_FULL_MATCH", this, pattern, flag) 3221 3222 return self.func("REGEXP_MATCHES", this, pattern, flag)
3224 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3225 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3226 this = expression.this 3227 expr = expression.expression 3228 max_dist = expression.args.get("max_dist") 3229 3230 if max_dist is None: 3231 return self.func("LEVENSHTEIN", this, expr) 3232 3233 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3234 levenshtein = exp.Levenshtein(this=this, expression=expr) 3235 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
3237 def pad_sql(self, expression: exp.Pad) -> str: 3238 """ 3239 Handle RPAD/LPAD for VARCHAR and BINARY types. 3240 3241 For VARCHAR: Delegate to parent class 3242 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3243 """ 3244 string_arg = expression.this 3245 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3246 3247 if _is_binary(string_arg) or _is_binary(fill_arg): 3248 length_arg = expression.expression 3249 is_left = expression.args.get("is_left") 3250 3251 input_len = exp.ByteLength(this=string_arg) 3252 chars_needed = length_arg - input_len 3253 pad_count = exp.Greatest( 3254 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3255 ) 3256 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3257 3258 left, right = string_arg, repeat_expr 3259 if is_left: 3260 left, right = right, left 3261 3262 result = exp.DPipe(this=left, expression=right) 3263 return self.sql(result) 3264 3265 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3266 return super().pad_sql(expression)
Handle RPAD/LPAD for VARCHAR and BINARY types.
For VARCHAR: Delegate to parent class For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input)))
3268 def minhash_sql(self, expression: exp.Minhash) -> str: 3269 k = expression.this 3270 exprs = expression.expressions 3271 3272 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3273 self.unsupported( 3274 "MINHASH with multiple expressions or * requires manual query restructuring" 3275 ) 3276 return self.func("MINHASH", k, *exprs) 3277 3278 expr = exprs[0] 3279 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3280 return f"({self.sql(result)})"
3304 def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str: 3305 arr = expression.this 3306 func = self.func("LIST_DISTINCT", arr) 3307 3308 if expression.args.get("check_null"): 3309 add_null_to_array = exp.func( 3310 "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null() 3311 ) 3312 return self.sql( 3313 exp.If( 3314 this=exp.NEQ( 3315 this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr) 3316 ), 3317 true=add_null_to_array, 3318 false=func, 3319 ) 3320 ) 3321 3322 return func
3324 def arrayintersect_sql(self, expression: exp.ArrayIntersect) -> str: 3325 if expression.args.get("is_multiset") and len(expression.expressions) == 2: 3326 return self._array_bag_sql( 3327 self.ARRAY_INTERSECTION_CONDITION, 3328 expression.expressions[0], 3329 expression.expressions[1], 3330 ) 3331 return self.function_fallback_sql(expression)
3333 def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str: 3334 arr1, arr2 = expression.this, expression.expression 3335 if expression.args.get("is_multiset"): 3336 return self._array_bag_sql(self.ARRAY_EXCEPT_CONDITION, arr1, arr2) 3337 return self.sql( 3338 exp.replace_placeholders(self.ARRAY_EXCEPT_SET_TEMPLATE, arr1=arr1, arr2=arr2) 3339 )
3341 def arrayslice_sql(self, expression: exp.ArraySlice) -> str: 3342 """ 3343 Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's 3344 ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE 3345 expressions that adjust the index at query time: 3346 - start: CASE WHEN start >= 0 THEN start + 1 ELSE start END 3347 - end: CASE WHEN end < 0 THEN end - 1 ELSE end END 3348 """ 3349 start, end = expression.args.get("start"), expression.args.get("end") 3350 3351 if expression.args.get("zero_based"): 3352 if start is not None: 3353 start = ( 3354 exp.case() 3355 .when( 3356 exp.GTE(this=start.copy(), expression=exp.Literal.number(0)), 3357 exp.Add(this=start.copy(), expression=exp.Literal.number(1)), 3358 ) 3359 .else_(start) 3360 ) 3361 if end is not None: 3362 end = ( 3363 exp.case() 3364 .when( 3365 exp.LT(this=end.copy(), expression=exp.Literal.number(0)), 3366 exp.Sub(this=end.copy(), expression=exp.Literal.number(1)), 3367 ) 3368 .else_(end) 3369 ) 3370 3371 return self.func("ARRAY_SLICE", expression.this, start, end, expression.args.get("step"))
Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE expressions that adjust the index at query time:
- start: CASE WHEN start >= 0 THEN start + 1 ELSE start END
- end: CASE WHEN end < 0 THEN end - 1 ELSE end END
3373 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3374 args = expression.expressions 3375 3376 if not args: 3377 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3378 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3379 3380 # Build placeholder values for template 3381 lengths = [exp.Length(this=arg) for arg in args] 3382 max_len = ( 3383 lengths[0] 3384 if len(lengths) == 1 3385 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3386 ) 3387 3388 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3389 empty_struct = exp.func( 3390 "STRUCT", 3391 *[ 3392 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3393 for i in range(len(args)) 3394 ], 3395 ) 3396 3397 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3398 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3399 index = exp.column("__i") + 1 3400 transform_struct = exp.func( 3401 "STRUCT", 3402 *[ 3403 exp.PropertyEQ( 3404 this=exp.Literal.string(f"${i + 1}"), 3405 expression=exp.func("COALESCE", arg, exp.array())[index], 3406 ) 3407 for i, arg in enumerate(args) 3408 ], 3409 ) 3410 3411 result = exp.replace_placeholders( 3412 self.ARRAYS_ZIP_TEMPLATE.copy(), 3413 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3414 all_empty_check=exp.and_( 3415 *[ 3416 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3417 for arg in args 3418 ] 3419 ), 3420 empty_struct=empty_struct, 3421 max_len=max_len, 3422 transform_struct=transform_struct, 3423 ) 3424 return self.sql(result)
3471 def stuff_sql(self, expression: exp.Stuff) -> str: 3472 base = expression.this 3473 start = expression.args["start"] 3474 length = expression.args["length"] 3475 insertion = expression.expression 3476 is_binary = _is_binary(base) 3477 3478 if is_binary: 3479 # DuckDB's SUBSTRING doesn't accept BLOB; operate on the HEX string instead 3480 # (each byte = 2 hex chars), then UNHEX back to BLOB 3481 base = exp.Hex(this=base) 3482 insertion = exp.Hex(this=insertion) 3483 left = exp.Substring( 3484 this=base.copy(), 3485 start=exp.Literal.number(1), 3486 length=(start.copy() - exp.Literal.number(1)) * exp.Literal.number(2), 3487 ) 3488 right = exp.Substring( 3489 this=base.copy(), 3490 start=((start + length) - exp.Literal.number(1)) * exp.Literal.number(2) 3491 + exp.Literal.number(1), 3492 ) 3493 else: 3494 left = exp.Substring( 3495 this=base.copy(), 3496 start=exp.Literal.number(1), 3497 length=start.copy() - exp.Literal.number(1), 3498 ) 3499 right = exp.Substring(this=base.copy(), start=start + length) 3500 result: exp.Expr = exp.DPipe( 3501 this=exp.DPipe(this=left, expression=insertion), expression=right 3502 ) 3503 3504 if is_binary: 3505 result = exp.Unhex(this=result) 3506 3507 return self.sql(result)
3509 def rand_sql(self, expression: exp.Rand) -> str: 3510 seed = expression.this 3511 if seed is not None: 3512 self.unsupported("RANDOM with seed is not supported in DuckDB") 3513 3514 lower = expression.args.get("lower") 3515 upper = expression.args.get("upper") 3516 3517 if lower and upper: 3518 # scale DuckDB's [0,1) to the specified range 3519 range_size = exp.paren(upper - lower) 3520 scaled = exp.Add(this=lower, expression=exp.func("random") * range_size) 3521 3522 # For now we assume that if bounds are set, return type is BIGINT. Snowflake/Teradata 3523 result = exp.cast(scaled, exp.DType.BIGINT) 3524 return self.sql(result) 3525 3526 # Default DuckDB behavior - just return RANDOM() as float 3527 return "RANDOM()"
3529 def bytelength_sql(self, expression: exp.ByteLength) -> str: 3530 arg = expression.this 3531 3532 # Check if it's a text type (handles both literals and annotated expressions) 3533 if arg.is_type(*exp.DataType.TEXT_TYPES): 3534 return self.func("OCTET_LENGTH", exp.Encode(this=arg)) 3535 3536 # Default: pass through as-is (conservative for DuckDB, handles binary and unannotated) 3537 return self.func("OCTET_LENGTH", arg)
3539 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3540 # DuckDB TO_BASE64 requires BLOB input 3541 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3542 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3543 result = expression.this 3544 3545 # Check if input is a string type - ENCODE only accepts VARCHAR 3546 if result.is_type(*exp.DataType.TEXT_TYPES): 3547 result = exp.Encode(this=result) 3548 3549 result = exp.ToBase64(this=result) 3550 3551 max_line_length = expression.args.get("max_line_length") 3552 alphabet = expression.args.get("alphabet") 3553 3554 # Handle custom alphabet by replacing standard chars with custom ones 3555 result = _apply_base64_alphabet_replacements(result, alphabet) 3556 3557 # Handle max_line_length by inserting newlines every N characters 3558 line_length = ( 3559 t.cast(int, max_line_length.to_py()) 3560 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3561 else 0 3562 ) 3563 if line_length > 0: 3564 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3565 result = exp.Trim( 3566 this=exp.RegexpReplace( 3567 this=result, 3568 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3569 replacement=exp.Concat(expressions=[exp.Literal.string("\\1"), newline.copy()]), 3570 ), 3571 expression=newline, 3572 position="TRAILING", 3573 ) 3574 3575 return self.sql(result)
3577 def replace_sql(self, expression: exp.Replace) -> str: 3578 result_sql = self.func( 3579 "REPLACE", 3580 _cast_to_varchar(expression.this), 3581 _cast_to_varchar(expression.expression), 3582 _cast_to_varchar(expression.args.get("replacement")), 3583 ) 3584 return _gen_with_cast_to_blob(self, expression, result_sql)
3596 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3597 this = expression.this 3598 key = expression.args.get("key") 3599 key_sql = key.name if isinstance(key, exp.Expr) else "" 3600 value_sql = self.sql(expression, "value") 3601 3602 kv_sql = f"{key_sql} := {value_sql}" 3603 3604 # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3605 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3606 if isinstance(this, exp.Struct) and not this.expressions: 3607 return self.func("STRUCT_PACK", kv_sql) 3608 3609 return self.func("STRUCT_INSERT", this, kv_sql)
3624 def mapdelete_sql(self, expression: exp.MapDelete) -> str: 3625 map_arg = expression.this 3626 keys_to_delete = expression.expressions 3627 3628 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3629 3630 lambda_expr = exp.Lambda( 3631 this=exp.In(this=x_dot_key, expressions=keys_to_delete).not_(), 3632 expressions=[exp.to_identifier("x")], 3633 ) 3634 result = exp.func( 3635 "MAP_FROM_ENTRIES", 3636 exp.ArrayFilter(this=exp.func("MAP_ENTRIES", map_arg), expression=lambda_expr), 3637 ) 3638 return self.sql(result)
3640 def mappick_sql(self, expression: exp.MapPick) -> str: 3641 map_arg = expression.this 3642 keys_to_pick = expression.expressions 3643 3644 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3645 3646 if len(keys_to_pick) == 1 and keys_to_pick[0].is_type(exp.DType.ARRAY): 3647 lambda_expr = exp.Lambda( 3648 this=exp.func("ARRAY_CONTAINS", keys_to_pick[0], x_dot_key), 3649 expressions=[exp.to_identifier("x")], 3650 ) 3651 else: 3652 lambda_expr = exp.Lambda( 3653 this=exp.In(this=x_dot_key, expressions=keys_to_pick), 3654 expressions=[exp.to_identifier("x")], 3655 ) 3656 3657 result = exp.func( 3658 "MAP_FROM_ENTRIES", 3659 exp.func("LIST_FILTER", exp.func("MAP_ENTRIES", map_arg), lambda_expr), 3660 ) 3661 return self.sql(result)
3666 @unsupported_args("update_flag") 3667 def mapinsert_sql(self, expression: exp.MapInsert) -> str: 3668 map_arg = expression.this 3669 key = expression.args.get("key") 3670 value = expression.args.get("value") 3671 3672 map_type = map_arg.type 3673 3674 if value is not None: 3675 if map_type and map_type.expressions and len(map_type.expressions) > 1: 3676 # Extract the value type from MAP(key_type, value_type) 3677 value_type = map_type.expressions[1] 3678 # Cast value to match the map's value type to avoid type conflicts 3679 value = exp.cast(value, value_type) 3680 # else: polymorphic MAP case - no type parameters available, use value as-is 3681 3682 # Create a single-entry map for the new key-value pair 3683 new_entry_struct = exp.Struct(expressions=[exp.PropertyEQ(this=key, expression=value)]) 3684 new_entry: exp.Expression = exp.ToMap(this=new_entry_struct) 3685 3686 # Use MAP_CONCAT to merge the original map with the new entry 3687 # This automatically handles both insert and update cases 3688 result = exp.func("MAP_CONCAT", map_arg, new_entry) 3689 3690 return self.sql(result)
3708 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3709 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3710 if isinstance(expression.this, exp.Generator): 3711 # Preserve alias, joins, and other table-level args 3712 table = exp.Table( 3713 this=expression.this, 3714 alias=expression.args.get("alias"), 3715 joins=expression.args.get("joins"), 3716 ) 3717 return self.sql(table) 3718 3719 return super().tablefromrows_sql(expression)
3721 def unnest_sql(self, expression: exp.Unnest) -> str: 3722 explode_array = expression.args.get("explode_array") 3723 if explode_array: 3724 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3725 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3726 expression.expressions.append( 3727 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3728 ) 3729 3730 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3731 alias = expression.args.get("alias") 3732 if isinstance(alias, exp.TableAlias): 3733 expression.set("alias", None) 3734 if alias.columns: 3735 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3736 3737 unnest_sql = super().unnest_sql(expression) 3738 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3739 return self.sql(select) 3740 3741 return super().unnest_sql(expression)
3743 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3744 this = expression.this 3745 3746 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3747 # DuckDB should render IGNORE NULLS only for the general-purpose 3748 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 3749 return super().ignorenulls_sql(expression) 3750 3751 if isinstance(this, exp.First): 3752 this = exp.AnyValue(this=this.this) 3753 3754 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3755 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3756 3757 return self.sql(this)
3759 def split_sql(self, expression: exp.Split) -> str: 3760 base_func = exp.func("STR_SPLIT", expression.this, expression.expression) 3761 3762 case_expr = exp.case().else_(base_func) 3763 needs_case = False 3764 3765 if expression.args.get("null_returns_null"): 3766 case_expr = case_expr.when(expression.expression.is_(exp.null()), exp.null()) 3767 needs_case = True 3768 3769 if expression.args.get("empty_delimiter_returns_whole"): 3770 # When delimiter is empty string, return input string as single array element 3771 array_with_input = exp.array(expression.this) 3772 case_expr = case_expr.when( 3773 expression.expression.eq(exp.Literal.string("")), array_with_input 3774 ) 3775 needs_case = True 3776 3777 return self.sql(case_expr if needs_case else base_func)
3779 def splitpart_sql(self, expression: exp.SplitPart) -> str: 3780 string_arg = expression.this 3781 delimiter_arg = expression.args.get("delimiter") 3782 part_index_arg = expression.args.get("part_index") 3783 3784 if delimiter_arg and part_index_arg: 3785 # Handle Snowflake's "index 0 and 1 both return first element" behavior 3786 if expression.args.get("part_index_zero_as_one"): 3787 # Convert 0 to 1 for compatibility 3788 3789 part_index_arg = exp.Paren( 3790 this=exp.case() 3791 .when(part_index_arg.eq(exp.Literal.number("0")), exp.Literal.number("1")) 3792 .else_(part_index_arg) 3793 ) 3794 3795 # Use Anonymous to avoid recursion 3796 base_func_expr: exp.Expr = exp.Anonymous( 3797 this="SPLIT_PART", expressions=[string_arg, delimiter_arg, part_index_arg] 3798 ) 3799 needs_case_transform = False 3800 case_expr = exp.case().else_(base_func_expr) 3801 3802 if expression.args.get("empty_delimiter_returns_whole"): 3803 # When delimiter is empty string: 3804 # - Return whole string if part_index is 1 or -1 3805 # - Return empty string otherwise 3806 empty_case = exp.Paren( 3807 this=exp.case() 3808 .when( 3809 exp.or_( 3810 part_index_arg.eq(exp.Literal.number("1")), 3811 part_index_arg.eq(exp.Literal.number("-1")), 3812 ), 3813 string_arg, 3814 ) 3815 .else_(exp.Literal.string("")) 3816 ) 3817 3818 case_expr = case_expr.when(delimiter_arg.eq(exp.Literal.string("")), empty_case) 3819 needs_case_transform = True 3820 3821 """ 3822 Output looks something like this: 3823 3824 CASE 3825 WHEN delimiter is '' THEN 3826 ( 3827 CASE 3828 WHEN adjusted_part_index = 1 OR adjusted_part_index = -1 THEN input 3829 ELSE '' END 3830 ) 3831 ELSE SPLIT_PART(input, delimiter, adjusted_part_index) 3832 END 3833 3834 """ 3835 return self.sql(case_expr if needs_case_transform else base_func_expr) 3836 3837 return self.function_fallback_sql(expression)
3839 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3840 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3841 # DuckDB should render RESPECT NULLS only for the general-purpose 3842 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 3843 return super().respectnulls_sql(expression) 3844 3845 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3846 return self.sql(expression, "this")
3848 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3849 null = expression.args.get("null") 3850 3851 if expression.args.get("null_is_empty"): 3852 x = exp.to_identifier("x") 3853 list_transform = exp.Transform( 3854 this=expression.this.copy(), 3855 expression=exp.Lambda( 3856 this=exp.Coalesce( 3857 this=exp.cast(x, "TEXT"), expressions=[exp.Literal.string("")] 3858 ), 3859 expressions=[x], 3860 ), 3861 ) 3862 array_to_string = exp.ArrayToString( 3863 this=list_transform, expression=expression.expression 3864 ) 3865 if expression.args.get("null_delim_is_null"): 3866 return self.sql( 3867 exp.case() 3868 .when(expression.expression.copy().is_(exp.null()), exp.null()) 3869 .else_(array_to_string) 3870 ) 3871 return self.sql(array_to_string) 3872 3873 if null: 3874 x = exp.to_identifier("x") 3875 return self.sql( 3876 exp.ArrayToString( 3877 this=exp.Transform( 3878 this=expression.this, 3879 expression=exp.Lambda( 3880 this=exp.Coalesce(this=x, expressions=[null]), 3881 expressions=[x], 3882 ), 3883 ), 3884 expression=expression.expression, 3885 ) 3886 ) 3887 3888 return self.func("ARRAY_TO_STRING", expression.this, expression.expression)
3890 def concatws_sql(self, expression: exp.ConcatWs) -> str: 3891 # DuckDB-specific: handle binary types using DPipe (||) operator 3892 separator = seq_get(expression.expressions, 0) 3893 args = expression.expressions[1:] 3894 3895 if any(_is_binary(arg) for arg in [separator, *args]): 3896 result = args[0] 3897 for arg in args[1:]: 3898 result = exp.DPipe( 3899 this=exp.DPipe(this=result, expression=separator), expression=arg 3900 ) 3901 return self.sql(result) 3902 3903 return super().concatws_sql(expression)
3964 def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str: 3965 this = expression.this 3966 pattern = expression.expression 3967 position = expression.args.get("position") 3968 orig_occ = expression.args.get("occurrence") 3969 occurrence = orig_occ or exp.Literal.number(1) 3970 option = expression.args.get("option") 3971 parameters = expression.args.get("parameters") 3972 3973 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3974 if validated_flags: 3975 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3976 3977 # Handle starting position offset 3978 pos_offset: exp.Expr = exp.Literal.number(0) 3979 if position and (not position.is_int or position.to_py() > 1): 3980 this = exp.Substring(this=this, start=position) 3981 pos_offset = position - exp.Literal.number(1) 3982 3983 # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x))) 3984 def sum_lengths(func_name: str, end: exp.Expr) -> exp.Expr: 3985 lst = exp.Bracket( 3986 this=exp.Anonymous(this=func_name, expressions=[this, pattern]), 3987 expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)], 3988 offset=1, 3989 ) 3990 transform = exp.Anonymous( 3991 this="LIST_TRANSFORM", 3992 expressions=[ 3993 lst, 3994 exp.Lambda( 3995 this=exp.Length(this=exp.to_identifier("x")), 3996 expressions=[exp.to_identifier("x")], 3997 ), 3998 ], 3999 ) 4000 return exp.Coalesce( 4001 this=exp.Anonymous(this="LIST_SUM", expressions=[transform]), 4002 expressions=[exp.Literal.number(0)], 4003 ) 4004 4005 # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset 4006 base_pos: exp.Expr = ( 4007 exp.Literal.number(1) 4008 + sum_lengths("STRING_SPLIT_REGEX", occurrence) 4009 + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1)) 4010 + pos_offset 4011 ) 4012 4013 # option=1: add match length for end position 4014 if option and option.is_int and option.to_py() == 1: 4015 match_at_occ = exp.Bracket( 4016 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]), 4017 expressions=[occurrence], 4018 offset=1, 4019 ) 4020 base_pos = base_pos + exp.Coalesce( 4021 this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)] 4022 ) 4023 4024 # NULL checks for all provided arguments 4025 # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST 4026 null_args = [ 4027 expression.this, 4028 expression.expression, 4029 position, 4030 orig_occ, 4031 option, 4032 parameters, 4033 ] 4034 null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg] 4035 4036 matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 4037 4038 return self.sql( 4039 exp.case() 4040 .when(exp.or_(*null_checks), exp.Null()) 4041 .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0)) 4042 .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0)) 4043 .else_(base_pos) 4044 )
4046 @unsupported_args("culture") 4047 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 4048 fmt = expression.args.get("format") 4049 if fmt and fmt.is_int: 4050 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 4051 4052 self.unsupported("Only integer formats are supported by NumberToStr") 4053 return self.function_fallback_sql(expression)
4066 def posexplode_sql(self, expression: exp.Posexplode) -> str: 4067 this = expression.this 4068 parent = expression.parent 4069 4070 # The default Spark aliases are "pos" and "col", unless specified otherwise 4071 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 4072 4073 if isinstance(parent, exp.Aliases): 4074 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 4075 pos, col = parent.expressions 4076 elif isinstance(parent, exp.Table): 4077 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 4078 alias = parent.args.get("alias") 4079 if alias: 4080 pos, col = alias.columns or [pos, col] 4081 alias.pop() 4082 4083 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 4084 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 4085 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 4086 gen_subscripts = self.sql( 4087 exp.Alias( 4088 this=exp.Anonymous( 4089 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 4090 ) 4091 - exp.Literal.number(1), 4092 alias=pos, 4093 ) 4094 ) 4095 4096 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 4097 4098 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 4099 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 4100 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 4101 4102 return posexplode_sql
4104 def addmonths_sql(self, expression: exp.AddMonths) -> str: 4105 """ 4106 Handles three key issues: 4107 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 4108 2. End-of-month preservation: If input is last day of month, result is last day of result month 4109 3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 4110 """ 4111 from sqlglot.optimizer.annotate_types import annotate_types 4112 4113 this = expression.this 4114 if not this.type: 4115 this = annotate_types(this, dialect=self.dialect) 4116 4117 if this.is_type(*exp.DataType.TEXT_TYPES): 4118 this = exp.Cast(this=this, to=exp.DataType(this=exp.DType.TIMESTAMP)) 4119 4120 # Detect float/decimal months to apply rounding (Snowflake behavior) 4121 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 4122 months_expr = expression.expression 4123 if not months_expr.type: 4124 months_expr = annotate_types(months_expr, dialect=self.dialect) 4125 4126 # Build interval or to_months expression based on type 4127 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 4128 interval_or_to_months = ( 4129 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 4130 if months_expr.is_type( 4131 exp.DType.FLOAT, 4132 exp.DType.DOUBLE, 4133 exp.DType.DECIMAL, 4134 ) 4135 # Integer case: standard INTERVAL N MONTH syntax 4136 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 4137 ) 4138 4139 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 4140 4141 # Apply end-of-month preservation if Snowflake flag is set 4142 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 4143 preserve_eom = expression.args.get("preserve_end_of_month") 4144 result_expr = ( 4145 exp.case() 4146 .when( 4147 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 4148 exp.func("LAST_DAY", date_add_expr), 4149 ) 4150 .else_(date_add_expr) 4151 if preserve_eom 4152 else date_add_expr 4153 ) 4154 4155 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 4156 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 4157 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 4158 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 4159 if this.is_type(exp.DType.DATE, exp.DType.TIMESTAMPTZ): 4160 return self.sql(exp.Cast(this=result_expr, to=this.type)) 4161 return self.sql(result_expr)
Handles three key issues:
- Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
- End-of-month preservation: If input is last day of month, result is last day of result month
- Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
4175 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 4176 unit = expression.args.get("unit") 4177 date = expression.this 4178 4179 week_start = _week_unit_to_dow(unit) 4180 unit = unit_to_str(expression) 4181 4182 if week_start: 4183 result = self.sql( 4184 _build_week_trunc_expression(date, week_start, preserve_start_day=True) 4185 ) 4186 else: 4187 result = self.func("DATE_TRUNC", unit, date) 4188 4189 if ( 4190 expression.args.get("input_type_preserved") 4191 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 4192 and not (is_date_unit(unit) and date.is_type(exp.DType.DATE)) 4193 ): 4194 return self.sql(exp.Cast(this=result, to=date.type)) 4195 4196 return result
4198 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 4199 unit = unit_to_str(expression) 4200 zone = expression.args.get("zone") 4201 timestamp = expression.this 4202 date_unit = is_date_unit(unit) 4203 4204 if date_unit and zone: 4205 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 4206 # Double AT TIME ZONE needed for BigQuery compatibility: 4207 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 4208 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 4209 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 4210 result_sql = self.func("DATE_TRUNC", unit, timestamp) 4211 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 4212 4213 result = self.func("DATE_TRUNC", unit, timestamp) 4214 if expression.args.get("input_type_preserved"): 4215 if timestamp.type and timestamp.is_type(exp.DType.TIME, exp.DType.TIMETZ): 4216 dummy_date = exp.Cast( 4217 this=exp.Literal.string("1970-01-01"), 4218 to=exp.DataType(this=exp.DType.DATE), 4219 ) 4220 date_time = exp.Add(this=dummy_date, expression=timestamp) 4221 result = self.func("DATE_TRUNC", unit, date_time) 4222 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4223 4224 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 4225 date_unit and timestamp.is_type(exp.DType.DATE) 4226 ): 4227 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4228 4229 return result
4231 def trim_sql(self, expression: exp.Trim) -> str: 4232 expression.this.replace(_cast_to_varchar(expression.this)) 4233 if expression.expression: 4234 expression.expression.replace(_cast_to_varchar(expression.expression)) 4235 4236 result_sql = super().trim_sql(expression) 4237 return _gen_with_cast_to_blob(self, expression, result_sql)
4239 def round_sql(self, expression: exp.Round) -> str: 4240 this = expression.this 4241 decimals = expression.args.get("decimals") 4242 truncate = expression.args.get("truncate") 4243 4244 # DuckDB requires the scale (decimals) argument to be an INT 4245 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 4246 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 4247 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 4248 decimals = exp.cast(decimals, exp.DType.INT) 4249 4250 func = "ROUND" 4251 if truncate: 4252 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 4253 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 4254 func = "ROUND_EVEN" 4255 truncate = None 4256 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 4257 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 4258 truncate = None 4259 4260 return self.func(func, this, decimals, truncate)
4262 def strtok_sql(self, expression: exp.Strtok) -> str: 4263 string_arg = expression.this 4264 delimiter_arg = expression.args.get("delimiter") 4265 part_index_arg = expression.args.get("part_index") 4266 4267 if delimiter_arg and part_index_arg: 4268 # Escape regex chars and build character class at runtime using REGEXP_REPLACE 4269 escaped_delimiter = exp.Anonymous( 4270 this="REGEXP_REPLACE", 4271 expressions=[ 4272 delimiter_arg, 4273 exp.Literal.string( 4274 r"([\[\]^.\-*+?(){}|$\\])" 4275 ), # Escape problematic regex chars 4276 exp.Literal.string( 4277 r"\\\1" 4278 ), # Replace with escaped version using $1 backreference 4279 exp.Literal.string("g"), # Global flag 4280 ], 4281 ) 4282 # CASE WHEN delimiter = '' THEN '' ELSE CONCAT('[', escaped_delimiter, ']') END 4283 regex_pattern = ( 4284 exp.case() 4285 .when(delimiter_arg.eq(exp.Literal.string("")), exp.Literal.string("")) 4286 .else_( 4287 exp.func( 4288 "CONCAT", 4289 exp.Literal.string("["), 4290 escaped_delimiter, 4291 exp.Literal.string("]"), 4292 ) 4293 ) 4294 ) 4295 4296 # STRTOK skips empty strings, so we need to filter them out 4297 # LIST_FILTER(REGEXP_SPLIT_TO_ARRAY(string, pattern), x -> x != '')[index] 4298 split_array = exp.func("REGEXP_SPLIT_TO_ARRAY", string_arg, regex_pattern) 4299 x = exp.to_identifier("x") 4300 is_empty = x.eq(exp.Literal.string("")) 4301 filtered_array = exp.func( 4302 "LIST_FILTER", 4303 split_array, 4304 exp.Lambda(this=exp.not_(is_empty.copy()), expressions=[x.copy()]), 4305 ) 4306 base_func = exp.Bracket( 4307 this=filtered_array, 4308 expressions=[part_index_arg], 4309 offset=1, 4310 ) 4311 4312 # Use template with the built regex pattern 4313 result = exp.replace_placeholders( 4314 self.STRTOK_TEMPLATE.copy(), 4315 string=string_arg, 4316 delimiter=delimiter_arg, 4317 part_index=part_index_arg, 4318 base_func=base_func, 4319 ) 4320 4321 return self.sql(result) 4322 4323 return self.function_fallback_sql(expression)
4325 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 4326 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 4327 4328 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 4329 if expression.is_type(*exp.DataType.REAL_TYPES): 4330 result = f"CAST({result} AS DOUBLE)" 4331 4332 return result
4334 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 4335 """ 4336 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 4337 dividing the input distribution into n equal-sized buckets. 4338 4339 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 4340 does not document the specific algorithm used so results may differ. DuckDB does not 4341 support RESPECT NULLS. 4342 """ 4343 this = expression.this 4344 if isinstance(this, exp.Distinct): 4345 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 4346 if len(this.expressions) < 2: 4347 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 4348 return self.function_fallback_sql(expression) 4349 num_quantiles_expr = this.expressions[1].pop() 4350 else: 4351 num_quantiles_expr = expression.expression 4352 4353 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 4354 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4355 return self.function_fallback_sql(expression) 4356 4357 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 4358 if num_quantiles <= 0: 4359 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4360 return self.function_fallback_sql(expression) 4361 4362 quantiles = [ 4363 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 4364 for i in range(num_quantiles + 1) 4365 ] 4366 4367 return self.sql(exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles)))
BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values dividing the input distribution into n equal-sized buckets.
Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery does not document the specific algorithm used so results may differ. DuckDB does not support RESPECT NULLS.
4376 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 4377 this = expression.this 4378 4379 if _is_binary(this): 4380 expression.type = exp.DType.BINARY.into_expr() 4381 4382 arg = _cast_to_bit(this) 4383 4384 if isinstance(this, exp.Neg): 4385 arg = exp.Paren(this=arg) 4386 4387 expression.set("this", arg) 4388 4389 result_sql = f"~{self.sql(expression, 'this')}" 4390 4391 return _gen_with_cast_to_blob(self, expression, result_sql)
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- WINDOW_FUNCS_WITH_NULL_ORDERING
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SUPPORTS_MERGE_WHERE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_ONLY_LITERALS
- GROUPINGS_SEP
- INDEX_ON
- INOUT_SEPARATOR
- DIRECTED_JOINS
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_WITH_METHOD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- LIKE_PROPERTY_INSIDE_SCHEMA
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- TRY_SUPPORTED
- SUPPORTS_UESCAPE
- UNICODE_SUBSTITUTE
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- SUPPORTS_BETWEEN_FLAGS
- MATCH_AGAINST_TABLE_PREFIX
- DECLARE_DEFAULT_ASSIGNMENT
- UPDATE_STATEMENT_SUPPORTS_FROM
- STAR_EXCLUDE_REQUIRES_DERIVED_TABLE
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- pseudocolumn_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- inoutcolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- triggerproperties_sql
- triggerreferencing_sql
- triggerevent_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- uuidproperty_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- moduleproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- rollupindex_sql
- rollupproperty_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- booland_sql
- boolor_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- prewhere_sql
- where_sql
- partition_by_sql
- windowspec_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- convert_concat_args
- concat_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwiseor_sql
- bitwiserightshift_sql
- cast_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- eq_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- match_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- sub_sql
- trycast_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- generatetext_sql
- generatetable_sql
- generatebool_sql
- generateint_sql
- generatedouble_sql
- mltranslate_sql
- mlforecast_sql
- aiforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- changes_sql
- summarize_sql
- explodinggenerateseries_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- skipjsoncolumn_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- slice_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql
- directorystage_sql
- uuid_sql
- initcap_sql
- localtime_sql
- localtimestamp_sql
- weekstart_sql
- block_sql
- storedprocedure_sql
- ifblock_sql
- whileblock_sql
- execute_sql
- executesql_sql
- altermodifysqlsecurity_sql
- usingproperty_sql
- renameindex_sql