sqlglot.generators.duckdb
from __future__ import annotations

from decimal import Decimal
from itertools import groupby
import re
import typing as t

from sqlglot import exp, generator, transforms

from sqlglot.dialects.dialect import (
    DATETIME_DELTA,
    JSON_EXTRACT_TYPE,
    approx_count_distinct_sql,
    array_append_sql,
    array_compact_sql,
    array_concat_sql,
    arrow_json_extract_sql,
    count_if_to_sum,
    date_delta_to_binary_interval_op,
    datestrtodate_sql,
    encode_decode_sql,
    explode_to_unnest_sql,
    generate_series_sql,
    getbit_sql,
    groupconcat_sql,
    inline_array_unless_query,
    months_between_sql,
    no_datetime_sql,
    no_comment_column_constraint_sql,
    no_make_interval_sql,
    no_time_sql,
    no_timestamp_sql,
    rename_func,
    remove_from_array_using_filter,
    strposition_sql,
    str_to_time_sql,
    timestrtotime_sql,
    unit_to_str,
)
from sqlglot.generator import unsupported_args
from sqlglot.helper import is_date_unit, seq_get

# Regex to detect time zones in timestamps of the form [+|-]TT[:tt]
# The pattern matches timezone offsets that appear after the time portion
TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?")

# Characters that must be escaped when building regex expressions in INITCAP
REGEX_ESCAPE_REPLACEMENTS = {
    "\\": "\\\\",
    "-": r"\-",
    "^": r"\^",
    "[": r"\[",
    "]": r"\]",
}

# Used in RANDSTR transpilation
RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
RANDSTR_SEED = 123456

# Whitespace control characters that must be emitted as `CHR({val})` calls in DuckDB
WS_CONTROL_CHARS_TO_DUCK = {
    "\u000b": 11,
    "\u001c": 28,
    "\u001d": 29,
    "\u001e": 30,
    "\u001f": 31,
}

# Days of week to ISO 8601 day-of-week numbers
# ISO 8601 standard: Monday=1, Tuesday=2, Wednesday=3, Thursday=4, Friday=5, Saturday=6, Sunday=7
WEEK_START_DAY_TO_DOW = {
    "MONDAY": 1,
    "TUESDAY": 2,
    "WEDNESDAY": 3,
    "THURSDAY": 4,
    "FRIDAY": 5,
    "SATURDAY": 6,
    "SUNDAY": 7,
}

MAX_BIT_POSITION = exp.Literal.number(32768)

# cs/as/ps are Snowflake defaults; DuckDB already behaves the same way, so they are safe to drop.
# Note: "as" is also a reserved keyword in DuckDB, making it impossible to pass through.
_SNOWFLAKE_COLLATION_DEFAULTS = frozenset({"cs", "as", "ps"})
_SNOWFLAKE_COLLATION_UNSUPPORTED = frozenset(
    {"ci", "ai", "upper", "lower", "utf8", "bin", "pi", "fl", "fu", "trim", "ltrim", "rtrim"}
)

# Window functions that support IGNORE/RESPECT NULLS in DuckDB
_IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = (
    exp.FirstValue,
    exp.Lag,
    exp.LastValue,
    exp.Lead,
    exp.NthValue,
)

# SEQ function constants
_SEQ_BASE: exp.Expr = exp.maybe_parse("(ROW_NUMBER() OVER (ORDER BY 1) - 1)")
_SEQ_RESTRICTED = (exp.Where, exp.Having, exp.AggFunc, exp.Order, exp.Select)
# Maps SEQ expression types to their byte width (suffix indicates bytes: SEQ1=1, SEQ2=2, etc.)
_SEQ_BYTE_WIDTH = {exp.Seq1: 1, exp.Seq2: 2, exp.Seq4: 4, exp.Seq8: 8}

# Templates for generating signed and unsigned SEQ values within a specified range
_SEQ_UNSIGNED: exp.Expr = exp.maybe_parse(":base % :max_val")
_SEQ_SIGNED: exp.Expr = exp.maybe_parse(
    "(CASE WHEN :base % :max_val >= :half "
    "THEN :base % :max_val - :max_val "
    "ELSE :base % :max_val END)"
)


def _apply_base64_alphabet_replacements(
    result: exp.Expr,
    alphabet: exp.Expr | None,
    reverse: bool = False,
) -> exp.Expr:
    """
    Apply base64 alphabet character replacements.

    A base64 alphabet can be 1-3 chars: 1st = index 62 ('+'), 2nd = index 63 ('/'), 3rd = padding ('=').
    zip truncates to the shorter string, so a 1-char alphabet only replaces '+', a 2-char one replaces '+/', etc.

    Args:
        result: The expression to apply replacements to.
        alphabet: Custom alphabet literal (expected chars for +/=).
        reverse: If False, replace default with custom (encode).
            If True, replace custom with default (decode).
    """
    if isinstance(alphabet, exp.Literal) and alphabet.is_string:
        for default_char, new_char in zip("+/=", alphabet.this):
            if new_char != default_char:
                find, replace = (new_char, default_char) if reverse else (default_char, new_char)
                result = exp.Replace(
                    this=result,
                    expression=exp.Literal.string(find),
                    replacement=exp.Literal.string(replace),
                )
    return result


def _base64_decode_sql(self: DuckDBGenerator, expression: exp.Expr, to_string: bool) -> str:
    """
    Transpile Snowflake BASE64_DECODE_STRING/BINARY to DuckDB.

    DuckDB uses FROM_BASE64() which returns BLOB. For string output, wrap with DECODE().
    Custom alphabets require REPLACE() calls to convert to standard base64.
    """
    input_expr = expression.this
    alphabet = expression.args.get("alphabet")

    # Handle a custom alphabet by replacing non-standard chars with standard ones
    input_expr = _apply_base64_alphabet_replacements(input_expr, alphabet, reverse=True)

    # FROM_BASE64 returns BLOB
    input_expr = exp.FromBase64(this=input_expr)

    if to_string:
        input_expr = exp.Decode(this=input_expr)

    return self.sql(input_expr)
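
# Illustrative sketch of the output shape (input value `x` is hypothetical): a Snowflake call
# like BASE64_DECODE_STRING(x, '$.') with the custom alphabet '$.' would come out roughly as
#   DECODE(FROM_BASE64(REPLACE(REPLACE(x, '$', '+'), '.', '/')))
# i.e. one REPLACE per remapped alphabet character, applied before decoding.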

def _last_day_sql(self: DuckDBGenerator, expression: exp.LastDay) -> str:
    """
    DuckDB's LAST_DAY only supports finding the last day of a month.
    For other date parts (year, quarter, week), we need to implement equivalent logic.
    """
    date_expr = expression.this
    unit = expression.text("unit")

    if not unit or unit.upper() == "MONTH":
        # Default behavior - use DuckDB's native LAST_DAY
        return self.func("LAST_DAY", date_expr)

    if unit.upper() == "YEAR":
        # Last day of year: December 31st of the same year
        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
        make_date_expr = exp.func(
            "MAKE_DATE", year_expr, exp.Literal.number(12), exp.Literal.number(31)
        )
        return self.sql(make_date_expr)

    if unit.upper() == "QUARTER":
        # Last day of quarter
        year_expr = exp.func("EXTRACT", "YEAR", date_expr)
        quarter_expr = exp.func("EXTRACT", "QUARTER", date_expr)

        # Calculate the last month of the quarter: quarter * 3 (quarter can be 1 to 4)
        last_month_expr = exp.Mul(this=quarter_expr, expression=exp.Literal.number(3))
        first_day_last_month_expr = exp.func(
            "MAKE_DATE", year_expr, last_month_expr, exp.Literal.number(1)
        )

        # Last day of the last month of the quarter
        last_day_expr = exp.func("LAST_DAY", first_day_last_month_expr)
        return self.sql(last_day_expr)

    if unit.upper() == "WEEK":
        # DuckDB DAYOFWEEK: Sunday=0, Monday=1, ..., Saturday=6
        dow = exp.func("EXTRACT", "DAYOFWEEK", date_expr)
        # Days to the last day of the week: (7 - dayofweek) % 7, assuming the last day of the week is Sunday (Snowflake)
        # Wrap in parentheses to ensure correct precedence
        days_to_sunday_expr = exp.Mod(
            this=exp.Paren(this=exp.Sub(this=exp.Literal.number(7), expression=dow)),
            expression=exp.Literal.number(7),
        )
        interval_expr = exp.Interval(this=days_to_sunday_expr, unit=exp.var("DAY"))
        add_expr = exp.Add(this=date_expr, expression=interval_expr)
        cast_expr = exp.cast(add_expr, exp.DType.DATE)
        return self.sql(cast_expr)

    self.unsupported(f"Unsupported date part '{unit}' in LAST_DAY function")
    return self.function_fallback_sql(expression)
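
# Illustrative sketch of the QUARTER branch (column `d` is hypothetical):
#   LAST_DAY(d, QUARTER) -> roughly LAST_DAY(MAKE_DATE(EXTRACT(YEAR FROM d), EXTRACT(QUARTER FROM d) * 3, 1))
# i.e. the last day of the third month of d's quarter.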

def _is_nanosecond_unit(unit: exp.Expr | None) -> bool:
    return isinstance(unit, (exp.Var, exp.Literal)) and unit.name.upper() == "NANOSECOND"


def _handle_nanosecond_diff(
    self: DuckDBGenerator,
    end_time: exp.Expr,
    start_time: exp.Expr,
) -> str:
    """Generate a NANOSECOND diff using EPOCH_NS, since DATE_DIFF doesn't support it."""
    end_ns = exp.cast(end_time, exp.DType.TIMESTAMP_NS)
    start_ns = exp.cast(start_time, exp.DType.TIMESTAMP_NS)

    # Build expression tree: EPOCH_NS(end) - EPOCH_NS(start)
    return self.sql(
        exp.Sub(this=exp.func("EPOCH_NS", end_ns), expression=exp.func("EPOCH_NS", start_ns))
    )


def _to_boolean_sql(self: DuckDBGenerator, expression: exp.ToBoolean) -> str:
    """
    Transpile TO_BOOLEAN and TRY_TO_BOOLEAN functions from Snowflake to their DuckDB equivalent.

    DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings except 'on'/'off',
    so we need to handle the 'on'/'off' cases explicitly.

    For TO_BOOLEAN (safe=False): NaN and INF values cause errors. We use DuckDB's native ERROR()
    function to replicate this behavior with a clear error message.

    For TRY_TO_BOOLEAN (safe=True): Use DuckDB's TRY_CAST for conversion, which returns NULL
    for invalid inputs instead of throwing errors.
    """
    arg = expression.this
    is_safe = expression.args.get("safe", False)

    base_case_expr = (
        exp.case()
        .when(
            # Handle 'on' -> TRUE (case insensitive)
            exp.Upper(this=exp.cast(arg, exp.DType.VARCHAR)).eq(exp.Literal.string("ON")),
            exp.true(),
        )
        .when(
            # Handle 'off' -> FALSE (case insensitive)
            exp.Upper(this=exp.cast(arg, exp.DType.VARCHAR)).eq(exp.Literal.string("OFF")),
            exp.false(),
        )
    )

    if is_safe:
        # TRY_TO_BOOLEAN: handle 'on'/'off' and use TRY_CAST for everything else
        case_expr = base_case_expr.else_(exp.func("TRY_CAST", arg, exp.DType.BOOLEAN.into_expr()))
    else:
        # TO_BOOLEAN: handle NaN/INF errors, 'on'/'off', and use a regular CAST
        cast_to_real = exp.func("TRY_CAST", arg, exp.DType.FLOAT.into_expr())

        # Check for NaN and INF values
        nan_inf_check = exp.Or(
            this=exp.func("ISNAN", cast_to_real), expression=exp.func("ISINF", cast_to_real)
        )

        case_expr = base_case_expr.when(
            nan_inf_check,
            exp.func(
                "ERROR",
                exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
            ),
        ).else_(exp.cast(arg, exp.DType.BOOLEAN))

    return self.sql(case_expr)
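
# Sketch of the generated shape for the safe variant (input `x` is hypothetical, simplified):
#   TRY_TO_BOOLEAN(x) ->
#     CASE
#       WHEN UPPER(CAST(x AS VARCHAR)) = 'ON' THEN TRUE
#       WHEN UPPER(CAST(x AS VARCHAR)) = 'OFF' THEN FALSE
#       ELSE TRY_CAST(x AS BOOLEAN)
#     END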

# BigQuery -> DuckDB conversion for the DATE function
def _date_sql(self: DuckDBGenerator, expression: exp.Date) -> str:
    this = expression.this
    zone = self.sql(expression, "zone")

    if zone:
        # BigQuery considers "this" to be at UTC, converts it to the specified
        # time zone and then keeps only the DATE part.
        # To mimic that, we:
        # (1) Cast to TIMESTAMP to remove DuckDB's local tz
        # (2) Apply consecutive AtTimeZone calls for the UTC -> zone conversion
        this = exp.cast(this, exp.DType.TIMESTAMP)
        at_utc = exp.AtTimeZone(this=this, zone=exp.Literal.string("UTC"))
        this = exp.AtTimeZone(this=at_utc, zone=zone)

    return self.sql(exp.cast(expression=this, to=exp.DType.DATE))


# BigQuery -> DuckDB conversion for the TIME_DIFF function
def _timediff_sql(self: DuckDBGenerator, expression: exp.TimeDiff) -> str:
    unit = expression.unit

    if _is_nanosecond_unit(unit):
        return _handle_nanosecond_diff(self, expression.expression, expression.this)

    this = exp.cast(expression.this, exp.DType.TIME)
    expr = exp.cast(expression.expression, exp.DType.TIME)

    # Although the two dialects share similar signatures, BQ seems to invert
    # the sign of the result, so the start/end time operands are flipped
    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)


def _date_delta_to_binary_interval_op(
    cast: bool = True,
) -> t.Callable[[DuckDBGenerator, DATETIME_DELTA], str]:
    """
    DuckDB override to handle:
    1. NANOSECOND operations (DuckDB doesn't support INTERVAL ... NANOSECOND)
    2. Float/decimal interval values (DuckDB INTERVAL requires integers)
    """
    base_impl = date_delta_to_binary_interval_op(cast=cast)

    def _duckdb_date_delta_sql(self: DuckDBGenerator, expression: DATETIME_DELTA) -> str:
        unit = expression.unit
        interval_value = expression.expression

        # Handle the NANOSECOND unit (DuckDB doesn't support INTERVAL ... NANOSECOND)
        if _is_nanosecond_unit(unit):
            if isinstance(interval_value, exp.Interval):
                interval_value = interval_value.this

            timestamp_ns = exp.cast(expression.this, exp.DType.TIMESTAMP_NS)

            return self.sql(
                exp.func(
                    "MAKE_TIMESTAMP_NS",
                    exp.Add(this=exp.func("EPOCH_NS", timestamp_ns), expression=interval_value),
                )
            )

        # Handle float/decimal interval values, as DuckDB's INTERVAL requires integer expressions
        if not interval_value or isinstance(interval_value, exp.Interval):
            return base_impl(self, expression)

        if interval_value.is_type(*exp.DataType.REAL_TYPES):
            expression.set("expression", exp.cast(exp.func("ROUND", interval_value), "INT"))

        return base_impl(self, expression)

    return _duckdb_date_delta_sql
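
# Illustrative sketches of the two special cases (operands `ts` and `d` are hypothetical):
#   DATEADD(NANOSECOND, 5, ts) -> MAKE_TIMESTAMP_NS(EPOCH_NS(CAST(ts AS TIMESTAMP_NS)) + 5)
#   DATEADD(DAY, 1.5, d)       -> roughly d + INTERVAL (CAST(ROUND(1.5) AS INT)) DAY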

def _array_insert_sql(self: DuckDBGenerator, expression: exp.ArrayInsert) -> str:
    """
    Transpile ARRAY_INSERT to DuckDB using LIST_CONCAT and slicing.

    Handles:
    - 0-based and 1-based indexing (normalizes to 0-based for calculations)
    - Negative position conversion (requires array length)
    - NULL propagation (source dialects return NULL, DuckDB creates a single-element array)
    - Assumes position is within bounds per user constraint

    Note: All dialects that support ARRAY_INSERT (Snowflake, Spark, Databricks) have
    ARRAY_FUNCS_PROPAGATES_NULLS=True, so we always assume the source propagates NULLs.

    Args:
        expression: The ArrayInsert expression to transpile.

    Returns:
        SQL string implementing ARRAY_INSERT behavior.
    """
    this = expression.this
    position = expression.args.get("position")
    element = expression.expression
    element_array = exp.Array(expressions=[element])
    index_offset = expression.args.get("offset", 0)

    if not position or not position.is_int:
        self.unsupported("ARRAY_INSERT can only be transpiled with a literal position")
        return self.func("ARRAY_INSERT", this, position, element)

    pos_value = position.to_py()

    # Normalize one-based indexing to zero-based for slice calculations
    # Spark (1-based) -> Snowflake (0-based):
    #   Positive: pos=1 -> pos=0 (subtract 1)
    #   Negative: pos=-2 -> pos=-1 (add 1)
    # Example: Spark array_insert([a,b,c], -2, d) -> [a,b,d,c] is the same as Snowflake pos=-1
    if pos_value > 0:
        pos_value = pos_value - index_offset
    elif pos_value < 0:
        pos_value = pos_value + index_offset

    # Build the appropriate list_concat expression based on position
    if pos_value == 0:
        # Insert at the beginning
        concat_exprs = [element_array, this]
    elif pos_value > 0:
        # Positive position: LIST_CONCAT(arr[1:pos], [elem], arr[pos+1:])
        # 0-based -> DuckDB 1-based slicing

        # left slice: arr[1:pos]
        slice_start = exp.Bracket(
            this=this,
            expressions=[
                exp.Slice(this=exp.Literal.number(1), expression=exp.Literal.number(pos_value))
            ],
        )

        # right slice: arr[pos+1:]
        slice_end = exp.Bracket(
            this=this, expressions=[exp.Slice(this=exp.Literal.number(pos_value + 1))]
        )

        concat_exprs = [slice_start, element_array, slice_end]
    else:
        # Negative position: arr[1:LEN(arr)+pos], [elem], arr[LEN(arr)+pos+1:]
        # pos=-1 means insert before the last element
        arr_len = exp.Length(this=this)

        # Calculate the slice position: LEN(arr) + pos (e.g., LEN(arr) + (-1) = LEN(arr) - 1)
        slice_end_pos = arr_len + exp.Literal.number(pos_value)
        slice_start_pos = slice_end_pos + exp.Literal.number(1)

        # left slice: arr[1:LEN(arr)+pos]
        slice_start = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end_pos)],
        )

        # right slice: arr[LEN(arr)+pos+1:]
        slice_end = exp.Bracket(this=this, expressions=[exp.Slice(this=slice_start_pos)])

        concat_exprs = [slice_start, element_array, slice_end]

    # All dialects that support ARRAY_INSERT propagate NULLs (Snowflake/Spark/Databricks)
    # Wrap in CASE WHEN array IS NULL THEN NULL ELSE func_expr END
    return self.sql(
        exp.If(
            this=exp.Is(this=this, expression=exp.Null()),
            true=exp.Null(),
            false=self.func("LIST_CONCAT", *concat_exprs),
        )
    )
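
# Worked example of the positive-position branch (names are hypothetical, 0-based offset):
#   ARRAY_INSERT(arr, 1, d) -> roughly
#   CASE WHEN arr IS NULL THEN NULL ELSE LIST_CONCAT(arr[1 : 1], [d], arr[2 : ]) END
# so [a, b, c] becomes [a, d, b, c].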

def _array_remove_at_sql(self: DuckDBGenerator, expression: exp.ArrayRemoveAt) -> str:
    """
    Transpile ARRAY_REMOVE_AT to DuckDB using LIST_CONCAT and slicing.

    Handles:
    - Positive positions (0-based indexing)
    - Negative positions (from the end of the array)
    - NULL propagation (Snowflake returns NULL for a NULL array, DuckDB doesn't auto-propagate)
    - Only supports literal integer positions (non-literals remain untranspiled)

    Transpilation patterns:
    - pos=0 (first): arr[2:]
    - pos>0 (middle): LIST_CONCAT(arr[1:p], arr[p+2:])
    - pos=-1 (last): arr[1:LEN(arr)-1]
    - pos<-1: LIST_CONCAT(arr[1:LEN(arr)+p], arr[LEN(arr)+p+2:])

    All wrapped in: CASE WHEN arr IS NULL THEN NULL ELSE ... END

    Args:
        expression: The ArrayRemoveAt expression to transpile.

    Returns:
        SQL string implementing ARRAY_REMOVE_AT behavior.
    """
    this = expression.this
    position = expression.args.get("position")

    if not position or not position.is_int:
        self.unsupported("ARRAY_REMOVE_AT can only be transpiled with a literal position")
        return self.func("ARRAY_REMOVE_AT", this, position)

    pos_value = position.to_py()

    # Build the appropriate expression based on position
    if pos_value == 0:
        # Remove the first element: arr[2:]
        result_expr: exp.Expr | str = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=exp.Literal.number(2))],
        )
    elif pos_value > 0:
        # Remove at a positive position: LIST_CONCAT(arr[1:pos], arr[pos+2:])
        # DuckDB uses 1-based slicing
        left_slice = exp.Bracket(
            this=this,
            expressions=[
                exp.Slice(this=exp.Literal.number(1), expression=exp.Literal.number(pos_value))
            ],
        )
        right_slice = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=exp.Literal.number(pos_value + 2))],
        )
        result_expr = self.func("LIST_CONCAT", left_slice, right_slice)
    elif pos_value == -1:
        # Remove the last element: arr[1:LEN(arr)-1]
        # Optimization: simpler than the general negative case
        arr_len = exp.Length(this=this)
        slice_end = arr_len + exp.Literal.number(-1)
        result_expr = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end)],
        )
    else:
        # Remove at a negative position: LIST_CONCAT(arr[1:LEN(arr)+pos], arr[LEN(arr)+pos+2:])
        arr_len = exp.Length(this=this)
        slice_end_pos = arr_len + exp.Literal.number(pos_value)
        slice_start_pos = slice_end_pos + exp.Literal.number(2)

        left_slice = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=exp.Literal.number(1), expression=slice_end_pos)],
        )
        right_slice = exp.Bracket(
            this=this,
            expressions=[exp.Slice(this=slice_start_pos)],
        )
        result_expr = self.func("LIST_CONCAT", left_slice, right_slice)

    # Snowflake ARRAY_FUNCS_PROPAGATES_NULLS=True, so wrap in a NULL check:
    # CASE WHEN array IS NULL THEN NULL ELSE result_expr END
    return self.sql(
        exp.If(
            this=exp.Is(this=this, expression=exp.Null()),
            true=exp.Null(),
            false=result_expr,
        )
    )
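
# Worked example of the middle-position pattern (names are hypothetical):
#   ARRAY_REMOVE_AT(arr, 1) -> roughly
#   CASE WHEN arr IS NULL THEN NULL ELSE LIST_CONCAT(arr[1 : 1], arr[3 : ]) END
# so [a, b, c] becomes [a, c].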

@unsupported_args(("expression", "DuckDB's ARRAY_SORT does not support a comparator."))
def _array_sort_sql(self: DuckDBGenerator, expression: exp.ArraySort) -> str:
    return self.func("ARRAY_SORT", expression.this)


def _array_contains_sql(self: DuckDBGenerator, expression: exp.ArrayContains) -> str:
    this = expression.this
    expr = expression.expression

    func = self.func("ARRAY_CONTAINS", this, expr)

    if expression.args.get("check_null"):
        check_null_in_array = exp.Nullif(
            this=exp.NEQ(this=exp.ArraySize(this=this), expression=exp.func("LIST_COUNT", this)),
            expression=exp.false(),
        )
        return self.sql(exp.If(this=expr.is_(exp.Null()), true=check_null_in_array, false=func))

    return func


def _array_overlaps_sql(self: DuckDBGenerator, expression: exp.ArrayOverlaps) -> str:
    """
    Translates Snowflake's NULL-safe ARRAYS_OVERLAP to DuckDB.

    DuckDB's native && operator is not NULL-safe: [1,NULL,3] && [NULL,4,5] returns FALSE.
    Snowflake returns TRUE when both arrays contain NULL (NULLs are treated as known values).

    Generated SQL: (arr1 && arr2) OR (ARRAY_LENGTH(arr1) <> LIST_COUNT(arr1) AND ARRAY_LENGTH(arr2) <> LIST_COUNT(arr2))

    ARRAY_LENGTH counts all elements (including NULLs); LIST_COUNT counts only non-NULLs.
    When they differ, the array contains at least one NULL, matching Snowflake's NULL-safe semantics.
    """
    if not expression.args.get("null_safe"):
        return self.binary(expression, "&&")

    arr1 = expression.this
    arr2 = expression.expression

    check_nulls = exp.and_(
        exp.NEQ(
            this=exp.ArraySize(this=arr1.copy()),
            expression=exp.func("LIST_COUNT", arr1.copy()),
        ),
        exp.NEQ(
            this=exp.ArraySize(this=arr2.copy()),
            expression=exp.func("LIST_COUNT", arr2.copy()),
        ),
        copy=False,
    )

    overlap = exp.ArrayOverlaps(this=arr1.copy(), expression=arr2.copy())

    return self.sql(
        exp.or_(
            exp.paren(overlap, copy=False),
            exp.paren(check_nulls, copy=False),
            copy=False,
            wrap=False,
        )
    )


def _struct_sql(self: DuckDBGenerator, expression: exp.Struct) -> str:
    ancestor_cast = expression.find_ancestor(exp.Cast, exp.Select)
    ancestor_cast = None if isinstance(ancestor_cast, exp.Select) else ancestor_cast

    # An empty struct cast works with MAP(), since DuckDB can't parse {}
    if not expression.expressions:
        if isinstance(ancestor_cast, exp.Cast) and ancestor_cast.to.is_type(exp.DType.MAP):
            return "MAP()"

    args: list[str] = []

    # BigQuery allows inline construction such as "STRUCT<a STRING, b INTEGER>('str', 1)" which is
    # canonicalized to "ROW('str', 1) AS STRUCT(a TEXT, b INT)" in DuckDB.
    # The transformation to ROW will take place if:
    # 1. The STRUCT itself does not have proper fields (key := value) as a "proper" STRUCT would
    # 2. A cast to STRUCT / ARRAY of STRUCTs is found
    is_bq_inline_struct = (
        (expression.find(exp.PropertyEQ) is None)
        and ancestor_cast
        and any(
            casted_type.is_type(exp.DType.STRUCT)
            for casted_type in ancestor_cast.find_all(exp.DataType)
        )
    )

    for i, expr in enumerate(expression.expressions):
        is_property_eq = isinstance(expr, exp.PropertyEQ)
        this = expr.this
        value = expr.expression if is_property_eq else expr

        if is_bq_inline_struct:
            args.append(self.sql(value))
        else:
            if isinstance(this, exp.Identifier):
                key = self.sql(exp.Literal.string(expr.name))
            elif is_property_eq:
                key = self.sql(this)
            else:
                key = self.sql(exp.Literal.string(f"_{i}"))

            args.append(f"{key}: {self.sql(value)}")

    csv_args = ", ".join(args)

    return f"ROW({csv_args})" if is_bq_inline_struct else f"{{{csv_args}}}"
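
# Illustrative sketch of the two output shapes:
#   BigQuery STRUCT<a STRING, b INT64>('str', 1) -> roughly CAST(ROW('str', 1) AS STRUCT(a TEXT, b INT))
#   Keyed struct STRUCT(1 AS x)                  -> the DuckDB literal {'x': 1}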

def _datatype_sql(self: DuckDBGenerator, expression: exp.DataType) -> str:
    if expression.is_type("array"):
        return f"{self.expressions(expression, flat=True)}[{self.expressions(expression, key='values', flat=True)}]"

    # Modifiers are not supported for TIME, [TIME | TIMESTAMP] WITH TIME ZONE
    if expression.is_type(exp.DType.TIME, exp.DType.TIMETZ, exp.DType.TIMESTAMPTZ):
        return expression.this.value

    return self.datatype_sql(expression)


def _json_format_sql(self: DuckDBGenerator, expression: exp.JSONFormat) -> str:
    sql = self.func("TO_JSON", expression.this, expression.args.get("options"))
    return f"CAST({sql} AS TEXT)"


def _build_seq_expression(base: exp.Expr, byte_width: int, signed: bool) -> exp.Expr:
    """Build a SEQ expression with the given base, byte width, and signedness."""
    bits = byte_width * 8
    max_val = exp.Literal.number(2**bits)

    if signed:
        half = exp.Literal.number(2 ** (bits - 1))
        return exp.replace_placeholders(_SEQ_SIGNED.copy(), base=base, max_val=max_val, half=half)
    return exp.replace_placeholders(_SEQ_UNSIGNED.copy(), base=base, max_val=max_val)


def _seq_to_range_in_generator(expression: exp.Expr) -> exp.Expr:
    """
    Transform SEQ functions to `range` column references when inside a GENERATOR context.

    When GENERATOR(ROWCOUNT => N) becomes RANGE(N) in DuckDB, it produces a column
    named `range` with values 0, 1, ..., N-1. SEQ functions produce the same sequence,
    so we replace them with `range % max_val` to avoid nested window function issues.
    """
    if not isinstance(expression, exp.Select):
        return expression

    from_ = expression.args.get("from_")
    if not (
        from_
        and isinstance(from_.this, exp.TableFromRows)
        and isinstance(from_.this.this, exp.Generator)
    ):
        return expression

    def replace_seq(node: exp.Expr) -> exp.Expr:
        if isinstance(node, (exp.Seq1, exp.Seq2, exp.Seq4, exp.Seq8)):
            byte_width = _SEQ_BYTE_WIDTH[type(node)]
            return _build_seq_expression(exp.column("range"), byte_width, signed=node.name == "1")
        return node

    return expression.transform(replace_seq, copy=False)


def _seq_sql(self: DuckDBGenerator, expression: exp.Func, byte_width: int) -> str:
    """
    Transpile Snowflake SEQ1/SEQ2/SEQ4/SEQ8 to DuckDB.

    Generates monotonically increasing integers starting from 0.
    The signed parameter (0 or 1) affects the wrap-around behavior:
    - Unsigned (0): wraps at 2^(bits) - 1
    - Signed (1): wraps at 2^(bits-1) - 1, then goes negative
    """
    # Warn if SEQ is in a restricted context (Select stops the search at the current scope)
    ancestor = expression.find_ancestor(*_SEQ_RESTRICTED)
    if ancestor and (
        (not isinstance(ancestor, (exp.Order, exp.Select)))
        or (isinstance(ancestor, exp.Order) and isinstance(ancestor.parent, exp.Window))
    ):
        self.unsupported("SEQ in restricted context is not supported - use CTE or subquery")

    result = _build_seq_expression(_SEQ_BASE.copy(), byte_width, signed=expression.name == "1")
    return self.sql(result)
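
# Illustrative sketch: an unsigned SEQ1() (byte width 1, so 2^8 = 256) becomes roughly
#   (ROW_NUMBER() OVER (ORDER BY 1) - 1) % 256
# while the signed variant wraps into the negative range via the _SEQ_SIGNED CASE template.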

def _unix_to_time_sql(self: DuckDBGenerator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this
    target_type = expression.args.get("target_type")

    # Check if we need NTZ (naive timestamp in UTC)
    is_ntz = target_type and target_type.this in (
        exp.DType.TIMESTAMP,
        exp.DType.TIMESTAMPNTZ,
    )

    if scale == exp.UnixToTime.MILLIS:
        # EPOCH_MS already returns TIMESTAMP (naive, UTC)
        return self.func("EPOCH_MS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        # MAKE_TIMESTAMP already returns TIMESTAMP (naive, UTC)
        return self.func("MAKE_TIMESTAMP", timestamp)

    # Other scales: divide and use TO_TIMESTAMP
    if scale not in (None, exp.UnixToTime.SECONDS):
        timestamp = exp.Div(this=timestamp, expression=exp.func("POW", 10, scale))

    to_timestamp: exp.Expr = exp.Anonymous(this="TO_TIMESTAMP", expressions=[timestamp])

    if is_ntz:
        to_timestamp = exp.AtTimeZone(this=to_timestamp, zone=exp.Literal.string("UTC"))

    return self.sql(to_timestamp)


WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In, exp.Not)


def _arrow_json_extract_sql(self: DuckDBGenerator, expression: JSON_EXTRACT_TYPE) -> str:
    arrow_sql = arrow_json_extract_sql(self, expression)
    if not expression.same_parent and isinstance(
        expression.parent, WRAPPED_JSON_EXTRACT_EXPRESSIONS
    ):
        arrow_sql = self.wrap(arrow_sql)
    return arrow_sql


def _implicit_datetime_cast(
    arg: exp.Expr | None, type: exp.DType = exp.DType.DATE
) -> exp.Expr | None:
    if isinstance(arg, exp.Literal) and arg.is_string:
        ts = arg.name
        if type == exp.DType.DATE and ":" in ts:
            type = exp.DType.TIMESTAMPTZ if TIMEZONE_PATTERN.search(ts) else exp.DType.TIMESTAMP

        arg = exp.cast(arg, type)

    return arg


def _week_unit_to_dow(unit: exp.Expr | None) -> int | None:
    """
    Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming
    from other dialects, e.g. BigQuery's WEEK(<day>) or ISOWEEK unit parts.

    Args:
        unit: The unit expression (Var for ISOWEEK or WeekStart)

    Returns:
        The ISO 8601 day number (Monday=1, ..., Sunday=7), or None if it's not a week
        unit or the day is dynamic (not a constant).

    Examples:
        "WEEK(SUNDAY)" -> 7
        "WEEK(MONDAY)" -> 1
        "ISOWEEK" -> 1
    """
    # Handle plain Var expressions for ISOWEEK only
    if isinstance(unit, exp.Var) and unit.name.upper() == "ISOWEEK":
        return 1

    # Handle WeekStart expressions with an explicit day
    if isinstance(unit, exp.WeekStart):
        return WEEK_START_DAY_TO_DOW.get(unit.name.upper())

    return None


def _build_week_trunc_expression(
    date_expr: exp.Expr,
    start_dow: int,
    preserve_start_day: bool = False,
) -> exp.Expr:
    """
    Build a DATE_TRUNC expression for week boundaries with a custom start day.

    DuckDB's DATE_TRUNC('WEEK', ...) always returns Monday. To align to a different
    start day, we shift the date before truncating.

    Args:
        date_expr: The date expression to truncate.
        start_dow: ISO 8601 day-of-week number (Monday=1, ..., Sunday=7).
        preserve_start_day: If True, reverse the shift after truncating so the result lands on the
            correct week start day. Needed for DATE_TRUNC (the absolute result matters) but
            not for DATE_DIFF (only relative alignment matters).

    Shift formula: Sunday (7) gets +1, others get (1 - start_dow).
    """
    shift_days = 1 if start_dow == 7 else 1 - start_dow
    truncated = exp.func("DATE_TRUNC", unit=exp.var("WEEK"), this=date_expr)

    if shift_days == 0:
        return truncated

    shift = exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY"))
    shifted_date = exp.DateAdd(this=date_expr, expression=shift)
    truncated.set("this", shifted_date)

    if preserve_start_day:
        interval = exp.Interval(this=exp.Literal.string(str(-shift_days)), unit=exp.var("DAY"))
        return exp.cast(
            exp.DateAdd(this=truncated, expression=interval), to=exp.DType.DATE, copy=False
        )

    return truncated


def _date_diff_sql(self: DuckDBGenerator, expression: exp.DateDiff | exp.DatetimeDiff) -> str:
    unit = expression.unit

    if _is_nanosecond_unit(unit):
        return _handle_nanosecond_diff(self, expression.this, expression.expression)

    this = _implicit_datetime_cast(expression.this)
    expr = _implicit_datetime_cast(expression.expression)

    # DuckDB's WEEK diff does not respect Monday crossings (week boundaries); it checks (end_day - start_day) / 7:
    #   SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
    #   SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference)
    # Whereas for other units such as MONTH it does respect month boundaries:
    #   SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (month crossed)
    date_part_boundary = expression.args.get("date_part_boundary")

    # Extract the week start day; returns None if the day is dynamic (column/placeholder)
    week_start = _week_unit_to_dow(unit)
    if date_part_boundary and week_start and this and expr:
        expression.set("unit", exp.Literal.string("WEEK"))

        # Truncate both dates to week boundaries to respect the input dialect's semantics
        this = _build_week_trunc_expression(this, week_start)
        expr = _build_week_trunc_expression(expr, week_start)

    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)
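
# Illustrative sketch (operands `a` and `b` are hypothetical): a Sunday-start WEEK diff shifts
# both operands one day forward before truncating, so the output looks roughly like
#   DATE_DIFF('WEEK', DATE_TRUNC('WEEK', a + INTERVAL '1' DAY), DATE_TRUNC('WEEK', b + INTERVAL '1' DAY))
# which makes boundary crossings count against the Sunday-based week instead of DuckDB's Monday.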

def _generate_datetime_array_sql(
    self: DuckDBGenerator, expression: exp.GenerateDateArray | exp.GenerateTimestampArray
) -> str:
    is_generate_date_array = isinstance(expression, exp.GenerateDateArray)

    type = exp.DType.DATE if is_generate_date_array else exp.DType.TIMESTAMP
    start = _implicit_datetime_cast(expression.args.get("start"), type=type)
    end = _implicit_datetime_cast(expression.args.get("end"), type=type)

    # BQ's GENERATE_DATE_ARRAY & GENERATE_TIMESTAMP_ARRAY are transformed to DuckDB's GENERATE_SERIES
    gen_series: exp.GenerateSeries | exp.Cast = exp.GenerateSeries(
        start=start, end=end, step=expression.args.get("step")
    )

    if is_generate_date_array:
        # The GENERATE_SERIES result type is a TIMESTAMP array, so to match BQ's semantics for
        # GENERATE_DATE_ARRAY we must cast it back to a DATE array
        gen_series = exp.cast(gen_series, exp.DataType.from_str("ARRAY<DATE>"))

    return self.sql(gen_series)


def _json_extract_value_array_sql(
    self: DuckDBGenerator, expression: exp.JSONValueArray | exp.JSONExtractArray
) -> str:
    json_extract = exp.JSONExtract(this=expression.this, expression=expression.expression)
    data_type = "ARRAY<STRING>" if isinstance(expression, exp.JSONValueArray) else "ARRAY<JSON>"
    return self.sql(exp.cast(json_extract, to=exp.DataType.from_str(data_type)))


def _cast_to_varchar(arg: exp.Expr | None) -> exp.Expr | None:
    if arg and arg.type and not arg.is_type(*exp.DataType.TEXT_TYPES, exp.DType.UNKNOWN):
        return exp.cast(arg, exp.DType.VARCHAR)
    return arg


def _cast_to_boolean(arg: exp.Expr | None) -> exp.Expr | None:
    if arg and not arg.is_type(exp.DType.BOOLEAN):
        return exp.cast(arg, exp.DType.BOOLEAN)
    return arg


def _is_binary(arg: exp.Expr) -> bool:
    return arg.is_type(
        exp.DType.BINARY,
        exp.DType.VARBINARY,
        exp.DType.BLOB,
    )


def _gen_with_cast_to_blob(self: DuckDBGenerator, expression: exp.Expr, result_sql: str) -> str:
    if _is_binary(expression):
        blob = exp.DataType.from_str("BLOB", dialect="duckdb")
        result_sql = self.sql(exp.Cast(this=result_sql, to=blob))
    return result_sql


def _cast_to_bit(arg: exp.Expr) -> exp.Expr:
    if not _is_binary(arg):
        return arg

    if isinstance(arg, exp.HexString):
        arg = exp.Unhex(this=exp.Literal.string(arg.this))

    return exp.cast(arg, exp.DType.BIT)


def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
    if _is_binary(expression.this):
        expression.set("this", _cast_to_bit(expression.this))
    if _is_binary(expression.expression):
        expression.set("expression", _cast_to_bit(expression.expression))


def _day_navigation_sql(self: DuckDBGenerator, expression: exp.NextDay | exp.PreviousDay) -> str:
    """
    Transpile Snowflake's NEXT_DAY / PREVIOUS_DAY to DuckDB using date arithmetic.

    Returns the DATE of the next/previous occurrence of the specified weekday.

    Formulas:
    - NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
    - PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1

    Supports both literal and non-literal day names:
    - Literal: direct lookup (e.g., 'Monday' -> 1)
    - Non-literal: CASE statement for runtime evaluation

    Examples:
        NEXT_DAY('2024-01-01' (Monday), 'Monday')
            -> (1 - 1 + 6) % 7 + 1 = 6 % 7 + 1 = 7 days -> 2024-01-08

        PREVIOUS_DAY('2024-01-15' (Monday), 'Friday')
            -> (1 - 5 + 6) % 7 + 1 = 2 % 7 + 1 = 3 days -> 2024-01-12
    """
    date_expr = expression.this
    day_name_expr = expression.expression

    # Build the ISODOW call for the current day of week
    isodow_call = exp.func("ISODOW", date_expr)

    # Determine the target day of week
    if isinstance(day_name_expr, exp.Literal):
        # Literal day name: look up target_dow directly
        day_name_str = day_name_expr.name.upper()
        matching_day = next(
            (day for day in WEEK_START_DAY_TO_DOW if day.startswith(day_name_str)), None
        )
        if matching_day:
            target_dow: exp.Expr = exp.Literal.number(WEEK_START_DAY_TO_DOW[matching_day])
        else:
            # Unrecognized day name, use the fallback
            return self.function_fallback_sql(expression)
    else:
        # Non-literal day name: build a CASE statement for runtime mapping
        upper_day_name = exp.Upper(this=day_name_expr)
        target_dow = exp.Case(
            ifs=[
                exp.If(
                    this=exp.func(
                        "STARTS_WITH", upper_day_name.copy(), exp.Literal.string(day[:2])
                    ),
                    true=exp.Literal.number(dow_num),
                )
                for day, dow_num in WEEK_START_DAY_TO_DOW.items()
            ]
        )

    # Calculate the days offset and apply the interval based on direction
    if isinstance(expression, exp.NextDay):
        # NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
        days_offset = exp.paren(target_dow - isodow_call + 6, copy=False) % 7 + 1
        date_with_offset = date_expr + exp.Interval(this=days_offset, unit=exp.var("DAY"))
    else:  # exp.PreviousDay
        # PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1
        days_offset = exp.paren(isodow_call - target_dow + 6, copy=False) % 7 + 1
        date_with_offset = date_expr - exp.Interval(this=days_offset, unit=exp.var("DAY"))

    # Build the final result: CAST(date_with_offset AS DATE)
    return self.sql(exp.cast(date_with_offset, exp.DType.DATE))
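
# Illustrative sketch of the literal case (column `d` is hypothetical): NEXT_DAY(d, 'Friday')
# looks up Friday=5 and renders roughly as
#   CAST(d + INTERVAL ((5 - ISODOW(d) + 6) % 7 + 1) DAY AS DATE)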

def _anyvalue_sql(self: DuckDBGenerator, expression: exp.AnyValue) -> str:
    # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL
    having = expression.this
    if isinstance(having, exp.HavingMax):
        func_name = "ARG_MAX_NULL" if having.args.get("max") else "ARG_MIN_NULL"
        return self.func(func_name, having.this, having.expression)
    return self.function_fallback_sql(expression)


def _bitwise_agg_sql(
    self: DuckDBGenerator,
    expression: exp.BitwiseOrAgg | exp.BitwiseAndAgg | exp.BitwiseXorAgg,
) -> str:
    """
    DuckDB's bitwise aggregate functions only accept integer types. For other types:
    - DECIMAL/STRING: use CAST(arg AS INT), which rounds to the nearest integer
    - FLOAT/DOUBLE: use ROUND(arg)::INT to round to the nearest integer first, which is
      required due to float precision loss
    """
    if isinstance(expression, exp.BitwiseOrAgg):
        func_name = "BIT_OR"
    elif isinstance(expression, exp.BitwiseAndAgg):
        func_name = "BIT_AND"
    else:  # exp.BitwiseXorAgg
        func_name = "BIT_XOR"

    arg = expression.this

    if not arg.type:
        from sqlglot.optimizer.annotate_types import annotate_types

        arg = annotate_types(arg, dialect=self.dialect)

    if arg.is_type(*exp.DataType.REAL_TYPES, *exp.DataType.TEXT_TYPES):
        if arg.is_type(*exp.DataType.FLOAT_TYPES):
            # Float types need to be rounded first due to precision loss
            arg = exp.func("ROUND", arg)

        arg = exp.cast(arg, exp.DType.INT)

    return self.func(func_name, arg)
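
# Illustrative sketch (column `f` is hypothetical): a bitwise OR aggregate over a DOUBLE column
# becomes roughly BIT_OR(CAST(ROUND(f) AS INT)), while integer inputs pass through unchanged.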

def _literal_sql_with_ws_chr(self: DuckDBGenerator, literal: str) -> str:
    # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
    if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
        return self.sql(exp.Literal.string(literal))

    sql_segments: list[str] = []
    for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK):
        if is_ws_control:
            for ch in group:
                duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch]
                sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code))))
        else:
            sql_segments.append(self.sql(exp.Literal.string("".join(group))))

    sql = " || ".join(sql_segments)
    return sql if len(sql_segments) == 1 else f"({sql})"


def _escape_regex_metachars(
    self: DuckDBGenerator, delimiters: exp.Expr | None, delimiters_sql: str
) -> str:
    r"""
    Escapes the regex metacharacters \ - ^ [ ] for use in character class regex expressions.

    Literal strings are escaped at transpile time; other expressions are handled with REPLACE() calls.
    """
    if not delimiters:
        return delimiters_sql

    if delimiters.is_string:
        literal_value = delimiters.this
        escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value)
        return _literal_sql_with_ws_chr(self, escaped_literal)

    escaped_sql = delimiters_sql
    for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items():
        escaped_sql = self.func(
            "REPLACE",
            escaped_sql,
            self.sql(exp.Literal.string(raw)),
            self.sql(exp.Literal.string(escaped)),
        )

    return escaped_sql


def _build_capitalization_sql(
    self: DuckDBGenerator,
    value_to_split: str,
    delimiters_sql: str,
) -> str:
    # Empty string delimiter --> treat the value as one word, no need to split
    if delimiters_sql == "''":
        return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))"

    delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')"
    split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')"

    # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments.
    # We do not know whether the first segment is a delimiter or not, so we check the first character of the string
    # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd.
    return self.func(
        "ARRAY_TO_STRING",
        exp.case()
        .when(
            f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})",
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        )
        .else_(
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        ),
        "''",
    )


def _initcap_sql(self: DuckDBGenerator, expression: exp.Initcap) -> str:
    this_sql = self.sql(expression, "this")
    delimiters = expression.args.get("expression")
    if delimiters is None:
        # Fallback for a manually created exp.Initcap without a delimiters arg
        delimiters = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS)
    delimiters_sql = self.sql(delimiters)

    escaped_delimiters_sql = _escape_regex_metachars(self, delimiters, delimiters_sql)

    return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql)
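
# Illustrative sketch (assuming '-' is among the delimiter characters): INITCAP('john-doe') is
# rendered as a REGEXP_EXTRACT_ALL + LIST_TRANSFORM pipeline that uppercases the first letter
# and lowercases the rest of every non-delimiter segment, yielding 'John-Doe'.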

def _boolxor_agg_sql(self: DuckDBGenerator, expression: exp.BoolxorAgg) -> str:
    """
    Snowflake's `BOOLXOR_AGG(col)` returns TRUE if exactly one input in `col` is TRUE, FALSE otherwise.
    Since DuckDB does not have an equivalent function, we mimic the behavior by generating `COUNT_IF(col) = 1`.

    DuckDB's COUNT_IF strictly requires boolean inputs, so we cast if not already boolean.
    """
    return self.sql(
        exp.EQ(
            this=exp.CountIf(this=_cast_to_boolean(expression.this)),
            expression=exp.Literal.number(1),
        )
    )


def _bitshift_sql(
    self: DuckDBGenerator, expression: exp.BitwiseLeftShift | exp.BitwiseRightShift
) -> str:
    """
    Transform bitshift expressions for DuckDB by injecting BIT/INT128 casts.

    DuckDB's bitwise shift operators don't work with BLOB/BINARY types, so we cast
    them to BIT for the operation, then cast the result back to the original type.

    Note: Assumes type annotation has been applied with the source dialect.
    """
    operator = "<<" if isinstance(expression, exp.BitwiseLeftShift) else ">>"
    result_is_blob = False
    this = expression.this

    if _is_binary(this):
        result_is_blob = True
        expression.set("this", exp.cast(this, exp.DType.BIT))
    elif expression.args.get("requires_int128"):
        this.replace(exp.cast(this, exp.DType.INT128))

    result_sql = self.binary(expression, operator)

    # Wrap in parentheses if the parent is a bitwise operator to "fix" a DuckDB precedence issue:
    # DuckDB parses a << b | c << d as (a << b | c) << d
    if isinstance(expression.parent, exp.Binary):
        result_sql = self.sql(exp.Paren(this=result_sql))

    if result_is_blob:
        result_sql = self.sql(
            exp.Cast(this=result_sql, to=exp.DataType.from_str("BLOB", dialect="duckdb"))
        )

    return result_sql


def _scale_rounding_sql(
    self: DuckDBGenerator,
    expression: exp.Expr,
    rounding_func: t.Type[exp.Expr],
) -> str | None:
    """
    Handle the scale parameter transformation for rounding functions.

    DuckDB doesn't support the scale parameter for certain functions (e.g., FLOOR, CEIL),
    so we transform FUNC(x, n) to ROUND(FUNC(x * 10^n) / 10^n, n).

    Args:
        self: The DuckDB generator instance
        expression: The expression to transform (must have 'this', 'decimals', and 'to' args)
        rounding_func: The rounding function class to use in the transformation

    Returns:
        The transformed SQL string if the decimals parameter exists, None otherwise
    """
    decimals = expression.args.get("decimals")

    if decimals is None or expression.args.get("to") is not None:
        return None

    this = expression.this
    if isinstance(this, exp.Binary):
        this = exp.Paren(this=this)

    n_int = decimals
    if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
        n_int = exp.cast(decimals, exp.DType.INT)

    pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
    rounded = rounding_func(this=exp.Mul(this=this, expression=pow_))
    result = exp.Div(this=rounded, expression=pow_.copy())

    return self.round_sql(
        exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
    )


def _ceil_floor(self: DuckDBGenerator, expression: exp.Floor | exp.Ceil) -> str:
    scaled_sql = _scale_rounding_sql(self, expression, type(expression))
    if scaled_sql is not None:
        return scaled_sql
    return self.ceil_floor(expression)
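
# Illustrative sketch (column `x` is hypothetical; the exact power syntax depends on how
# exp.Pow is rendered): FLOOR(x, 2) becomes roughly
#   ROUND(FLOOR(x * POW(10, 2)) / POW(10, 2), 2)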

def _regr_val_sql(
    self: DuckDBGenerator,
    expression: exp.RegrValx | exp.RegrValy,
) -> str:
    """
    Transpile Snowflake's REGR_VALX/REGR_VALY to a DuckDB equivalent.

    REGR_VALX(y, x) returns NULL if y is NULL; otherwise it returns x.
    REGR_VALY(y, x) returns NULL if x is NULL; otherwise it returns y.
    """
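# (See the illustrative sketch after the function body below.)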
    from sqlglot.optimizer.annotate_types import annotate_types

    y = expression.this
    x = expression.expression

    # Determine which argument to check for NULL and which to return based on the expression type
    if isinstance(expression, exp.RegrValx):
        # REGR_VALX: check y for NULL, return x
        check_for_null = y
        return_value = x
        return_value_attr = "expression"
    else:
        # REGR_VALY: check x for NULL, return y
        check_for_null = x
        return_value = y
        return_value_attr = "this"

    # Get the type from the return argument
    result_type = return_value.type

    # If there's no type info, annotate the expression to infer types
    if not result_type or result_type.this == exp.DType.UNKNOWN:
        try:
            annotated = annotate_types(expression.copy(), dialect=self.dialect)
            result_type = getattr(annotated, return_value_attr).type
        except Exception:
            pass

    # Default to DOUBLE for regression functions if the type is still unknown
    if not result_type or result_type.this == exp.DType.UNKNOWN:
        result_type = exp.DType.DOUBLE.into_expr()

    # Cast NULL to the same type as return_value to avoid DuckDB type inference issues
    typed_null = exp.Cast(this=exp.Null(), to=result_type)

    return self.sql(
        exp.If(
            this=exp.Is(this=check_for_null.copy(), expression=exp.Null()),
            true=typed_null,
            false=return_value.copy(),
        )
    )
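
# Illustrative sketch (columns `y` and `x` of type DOUBLE are hypothetical):
#   REGR_VALX(y, x) -> CASE WHEN y IS NULL THEN CAST(NULL AS DOUBLE) ELSE x END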

def _maybe_corr_null_to_false(
    expression: exp.Filter | exp.Window | exp.Corr,
) -> exp.Filter | exp.Window | exp.Corr | None:
    corr = expression
    while isinstance(corr, (exp.Window, exp.Filter)):
        corr = corr.this

    if not isinstance(corr, exp.Corr) or not corr.args.get("null_on_zero_variance"):
        return None

    corr.set("null_on_zero_variance", False)
    return expression


def _date_from_parts_sql(self: DuckDBGenerator, expression: exp.DateFromParts) -> str:
    """
    Snowflake's DATE_FROM_PARTS allows out-of-range values for the month and day inputs,
    e.g. larger values (month=13, day=100), zero values (month=0, day=0) and negative
    values (month=-13, day=-100).

    DuckDB's MAKE_DATE does not support out-of-range values, but DuckDB's INTERVAL type does.

    We convert to date arithmetic:
        DATE_FROM_PARTS(year, month, day)
        -> MAKE_DATE(year, 1, 1) + INTERVAL (month-1) MONTH + INTERVAL (day-1) DAY
    """
    year_expr = expression.args.get("year")
    month_expr = expression.args.get("month")
    day_expr = expression.args.get("day")

    if expression.args.get("allow_overflow"):
        base_date: exp.Expr = exp.func(
            "MAKE_DATE", year_expr, exp.Literal.number(1), exp.Literal.number(1)
        )

        if month_expr:
            base_date = base_date + exp.Interval(this=month_expr - 1, unit=exp.var("MONTH"))

        if day_expr:
            base_date = base_date + exp.Interval(this=day_expr - 1, unit=exp.var("DAY"))

        return self.sql(exp.cast(expression=base_date, to=exp.DType.DATE))

    return self.func("MAKE_DATE", year_expr, month_expr, day_expr)


def _round_arg(arg: exp.Expr, round_input: bool | None = None) -> exp.Expr:
    if round_input:
        return exp.func("ROUND", arg, exp.Literal.number(0))
    return arg


def _boolnot_sql(self: DuckDBGenerator, expression: exp.Boolnot) -> str:
    arg = _round_arg(expression.this, expression.args.get("round_input"))
    return self.sql(exp.not_(exp.paren(arg)))


def _booland_sql(self: DuckDBGenerator, expression: exp.Booland) -> str:
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(exp.paren(exp.and_(exp.paren(left), exp.paren(right), wrap=False)))


def _boolor_sql(self: DuckDBGenerator, expression: exp.Boolor) -> str:
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(exp.paren(exp.or_(exp.paren(left), exp.paren(right), wrap=False)))


def _xor_sql(self: DuckDBGenerator, expression: exp.Xor) -> str:
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(
        exp.or_(
            exp.paren(exp.and_(left.copy(), exp.paren(right.not_()), wrap=False)),
            exp.paren(exp.and_(exp.paren(left.not_()), right.copy(), wrap=False)),
            wrap=False,
        )
    )


def _explode_to_unnest_sql(self: DuckDBGenerator, expression: exp.Lateral) -> str:
    """Handle LATERAL VIEW EXPLODE/INLINE conversion to UNNEST for DuckDB."""
    explode = expression.this

    if isinstance(explode, exp.Inline):
        # For INLINE, create CROSS JOIN LATERAL (SELECT UNNEST(..., max_depth => 2))
        # Build the UNNEST call with a DuckDB-style named parameter
        unnest_expr = exp.Unnest(
            expressions=[
                explode.this,
                exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)),
            ]
        )
        select_expr = exp.Select(expressions=[unnest_expr]).subquery()

        alias_expr = expression.args.get("alias")
        if alias_expr and not alias_expr.this:
            # We need to provide a table name if not present
            alias_expr.set("this", exp.to_identifier(f"_u_{expression.index}"))

        transformed_lateral_expr = exp.Lateral(this=select_expr, alias=alias_expr)
        cross_join_lateral_expr = exp.Join(this=transformed_lateral_expr, kind="CROSS")

        return self.sql(cross_join_lateral_expr)

    # For other cases, use the standard conversion
    return explode_to_unnest_sql(self, expression)


def _sha_sql(
    self: DuckDBGenerator,
    expression: exp.Expr,
    hash_func: str,
    is_binary: bool = False,
) -> str:
    arg = expression.this

    # For SHA2 variants, check the digest length (DuckDB only supports SHA256)
    if hash_func == "SHA256":
        length = expression.text("length") or "256"
        if length != "256":
            self.unsupported("DuckDB only supports SHA256 hashing algorithm.")

    # Cast if the type is incompatible with DuckDB
    if (
        arg.type
        and arg.type.this != exp.DType.UNKNOWN
        and not arg.is_type(*exp.DataType.TEXT_TYPES)
        and not _is_binary(arg)
    ):
        arg = exp.cast(arg, exp.DType.VARCHAR)

    result = self.func(hash_func, arg)
    return self.func("UNHEX", result) if is_binary else result
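
# Illustrative sketches (column `col` is hypothetical):
#   DATE_FROM_PARTS(2024, 13, 1) with overflow allowed ->
#     CAST(MAKE_DATE(2024, 1, 1) + INTERVAL (13 - 1) MONTH + INTERVAL (1 - 1) DAY AS DATE)  -- 2025-01-01
#   SHA2(col, 256) -> SHA256(col); the binary digest variants wrap it, e.g. UNHEX(SHA256(col))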

class DuckDBGenerator(generator.Generator):
    PARAMETER_TOKEN = "$"
    NAMED_PLACEHOLDER_TOKEN = "$"
    JOIN_HINTS = False
    TABLE_HINTS = False
    QUERY_HINTS = False
    LIMIT_FETCH = "LIMIT"
    STRUCT_DELIMITER = ("(", ")")
    RENAME_TABLE_WITH_DB = False
    NVL2_SUPPORTED = False
    SEMI_ANTI_JOIN_WITH_SIDE = False
    TABLESAMPLE_KEYWORDS = "USING SAMPLE"
    TABLESAMPLE_SEED_KEYWORD = "REPEATABLE"
    LAST_DAY_SUPPORTS_DATE_PART = False
    JSON_KEY_VALUE_PAIR_SEP = ","
    IGNORE_NULLS_IN_FUNC = True
    IGNORE_NULLS_BEFORE_ORDER = False
    JSON_PATH_BRACKETED_KEY_SUPPORTED = False
    SUPPORTS_CREATE_TABLE_LIKE = False
    MULTI_ARG_DISTINCT = False
    CAN_IMPLEMENT_ARRAY_ANY = True
    SUPPORTS_TO_NUMBER = False
    SELECT_KINDS: tuple[str, ...] = ()
    SUPPORTS_DECODE_CASE = False
    SUPPORTS_DROP_ALTER_ICEBERG_PROPERTY = False

    AFTER_HAVING_MODIFIER_TRANSFORMS = generator.AFTER_HAVING_MODIFIER_TRANSFORMS
    SUPPORTS_WINDOW_EXCLUDE = True
    COPY_HAS_INTO_KEYWORD = False
    STAR_EXCEPT = "EXCLUDE"
    PAD_FILL_PATTERN_IS_REQUIRED = True
    ARRAY_SIZE_DIM_REQUIRED: bool | None = False
    NORMALIZE_EXTRACT_DATE_PARTS = True
    SUPPORTS_LIKE_QUANTIFIERS = False
    SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True

    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.AnyValue: _anyvalue_sql,
        exp.ApproxDistinct: approx_count_distinct_sql,
        exp.Boolnot: _boolnot_sql,
        exp.Booland: _booland_sql,
        exp.Boolor: _boolor_sql,
        exp.Array: transforms.preprocess(
            [transforms.inherit_struct_field_names],
            generator=inline_array_unless_query,
        ),
        exp.ArrayAppend: array_append_sql("LIST_APPEND"),
        exp.ArrayCompact: array_compact_sql,
        exp.ArrayConstructCompact: lambda self, e: self.sql(
            exp.ArrayCompact(this=exp.Array(expressions=e.expressions))
        ),
        exp.ArrayConcat: array_concat_sql("LIST_CONCAT"),
        exp.ArrayContains: _array_contains_sql,
        exp.ArrayOverlaps: _array_overlaps_sql,
        exp.ArrayFilter: rename_func("LIST_FILTER"),
        exp.ArrayInsert: _array_insert_sql,
        exp.ArrayPosition: lambda self, e: (
            self.sql(
                exp.Sub(
                    this=exp.ArrayPosition(this=e.this, expression=e.expression),
                    expression=exp.Literal.number(1),
                )
            )
            if e.args.get("zero_based")
            else self.func("ARRAY_POSITION", e.this, e.expression)
        ),
        exp.ArrayRemoveAt: _array_remove_at_sql,
        exp.ArrayRemove: remove_from_array_using_filter,
        exp.ArraySort: _array_sort_sql,
        exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True),
        exp.ArraySum: rename_func("LIST_SUM"),
        exp.ArrayMax: rename_func("LIST_MAX"),
        exp.ArrayMin: rename_func("LIST_MIN"),
        exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False),
        exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True),
        exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"),
        exp.BitwiseAndAgg: _bitwise_agg_sql,
        exp.BitwiseCount: rename_func("BIT_COUNT"),
        exp.BitwiseLeftShift: _bitshift_sql,
        exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"),
        exp.BitwiseOrAgg: _bitwise_agg_sql,
        exp.BitwiseRightShift: _bitshift_sql,
        exp.BitwiseXorAgg: _bitwise_agg_sql,
        exp.CommentColumnConstraint: no_comment_column_constraint_sql,
        exp.Corr: lambda self, e: self._corr_sql(e),
        exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"),
        exp.CurrentTime: lambda *_: "CURRENT_TIME",
        exp.CurrentSchemas: lambda self, e: self.func(
            "current_schemas", e.this if e.this else exp.true()
        ),
        exp.CurrentTimestamp: lambda self, e: (
            self.sql(
                exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC"))
            )
            if e.args.get("sysdate")
            else "CURRENT_TIMESTAMP"
        ),
        exp.CurrentVersion: rename_func("version"),
        exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"),
        exp.DayOfMonth: rename_func("DAYOFMONTH"),
        exp.DayOfWeek: rename_func("DAYOFWEEK"),
        exp.DayOfWeekIso: rename_func("ISODOW"),
        exp.DayOfYear: rename_func("DAYOFYEAR"),
        exp.Dayname: lambda self, e: (
            self.func("STRFTIME", e.this, exp.Literal.string("%a"))
            if e.args.get("abbreviated")
            else self.func("DAYNAME", e.this)
        ),
        exp.Monthname: lambda self, e: (
            self.func("STRFTIME", e.this, exp.Literal.string("%b"))
            if e.args.get("abbreviated")
            else self.func("MONTHNAME", e.this)
        ),
        exp.DataType: _datatype_sql,
        exp.Date: _date_sql,
        exp.DateAdd: _date_delta_to_binary_interval_op(),
        exp.DateFromParts: _date_from_parts_sql,
        exp.DateSub: _date_delta_to_binary_interval_op(),
        exp.DateDiff: _date_diff_sql,
        exp.DateStrToDate: datestrtodate_sql,
        exp.Datetime: no_datetime_sql,
        exp.DatetimeDiff: _date_diff_sql,
        exp.DatetimeSub: _date_delta_to_binary_interval_op(),
        exp.DatetimeAdd: _date_delta_to_binary_interval_op(),
        exp.DateToDi: lambda self, e: (
            f"CAST(STRFTIME({self.sql(e, 'this')}, {self.dialect.DATEINT_FORMAT}) AS INT)"
        ),
        exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False),
        exp.HexDecodeString: lambda self, e: self.sql(exp.Decode(this=exp.Unhex(this=e.this))),
        exp.DiToDate: lambda self, e: (
            f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {self.dialect.DATEINT_FORMAT}) AS DATE)"
        ),
        exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False),
        exp.EqualNull: lambda self, e: self.sql(
            exp.NullSafeEQ(this=e.this, expression=e.expression)
        ),
        exp.EuclideanDistance: rename_func("LIST_DISTANCE"),
        exp.GenerateDateArray: _generate_datetime_array_sql,
        exp.GenerateSeries: generate_series_sql("GENERATE_SERIES", "RANGE"),
        exp.GenerateTimestampArray: _generate_datetime_array_sql,
        exp.Getbit: getbit_sql,
        exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False),
        exp.Explode: rename_func("UNNEST"),
        exp.IcebergProperty: lambda *_: "",
        exp.IntDiv: lambda self, e: self.binary(e, "//"),
        exp.IsInf: rename_func("ISINF"),
exp.IsNan: rename_func("ISNAN"), 1603 exp.IsNullValue: lambda self, e: self.sql( 1604 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1605 ), 1606 exp.IsArray: lambda self, e: self.sql( 1607 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1608 ), 1609 exp.Ceil: _ceil_floor, 1610 exp.Floor: _ceil_floor, 1611 exp.JSONBExists: rename_func("JSON_EXISTS"), 1612 exp.JSONExtract: _arrow_json_extract_sql, 1613 exp.JSONExtractArray: _json_extract_value_array_sql, 1614 exp.JSONFormat: _json_format_sql, 1615 exp.JSONValueArray: _json_extract_value_array_sql, 1616 exp.Lateral: _explode_to_unnest_sql, 1617 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1618 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1619 exp.Select: transforms.preprocess([_seq_to_range_in_generator]), 1620 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1621 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1622 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1623 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1624 exp.BoolxorAgg: _boolxor_agg_sql, 1625 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1626 exp.Initcap: _initcap_sql, 1627 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1628 exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"), 1629 exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True), 1630 exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"), 1631 exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True), 1632 exp.MonthsBetween: months_between_sql, 1633 exp.NextDay: _day_navigation_sql, 1634 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1635 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1636 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 
1637 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1638 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1639 exp.PreviousDay: _day_navigation_sql, 1640 exp.RegexpILike: lambda self, e: self.func( 1641 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1642 ), 1643 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1644 exp.RegrValx: _regr_val_sql, 1645 exp.RegrValy: _regr_val_sql, 1646 exp.Return: lambda self, e: self.sql(e, "this"), 1647 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1648 exp.StrToUnix: lambda self, e: self.func( 1649 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1650 ), 1651 exp.Struct: _struct_sql, 1652 exp.Transform: rename_func("LIST_TRANSFORM"), 1653 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1654 exp.TimeSub: _date_delta_to_binary_interval_op(), 1655 exp.Time: no_time_sql, 1656 exp.TimeDiff: _timediff_sql, 1657 exp.Timestamp: no_timestamp_sql, 1658 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1659 exp.TimestampDiff: lambda self, e: self.func( 1660 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1661 ), 1662 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1663 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DType.DATE)), 1664 exp.TimeStrToTime: timestrtotime_sql, 1665 exp.TimeStrToUnix: lambda self, e: self.func( 1666 "EPOCH", exp.cast(e.this, exp.DType.TIMESTAMP) 1667 ), 1668 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1669 exp.ToBoolean: _to_boolean_sql, 1670 exp.ToVariant: lambda self, e: self.sql( 1671 exp.cast(e.this, exp.DataType.from_str("VARIANT", dialect="duckdb")) 1672 ), 1673 exp.TimeToUnix: rename_func("EPOCH"), 1674 exp.TsOrDiToDi: lambda self, e: ( 1675 f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)" 1676 ), 1677 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1678 exp.TsOrDsDiff: lambda self, e: self.func( 1679 "DATE_DIFF", 1680 f"'{e.args.get('unit') or 'DAY'}'", 1681 exp.cast(e.expression, exp.DType.TIMESTAMP), 1682 exp.cast(e.this, exp.DType.TIMESTAMP), 1683 ), 1684 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1685 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1686 exp.UnixSeconds: lambda self, e: self.sql( 1687 exp.cast(self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DType.BIGINT) 1688 ), 1689 exp.UnixToStr: lambda self, e: self.func( 1690 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 1691 ), 1692 exp.DatetimeTrunc: lambda self, e: self.func( 1693 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DType.DATETIME) 1694 ), 1695 exp.UnixToTime: _unix_to_time_sql, 1696 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 1697 exp.VariancePop: rename_func("VAR_POP"), 1698 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 1699 exp.YearOfWeek: lambda self, e: self.sql( 1700 exp.Extract( 1701 this=exp.Var(this="ISOYEAR"), 1702 expression=e.this, 1703 ) 1704 ), 1705 exp.YearOfWeekIso: lambda self, e: self.sql( 1706 exp.Extract( 1707 this=exp.Var(this="ISOYEAR"), 1708 expression=e.this, 1709 ) 1710 ), 1711 exp.Xor: _xor_sql, 1712 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1713 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 1714 exp.DateBin: rename_func("TIME_BUCKET"), 1715 exp.LastDay: 
_last_day_sql, 1716 } 1717 1718 SUPPORTED_JSON_PATH_PARTS = { 1719 exp.JSONPathKey, 1720 exp.JSONPathRoot, 1721 exp.JSONPathSubscript, 1722 exp.JSONPathWildcard, 1723 } 1724 1725 TYPE_MAPPING = { 1726 **generator.Generator.TYPE_MAPPING, 1727 exp.DType.BINARY: "BLOB", 1728 exp.DType.BPCHAR: "TEXT", 1729 exp.DType.CHAR: "TEXT", 1730 exp.DType.DATETIME: "TIMESTAMP", 1731 exp.DType.DECFLOAT: "DECIMAL", 1732 exp.DType.FLOAT: "REAL", 1733 exp.DType.JSONB: "JSON", 1734 exp.DType.NCHAR: "TEXT", 1735 exp.DType.NVARCHAR: "TEXT", 1736 exp.DType.UINT: "UINTEGER", 1737 exp.DType.VARBINARY: "BLOB", 1738 exp.DType.ROWVERSION: "BLOB", 1739 exp.DType.VARCHAR: "TEXT", 1740 exp.DType.TIMESTAMPLTZ: "TIMESTAMPTZ", 1741 exp.DType.TIMESTAMPNTZ: "TIMESTAMP", 1742 exp.DType.TIMESTAMP_S: "TIMESTAMP_S", 1743 exp.DType.TIMESTAMP_MS: "TIMESTAMP_MS", 1744 exp.DType.TIMESTAMP_NS: "TIMESTAMP_NS", 1745 exp.DType.BIGDECIMAL: "DECIMAL", 1746 } 1747 1748 TYPE_PARAM_SETTINGS = { 1749 **generator.Generator.TYPE_PARAM_SETTINGS, 1750 exp.DType.BIGDECIMAL: ((38, 5), (38, 38)), 1751 exp.DType.DECFLOAT: ((38, 5), (38, 38)), 1752 } 1753 1754 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 1755 RESERVED_KEYWORDS = { 1756 "array", 1757 "analyse", 1758 "union", 1759 "all", 1760 "when", 1761 "in_p", 1762 "default", 1763 "create_p", 1764 "window", 1765 "asymmetric", 1766 "to", 1767 "else", 1768 "localtime", 1769 "from", 1770 "end_p", 1771 "select", 1772 "current_date", 1773 "foreign", 1774 "with", 1775 "grant", 1776 "session_user", 1777 "or", 1778 "except", 1779 "references", 1780 "fetch", 1781 "limit", 1782 "group_p", 1783 "leading", 1784 "into", 1785 "collate", 1786 "offset", 1787 "do", 1788 "then", 1789 "localtimestamp", 1790 "check_p", 1791 "lateral_p", 1792 "current_role", 1793 "where", 1794 "asc_p", 1795 "placing", 1796 "desc_p", 1797 "user", 1798 "unique", 1799 "initially", 1800 "column", 1801 "both", 1802 "some", 1803 "as", 1804 "any", 1805 "only", 1806 "deferrable", 1807 "null_p", 1808 "current_time", 1809 "true_p", 1810 "table", 1811 "case", 1812 "trailing", 1813 "variadic", 1814 "for", 1815 "on", 1816 "distinct", 1817 "false_p", 1818 "not", 1819 "constraint", 1820 "current_timestamp", 1821 "returning", 1822 "primary", 1823 "intersect", 1824 "having", 1825 "analyze", 1826 "current_user", 1827 "and", 1828 "cast", 1829 "symmetric", 1830 "using", 1831 "order", 1832 "current_catalog", 1833 } 1834 1835 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 1836 1837 # DuckDB doesn't generally support CREATE TABLE .. properties 1838 # https://duckdb.org/docs/sql/statements/create_table.html 1839 # There are a few exceptions (e.g. 
temporary tables) which are supported or 1840 # can be transpiled to DuckDB, so we explicitly override them accordingly 1841 PROPERTIES_LOCATION = { 1842 **{ 1843 prop: exp.Properties.Location.UNSUPPORTED 1844 for prop in generator.Generator.PROPERTIES_LOCATION 1845 }, 1846 exp.LikeProperty: exp.Properties.Location.POST_SCHEMA, 1847 exp.TemporaryProperty: exp.Properties.Location.POST_CREATE, 1848 exp.ReturnsProperty: exp.Properties.Location.POST_ALIAS, 1849 exp.SequenceProperties: exp.Properties.Location.POST_EXPRESSION, 1850 exp.IcebergProperty: exp.Properties.Location.POST_CREATE, 1851 } 1852 1853 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS: t.ClassVar = _IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS 1854 1855 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 1856 ZIPF_TEMPLATE: exp.Expr = exp.maybe_parse( 1857 """ 1858 WITH rand AS (SELECT :random_expr AS r), 1859 weights AS ( 1860 SELECT i, 1.0 / POWER(i, :s) AS w 1861 FROM RANGE(1, :n + 1) AS t(i) 1862 ), 1863 cdf AS ( 1864 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 1865 FROM weights 1866 ) 1867 SELECT MIN(i) 1868 FROM cdf 1869 WHERE p >= (SELECT r FROM rand) 1870 """ 1871 ) 1872 1873 # Template for NORMAL transpilation using Box-Muller transform 1874 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 1875 NORMAL_TEMPLATE: exp.Expr = exp.maybe_parse( 1876 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 1877 ) 1878 1879 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 1880 SEEDED_RANDOM_TEMPLATE: exp.Expr = exp.maybe_parse("(ABS(HASH(:seed)) % 1000000) / 1000000.0") 1881 1882 # Template for generating signed and unsigned SEQ values within a specified range 1883 SEQ_UNSIGNED: exp.Expr = _SEQ_UNSIGNED 1884 SEQ_SIGNED: exp.Expr = _SEQ_SIGNED 1885 1886 # Template for MAP_CAT transpilation - Snowflake semantics: 1887 # 1. Returns NULL if either input is NULL 1888 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 1889 # 3. Filters out entries with NULL values from the result 1890 MAPCAT_TEMPLATE: exp.Expr = exp.maybe_parse( 1891 """ 1892 CASE 1893 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 1894 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 1895 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 1896 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 1897 ), __x -> __x.value IS NOT NULL)) 1898 END 1899 """ 1900 ) 1901 1902 # Mappings for EXTRACT/DATE_PART transpilation 1903 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 1904 EXTRACT_STRFTIME_MAPPINGS: dict[str, tuple[str, str]] = { 1905 "WEEKISO": ("%V", "INTEGER"), 1906 "YEAROFWEEK": ("%G", "INTEGER"), 1907 "YEAROFWEEKISO": ("%G", "INTEGER"), 1908 "NANOSECOND": ("%n", "BIGINT"), 1909 } 1910 1911 # Maps epoch-based specifiers to DuckDB epoch functions 1912 EXTRACT_EPOCH_MAPPINGS: dict[str, str] = { 1913 "EPOCH_SECOND": "EPOCH", 1914 "EPOCH_MILLISECOND": "EPOCH_MS", 1915 "EPOCH_MICROSECOND": "EPOCH_US", 1916 "EPOCH_NANOSECOND": "EPOCH_NS", 1917 } 1918 1919 # Template for BITMAP_CONSTRUCT_AGG transpilation 1920 # 1921 # BACKGROUND: 1922 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 
    # Supports values in the range 0-32767; this version returns NULL if any value is out of range.
    # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg
    # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts
    #
    # Snowflake uses two different formats based on the number of unique values:
    #
    # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes
    #   Bytes 0-1: Count of values as a 2-byte big-endian integer (e.g., 3 values = 0x0003)
    #   Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes
    #   Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex)
    #                                count  v1   v2   v3   pad
    #
    # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes
    #   Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes
    #   Bytes 10+: Each value as a 2-byte little-endian integer (no padding)
    #   Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500
    #                                  hdr  -----9 zero bytes---- v1   v2   v3   v4   v5
    #
    # TEMPLATE STRUCTURE
    #
    # Phase 1 - Innermost subquery: Data preparation
    #   SELECT LIST_SORT(...) AS l
    #   - Aggregates all input values into a list, removes NULLs and duplicates, and sorts
    #   Result: Clean, sorted list of unique non-null integers stored as 'l'
    #
    # Phase 2 - Middle subquery: Hex string construction
    #   LIST_TRANSFORM(...)
    #   - Converts each integer to its 2-byte little-endian hex representation
    #   - & 255 extracts the low byte, >> 8 extracts the high byte
    #   - LIST_REDUCE: Concatenates all hex pairs into a single string 'h'
    #   Result: Hex string of all values
    #
    # Phase 3 - Outer SELECT: Final bitmap assembly
    #   LENGTH(l) < 5:
    #   - Small format: 2-byte count (big-endian via %04X) + values + zero padding
    #   LENGTH(l) >= 5:
    #   - Large format: Fixed 10-byte header + values (no padding needed)
    #   Result: Complete binary bitmap as BLOB
    #
    BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        SELECT CASE
            WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL
            WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL
            WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2))
            ELSE UNHEX('08000000000000000000' || h)
        END
        FROM (
            SELECT l, COALESCE(LIST_REDUCE(
                LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)),
                (__a, __b) -> __a || __b, ''
            ), '') AS h
            FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l)
        )
        """
    )

    # Template for RANDSTR transpilation - placeholders get replaced with actual parameters
    RANDSTR_TEMPLATE: exp.Expr = exp.maybe_parse(
        f"""
        SELECT LISTAGG(
            SUBSTRING(
                '{RANDSTR_CHAR_POOL}',
                1 + CAST(FLOOR(random_value * 62) AS INT),
                1
            ),
            ''
        )
        FROM (
            SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value
            FROM RANGE(:length) AS t(i)
        )
        """,
    )

    # Template for MINHASH transpilation
    # Computes k minimum hash values across aggregated data using DuckDB list functions
    # Returns JSON matching the Snowflake format: {"state": [...], "type": "minhash", "version": 1}
    MINHASH_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1)
2004 FROM ( 2005 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 2006 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2007 ) 2008 """, 2009 ) 2010 2011 # Template for MINHASH_COMBINE transpilation 2012 # Combines multiple minhash signatures by taking element-wise minimum 2013 MINHASH_COMBINE_TEMPLATE: exp.Expr = exp.maybe_parse( 2014 """ 2015 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2016 FROM ( 2017 SELECT 2018 pos AS idx, 2019 MIN(val) AS min_h 2020 FROM 2021 UNNEST(LIST(:expr)) AS _(sig), 2022 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2023 GROUP BY pos 2024 ) 2025 """, 2026 ) 2027 2028 # Template for APPROXIMATE_SIMILARITY transpilation 2029 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2030 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expr = exp.maybe_parse( 2031 """ 2032 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2033 FROM ( 2034 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2035 FROM ( 2036 SELECT h, pos 2037 FROM UNNEST(LIST(:expr)) AS _(sig), 2038 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2039 ) 2040 GROUP BY pos 2041 ) 2042 """, 2043 ) 2044 2045 # Template for ARRAYS_ZIP transpilation 2046 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest 2047 # Uses RANGE + indexing to match Snowflake behavior 2048 ARRAYS_ZIP_TEMPLATE: exp.Expr = exp.maybe_parse( 2049 """ 2050 CASE WHEN :null_check THEN NULL 2051 WHEN :all_empty_check THEN [:empty_struct] 2052 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2053 END 2054 """, 2055 ) 2056 2057 UUID_V5_TEMPLATE: exp.Expr = exp.maybe_parse( 2058 """ 2059 (SELECT 2060 LOWER( 2061 SUBSTR(h, 1, 8) || '-' || 2062 SUBSTR(h, 9, 4) || '-' || 2063 '5' || SUBSTR(h, 14, 3) || '-' || 2064 FORMAT('{:02x}', CAST('0x' || SUBSTR(h, 17, 2) AS INT) & 63 | 128) || SUBSTR(h, 19, 2) || '-' || 2065 SUBSTR(h, 21, 12) 2066 ) 2067 FROM ( 2068 SELECT SUBSTR(SHA1(UNHEX(REPLACE(:namespace, '-', '')) || ENCODE(:name, 'utf8')), 1, 32) AS h 2069 )) 2070 """ 2071 ) 2072 2073 # Shared bag semantics outer frame for ARRAY_EXCEPT and ARRAY_INTERSECTION. 2074 # Each element is paired with its 1-based position via LIST_ZIP, then filtered 2075 # by a comparison operator (supplied via :cond) that determines the operation: 2076 # EXCEPT (>): keep the N-th occurrence only if N > count in arr2 2077 # e.g. [2,2,2] EXCEPT [2,2] -> [2] 2078 # INTERSECTION (<=): keep the N-th occurrence only if N <= count in arr2 2079 # e.g. [2,2,2] INTERSECT [2,2] -> [2,2] 2080 # IS NOT DISTINCT FROM is used for NULL-safe element comparison. 2081 ARRAY_BAG_TEMPLATE: exp.Expr = exp.maybe_parse( 2082 """ 2083 CASE 2084 WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL 2085 ELSE LIST_TRANSFORM( 2086 LIST_FILTER( 2087 LIST_ZIP(:arr1, GENERATE_SERIES(1, LEN(:arr1))), 2088 pair -> :cond 2089 ), 2090 pair -> pair[0] 2091 ) 2092 END 2093 """ 2094 ) 2095 2096 ARRAY_EXCEPT_CONDITION: exp.Expr = exp.maybe_parse( 2097 "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))" 2098 " > LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))" 2099 ) 2100 2101 ARRAY_INTERSECTION_CONDITION: exp.Expr = exp.maybe_parse( 2102 "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))" 2103 " <= LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))" 2104 ) 2105 2106 # Set semantics for ARRAY_EXCEPT. 
Deduplicates arr1 via LIST_DISTINCT, then 2107 # filters out any element that appears at least once in arr2. 2108 # e.g. [1,1,2,3] EXCEPT [1] -> [2,3] 2109 # IS NOT DISTINCT FROM is used for NULL-safe element comparison. 2110 ARRAY_EXCEPT_SET_TEMPLATE: exp.Expr = exp.maybe_parse( 2111 """ 2112 CASE 2113 WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL 2114 ELSE LIST_FILTER( 2115 LIST_DISTINCT(:arr1), 2116 e -> LEN(LIST_FILTER(:arr2, x -> x IS NOT DISTINCT FROM e)) = 0 2117 ) 2118 END 2119 """ 2120 ) 2121 2122 STRTOK_TO_ARRAY_TEMPLATE: exp.Expr = exp.maybe_parse( 2123 """ 2124 CASE WHEN :delimiter IS NULL THEN NULL 2125 ELSE LIST_FILTER( 2126 REGEXP_SPLIT_TO_ARRAY(:string, CASE WHEN :delimiter = '' THEN '.^' ELSE CONCAT('[', :escaped, ']') END), 2127 x -> NOT x = '' 2128 ) END 2129 """ 2130 ) 2131 2132 # Template for STRTOK function transpilation 2133 # 2134 # DuckDB itself doesn't have a strtok function. This handles the transpilation from Snowflake to DuckDB. 2135 # We may need to adjust this if we want to support transpilation from other dialects 2136 # 2137 # CASE 2138 # -- Snowflake: empty delimiter + empty input string -> NULL 2139 # WHEN delimiter = '' AND input_str = '' THEN NULL 2140 # 2141 # -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return input string if index is 1 2142 # WHEN delimiter = '' AND index = 1 THEN input_str 2143 # 2144 # -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return NULL if index is not 1 2145 # WHEN delimiter = '' THEN NULL 2146 # 2147 # -- Snowflake: negative indices return NULL 2148 # WHEN index < 0 THEN NULL 2149 # 2150 # -- Snowflake: return NULL if any argument is NULL 2151 # WHEN input_str IS NULL OR delimiter IS NULL OR index IS NULL THEN NULL 2152 # 2153 # 2154 # ELSE LIST_FILTER( 2155 # REGEXP_SPLIT_TO_ARRAY( 2156 # input_str, 2157 # CASE 2158 # -- if delimiter is '', we don't want to surround it with '[' and ']' as '[]' is invalid for DuckDB 2159 # WHEN delimiter = '' THEN '' 2160 # 2161 # -- handle problematic regex characters in delimiter with REGEXP_REPLACE 2162 # -- turn delimiter into a regex char set, otherwise DuckDB will match in order, which we don't want 2163 # ELSE '[' || REGEXP_REPLACE(delimiter, problematic_char_set, '\\\1', 'g') || ']' 2164 # END 2165 # ), 2166 # 2167 # -- Snowflake: don't return empty strings 2168 # x -> NOT x = '' 2169 # )[index] 2170 # END 2171 STRTOK_TEMPLATE: exp.Expr = exp.maybe_parse( 2172 """ 2173 CASE 2174 WHEN :delimiter = '' AND :string = '' THEN NULL 2175 WHEN :delimiter = '' AND :part_index = 1 THEN :string 2176 WHEN :delimiter = '' THEN NULL 2177 WHEN :part_index < 0 THEN NULL 2178 WHEN :string IS NULL OR :delimiter IS NULL OR :part_index IS NULL THEN NULL 2179 ELSE :base_func 2180 END 2181 """ 2182 ) 2183 2184 def _array_bag_sql(self, condition: exp.Expr, arr1: exp.Expr, arr2: exp.Expr) -> str: 2185 cond = exp.Paren(this=exp.replace_placeholders(condition, arr1=arr1, arr2=arr2)) 2186 return self.sql( 2187 exp.replace_placeholders(self.ARRAY_BAG_TEMPLATE, arr1=arr1, arr2=arr2, cond=cond) 2188 ) 2189 2190 def timeslice_sql(self, expression: exp.TimeSlice) -> str: 2191 """ 2192 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2193 2194 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2195 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2196 2197 For 'END' kind, add the interval to get the end of the slice. 
For DATE type with 'END', cast the result back to DATE to preserve the type.
        """
        date_expr = expression.this
        slice_length = expression.expression
        unit = expression.unit
        kind = expression.text("kind").upper()

        # Create INTERVAL expression: INTERVAL 'N' UNIT
        interval_expr = exp.Interval(this=slice_length, unit=unit)

        # Create the base time_bucket expression
        time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr)

        # Check if we need the end of the slice (default is start)
        if kind != "END":
            # For 'START', return time_bucket directly
            return self.sql(time_bucket_expr)

        # For 'END', add the interval to get the end of the slice
        add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy())

        # If the input is DATE-typed, cast the result back to DATE to preserve the type;
        # DuckDB converts DATE to TIMESTAMP when adding intervals
        if date_expr.is_type(exp.DType.DATE):
            return self.sql(exp.cast(add_expr, exp.DType.DATE))

        return self.sql(add_expr)

    def bitmapbucketnumber_sql(self, expression: exp.BitmapBucketNumber) -> str:
        """
        Transpile the BITMAP_BUCKET_NUMBER function from Snowflake to its DuckDB equivalent.

        Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
        - Each bucket covers 32,768 values
        - Bucket numbering starts at 1
        - Formula: ((value - 1) // 32768) + 1 for positive values

        For non-positive values (0 and negative), we use value // 32768 to avoid
        producing bucket 0 or positive bucket IDs for negative inputs.
        """
        value = expression.this

        positive_formula = ((value - 1) // 32768) + 1
        non_positive_formula = value // 32768

        # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END
        case_expr = (
            exp.case()
            .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula)
            .else_(non_positive_formula)
        )
        return self.sql(case_expr)

    def bitmapbitposition_sql(self, expression: exp.BitmapBitPosition) -> str:
        """
        Transpile Snowflake's BITMAP_BIT_POSITION to a DuckDB CASE expression.

        Snowflake's BITMAP_BIT_POSITION behavior:
        - For n <= 0: returns ABS(n) % 32768
        - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
        """
        this = expression.this

        return self.sql(
            exp.Mod(
                this=exp.Paren(
                    this=exp.If(
                        this=exp.GT(this=this, expression=exp.Literal.number(0)),
                        true=this - exp.Literal.number(1),
                        false=exp.Abs(this=this),
                    )
                ),
                expression=MAX_BIT_POSITION,
            )
        )
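
    # A quick worked mapping for the two bitmap helpers above (illustrative values,
    # not taken from the source):
    #   BITMAP_BUCKET_NUMBER(1)     -> ((1 - 1) // 32768) + 1     = 1
    #   BITMAP_BUCKET_NUMBER(32769) -> ((32769 - 1) // 32768) + 1 = 2
    #   BITMAP_BIT_POSITION(1)      -> (1 - 1) % 32768            = 0
    #   BITMAP_BIT_POSITION(-5)     -> ABS(-5) % 32768            = 5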

    def bitmapconstructagg_sql(self, expression: exp.BitmapConstructAgg) -> str:
        """
        Transpile Snowflake's BITMAP_CONSTRUCT_AGG to its DuckDB equivalent.
        Uses a pre-parsed template with placeholders replaced by expression nodes.

        Snowflake bitmap format:
        - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
        - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
        """
        arg = expression.this
        return (
            f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
        )

    def compress_sql(self, expression: exp.Compress) -> str:
        self.unsupported("DuckDB does not support the COMPRESS() function")
        return self.function_fallback_sql(expression)

    def encrypt_sql(self, expression: exp.Encrypt) -> str:
        self.unsupported("ENCRYPT is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decrypt_sql(self, expression: exp.Decrypt) -> str:
        func_name = "TRY_DECRYPT" if expression.args.get("safe") else "DECRYPT"
        self.unsupported(f"{func_name} is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decryptraw_sql(self, expression: exp.DecryptRaw) -> str:
        func_name = "TRY_DECRYPT_RAW" if expression.args.get("safe") else "DECRYPT_RAW"
        self.unsupported(f"{func_name} is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def encryptraw_sql(self, expression: exp.EncryptRaw) -> str:
        self.unsupported("ENCRYPT_RAW is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def parseurl_sql(self, expression: exp.ParseUrl) -> str:
        self.unsupported("PARSE_URL is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def parseip_sql(self, expression: exp.ParseIp) -> str:
        self.unsupported("PARSE_IP is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decompressstring_sql(self, expression: exp.DecompressString) -> str:
        self.unsupported("DECOMPRESS_STRING is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decompressbinary_sql(self, expression: exp.DecompressBinary) -> str:
        self.unsupported("DECOMPRESS_BINARY is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str:
        this = expression.this
        expr = expression.expression

        if expression.args.get("case_insensitive"):
            this = exp.Upper(this=this)
            expr = exp.Upper(this=expr)

        result = exp.func("JARO_WINKLER_SIMILARITY", this, expr)

        if expression.args.get("integer_scale"):
            result = exp.cast(result * 100, "INTEGER")

        return self.sql(result)

    def nthvalue_sql(self, expression: exp.NthValue) -> str:
        from_first = expression.args.get("from_first", True)
        if not from_first:
            self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end")

        return self.function_fallback_sql(expression)

    def randstr_sql(self, expression: exp.Randstr) -> str:
        """
        Transpile Snowflake's RANDSTR to a DuckDB equivalent using deterministic, hash-based randomness.
        Uses a pre-parsed template with placeholders replaced by expression nodes.

        RANDSTR(length, generator) generates a random string of the specified length.
2354 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2355 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2356 - No generator: Use default seed value 2357 """ 2358 length = expression.this 2359 generator = expression.args.get("generator") 2360 2361 if generator: 2362 if isinstance(generator, exp.Rand): 2363 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2364 seed_value = generator.this or generator 2365 else: 2366 # Const/int or other expression - use as seed directly 2367 seed_value = generator 2368 else: 2369 # No generator specified, use default seed (arbitrary but deterministic) 2370 seed_value = exp.Literal.number(RANDSTR_SEED) 2371 2372 replacements = {"seed": seed_value, "length": length} 2373 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2374 2375 @unsupported_args("finish") 2376 def reduce_sql(self, expression: exp.Reduce) -> str: 2377 array_arg = expression.this 2378 initial_value = expression.args.get("initial") 2379 merge_lambda = expression.args.get("merge") 2380 2381 if merge_lambda: 2382 merge_lambda.set("colon", True) 2383 2384 return self.func("list_reduce", array_arg, merge_lambda, initial_value) 2385 2386 def zipf_sql(self, expression: exp.Zipf) -> str: 2387 """ 2388 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2389 Uses a pre-parsed template with placeholders replaced by expression nodes. 2390 """ 2391 s = expression.this 2392 n = expression.args["elementcount"] 2393 gen = expression.args["gen"] 2394 2395 if not isinstance(gen, exp.Rand): 2396 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2397 random_expr: exp.Expr = exp.Div( 2398 this=exp.Paren( 2399 this=exp.Mod( 2400 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2401 expression=exp.Literal.number(1000000), 2402 ) 2403 ), 2404 expression=exp.Literal.number(1000000.0), 2405 ) 2406 else: 2407 # Use RANDOM() for non-deterministic output 2408 random_expr = exp.Rand() 2409 2410 replacements = {"s": s, "n": n, "random_expr": random_expr} 2411 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2412 2413 def tobinary_sql(self, expression: exp.ToBinary) -> str: 2414 """ 2415 TO_BINARY and TRY_TO_BINARY transpilation: 2416 - 'HEX': TO_BINARY('48454C50', 'HEX') -> UNHEX('48454C50') 2417 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') -> ENCODE('TEST') 2418 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') -> FROM_BASE64('SEVMUA==') 2419 2420 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2421 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') -> TRY(UNHEX('invalid')) 2422 """ 2423 value = expression.this 2424 format_arg = expression.args.get("format") 2425 is_safe = expression.args.get("safe") 2426 is_binary = _is_binary(expression) 2427 2428 if not format_arg and not is_binary: 2429 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2430 return self.func(func_name, value) 2431 2432 # Snowflake defaults to HEX encoding when no format is specified 2433 fmt = format_arg.name.upper() if format_arg else "HEX" 2434 2435 if fmt in ("UTF-8", "UTF8"): 2436 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2437 result = self.func("ENCODE", value) 2438 elif fmt == "BASE64": 2439 result = self.func("FROM_BASE64", value) 2440 elif fmt == "HEX": 2441 result = self.func("UNHEX", value) 2442 else: 2443 if is_safe: 2444 return self.sql(exp.null()) 2445 else: 2446 self.unsupported(f"format {fmt} is not supported") 2447 result = 
self.func("TO_BINARY", value) 2448 return f"TRY({result})" if is_safe else result 2449 2450 def tonumber_sql(self, expression: exp.ToNumber) -> str: 2451 fmt = expression.args.get("format") 2452 precision = expression.args.get("precision") 2453 scale = expression.args.get("scale") 2454 2455 if not fmt and precision and scale: 2456 return self.sql( 2457 exp.cast( 2458 expression.this, f"DECIMAL({precision.name}, {scale.name})", dialect="duckdb" 2459 ) 2460 ) 2461 2462 return super().tonumber_sql(expression) 2463 2464 def _greatest_least_sql(self, expression: exp.Greatest | exp.Least) -> str: 2465 """ 2466 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 2467 2468 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2469 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2470 """ 2471 # Get all arguments 2472 all_args = [expression.this, *expression.expressions] 2473 fallback_sql = self.function_fallback_sql(expression) 2474 2475 if expression.args.get("ignore_nulls"): 2476 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2477 return self.sql(fallback_sql) 2478 2479 # return NULL if any argument is NULL 2480 case_expr = exp.case().when( 2481 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2482 exp.null(), 2483 copy=False, 2484 ) 2485 case_expr.set("default", fallback_sql) 2486 return self.sql(case_expr) 2487 2488 def generator_sql(self, expression: exp.Generator) -> str: 2489 # Transpile Snowflake GENERATOR to DuckDB range() 2490 rowcount = expression.args.get("rowcount") 2491 time_limit = expression.args.get("time_limit") 2492 2493 if time_limit: 2494 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2495 2496 if not rowcount: 2497 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2498 return self.func("range", exp.Literal.number(0)) 2499 2500 return self.func("range", rowcount) 2501 2502 def greatest_sql(self, expression: exp.Greatest) -> str: 2503 return self._greatest_least_sql(expression) 2504 2505 def least_sql(self, expression: exp.Least) -> str: 2506 return self._greatest_least_sql(expression) 2507 2508 def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str: 2509 if expression.args.get("colon"): 2510 prefix = "LAMBDA " 2511 arrow_sep = ":" 2512 wrap = False 2513 else: 2514 prefix = "" 2515 2516 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2517 return f"{prefix}{lambda_sql}" 2518 2519 def show_sql(self, expression: exp.Show) -> str: 2520 from_ = self.sql(expression, "from_") 2521 from_ = f" FROM {from_}" if from_ else "" 2522 return f"SHOW {expression.name}{from_}" 2523 2524 def soundex_sql(self, expression: exp.Soundex) -> str: 2525 self.unsupported("SOUNDEX is not supported in DuckDB") 2526 return self.func("SOUNDEX", expression.this) 2527 2528 def sortarray_sql(self, expression: exp.SortArray) -> str: 2529 arr = expression.this 2530 asc = expression.args.get("asc") 2531 nulls_first = expression.args.get("nulls_first") 2532 2533 if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean): 2534 return self.func("LIST_SORT", arr, asc, nulls_first) 2535 2536 nulls_are_first = nulls_first == exp.true() 2537 nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None 2538 2539 if not isinstance(asc, exp.Boolean): 2540 return self.func("LIST_SORT", arr, asc, nulls_first_sql) 2541 2542 descending = 
asc == exp.false()

        if not descending and not nulls_are_first:
            return self.func("LIST_SORT", arr)
        if not nulls_are_first:
            return self.func("ARRAY_REVERSE_SORT", arr)
        return self.func(
            "LIST_SORT",
            arr,
            exp.Literal.string("DESC" if descending else "ASC"),
            exp.Literal.string("NULLS FIRST"),
        )

    def install_sql(self, expression: exp.Install) -> str:
        force = "FORCE " if expression.args.get("force") else ""
        this = self.sql(expression, "this")
        from_clause = expression.args.get("from_")
        from_clause = f" FROM {from_clause}" if from_clause else ""
        return f"{force}INSTALL {this}{from_clause}"

    def approxtopk_sql(self, expression: exp.ApproxTopK) -> str:
        self.unsupported(
            "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types"
        )
        return self.function_fallback_sql(expression)

    def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str:
        return self.sql(exp.cast(expression.this, exp.DType.TIMESTAMPTZ))

    def strposition_sql(self, expression: exp.StrPosition) -> str:
        this = expression.this
        substr = expression.args.get("substr")
        position = expression.args.get("position")

        # For BINARY/BLOB: DuckDB's STRPOS doesn't support BLOB types.
        # Convert to HEX strings, use STRPOS, then convert the hex position to a byte position.
        if _is_binary(this):
            # Build expression: STRPOS(HEX(haystack), HEX(needle))
            hex_strpos = exp.StrPosition(
                this=exp.Hex(this=this),
                substr=exp.Hex(this=substr),
            )

            return self.sql(exp.cast((hex_strpos + 1) / 2, exp.DType.INT))

        # For VARCHAR: handle clamp_position
        if expression.args.get("clamp_position") and position:
            expression = expression.copy()
            expression.set(
                "position",
                exp.If(
                    this=exp.LTE(this=position, expression=exp.Literal.number(0)),
                    true=exp.Literal.number(1),
                    false=position.copy(),
                ),
            )

        return strposition_sql(self, expression)

    def substring_sql(self, expression: exp.Substring) -> str:
        if expression.args.get("zero_start"):
            # The walrus assignments below leave start/length as None when absent
            if start := expression.args.get("start"):
                start = exp.If(this=start.eq(0), true=exp.Literal.number(1), false=start)
            if length := expression.args.get("length"):
                length = exp.If(this=length < 0, true=exp.Literal.number(0), false=length)

            return self.func("SUBSTRING", expression.this, start, length)

        return self.function_fallback_sql(expression)

    def strtotime_sql(self, expression: exp.StrToTime) -> str:
        # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants)
        target_type = expression.args.get("target_type")
        needs_tz = target_type and target_type.this in (
            exp.DType.TIMESTAMPLTZ,
            exp.DType.TIMESTAMPTZ,
        )

        if expression.args.get("safe"):
            formatted_time = self.format_time(expression)
            cast_type = exp.DType.TIMESTAMPTZ if needs_tz else exp.DType.TIMESTAMP
            return self.sql(
                exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type)
            )

        base_sql = str_to_time_sql(self, expression)
        if needs_tz:
            return self.sql(
                exp.cast(
                    base_sql,
                    exp.DataType(this=exp.DType.TIMESTAMPTZ),
                )
            )
        return base_sql

    def strtodate_sql(self, expression: exp.StrToDate) -> str:
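        # A rough transpilation sketch (illustrative input; format-code mapping assumed):
        #   Snowflake TO_DATE('2020-01-15', 'YYYY-MM-DD')
        #   -> CAST(STRPTIME('2020-01-15', '%Y-%m-%d') AS DATE)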
formatted_time = self.format_time(expression) 2642 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2643 return self.sql( 2644 exp.cast( 2645 self.func(function_name, expression.this, formatted_time), 2646 exp.DataType(this=exp.DType.DATE), 2647 ) 2648 ) 2649 2650 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2651 this = expression.this 2652 time_format = self.format_time(expression) 2653 safe = expression.args.get("safe") 2654 time_type = exp.DataType.from_str("TIME", dialect="duckdb") 2655 cast_expr = exp.TryCast if safe else exp.Cast 2656 2657 if time_format: 2658 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2659 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2660 return self.sql(cast_expr(this=strptime, to=time_type)) 2661 2662 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DType.TIME): 2663 return self.sql(this) 2664 2665 return self.sql(cast_expr(this=this, to=time_type)) 2666 2667 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2668 if not expression.this: 2669 return "CURRENT_DATE" 2670 2671 expr = exp.Cast( 2672 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2673 to=exp.DataType(this=exp.DType.DATE), 2674 ) 2675 return self.sql(expr) 2676 2677 def checkjson_sql(self, expression: exp.CheckJson) -> str: 2678 arg = expression.this 2679 return self.sql( 2680 exp.case() 2681 .when( 2682 exp.or_(arg.is_(exp.Null()), arg.eq(""), exp.func("json_valid", arg)), 2683 exp.null(), 2684 ) 2685 .else_(exp.Literal.string("Invalid JSON")) 2686 ) 2687 2688 def parsejson_sql(self, expression: exp.ParseJSON) -> str: 2689 arg = expression.this 2690 if expression.args.get("safe"): 2691 return self.sql( 2692 exp.case() 2693 .when(exp.func("json_valid", arg), exp.cast(arg.copy(), "JSON")) 2694 .else_(exp.null()) 2695 ) 2696 return self.func("JSON", arg) 2697 2698 def unicode_sql(self, expression: exp.Unicode) -> str: 2699 if expression.args.get("empty_is_zero"): 2700 return self.sql( 2701 exp.case() 2702 .when(expression.this.eq(exp.Literal.string("")), exp.Literal.number(0)) 2703 .else_(exp.Anonymous(this="UNICODE", expressions=[expression.this])) 2704 ) 2705 2706 return self.func("UNICODE", expression.this) 2707 2708 def stripnullvalue_sql(self, expression: exp.StripNullValue) -> str: 2709 return self.sql( 2710 exp.case() 2711 .when(exp.func("json_type", expression.this).eq("NULL"), exp.null()) 2712 .else_(expression.this) 2713 ) 2714 2715 def trunc_sql(self, expression: exp.Trunc) -> str: 2716 decimals = expression.args.get("decimals") 2717 if ( 2718 expression.args.get("fractions_supported") 2719 and decimals 2720 and not decimals.is_type(exp.DType.INT) 2721 ): 2722 decimals = exp.cast(decimals, exp.DType.INT, dialect="duckdb") 2723 2724 return self.func("TRUNC", expression.this, decimals) 2725 2726 def normal_sql(self, expression: exp.Normal) -> str: 2727 """ 2728 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2729 2730 Uses the Box-Muller transform via NORMAL_TEMPLATE. 
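
        Sketch of the generated expression (u1 and u2 are uniform draws in [0, 1);
        the template clamps u1 with GREATEST(u1, 1e-10) to avoid LN(0)):
            mean + stddev * SQRT(-2 * LN(u1)) * COS(2 * PI() * u2)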
2731 """ 2732 mean = expression.this 2733 stddev = expression.args["stddev"] 2734 gen: exp.Expr = expression.args["gen"] 2735 2736 # Build two uniform random values [0, 1) for Box-Muller transform 2737 if isinstance(gen, exp.Rand) and gen.this is None: 2738 u1: exp.Expr = exp.Rand() 2739 u2: exp.Expr = exp.Rand() 2740 else: 2741 # Seeded: derive two values using HASH with different inputs 2742 seed = gen.this if isinstance(gen, exp.Rand) else gen 2743 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2744 u2 = exp.replace_placeholders( 2745 self.SEEDED_RANDOM_TEMPLATE, 2746 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2747 ) 2748 2749 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2750 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2751 2752 def uniform_sql(self, expression: exp.Uniform) -> str: 2753 """ 2754 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2755 2756 UNIFORM returns a random value in [min, max]: 2757 - Integer result if both min and max are integers 2758 - Float result if either min or max is a float 2759 """ 2760 min_val = expression.this 2761 max_val = expression.expression 2762 gen = expression.args.get("gen") 2763 2764 # Determine if result should be integer (both bounds are integers). 2765 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2766 is_int_result = min_val.is_int and max_val.is_int 2767 2768 # Build the random value expression [0, 1) 2769 if not isinstance(gen, exp.Rand): 2770 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2771 random_expr: exp.Expr = exp.Div( 2772 this=exp.Paren( 2773 this=exp.Mod( 2774 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2775 expression=exp.Literal.number(1000000), 2776 ) 2777 ), 2778 expression=exp.Literal.number(1000000.0), 2779 ) 2780 else: 2781 random_expr = exp.Rand() 2782 2783 # Build: min + random * (max - min [+ 1 for int]) 2784 range_expr: exp.Expr = exp.Sub(this=max_val, expression=min_val) 2785 if is_int_result: 2786 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2787 2788 result: exp.Expr = exp.Add( 2789 this=min_val, 2790 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2791 ) 2792 2793 if is_int_result: 2794 result = exp.Cast(this=exp.Floor(this=result), to=exp.DType.BIGINT.into_expr()) 2795 2796 return self.sql(result) 2797 2798 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2799 nano = expression.args.get("nano") 2800 overflow = expression.args.get("overflow") 2801 2802 # Snowflake's TIME_FROM_PARTS supports overflow 2803 if overflow: 2804 hour = expression.args["hour"] 2805 minute = expression.args["min"] 2806 sec = expression.args["sec"] 2807 2808 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2809 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2810 try: 2811 h_val = hour.to_py() 2812 m_val = minute.to_py() 2813 s_val = sec.to_py() 2814 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2815 return rename_func("MAKE_TIME")(self, expression) 2816 except ValueError: 2817 pass 2818 2819 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2820 if nano: 2821 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2822 2823 total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2824 2825 return self.sql( 2826 exp.Add( 2827 this=exp.Cast( 2828 this=exp.Literal.string("00:00:00"), 
to=exp.DType.TIME.into_expr() 2829 ), 2830 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2831 ) 2832 ) 2833 2834 # Default: MAKE_TIME 2835 if nano: 2836 expression.set( 2837 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2838 ) 2839 2840 return rename_func("MAKE_TIME")(self, expression) 2841 2842 def extract_sql(self, expression: exp.Extract) -> str: 2843 """ 2844 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2845 2846 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2847 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2848 """ 2849 this = expression.this 2850 datetime_expr = expression.expression 2851 2852 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2853 # because Snowflake applies server timezone while DuckDB uses local timezone 2854 if datetime_expr.is_type(exp.DType.TIMESTAMPTZ, exp.DType.TIMESTAMPLTZ): 2855 self.unsupported( 2856 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2857 ) 2858 2859 part_name = this.name.upper() 2860 2861 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2862 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2863 2864 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2865 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2866 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2867 exp.DType.TIME, exp.DType.TIMETZ 2868 ) 2869 2870 if is_nano_time: 2871 self.unsupported("Parameter NANOSECOND is not supported with TIME type in DuckDB") 2872 return self.sql( 2873 exp.cast( 2874 exp.Mul( 2875 this=exp.Extract(this=exp.var("MICROSECOND"), expression=datetime_expr), 2876 expression=exp.Literal.number(1000), 2877 ), 2878 exp.DataType.from_str(cast_type, dialect="duckdb"), 2879 ) 2880 ) 2881 2882 # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision 2883 strftime_input = datetime_expr 2884 if part_name == "NANOSECOND": 2885 strftime_input = exp.cast(datetime_expr, exp.DType.TIMESTAMP_NS) 2886 2887 return self.sql( 2888 exp.cast( 2889 exp.Anonymous( 2890 this="STRFTIME", 2891 expressions=[strftime_input, exp.Literal.string(fmt)], 2892 ), 2893 exp.DataType.from_str(cast_type, dialect="duckdb"), 2894 ) 2895 ) 2896 2897 if part_name in self.EXTRACT_EPOCH_MAPPINGS: 2898 func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name] 2899 result: exp.Expr = exp.Anonymous(this=func_name, expressions=[datetime_expr]) 2900 # EPOCH returns float, cast to BIGINT for integer result 2901 if part_name == "EPOCH_SECOND": 2902 result = exp.cast(result, exp.DataType.from_str("BIGINT", dialect="duckdb")) 2903 return self.sql(result) 2904 2905 return super().extract_sql(expression) 2906 2907 def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str: 2908 # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr) 2909 date_expr = expression.this 2910 time_expr = expression.expression 2911 2912 if date_expr is not None and time_expr is not None: 2913 # In DuckDB, DATE + TIME produces TIMESTAMP 2914 return self.sql(exp.Add(this=date_expr, expression=time_expr)) 2915 2916 # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...) 
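        # Illustrative example for the component form (assumed input, output approximate):
        #   TIMESTAMP_FROM_PARTS(2020, 1, 15, 10, 30, 0) -> MAKE_TIMESTAMP(2020, 1, 15, 10, 30, 0)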
2917 sec = expression.args.get("sec") 2918 if sec is None: 2919 # This shouldn't happen with valid input, but handle gracefully 2920 return rename_func("MAKE_TIMESTAMP")(self, expression) 2921 2922 milli = expression.args.get("milli") 2923 if milli is not None: 2924 sec += milli.pop() / exp.Literal.number(1000.0) 2925 2926 nano = expression.args.get("nano") 2927 if nano is not None: 2928 sec += nano.pop() / exp.Literal.number(1000000000.0) 2929 2930 if milli or nano: 2931 expression.set("sec", sec) 2932 2933 return rename_func("MAKE_TIMESTAMP")(self, expression) 2934 2935 @unsupported_args("nano") 2936 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 2937 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2938 if nano := expression.args.get("nano"): 2939 nano.pop() 2940 2941 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2942 return f"CAST({timestamp} AS TIMESTAMPTZ)" 2943 2944 @unsupported_args("nano") 2945 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 2946 # Extract zone before popping 2947 zone = expression.args.get("zone") 2948 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 2949 if zone: 2950 zone = zone.pop() 2951 2952 if nano := expression.args.get("nano"): 2953 nano.pop() 2954 2955 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 2956 2957 if zone: 2958 # Use AT TIME ZONE to apply the explicit timezone 2959 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 2960 2961 return timestamp 2962 2963 def tablesample_sql( 2964 self, 2965 expression: exp.TableSample, 2966 tablesample_keyword: str | None = None, 2967 ) -> str: 2968 if not isinstance(expression.parent, exp.Select): 2969 # This sample clause only applies to a single source, not the entire resulting relation 2970 tablesample_keyword = "TABLESAMPLE" 2971 2972 if expression.args.get("size"): 2973 method = expression.args.get("method") 2974 if method and method.name.upper() != "RESERVOIR": 2975 self.unsupported( 2976 f"Sampling method {method} is not supported with a discrete sample count, " 2977 "defaulting to reservoir sampling" 2978 ) 2979 expression.set("method", exp.var("RESERVOIR")) 2980 2981 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 2982 2983 def join_sql(self, expression: exp.Join) -> str: 2984 if ( 2985 not expression.args.get("using") 2986 and not expression.args.get("on") 2987 and not expression.method 2988 and (expression.kind in ("", "INNER", "OUTER")) 2989 ): 2990 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 2991 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 2992 if isinstance(expression.this, exp.Unnest): 2993 return super().join_sql(expression.on(exp.true())) 2994 2995 expression.set("side", None) 2996 expression.set("kind", None) 2997 2998 return super().join_sql(expression) 2999 3000 def countif_sql(self, expression: exp.CountIf) -> str: 3001 if self.dialect.version >= (1, 2): 3002 return self.function_fallback_sql(expression) 3003 3004 # https://github.com/tobymao/sqlglot/pull/4749 3005 return count_if_to_sum(self, expression) 3006 3007 def bracket_sql(self, expression: exp.Bracket) -> str: 3008 if self.dialect.version >= (1, 2): 3009 return super().bracket_sql(expression) 3010 3011 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 3012 this = expression.this 3013 if isinstance(this, exp.Array): 3014 this.replace(exp.paren(this)) 3015 
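        # e.g. for DuckDB < 1.2 an array literal must be parenthesized before indexing
        # (illustrative): [1, 2, 3][1] -> ([1, 2, 3])[1]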
3016 bracket = super().bracket_sql(expression) 3017 3018 if not expression.args.get("returns_list_for_maps"): 3019 if not this.type: 3020 from sqlglot.optimizer.annotate_types import annotate_types 3021 3022 this = annotate_types(this, dialect=self.dialect) 3023 3024 if this.is_type(exp.DType.MAP): 3025 bracket = f"({bracket})[1]" 3026 3027 return bracket 3028 3029 def withingroup_sql(self, expression: exp.WithinGroup) -> str: 3030 func = expression.this 3031 3032 # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP 3033 # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y) 3034 if isinstance(func, exp.ArrayAgg): 3035 if not isinstance(order := expression.expression, exp.Order): 3036 return self.sql(func) 3037 3038 # Save the original column for FILTER clause (before wrapping with Order) 3039 original_this = func.this 3040 3041 # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order 3042 # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions) 3043 func.set( 3044 "this", 3045 exp.Order( 3046 this=func.this.copy(), 3047 expressions=order.expressions, 3048 ), 3049 ) 3050 3051 # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed 3052 # Use original_this (not the Order-wrapped version) for the FILTER condition 3053 array_agg_sql = self.function_fallback_sql(func) 3054 return self._add_arrayagg_null_filter(array_agg_sql, func, original_this) 3055 3056 # For other functions (like PERCENTILES), use existing logic 3057 expression_sql = self.sql(expression, "expression") 3058 3059 if isinstance(func, exp.PERCENTILES): 3060 # Make the order key the first arg and slide the fraction to the right 3061 # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions 3062 order_col = expression.find(exp.Ordered) 3063 if order_col: 3064 func.set("expression", func.this) 3065 func.set("this", order_col.this) 3066 3067 this = self.sql(expression, "this").rstrip(")") 3068 3069 return f"{this}{expression_sql})" 3070 3071 def length_sql(self, expression: exp.Length) -> str: 3072 arg = expression.this 3073 3074 # Dialects like BQ and Snowflake also accept binary values as args, so 3075 # DDB will attempt to infer the type or resort to case/when resolution 3076 if not expression.args.get("binary") or arg.is_string: 3077 return self.func("LENGTH", arg) 3078 3079 if not arg.type: 3080 from sqlglot.optimizer.annotate_types import annotate_types 3081 3082 arg = annotate_types(arg, dialect=self.dialect) 3083 3084 if arg.is_type(*exp.DataType.TEXT_TYPES): 3085 return self.func("LENGTH", arg) 3086 3087 # We need these casts to make duckdb's static type checker happy 3088 blob = exp.cast(arg, exp.DType.VARBINARY) 3089 varchar = exp.cast(arg, exp.DType.VARCHAR) 3090 3091 case = ( 3092 exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg])) 3093 .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob)) 3094 .else_(exp.Anonymous(this="LENGTH", expressions=[varchar])) 3095 ) 3096 return self.sql(case) 3097 3098 def bitlength_sql(self, expression: exp.BitLength) -> str: 3099 if not _is_binary(arg := expression.this): 3100 return self.func("BIT_LENGTH", arg) 3101 3102 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 3103 return self.sql(exp.ByteLength(this=blob) * exp.Literal.number(8)) 3104 3105 def chr_sql(self, expression: exp.Chr, name: str = "CHR") -> str: 3106 arg = expression.expressions[0] 3107 if arg.is_type(*exp.DataType.REAL_TYPES): 3108 arg = exp.cast(arg, exp.DType.INT) 3109 
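        # DuckDB's CHR expects an integer code point, so REAL inputs are first cast to
        # INT, e.g. CHR(65.0) -> CHR(CAST(65.0 AS INT)) (illustrative)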
return self.func("CHR", arg) 3110 3111 def collation_sql(self, expression: exp.Collation) -> str: 3112 self.unsupported("COLLATION function is not supported by DuckDB") 3113 return self.function_fallback_sql(expression) 3114 3115 def collate_sql(self, expression: exp.Collate) -> str: 3116 if not expression.expression.is_string: 3117 return super().collate_sql(expression) 3118 3119 raw = expression.expression.name 3120 if not raw: 3121 return self.sql(expression.this) 3122 3123 parts = [] 3124 for part in raw.split("-"): 3125 lower = part.lower() 3126 if lower not in _SNOWFLAKE_COLLATION_DEFAULTS: 3127 if lower in _SNOWFLAKE_COLLATION_UNSUPPORTED: 3128 self.unsupported( 3129 f"Snowflake collation specifier '{part}' has no DuckDB equivalent" 3130 ) 3131 parts.append(lower) 3132 3133 if not parts: 3134 return self.sql(expression.this) 3135 return super().collate_sql( 3136 exp.Collate(this=expression.this, expression=exp.var(".".join(parts))) 3137 ) 3138 3139 def _validate_regexp_flags(self, flags: exp.Expr | None, supported_flags: str) -> str | None: 3140 """ 3141 Validate and filter regexp flags for DuckDB compatibility. 3142 3143 Args: 3144 flags: The flags expression to validate 3145 supported_flags: String of supported flags (e.g., "ims", "cims"). 3146 Only these flags will be returned. 3147 3148 Returns: 3149 Validated/filtered flag string, or None if no valid flags remain 3150 """ 3151 if not isinstance(flags, exp.Expr): 3152 return None 3153 3154 if not flags.is_string: 3155 self.unsupported("Non-literal regexp flags are not fully supported in DuckDB") 3156 return None 3157 3158 flag_str = flags.this 3159 unsupported = set(flag_str) - set(supported_flags) 3160 3161 if unsupported: 3162 self.unsupported( 3163 f"Regexp flags {sorted(unsupported)} are not supported in this context" 3164 ) 3165 3166 flag_str = "".join(f for f in flag_str if f in supported_flags) 3167 return flag_str if flag_str else None 3168 3169 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3170 this = expression.this 3171 pattern = expression.expression 3172 position = expression.args.get("position") 3173 parameters = expression.args.get("parameters") 3174 3175 # Validate flags - only "ims" flags are supported for embedded patterns 3176 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3177 3178 if position: 3179 this = exp.Substring(this=this, start=position) 3180 3181 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3182 if validated_flags: 3183 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3184 3185 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3186 result = ( 3187 exp.case() 3188 .when( 3189 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3190 exp.Literal.number(0), 3191 ) 3192 .else_( 3193 exp.Length( 3194 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3195 ) 3196 ) 3197 ) 3198 3199 return self.sql(result) 3200 3201 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3202 subject = expression.this 3203 pattern = expression.expression 3204 replacement = expression.args.get("replacement") or exp.Literal.string("") 3205 position = expression.args.get("position") 3206 occurrence = expression.args.get("occurrence") 3207 modifiers = expression.args.get("modifiers") 3208 3209 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3210 3211 # Handle occurrence (only literals 
supported) 3212 if occurrence and not occurrence.is_int: 3213 self.unsupported("REGEXP_REPLACE with non-literal occurrence") 3214 else: 3215 occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0 3216 if occurrence > 1: 3217 self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported") 3218 # flag duckdb to do either all or none, single_replace check is for duckdb round trip 3219 elif ( 3220 occurrence == 0 3221 and "g" not in validated_flags 3222 and not expression.args.get("single_replace") 3223 ): 3224 validated_flags += "g" 3225 3226 # Handle position (only literals supported) 3227 prefix = None 3228 if position and not position.is_int: 3229 self.unsupported("REGEXP_REPLACE with non-literal position") 3230 elif position and position.is_int and position.to_py() > 1: 3231 pos = position.to_py() 3232 prefix = exp.Substring( 3233 this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1) 3234 ) 3235 subject = exp.Substring(this=subject, start=exp.Literal.number(pos)) 3236 3237 result: exp.Expr = exp.Anonymous( 3238 this="REGEXP_REPLACE", 3239 expressions=[ 3240 subject, 3241 pattern, 3242 replacement, 3243 exp.Literal.string(validated_flags) if validated_flags else None, 3244 ], 3245 ) 3246 3247 if prefix: 3248 result = exp.Concat(expressions=[prefix, result]) 3249 3250 return self.sql(result) 3251 3252 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3253 this = expression.this 3254 pattern = expression.expression 3255 flag = expression.args.get("flag") 3256 3257 if expression.args.get("full_match"): 3258 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3259 flag = exp.Literal.string(validated_flags) if validated_flags else None 3260 return self.func("REGEXP_FULL_MATCH", this, pattern, flag) 3261 3262 return self.func("REGEXP_MATCHES", this, pattern, flag) 3263 3264 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3265 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3266 this = expression.this 3267 expr = expression.expression 3268 max_dist = expression.args.get("max_dist") 3269 3270 if max_dist is None: 3271 return self.func("LEVENSHTEIN", this, expr) 3272 3273 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3274 levenshtein = exp.Levenshtein(this=this, expression=expr) 3275 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3276 3277 def pad_sql(self, expression: exp.Pad) -> str: 3278 """ 3279 Handle RPAD/LPAD for VARCHAR and BINARY types. 
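        (Illustrative sketch: for BINARY input, LPAD(x, 4, f) lowers to
        REPEAT(f, GREATEST(0, 4 - OCTET_LENGTH(x))) || x.)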
3280 3281 For VARCHAR: Delegate to parent class 3282 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3283 """ 3284 string_arg = expression.this 3285 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3286 3287 if _is_binary(string_arg) or _is_binary(fill_arg): 3288 length_arg = expression.expression 3289 is_left = expression.args.get("is_left") 3290 3291 input_len = exp.ByteLength(this=string_arg) 3292 chars_needed = length_arg - input_len 3293 pad_count = exp.Greatest( 3294 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3295 ) 3296 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3297 3298 left, right = string_arg, repeat_expr 3299 if is_left: 3300 left, right = right, left 3301 3302 result = exp.DPipe(this=left, expression=right) 3303 return self.sql(result) 3304 3305 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3306 return super().pad_sql(expression) 3307 3308 def minhash_sql(self, expression: exp.Minhash) -> str: 3309 k = expression.this 3310 exprs = expression.expressions 3311 3312 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3313 self.unsupported( 3314 "MINHASH with multiple expressions or * requires manual query restructuring" 3315 ) 3316 return self.func("MINHASH", k, *exprs) 3317 3318 expr = exprs[0] 3319 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3320 return f"({self.sql(result)})" 3321 3322 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3323 expr = expression.this 3324 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3325 return f"({self.sql(result)})" 3326 3327 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3328 expr = expression.this 3329 result = exp.replace_placeholders(self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr) 3330 return f"({self.sql(result)})" 3331 3332 def arrayuniqueagg_sql(self, expression: exp.ArrayUniqueAgg) -> str: 3333 return self.sql( 3334 exp.Filter( 3335 this=exp.func("LIST", exp.Distinct(expressions=[expression.this])), 3336 expression=exp.Where(this=expression.this.copy().is_(exp.null()).not_()), 3337 ) 3338 ) 3339 3340 def arrayunionagg_sql(self, expression: exp.ArrayUnionAgg) -> str: 3341 self.unsupported("ARRAY_UNION_AGG is not supported in DuckDB") 3342 return self.function_fallback_sql(expression) 3343 3344 def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str: 3345 arr = expression.this 3346 func = self.func("LIST_DISTINCT", arr) 3347 3348 if expression.args.get("check_null"): 3349 add_null_to_array = exp.func( 3350 "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null() 3351 ) 3352 return self.sql( 3353 exp.If( 3354 this=exp.NEQ( 3355 this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr) 3356 ), 3357 true=add_null_to_array, 3358 false=func, 3359 ) 3360 ) 3361 3362 return func 3363 3364 def arrayintersect_sql(self, expression: exp.ArrayIntersect) -> str: 3365 if expression.args.get("is_multiset") and len(expression.expressions) == 2: 3366 return self._array_bag_sql( 3367 self.ARRAY_INTERSECTION_CONDITION, 3368 expression.expressions[0], 3369 expression.expressions[1], 3370 ) 3371 return self.function_fallback_sql(expression) 3372 3373 def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str: 3374 arr1, arr2 = expression.this, expression.expression 3375 if expression.args.get("is_multiset"): 3376 return 
self._array_bag_sql(self.ARRAY_EXCEPT_CONDITION, arr1, arr2) 3377 return self.sql( 3378 exp.replace_placeholders(self.ARRAY_EXCEPT_SET_TEMPLATE, arr1=arr1, arr2=arr2) 3379 ) 3380 3381 def arrayslice_sql(self, expression: exp.ArraySlice) -> str: 3382 """ 3383 Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's 3384 ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE 3385 expressions that adjust the index at query time: 3386 - start: CASE WHEN start >= 0 THEN start + 1 ELSE start END 3387 - end: CASE WHEN end < 0 THEN end - 1 ELSE end END 3388 """ 3389 start, end = expression.args.get("start"), expression.args.get("end") 3390 3391 if expression.args.get("zero_based"): 3392 if start is not None: 3393 start = ( 3394 exp.case() 3395 .when( 3396 exp.GTE(this=start.copy(), expression=exp.Literal.number(0)), 3397 exp.Add(this=start.copy(), expression=exp.Literal.number(1)), 3398 ) 3399 .else_(start) 3400 ) 3401 if end is not None: 3402 end = ( 3403 exp.case() 3404 .when( 3405 exp.LT(this=end.copy(), expression=exp.Literal.number(0)), 3406 exp.Sub(this=end.copy(), expression=exp.Literal.number(1)), 3407 ) 3408 .else_(end) 3409 ) 3410 3411 return self.func("ARRAY_SLICE", expression.this, start, end, expression.args.get("step")) 3412 3413 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3414 args = expression.expressions 3415 3416 if not args: 3417 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3418 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3419 3420 # Build placeholder values for template 3421 lengths = [exp.Length(this=arg) for arg in args] 3422 max_len = ( 3423 lengths[0] 3424 if len(lengths) == 1 3425 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3426 ) 3427 3428 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3429 empty_struct = exp.func( 3430 "STRUCT", 3431 *[ 3432 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3433 for i in range(len(args)) 3434 ], 3435 ) 3436 3437 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3438 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3439 index = exp.column("__i") + 1 3440 transform_struct = exp.func( 3441 "STRUCT", 3442 *[ 3443 exp.PropertyEQ( 3444 this=exp.Literal.string(f"${i + 1}"), 3445 expression=exp.func("COALESCE", arg, exp.array())[index], 3446 ) 3447 for i, arg in enumerate(args) 3448 ], 3449 ) 3450 3451 result = exp.replace_placeholders( 3452 self.ARRAYS_ZIP_TEMPLATE.copy(), 3453 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3454 all_empty_check=exp.and_( 3455 *[ 3456 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3457 for arg in args 3458 ] 3459 ), 3460 empty_struct=empty_struct, 3461 max_len=max_len, 3462 transform_struct=transform_struct, 3463 ) 3464 return self.sql(result) 3465 3466 def lower_sql(self, expression: exp.Lower) -> str: 3467 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3468 return _gen_with_cast_to_blob(self, expression, result_sql) 3469 3470 def upper_sql(self, expression: exp.Upper) -> str: 3471 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3472 return _gen_with_cast_to_blob(self, expression, result_sql) 3473 3474 def reverse_sql(self, expression: exp.Reverse) -> str: 3475 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3476 return _gen_with_cast_to_blob(self, expression, result_sql) 3477 3478 def _left_right_sql(self, 
expression: exp.Left | exp.Right, func_name: str) -> str: 3479 arg = expression.this 3480 length = expression.expression 3481 is_binary = _is_binary(arg) 3482 3483 if is_binary: 3484 # LEFT/RIGHT(blob, n) becomes UNHEX(LEFT/RIGHT(HEX(blob), n * 2)) 3485 # Each byte becomes 2 hex chars, so multiply length by 2 3486 hex_arg = exp.Hex(this=arg) 3487 hex_length = exp.Mul(this=length, expression=exp.Literal.number(2)) 3488 result: exp.Expression = exp.Unhex( 3489 this=exp.Anonymous(this=func_name, expressions=[hex_arg, hex_length]) 3490 ) 3491 else: 3492 result = exp.Anonymous(this=func_name, expressions=[arg, length]) 3493 3494 if expression.args.get("negative_length_returns_empty"): 3495 empty: exp.Expression = exp.Literal.string("") 3496 if is_binary: 3497 empty = exp.Unhex(this=empty) 3498 result = exp.case().when(length < exp.Literal.number(0), empty).else_(result) 3499 3500 return self.sql(result) 3501 3502 def left_sql(self, expression: exp.Left) -> str: 3503 return self._left_right_sql(expression, "LEFT") 3504 3505 def right_sql(self, expression: exp.Right) -> str: 3506 return self._left_right_sql(expression, "RIGHT") 3507 3508 def rtrimmedlength_sql(self, expression: exp.RtrimmedLength) -> str: 3509 return self.func("LENGTH", exp.Trim(this=expression.this, position="TRAILING")) 3510 3511 def stuff_sql(self, expression: exp.Stuff) -> str: 3512 base = expression.this 3513 start = expression.args["start"] 3514 length = expression.args["length"] 3515 insertion = expression.expression 3516 is_binary = _is_binary(base) 3517 3518 if is_binary: 3519 # DuckDB's SUBSTRING doesn't accept BLOB; operate on the HEX string instead 3520 # (each byte = 2 hex chars), then UNHEX back to BLOB 3521 base = exp.Hex(this=base) 3522 insertion = exp.Hex(this=insertion) 3523 left = exp.Substring( 3524 this=base.copy(), 3525 start=exp.Literal.number(1), 3526 length=(start.copy() - exp.Literal.number(1)) * exp.Literal.number(2), 3527 ) 3528 right = exp.Substring( 3529 this=base.copy(), 3530 start=((start + length) - exp.Literal.number(1)) * exp.Literal.number(2) 3531 + exp.Literal.number(1), 3532 ) 3533 else: 3534 left = exp.Substring( 3535 this=base.copy(), 3536 start=exp.Literal.number(1), 3537 length=start.copy() - exp.Literal.number(1), 3538 ) 3539 right = exp.Substring(this=base.copy(), start=start + length) 3540 result: exp.Expr = exp.DPipe( 3541 this=exp.DPipe(this=left, expression=insertion), expression=right 3542 ) 3543 3544 if is_binary: 3545 result = exp.Unhex(this=result) 3546 3547 return self.sql(result) 3548 3549 def rand_sql(self, expression: exp.Rand) -> str: 3550 seed = expression.this 3551 if seed is not None: 3552 self.unsupported("RANDOM with seed is not supported in DuckDB") 3553 3554 lower = expression.args.get("lower") 3555 upper = expression.args.get("upper") 3556 3557 if lower and upper: 3558 # scale DuckDB's [0,1) to the specified range 3559 range_size = exp.paren(upper - lower) 3560 scaled = exp.Add(this=lower, expression=exp.func("random") * range_size) 3561 3562 # For now we assume that if bounds are set, return type is BIGINT. 
Snowflake/Teradata return integer results for bounded random expressions, so we match that. 3563 result = exp.cast(scaled, exp.DType.BIGINT) 3564 return self.sql(result) 3565 3566 # Default DuckDB behavior - just return RANDOM() as float 3567 return "RANDOM()" 3568 3569 def bytelength_sql(self, expression: exp.ByteLength) -> str: 3570 arg = expression.this 3571 3572 # Check if it's a text type (handles both literals and annotated expressions) 3573 if arg.is_type(*exp.DataType.TEXT_TYPES): 3574 return self.func("OCTET_LENGTH", exp.Encode(this=arg)) 3575 3576 # Default: pass through as-is (conservative for DuckDB, handles binary and unannotated) 3577 return self.func("OCTET_LENGTH", arg) 3578 3579 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3580 # DuckDB TO_BASE64 requires BLOB input 3581 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3582 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3583 result = expression.this 3584 3585 # Check if input is a string type - ENCODE only accepts VARCHAR 3586 if result.is_type(*exp.DataType.TEXT_TYPES): 3587 result = exp.Encode(this=result) 3588 3589 result = exp.ToBase64(this=result) 3590 3591 max_line_length = expression.args.get("max_line_length") 3592 alphabet = expression.args.get("alphabet") 3593 3594 # Handle custom alphabet by replacing standard chars with custom ones 3595 result = _apply_base64_alphabet_replacements(result, alphabet) 3596 3597 # Handle max_line_length by inserting newlines every N characters 3598 line_length = ( 3599 t.cast(int, max_line_length.to_py()) 3600 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3601 else 0 3602 ) 3603 if line_length > 0: 3604 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3605 result = exp.Trim( 3606 this=exp.RegexpReplace( 3607 this=result, 3608 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3609 replacement=exp.Concat(expressions=[exp.Literal.string("\\1"), newline.copy()]), 3610 ), 3611 expression=newline, 3612 position="TRAILING", 3613 ) 3614 3615 return self.sql(result) 3616 3617 def hex_sql(self, expression: exp.Hex) -> str: 3618 case = expression.args.get("case") 3619 3620 if not case: 3621 return self.func("HEX", expression.this) 3622 3623 hex_expr = exp.Hex(this=expression.this) 3624 return self.sql( 3625 exp.case() 3626 .when(case.is_(exp.null()), exp.null()) 3627 .when(case.copy().eq(0), exp.Lower(this=hex_expr.copy())) 3628 .else_(hex_expr) 3629 ) 3630 3631 def replace_sql(self, expression: exp.Replace) -> str: 3632 result_sql = self.func( 3633 "REPLACE", 3634 _cast_to_varchar(expression.this), 3635 _cast_to_varchar(expression.expression), 3636 _cast_to_varchar(expression.args.get("replacement")), 3637 ) 3638 return _gen_with_cast_to_blob(self, expression, result_sql) 3639 3640 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3641 _prepare_binary_bitwise_args(expression) 3642 result_sql = self.binary(expression, op) 3643 return _gen_with_cast_to_blob(self, expression, result_sql) 3644 3645 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3646 _prepare_binary_bitwise_args(expression) 3647 result_sql = self.func("XOR", expression.this, expression.expression) 3648 return _gen_with_cast_to_blob(self, expression, result_sql) 3649 3650 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3651 this = expression.this 3652 key = expression.args.get("key") 3653 key_sql = key.name if isinstance(key, exp.Expr) else "" 3654 value_sql = self.sql(expression, "value") 3655 3656 kv_sql = f"{key_sql} := {value_sql}"
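        # kv_sql is a named-argument fragment in DuckDB's struct syntax, e.g. `b := 2`
        # (the key name comes from key.name, the value from the generated value SQL)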
3657 3658 # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3659 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3660 if isinstance(this, exp.Struct) and not this.expressions: 3661 return self.func("STRUCT_PACK", kv_sql) 3662 3663 return self.func("STRUCT_INSERT", this, kv_sql) 3664 3665 def mapcat_sql(self, expression: exp.MapCat) -> str: 3666 result = exp.replace_placeholders( 3667 self.MAPCAT_TEMPLATE.copy(), 3668 map1=expression.this, 3669 map2=expression.expression, 3670 ) 3671 return self.sql(result) 3672 3673 def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str: 3674 return self.func( 3675 "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this 3676 ) 3677 3678 def mapdelete_sql(self, expression: exp.MapDelete) -> str: 3679 map_arg = expression.this 3680 keys_to_delete = expression.expressions 3681 3682 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3683 3684 lambda_expr = exp.Lambda( 3685 this=exp.In(this=x_dot_key, expressions=keys_to_delete).not_(), 3686 expressions=[exp.to_identifier("x")], 3687 ) 3688 result = exp.func( 3689 "MAP_FROM_ENTRIES", 3690 exp.ArrayFilter(this=exp.func("MAP_ENTRIES", map_arg), expression=lambda_expr), 3691 ) 3692 return self.sql(result) 3693 3694 def mappick_sql(self, expression: exp.MapPick) -> str: 3695 map_arg = expression.this 3696 keys_to_pick = expression.expressions 3697 3698 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3699 3700 if len(keys_to_pick) == 1 and keys_to_pick[0].is_type(exp.DType.ARRAY): 3701 lambda_expr = exp.Lambda( 3702 this=exp.func("ARRAY_CONTAINS", keys_to_pick[0], x_dot_key), 3703 expressions=[exp.to_identifier("x")], 3704 ) 3705 else: 3706 lambda_expr = exp.Lambda( 3707 this=exp.In(this=x_dot_key, expressions=keys_to_pick), 3708 expressions=[exp.to_identifier("x")], 3709 ) 3710 3711 result = exp.func( 3712 "MAP_FROM_ENTRIES", 3713 exp.func("LIST_FILTER", exp.func("MAP_ENTRIES", map_arg), lambda_expr), 3714 ) 3715 return self.sql(result) 3716 3717 def mapsize_sql(self, expression: exp.MapSize) -> str: 3718 return self.func("CARDINALITY", expression.this) 3719 3720 @unsupported_args("update_flag") 3721 def mapinsert_sql(self, expression: exp.MapInsert) -> str: 3722 map_arg = expression.this 3723 key = expression.args.get("key") 3724 value = expression.args.get("value") 3725 3726 map_type = map_arg.type 3727 3728 if value is not None: 3729 if map_type and map_type.expressions and len(map_type.expressions) > 1: 3730 # Extract the value type from MAP(key_type, value_type) 3731 value_type = map_type.expressions[1] 3732 # Cast value to match the map's value type to avoid type conflicts 3733 value = exp.cast(value, value_type) 3734 # else: polymorphic MAP case - no type parameters available, use value as-is 3735 3736 # Create a single-entry map for the new key-value pair 3737 new_entry_struct = exp.Struct(expressions=[exp.PropertyEQ(this=key, expression=value)]) 3738 new_entry: exp.Expression = exp.ToMap(this=new_entry_struct) 3739 3740 # Use MAP_CONCAT to merge the original map with the new entry 3741 # This automatically handles both insert and update cases 3742 result = exp.func("MAP_CONCAT", map_arg, new_entry) 3743 3744 return self.sql(result) 3745 3746 def startswith_sql(self, expression: exp.StartsWith) -> str: 3747 return self.func( 3748 "STARTS_WITH", 3749 
_cast_to_varchar(expression.this), 3750 _cast_to_varchar(expression.expression), 3751 ) 3752 3753 def space_sql(self, expression: exp.Space) -> str: 3754 # DuckDB's REPEAT requires BIGINT for the count parameter 3755 return self.sql( 3756 exp.Repeat( 3757 this=exp.Literal.string(" "), 3758 times=exp.cast(expression.this, exp.DType.BIGINT), 3759 ) 3760 ) 3761 3762 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3763 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3764 if isinstance(expression.this, exp.Generator): 3765 # Preserve alias, joins, and other table-level args 3766 table = exp.Table( 3767 this=expression.this, 3768 alias=expression.args.get("alias"), 3769 joins=expression.args.get("joins"), 3770 ) 3771 return self.sql(table) 3772 3773 return super().tablefromrows_sql(expression) 3774 3775 def unnest_sql(self, expression: exp.Unnest) -> str: 3776 explode_array = expression.args.get("explode_array") 3777 if explode_array: 3778 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3779 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3780 expression.expressions.append( 3781 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3782 ) 3783 3784 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3785 alias = expression.args.get("alias") 3786 if isinstance(alias, exp.TableAlias): 3787 expression.set("alias", None) 3788 if alias.columns: 3789 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3790 3791 unnest_sql = super().unnest_sql(expression) 3792 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3793 return self.sql(select) 3794 3795 return super().unnest_sql(expression) 3796 3797 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3798 this = expression.this 3799 3800 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3801 # DuckDB should render IGNORE NULLS only for the general-purpose 3802 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 
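            # Since IGNORE_NULLS_IN_FUNC is enabled for this generator, the modifier is
            # emitted inside the call, e.g. LEAD(x IGNORE NULLS) OVER (ORDER BY y)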
3803 return super().ignorenulls_sql(expression) 3804 3805 if isinstance(this, exp.First): 3806 this = exp.AnyValue(this=this.this) 3807 3808 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3809 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3810 3811 return self.sql(this) 3812 3813 def split_sql(self, expression: exp.Split) -> str: 3814 base_func = exp.func("STR_SPLIT", expression.this, expression.expression) 3815 3816 case_expr = exp.case().else_(base_func) 3817 needs_case = False 3818 3819 if expression.args.get("null_returns_null"): 3820 case_expr = case_expr.when(expression.expression.is_(exp.null()), exp.null()) 3821 needs_case = True 3822 3823 if expression.args.get("empty_delimiter_returns_whole"): 3824 # When delimiter is empty string, return input string as single array element 3825 array_with_input = exp.array(expression.this) 3826 case_expr = case_expr.when( 3827 expression.expression.eq(exp.Literal.string("")), array_with_input 3828 ) 3829 needs_case = True 3830 3831 return self.sql(case_expr if needs_case else base_func) 3832 3833 def splitpart_sql(self, expression: exp.SplitPart) -> str: 3834 string_arg = expression.this 3835 delimiter_arg = expression.args.get("delimiter") 3836 part_index_arg = expression.args.get("part_index") 3837 3838 if delimiter_arg and part_index_arg: 3839 # Handle Snowflake's "index 0 and 1 both return first element" behavior 3840 if expression.args.get("part_index_zero_as_one"): 3841 # Convert 0 to 1 for compatibility 3842 3843 part_index_arg = exp.Paren( 3844 this=exp.case() 3845 .when(part_index_arg.eq(exp.Literal.number("0")), exp.Literal.number("1")) 3846 .else_(part_index_arg) 3847 ) 3848 3849 # Use Anonymous to avoid recursion 3850 base_func_expr: exp.Expr = exp.Anonymous( 3851 this="SPLIT_PART", expressions=[string_arg, delimiter_arg, part_index_arg] 3852 ) 3853 needs_case_transform = False 3854 case_expr = exp.case().else_(base_func_expr) 3855 3856 if expression.args.get("empty_delimiter_returns_whole"): 3857 # When delimiter is empty string: 3858 # - Return whole string if part_index is 1 or -1 3859 # - Return empty string otherwise 3860 empty_case = exp.Paren( 3861 this=exp.case() 3862 .when( 3863 exp.or_( 3864 part_index_arg.eq(exp.Literal.number("1")), 3865 part_index_arg.eq(exp.Literal.number("-1")), 3866 ), 3867 string_arg, 3868 ) 3869 .else_(exp.Literal.string("")) 3870 ) 3871 3872 case_expr = case_expr.when(delimiter_arg.eq(exp.Literal.string("")), empty_case) 3873 needs_case_transform = True 3874 3875 """ 3876 Output looks something like this: 3877 3878 CASE 3879 WHEN delimiter is '' THEN 3880 ( 3881 CASE 3882 WHEN adjusted_part_index = 1 OR adjusted_part_index = -1 THEN input 3883 ELSE '' END 3884 ) 3885 ELSE SPLIT_PART(input, delimiter, adjusted_part_index) 3886 END 3887 3888 """ 3889 return self.sql(case_expr if needs_case_transform else base_func_expr) 3890 3891 return self.function_fallback_sql(expression) 3892 3893 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3894 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3895 # DuckDB should render RESPECT NULLS only for the general-purpose 3896 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 
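            # e.g. NTH_VALUE(x, 2 RESPECT NULLS) OVER (...) is preserved as-is; for other
            # functions the modifier is dropped below with a warning, since RESPECT NULLS
            # is the default null handling anyway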
3897 return super().respectnulls_sql(expression) 3898 3899 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3900 return self.sql(expression, "this") 3901 3902 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3903 null = expression.args.get("null") 3904 3905 if expression.args.get("null_is_empty"): 3906 x = exp.to_identifier("x") 3907 list_transform = exp.Transform( 3908 this=expression.this.copy(), 3909 expression=exp.Lambda( 3910 this=exp.Coalesce( 3911 this=exp.cast(x, "TEXT"), expressions=[exp.Literal.string("")] 3912 ), 3913 expressions=[x], 3914 ), 3915 ) 3916 array_to_string = exp.ArrayToString( 3917 this=list_transform, expression=expression.expression 3918 ) 3919 if expression.args.get("null_delim_is_null"): 3920 return self.sql( 3921 exp.case() 3922 .when(expression.expression.copy().is_(exp.null()), exp.null()) 3923 .else_(array_to_string) 3924 ) 3925 return self.sql(array_to_string) 3926 3927 if null: 3928 x = exp.to_identifier("x") 3929 return self.sql( 3930 exp.ArrayToString( 3931 this=exp.Transform( 3932 this=expression.this, 3933 expression=exp.Lambda( 3934 this=exp.Coalesce(this=x, expressions=[null]), 3935 expressions=[x], 3936 ), 3937 ), 3938 expression=expression.expression, 3939 ) 3940 ) 3941 3942 return self.func("ARRAY_TO_STRING", expression.this, expression.expression) 3943 3944 def concatws_sql(self, expression: exp.ConcatWs) -> str: 3945 # DuckDB-specific: handle binary types using DPipe (||) operator 3946 separator = seq_get(expression.expressions, 0) 3947 args = expression.expressions[1:] 3948 3949 if any(_is_binary(arg) for arg in [separator, *args]): 3950 result = args[0] 3951 for arg in args[1:]: 3952 result = exp.DPipe( 3953 this=exp.DPipe(this=result, expression=separator), expression=arg 3954 ) 3955 return self.sql(result) 3956 3957 return super().concatws_sql(expression) 3958 3959 def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str: 3960 this = expression.this 3961 group = expression.args.get("group") 3962 params = expression.args.get("parameters") 3963 position = expression.args.get("position") 3964 occurrence = expression.args.get("occurrence") 3965 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3966 3967 # Handle Snowflake's 'e' flag: it enables capture group extraction 3968 # In DuckDB, this is controlled by the group parameter directly 3969 if params and params.is_string and "e" in params.name: 3970 params = exp.Literal.string(params.name.replace("e", "")) 3971 3972 validated_flags = self._validate_regexp_flags(params, supported_flags="cims") 3973 3974 # Strip default group when no following params (DuckDB default is same as group=0) 3975 if ( 3976 not validated_flags 3977 and group 3978 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3979 ): 3980 group = None 3981 3982 flags_expr = exp.Literal.string(validated_flags) if validated_flags else None 3983 3984 # use substring to handle position argument 3985 if position and (not position.is_int or position.to_py() > 1): 3986 this = exp.Substring(this=this, start=position) 3987 3988 if null_if_pos_overflow: 3989 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3990 3991 is_extract_all = isinstance(expression, exp.RegexpExtractAll) 3992 non_single_occurrence = occurrence and (not occurrence.is_int or occurrence.to_py() > 1) 3993 3994 if is_extract_all or non_single_occurrence: 3995 name = "REGEXP_EXTRACT_ALL" 3996 else: 3997 name = "REGEXP_EXTRACT" 3998 3999 result: 
exp.Expr = exp.Anonymous( 4000 this=name, expressions=[this, expression.expression, group, flags_expr] 4001 ) 4002 4003 # Array slicing for REGEXP_EXTRACT_ALL with occurrence 4004 if is_extract_all and non_single_occurrence: 4005 result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)]) 4006 # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1 4007 elif non_single_occurrence: 4008 result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence]) 4009 4010 return self.sql(result) 4011 4012 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 4013 return self._regexp_extract_sql(expression) 4014 4015 def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str: 4016 return self._regexp_extract_sql(expression) 4017 4018 def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str: 4019 this = expression.this 4020 pattern = expression.expression 4021 position = expression.args.get("position") 4022 orig_occ = expression.args.get("occurrence") 4023 occurrence = orig_occ or exp.Literal.number(1) 4024 option = expression.args.get("option") 4025 parameters = expression.args.get("parameters") 4026 4027 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 4028 if validated_flags: 4029 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 4030 4031 # Handle starting position offset 4032 pos_offset: exp.Expr = exp.Literal.number(0) 4033 if position and (not position.is_int or position.to_py() > 1): 4034 this = exp.Substring(this=this, start=position) 4035 pos_offset = position - exp.Literal.number(1) 4036 4037 # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x))) 4038 def sum_lengths(func_name: str, end: exp.Expr) -> exp.Expr: 4039 lst = exp.Bracket( 4040 this=exp.Anonymous(this=func_name, expressions=[this, pattern]), 4041 expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)], 4042 offset=1, 4043 ) 4044 transform = exp.Anonymous( 4045 this="LIST_TRANSFORM", 4046 expressions=[ 4047 lst, 4048 exp.Lambda( 4049 this=exp.Length(this=exp.to_identifier("x")), 4050 expressions=[exp.to_identifier("x")], 4051 ), 4052 ], 4053 ) 4054 return exp.Coalesce( 4055 this=exp.Anonymous(this="LIST_SUM", expressions=[transform]), 4056 expressions=[exp.Literal.number(0)], 4057 ) 4058 4059 # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset 4060 base_pos: exp.Expr = ( 4061 exp.Literal.number(1) 4062 + sum_lengths("STRING_SPLIT_REGEX", occurrence) 4063 + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1)) 4064 + pos_offset 4065 ) 4066 4067 # option=1: add match length for end position 4068 if option and option.is_int and option.to_py() == 1: 4069 match_at_occ = exp.Bracket( 4070 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]), 4071 expressions=[occurrence], 4072 offset=1, 4073 ) 4074 base_pos = base_pos + exp.Coalesce( 4075 this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)] 4076 ) 4077 4078 # NULL checks for all provided arguments 4079 # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST 4080 null_args = [ 4081 expression.this, 4082 expression.expression, 4083 position, 4084 orig_occ, 4085 option, 4086 parameters, 4087 ] 4088 null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg] 4089 4090 matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 4091 4092 return self.sql( 4093 exp.case() 4094 
.when(exp.or_(*null_checks), exp.Null()) 4095 .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0)) 4096 .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0)) 4097 .else_(base_pos) 4098 ) 4099 4100 @unsupported_args("culture") 4101 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 4102 fmt = expression.args.get("format") 4103 if fmt and fmt.is_int: 4104 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 4105 4106 self.unsupported("Only integer formats are supported by NumberToStr") 4107 return self.function_fallback_sql(expression) 4108 4109 def autoincrementcolumnconstraint_sql(self, _) -> str: 4110 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 4111 return "" 4112 4113 def aliases_sql(self, expression: exp.Aliases) -> str: 4114 this = expression.this 4115 if isinstance(this, exp.Posexplode): 4116 return self.posexplode_sql(this) 4117 4118 return super().aliases_sql(expression) 4119 4120 def posexplode_sql(self, expression: exp.Posexplode) -> str: 4121 this = expression.this 4122 parent = expression.parent 4123 4124 # The default Spark aliases are "pos" and "col", unless specified otherwise 4125 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 4126 4127 if isinstance(parent, exp.Aliases): 4128 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 4129 pos, col = parent.expressions 4130 elif isinstance(parent, exp.Table): 4131 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 4132 alias = parent.args.get("alias") 4133 if alias: 4134 pos, col = alias.columns or [pos, col] 4135 alias.pop() 4136 4137 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 4138 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 4139 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 4140 gen_subscripts = self.sql( 4141 exp.Alias( 4142 this=exp.Anonymous( 4143 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 4144 ) 4145 - exp.Literal.number(1), 4146 alias=pos, 4147 ) 4148 ) 4149 4150 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 4151 4152 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 4153 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 4154 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 4155 4156 return posexplode_sql 4157 4158 def addmonths_sql(self, expression: exp.AddMonths) -> str: 4159 """ 4160 Handles three key issues: 4161 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 4162 2. End-of-month preservation: If input is last day of month, result is last day of result month 4163 3. 
Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 4164 """ 4165 from sqlglot.optimizer.annotate_types import annotate_types 4166 4167 this = expression.this 4168 if not this.type: 4169 this = annotate_types(this, dialect=self.dialect) 4170 4171 if this.is_type(*exp.DataType.TEXT_TYPES): 4172 this = exp.Cast(this=this, to=exp.DataType(this=exp.DType.TIMESTAMP)) 4173 4174 # Detect float/decimal months to apply rounding (Snowflake behavior) 4175 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 4176 months_expr = expression.expression 4177 if not months_expr.type: 4178 months_expr = annotate_types(months_expr, dialect=self.dialect) 4179 4180 # Build interval or to_months expression based on type 4181 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 4182 interval_or_to_months = ( 4183 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 4184 if months_expr.is_type( 4185 exp.DType.FLOAT, 4186 exp.DType.DOUBLE, 4187 exp.DType.DECIMAL, 4188 ) 4189 # Integer case: standard INTERVAL N MONTH syntax 4190 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 4191 ) 4192 4193 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 4194 4195 # Apply end-of-month preservation if Snowflake flag is set 4196 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 4197 preserve_eom = expression.args.get("preserve_end_of_month") 4198 result_expr = ( 4199 exp.case() 4200 .when( 4201 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 4202 exp.func("LAST_DAY", date_add_expr), 4203 ) 4204 .else_(date_add_expr) 4205 if preserve_eom 4206 else date_add_expr 4207 ) 4208 4209 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 4210 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 4211 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 4212 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 4213 if this.is_type(exp.DType.DATE, exp.DType.TIMESTAMPTZ): 4214 return self.sql(exp.Cast(this=result_expr, to=this.type)) 4215 return self.sql(result_expr) 4216 4217 def format_sql(self, expression: exp.Format) -> str: 4218 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 4219 return self.func("FORMAT", "'{}'", expression.expressions[0]) 4220 4221 return self.function_fallback_sql(expression) 4222 4223 def hexstring_sql( 4224 self, expression: exp.HexString, binary_function_repr: str | None = None 4225 ) -> str: 4226 # UNHEX('FF') correctly produces blob \xFF in DuckDB 4227 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 4228 4229 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 4230 unit = expression.args.get("unit") 4231 date = expression.this 4232 4233 week_start = _week_unit_to_dow(unit) 4234 unit = unit_to_str(expression) 4235 4236 if week_start: 4237 result = self.sql( 4238 _build_week_trunc_expression(date, week_start, preserve_start_day=True) 4239 ) 4240 else: 4241 result = self.func("DATE_TRUNC", unit, date) 4242 4243 if ( 4244 expression.args.get("input_type_preserved") 4245 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 4246 and not (is_date_unit(unit) and date.is_type(exp.DType.DATE)) 4247 ): 4248 return self.sql(exp.Cast(this=result, to=date.type)) 4249 4250 return result 4251 4252 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 4253 unit = 
unit_to_str(expression) 4254 zone = expression.args.get("zone") 4255 timestamp = expression.this 4256 date_unit = is_date_unit(unit) 4257 4258 if date_unit and zone: 4259 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 4260 # Double AT TIME ZONE needed for BigQuery compatibility: 4261 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 4262 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 4263 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 4264 result_sql = self.func("DATE_TRUNC", unit, timestamp) 4265 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 4266 4267 result = self.func("DATE_TRUNC", unit, timestamp) 4268 if expression.args.get("input_type_preserved"): 4269 if timestamp.type and timestamp.is_type(exp.DType.TIME, exp.DType.TIMETZ): 4270 dummy_date = exp.Cast( 4271 this=exp.Literal.string("1970-01-01"), 4272 to=exp.DataType(this=exp.DType.DATE), 4273 ) 4274 date_time = exp.Add(this=dummy_date, expression=timestamp) 4275 result = self.func("DATE_TRUNC", unit, date_time) 4276 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4277 4278 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 4279 date_unit and timestamp.is_type(exp.DType.DATE) 4280 ): 4281 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4282 4283 return result 4284 4285 def trim_sql(self, expression: exp.Trim) -> str: 4286 expression.this.replace(_cast_to_varchar(expression.this)) 4287 if expression.expression: 4288 expression.expression.replace(_cast_to_varchar(expression.expression)) 4289 4290 result_sql = super().trim_sql(expression) 4291 return _gen_with_cast_to_blob(self, expression, result_sql) 4292 4293 def round_sql(self, expression: exp.Round) -> str: 4294 this = expression.this 4295 decimals = expression.args.get("decimals") 4296 truncate = expression.args.get("truncate") 4297 4298 # DuckDB requires the scale (decimals) argument to be an INT 4299 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 4300 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 4301 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 4302 decimals = exp.cast(decimals, exp.DType.INT) 4303 4304 func = "ROUND" 4305 if truncate: 4306 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 4307 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 4308 func = "ROUND_EVEN" 4309 truncate = None 4310 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 4311 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 4312 truncate = None 4313 4314 return self.func(func, this, decimals, truncate) 4315 4316 def strtok_sql(self, expression: exp.Strtok) -> str: 4317 string_arg = expression.this 4318 delimiter_arg = expression.args.get("delimiter") 4319 part_index_arg = expression.args.get("part_index") 4320 4321 if delimiter_arg and part_index_arg: 4322 # Escape regex chars and build character class at runtime using REGEXP_REPLACE 4323 escaped_delimiter = exp.Anonymous( 4324 this="REGEXP_REPLACE", 4325 expressions=[ 4326 delimiter_arg, 4327 exp.Literal.string( 4328 r"([\[\]^.\-*+?(){}|$\\])" 4329 ), # Escape problematic regex chars 4330 exp.Literal.string( 4331 r"\\\1" 4332 ), # Replace with escaped version using \1 backreference 4333 exp.Literal.string("g"), # Global flag 4334 ], 4335 ) 4336 # CASE WHEN delimiter = '' THEN '' ELSE CONCAT('[', 
escaped_delimiter, ']') END 4337 regex_pattern = ( 4338 exp.case() 4339 .when(delimiter_arg.eq(exp.Literal.string("")), exp.Literal.string("")) 4340 .else_( 4341 exp.func( 4342 "CONCAT", 4343 exp.Literal.string("["), 4344 escaped_delimiter, 4345 exp.Literal.string("]"), 4346 ) 4347 ) 4348 ) 4349 4350 # STRTOK skips empty strings, so we need to filter them out 4351 # LIST_FILTER(REGEXP_SPLIT_TO_ARRAY(string, pattern), x -> x != '')[index] 4352 split_array = exp.func("REGEXP_SPLIT_TO_ARRAY", string_arg, regex_pattern) 4353 x = exp.to_identifier("x") 4354 is_empty = x.eq(exp.Literal.string("")) 4355 filtered_array = exp.func( 4356 "LIST_FILTER", 4357 split_array, 4358 exp.Lambda(this=exp.not_(is_empty.copy()), expressions=[x.copy()]), 4359 ) 4360 base_func = exp.Bracket( 4361 this=filtered_array, 4362 expressions=[part_index_arg], 4363 offset=1, 4364 ) 4365 4366 # Use template with the built regex pattern 4367 result = exp.replace_placeholders( 4368 self.STRTOK_TEMPLATE.copy(), 4369 string=string_arg, 4370 delimiter=delimiter_arg, 4371 part_index=part_index_arg, 4372 base_func=base_func, 4373 ) 4374 4375 return self.sql(result) 4376 4377 return self.function_fallback_sql(expression) 4378 4379 def strtoktoarray_sql(self, expression: exp.StrtokToArray) -> str: 4380 string_arg = expression.this 4381 delimiter_arg = expression.args.get("expression") or exp.Literal.string(" ") 4382 4383 escaped = exp.RegexpReplace( 4384 this=delimiter_arg.copy(), 4385 expression=exp.Literal.string(r"([\[\]^.\-*+?(){}|$\\])"), 4386 replacement=exp.Literal.string(r"\\\1"), 4387 modifiers=exp.Literal.string("g"), 4388 ) 4389 return self.sql( 4390 exp.replace_placeholders( 4391 self.STRTOK_TO_ARRAY_TEMPLATE.copy(), 4392 string=string_arg, 4393 delimiter=delimiter_arg, 4394 escaped=escaped, 4395 ) 4396 ) 4397 4398 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 4399 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 4400 4401 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 4402 if expression.is_type(*exp.DataType.REAL_TYPES): 4403 result = f"CAST({result} AS DOUBLE)" 4404 4405 return result 4406 4407 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 4408 """ 4409 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 4410 dividing the input distribution into n equal-sized buckets. 4411 4412 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 4413 does not document the specific algorithm used so results may differ. DuckDB does not 4414 support RESPECT NULLS. 
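        Illustrative mapping: APPROX_QUANTILES(x, 4) is rewritten to the DuckDB
        equivalent of APPROX_QUANTILE(x, [0, 0.25, 0.5, 0.75, 1]).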
4415 """ 4416 this = expression.this 4417 if isinstance(this, exp.Distinct): 4418 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 4419 if len(this.expressions) < 2: 4420 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 4421 return self.function_fallback_sql(expression) 4422 num_quantiles_expr = this.expressions[1].pop() 4423 else: 4424 num_quantiles_expr = expression.expression 4425 4426 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 4427 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4428 return self.function_fallback_sql(expression) 4429 4430 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 4431 if num_quantiles <= 0: 4432 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4433 return self.function_fallback_sql(expression) 4434 4435 quantiles = [ 4436 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 4437 for i in range(num_quantiles + 1) 4438 ] 4439 4440 return self.sql(exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))) 4441 4442 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 4443 if expression.args.get("scalar_only"): 4444 expression = exp.JSONExtractScalar( 4445 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 4446 ) 4447 return _arrow_json_extract_sql(self, expression) 4448 4449 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 4450 this = expression.this 4451 4452 if _is_binary(this): 4453 expression.type = exp.DType.BINARY.into_expr() 4454 4455 arg = _cast_to_bit(this) 4456 4457 if isinstance(this, exp.Neg): 4458 arg = exp.Paren(this=arg) 4459 4460 expression.set("this", arg) 4461 4462 result_sql = f"~{self.sql(expression, 'this')}" 4463 4464 return _gen_with_cast_to_blob(self, expression, result_sql) 4465 4466 def window_sql(self, expression: exp.Window) -> str: 4467 this = expression.this 4468 if isinstance(this, exp.Corr) or ( 4469 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 4470 ): 4471 return self._corr_sql(expression) 4472 4473 return super().window_sql(expression) 4474 4475 def filter_sql(self, expression: exp.Filter) -> str: 4476 if isinstance(expression.this, exp.Corr): 4477 return self._corr_sql(expression) 4478 4479 return super().filter_sql(expression) 4480 4481 def _corr_sql( 4482 self, 4483 expression: exp.Filter | exp.Window | exp.Corr, 4484 ) -> str: 4485 if isinstance(expression, exp.Corr) and not expression.args.get("null_on_zero_variance"): 4486 return self.func("CORR", expression.this, expression.expression) 4487 4488 corr_expr = _maybe_corr_null_to_false(expression) 4489 if corr_expr is None: 4490 if isinstance(expression, exp.Window): 4491 return super().window_sql(expression) 4492 if isinstance(expression, exp.Filter): 4493 return super().filter_sql(expression) 4494 corr_expr = expression # make mypy happy 4495 4496 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr)) 4497 4498 def uuid_sql(self, expression: exp.Uuid) -> str: 4499 namespace = expression.this 4500 name = expression.args.get("name") 4501 4502 # UUID v5 (namespace + name) - Emulate using SHA1 4503 if namespace and name: 4504 result = exp.replace_placeholders( 4505 self.UUID_V5_TEMPLATE.copy(), 4506 namespace=namespace, 4507 name=name, 4508 ) 4509 return self.sql(result) 4510 4511 return super().uuid_sql(expression)
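# Note on uuid_sql above: the namespace + name form corresponds to RFC 4122 version-5
# UUIDs (e.g. Snowflake's UUID_STRING(ns, name)); UUID_V5_TEMPLATE emulates this via
# SHA1, while a bare UUID falls through to the parent generator's random UUID.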
1456class DuckDBGenerator(generator.Generator): 1457 PARAMETER_TOKEN = "$" 1458 NAMED_PLACEHOLDER_TOKEN = "$" 1459 JOIN_HINTS = False 1460 TABLE_HINTS = False 1461 QUERY_HINTS = False 1462 LIMIT_FETCH = "LIMIT" 1463 STRUCT_DELIMITER = ("(", ")") 1464 RENAME_TABLE_WITH_DB = False 1465 NVL2_SUPPORTED = False 1466 SEMI_ANTI_JOIN_WITH_SIDE = False 1467 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1468 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1469 LAST_DAY_SUPPORTS_DATE_PART = False 1470 JSON_KEY_VALUE_PAIR_SEP = "," 1471 IGNORE_NULLS_IN_FUNC = True 1472 IGNORE_NULLS_BEFORE_ORDER = False 1473 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1474 SUPPORTS_CREATE_TABLE_LIKE = False 1475 MULTI_ARG_DISTINCT = False 1476 CAN_IMPLEMENT_ARRAY_ANY = True 1477 SUPPORTS_TO_NUMBER = False 1478 SELECT_KINDS: tuple[str, ...] = () 1479 SUPPORTS_DECODE_CASE = False 1480 SUPPORTS_DROP_ALTER_ICEBERG_PROPERTY = False 1481 1482 AFTER_HAVING_MODIFIER_TRANSFORMS = generator.AFTER_HAVING_MODIFIER_TRANSFORMS 1483 SUPPORTS_WINDOW_EXCLUDE = True 1484 COPY_HAS_INTO_KEYWORD = False 1485 STAR_EXCEPT = "EXCLUDE" 1486 PAD_FILL_PATTERN_IS_REQUIRED = True 1487 ARRAY_SIZE_DIM_REQUIRED: bool | None = False 1488 NORMALIZE_EXTRACT_DATE_PARTS = True 1489 SUPPORTS_LIKE_QUANTIFIERS = False 1490 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1491 1492 TRANSFORMS = { 1493 **generator.Generator.TRANSFORMS, 1494 exp.AnyValue: _anyvalue_sql, 1495 exp.ApproxDistinct: approx_count_distinct_sql, 1496 exp.Boolnot: _boolnot_sql, 1497 exp.Booland: _booland_sql, 1498 exp.Boolor: _boolor_sql, 1499 exp.Array: transforms.preprocess( 1500 [transforms.inherit_struct_field_names], 1501 generator=inline_array_unless_query, 1502 ), 1503 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1504 exp.ArrayCompact: array_compact_sql, 1505 exp.ArrayConstructCompact: lambda self, e: self.sql( 1506 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1507 ), 1508 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1509 exp.ArrayContains: _array_contains_sql, 1510 exp.ArrayOverlaps: _array_overlaps_sql, 1511 exp.ArrayFilter: rename_func("LIST_FILTER"), 1512 exp.ArrayInsert: _array_insert_sql, 1513 exp.ArrayPosition: lambda self, e: ( 1514 self.sql( 1515 exp.Sub( 1516 this=exp.ArrayPosition(this=e.this, expression=e.expression), 1517 expression=exp.Literal.number(1), 1518 ) 1519 ) 1520 if e.args.get("zero_based") 1521 else self.func("ARRAY_POSITION", e.this, e.expression) 1522 ), 1523 exp.ArrayRemoveAt: _array_remove_at_sql, 1524 exp.ArrayRemove: remove_from_array_using_filter, 1525 exp.ArraySort: _array_sort_sql, 1526 exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True), 1527 exp.ArraySum: rename_func("LIST_SUM"), 1528 exp.ArrayMax: rename_func("LIST_MAX"), 1529 exp.ArrayMin: rename_func("LIST_MIN"), 1530 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1531 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1532 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1533 exp.BitwiseAndAgg: _bitwise_agg_sql, 1534 exp.BitwiseCount: rename_func("BIT_COUNT"), 1535 exp.BitwiseLeftShift: _bitshift_sql, 1536 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1537 exp.BitwiseOrAgg: _bitwise_agg_sql, 1538 exp.BitwiseRightShift: _bitshift_sql, 1539 exp.BitwiseXorAgg: _bitwise_agg_sql, 1540 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1541 exp.Corr: lambda self, e: self._corr_sql(e), 1542 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1543 exp.CurrentTime: 
lambda *_: "CURRENT_TIME", 1544 exp.CurrentSchemas: lambda self, e: self.func( 1545 "current_schemas", e.this if e.this else exp.true() 1546 ), 1547 exp.CurrentTimestamp: lambda self, e: ( 1548 self.sql( 1549 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1550 ) 1551 if e.args.get("sysdate") 1552 else "CURRENT_TIMESTAMP" 1553 ), 1554 exp.CurrentVersion: rename_func("version"), 1555 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1556 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1557 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1558 exp.DayOfWeekIso: rename_func("ISODOW"), 1559 exp.DayOfYear: rename_func("DAYOFYEAR"), 1560 exp.Dayname: lambda self, e: ( 1561 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1562 if e.args.get("abbreviated") 1563 else self.func("DAYNAME", e.this) 1564 ), 1565 exp.Monthname: lambda self, e: ( 1566 self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1567 if e.args.get("abbreviated") 1568 else self.func("MONTHNAME", e.this) 1569 ), 1570 exp.DataType: _datatype_sql, 1571 exp.Date: _date_sql, 1572 exp.DateAdd: _date_delta_to_binary_interval_op(), 1573 exp.DateFromParts: _date_from_parts_sql, 1574 exp.DateSub: _date_delta_to_binary_interval_op(), 1575 exp.DateDiff: _date_diff_sql, 1576 exp.DateStrToDate: datestrtodate_sql, 1577 exp.Datetime: no_datetime_sql, 1578 exp.DatetimeDiff: _date_diff_sql, 1579 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1580 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1581 exp.DateToDi: lambda self, e: ( 1582 f"CAST(STRFTIME({self.sql(e, 'this')}, {self.dialect.DATEINT_FORMAT}) AS INT)" 1583 ), 1584 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1585 exp.HexDecodeString: lambda self, e: self.sql(exp.Decode(this=exp.Unhex(this=e.this))), 1586 exp.DiToDate: lambda self, e: ( 1587 f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {self.dialect.DATEINT_FORMAT}) AS DATE)" 1588 ), 1589 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1590 exp.EqualNull: lambda self, e: self.sql( 1591 exp.NullSafeEQ(this=e.this, expression=e.expression) 1592 ), 1593 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1594 exp.GenerateDateArray: _generate_datetime_array_sql, 1595 exp.GenerateSeries: generate_series_sql("GENERATE_SERIES", "RANGE"), 1596 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1597 exp.Getbit: getbit_sql, 1598 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 1599 exp.Explode: rename_func("UNNEST"), 1600 exp.IcebergProperty: lambda *_: "", 1601 exp.IntDiv: lambda self, e: self.binary(e, "//"), 1602 exp.IsInf: rename_func("ISINF"), 1603 exp.IsNan: rename_func("ISNAN"), 1604 exp.IsNullValue: lambda self, e: self.sql( 1605 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 1606 ), 1607 exp.IsArray: lambda self, e: self.sql( 1608 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 1609 ), 1610 exp.Ceil: _ceil_floor, 1611 exp.Floor: _ceil_floor, 1612 exp.JSONBExists: rename_func("JSON_EXISTS"), 1613 exp.JSONExtract: _arrow_json_extract_sql, 1614 exp.JSONExtractArray: _json_extract_value_array_sql, 1615 exp.JSONFormat: _json_format_sql, 1616 exp.JSONValueArray: _json_extract_value_array_sql, 1617 exp.Lateral: _explode_to_unnest_sql, 1618 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 1619 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 1620 exp.Select: 
transforms.preprocess([_seq_to_range_in_generator]), 1621 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 1622 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 1623 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 1624 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 1625 exp.BoolxorAgg: _boolxor_agg_sql, 1626 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 1627 exp.Initcap: _initcap_sql, 1628 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 1629 exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"), 1630 exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True), 1631 exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"), 1632 exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True), 1633 exp.MonthsBetween: months_between_sql, 1634 exp.NextDay: _day_navigation_sql, 1635 exp.PercentileCont: rename_func("QUANTILE_CONT"), 1636 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 1637 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 1638 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 1639 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 1640 exp.PreviousDay: _day_navigation_sql, 1641 exp.RegexpILike: lambda self, e: self.func( 1642 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 1643 ), 1644 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 1645 exp.RegrValx: _regr_val_sql, 1646 exp.RegrValy: _regr_val_sql, 1647 exp.Return: lambda self, e: self.sql(e, "this"), 1648 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 1649 exp.StrToUnix: lambda self, e: self.func( 1650 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 1651 ), 1652 exp.Struct: _struct_sql, 1653 exp.Transform: rename_func("LIST_TRANSFORM"), 1654 exp.TimeAdd: _date_delta_to_binary_interval_op(), 1655 exp.TimeSub: _date_delta_to_binary_interval_op(), 1656 exp.Time: no_time_sql, 1657 exp.TimeDiff: _timediff_sql, 1658 exp.Timestamp: no_timestamp_sql, 1659 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 1660 exp.TimestampDiff: lambda self, e: self.func( 1661 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 1662 ), 1663 exp.TimestampSub: _date_delta_to_binary_interval_op(), 1664 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DType.DATE)), 1665 exp.TimeStrToTime: timestrtotime_sql, 1666 exp.TimeStrToUnix: lambda self, e: self.func( 1667 "EPOCH", exp.cast(e.this, exp.DType.TIMESTAMP) 1668 ), 1669 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 1670 exp.ToBoolean: _to_boolean_sql, 1671 exp.ToVariant: lambda self, e: self.sql( 1672 exp.cast(e.this, exp.DataType.from_str("VARIANT", dialect="duckdb")) 1673 ), 1674 exp.TimeToUnix: rename_func("EPOCH"), 1675 exp.TsOrDiToDi: lambda self, e: ( 1676 f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)" 1677 ), 1678 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 1679 exp.TsOrDsDiff: lambda self, e: self.func( 1680 "DATE_DIFF", 1681 f"'{e.args.get('unit') or 'DAY'}'", 1682 exp.cast(e.expression, exp.DType.TIMESTAMP), 1683 exp.cast(e.this, exp.DType.TIMESTAMP), 1684 ), 1685 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 1686 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 1687 exp.UnixSeconds: lambda self, e: self.sql( 1688 exp.cast(self.func("EPOCH", 
        exp.UnixSeconds: lambda self, e: self.sql(
            exp.cast(self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DType.BIGINT)
        ),
        exp.UnixToStr: lambda self, e: self.func(
            "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e)
        ),
        exp.DatetimeTrunc: lambda self, e: self.func(
            "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DType.DATETIME)
        ),
        exp.UnixToTime: _unix_to_time_sql,
        exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)",
        exp.VariancePop: rename_func("VAR_POP"),
        exp.WeekOfYear: rename_func("WEEKOFYEAR"),
        exp.YearOfWeek: lambda self, e: self.sql(
            exp.Extract(
                this=exp.Var(this="ISOYEAR"),
                expression=e.this,
            )
        ),
        exp.YearOfWeekIso: lambda self, e: self.sql(
            exp.Extract(
                this=exp.Var(this="ISOYEAR"),
                expression=e.this,
            )
        ),
        exp.Xor: _xor_sql,
        exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"),
        exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"),
        exp.DateBin: rename_func("TIME_BUCKET"),
        exp.LastDay: _last_day_sql,
    }

    SUPPORTED_JSON_PATH_PARTS = {
        exp.JSONPathKey,
        exp.JSONPathRoot,
        exp.JSONPathSubscript,
        exp.JSONPathWildcard,
    }

    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        exp.DType.BINARY: "BLOB",
        exp.DType.BPCHAR: "TEXT",
        exp.DType.CHAR: "TEXT",
        exp.DType.DATETIME: "TIMESTAMP",
        exp.DType.DECFLOAT: "DECIMAL",
        exp.DType.FLOAT: "REAL",
        exp.DType.JSONB: "JSON",
        exp.DType.NCHAR: "TEXT",
        exp.DType.NVARCHAR: "TEXT",
        exp.DType.UINT: "UINTEGER",
        exp.DType.VARBINARY: "BLOB",
        exp.DType.ROWVERSION: "BLOB",
        exp.DType.VARCHAR: "TEXT",
        exp.DType.TIMESTAMPLTZ: "TIMESTAMPTZ",
        exp.DType.TIMESTAMPNTZ: "TIMESTAMP",
        exp.DType.TIMESTAMP_S: "TIMESTAMP_S",
        exp.DType.TIMESTAMP_MS: "TIMESTAMP_MS",
        exp.DType.TIMESTAMP_NS: "TIMESTAMP_NS",
        exp.DType.BIGDECIMAL: "DECIMAL",
    }

    TYPE_PARAM_SETTINGS = {
        **generator.Generator.TYPE_PARAM_SETTINGS,
        exp.DType.BIGDECIMAL: ((38, 5), (38, 38)),
        exp.DType.DECFLOAT: ((38, 5), (38, 38)),
    }

    # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77
    RESERVED_KEYWORDS = {
        "array",
        "analyse",
        "union",
        "all",
        "when",
        "in_p",
        "default",
        "create_p",
        "window",
        "asymmetric",
        "to",
        "else",
        "localtime",
        "from",
        "end_p",
        "select",
        "current_date",
        "foreign",
        "with",
        "grant",
        "session_user",
        "or",
        "except",
        "references",
        "fetch",
        "limit",
        "group_p",
        "leading",
        "into",
        "collate",
        "offset",
        "do",
        "then",
        "localtimestamp",
        "check_p",
        "lateral_p",
        "current_role",
        "where",
        "asc_p",
        "placing",
        "desc_p",
        "user",
        "unique",
        "initially",
        "column",
        "both",
        "some",
        "as",
        "any",
        "only",
        "deferrable",
        "null_p",
        "current_time",
        "true_p",
        "table",
        "case",
        "trailing",
        "variadic",
        "for",
        "on",
        "distinct",
        "false_p",
        "not",
        "constraint",
        "current_timestamp",
        "returning",
        "primary",
        "intersect",
        "having",
        "analyze",
        "current_user",
        "and",
        "cast",
        "symmetric",
        "using",
        "order",
        "current_catalog",
    }
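    # Hedged examples of TYPE_MAPPING above: CAST(x AS NVARCHAR) and
    # CAST(x AS CHAR) both render as CAST(x AS TEXT), and DATETIME renders as
    # TIMESTAMP, since DuckDB collapses these to its native types.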
    UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren)

    # DuckDB doesn't generally support CREATE TABLE .. properties
    # https://duckdb.org/docs/sql/statements/create_table.html
    # There are a few exceptions (e.g. temporary tables) which are supported or
    # can be transpiled to DuckDB, so we explicitly override them accordingly
    PROPERTIES_LOCATION = {
        **{
            prop: exp.Properties.Location.UNSUPPORTED
            for prop in generator.Generator.PROPERTIES_LOCATION
        },
        exp.LikeProperty: exp.Properties.Location.POST_SCHEMA,
        exp.TemporaryProperty: exp.Properties.Location.POST_CREATE,
        exp.ReturnsProperty: exp.Properties.Location.POST_ALIAS,
        exp.SequenceProperties: exp.Properties.Location.POST_EXPRESSION,
        exp.IcebergProperty: exp.Properties.Location.POST_CREATE,
    }

    IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS: t.ClassVar = _IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS

    # Template for ZIPF transpilation - placeholders get replaced with actual parameters
    ZIPF_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        WITH rand AS (SELECT :random_expr AS r),
        weights AS (
            SELECT i, 1.0 / POWER(i, :s) AS w
            FROM RANGE(1, :n + 1) AS t(i)
        ),
        cdf AS (
            SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p
            FROM weights
        )
        SELECT MIN(i)
        FROM cdf
        WHERE p >= (SELECT r FROM rand)
        """
    )

    # Template for NORMAL transpilation using Box-Muller transform
    # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2))
    NORMAL_TEMPLATE: exp.Expr = exp.maybe_parse(
        ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))"
    )

    # Template for generating a seeded pseudo-random value in [0, 1) from a hash
    SEEDED_RANDOM_TEMPLATE: exp.Expr = exp.maybe_parse("(ABS(HASH(:seed)) % 1000000) / 1000000.0")

    # Template for generating signed and unsigned SEQ values within a specified range
    SEQ_UNSIGNED: exp.Expr = _SEQ_UNSIGNED
    SEQ_SIGNED: exp.Expr = _SEQ_SIGNED

    # Template for MAP_CAT transpilation - Snowflake semantics:
    # 1. Returns NULL if either input is NULL
    # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k]))
    # 3. Filters out entries with NULL values from the result
    MAPCAT_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        CASE
            WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL
            ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM(
                LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))),
                __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k]))
            ), __x -> __x.value IS NOT NULL))
        END
        """
    )

    # Mappings for EXTRACT/DATE_PART transpilation
    # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes
    EXTRACT_STRFTIME_MAPPINGS: dict[str, tuple[str, str]] = {
        "WEEKISO": ("%V", "INTEGER"),
        "YEAROFWEEK": ("%G", "INTEGER"),
        "YEAROFWEEKISO": ("%G", "INTEGER"),
        "NANOSECOND": ("%n", "BIGINT"),
    }

    # Maps epoch-based specifiers to DuckDB epoch functions
    EXTRACT_EPOCH_MAPPINGS: dict[str, str] = {
        "EPOCH_SECOND": "EPOCH",
        "EPOCH_MILLISECOND": "EPOCH_MS",
        "EPOCH_MICROSECOND": "EPOCH_US",
        "EPOCH_NANOSECOND": "EPOCH_NS",
    }
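    # Hedged illustration of the strftime mappings above (see extract_sql below
    # for the full lowering):
    #   EXTRACT(WEEKISO FROM d)    -> CAST(STRFTIME(d, '%V') AS INTEGER)
    #   EXTRACT(YEAROFWEEK FROM d) -> CAST(STRFTIME(d, '%G') AS INTEGER)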
    # Template for BITMAP_CONSTRUCT_AGG transpilation
    #
    # BACKGROUND:
    # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap.
    # Supports values in the range 0-32767; this version returns NULL if any value is out of range
    # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg
    # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts
    #
    # Snowflake uses two different formats based on the number of unique values:
    #
    # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes
    #   Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003)
    #   Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes
    #   Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex)
    #                                 count v1   v2   v3   pad
    #
    # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes
    #   Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes
    #   Bytes 10+: Each value as 2-byte little-endian integer (no padding)
    #   Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500
    #                                  hdr  ----9 zero bytes---- v1   v2   v3   v4   v5
    #
    # TEMPLATE STRUCTURE
    #
    # Phase 1 - Innermost subquery: Data preparation
    #   SELECT LIST_SORT(...) AS l
    #   - Aggregates all input values into a list; removes NULLs and duplicates, then sorts
    #   Result: Clean, sorted list of unique non-null integers stored as 'l'
    #
    # Phase 2 - Middle subquery: Hex string construction
    #   LIST_TRANSFORM(...)
    #   - Converts each integer to 2-byte little-endian hex representation
    #   - & 255 extracts low byte, >> 8 extracts high byte
    #   - LIST_REDUCE: Concatenates all hex pairs into single string 'h'
    #   Result: Hex string of all values
    #
    # Phase 3 - Outer SELECT: Final bitmap assembly
    #   LENGTH(l) < 5:
    #   - Small format: 2-byte count (big-endian via %04X) + values + zero padding
    #   LENGTH(l) >= 5:
    #   - Large format: Fixed 10-byte header + values (no padding needed)
    #   Result: Complete binary bitmap as BLOB
    #
    BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        SELECT CASE
            WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL
            WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL
            WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2))
            ELSE UNHEX('08000000000000000000' || h)
        END
        FROM (
            SELECT l, COALESCE(LIST_REDUCE(
                LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)),
                (__a, __b) -> __a || __b, ''
            ), '') AS h
            FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l)
        )
        """
    )

    # Template for RANDSTR transpilation - placeholders get replaced with actual parameters
    RANDSTR_TEMPLATE: exp.Expr = exp.maybe_parse(
        f"""
        SELECT LISTAGG(
            SUBSTRING(
                '{RANDSTR_CHAR_POOL}',
                1 + CAST(FLOOR(random_value * 62) AS INT),
                1
            ),
            ''
        )
        FROM (
            SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value
            FROM RANGE(:length) AS t(i)
        )
        """,
    )
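    # Hedged sketch of the template above: RANDSTR(5, 42) becomes a scalar
    # subquery over RANGE(5), picking one character of the 62-character pool
    # per row via HASH(i + 42), so a fixed seed always yields the same string.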
    # Template for MINHASH transpilation
    # Computes k minimum hash values across aggregated data using DuckDB list functions
    # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1}
    MINHASH_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1)
        FROM (
            SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h
            FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed)
        )
        """,
    )

    # Template for MINHASH_COMBINE transpilation
    # Combines multiple minhash signatures by taking element-wise minimum
    MINHASH_COMBINE_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1)
        FROM (
            SELECT
                pos AS idx,
                MIN(val) AS min_h
            FROM
                UNNEST(LIST(:expr)) AS _(sig),
                UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos)
            GROUP BY pos
        )
        """,
    )

    # Template for APPROXIMATE_SIMILARITY transpilation
    # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree
    APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*)
        FROM (
            SELECT pos, COUNT(DISTINCT h) AS num_distinct
            FROM (
                SELECT h, pos
                FROM UNNEST(LIST(:expr)) AS _(sig),
                     UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos)
            )
            GROUP BY pos
        )
        """,
    )

    # Template for ARRAYS_ZIP transpilation
    # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest
    # Uses RANGE + indexing to match Snowflake behavior
    ARRAYS_ZIP_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        CASE WHEN :null_check THEN NULL
             WHEN :all_empty_check THEN [:empty_struct]
             ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct)
        END
        """,
    )

    UUID_V5_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        (SELECT
            LOWER(
                SUBSTR(h, 1, 8) || '-' ||
                SUBSTR(h, 9, 4) || '-' ||
                '5' || SUBSTR(h, 14, 3) || '-' ||
                FORMAT('{:02x}', CAST('0x' || SUBSTR(h, 17, 2) AS INT) & 63 | 128) || SUBSTR(h, 19, 2) || '-' ||
                SUBSTR(h, 21, 12)
            )
        FROM (
            SELECT SUBSTR(SHA1(UNHEX(REPLACE(:namespace, '-', '')) || ENCODE(:name, 'utf8')), 1, 32) AS h
        ))
        """
    )

    # Shared bag semantics outer frame for ARRAY_EXCEPT and ARRAY_INTERSECTION.
    # Each element is paired with its 1-based position via LIST_ZIP, then filtered
    # by a comparison operator (supplied via :cond) that determines the operation:
    #   EXCEPT (>): keep the N-th occurrence only if N > count in arr2
    #     e.g. [2,2,2] EXCEPT [2,2] -> [2]
    #   INTERSECTION (<=): keep the N-th occurrence only if N <= count in arr2
    #     e.g. [2,2,2] INTERSECT [2,2] -> [2,2]
    # IS NOT DISTINCT FROM is used for NULL-safe element comparison.
    ARRAY_BAG_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        CASE
            WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL
            ELSE LIST_TRANSFORM(
                LIST_FILTER(
                    LIST_ZIP(:arr1, GENERATE_SERIES(1, LEN(:arr1))),
                    pair -> :cond
                ),
                pair -> pair[0]
            )
        END
        """
    )

    ARRAY_EXCEPT_CONDITION: exp.Expr = exp.maybe_parse(
        "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))"
        " > LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))"
    )

    ARRAY_INTERSECTION_CONDITION: exp.Expr = exp.maybe_parse(
        "LEN(LIST_FILTER(:arr1[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))"
        " <= LEN(LIST_FILTER(:arr2, e -> e IS NOT DISTINCT FROM pair[0]))"
    )

    # Set semantics for ARRAY_EXCEPT. Deduplicates arr1 via LIST_DISTINCT, then
    # filters out any element that appears at least once in arr2.
    #   e.g. [1,1,2,3] EXCEPT [1] -> [2,3]
    # IS NOT DISTINCT FROM is used for NULL-safe element comparison.
    ARRAY_EXCEPT_SET_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        CASE
            WHEN :arr1 IS NULL OR :arr2 IS NULL THEN NULL
            ELSE LIST_FILTER(
                LIST_DISTINCT(:arr1),
                e -> LEN(LIST_FILTER(:arr2, x -> x IS NOT DISTINCT FROM e)) = 0
            )
        END
        """
    )

    STRTOK_TO_ARRAY_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        CASE WHEN :delimiter IS NULL THEN NULL
        ELSE LIST_FILTER(
            REGEXP_SPLIT_TO_ARRAY(:string, CASE WHEN :delimiter = '' THEN '.^' ELSE CONCAT('[', :escaped, ']') END),
            x -> NOT x = ''
        ) END
        """
    )

    # Template for STRTOK function transpilation
    #
    # DuckDB itself doesn't have a strtok function. This handles the transpilation from Snowflake to DuckDB.
    # We may need to adjust this if we want to support transpilation from other dialects.
    #
    # CASE
    #     -- Snowflake: empty delimiter + empty input string -> NULL
    #     WHEN delimiter = '' AND input_str = '' THEN NULL
    #
    #     -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return input string if index is 1
    #     WHEN delimiter = '' AND index = 1 THEN input_str
    #
    #     -- Snowflake: empty delimiter + non-empty input string -> treats whole input as 1 token -> return NULL if index is not 1
    #     WHEN delimiter = '' THEN NULL
    #
    #     -- Snowflake: negative indices return NULL
    #     WHEN index < 0 THEN NULL
    #
    #     -- Snowflake: return NULL if any argument is NULL
    #     WHEN input_str IS NULL OR delimiter IS NULL OR index IS NULL THEN NULL
    #
    #     ELSE LIST_FILTER(
    #         REGEXP_SPLIT_TO_ARRAY(
    #             input_str,
    #             CASE
    #                 -- if delimiter is '', we don't want to surround it with '[' and ']' as '[]' is invalid for DuckDB
    #                 WHEN delimiter = '' THEN ''
    #
    #                 -- handle problematic regex characters in delimiter with REGEXP_REPLACE
    #                 -- turn delimiter into a regex char set, otherwise DuckDB will match in order, which we don't want
    #                 ELSE '[' || REGEXP_REPLACE(delimiter, problematic_char_set, '\\\1', 'g') || ']'
    #             END
    #         ),
    #
    #         -- Snowflake: don't return empty strings
    #         x -> NOT x = ''
    #     )[index]
    # END
    STRTOK_TEMPLATE: exp.Expr = exp.maybe_parse(
        """
        CASE
            WHEN :delimiter = '' AND :string = '' THEN NULL
            WHEN :delimiter = '' AND :part_index = 1 THEN :string
            WHEN :delimiter = '' THEN NULL
            WHEN :part_index < 0 THEN NULL
            WHEN :string IS NULL OR :delimiter IS NULL OR :part_index IS NULL THEN NULL
            ELSE :base_func
        END
        """
    )

    def _array_bag_sql(self, condition: exp.Expr, arr1: exp.Expr, arr2: exp.Expr) -> str:
        cond = exp.Paren(this=exp.replace_placeholders(condition, arr1=arr1, arr2=arr2))
        return self.sql(
            exp.replace_placeholders(self.ARRAY_BAG_TEMPLATE, arr1=arr1, arr2=arr2, cond=cond)
        )
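    # Hedged examples of the Snowflake STRTOK semantics encoded above:
    #   STRTOK('a.b.c', '.', 2) -> 'b'      (1-indexed; empty tokens are dropped)
    #   STRTOK('a.b.c', '', 1)  -> 'a.b.c'  (empty delimiter: whole input is one token)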
    def timeslice_sql(self, expression: exp.TimeSlice) -> str:
        """
        Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.

        Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END'])
        DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)

        For 'END' kind, add the interval to get the end of the slice.
        For DATE type with 'END', cast result back to DATE to preserve type.
        """
        date_expr = expression.this
        slice_length = expression.expression
        unit = expression.unit
        kind = expression.text("kind").upper()

        # Create INTERVAL expression: INTERVAL 'N' UNIT
        interval_expr = exp.Interval(this=slice_length, unit=unit)

        # Create base time_bucket expression
        time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr)

        # Check if we need the end of the slice (default is start)
        if kind != "END":
            # For 'START', return time_bucket directly
            return self.sql(time_bucket_expr)

        # For 'END', add the interval to get end of slice
        add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy())

        # If input is DATE type, cast result back to DATE to preserve type
        # DuckDB converts DATE to TIMESTAMP when adding intervals
        if date_expr.is_type(exp.DType.DATE):
            return self.sql(exp.cast(add_expr, exp.DType.DATE))

        return self.sql(add_expr)

    def bitmapbucketnumber_sql(self, expression: exp.BitmapBucketNumber) -> str:
        """
        Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.

        Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
        - Each bucket covers 32,768 values
        - Bucket numbering starts at 1
        - Formula: ((value - 1) // 32768) + 1 for positive values

        For non-positive values (0 and negative), we use value // 32768 to avoid
        producing bucket 0 or positive bucket IDs for negative inputs.
        """
        value = expression.this

        positive_formula = ((value - 1) // 32768) + 1
        non_positive_formula = value // 32768

        # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END
        case_expr = (
            exp.case()
            .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula)
            .else_(non_positive_formula)
        )
        return self.sql(case_expr)

    def bitmapbitposition_sql(self, expression: exp.BitmapBitPosition) -> str:
        """
        Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.

        Snowflake's BITMAP_BIT_POSITION behavior:
        - For n <= 0: returns ABS(n) % 32768
        - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
        """
        this = expression.this

        return self.sql(
            exp.Mod(
                this=exp.Paren(
                    this=exp.If(
                        this=exp.GT(this=this, expression=exp.Literal.number(0)),
                        true=this - exp.Literal.number(1),
                        false=exp.Abs(this=this),
                    )
                ),
                expression=MAX_BIT_POSITION,
            )
        )
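    # Worked examples of the two methods above (derived from the formulas, not
    # from Snowflake documentation):
    #   BITMAP_BUCKET_NUMBER(1)     -> ((1 - 1) // 32768) + 1 = 1
    #   BITMAP_BUCKET_NUMBER(32769) -> ((32769 - 1) // 32768) + 1 = 2
    #   BITMAP_BIT_POSITION(1)      -> (1 - 1) % 32768 = 0
    #   BITMAP_BIT_POSITION(32768)  -> (32768 - 1) % 32768 = 32767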
    def bitmapconstructagg_sql(self, expression: exp.BitmapConstructAgg) -> str:
        """
        Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent.
        Uses a pre-parsed template with placeholders replaced by expression nodes.

        Snowflake bitmap format:
        - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
        - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
        """
        arg = expression.this
        return (
            f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
        )

    def compress_sql(self, expression: exp.Compress) -> str:
        self.unsupported("DuckDB does not support the COMPRESS() function")
        return self.function_fallback_sql(expression)

    def encrypt_sql(self, expression: exp.Encrypt) -> str:
        self.unsupported("ENCRYPT is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decrypt_sql(self, expression: exp.Decrypt) -> str:
        func_name = "TRY_DECRYPT" if expression.args.get("safe") else "DECRYPT"
        self.unsupported(f"{func_name} is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decryptraw_sql(self, expression: exp.DecryptRaw) -> str:
        func_name = "TRY_DECRYPT_RAW" if expression.args.get("safe") else "DECRYPT_RAW"
        self.unsupported(f"{func_name} is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def encryptraw_sql(self, expression: exp.EncryptRaw) -> str:
        self.unsupported("ENCRYPT_RAW is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def parseurl_sql(self, expression: exp.ParseUrl) -> str:
        self.unsupported("PARSE_URL is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def parseip_sql(self, expression: exp.ParseIp) -> str:
        self.unsupported("PARSE_IP is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decompressstring_sql(self, expression: exp.DecompressString) -> str:
        self.unsupported("DECOMPRESS_STRING is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def decompressbinary_sql(self, expression: exp.DecompressBinary) -> str:
        self.unsupported("DECOMPRESS_BINARY is not supported in DuckDB")
        return self.function_fallback_sql(expression)

    def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str:
        this = expression.this
        expr = expression.expression

        if expression.args.get("case_insensitive"):
            this = exp.Upper(this=this)
            expr = exp.Upper(this=expr)

        result = exp.func("JARO_WINKLER_SIMILARITY", this, expr)

        if expression.args.get("integer_scale"):
            result = exp.cast(result * 100, "INTEGER")

        return self.sql(result)

    def nthvalue_sql(self, expression: exp.NthValue) -> str:
        from_first = expression.args.get("from_first", True)
        if not from_first:
            self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end")

        return self.function_fallback_sql(expression)
    def randstr_sql(self, expression: exp.Randstr) -> str:
        """
        Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random.
        Uses a pre-parsed template with placeholders replaced by expression nodes.

        RANDSTR(length, generator) generates a random string of specified length.
        - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
        - With RANDOM(): Use RANDOM() in the hash for non-deterministic output
        - No generator: Use default seed value
        """
        length = expression.this
        generator = expression.args.get("generator")

        if generator:
            if isinstance(generator, exp.Rand):
                # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself
                seed_value = generator.this or generator
            else:
                # Const/int or other expression - use as seed directly
                seed_value = generator
        else:
            # No generator specified, use default seed (arbitrary but deterministic)
            seed_value = exp.Literal.number(RANDSTR_SEED)

        replacements = {"seed": seed_value, "length": length}
        return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"

    @unsupported_args("finish")
    def reduce_sql(self, expression: exp.Reduce) -> str:
        array_arg = expression.this
        initial_value = expression.args.get("initial")
        merge_lambda = expression.args.get("merge")

        if merge_lambda:
            merge_lambda.set("colon", True)

        return self.func("list_reduce", array_arg, merge_lambda, initial_value)
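    # Hedged sketch of reduce_sql's output (the lambda is rendered by
    # lambda_sql below, which switches to DuckDB's LAMBDA syntax when the
    # "colon" flag is set; exact rendering may differ slightly):
    #   REDUCE(arr, 0, (acc, x) -> acc + x)
    #   -> LIST_REDUCE(arr, LAMBDA acc, x: acc + x, 0)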
self.func("TO_BINARY", value) 2449 return f"TRY({result})" if is_safe else result 2450 2451 def tonumber_sql(self, expression: exp.ToNumber) -> str: 2452 fmt = expression.args.get("format") 2453 precision = expression.args.get("precision") 2454 scale = expression.args.get("scale") 2455 2456 if not fmt and precision and scale: 2457 return self.sql( 2458 exp.cast( 2459 expression.this, f"DECIMAL({precision.name}, {scale.name})", dialect="duckdb" 2460 ) 2461 ) 2462 2463 return super().tonumber_sql(expression) 2464 2465 def _greatest_least_sql(self, expression: exp.Greatest | exp.Least) -> str: 2466 """ 2467 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 2468 2469 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2470 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2471 """ 2472 # Get all arguments 2473 all_args = [expression.this, *expression.expressions] 2474 fallback_sql = self.function_fallback_sql(expression) 2475 2476 if expression.args.get("ignore_nulls"): 2477 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2478 return self.sql(fallback_sql) 2479 2480 # return NULL if any argument is NULL 2481 case_expr = exp.case().when( 2482 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2483 exp.null(), 2484 copy=False, 2485 ) 2486 case_expr.set("default", fallback_sql) 2487 return self.sql(case_expr) 2488 2489 def generator_sql(self, expression: exp.Generator) -> str: 2490 # Transpile Snowflake GENERATOR to DuckDB range() 2491 rowcount = expression.args.get("rowcount") 2492 time_limit = expression.args.get("time_limit") 2493 2494 if time_limit: 2495 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2496 2497 if not rowcount: 2498 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2499 return self.func("range", exp.Literal.number(0)) 2500 2501 return self.func("range", rowcount) 2502 2503 def greatest_sql(self, expression: exp.Greatest) -> str: 2504 return self._greatest_least_sql(expression) 2505 2506 def least_sql(self, expression: exp.Least) -> str: 2507 return self._greatest_least_sql(expression) 2508 2509 def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str: 2510 if expression.args.get("colon"): 2511 prefix = "LAMBDA " 2512 arrow_sep = ":" 2513 wrap = False 2514 else: 2515 prefix = "" 2516 2517 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2518 return f"{prefix}{lambda_sql}" 2519 2520 def show_sql(self, expression: exp.Show) -> str: 2521 from_ = self.sql(expression, "from_") 2522 from_ = f" FROM {from_}" if from_ else "" 2523 return f"SHOW {expression.name}{from_}" 2524 2525 def soundex_sql(self, expression: exp.Soundex) -> str: 2526 self.unsupported("SOUNDEX is not supported in DuckDB") 2527 return self.func("SOUNDEX", expression.this) 2528 2529 def sortarray_sql(self, expression: exp.SortArray) -> str: 2530 arr = expression.this 2531 asc = expression.args.get("asc") 2532 nulls_first = expression.args.get("nulls_first") 2533 2534 if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean): 2535 return self.func("LIST_SORT", arr, asc, nulls_first) 2536 2537 nulls_are_first = nulls_first == exp.true() 2538 nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None 2539 2540 if not isinstance(asc, exp.Boolean): 2541 return self.func("LIST_SORT", arr, asc, nulls_first_sql) 2542 2543 descending = 
    def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str:
        if expression.args.get("colon"):
            prefix = "LAMBDA "
            arrow_sep = ":"
            wrap = False
        else:
            prefix = ""

        lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap)
        return f"{prefix}{lambda_sql}"

    def show_sql(self, expression: exp.Show) -> str:
        from_ = self.sql(expression, "from_")
        from_ = f" FROM {from_}" if from_ else ""
        return f"SHOW {expression.name}{from_}"

    def soundex_sql(self, expression: exp.Soundex) -> str:
        self.unsupported("SOUNDEX is not supported in DuckDB")
        return self.func("SOUNDEX", expression.this)

    def sortarray_sql(self, expression: exp.SortArray) -> str:
        arr = expression.this
        asc = expression.args.get("asc")
        nulls_first = expression.args.get("nulls_first")

        if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean):
            return self.func("LIST_SORT", arr, asc, nulls_first)

        nulls_are_first = nulls_first == exp.true()
        nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None

        if not isinstance(asc, exp.Boolean):
            return self.func("LIST_SORT", arr, asc, nulls_first_sql)

        descending = asc == exp.false()

        if not descending and not nulls_are_first:
            return self.func("LIST_SORT", arr)
        if not nulls_are_first:
            return self.func("ARRAY_REVERSE_SORT", arr)
        return self.func(
            "LIST_SORT",
            arr,
            exp.Literal.string("DESC" if descending else "ASC"),
            exp.Literal.string("NULLS FIRST"),
        )

    def install_sql(self, expression: exp.Install) -> str:
        force = "FORCE " if expression.args.get("force") else ""
        this = self.sql(expression, "this")
        from_clause = expression.args.get("from_")
        from_clause = f" FROM {from_clause}" if from_clause else ""
        return f"{force}INSTALL {this}{from_clause}"

    def approxtopk_sql(self, expression: exp.ApproxTopK) -> str:
        self.unsupported(
            "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types."
        )
        return self.function_fallback_sql(expression)

    def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str:
        return self.sql(exp.cast(expression.this, exp.DType.TIMESTAMPTZ))

    def strposition_sql(self, expression: exp.StrPosition) -> str:
        this = expression.this
        substr = expression.args.get("substr")
        position = expression.args.get("position")

        # For BINARY/BLOB: DuckDB's STRPOS doesn't support BLOB types
        # Convert to HEX strings, use STRPOS, then convert hex position to byte position
        if _is_binary(this):
            # Build expression: STRPOS(HEX(haystack), HEX(needle))
            hex_strpos = exp.StrPosition(
                this=exp.Hex(this=this),
                substr=exp.Hex(this=substr),
            )

            return self.sql(exp.cast((hex_strpos + 1) / 2, exp.DType.INT))

        # For VARCHAR: handle clamp_position
        if expression.args.get("clamp_position") and position:
            expression = expression.copy()
            expression.set(
                "position",
                exp.If(
                    this=exp.LTE(this=position, expression=exp.Literal.number(0)),
                    true=exp.Literal.number(1),
                    false=position.copy(),
                ),
            )

        return strposition_sql(self, expression)

    def substring_sql(self, expression: exp.Substring) -> str:
        if expression.args.get("zero_start"):
            if start := expression.args.get("start"):
                start = exp.If(this=start.eq(0), true=exp.Literal.number(1), false=start)
            if length := expression.args.get("length"):
                length = exp.If(this=length < 0, true=exp.Literal.number(0), false=length)

            return self.func("SUBSTRING", expression.this, start, length)

        return self.function_fallback_sql(expression)

    def strtotime_sql(self, expression: exp.StrToTime) -> str:
        # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants)
        target_type = expression.args.get("target_type")
        needs_tz = target_type and target_type.this in (
            exp.DType.TIMESTAMPLTZ,
            exp.DType.TIMESTAMPTZ,
        )

        if expression.args.get("safe"):
            formatted_time = self.format_time(expression)
            cast_type = exp.DType.TIMESTAMPTZ if needs_tz else exp.DType.TIMESTAMP
            return self.sql(
                exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type)
            )

        base_sql = str_to_time_sql(self, expression)
        if needs_tz:
            return self.sql(
                exp.cast(
                    base_sql,
                    exp.DataType(this=exp.DType.TIMESTAMPTZ),
                )
            )
        return base_sql

    def strtodate_sql(self, expression: exp.StrToDate) -> str:
        formatted_time = self.format_time(expression)
        function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME"
        return self.sql(
            exp.cast(
                self.func(function_name, expression.this, formatted_time),
                exp.DataType(this=exp.DType.DATE),
            )
        )

    def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str:
        this = expression.this
        time_format = self.format_time(expression)
        safe = expression.args.get("safe")
        time_type = exp.DataType.from_str("TIME", dialect="duckdb")
        cast_expr = exp.TryCast if safe else exp.Cast

        if time_format:
            func_name = "TRY_STRPTIME" if safe else "STRPTIME"
            strptime = exp.Anonymous(this=func_name, expressions=[this, time_format])
            return self.sql(cast_expr(this=strptime, to=time_type))

        if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DType.TIME):
            return self.sql(this)

        return self.sql(cast_expr(this=this, to=time_type))

    def currentdate_sql(self, expression: exp.CurrentDate) -> str:
        if not expression.this:
            return "CURRENT_DATE"

        expr = exp.Cast(
            this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this),
            to=exp.DataType(this=exp.DType.DATE),
        )
        return self.sql(expr)

    def checkjson_sql(self, expression: exp.CheckJson) -> str:
        arg = expression.this
        return self.sql(
            exp.case()
            .when(
                exp.or_(arg.is_(exp.Null()), arg.eq(""), exp.func("json_valid", arg)),
                exp.null(),
            )
            .else_(exp.Literal.string("Invalid JSON"))
        )

    def parsejson_sql(self, expression: exp.ParseJSON) -> str:
        arg = expression.this
        if expression.args.get("safe"):
            return self.sql(
                exp.case()
                .when(exp.func("json_valid", arg), exp.cast(arg.copy(), "JSON"))
                .else_(exp.null())
            )
        return self.func("JSON", arg)
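    # Hedged example of the safe path above:
    #   TRY_PARSE_JSON(x) -> CASE WHEN JSON_VALID(x) THEN CAST(x AS JSON) ELSE NULL END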
2732 """ 2733 mean = expression.this 2734 stddev = expression.args["stddev"] 2735 gen: exp.Expr = expression.args["gen"] 2736 2737 # Build two uniform random values [0, 1) for Box-Muller transform 2738 if isinstance(gen, exp.Rand) and gen.this is None: 2739 u1: exp.Expr = exp.Rand() 2740 u2: exp.Expr = exp.Rand() 2741 else: 2742 # Seeded: derive two values using HASH with different inputs 2743 seed = gen.this if isinstance(gen, exp.Rand) else gen 2744 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2745 u2 = exp.replace_placeholders( 2746 self.SEEDED_RANDOM_TEMPLATE, 2747 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2748 ) 2749 2750 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2751 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements)) 2752 2753 def uniform_sql(self, expression: exp.Uniform) -> str: 2754 """ 2755 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2756 2757 UNIFORM returns a random value in [min, max]: 2758 - Integer result if both min and max are integers 2759 - Float result if either min or max is a float 2760 """ 2761 min_val = expression.this 2762 max_val = expression.expression 2763 gen = expression.args.get("gen") 2764 2765 # Determine if result should be integer (both bounds are integers). 2766 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2767 is_int_result = min_val.is_int and max_val.is_int 2768 2769 # Build the random value expression [0, 1) 2770 if not isinstance(gen, exp.Rand): 2771 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2772 random_expr: exp.Expr = exp.Div( 2773 this=exp.Paren( 2774 this=exp.Mod( 2775 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2776 expression=exp.Literal.number(1000000), 2777 ) 2778 ), 2779 expression=exp.Literal.number(1000000.0), 2780 ) 2781 else: 2782 random_expr = exp.Rand() 2783 2784 # Build: min + random * (max - min [+ 1 for int]) 2785 range_expr: exp.Expr = exp.Sub(this=max_val, expression=min_val) 2786 if is_int_result: 2787 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2788 2789 result: exp.Expr = exp.Add( 2790 this=min_val, 2791 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2792 ) 2793 2794 if is_int_result: 2795 result = exp.Cast(this=exp.Floor(this=result), to=exp.DType.BIGINT.into_expr()) 2796 2797 return self.sql(result) 2798 2799 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2800 nano = expression.args.get("nano") 2801 overflow = expression.args.get("overflow") 2802 2803 # Snowflake's TIME_FROM_PARTS supports overflow 2804 if overflow: 2805 hour = expression.args["hour"] 2806 minute = expression.args["min"] 2807 sec = expression.args["sec"] 2808 2809 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2810 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2811 try: 2812 h_val = hour.to_py() 2813 m_val = minute.to_py() 2814 s_val = sec.to_py() 2815 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2816 return rename_func("MAKE_TIME")(self, expression) 2817 except ValueError: 2818 pass 2819 2820 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2821 if nano: 2822 sec = sec + nano.pop() / exp.Literal.number(1000000000.0) 2823 2824 total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2825 2826 return self.sql( 2827 exp.Add( 2828 this=exp.Cast( 2829 this=exp.Literal.string("00:00:00"), 
    def timefromparts_sql(self, expression: exp.TimeFromParts) -> str:
        nano = expression.args.get("nano")
        overflow = expression.args.get("overflow")

        # Snowflake's TIME_FROM_PARTS supports overflow
        if overflow:
            hour = expression.args["hour"]
            minute = expression.args["min"]
            sec = expression.args["sec"]

            # Check if values are within normal ranges - use MAKE_TIME for efficiency
            if not nano and all(arg.is_int for arg in [hour, minute, sec]):
                try:
                    h_val = hour.to_py()
                    m_val = minute.to_py()
                    s_val = sec.to_py()
                    if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59:
                        return rename_func("MAKE_TIME")(self, expression)
                except ValueError:
                    pass

            # Overflow or nanoseconds detected - use INTERVAL arithmetic
            if nano:
                sec = sec + nano.pop() / exp.Literal.number(1000000000.0)

            total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec

            return self.sql(
                exp.Add(
                    this=exp.Cast(
                        this=exp.Literal.string("00:00:00"), to=exp.DType.TIME.into_expr()
                    ),
                    expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")),
                )
            )

        # Default: MAKE_TIME
        if nano:
            expression.set(
                "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0)
            )

        return rename_func("MAKE_TIME")(self, expression)

    def extract_sql(self, expression: exp.Extract) -> str:
        """
        Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.

        DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND,
        EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
        """
        this = expression.this
        datetime_expr = expression.expression

        # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB
        # because Snowflake applies server timezone while DuckDB uses local timezone
        if datetime_expr.is_type(exp.DType.TIMESTAMPTZ, exp.DType.TIMESTAMPLTZ):
            self.unsupported(
                "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences"
            )

        part_name = this.name.upper()

        if part_name in self.EXTRACT_STRFTIME_MAPPINGS:
            fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name]

            # Problem: strftime doesn't accept TIME and there's no NANOSECOND function
            # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000
            is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type(
                exp.DType.TIME, exp.DType.TIMETZ
            )

            if is_nano_time:
                self.unsupported("Parameter NANOSECOND is not supported with TIME type in DuckDB")
                return self.sql(
                    exp.cast(
                        exp.Mul(
                            this=exp.Extract(this=exp.var("MICROSECOND"), expression=datetime_expr),
                            expression=exp.Literal.number(1000),
                        ),
                        exp.DataType.from_str(cast_type, dialect="duckdb"),
                    )
                )

            # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
            strftime_input = datetime_expr
            if part_name == "NANOSECOND":
                strftime_input = exp.cast(datetime_expr, exp.DType.TIMESTAMP_NS)

            return self.sql(
                exp.cast(
                    exp.Anonymous(
                        this="STRFTIME",
                        expressions=[strftime_input, exp.Literal.string(fmt)],
                    ),
                    exp.DataType.from_str(cast_type, dialect="duckdb"),
                )
            )

        if part_name in self.EXTRACT_EPOCH_MAPPINGS:
            func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
            result: exp.Expr = exp.Anonymous(this=func_name, expressions=[datetime_expr])
            # EPOCH returns float, cast to BIGINT for integer result
            if part_name == "EPOCH_SECOND":
                result = exp.cast(result, exp.DataType.from_str("BIGINT", dialect="duckdb"))
            return self.sql(result)

        return super().extract_sql(expression)
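    # Hedged examples of the epoch path above:
    #   EXTRACT(EPOCH_SECOND FROM ts)     -> CAST(EPOCH(ts) AS BIGINT)
    #   EXTRACT(EPOCH_NANOSECOND FROM ts) -> EPOCH_NS(ts)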
    def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
        # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
        date_expr = expression.this
        time_expr = expression.expression

        if date_expr is not None and time_expr is not None:
            # In DuckDB, DATE + TIME produces TIMESTAMP
            return self.sql(exp.Add(this=date_expr, expression=time_expr))

        # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
        sec = expression.args.get("sec")
        if sec is None:
            # This shouldn't happen with valid input, but handle gracefully
            return rename_func("MAKE_TIMESTAMP")(self, expression)

        milli = expression.args.get("milli")
        if milli is not None:
            sec += milli.pop() / exp.Literal.number(1000.0)

        nano = expression.args.get("nano")
        if nano is not None:
            sec += nano.pop() / exp.Literal.number(1000000000.0)

        if milli or nano:
            expression.set("sec", sec)

        return rename_func("MAKE_TIMESTAMP")(self, expression)

    @unsupported_args("nano")
    def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str:
        # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)
        return f"CAST({timestamp} AS TIMESTAMPTZ)"

    @unsupported_args("nano")
    def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str:
        # Extract zone before popping
        zone = expression.args.get("zone")
        # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if zone:
            zone = zone.pop()

        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)

        if zone:
            # Use AT TIME ZONE to apply the explicit timezone
            return f"{timestamp} AT TIME ZONE {self.sql(zone)}"

        return timestamp

    def tablesample_sql(
        self,
        expression: exp.TableSample,
        tablesample_keyword: str | None = None,
    ) -> str:
        if not isinstance(expression.parent, exp.Select):
            # This sample clause only applies to a single source, not the entire resulting relation
            tablesample_keyword = "TABLESAMPLE"

        if expression.args.get("size"):
            method = expression.args.get("method")
            if method and method.name.upper() != "RESERVOIR":
                self.unsupported(
                    f"Sampling method {method} is not supported with a discrete sample count, "
                    "defaulting to reservoir sampling"
                )
                expression.set("method", exp.var("RESERVOIR"))

        return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)

    def join_sql(self, expression: exp.Join) -> str:
        if (
            not expression.args.get("using")
            and not expression.args.get("on")
            and not expression.method
            and (expression.kind in ("", "INNER", "OUTER"))
        ):
            # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause
            # DuckDB doesn't, but we can just add a dummy ON clause that is always true
            if isinstance(expression.this, exp.Unnest):
                return super().join_sql(expression.on(exp.true()))

            expression.set("side", None)
            expression.set("kind", None)

        return super().join_sql(expression)

    def countif_sql(self, expression: exp.CountIf) -> str:
        if self.dialect.version >= (1, 2):
            return self.function_fallback_sql(expression)

        # https://github.com/tobymao/sqlglot/pull/4749
        return count_if_to_sum(self, expression)
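    # Hedged note on the version gate above: when targeting DuckDB >= 1.2,
    # COUNT_IF(cond) passes through natively; older targets are lowered to a
    # SUM over a CASE expression via count_if_to_sum.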
    def bracket_sql(self, expression: exp.Bracket) -> str:
        if self.dialect.version >= (1, 2):
            return super().bracket_sql(expression)

        # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
        this = expression.this
        if isinstance(this, exp.Array):
            this.replace(exp.paren(this))

        bracket = super().bracket_sql(expression)

        if not expression.args.get("returns_list_for_maps"):
            if not this.type:
                from sqlglot.optimizer.annotate_types import annotate_types

                this = annotate_types(this, dialect=self.dialect)

            if this.is_type(exp.DType.MAP):
                bracket = f"({bracket})[1]"

        return bracket

    def withingroup_sql(self, expression: exp.WithinGroup) -> str:
        func = expression.this

        # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP
        # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
        if isinstance(func, exp.ArrayAgg):
            if not isinstance(order := expression.expression, exp.Order):
                return self.sql(func)

            # Save the original column for FILTER clause (before wrapping with Order)
            original_this = func.this

            # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order
            # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
            func.set(
                "this",
                exp.Order(
                    this=func.this.copy(),
                    expressions=order.expressions,
                ),
            )

            # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed
            # Use original_this (not the Order-wrapped version) for the FILTER condition
            array_agg_sql = self.function_fallback_sql(func)
            return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)

        # For other functions (like PERCENTILES), use existing logic
        expression_sql = self.sql(expression, "expression")

        if isinstance(func, exp.PERCENTILES):
            # Make the order key the first arg and slide the fraction to the right
            # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
            order_col = expression.find(exp.Ordered)
            if order_col:
                func.set("expression", func.this)
                func.set("this", order_col.this)

        this = self.sql(expression, "this").rstrip(")")

        return f"{this}{expression_sql})"

    def length_sql(self, expression: exp.Length) -> str:
        arg = expression.this

        # Dialects like BQ and Snowflake also accept binary values as args, so
        # DDB will attempt to infer the type or resort to case/when resolution
        if not expression.args.get("binary") or arg.is_string:
            return self.func("LENGTH", arg)

        if not arg.type:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.is_type(*exp.DataType.TEXT_TYPES):
            return self.func("LENGTH", arg)

        # We need these casts to make duckdb's static type checker happy
        blob = exp.cast(arg, exp.DType.VARBINARY)
        varchar = exp.cast(arg, exp.DType.VARCHAR)

        case = (
            exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg]))
            .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob))
            .else_(exp.Anonymous(this="LENGTH", expressions=[varchar]))
        )
        return self.sql(case)

    def bitlength_sql(self, expression: exp.BitLength) -> str:
        if not _is_binary(arg := expression.this):
            return self.func("BIT_LENGTH", arg)

        blob = exp.cast(arg, exp.DataType.Type.VARBINARY)
        return self.sql(exp.ByteLength(this=blob) * exp.Literal.number(8))
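    # Hedged example of the binary path above: BIT_LENGTH over a BLOB argument
    # is computed as the value's byte length times 8, after a VARBINARY cast to
    # satisfy DuckDB's static type checking.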
return self.func("CHR", arg) 3111 3112 def collation_sql(self, expression: exp.Collation) -> str: 3113 self.unsupported("COLLATION function is not supported by DuckDB") 3114 return self.function_fallback_sql(expression) 3115 3116 def collate_sql(self, expression: exp.Collate) -> str: 3117 if not expression.expression.is_string: 3118 return super().collate_sql(expression) 3119 3120 raw = expression.expression.name 3121 if not raw: 3122 return self.sql(expression.this) 3123 3124 parts = [] 3125 for part in raw.split("-"): 3126 lower = part.lower() 3127 if lower not in _SNOWFLAKE_COLLATION_DEFAULTS: 3128 if lower in _SNOWFLAKE_COLLATION_UNSUPPORTED: 3129 self.unsupported( 3130 f"Snowflake collation specifier '{part}' has no DuckDB equivalent" 3131 ) 3132 parts.append(lower) 3133 3134 if not parts: 3135 return self.sql(expression.this) 3136 return super().collate_sql( 3137 exp.Collate(this=expression.this, expression=exp.var(".".join(parts))) 3138 ) 3139 3140 def _validate_regexp_flags(self, flags: exp.Expr | None, supported_flags: str) -> str | None: 3141 """ 3142 Validate and filter regexp flags for DuckDB compatibility. 3143 3144 Args: 3145 flags: The flags expression to validate 3146 supported_flags: String of supported flags (e.g., "ims", "cims"). 3147 Only these flags will be returned. 3148 3149 Returns: 3150 Validated/filtered flag string, or None if no valid flags remain 3151 """ 3152 if not isinstance(flags, exp.Expr): 3153 return None 3154 3155 if not flags.is_string: 3156 self.unsupported("Non-literal regexp flags are not fully supported in DuckDB") 3157 return None 3158 3159 flag_str = flags.this 3160 unsupported = set(flag_str) - set(supported_flags) 3161 3162 if unsupported: 3163 self.unsupported( 3164 f"Regexp flags {sorted(unsupported)} are not supported in this context" 3165 ) 3166 3167 flag_str = "".join(f for f in flag_str if f in supported_flags) 3168 return flag_str if flag_str else None 3169 3170 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3171 this = expression.this 3172 pattern = expression.expression 3173 position = expression.args.get("position") 3174 parameters = expression.args.get("parameters") 3175 3176 # Validate flags - only "ims" flags are supported for embedded patterns 3177 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3178 3179 if position: 3180 this = exp.Substring(this=this, start=position) 3181 3182 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3183 if validated_flags: 3184 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 3185 3186 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3187 result = ( 3188 exp.case() 3189 .when( 3190 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3191 exp.Literal.number(0), 3192 ) 3193 .else_( 3194 exp.Length( 3195 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3196 ) 3197 ) 3198 ) 3199 3200 return self.sql(result) 3201 3202 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3203 subject = expression.this 3204 pattern = expression.expression 3205 replacement = expression.args.get("replacement") or exp.Literal.string("") 3206 position = expression.args.get("position") 3207 occurrence = expression.args.get("occurrence") 3208 modifiers = expression.args.get("modifiers") 3209 3210 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3211 3212 # Handle occurrence (only literals 
        if occurrence and not occurrence.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal occurrence")
        else:
            occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0
            if occurrence > 1:
                self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported")
            # Flag DuckDB to replace either all occurrences or none; the
            # single_replace check preserves DuckDB round trips
            elif (
                occurrence == 0
                and "g" not in validated_flags
                and not expression.args.get("single_replace")
            ):
                validated_flags += "g"

        # Handle position (only literals supported)
        prefix = None
        if position and not position.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal position")
        elif position and position.is_int and position.to_py() > 1:
            pos = position.to_py()
            prefix = exp.Substring(
                this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1)
            )
            subject = exp.Substring(this=subject, start=exp.Literal.number(pos))

        result: exp.Expr = exp.Anonymous(
            this="REGEXP_REPLACE",
            expressions=[
                subject,
                pattern,
                replacement,
                exp.Literal.string(validated_flags) if validated_flags else None,
            ],
        )

        if prefix:
            result = exp.Concat(expressions=[prefix, result])

        return self.sql(result)

    def regexplike_sql(self, expression: exp.RegexpLike) -> str:
        this = expression.this
        pattern = expression.expression
        flag = expression.args.get("flag")

        if expression.args.get("full_match"):
            validated_flags = self._validate_regexp_flags(flag, supported_flags="cims")
            flag = exp.Literal.string(validated_flags) if validated_flags else None
            return self.func("REGEXP_FULL_MATCH", this, pattern, flag)

        return self.func("REGEXP_MATCHES", this, pattern, flag)

    @unsupported_args("ins_cost", "del_cost", "sub_cost")
    def levenshtein_sql(self, expression: exp.Levenshtein) -> str:
        this = expression.this
        expr = expression.expression
        max_dist = expression.args.get("max_dist")

        if max_dist is None:
            return self.func("LEVENSHTEIN", this, expr)

        # Emulate Snowflake semantics: if distance > max_dist, return max_dist
        levenshtein = exp.Levenshtein(this=this, expression=expr)
        return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
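    # Hedged example of the max_dist emulation above:
    #   LEVENSHTEIN(a, b, 3) -> LEAST(LEVENSHTEIN(a, b), 3)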
3281 3282 For VARCHAR: Delegate to parent class 3283 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3284 """ 3285 string_arg = expression.this 3286 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3287 3288 if _is_binary(string_arg) or _is_binary(fill_arg): 3289 length_arg = expression.expression 3290 is_left = expression.args.get("is_left") 3291 3292 input_len = exp.ByteLength(this=string_arg) 3293 chars_needed = length_arg - input_len 3294 pad_count = exp.Greatest( 3295 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3296 ) 3297 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3298 3299 left, right = string_arg, repeat_expr 3300 if is_left: 3301 left, right = right, left 3302 3303 result = exp.DPipe(this=left, expression=right) 3304 return self.sql(result) 3305 3306 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3307 return super().pad_sql(expression) 3308 3309 def minhash_sql(self, expression: exp.Minhash) -> str: 3310 k = expression.this 3311 exprs = expression.expressions 3312 3313 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3314 self.unsupported( 3315 "MINHASH with multiple expressions or * requires manual query restructuring" 3316 ) 3317 return self.func("MINHASH", k, *exprs) 3318 3319 expr = exprs[0] 3320 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3321 return f"({self.sql(result)})" 3322 3323 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3324 expr = expression.this 3325 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3326 return f"({self.sql(result)})" 3327 3328 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3329 expr = expression.this 3330 result = exp.replace_placeholders(self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr) 3331 return f"({self.sql(result)})" 3332 3333 def arrayuniqueagg_sql(self, expression: exp.ArrayUniqueAgg) -> str: 3334 return self.sql( 3335 exp.Filter( 3336 this=exp.func("LIST", exp.Distinct(expressions=[expression.this])), 3337 expression=exp.Where(this=expression.this.copy().is_(exp.null()).not_()), 3338 ) 3339 ) 3340 3341 def arrayunionagg_sql(self, expression: exp.ArrayUnionAgg) -> str: 3342 self.unsupported("ARRAY_UNION_AGG is not supported in DuckDB") 3343 return self.function_fallback_sql(expression) 3344 3345 def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str: 3346 arr = expression.this 3347 func = self.func("LIST_DISTINCT", arr) 3348 3349 if expression.args.get("check_null"): 3350 add_null_to_array = exp.func( 3351 "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null() 3352 ) 3353 return self.sql( 3354 exp.If( 3355 this=exp.NEQ( 3356 this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr) 3357 ), 3358 true=add_null_to_array, 3359 false=func, 3360 ) 3361 ) 3362 3363 return func 3364 3365 def arrayintersect_sql(self, expression: exp.ArrayIntersect) -> str: 3366 if expression.args.get("is_multiset") and len(expression.expressions) == 2: 3367 return self._array_bag_sql( 3368 self.ARRAY_INTERSECTION_CONDITION, 3369 expression.expressions[0], 3370 expression.expressions[1], 3371 ) 3372 return self.function_fallback_sql(expression) 3373 3374 def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str: 3375 arr1, arr2 = expression.this, expression.expression 3376 if expression.args.get("is_multiset"): 3377 return 
self._array_bag_sql(self.ARRAY_EXCEPT_CONDITION, arr1, arr2) 3378 return self.sql( 3379 exp.replace_placeholders(self.ARRAY_EXCEPT_SET_TEMPLATE, arr1=arr1, arr2=arr2) 3380 ) 3381 3382 def arrayslice_sql(self, expression: exp.ArraySlice) -> str: 3383 """ 3384 Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's 3385 ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE 3386 expressions that adjust the index at query time: 3387 - start: CASE WHEN start >= 0 THEN start + 1 ELSE start END 3388 - end: CASE WHEN end < 0 THEN end - 1 ELSE end END 3389 """ 3390 start, end = expression.args.get("start"), expression.args.get("end") 3391 3392 if expression.args.get("zero_based"): 3393 if start is not None: 3394 start = ( 3395 exp.case() 3396 .when( 3397 exp.GTE(this=start.copy(), expression=exp.Literal.number(0)), 3398 exp.Add(this=start.copy(), expression=exp.Literal.number(1)), 3399 ) 3400 .else_(start) 3401 ) 3402 if end is not None: 3403 end = ( 3404 exp.case() 3405 .when( 3406 exp.LT(this=end.copy(), expression=exp.Literal.number(0)), 3407 exp.Sub(this=end.copy(), expression=exp.Literal.number(1)), 3408 ) 3409 .else_(end) 3410 ) 3411 3412 return self.func("ARRAY_SLICE", expression.this, start, end, expression.args.get("step")) 3413 3414 def arrayszip_sql(self, expression: exp.ArraysZip) -> str: 3415 args = expression.expressions 3416 3417 if not args: 3418 # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs 3419 return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array()))) 3420 3421 # Build placeholder values for template 3422 lengths = [exp.Length(this=arg) for arg in args] 3423 max_len = ( 3424 lengths[0] 3425 if len(lengths) == 1 3426 else exp.Greatest(this=lengths[0], expressions=lengths[1:]) 3427 ) 3428 3429 # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...} 3430 empty_struct = exp.func( 3431 "STRUCT", 3432 *[ 3433 exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null()) 3434 for i in range(len(args)) 3435 ], 3436 ) 3437 3438 # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...} 3439 # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax 3440 index = exp.column("__i") + 1 3441 transform_struct = exp.func( 3442 "STRUCT", 3443 *[ 3444 exp.PropertyEQ( 3445 this=exp.Literal.string(f"${i + 1}"), 3446 expression=exp.func("COALESCE", arg, exp.array())[index], 3447 ) 3448 for i, arg in enumerate(args) 3449 ], 3450 ) 3451 3452 result = exp.replace_placeholders( 3453 self.ARRAYS_ZIP_TEMPLATE.copy(), 3454 null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]), 3455 all_empty_check=exp.and_( 3456 *[ 3457 exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0)) 3458 for arg in args 3459 ] 3460 ), 3461 empty_struct=empty_struct, 3462 max_len=max_len, 3463 transform_struct=transform_struct, 3464 ) 3465 return self.sql(result) 3466 3467 def lower_sql(self, expression: exp.Lower) -> str: 3468 result_sql = self.func("LOWER", _cast_to_varchar(expression.this)) 3469 return _gen_with_cast_to_blob(self, expression, result_sql) 3470 3471 def upper_sql(self, expression: exp.Upper) -> str: 3472 result_sql = self.func("UPPER", _cast_to_varchar(expression.this)) 3473 return _gen_with_cast_to_blob(self, expression, result_sql) 3474 3475 def reverse_sql(self, expression: exp.Reverse) -> str: 3476 result_sql = self.func("REVERSE", _cast_to_varchar(expression.this)) 3477 return _gen_with_cast_to_blob(self, expression, result_sql) 3478 3479 def _left_right_sql(self, 
expression: exp.Left | exp.Right, func_name: str) -> str: 3480 arg = expression.this 3481 length = expression.expression 3482 is_binary = _is_binary(arg) 3483 3484 if is_binary: 3485 # LEFT/RIGHT(blob, n) becomes UNHEX(LEFT/RIGHT(HEX(blob), n * 2)) 3486 # Each byte becomes 2 hex chars, so multiply length by 2 3487 hex_arg = exp.Hex(this=arg) 3488 hex_length = exp.Mul(this=length, expression=exp.Literal.number(2)) 3489 result: exp.Expression = exp.Unhex( 3490 this=exp.Anonymous(this=func_name, expressions=[hex_arg, hex_length]) 3491 ) 3492 else: 3493 result = exp.Anonymous(this=func_name, expressions=[arg, length]) 3494 3495 if expression.args.get("negative_length_returns_empty"): 3496 empty: exp.Expression = exp.Literal.string("") 3497 if is_binary: 3498 empty = exp.Unhex(this=empty) 3499 result = exp.case().when(length < exp.Literal.number(0), empty).else_(result) 3500 3501 return self.sql(result) 3502 3503 def left_sql(self, expression: exp.Left) -> str: 3504 return self._left_right_sql(expression, "LEFT") 3505 3506 def right_sql(self, expression: exp.Right) -> str: 3507 return self._left_right_sql(expression, "RIGHT") 3508 3509 def rtrimmedlength_sql(self, expression: exp.RtrimmedLength) -> str: 3510 return self.func("LENGTH", exp.Trim(this=expression.this, position="TRAILING")) 3511 3512 def stuff_sql(self, expression: exp.Stuff) -> str: 3513 base = expression.this 3514 start = expression.args["start"] 3515 length = expression.args["length"] 3516 insertion = expression.expression 3517 is_binary = _is_binary(base) 3518 3519 if is_binary: 3520 # DuckDB's SUBSTRING doesn't accept BLOB; operate on the HEX string instead 3521 # (each byte = 2 hex chars), then UNHEX back to BLOB 3522 base = exp.Hex(this=base) 3523 insertion = exp.Hex(this=insertion) 3524 left = exp.Substring( 3525 this=base.copy(), 3526 start=exp.Literal.number(1), 3527 length=(start.copy() - exp.Literal.number(1)) * exp.Literal.number(2), 3528 ) 3529 right = exp.Substring( 3530 this=base.copy(), 3531 start=((start + length) - exp.Literal.number(1)) * exp.Literal.number(2) 3532 + exp.Literal.number(1), 3533 ) 3534 else: 3535 left = exp.Substring( 3536 this=base.copy(), 3537 start=exp.Literal.number(1), 3538 length=start.copy() - exp.Literal.number(1), 3539 ) 3540 right = exp.Substring(this=base.copy(), start=start + length) 3541 result: exp.Expr = exp.DPipe( 3542 this=exp.DPipe(this=left, expression=insertion), expression=right 3543 ) 3544 3545 if is_binary: 3546 result = exp.Unhex(this=result) 3547 3548 return self.sql(result) 3549 3550 def rand_sql(self, expression: exp.Rand) -> str: 3551 seed = expression.this 3552 if seed is not None: 3553 self.unsupported("RANDOM with seed is not supported in DuckDB") 3554 3555 lower = expression.args.get("lower") 3556 upper = expression.args.get("upper") 3557 3558 if lower and upper: 3559 # scale DuckDB's [0,1) to the specified range 3560 range_size = exp.paren(upper - lower) 3561 scaled = exp.Add(this=lower, expression=exp.func("random") * range_size) 3562 3563 # For now we assume that if bounds are set, return type is BIGINT. 
Snowflake/Teradata 3564 result = exp.cast(scaled, exp.DType.BIGINT) 3565 return self.sql(result) 3566 3567 # Default DuckDB behavior - just return RANDOM() as float 3568 return "RANDOM()" 3569 3570 def bytelength_sql(self, expression: exp.ByteLength) -> str: 3571 arg = expression.this 3572 3573 # Check if it's a text type (handles both literals and annotated expressions) 3574 if arg.is_type(*exp.DataType.TEXT_TYPES): 3575 return self.func("OCTET_LENGTH", exp.Encode(this=arg)) 3576 3577 # Default: pass through as-is (conservative for DuckDB, handles binary and unannotated) 3578 return self.func("OCTET_LENGTH", arg) 3579 3580 def base64encode_sql(self, expression: exp.Base64Encode) -> str: 3581 # DuckDB TO_BASE64 requires BLOB input 3582 # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly 3583 # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type. 3584 result = expression.this 3585 3586 # Check if input is a string type - ENCODE only accepts VARCHAR 3587 if result.is_type(*exp.DataType.TEXT_TYPES): 3588 result = exp.Encode(this=result) 3589 3590 result = exp.ToBase64(this=result) 3591 3592 max_line_length = expression.args.get("max_line_length") 3593 alphabet = expression.args.get("alphabet") 3594 3595 # Handle custom alphabet by replacing standard chars with custom ones 3596 result = _apply_base64_alphabet_replacements(result, alphabet) 3597 3598 # Handle max_line_length by inserting newlines every N characters 3599 line_length = ( 3600 t.cast(int, max_line_length.to_py()) 3601 if isinstance(max_line_length, exp.Literal) and max_line_length.is_number 3602 else 0 3603 ) 3604 if line_length > 0: 3605 newline = exp.Chr(expressions=[exp.Literal.number(10)]) 3606 result = exp.Trim( 3607 this=exp.RegexpReplace( 3608 this=result, 3609 expression=exp.Literal.string(f"(.{{{line_length}}})"), 3610 replacement=exp.Concat(expressions=[exp.Literal.string("\\1"), newline.copy()]), 3611 ), 3612 expression=newline, 3613 position="TRAILING", 3614 ) 3615 3616 return self.sql(result) 3617 3618 def hex_sql(self, expression: exp.Hex) -> str: 3619 case = expression.args.get("case") 3620 3621 if not case: 3622 return self.func("HEX", expression.this) 3623 3624 hex_expr = exp.Hex(this=expression.this) 3625 return self.sql( 3626 exp.case() 3627 .when(case.is_(exp.null()), exp.null()) 3628 .when(case.copy().eq(0), exp.Lower(this=hex_expr.copy())) 3629 .else_(hex_expr) 3630 ) 3631 3632 def replace_sql(self, expression: exp.Replace) -> str: 3633 result_sql = self.func( 3634 "REPLACE", 3635 _cast_to_varchar(expression.this), 3636 _cast_to_varchar(expression.expression), 3637 _cast_to_varchar(expression.args.get("replacement")), 3638 ) 3639 return _gen_with_cast_to_blob(self, expression, result_sql) 3640 3641 def _bitwise_op(self, expression: exp.Binary, op: str) -> str: 3642 _prepare_binary_bitwise_args(expression) 3643 result_sql = self.binary(expression, op) 3644 return _gen_with_cast_to_blob(self, expression, result_sql) 3645 3646 def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str: 3647 _prepare_binary_bitwise_args(expression) 3648 result_sql = self.func("XOR", expression.this, expression.expression) 3649 return _gen_with_cast_to_blob(self, expression, result_sql) 3650 3651 def objectinsert_sql(self, expression: exp.ObjectInsert) -> str: 3652 this = expression.this 3653 key = expression.args.get("key") 3654 key_sql = key.name if isinstance(key, exp.Expr) else "" 3655 value_sql = self.sql(expression, "value") 3656 3657 kv_sql = f"{key_sql} := {value_sql}" 
3658 3659 # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake 3660 # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB 3661 if isinstance(this, exp.Struct) and not this.expressions: 3662 return self.func("STRUCT_PACK", kv_sql) 3663 3664 return self.func("STRUCT_INSERT", this, kv_sql) 3665 3666 def mapcat_sql(self, expression: exp.MapCat) -> str: 3667 result = exp.replace_placeholders( 3668 self.MAPCAT_TEMPLATE.copy(), 3669 map1=expression.this, 3670 map2=expression.expression, 3671 ) 3672 return self.sql(result) 3673 3674 def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str: 3675 return self.func( 3676 "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this 3677 ) 3678 3679 def mapdelete_sql(self, expression: exp.MapDelete) -> str: 3680 map_arg = expression.this 3681 keys_to_delete = expression.expressions 3682 3683 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3684 3685 lambda_expr = exp.Lambda( 3686 this=exp.In(this=x_dot_key, expressions=keys_to_delete).not_(), 3687 expressions=[exp.to_identifier("x")], 3688 ) 3689 result = exp.func( 3690 "MAP_FROM_ENTRIES", 3691 exp.ArrayFilter(this=exp.func("MAP_ENTRIES", map_arg), expression=lambda_expr), 3692 ) 3693 return self.sql(result) 3694 3695 def mappick_sql(self, expression: exp.MapPick) -> str: 3696 map_arg = expression.this 3697 keys_to_pick = expression.expressions 3698 3699 x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key")) 3700 3701 if len(keys_to_pick) == 1 and keys_to_pick[0].is_type(exp.DType.ARRAY): 3702 lambda_expr = exp.Lambda( 3703 this=exp.func("ARRAY_CONTAINS", keys_to_pick[0], x_dot_key), 3704 expressions=[exp.to_identifier("x")], 3705 ) 3706 else: 3707 lambda_expr = exp.Lambda( 3708 this=exp.In(this=x_dot_key, expressions=keys_to_pick), 3709 expressions=[exp.to_identifier("x")], 3710 ) 3711 3712 result = exp.func( 3713 "MAP_FROM_ENTRIES", 3714 exp.func("LIST_FILTER", exp.func("MAP_ENTRIES", map_arg), lambda_expr), 3715 ) 3716 return self.sql(result) 3717 3718 def mapsize_sql(self, expression: exp.MapSize) -> str: 3719 return self.func("CARDINALITY", expression.this) 3720 3721 @unsupported_args("update_flag") 3722 def mapinsert_sql(self, expression: exp.MapInsert) -> str: 3723 map_arg = expression.this 3724 key = expression.args.get("key") 3725 value = expression.args.get("value") 3726 3727 map_type = map_arg.type 3728 3729 if value is not None: 3730 if map_type and map_type.expressions and len(map_type.expressions) > 1: 3731 # Extract the value type from MAP(key_type, value_type) 3732 value_type = map_type.expressions[1] 3733 # Cast value to match the map's value type to avoid type conflicts 3734 value = exp.cast(value, value_type) 3735 # else: polymorphic MAP case - no type parameters available, use value as-is 3736 3737 # Create a single-entry map for the new key-value pair 3738 new_entry_struct = exp.Struct(expressions=[exp.PropertyEQ(this=key, expression=value)]) 3739 new_entry: exp.Expression = exp.ToMap(this=new_entry_struct) 3740 3741 # Use MAP_CONCAT to merge the original map with the new entry 3742 # This automatically handles both insert and update cases 3743 result = exp.func("MAP_CONCAT", map_arg, new_entry) 3744 3745 return self.sql(result) 3746 3747 def startswith_sql(self, expression: exp.StartsWith) -> str: 3748 return self.func( 3749 "STARTS_WITH", 3750 
_cast_to_varchar(expression.this), 3751 _cast_to_varchar(expression.expression), 3752 ) 3753 3754 def space_sql(self, expression: exp.Space) -> str: 3755 # DuckDB's REPEAT requires BIGINT for the count parameter 3756 return self.sql( 3757 exp.Repeat( 3758 this=exp.Literal.string(" "), 3759 times=exp.cast(expression.this, exp.DType.BIGINT), 3760 ) 3761 ) 3762 3763 def tablefromrows_sql(self, expression: exp.TableFromRows) -> str: 3764 # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE) 3765 if isinstance(expression.this, exp.Generator): 3766 # Preserve alias, joins, and other table-level args 3767 table = exp.Table( 3768 this=expression.this, 3769 alias=expression.args.get("alias"), 3770 joins=expression.args.get("joins"), 3771 ) 3772 return self.sql(table) 3773 3774 return super().tablefromrows_sql(expression) 3775 3776 def unnest_sql(self, expression: exp.Unnest) -> str: 3777 explode_array = expression.args.get("explode_array") 3778 if explode_array: 3779 # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct 3780 # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))" 3781 expression.expressions.append( 3782 exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)) 3783 ) 3784 3785 # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB 3786 alias = expression.args.get("alias") 3787 if isinstance(alias, exp.TableAlias): 3788 expression.set("alias", None) 3789 if alias.columns: 3790 alias = exp.TableAlias(this=seq_get(alias.columns, 0)) 3791 3792 unnest_sql = super().unnest_sql(expression) 3793 select = exp.Select(expressions=[unnest_sql]).subquery(alias) 3794 return self.sql(select) 3795 3796 return super().unnest_sql(expression) 3797 3798 def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str: 3799 this = expression.this 3800 3801 if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3802 # DuckDB should render IGNORE NULLS only for the general-purpose 3803 # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...) 
3804 return super().ignorenulls_sql(expression) 3805 3806 if isinstance(this, exp.First): 3807 this = exp.AnyValue(this=this.this) 3808 3809 if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)): 3810 self.unsupported("IGNORE NULLS is not supported for non-window functions.") 3811 3812 return self.sql(this) 3813 3814 def split_sql(self, expression: exp.Split) -> str: 3815 base_func = exp.func("STR_SPLIT", expression.this, expression.expression) 3816 3817 case_expr = exp.case().else_(base_func) 3818 needs_case = False 3819 3820 if expression.args.get("null_returns_null"): 3821 case_expr = case_expr.when(expression.expression.is_(exp.null()), exp.null()) 3822 needs_case = True 3823 3824 if expression.args.get("empty_delimiter_returns_whole"): 3825 # When delimiter is empty string, return input string as single array element 3826 array_with_input = exp.array(expression.this) 3827 case_expr = case_expr.when( 3828 expression.expression.eq(exp.Literal.string("")), array_with_input 3829 ) 3830 needs_case = True 3831 3832 return self.sql(case_expr if needs_case else base_func) 3833 3834 def splitpart_sql(self, expression: exp.SplitPart) -> str: 3835 string_arg = expression.this 3836 delimiter_arg = expression.args.get("delimiter") 3837 part_index_arg = expression.args.get("part_index") 3838 3839 if delimiter_arg and part_index_arg: 3840 # Handle Snowflake's "index 0 and 1 both return first element" behavior 3841 if expression.args.get("part_index_zero_as_one"): 3842 # Convert 0 to 1 for compatibility 3843 3844 part_index_arg = exp.Paren( 3845 this=exp.case() 3846 .when(part_index_arg.eq(exp.Literal.number("0")), exp.Literal.number("1")) 3847 .else_(part_index_arg) 3848 ) 3849 3850 # Use Anonymous to avoid recursion 3851 base_func_expr: exp.Expr = exp.Anonymous( 3852 this="SPLIT_PART", expressions=[string_arg, delimiter_arg, part_index_arg] 3853 ) 3854 needs_case_transform = False 3855 case_expr = exp.case().else_(base_func_expr) 3856 3857 if expression.args.get("empty_delimiter_returns_whole"): 3858 # When delimiter is empty string: 3859 # - Return whole string if part_index is 1 or -1 3860 # - Return empty string otherwise 3861 empty_case = exp.Paren( 3862 this=exp.case() 3863 .when( 3864 exp.or_( 3865 part_index_arg.eq(exp.Literal.number("1")), 3866 part_index_arg.eq(exp.Literal.number("-1")), 3867 ), 3868 string_arg, 3869 ) 3870 .else_(exp.Literal.string("")) 3871 ) 3872 3873 case_expr = case_expr.when(delimiter_arg.eq(exp.Literal.string("")), empty_case) 3874 needs_case_transform = True 3875 3876 """ 3877 Output looks something like this: 3878 3879 CASE 3880 WHEN delimiter is '' THEN 3881 ( 3882 CASE 3883 WHEN adjusted_part_index = 1 OR adjusted_part_index = -1 THEN input 3884 ELSE '' END 3885 ) 3886 ELSE SPLIT_PART(input, delimiter, adjusted_part_index) 3887 END 3888 3889 """ 3890 return self.sql(case_expr if needs_case_transform else base_func_expr) 3891 3892 return self.function_fallback_sql(expression) 3893 3894 def respectnulls_sql(self, expression: exp.RespectNulls) -> str: 3895 if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS): 3896 # DuckDB should render RESPECT NULLS only for the general-purpose 3897 # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...) 
3898 return super().respectnulls_sql(expression) 3899 3900 self.unsupported("RESPECT NULLS is not supported for non-window functions.") 3901 return self.sql(expression, "this") 3902 3903 def arraytostring_sql(self, expression: exp.ArrayToString) -> str: 3904 null = expression.args.get("null") 3905 3906 if expression.args.get("null_is_empty"): 3907 x = exp.to_identifier("x") 3908 list_transform = exp.Transform( 3909 this=expression.this.copy(), 3910 expression=exp.Lambda( 3911 this=exp.Coalesce( 3912 this=exp.cast(x, "TEXT"), expressions=[exp.Literal.string("")] 3913 ), 3914 expressions=[x], 3915 ), 3916 ) 3917 array_to_string = exp.ArrayToString( 3918 this=list_transform, expression=expression.expression 3919 ) 3920 if expression.args.get("null_delim_is_null"): 3921 return self.sql( 3922 exp.case() 3923 .when(expression.expression.copy().is_(exp.null()), exp.null()) 3924 .else_(array_to_string) 3925 ) 3926 return self.sql(array_to_string) 3927 3928 if null: 3929 x = exp.to_identifier("x") 3930 return self.sql( 3931 exp.ArrayToString( 3932 this=exp.Transform( 3933 this=expression.this, 3934 expression=exp.Lambda( 3935 this=exp.Coalesce(this=x, expressions=[null]), 3936 expressions=[x], 3937 ), 3938 ), 3939 expression=expression.expression, 3940 ) 3941 ) 3942 3943 return self.func("ARRAY_TO_STRING", expression.this, expression.expression) 3944 3945 def concatws_sql(self, expression: exp.ConcatWs) -> str: 3946 # DuckDB-specific: handle binary types using DPipe (||) operator 3947 separator = seq_get(expression.expressions, 0) 3948 args = expression.expressions[1:] 3949 3950 if any(_is_binary(arg) for arg in [separator, *args]): 3951 result = args[0] 3952 for arg in args[1:]: 3953 result = exp.DPipe( 3954 this=exp.DPipe(this=result, expression=separator), expression=arg 3955 ) 3956 return self.sql(result) 3957 3958 return super().concatws_sql(expression) 3959 3960 def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str: 3961 this = expression.this 3962 group = expression.args.get("group") 3963 params = expression.args.get("parameters") 3964 position = expression.args.get("position") 3965 occurrence = expression.args.get("occurrence") 3966 null_if_pos_overflow = expression.args.get("null_if_pos_overflow") 3967 3968 # Handle Snowflake's 'e' flag: it enables capture group extraction 3969 # In DuckDB, this is controlled by the group parameter directly 3970 if params and params.is_string and "e" in params.name: 3971 params = exp.Literal.string(params.name.replace("e", "")) 3972 3973 validated_flags = self._validate_regexp_flags(params, supported_flags="cims") 3974 3975 # Strip default group when no following params (DuckDB default is same as group=0) 3976 if ( 3977 not validated_flags 3978 and group 3979 and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP) 3980 ): 3981 group = None 3982 3983 flags_expr = exp.Literal.string(validated_flags) if validated_flags else None 3984 3985 # use substring to handle position argument 3986 if position and (not position.is_int or position.to_py() > 1): 3987 this = exp.Substring(this=this, start=position) 3988 3989 if null_if_pos_overflow: 3990 this = exp.Nullif(this=this, expression=exp.Literal.string("")) 3991 3992 is_extract_all = isinstance(expression, exp.RegexpExtractAll) 3993 non_single_occurrence = occurrence and (not occurrence.is_int or occurrence.to_py() > 1) 3994 3995 if is_extract_all or non_single_occurrence: 3996 name = "REGEXP_EXTRACT_ALL" 3997 else: 3998 name = "REGEXP_EXTRACT" 3999 4000 result: 
exp.Expr = exp.Anonymous( 4001 this=name, expressions=[this, expression.expression, group, flags_expr] 4002 ) 4003 4004 # Array slicing for REGEXP_EXTRACT_ALL with occurrence 4005 if is_extract_all and non_single_occurrence: 4006 result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)]) 4007 # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1 4008 elif non_single_occurrence: 4009 result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence]) 4010 4011 return self.sql(result) 4012 4013 def regexpextract_sql(self, expression: exp.RegexpExtract) -> str: 4014 return self._regexp_extract_sql(expression) 4015 4016 def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str: 4017 return self._regexp_extract_sql(expression) 4018 4019 def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str: 4020 this = expression.this 4021 pattern = expression.expression 4022 position = expression.args.get("position") 4023 orig_occ = expression.args.get("occurrence") 4024 occurrence = orig_occ or exp.Literal.number(1) 4025 option = expression.args.get("option") 4026 parameters = expression.args.get("parameters") 4027 4028 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 4029 if validated_flags: 4030 pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]) 4031 4032 # Handle starting position offset 4033 pos_offset: exp.Expr = exp.Literal.number(0) 4034 if position and (not position.is_int or position.to_py() > 1): 4035 this = exp.Substring(this=this, start=position) 4036 pos_offset = position - exp.Literal.number(1) 4037 4038 # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x))) 4039 def sum_lengths(func_name: str, end: exp.Expr) -> exp.Expr: 4040 lst = exp.Bracket( 4041 this=exp.Anonymous(this=func_name, expressions=[this, pattern]), 4042 expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)], 4043 offset=1, 4044 ) 4045 transform = exp.Anonymous( 4046 this="LIST_TRANSFORM", 4047 expressions=[ 4048 lst, 4049 exp.Lambda( 4050 this=exp.Length(this=exp.to_identifier("x")), 4051 expressions=[exp.to_identifier("x")], 4052 ), 4053 ], 4054 ) 4055 return exp.Coalesce( 4056 this=exp.Anonymous(this="LIST_SUM", expressions=[transform]), 4057 expressions=[exp.Literal.number(0)], 4058 ) 4059 4060 # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset 4061 base_pos: exp.Expr = ( 4062 exp.Literal.number(1) 4063 + sum_lengths("STRING_SPLIT_REGEX", occurrence) 4064 + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1)) 4065 + pos_offset 4066 ) 4067 4068 # option=1: add match length for end position 4069 if option and option.is_int and option.to_py() == 1: 4070 match_at_occ = exp.Bracket( 4071 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]), 4072 expressions=[occurrence], 4073 offset=1, 4074 ) 4075 base_pos = base_pos + exp.Coalesce( 4076 this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)] 4077 ) 4078 4079 # NULL checks for all provided arguments 4080 # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST 4081 null_args = [ 4082 expression.this, 4083 expression.expression, 4084 position, 4085 orig_occ, 4086 option, 4087 parameters, 4088 ] 4089 null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg] 4090 4091 matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 4092 4093 return self.sql( 4094 exp.case() 4095 
.when(exp.or_(*null_checks), exp.Null()) 4096 .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0)) 4097 .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0)) 4098 .else_(base_pos) 4099 ) 4100 4101 @unsupported_args("culture") 4102 def numbertostr_sql(self, expression: exp.NumberToStr) -> str: 4103 fmt = expression.args.get("format") 4104 if fmt and fmt.is_int: 4105 return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this) 4106 4107 self.unsupported("Only integer formats are supported by NumberToStr") 4108 return self.function_fallback_sql(expression) 4109 4110 def autoincrementcolumnconstraint_sql(self, _) -> str: 4111 self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB") 4112 return "" 4113 4114 def aliases_sql(self, expression: exp.Aliases) -> str: 4115 this = expression.this 4116 if isinstance(this, exp.Posexplode): 4117 return self.posexplode_sql(this) 4118 4119 return super().aliases_sql(expression) 4120 4121 def posexplode_sql(self, expression: exp.Posexplode) -> str: 4122 this = expression.this 4123 parent = expression.parent 4124 4125 # The default Spark aliases are "pos" and "col", unless specified otherwise 4126 pos, col = exp.to_identifier("pos"), exp.to_identifier("col") 4127 4128 if isinstance(parent, exp.Aliases): 4129 # Column case: SELECT POSEXPLODE(col) [AS (a, b)] 4130 pos, col = parent.expressions 4131 elif isinstance(parent, exp.Table): 4132 # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)] 4133 alias = parent.args.get("alias") 4134 if alias: 4135 pos, col = alias.columns or [pos, col] 4136 alias.pop() 4137 4138 # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS 4139 # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS 4140 unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col)) 4141 gen_subscripts = self.sql( 4142 exp.Alias( 4143 this=exp.Anonymous( 4144 this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)] 4145 ) 4146 - exp.Literal.number(1), 4147 alias=pos, 4148 ) 4149 ) 4150 4151 posexplode_sql = self.format_args(gen_subscripts, unnest_sql) 4152 4153 if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)): 4154 # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...)) 4155 return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql]))) 4156 4157 return posexplode_sql 4158 4159 def addmonths_sql(self, expression: exp.AddMonths) -> str: 4160 """ 4161 Handles three key issues: 4162 1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers 4163 2. End-of-month preservation: If input is last day of month, result is last day of result month 4164 3. 
Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP) 4165 """ 4166 from sqlglot.optimizer.annotate_types import annotate_types 4167 4168 this = expression.this 4169 if not this.type: 4170 this = annotate_types(this, dialect=self.dialect) 4171 4172 if this.is_type(*exp.DataType.TEXT_TYPES): 4173 this = exp.Cast(this=this, to=exp.DataType(this=exp.DType.TIMESTAMP)) 4174 4175 # Detect float/decimal months to apply rounding (Snowflake behavior) 4176 # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS 4177 months_expr = expression.expression 4178 if not months_expr.type: 4179 months_expr = annotate_types(months_expr, dialect=self.dialect) 4180 4181 # Build interval or to_months expression based on type 4182 # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT)) 4183 interval_or_to_months = ( 4184 exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT")) 4185 if months_expr.is_type( 4186 exp.DType.FLOAT, 4187 exp.DType.DOUBLE, 4188 exp.DType.DECIMAL, 4189 ) 4190 # Integer case: standard INTERVAL N MONTH syntax 4191 else exp.Interval(this=months_expr, unit=exp.var("MONTH")) 4192 ) 4193 4194 date_add_expr = exp.Add(this=this, expression=interval_or_to_months) 4195 4196 # Apply end-of-month preservation if Snowflake flag is set 4197 # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END 4198 preserve_eom = expression.args.get("preserve_end_of_month") 4199 result_expr = ( 4200 exp.case() 4201 .when( 4202 exp.EQ(this=exp.func("LAST_DAY", this), expression=this), 4203 exp.func("LAST_DAY", date_add_expr), 4204 ) 4205 .else_(date_add_expr) 4206 if preserve_eom 4207 else date_add_expr 4208 ) 4209 4210 # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE 4211 # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type) 4212 # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ 4213 # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP 4214 if this.is_type(exp.DType.DATE, exp.DType.TIMESTAMPTZ): 4215 return self.sql(exp.Cast(this=result_expr, to=this.type)) 4216 return self.sql(result_expr) 4217 4218 def format_sql(self, expression: exp.Format) -> str: 4219 if expression.name.lower() == "%s" and len(expression.expressions) == 1: 4220 return self.func("FORMAT", "'{}'", expression.expressions[0]) 4221 4222 return self.function_fallback_sql(expression) 4223 4224 def hexstring_sql( 4225 self, expression: exp.HexString, binary_function_repr: str | None = None 4226 ) -> str: 4227 # UNHEX('FF') correctly produces blob \xFF in DuckDB 4228 return super().hexstring_sql(expression, binary_function_repr="UNHEX") 4229 4230 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 4231 unit = expression.args.get("unit") 4232 date = expression.this 4233 4234 week_start = _week_unit_to_dow(unit) 4235 unit = unit_to_str(expression) 4236 4237 if week_start: 4238 result = self.sql( 4239 _build_week_trunc_expression(date, week_start, preserve_start_day=True) 4240 ) 4241 else: 4242 result = self.func("DATE_TRUNC", unit, date) 4243 4244 if ( 4245 expression.args.get("input_type_preserved") 4246 and date.is_type(*exp.DataType.TEMPORAL_TYPES) 4247 and not (is_date_unit(unit) and date.is_type(exp.DType.DATE)) 4248 ): 4249 return self.sql(exp.Cast(this=result, to=date.type)) 4250 4251 return result 4252 4253 def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str: 4254 unit = 
unit_to_str(expression) 4255 zone = expression.args.get("zone") 4256 timestamp = expression.this 4257 date_unit = is_date_unit(unit) 4258 4259 if date_unit and zone: 4260 # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC. 4261 # Double AT TIME ZONE needed for BigQuery compatibility: 4262 # 1. First AT TIME ZONE: ensures truncation happens in the target timezone 4263 # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component) 4264 timestamp = exp.AtTimeZone(this=timestamp, zone=zone) 4265 result_sql = self.func("DATE_TRUNC", unit, timestamp) 4266 return self.sql(exp.AtTimeZone(this=result_sql, zone=zone)) 4267 4268 result = self.func("DATE_TRUNC", unit, timestamp) 4269 if expression.args.get("input_type_preserved"): 4270 if timestamp.type and timestamp.is_type(exp.DType.TIME, exp.DType.TIMETZ): 4271 dummy_date = exp.Cast( 4272 this=exp.Literal.string("1970-01-01"), 4273 to=exp.DataType(this=exp.DType.DATE), 4274 ) 4275 date_time = exp.Add(this=dummy_date, expression=timestamp) 4276 result = self.func("DATE_TRUNC", unit, date_time) 4277 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4278 4279 if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not ( 4280 date_unit and timestamp.is_type(exp.DType.DATE) 4281 ): 4282 return self.sql(exp.Cast(this=result, to=timestamp.type)) 4283 4284 return result 4285 4286 def trim_sql(self, expression: exp.Trim) -> str: 4287 expression.this.replace(_cast_to_varchar(expression.this)) 4288 if expression.expression: 4289 expression.expression.replace(_cast_to_varchar(expression.expression)) 4290 4291 result_sql = super().trim_sql(expression) 4292 return _gen_with_cast_to_blob(self, expression, result_sql) 4293 4294 def round_sql(self, expression: exp.Round) -> str: 4295 this = expression.this 4296 decimals = expression.args.get("decimals") 4297 truncate = expression.args.get("truncate") 4298 4299 # DuckDB requires the scale (decimals) argument to be an INT 4300 # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally 4301 if decimals is not None and expression.args.get("casts_non_integer_decimals"): 4302 if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)): 4303 decimals = exp.cast(decimals, exp.DType.INT) 4304 4305 func = "ROUND" 4306 if truncate: 4307 # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN 4308 if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"): 4309 func = "ROUND_EVEN" 4310 truncate = None 4311 # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO 4312 elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"): 4313 truncate = None 4314 4315 return self.func(func, this, decimals, truncate) 4316 4317 def strtok_sql(self, expression: exp.Strtok) -> str: 4318 string_arg = expression.this 4319 delimiter_arg = expression.args.get("delimiter") 4320 part_index_arg = expression.args.get("part_index") 4321 4322 if delimiter_arg and part_index_arg: 4323 # Escape regex chars and build character class at runtime using REGEXP_REPLACE 4324 escaped_delimiter = exp.Anonymous( 4325 this="REGEXP_REPLACE", 4326 expressions=[ 4327 delimiter_arg, 4328 exp.Literal.string( 4329 r"([\[\]^.\-*+?(){}|$\\])" 4330 ), # Escape problematic regex chars 4331 exp.Literal.string( 4332 r"\\\1" 4333 ), # Replace with escaped version using $1 backreference 4334 exp.Literal.string("g"), # Global flag 4335 ], 4336 ) 4337 # CASE WHEN delimiter = '' THEN '' ELSE CONCAT('[', 
escaped_delimiter, ']') END 4338 regex_pattern = ( 4339 exp.case() 4340 .when(delimiter_arg.eq(exp.Literal.string("")), exp.Literal.string("")) 4341 .else_( 4342 exp.func( 4343 "CONCAT", 4344 exp.Literal.string("["), 4345 escaped_delimiter, 4346 exp.Literal.string("]"), 4347 ) 4348 ) 4349 ) 4350 4351 # STRTOK skips empty strings, so we need to filter them out 4352 # LIST_FILTER(REGEXP_SPLIT_TO_ARRAY(string, pattern), x -> x != '')[index] 4353 split_array = exp.func("REGEXP_SPLIT_TO_ARRAY", string_arg, regex_pattern) 4354 x = exp.to_identifier("x") 4355 is_empty = x.eq(exp.Literal.string("")) 4356 filtered_array = exp.func( 4357 "LIST_FILTER", 4358 split_array, 4359 exp.Lambda(this=exp.not_(is_empty.copy()), expressions=[x.copy()]), 4360 ) 4361 base_func = exp.Bracket( 4362 this=filtered_array, 4363 expressions=[part_index_arg], 4364 offset=1, 4365 ) 4366 4367 # Use template with the built regex pattern 4368 result = exp.replace_placeholders( 4369 self.STRTOK_TEMPLATE.copy(), 4370 string=string_arg, 4371 delimiter=delimiter_arg, 4372 part_index=part_index_arg, 4373 base_func=base_func, 4374 ) 4375 4376 return self.sql(result) 4377 4378 return self.function_fallback_sql(expression) 4379 4380 def strtoktoarray_sql(self, expression: exp.StrtokToArray) -> str: 4381 string_arg = expression.this 4382 delimiter_arg = expression.args.get("expression") or exp.Literal.string(" ") 4383 4384 escaped = exp.RegexpReplace( 4385 this=delimiter_arg.copy(), 4386 expression=exp.Literal.string(r"([\[\]^.\-*+?(){}|$\\])"), 4387 replacement=exp.Literal.string(r"\\\1"), 4388 modifiers=exp.Literal.string("g"), 4389 ) 4390 return self.sql( 4391 exp.replace_placeholders( 4392 self.STRTOK_TO_ARRAY_TEMPLATE.copy(), 4393 string=string_arg, 4394 delimiter=delimiter_arg, 4395 escaped=escaped, 4396 ) 4397 ) 4398 4399 def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str: 4400 result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile")) 4401 4402 # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type 4403 if expression.is_type(*exp.DataType.REAL_TYPES): 4404 result = f"CAST({result} AS DOUBLE)" 4405 4406 return result 4407 4408 def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str: 4409 """ 4410 BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values 4411 dividing the input distribution into n equal-sized buckets. 4412 4413 Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery 4414 does not document the specific algorithm used so results may differ. DuckDB does not 4415 support RESPECT NULLS. 
4416 """ 4417 this = expression.this 4418 if isinstance(this, exp.Distinct): 4419 # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both 4420 if len(this.expressions) < 2: 4421 self.unsupported("APPROX_QUANTILES requires a bucket count argument") 4422 return self.function_fallback_sql(expression) 4423 num_quantiles_expr = this.expressions[1].pop() 4424 else: 4425 num_quantiles_expr = expression.expression 4426 4427 if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int: 4428 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4429 return self.function_fallback_sql(expression) 4430 4431 num_quantiles = t.cast(int, num_quantiles_expr.to_py()) 4432 if num_quantiles <= 0: 4433 self.unsupported("APPROX_QUANTILES bucket count must be a positive integer") 4434 return self.function_fallback_sql(expression) 4435 4436 quantiles = [ 4437 exp.Literal.number(Decimal(i) / Decimal(num_quantiles)) 4438 for i in range(num_quantiles + 1) 4439 ] 4440 4441 return self.sql(exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))) 4442 4443 def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str: 4444 if expression.args.get("scalar_only"): 4445 expression = exp.JSONExtractScalar( 4446 this=rename_func("JSON_VALUE")(self, expression), expression="'$'" 4447 ) 4448 return _arrow_json_extract_sql(self, expression) 4449 4450 def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str: 4451 this = expression.this 4452 4453 if _is_binary(this): 4454 expression.type = exp.DType.BINARY.into_expr() 4455 4456 arg = _cast_to_bit(this) 4457 4458 if isinstance(this, exp.Neg): 4459 arg = exp.Paren(this=arg) 4460 4461 expression.set("this", arg) 4462 4463 result_sql = f"~{self.sql(expression, 'this')}" 4464 4465 return _gen_with_cast_to_blob(self, expression, result_sql) 4466 4467 def window_sql(self, expression: exp.Window) -> str: 4468 this = expression.this 4469 if isinstance(this, exp.Corr) or ( 4470 isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr) 4471 ): 4472 return self._corr_sql(expression) 4473 4474 return super().window_sql(expression) 4475 4476 def filter_sql(self, expression: exp.Filter) -> str: 4477 if isinstance(expression.this, exp.Corr): 4478 return self._corr_sql(expression) 4479 4480 return super().filter_sql(expression) 4481 4482 def _corr_sql( 4483 self, 4484 expression: exp.Filter | exp.Window | exp.Corr, 4485 ) -> str: 4486 if isinstance(expression, exp.Corr) and not expression.args.get("null_on_zero_variance"): 4487 return self.func("CORR", expression.this, expression.expression) 4488 4489 corr_expr = _maybe_corr_null_to_false(expression) 4490 if corr_expr is None: 4491 if isinstance(expression, exp.Window): 4492 return super().window_sql(expression) 4493 if isinstance(expression, exp.Filter): 4494 return super().filter_sql(expression) 4495 corr_expr = expression # make mypy happy 4496 4497 return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr)) 4498 4499 def uuid_sql(self, expression: exp.Uuid) -> str: 4500 namespace = expression.this 4501 name = expression.args.get("name") 4502 4503 # UUID v5 (namespace + name) - Emulate using SHA1 4504 if namespace and name: 4505 result = exp.replace_placeholders( 4506 self.UUID_V5_TEMPLATE.copy(), 4507 namespace=namespace, 4508 name=name, 4509 ) 4510 return self.sql(result) 4511 4512 return super().uuid_sql(expression)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True: Always quote except for special cases. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3.
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False.
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80.
- comments: Whether to preserve comments in the output SQL code. Default: True.
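For instance, these options can be passed straight through sqlglot.transpile, which forwards extra keyword arguments to the generator. A minimal sketch; the exact pretty-printed layout may vary between sqlglot versions:

import sqlglot

# pretty and identify are generator options forwarded by sqlglot.transpile
print(
    sqlglot.transpile(
        "SELECT a, b FROM t WHERE a > 1",
        write="duckdb",
        pretty=True,
        identify=True,
    )[0]
)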
def timeslice_sql(self, expression: exp.TimeSlice) -> str:
    """
    Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.

    Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END'])
    DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)

    For 'END' kind, add the interval to get the end of the slice.
    For DATE type with 'END', cast result back to DATE to preserve type.
    """
    date_expr = expression.this
    slice_length = expression.expression
    unit = expression.unit
    kind = expression.text("kind").upper()

    # Create INTERVAL expression: INTERVAL 'N' UNIT
    interval_expr = exp.Interval(this=slice_length, unit=unit)

    # Create base time_bucket expression
    time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr)

    # Check if we need the end of the slice (default is start)
    if kind != "END":
        # For 'START', return time_bucket directly
        return self.sql(time_bucket_expr)

    # For 'END', add the interval to get end of slice
    add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy())

    # If input is DATE type, cast result back to DATE to preserve type
    # DuckDB converts DATE to TIMESTAMP when adding intervals
    if date_expr.is_type(exp.DType.DATE):
        return self.sql(exp.cast(add_expr, exp.DType.DATE))

    return self.sql(add_expr)
Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.
Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END'])
DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)
For 'END' kind, add the interval to get the end of the slice. For DATE type with 'END', cast result back to DATE to preserve type.
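A minimal sketch of exercising this rewrite through sqlglot.transpile; the column and table names are hypothetical, and the exact output text can differ between sqlglot versions:

import sqlglot

print(
    sqlglot.transpile(
        "SELECT TIME_SLICE(created_at, 15, 'MINUTE', 'END') FROM events",
        read="snowflake",
        write="duckdb",
    )[0]
)
# Expected shape: TIME_BUCKET(INTERVAL '15' MINUTE, created_at) + INTERVAL '15' MINUTE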
def bitmapbucketnumber_sql(self, expression: exp.BitmapBucketNumber) -> str:
    """
    Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.

    Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
    - Each bucket covers 32,768 values
    - Bucket numbering starts at 1
    - Formula: ((value - 1) // 32768) + 1 for positive values

    For non-positive values (0 and negative), we use value // 32768 to avoid
    producing bucket 0 or positive bucket IDs for negative inputs.
    """
    value = expression.this

    positive_formula = ((value - 1) // 32768) + 1
    non_positive_formula = value // 32768

    # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END
    case_expr = (
        exp.case()
        .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula)
        .else_(non_positive_formula)
    )
    return self.sql(case_expr)
Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.
Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
- Each bucket covers 32,768 values
- Bucket numbering starts at 1
- Formula: ((value - 1) // 32768) + 1 for positive values
For non-positive values (0 and negative), we use value // 32768 to avoid producing bucket 0 or positive bucket IDs for negative inputs.
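A pure-Python mirror of the generated CASE expression makes the bucket boundaries concrete. Python's // floors while a SQL engine's integer division may truncate toward zero for negatives, so the assertions stick to the positive branch:

def bitmap_bucket_number(value: int) -> int:
    # Mirrors: CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END
    return ((value - 1) // 32768) + 1 if value > 0 else value // 32768

assert bitmap_bucket_number(1) == 1      # first value of bucket 1
assert bitmap_bucket_number(32768) == 1  # last value of bucket 1
assert bitmap_bucket_number(32769) == 2  # first value of bucket 2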
def bitmapbitposition_sql(self, expression: exp.BitmapBitPosition) -> str:
    """
    Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.

    Snowflake's BITMAP_BIT_POSITION behavior:
    - For n <= 0: returns ABS(n) % 32768
    - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
    """
    this = expression.this

    return self.sql(
        exp.Mod(
            this=exp.Paren(
                this=exp.If(
                    this=exp.GT(this=this, expression=exp.Literal.number(0)),
                    true=this - exp.Literal.number(1),
                    false=exp.Abs(this=this),
                )
            ),
            expression=MAX_BIT_POSITION,
        )
    )
Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.
Snowflake's BITMAP_BIT_POSITION behavior:
- For n <= 0: returns ABS(n) % 32768
- For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
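The same behavior, mirrored in plain Python to show how positions wrap at the bucket boundary:

def bitmap_bit_position(n: int) -> int:
    # Mirrors: CASE WHEN n > 0 THEN (n - 1) % 32768 ELSE ABS(n) % 32768 END
    return (n - 1) % 32768 if n > 0 else abs(n) % 32768

assert bitmap_bit_position(1) == 0
assert bitmap_bit_position(32768) == 32767  # maximum position within a bucket
assert bitmap_bit_position(32769) == 0      # wraps into the next bucket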
def bitmapconstructagg_sql(self, expression: exp.BitmapConstructAgg) -> str:
    """
    Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent.
    Uses a pre-parsed template with placeholders replaced by expression nodes.

    Snowflake bitmap format:
    - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
    - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
    """
    arg = expression.this
    return (
        f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
    )
Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. Uses a pre-parsed template with placeholders replaced by expression nodes.
Snowflake bitmap format:
- Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
- Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
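A sketch of the typical Snowflake distinct-counting pattern this supports; table and column names are hypothetical, and since the DuckDB output is a lengthy blob-building expression expanded from the template, it is printed rather than asserted:

import sqlglot

sql = """
SELECT BITMAP_BUCKET_NUMBER(id) AS bucket,
       BITMAP_CONSTRUCT_AGG(BITMAP_BIT_POSITION(id)) AS bmp
FROM t
GROUP BY 1
"""
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])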
def jarowinklersimilarity_sql(self, expression: exp.JarowinklerSimilarity) -> str:
    this = expression.this
    expr = expression.expression

    if expression.args.get("case_insensitive"):
        this = exp.Upper(this=this)
        expr = exp.Upper(this=expr)

    result = exp.func("JARO_WINKLER_SIMILARITY", this, expr)

    if expression.args.get("integer_scale"):
        result = exp.cast(result * 100, "INTEGER")

    return self.sql(result)
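A sketch of the resulting rewrite, assuming the Snowflake reader sets the case_insensitive and integer_scale flags (Snowflake's function is case-insensitive and returns a 0-100 score, whereas DuckDB's returns 0-1):

import sqlglot

print(
    sqlglot.transpile(
        "SELECT JAROWINKLER_SIMILARITY('DuckDB', 'duckdb')",
        read="snowflake",
        write="duckdb",
    )[0]
)
# Expected shape:
# CAST(JARO_WINKLER_SIMILARITY(UPPER('DuckDB'), UPPER('duckdb')) * 100 AS INTEGER)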
def randstr_sql(self, expression: exp.Randstr) -> str:
    """
    Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random.
    Uses a pre-parsed template with placeholders replaced by expression nodes.

    RANDSTR(length, generator) generates a random string of specified length.
    - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
    - With RANDOM(): Use RANDOM() in the hash for non-deterministic output
    - No generator: Use default seed value
    """
    length = expression.this
    generator = expression.args.get("generator")

    if generator:
        if isinstance(generator, exp.Rand):
            # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself
            seed_value = generator.this or generator
        else:
            # Const/int or other expression - use as seed directly
            seed_value = generator
    else:
        # No generator specified, use default seed (arbitrary but deterministic)
        seed_value = exp.Literal.number(RANDSTR_SEED)

    replacements = {"seed": seed_value, "length": length}
    return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"
Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. Uses a pre-parsed template with placeholders replaced by expression nodes.
RANDSTR(length, generator) generates a random string of specified length.
- With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
- With RANDOM(): Use RANDOM() in the hash for non-deterministic output
- No generator: Use default seed value
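A minimal sketch; the output is a subquery expanded from RANDSTR_TEMPLATE, so only its general shape is noted:

import sqlglot

# A seeded call is deterministic: the same seed always yields the same string.
print(sqlglot.transpile("SELECT RANDSTR(10, 1234)", read="snowflake", write="duckdb")[0])
# The result hashes (i + 1234) for each of the 10 positions and indexes into
# a fixed character pool.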
@unsupported_args("finish")
def reduce_sql(self, expression: exp.Reduce) -> str:
    array_arg = expression.this
    initial_value = expression.args.get("initial")
    merge_lambda = expression.args.get("merge")

    if merge_lambda:
        merge_lambda.set("colon", True)

    return self.func("list_reduce", array_arg, merge_lambda, initial_value)
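A sketch of the resulting argument order, assuming the Snowflake reader parses REDUCE into exp.Reduce; note that DuckDB's list_reduce takes the lambda second and the initial value last, and the "colon" flag above switches the lambda to DuckDB's LAMBDA ... : ... form:

import sqlglot

print(
    sqlglot.transpile(
        "SELECT REDUCE([1, 2, 3], 0, (acc, x) -> acc + x)",
        read="snowflake",
        write="duckdb",
    )[0]
)
# Expected shape: LIST_REDUCE([1, 2, 3], LAMBDA acc, x : acc + x, 0)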
def zipf_sql(self, expression: exp.Zipf) -> str:
    """
    Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling.
    Uses a pre-parsed template with placeholders replaced by expression nodes.
    """
    s = expression.this
    n = expression.args["elementcount"]
    gen = expression.args["gen"]

    if not isinstance(gen, exp.Rand):
        # (ABS(HASH(seed)) % 1000000) / 1000000.0
        random_expr: exp.Expr = exp.Div(
            this=exp.Paren(
                this=exp.Mod(
                    this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])),
                    expression=exp.Literal.number(1000000),
                )
            ),
            expression=exp.Literal.number(1000000.0),
        )
    else:
        # Use RANDOM() for non-deterministic output
        random_expr = exp.Rand()

    replacements = {"s": s, "n": n, "random_expr": random_expr}
    return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})"
Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. Uses a pre-parsed template with placeholders replaced by expression nodes.
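The deterministic branch's pseudo-random value can be mirrored in Python. Python's built-in hash stands in for DuckDB's HASH here, so the actual numbers differ; only the [0, 1) shape is illustrated:

def pseudo_random(seed: int) -> float:
    # Mirrors (ABS(HASH(seed)) % 1000000) / 1000000.0 from the deterministic branch
    return (abs(hash(seed)) % 1000000) / 1000000.0

value = pseudo_random(42)
assert 0.0 <= value < 1.0  # the template then inverts this through the Zipf CDF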
def tobinary_sql(self, expression: exp.ToBinary) -> str:
    """
    TO_BINARY and TRY_TO_BINARY transpilation:
    - 'HEX': TO_BINARY('48454C50', 'HEX') -> UNHEX('48454C50')
    - 'UTF-8': TO_BINARY('TEST', 'UTF-8') -> ENCODE('TEST')
    - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') -> FROM_BASE64('SEVMUA==')

    For TRY_TO_BINARY (safe=True), wrap with TRY():
    - 'HEX': TRY_TO_BINARY('invalid', 'HEX') -> TRY(UNHEX('invalid'))
    """
    value = expression.this
    format_arg = expression.args.get("format")
    is_safe = expression.args.get("safe")
    is_binary = _is_binary(expression)

    if not format_arg and not is_binary:
        func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY"
        return self.func(func_name, value)

    # Snowflake defaults to HEX encoding when no format is specified
    fmt = format_arg.name.upper() if format_arg else "HEX"

    if fmt in ("UTF-8", "UTF8"):
        # DuckDB ENCODE always uses UTF-8, no charset parameter needed
        result = self.func("ENCODE", value)
    elif fmt == "BASE64":
        result = self.func("FROM_BASE64", value)
    elif fmt == "HEX":
        result = self.func("UNHEX", value)
    else:
        if is_safe:
            return self.sql(exp.null())
        else:
            self.unsupported(f"format {fmt} is not supported")
            result = self.func("TO_BINARY", value)
    return f"TRY({result})" if is_safe else result
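    # Hedged round-trip through the public API; the output string mirrors the
    # mappings documented above, though the exact formatting may differ slightly:
    #
    #   >>> import sqlglot
    #   >>> sqlglot.transpile(
    #   ...     "SELECT TO_BINARY('48454C50', 'HEX')", read="snowflake", write="duckdb"
    #   ... )[0]
    #   "SELECT UNHEX('48454C50')"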
    def tonumber_sql(self, expression: exp.ToNumber) -> str:
        fmt = expression.args.get("format")
        precision = expression.args.get("precision")
        scale = expression.args.get("scale")

        if not fmt and precision and scale:
            return self.sql(
                exp.cast(
                    expression.this, f"DECIMAL({precision.name}, {scale.name})", dialect="duckdb"
                )
            )

        return super().tonumber_sql(expression)
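    # For example, TO_NUMBER(x, 10, 2) with no format string becomes
    # CAST(x AS DECIMAL(10, 2)); every other shape falls through to the base generator.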
    def generator_sql(self, expression: exp.Generator) -> str:
        # Transpile Snowflake GENERATOR to DuckDB range()
        rowcount = expression.args.get("rowcount")
        time_limit = expression.args.get("time_limit")

        if time_limit:
            self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB")

        if not rowcount:
            self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB")
            return self.func("range", exp.Literal.number(0))

        return self.func("range", rowcount)
    def lambda_sql(self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True) -> str:
        if expression.args.get("colon"):
            prefix = "LAMBDA "
            arrow_sep = ":"
            wrap = False
        else:
            prefix = ""

        lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap)
        return f"{prefix}{lambda_sql}"
    def sortarray_sql(self, expression: exp.SortArray) -> str:
        arr = expression.this
        asc = expression.args.get("asc")
        nulls_first = expression.args.get("nulls_first")

        if not isinstance(asc, exp.Boolean) and not isinstance(nulls_first, exp.Boolean):
            return self.func("LIST_SORT", arr, asc, nulls_first)

        nulls_are_first = nulls_first == exp.true()
        nulls_first_sql = exp.Literal.string("NULLS FIRST") if nulls_are_first else None

        if not isinstance(asc, exp.Boolean):
            return self.func("LIST_SORT", arr, asc, nulls_first_sql)

        descending = asc == exp.false()

        if not descending and not nulls_are_first:
            return self.func("LIST_SORT", arr)
        if not nulls_are_first:
            return self.func("ARRAY_REVERSE_SORT", arr)

        return self.func(
            "LIST_SORT",
            arr,
            exp.Literal.string("DESC" if descending else "ASC"),
            exp.Literal.string("NULLS FIRST"),
        )
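    # Net effect of the branches above when both flags are literal booleans:
    #   ascending,  nulls last  -> LIST_SORT(arr)
    #   descending, nulls last  -> ARRAY_REVERSE_SORT(arr)
    #   any order,  nulls first -> LIST_SORT(arr, 'ASC' | 'DESC', 'NULLS FIRST')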
    def install_sql(self, expression: exp.Install) -> str:
        force = "FORCE " if expression.args.get("force") else ""
        this = self.sql(expression, "this")
        from_clause = expression.args.get("from_")
        from_clause = f" FROM {from_clause}" if from_clause else ""
        return f"{force}INSTALL {this}{from_clause}"
    def strposition_sql(self, expression: exp.StrPosition) -> str:
        this = expression.this
        substr = expression.args.get("substr")
        position = expression.args.get("position")

        # For BINARY/BLOB: DuckDB's STRPOS doesn't support BLOB types.
        # Convert to HEX strings, use STRPOS, then convert the hex position to a byte position
        if _is_binary(this):
            # Build expression: STRPOS(HEX(haystack), HEX(needle))
            hex_strpos = exp.StrPosition(
                this=exp.Hex(this=this),
                substr=exp.Hex(this=substr),
            )

            return self.sql(exp.cast((hex_strpos + 1) / 2, exp.DType.INT))

        # For VARCHAR: handle clamp_position
        if expression.args.get("clamp_position") and position:
            expression = expression.copy()
            expression.set(
                "position",
                exp.If(
                    this=exp.LTE(this=position, expression=exp.Literal.number(0)),
                    true=exp.Literal.number(1),
                    false=position.copy(),
                ),
            )

        return strposition_sql(self, expression)
    def substring_sql(self, expression: exp.Substring) -> str:
        if expression.args.get("zero_start"):
            if start := expression.args.get("start"):
                start = exp.If(this=start.eq(0), true=exp.Literal.number(1), false=start)
            if length := expression.args.get("length"):
                length = exp.If(this=length < 0, true=exp.Literal.number(0), false=length)

            return self.func("SUBSTRING", expression.this, start, length)

        return self.function_fallback_sql(expression)
    def strtotime_sql(self, expression: exp.StrToTime) -> str:
        # Check if target_type requires TIMESTAMPTZ (for the LTZ/TZ variants)
        target_type = expression.args.get("target_type")
        needs_tz = target_type and target_type.this in (
            exp.DType.TIMESTAMPLTZ,
            exp.DType.TIMESTAMPTZ,
        )

        if expression.args.get("safe"):
            formatted_time = self.format_time(expression)
            cast_type = exp.DType.TIMESTAMPTZ if needs_tz else exp.DType.TIMESTAMP
            return self.sql(
                exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type)
            )

        base_sql = str_to_time_sql(self, expression)
        if needs_tz:
            return self.sql(
                exp.cast(
                    base_sql,
                    exp.DataType(this=exp.DType.TIMESTAMPTZ),
                )
            )
        return base_sql
    def strtodate_sql(self, expression: exp.StrToDate) -> str:
        formatted_time = self.format_time(expression)
        function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME"
        return self.sql(
            exp.cast(
                self.func(function_name, expression.this, formatted_time),
                exp.DataType(this=exp.DType.DATE),
            )
        )
    def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str:
        this = expression.this
        time_format = self.format_time(expression)
        safe = expression.args.get("safe")
        time_type = exp.DataType.from_str("TIME", dialect="duckdb")
        cast_expr = exp.TryCast if safe else exp.Cast

        if time_format:
            func_name = "TRY_STRPTIME" if safe else "STRPTIME"
            strptime = exp.Anonymous(this=func_name, expressions=[this, time_format])
            return self.sql(cast_expr(this=strptime, to=time_type))

        if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DType.TIME):
            return self.sql(this)

        return self.sql(cast_expr(this=this, to=time_type))
    def currentdate_sql(self, expression: exp.CurrentDate) -> str:
        if not expression.this:
            return "CURRENT_DATE"

        expr = exp.Cast(
            this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this),
            to=exp.DataType(this=exp.DType.DATE),
        )
        return self.sql(expr)
    def parsejson_sql(self, expression: exp.ParseJSON) -> str:
        arg = expression.this
        if expression.args.get("safe"):
            return self.sql(
                exp.case()
                .when(exp.func("json_valid", arg), exp.cast(arg.copy(), "JSON"))
                .else_(exp.null())
            )
        return self.func("JSON", arg)
    def unicode_sql(self, expression: exp.Unicode) -> str:
        if expression.args.get("empty_is_zero"):
            return self.sql(
                exp.case()
                .when(expression.this.eq(exp.Literal.string("")), exp.Literal.number(0))
                .else_(exp.Anonymous(this="UNICODE", expressions=[expression.this]))
            )

        return self.func("UNICODE", expression.this)
    def trunc_sql(self, expression: exp.Trunc) -> str:
        decimals = expression.args.get("decimals")
        if (
            expression.args.get("fractions_supported")
            and decimals
            and not decimals.is_type(exp.DType.INT)
        ):
            decimals = exp.cast(decimals, exp.DType.INT, dialect="duckdb")

        return self.func("TRUNC", expression.this, decimals)
    def normal_sql(self, expression: exp.Normal) -> str:
        """
        Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.

        Uses the Box-Muller transform via NORMAL_TEMPLATE.
        """
        mean = expression.this
        stddev = expression.args["stddev"]
        gen: exp.Expr = expression.args["gen"]

        # Build two uniform random values [0, 1) for the Box-Muller transform
        if isinstance(gen, exp.Rand) and gen.this is None:
            u1: exp.Expr = exp.Rand()
            u2: exp.Expr = exp.Rand()
        else:
            # Seeded: derive two values using HASH with different inputs
            seed = gen.this if isinstance(gen, exp.Rand) else gen
            u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed)
            u2 = exp.replace_placeholders(
                self.SEEDED_RANDOM_TEMPLATE,
                seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)),
            )

        replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2}
        return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))
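    # The Box-Muller transform turns two independent uniforms u1, u2 in (0, 1) into a
    # standard normal draw (a standard identity, shown as a sketch; NORMAL_TEMPLATE is
    # assumed to encode something equivalent):
    #
    #   z = sqrt(-2 * ln(u1)) * cos(2 * pi * u2)
    #   sample = mean + stddev * z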
    def uniform_sql(self, expression: exp.Uniform) -> str:
        """
        Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.

        UNIFORM returns a random value in [min, max]:
        - Integer result if both min and max are integers
        - Float result if either min or max is a float
        """
        min_val = expression.this
        max_val = expression.expression
        gen = expression.args.get("gen")

        # Determine whether the result should be an integer (both bounds are integers).
        # This emulates Snowflake's behavior: INT -> INT, FLOAT -> FLOAT
        is_int_result = min_val.is_int and max_val.is_int

        # Build the random value expression [0, 1)
        if not isinstance(gen, exp.Rand):
            # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0
            random_expr: exp.Expr = exp.Div(
                this=exp.Paren(
                    this=exp.Mod(
                        this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])),
                        expression=exp.Literal.number(1000000),
                    )
                ),
                expression=exp.Literal.number(1000000.0),
            )
        else:
            random_expr = exp.Rand()

        # Build: min + random * (max - min [+ 1 for int])
        range_expr: exp.Expr = exp.Sub(this=max_val, expression=min_val)
        if is_int_result:
            range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1))

        result: exp.Expr = exp.Add(
            this=min_val,
            expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)),
        )

        if is_int_result:
            result = exp.Cast(this=exp.Floor(this=result), to=exp.DType.BIGINT.into_expr())

        return self.sql(result)
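    # Worked example of the expression built above: UNIFORM(1, 10, RANDOM()) has
    # integer bounds, so the generated shape is
    #   CAST(FLOOR(1 + RANDOM() * (10 - 1 + 1)) AS BIGINT)
    # which lands uniformly in [1, 10]; float bounds skip the + 1 and the final cast.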
    def timefromparts_sql(self, expression: exp.TimeFromParts) -> str:
        nano = expression.args.get("nano")
        overflow = expression.args.get("overflow")

        # Snowflake's TIME_FROM_PARTS supports overflow
        if overflow:
            hour = expression.args["hour"]
            minute = expression.args["min"]
            sec = expression.args["sec"]

            # Check if the values are within normal ranges - use MAKE_TIME for efficiency
            if not nano and all(arg.is_int for arg in [hour, minute, sec]):
                try:
                    h_val = hour.to_py()
                    m_val = minute.to_py()
                    s_val = sec.to_py()
                    if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59:
                        return rename_func("MAKE_TIME")(self, expression)
                except ValueError:
                    pass

            # Overflow or nanoseconds detected - use INTERVAL arithmetic
            if nano:
                sec = sec + nano.pop() / exp.Literal.number(1000000000.0)

            total_seconds = hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec

            return self.sql(
                exp.Add(
                    this=exp.Cast(
                        this=exp.Literal.string("00:00:00"), to=exp.DType.TIME.into_expr()
                    ),
                    expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")),
                )
            )

        # Default: MAKE_TIME
        if nano:
            expression.set(
                "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0)
            )

        return rename_func("MAKE_TIME")(self, expression)
    def extract_sql(self, expression: exp.Extract) -> str:
        """
        Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.

        DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND,
        EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
        """
        this = expression.this
        datetime_expr = expression.expression

        # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB
        # because Snowflake applies the server timezone while DuckDB uses the local timezone
        if datetime_expr.is_type(exp.DType.TIMESTAMPTZ, exp.DType.TIMESTAMPLTZ):
            self.unsupported(
                "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences"
            )

        part_name = this.name.upper()

        if part_name in self.EXTRACT_STRFTIME_MAPPINGS:
            fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name]

            # Problem: strftime doesn't accept TIME and there's no NANOSECOND function.
            # So, for NANOSECOND with TIME, fall back to MICROSECOND * 1000
            is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type(
                exp.DType.TIME, exp.DType.TIMETZ
            )

            if is_nano_time:
                self.unsupported("Parameter NANOSECOND is not supported with TIME type in DuckDB")
                return self.sql(
                    exp.cast(
                        exp.Mul(
                            this=exp.Extract(this=exp.var("MICROSECOND"), expression=datetime_expr),
                            expression=exp.Literal.number(1000),
                        ),
                        exp.DataType.from_str(cast_type, dialect="duckdb"),
                    )
                )

            # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
            strftime_input = datetime_expr
            if part_name == "NANOSECOND":
                strftime_input = exp.cast(datetime_expr, exp.DType.TIMESTAMP_NS)

            return self.sql(
                exp.cast(
                    exp.Anonymous(
                        this="STRFTIME",
                        expressions=[strftime_input, exp.Literal.string(fmt)],
                    ),
                    exp.DataType.from_str(cast_type, dialect="duckdb"),
                )
            )

        if part_name in self.EXTRACT_EPOCH_MAPPINGS:
            func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
            result: exp.Expr = exp.Anonymous(this=func_name, expressions=[datetime_expr])
            # EPOCH returns a float, so cast to BIGINT for an integer result
            if part_name == "EPOCH_SECOND":
                result = exp.cast(result, exp.DataType.from_str("BIGINT", dialect="duckdb"))
            return self.sql(result)

        return super().extract_sql(expression)
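    # For instance, assuming EXTRACT_EPOCH_MAPPINGS sends EPOCH_SECOND to DuckDB's
    # EPOCH function, EXTRACT(EPOCH_SECOND FROM ts) comes out as
    # CAST(EPOCH(ts) AS BIGINT), since EPOCH alone returns a float.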
    def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
        # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
        date_expr = expression.this
        time_expr = expression.expression

        if date_expr is not None and time_expr is not None:
            # In DuckDB, DATE + TIME produces TIMESTAMP
            return self.sql(exp.Add(this=date_expr, expression=time_expr))

        # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
        sec = expression.args.get("sec")
        if sec is None:
            # This shouldn't happen with valid input, but handle it gracefully
            return rename_func("MAKE_TIMESTAMP")(self, expression)

        milli = expression.args.get("milli")
        if milli is not None:
            sec += milli.pop() / exp.Literal.number(1000.0)

        nano = expression.args.get("nano")
        if nano is not None:
            sec += nano.pop() / exp.Literal.number(1000000000.0)

        if milli or nano:
            expression.set("sec", sec)

        return rename_func("MAKE_TIMESTAMP")(self, expression)
    @unsupported_args("nano")
    def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str:
        # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)
        return f"CAST({timestamp} AS TIMESTAMPTZ)"
    @unsupported_args("nano")
    def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str:
        # Extract the zone before popping
        zone = expression.args.get("zone")
        # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if zone:
            zone = zone.pop()

        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)

        if zone:
            # Use AT TIME ZONE to apply the explicit timezone
            return f"{timestamp} AT TIME ZONE {self.sql(zone)}"

        return timestamp
    def tablesample_sql(
        self,
        expression: exp.TableSample,
        tablesample_keyword: str | None = None,
    ) -> str:
        if not isinstance(expression.parent, exp.Select):
            # This sample clause only applies to a single source, not the entire resulting relation
            tablesample_keyword = "TABLESAMPLE"

        if expression.args.get("size"):
            method = expression.args.get("method")
            if method and method.name.upper() != "RESERVOIR":
                self.unsupported(
                    f"Sampling method {method} is not supported with a discrete sample count, "
                    "defaulting to reservoir sampling"
                )
                expression.set("method", exp.var("RESERVOIR"))

        return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)
    def join_sql(self, expression: exp.Join) -> str:
        if (
            not expression.args.get("using")
            and not expression.args.get("on")
            and not expression.method
            and (expression.kind in ("", "INNER", "OUTER"))
        ):
            # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause.
            # DuckDB doesn't, but we can just add a dummy ON clause that is always true
            if isinstance(expression.this, exp.Unnest):
                return super().join_sql(expression.on(exp.true()))

            expression.set("side", None)
            expression.set("kind", None)

        return super().join_sql(expression)
    def bracket_sql(self, expression: exp.Bracket) -> str:
        if self.dialect.version >= (1, 2):
            return super().bracket_sql(expression)

        # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
        this = expression.this
        if isinstance(this, exp.Array):
            this.replace(exp.paren(this))

        bracket = super().bracket_sql(expression)

        if not expression.args.get("returns_list_for_maps"):
            if not this.type:
                from sqlglot.optimizer.annotate_types import annotate_types

                this = annotate_types(this, dialect=self.dialect)

            if this.is_type(exp.DType.MAP):
                bracket = f"({bracket})[1]"

        return bracket
    def withingroup_sql(self, expression: exp.WithinGroup) -> str:
        func = expression.this

        # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP.
        # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
        if isinstance(func, exp.ArrayAgg):
            if not isinstance(order := expression.expression, exp.Order):
                return self.sql(func)

            # Save the original column for the FILTER clause (before wrapping with Order)
            original_this = func.this

            # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order:
            # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
            func.set(
                "this",
                exp.Order(
                    this=func.this.copy(),
                    expressions=order.expressions,
                ),
            )

            # Generate the ARRAY_AGG function with ORDER BY and add a FILTER clause if needed.
            # Use original_this (not the Order-wrapped version) for the FILTER condition
            array_agg_sql = self.function_fallback_sql(func)
            return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)

        # For other functions (like PERCENTILES), use the existing logic
        expression_sql = self.sql(expression, "expression")

        if isinstance(func, exp.PERCENTILES):
            # Make the order key the first arg and slide the fraction to the right
            # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
            order_col = expression.find(exp.Ordered)
            if order_col:
                func.set("expression", func.this)
                func.set("this", order_col.this)

        this = self.sql(expression, "this").rstrip(")")

        return f"{this}{expression_sql})"
    def length_sql(self, expression: exp.Length) -> str:
        arg = expression.this

        # Dialects like BQ and Snowflake also accept binary values as args, so
        # DDB will attempt to infer the type or resort to case/when resolution
        if not expression.args.get("binary") or arg.is_string:
            return self.func("LENGTH", arg)

        if not arg.type:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.is_type(*exp.DataType.TEXT_TYPES):
            return self.func("LENGTH", arg)

        # We need these casts to make duckdb's static type checker happy
        blob = exp.cast(arg, exp.DType.VARBINARY)
        varchar = exp.cast(arg, exp.DType.VARCHAR)

        case = (
            exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg]))
            .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob))
            .else_(exp.Anonymous(this="LENGTH", expressions=[varchar]))
        )
        return self.sql(case)
    def collate_sql(self, expression: exp.Collate) -> str:
        if not expression.expression.is_string:
            return super().collate_sql(expression)

        raw = expression.expression.name
        if not raw:
            return self.sql(expression.this)

        parts = []
        for part in raw.split("-"):
            lower = part.lower()
            if lower not in _SNOWFLAKE_COLLATION_DEFAULTS:
                if lower in _SNOWFLAKE_COLLATION_UNSUPPORTED:
                    self.unsupported(
                        f"Snowflake collation specifier '{part}' has no DuckDB equivalent"
                    )
                parts.append(lower)

        if not parts:
            return self.sql(expression.this)

        return super().collate_sql(
            exp.Collate(this=expression.this, expression=exp.var(".".join(parts)))
        )
    def regexpcount_sql(self, expression: exp.RegexpCount) -> str:
        this = expression.this
        pattern = expression.expression
        position = expression.args.get("position")
        parameters = expression.args.get("parameters")

        # Validate flags - only "ims" flags are supported for embedded patterns
        validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")

        if position:
            this = exp.Substring(this=this, start=position)

        # Embed flags in the pattern (REGEXP_EXTRACT_ALL doesn't support a flags argument)
        if validated_flags:
            pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern])

        # Handle an empty pattern: Snowflake returns 0, DuckDB would match between every character
        result = (
            exp.case()
            .when(
                exp.EQ(this=pattern, expression=exp.Literal.string("")),
                exp.Literal.number(0),
            )
            .else_(
                exp.Length(
                    this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])
                )
            )
        )

        return self.sql(result)
    def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str:
        subject = expression.this
        pattern = expression.expression
        replacement = expression.args.get("replacement") or exp.Literal.string("")
        position = expression.args.get("position")
        occurrence = expression.args.get("occurrence")
        modifiers = expression.args.get("modifiers")

        validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or ""

        # Handle occurrence (only literals supported)
        if occurrence and not occurrence.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal occurrence")
        else:
            occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0
            if occurrence > 1:
                self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported")
            # Tell DuckDB to replace either all occurrences or none; the
            # single_replace check keeps DuckDB -> DuckDB round trips intact
            elif (
                occurrence == 0
                and "g" not in validated_flags
                and not expression.args.get("single_replace")
            ):
                validated_flags += "g"

        # Handle position (only literals supported)
        prefix = None
        if position and not position.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal position")
        elif position and position.is_int and position.to_py() > 1:
            pos = position.to_py()
            prefix = exp.Substring(
                this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1)
            )
            subject = exp.Substring(this=subject, start=exp.Literal.number(pos))

        result: exp.Expr = exp.Anonymous(
            this="REGEXP_REPLACE",
            expressions=[
                subject,
                pattern,
                replacement,
                exp.Literal.string(validated_flags) if validated_flags else None,
            ],
        )

        if prefix:
            result = exp.Concat(expressions=[prefix, result])

        return self.sql(result)
    def regexplike_sql(self, expression: exp.RegexpLike) -> str:
        this = expression.this
        pattern = expression.expression
        flag = expression.args.get("flag")

        if expression.args.get("full_match"):
            validated_flags = self._validate_regexp_flags(flag, supported_flags="cims")
            flag = exp.Literal.string(validated_flags) if validated_flags else None
            return self.func("REGEXP_FULL_MATCH", this, pattern, flag)

        return self.func("REGEXP_MATCHES", this, pattern, flag)
    @unsupported_args("ins_cost", "del_cost", "sub_cost")
    def levenshtein_sql(self, expression: exp.Levenshtein) -> str:
        this = expression.this
        expr = expression.expression
        max_dist = expression.args.get("max_dist")

        if max_dist is None:
            return self.func("LEVENSHTEIN", this, expr)

        # Emulate Snowflake semantics: if distance > max_dist, return max_dist
        levenshtein = exp.Levenshtein(this=this, expression=expr)
        return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
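    # e.g. with max_dist = 3 the output becomes LEAST(LEVENSHTEIN(a, b), 3), which
    # clamps the reported distance at max_dist just like Snowflake does.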
    def pad_sql(self, expression: exp.Pad) -> str:
        """
        Handle RPAD/LPAD for VARCHAR and BINARY types.

        For VARCHAR: Delegate to the parent class
        For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input)))
        """
        string_arg = expression.this
        fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ")

        if _is_binary(string_arg) or _is_binary(fill_arg):
            length_arg = expression.expression
            is_left = expression.args.get("is_left")

            input_len = exp.ByteLength(this=string_arg)
            chars_needed = length_arg - input_len
            pad_count = exp.Greatest(
                this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True
            )
            repeat_expr = exp.Repeat(this=fill_arg, times=pad_count)

            left, right = string_arg, repeat_expr
            if is_left:
                left, right = right, left

            result = exp.DPipe(this=left, expression=right)
            return self.sql(result)

        # For VARCHAR: Delegate to the parent class (handles PAD_FILL_PATTERN_IS_REQUIRED)
        return super().pad_sql(expression)
    def minhash_sql(self, expression: exp.Minhash) -> str:
        k = expression.this
        exprs = expression.expressions

        if len(exprs) != 1 or isinstance(exprs[0], exp.Star):
            self.unsupported(
                "MINHASH with multiple expressions or * requires manual query restructuring"
            )
            return self.func("MINHASH", k, *exprs)

        expr = exprs[0]
        result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k)
        return f"({self.sql(result)})"
    def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str:
        arr = expression.this
        func = self.func("LIST_DISTINCT", arr)

        if expression.args.get("check_null"):
            add_null_to_array = exp.func(
                "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null()
            )
            return self.sql(
                exp.If(
                    this=exp.NEQ(
                        this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr)
                    ),
                    true=add_null_to_array,
                    false=func,
                )
            )

        return func
    def arrayintersect_sql(self, expression: exp.ArrayIntersect) -> str:
        if expression.args.get("is_multiset") and len(expression.expressions) == 2:
            return self._array_bag_sql(
                self.ARRAY_INTERSECTION_CONDITION,
                expression.expressions[0],
                expression.expressions[1],
            )
        return self.function_fallback_sql(expression)
    def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str:
        arr1, arr2 = expression.this, expression.expression
        if expression.args.get("is_multiset"):
            return self._array_bag_sql(self.ARRAY_EXCEPT_CONDITION, arr1, arr2)
        return self.sql(
            exp.replace_placeholders(self.ARRAY_EXCEPT_SET_TEMPLATE, arr1=arr1, arr2=arr2)
        )
    def arrayslice_sql(self, expression: exp.ArraySlice) -> str:
        """
        Transpiles Snowflake's ARRAY_SLICE (0-indexed, exclusive end) to DuckDB's
        ARRAY_SLICE (1-indexed, inclusive end) by wrapping start and end in CASE
        expressions that adjust the index at query time:
        - start: CASE WHEN start >= 0 THEN start + 1 ELSE start END
        - end: CASE WHEN end < 0 THEN end - 1 ELSE end END
        """
        start, end = expression.args.get("start"), expression.args.get("end")

        if expression.args.get("zero_based"):
            if start is not None:
                start = (
                    exp.case()
                    .when(
                        exp.GTE(this=start.copy(), expression=exp.Literal.number(0)),
                        exp.Add(this=start.copy(), expression=exp.Literal.number(1)),
                    )
                    .else_(start)
                )
            if end is not None:
                end = (
                    exp.case()
                    .when(
                        exp.LT(this=end.copy(), expression=exp.Literal.number(0)),
                        exp.Sub(this=end.copy(), expression=exp.Literal.number(1)),
                    )
                    .else_(end)
                )

        return self.func("ARRAY_SLICE", expression.this, start, end, expression.args.get("step"))
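    # Worked example of the index adjustment: for Snowflake's ARRAY_SLICE(arr, 0, 2)
    # (elements 0 and 1), the generated CASE expressions evaluate to start = 1 and
    # end = 2, so DuckDB's 1-indexed, inclusive slice returns the same two elements.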
    def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
        args = expression.expressions

        if not args:
            # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
            return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))

        # Build placeholder values for the template
        lengths = [exp.Length(this=arg) for arg in args]
        max_len = (
            lengths[0]
            if len(lengths) == 1
            else exp.Greatest(this=lengths[0], expressions=lengths[1:])
        )

        # Empty struct with the same schema: {'$1': NULL, '$2': NULL, ...}
        empty_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
                for i in range(len(args))
            ],
        )

        # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
        # COALESCE wrapping handles NULL arrays - it prevents invalid NULL[i] syntax
        index = exp.column("__i") + 1
        transform_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(
                    this=exp.Literal.string(f"${i + 1}"),
                    expression=exp.func("COALESCE", arg, exp.array())[index],
                )
                for i, arg in enumerate(args)
            ],
        )

        result = exp.replace_placeholders(
            self.ARRAYS_ZIP_TEMPLATE.copy(),
            null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
            all_empty_check=exp.and_(
                *[
                    exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
                    for arg in args
                ]
            ),
            empty_struct=empty_struct,
            max_len=max_len,
            transform_struct=transform_struct,
        )
        return self.sql(result)
    def stuff_sql(self, expression: exp.Stuff) -> str:
        base = expression.this
        start = expression.args["start"]
        length = expression.args["length"]
        insertion = expression.expression
        is_binary = _is_binary(base)

        if is_binary:
            # DuckDB's SUBSTRING doesn't accept BLOB; operate on the HEX string instead
            # (each byte = 2 hex chars), then UNHEX back to BLOB
            base = exp.Hex(this=base)
            insertion = exp.Hex(this=insertion)
            left = exp.Substring(
                this=base.copy(),
                start=exp.Literal.number(1),
                length=(start.copy() - exp.Literal.number(1)) * exp.Literal.number(2),
            )
            right = exp.Substring(
                this=base.copy(),
                start=((start + length) - exp.Literal.number(1)) * exp.Literal.number(2)
                + exp.Literal.number(1),
            )
        else:
            left = exp.Substring(
                this=base.copy(),
                start=exp.Literal.number(1),
                length=start.copy() - exp.Literal.number(1),
            )
            right = exp.Substring(this=base.copy(), start=start + length)

        result: exp.Expr = exp.DPipe(
            this=exp.DPipe(this=left, expression=insertion), expression=right
        )

        if is_binary:
            result = exp.Unhex(this=result)

        return self.sql(result)
    def rand_sql(self, expression: exp.Rand) -> str:
        seed = expression.this
        if seed is not None:
            self.unsupported("RANDOM with seed is not supported in DuckDB")

        lower = expression.args.get("lower")
        upper = expression.args.get("upper")

        if lower and upper:
            # Scale DuckDB's [0, 1) to the specified range
            range_size = exp.paren(upper - lower)
            scaled = exp.Add(this=lower, expression=exp.func("random") * range_size)

            # For now we assume that if bounds are set, the return type is BIGINT,
            # matching Snowflake/Teradata behavior
            result = exp.cast(scaled, exp.DType.BIGINT)
            return self.sql(result)

        # Default DuckDB behavior - just return RANDOM() as a float
        return "RANDOM()"
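    # e.g. bounds (0, 10) yield CAST(0 + RANDOM() * (10 - 0) AS BIGINT), scaling
    # DuckDB's [0, 1) draw into the requested range before the integer cast.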
    def bytelength_sql(self, expression: exp.ByteLength) -> str:
        arg = expression.this

        # Check if it's a text type (handles both literals and annotated expressions)
        if arg.is_type(*exp.DataType.TEXT_TYPES):
            return self.func("OCTET_LENGTH", exp.Encode(this=arg))

        # Default: pass through as-is (conservative for DuckDB, handles binary and unannotated)
        return self.func("OCTET_LENGTH", arg)
    def base64encode_sql(self, expression: exp.Base64Encode) -> str:
        # DuckDB TO_BASE64 requires BLOB input.
        # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
        # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
        result = expression.this

        # Check if the input is a string type - ENCODE only accepts VARCHAR
        if result.is_type(*exp.DataType.TEXT_TYPES):
            result = exp.Encode(this=result)

        result = exp.ToBase64(this=result)

        max_line_length = expression.args.get("max_line_length")
        alphabet = expression.args.get("alphabet")

        # Handle a custom alphabet by replacing standard chars with custom ones
        result = _apply_base64_alphabet_replacements(result, alphabet)

        # Handle max_line_length by inserting newlines every N characters
        line_length = (
            t.cast(int, max_line_length.to_py())
            if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
            else 0
        )
        if line_length > 0:
            newline = exp.Chr(expressions=[exp.Literal.number(10)])
            result = exp.Trim(
                this=exp.RegexpReplace(
                    this=result,
                    expression=exp.Literal.string(f"(.{{{line_length}}})"),
                    replacement=exp.Concat(expressions=[exp.Literal.string("\\1"), newline.copy()]),
                ),
                expression=newline,
                position="TRAILING",
            )

        return self.sql(result)
    def hex_sql(self, expression: exp.Hex) -> str:
        case = expression.args.get("case")

        if not case:
            return self.func("HEX", expression.this)

        hex_expr = exp.Hex(this=expression.this)
        return self.sql(
            exp.case()
            .when(case.is_(exp.null()), exp.null())
            .when(case.copy().eq(0), exp.Lower(this=hex_expr.copy()))
            .else_(hex_expr)
        )
    def replace_sql(self, expression: exp.Replace) -> str:
        result_sql = self.func(
            "REPLACE",
            _cast_to_varchar(expression.this),
            _cast_to_varchar(expression.expression),
            _cast_to_varchar(expression.args.get("replacement")),
        )
        return _gen_with_cast_to_blob(self, expression, result_sql)
    def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
        this = expression.this
        key = expression.args.get("key")
        key_sql = key.name if isinstance(key, exp.Expr) else ""
        value_sql = self.sql(expression, "value")

        kv_sql = f"{key_sql} := {value_sql}"

        # If the input struct is empty, e.g. when transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value)
        # from Snowflake, we generate STRUCT_PACK to build it, since STRUCT_INSERT({}, key := value)
        # is not valid DuckDB
        if isinstance(this, exp.Struct) and not this.expressions:
            return self.func("STRUCT_PACK", kv_sql)

        return self.func("STRUCT_INSERT", this, kv_sql)
    def mapdelete_sql(self, expression: exp.MapDelete) -> str:
        map_arg = expression.this
        keys_to_delete = expression.expressions

        x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key"))

        lambda_expr = exp.Lambda(
            this=exp.In(this=x_dot_key, expressions=keys_to_delete).not_(),
            expressions=[exp.to_identifier("x")],
        )
        result = exp.func(
            "MAP_FROM_ENTRIES",
            exp.ArrayFilter(this=exp.func("MAP_ENTRIES", map_arg), expression=lambda_expr),
        )
        return self.sql(result)
    def mappick_sql(self, expression: exp.MapPick) -> str:
        map_arg = expression.this
        keys_to_pick = expression.expressions

        x_dot_key = exp.Dot(this=exp.to_identifier("x"), expression=exp.to_identifier("key"))

        if len(keys_to_pick) == 1 and keys_to_pick[0].is_type(exp.DType.ARRAY):
            lambda_expr = exp.Lambda(
                this=exp.func("ARRAY_CONTAINS", keys_to_pick[0], x_dot_key),
                expressions=[exp.to_identifier("x")],
            )
        else:
            lambda_expr = exp.Lambda(
                this=exp.In(this=x_dot_key, expressions=keys_to_pick),
                expressions=[exp.to_identifier("x")],
            )

        result = exp.func(
            "MAP_FROM_ENTRIES",
            exp.func("LIST_FILTER", exp.func("MAP_ENTRIES", map_arg), lambda_expr),
        )
        return self.sql(result)
    @unsupported_args("update_flag")
    def mapinsert_sql(self, expression: exp.MapInsert) -> str:
        map_arg = expression.this
        key = expression.args.get("key")
        value = expression.args.get("value")

        map_type = map_arg.type

        if value is not None:
            if map_type and map_type.expressions and len(map_type.expressions) > 1:
                # Extract the value type from MAP(key_type, value_type)
                value_type = map_type.expressions[1]
                # Cast the value to match the map's value type to avoid type conflicts
                value = exp.cast(value, value_type)
            # else: polymorphic MAP case - no type parameters available, use the value as-is

        # Create a single-entry map for the new key-value pair
        new_entry_struct = exp.Struct(expressions=[exp.PropertyEQ(this=key, expression=value)])
        new_entry: exp.Expression = exp.ToMap(this=new_entry_struct)

        # Use MAP_CONCAT to merge the original map with the new entry.
        # This automatically handles both the insert and update cases
        result = exp.func("MAP_CONCAT", map_arg, new_entry)

        return self.sql(result)
    def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
        # For GENERATOR, unwrap TABLE() - just emit the Generator (it becomes RANGE)
        if isinstance(expression.this, exp.Generator):
            # Preserve the alias, joins, and other table-level args
            table = exp.Table(
                this=expression.this,
                alias=expression.args.get("alias"),
                joins=expression.args.get("joins"),
            )
            return self.sql(table)

        return super().tablefromrows_sql(expression)
    def unnest_sql(self, expression: exp.Unnest) -> str:
        explode_array = expression.args.get("explode_array")
        if explode_array:
            # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct.
            # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))"
            expression.expressions.append(
                exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2))
            )

            # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB
            alias = expression.args.get("alias")
            if isinstance(alias, exp.TableAlias):
                expression.set("alias", None)
                if alias.columns:
                    alias = exp.TableAlias(this=seq_get(alias.columns, 0))

            unnest_sql = super().unnest_sql(expression)
            select = exp.Select(expressions=[unnest_sql]).subquery(alias)
            return self.sql(select)

        return super().unnest_sql(expression)
    def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
        this = expression.this

        if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
            # DuckDB should render IGNORE NULLS only for the general-purpose
            # window functions that accept it, e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...)
            return super().ignorenulls_sql(expression)

        if isinstance(this, exp.First):
            this = exp.AnyValue(this=this.this)

        if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
            self.unsupported("IGNORE NULLS is not supported for non-window functions.")

        return self.sql(this)
    def split_sql(self, expression: exp.Split) -> str:
        base_func = exp.func("STR_SPLIT", expression.this, expression.expression)

        case_expr = exp.case().else_(base_func)
        needs_case = False

        if expression.args.get("null_returns_null"):
            case_expr = case_expr.when(expression.expression.is_(exp.null()), exp.null())
            needs_case = True

        if expression.args.get("empty_delimiter_returns_whole"):
            # When the delimiter is an empty string, return the input string as a single array element
            array_with_input = exp.array(expression.this)
            case_expr = case_expr.when(
                expression.expression.eq(exp.Literal.string("")), array_with_input
            )
            needs_case = True

        return self.sql(case_expr if needs_case else base_func)
    def splitpart_sql(self, expression: exp.SplitPart) -> str:
        string_arg = expression.this
        delimiter_arg = expression.args.get("delimiter")
        part_index_arg = expression.args.get("part_index")

        if delimiter_arg and part_index_arg:
            # Handle Snowflake's "index 0 and 1 both return the first element" behavior
            if expression.args.get("part_index_zero_as_one"):
                # Convert 0 to 1 for compatibility
                part_index_arg = exp.Paren(
                    this=exp.case()
                    .when(part_index_arg.eq(exp.Literal.number("0")), exp.Literal.number("1"))
                    .else_(part_index_arg)
                )

            # Use Anonymous to avoid recursion
            base_func_expr: exp.Expr = exp.Anonymous(
                this="SPLIT_PART", expressions=[string_arg, delimiter_arg, part_index_arg]
            )
            needs_case_transform = False
            case_expr = exp.case().else_(base_func_expr)

            if expression.args.get("empty_delimiter_returns_whole"):
                # When the delimiter is an empty string:
                # - Return the whole string if part_index is 1 or -1
                # - Return an empty string otherwise
                empty_case = exp.Paren(
                    this=exp.case()
                    .when(
                        exp.or_(
                            part_index_arg.eq(exp.Literal.number("1")),
                            part_index_arg.eq(exp.Literal.number("-1")),
                        ),
                        string_arg,
                    )
                    .else_(exp.Literal.string(""))
                )

                case_expr = case_expr.when(delimiter_arg.eq(exp.Literal.string("")), empty_case)
                needs_case_transform = True

            # The output looks something like this:
            #
            #   CASE
            #     WHEN delimiter = '' THEN
            #       (CASE
            #          WHEN adjusted_part_index = 1 OR adjusted_part_index = -1 THEN input
            #          ELSE '' END)
            #     ELSE SPLIT_PART(input, delimiter, adjusted_part_index)
            #   END
            return self.sql(case_expr if needs_case_transform else base_func_expr)

        return self.function_fallback_sql(expression)
    def respectnulls_sql(self, expression: exp.RespectNulls) -> str:
        if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
            # DuckDB should render RESPECT NULLS only for the general-purpose
            # window functions that accept it, e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...)
            return super().respectnulls_sql(expression)

        self.unsupported("RESPECT NULLS is not supported for non-window functions.")
        return self.sql(expression, "this")
    def arraytostring_sql(self, expression: exp.ArrayToString) -> str:
        null = expression.args.get("null")

        if expression.args.get("null_is_empty"):
            x = exp.to_identifier("x")
            list_transform = exp.Transform(
                this=expression.this.copy(),
                expression=exp.Lambda(
                    this=exp.Coalesce(
                        this=exp.cast(x, "TEXT"), expressions=[exp.Literal.string("")]
                    ),
                    expressions=[x],
                ),
            )
            array_to_string = exp.ArrayToString(
                this=list_transform, expression=expression.expression
            )
            if expression.args.get("null_delim_is_null"):
                return self.sql(
                    exp.case()
                    .when(expression.expression.copy().is_(exp.null()), exp.null())
                    .else_(array_to_string)
                )
            return self.sql(array_to_string)

        if null:
            x = exp.to_identifier("x")
            return self.sql(
                exp.ArrayToString(
                    this=exp.Transform(
                        this=expression.this,
                        expression=exp.Lambda(
                            this=exp.Coalesce(this=x, expressions=[null]),
                            expressions=[x],
                        ),
                    ),
                    expression=expression.expression,
                )
            )

        return self.func("ARRAY_TO_STRING", expression.this, expression.expression)
    def concatws_sql(self, expression: exp.ConcatWs) -> str:
        # DuckDB-specific: handle binary types using the DPipe (||) operator
        separator = seq_get(expression.expressions, 0)
        args = expression.expressions[1:]

        if any(_is_binary(arg) for arg in [separator, *args]):
            result = args[0]
            for arg in args[1:]:
                result = exp.DPipe(
                    this=exp.DPipe(this=result, expression=separator), expression=arg
                )
            return self.sql(result)

        return super().concatws_sql(expression)
    def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str:
        this = expression.this
        pattern = expression.expression
        position = expression.args.get("position")
        orig_occ = expression.args.get("occurrence")
        occurrence = orig_occ or exp.Literal.number(1)
        option = expression.args.get("option")
        parameters = expression.args.get("parameters")

        validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")
        if validated_flags:
            pattern = exp.Concat(expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern])

        # Handle the starting position offset
        pos_offset: exp.Expr = exp.Literal.number(0)
        if position and (not position.is_int or position.to_py() > 1):
            this = exp.Substring(this=this, start=position)
            pos_offset = position - exp.Literal.number(1)

        # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x)))
        def sum_lengths(func_name: str, end: exp.Expr) -> exp.Expr:
            lst = exp.Bracket(
                this=exp.Anonymous(this=func_name, expressions=[this, pattern]),
                expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)],
                offset=1,
            )
            transform = exp.Anonymous(
                this="LIST_TRANSFORM",
                expressions=[
                    lst,
                    exp.Lambda(
                        this=exp.Length(this=exp.to_identifier("x")),
                        expressions=[exp.to_identifier("x")],
                    ),
                ],
            )
            return exp.Coalesce(
                this=exp.Anonymous(this="LIST_SUM", expressions=[transform]),
                expressions=[exp.Literal.number(0)],
            )

        # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset
        base_pos: exp.Expr = (
            exp.Literal.number(1)
            + sum_lengths("STRING_SPLIT_REGEX", occurrence)
            + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1))
            + pos_offset
        )

        # option=1: add the match length to get the end position
        if option and option.is_int and option.to_py() == 1:
            match_at_occ = exp.Bracket(
                this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]),
                expressions=[occurrence],
                offset=1,
            )
            base_pos = base_pos + exp.Coalesce(
                this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)]
            )

        # NULL checks for all provided arguments.
        # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST
        null_args = [
            expression.this,
            expression.expression,
            position,
            orig_occ,
            option,
            parameters,
        ]
        null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg]

        matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])

        return self.sql(
            exp.case()
            .when(exp.or_(*null_checks), exp.Null())
            .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0))
            .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0))
            .else_(base_pos)
        )
    @unsupported_args("culture")
    def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
        fmt = expression.args.get("format")
        if fmt and fmt.is_int:
            return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this)

        self.unsupported("Only integer formats are supported by NumberToStr")
        return self.function_fallback_sql(expression)
    def posexplode_sql(self, expression: exp.Posexplode) -> str:
        this = expression.this
        parent = expression.parent

        # The default Spark aliases are "pos" and "col", unless specified otherwise
        pos, col = exp.to_identifier("pos"), exp.to_identifier("col")

        if isinstance(parent, exp.Aliases):
            # Column case: SELECT POSEXPLODE(col) [AS (a, b)]
            pos, col = parent.expressions
        elif isinstance(parent, exp.Table):
            # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)]
            alias = parent.args.get("alias")
            if alias:
                pos, col = alias.columns or [pos, col]
                alias.pop()

        # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS.
        # Note: in Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS
        unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col))
        gen_subscripts = self.sql(
            exp.Alias(
                this=exp.Anonymous(
                    this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)]
                )
                - exp.Literal.number(1),
                alias=pos,
            )
        )

        posexplode_sql = self.format_args(gen_subscripts, unnest_sql)

        if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)):
            # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...))
            return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql])))

        return posexplode_sql
    def addmonths_sql(self, expression: exp.AddMonths) -> str:
        """
        Handles three key issues:
        1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
        2. End-of-month preservation: If the input is the last day of a month, the result is the last day of the result month
        3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
        """
        from sqlglot.optimizer.annotate_types import annotate_types

        this = expression.this
        if not this.type:
            this = annotate_types(this, dialect=self.dialect)

        if this.is_type(*exp.DataType.TEXT_TYPES):
            this = exp.Cast(this=this, to=exp.DataType(this=exp.DType.TIMESTAMP))

        # Detect float/decimal months to apply rounding (Snowflake behavior).
        # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
        months_expr = expression.expression
        if not months_expr.type:
            months_expr = annotate_types(months_expr, dialect=self.dialect)

        # Build an interval or to_months expression based on the type.
        # Float/decimal case: round and use TO_MONTHS(CAST(ROUND(value) AS INT))
        interval_or_to_months = (
            exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
            if months_expr.is_type(
                exp.DType.FLOAT,
                exp.DType.DOUBLE,
                exp.DType.DECIMAL,
            )
            # Integer case: standard INTERVAL N MONTH syntax
            else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
        )

        date_add_expr = exp.Add(this=this, expression=interval_or_to_months)

        # Apply end-of-month preservation if the Snowflake flag is set:
        # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
        preserve_eom = expression.args.get("preserve_end_of_month")
        result_expr = (
            exp.case()
            .when(
                exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
                exp.func("LAST_DAY", date_add_expr),
            )
            .else_(date_add_expr)
            if preserve_eom
            else date_add_expr
        )

        # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE.
        # To match, for example, Snowflake's ADD_MONTHS behavior (which preserves the input type),
        # we need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ.
        # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
        if this.is_type(exp.DType.DATE, exp.DType.TIMESTAMPTZ):
            return self.sql(exp.Cast(this=result_expr, to=this.type))
        return self.sql(result_expr)
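    # Worked example of end-of-month preservation: with the flag set,
    # ADD_MONTHS('2023-01-31'::DATE, 1) emits roughly
    #   CASE WHEN LAST_DAY(d) = d THEN LAST_DAY(d + INTERVAL '1' MONTH)
    #        ELSE d + INTERVAL '1' MONTH END
    # so January 31 lands on 2023-02-28, and the result is cast back to DATE.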
    def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
        unit = expression.args.get("unit")
        date = expression.this

        week_start = _week_unit_to_dow(unit)
        unit = unit_to_str(expression)

        if week_start:
            result = self.sql(
                _build_week_trunc_expression(date, week_start, preserve_start_day=True)
            )
        else:
            result = self.func("DATE_TRUNC", unit, date)

        if (
            expression.args.get("input_type_preserved")
            and date.is_type(*exp.DataType.TEMPORAL_TYPES)
            and not (is_date_unit(unit) and date.is_type(exp.DType.DATE))
        ):
            return self.sql(exp.Cast(this=result, to=date.type))

        return result
    def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
        unit = unit_to_str(expression)
        zone = expression.args.get("zone")
        timestamp = expression.this
        date_unit = is_date_unit(unit)

        if date_unit and zone:
            # BigQuery's TIMESTAMP_TRUNC with a timezone truncates in the target timezone and returns UTC.
            # A double AT TIME ZONE is needed for BigQuery compatibility:
            # 1. The first AT TIME ZONE ensures truncation happens in the target timezone
            # 2. The second AT TIME ZONE converts the DATE result back to TIMESTAMPTZ (preserving the time component)
            timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
            result_sql = self.func("DATE_TRUNC", unit, timestamp)
            return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))

        result = self.func("DATE_TRUNC", unit, timestamp)
        if expression.args.get("input_type_preserved"):
            if timestamp.type and timestamp.is_type(exp.DType.TIME, exp.DType.TIMETZ):
                dummy_date = exp.Cast(
                    this=exp.Literal.string("1970-01-01"),
                    to=exp.DataType(this=exp.DType.DATE),
                )
                date_time = exp.Add(this=dummy_date, expression=timestamp)
                result = self.func("DATE_TRUNC", unit, date_time)
                return self.sql(exp.Cast(this=result, to=timestamp.type))

            if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
                date_unit and timestamp.is_type(exp.DType.DATE)
            ):
                return self.sql(exp.Cast(this=result, to=timestamp.type))

        return result
    def trim_sql(self, expression: exp.Trim) -> str:
        expression.this.replace(_cast_to_varchar(expression.this))
        if expression.expression:
            expression.expression.replace(_cast_to_varchar(expression.expression))

        result_sql = super().trim_sql(expression)
        return _gen_with_cast_to_blob(self, expression, result_sql)
def round_sql(self, expression: exp.Round) -> str:
    this = expression.this
    decimals = expression.args.get("decimals")
    truncate = expression.args.get("truncate")

    # DuckDB requires the scale (decimals) argument to be an INT.
    # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
    if decimals is not None and expression.args.get("casts_non_integer_decimals"):
        if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
            decimals = exp.cast(decimals, exp.DType.INT)

    func = "ROUND"
    if truncate:
        # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
        if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
            func = "ROUND_EVEN"
            truncate = None
        # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
        elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
            truncate = None

    return self.func(func, this, decimals, truncate)
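For example (hedged; this assumes the Snowflake reader populates the rounding-mode argument as shown in the branches above):

import sqlglot

print(sqlglot.transpile("SELECT ROUND(x, 2, 'HALF_TO_EVEN')", read="snowflake", write="duckdb")[0])
# Roughly: SELECT ROUND_EVEN(x, 2)

print(sqlglot.transpile("SELECT ROUND(x, 2, 'HALF_AWAY_FROM_ZERO')", read="snowflake", write="duckdb")[0])
# Roughly: SELECT ROUND(x, 2) -- the mode argument is simply dropped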
def strtok_sql(self, expression: exp.Strtok) -> str:
    string_arg = expression.this
    delimiter_arg = expression.args.get("delimiter")
    part_index_arg = expression.args.get("part_index")

    if delimiter_arg and part_index_arg:
        # Escape regex chars and build a character class at runtime using REGEXP_REPLACE
        escaped_delimiter = exp.Anonymous(
            this="REGEXP_REPLACE",
            expressions=[
                delimiter_arg,
                exp.Literal.string(r"([\[\]^.\-*+?(){}|$\\])"),  # Escape problematic regex chars
                exp.Literal.string(r"\\\1"),  # Replace with the escaped version via a backreference to the captured char
                exp.Literal.string("g"),  # Global flag
            ],
        )
        # CASE WHEN delimiter = '' THEN '' ELSE CONCAT('[', escaped_delimiter, ']') END
        regex_pattern = (
            exp.case()
            .when(delimiter_arg.eq(exp.Literal.string("")), exp.Literal.string(""))
            .else_(
                exp.func(
                    "CONCAT",
                    exp.Literal.string("["),
                    escaped_delimiter,
                    exp.Literal.string("]"),
                )
            )
        )

        # STRTOK skips empty strings, so we need to filter them out:
        # LIST_FILTER(REGEXP_SPLIT_TO_ARRAY(string, pattern), x -> x != '')[index]
        split_array = exp.func("REGEXP_SPLIT_TO_ARRAY", string_arg, regex_pattern)
        x = exp.to_identifier("x")
        is_empty = x.eq(exp.Literal.string(""))
        filtered_array = exp.func(
            "LIST_FILTER",
            split_array,
            exp.Lambda(this=exp.not_(is_empty.copy()), expressions=[x.copy()]),
        )
        base_func = exp.Bracket(
            this=filtered_array,
            expressions=[part_index_arg],
            offset=1,
        )

        # Use the template with the built regex pattern
        result = exp.replace_placeholders(
            self.STRTOK_TEMPLATE.copy(),
            string=string_arg,
            delimiter=delimiter_arg,
            part_index=part_index_arg,
            base_func=base_func,
        )

        return self.sql(result)

    return self.function_fallback_sql(expression)
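A usage sketch (STRTOK_TEMPLATE is a class-level template not shown in this excerpt, so the output below is indicative of the core shape only):

import sqlglot

print(sqlglot.transpile("SELECT STRTOK('a.b.c', '.', 2)", read="snowflake", write="duckdb")[0])
# Core shape: LIST_FILTER(REGEXP_SPLIT_TO_ARRAY('a.b.c', <escaped char class>), x -> NOT x = '')[2]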
def strtoktoarray_sql(self, expression: exp.StrtokToArray) -> str:
    string_arg = expression.this
    delimiter_arg = expression.args.get("expression") or exp.Literal.string(" ")

    escaped = exp.RegexpReplace(
        this=delimiter_arg.copy(),
        expression=exp.Literal.string(r"([\[\]^.\-*+?(){}|$\\])"),
        replacement=exp.Literal.string(r"\\\1"),
        modifiers=exp.Literal.string("g"),
    )
    return self.sql(
        exp.replace_placeholders(
            self.STRTOK_TO_ARRAY_TEMPLATE.copy(),
            string=string_arg,
            delimiter=delimiter_arg,
            escaped=escaped,
        )
    )
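And the array variant, where an omitted delimiter defaults to a single space (again indicative; STRTOK_TO_ARRAY_TEMPLATE is defined elsewhere in the class):

import sqlglot

print(sqlglot.transpile("SELECT STRTOK_TO_ARRAY('a b c')", read="snowflake", write="duckdb")[0])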
def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
    result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile"))

    # DuckDB returns integers for APPROX_QUANTILE, so cast to DOUBLE if the expected type is a real type
    if expression.is_type(*exp.DataType.REAL_TYPES):
        result = f"CAST({result} AS DOUBLE)"

    return result
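For instance (hedged; the cast only fires when the node's annotated type is a real type, so we run type annotation first):

import sqlglot
from sqlglot.optimizer.annotate_types import annotate_types

expr = sqlglot.parse_one(
    "SELECT APPROX_PERCENTILE(CAST(x AS DOUBLE), 0.5) FROM t", read="snowflake"
)
print(annotate_types(expr).sql("duckdb"))
# Roughly: SELECT CAST(APPROX_QUANTILE(CAST(x AS DOUBLE), 0.5) AS DOUBLE) FROM t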
def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
    """
    BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
    dividing the input distribution into n equal-sized buckets.

    Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
    does not document the specific algorithm used, so results may differ. DuckDB does not
    support RESPECT NULLS.
    """
    this = expression.this
    if isinstance(this, exp.Distinct):
        # APPROX_QUANTILES requires 2 args and the DISTINCT node grabs both
        if len(this.expressions) < 2:
            self.unsupported("APPROX_QUANTILES requires a bucket count argument")
            return self.function_fallback_sql(expression)
        num_quantiles_expr = this.expressions[1].pop()
    else:
        num_quantiles_expr = expression.expression

    if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    num_quantiles = t.cast(int, num_quantiles_expr.to_py())
    if num_quantiles <= 0:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    quantiles = [
        exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
        for i in range(num_quantiles + 1)
    ]

    return self.sql(exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles)))
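Sketch, grounded in the bucket computation above (DuckDB's APPROX_QUANTILE natively accepts a list of quantile fractions):

import sqlglot

print(sqlglot.transpile("SELECT APPROX_QUANTILES(x, 4)", read="bigquery", write="duckdb")[0])
# Roughly: SELECT APPROX_QUANTILE(x, [0, 0.25, 0.5, 0.75, 1])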
def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
    this = expression.this

    if _is_binary(this):
        expression.type = exp.DType.BINARY.into_expr()

    arg = _cast_to_bit(this)

    if isinstance(this, exp.Neg):
        arg = exp.Paren(this=arg)

    expression.set("this", arg)

    result_sql = f"~{self.sql(expression, 'this')}"

    return _gen_with_cast_to_blob(self, expression, result_sql)
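Illustrative only (_is_binary, _cast_to_bit, and _gen_with_cast_to_blob are module-internal helpers, so the exact casts depend on the inferred argument type):

import sqlglot

print(sqlglot.transpile("SELECT BITNOT(5)", read="snowflake", write="duckdb")[0])
# Roughly: SELECT ~5 for integer inputs; binary inputs gain BIT/BLOB casts around the ~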
def uuid_sql(self, expression: exp.Uuid) -> str:
    namespace = expression.this
    name = expression.args.get("name")

    # UUID v5 (namespace + name): emulate using SHA1
    if namespace and name:
        result = exp.replace_placeholders(
            self.UUID_V5_TEMPLATE.copy(),
            namespace=namespace,
            name=name,
        )
        return self.sql(result)

    return super().uuid_sql(expression)
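And the UUID v5 emulation (UUID_V5_TEMPLATE is a class template not shown in this excerpt; Snowflake's two-argument UUID_STRING is the v5 form):

import sqlglot

sql = "SELECT UUID_STRING('fe971b24-9572-4005-b22f-351e9c09274d', 'foo')"
print(sqlglot.transpile(sql, read="snowflake", write="duckdb")[0])
# Expands UUID_V5_TEMPLATE (a SHA1-based construction) with the namespace and name;
# the zero-argument form falls back to super().uuid_sql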
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- WINDOW_FUNCS_WITH_NULL_ORDERING
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SUPPORTS_MERGE_WHERE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_ONLY_LITERALS
- GROUPINGS_SEP
- INDEX_ON
- INOUT_SEPARATOR
- DIRECTED_JOINS
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_WITH_METHOD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- SUPPORTS_NAMED_CTE_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_MODIFY_COLUMN
- SUPPORTS_CHANGE_COLUMN
- LIKE_PROPERTY_INSIDE_SCHEMA
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- TRY_SUPPORTED
- SUPPORTS_UESCAPE
- UNICODE_SUBSTITUTE
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- SUPPORTS_BETWEEN_FLAGS
- MATCH_AGAINST_TABLE_PREFIX
- DECLARE_DEFAULT_ASSIGNMENT
- UPDATE_STATEMENT_SUPPORTS_FROM
- STAR_EXCLUDE_REQUIRES_DERIVED_TABLE
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- pseudocolumn_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- inoutcolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- triggerproperties_sql
- triggerreferencing_sql
- triggerevent_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_param_bound_limiter
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- uuidproperty_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- moduleproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- rollupindex_sql
- rollupproperty_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- booland_sql
- boolor_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- prewhere_sql
- where_sql
- partition_by_sql
- windowspec_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- convert_concat_args
- concat_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwiseor_sql
- bitwiserightshift_sql
- cast_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- modifycolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- dropprimarykey_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- eq_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- match_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- sub_sql
- trycast_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- generatetext_sql
- generatetable_sql
- generatebool_sql
- generateint_sql
- generatedouble_sql
- mltranslate_sql
- mlforecast_sql
- aiforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- changes_sql
- summarize_sql
- explodinggenerateseries_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- skipjsoncolumn_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- slice_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql
- directorystage_sql
- initcap_sql
- localtime_sql
- localtimestamp_sql
- weekstart_sql
- block_sql
- storedprocedure_sql
- ifblock_sql
- whileblock_sql
- execute_sql
- executesql_sql
- altermodifysqlsecurity_sql
- usingproperty_sql
- renameindex_sql