sqlglot.dialects.duckdb
1from __future__ import annotations 2 3from decimal import Decimal 4from itertools import groupby 5import re 6import typing as t 7 8from sqlglot import exp, generator, parser, tokens, transforms 9 10from sqlglot.dialects.dialect import ( 11 DATETIME_DELTA, 12 Dialect, 13 JSON_EXTRACT_TYPE, 14 NormalizationStrategy, 15 approx_count_distinct_sql, 16 array_append_sql, 17 array_compact_sql, 18 array_concat_sql, 19 arrow_json_extract_sql, 20 binary_from_function, 21 build_default_decimal_type, 22 build_formatted_time, 23 build_regexp_extract, 24 count_if_to_sum, 25 date_delta_to_binary_interval_op, 26 date_trunc_to_time, 27 datestrtodate_sql, 28 encode_decode_sql, 29 explode_to_unnest_sql, 30 getbit_sql, 31 groupconcat_sql, 32 inline_array_unless_query, 33 jarowinkler_similarity, 34 months_between_sql, 35 no_datetime_sql, 36 no_comment_column_constraint_sql, 37 no_make_interval_sql, 38 no_time_sql, 39 no_timestamp_sql, 40 pivot_column_names, 41 rename_func, 42 remove_from_array_using_filter, 43 strposition_sql, 44 str_to_time_sql, 45 timestrtotime_sql, 46 unit_to_str, 47) 48from sqlglot.generator import unsupported_args 49from sqlglot.helper import is_date_unit, seq_get 50from sqlglot.tokens import TokenType 51from sqlglot.parser import binary_range_parser 52from sqlglot.typing.duckdb import EXPRESSION_METADATA 53 54# Regex to detect time zones in timestamps of the form [+|-]TT[:tt] 55# The pattern matches timezone offsets that appear after the time portion 56TIMEZONE_PATTERN = re.compile(r":\d{2}.*?[+\-]\d{2}(?::\d{2})?") 57 58# Characters that must be escaped when building regex expressions in INITCAP 59REGEX_ESCAPE_REPLACEMENTS = { 60 "\\": "\\\\", 61 "-": r"\-", 62 "^": r"\^", 63 "[": r"\[", 64 "]": r"\]", 65} 66 67# Used to in RANDSTR transpilation 68RANDSTR_CHAR_POOL = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 69RANDSTR_SEED = 123456 70 71# Whitespace control characters that DuckDB must process with `CHR({val})` calls 
WS_CONTROL_CHARS_TO_DUCK = {
    "\u000b": 11,
    "\u001c": 28,
    "\u001d": 29,
    "\u001e": 30,
    "\u001f": 31,
}

# Weekday names mapped to ISO 8601 day-of-week numbers (Monday=1, ..., Sunday=7)
WEEK_START_DAY_TO_DOW = {
    "MONDAY": 1,
    "TUESDAY": 2,
    "WEDNESDAY": 3,
    "THURSDAY": 4,
    "FRIDAY": 5,
    "SATURDAY": 6,
    "SUNDAY": 7,
}

MAX_BIT_POSITION = exp.Literal.number(32768)

# SEQ function constants: the base window expression, the contexts where SEQ cannot
# be transpiled safely, and the byte width implied by each SEQ variant
# (suffix indicates bytes: SEQ1=1, SEQ2=2, etc.)
_SEQ_BASE: exp.Expression = exp.maybe_parse("(ROW_NUMBER() OVER (ORDER BY 1) - 1)")
_SEQ_RESTRICTED = (exp.Where, exp.Having, exp.AggFunc, exp.Order, exp.Select)
_SEQ_BYTE_WIDTH = {exp.Seq1: 1, exp.Seq2: 2, exp.Seq4: 4, exp.Seq8: 8}


def _apply_base64_alphabet_replacements(
    result: exp.Expression,
    alphabet: t.Optional[exp.Expression],
    reverse: bool = False,
) -> exp.Expression:
    """
    Apply base64 alphabet character replacements.

    A base64 alphabet can be 1-3 chars: 1st = index 62 ('+'), 2nd = index 63 ('/'),
    3rd = padding ('='). Since zip truncates to the shorter operand, a 1-char
    alphabet only replaces '+', a 2-char alphabet replaces '+/', and so on.

    Args:
        result: The expression to apply replacements to.
        alphabet: Custom alphabet literal (expected chars for +/=).
        reverse: If False, replace default with custom (encode);
            if True, replace custom with default (decode).
    """
    if isinstance(alphabet, exp.Literal) and alphabet.is_string:
        for default_char, new_char in zip("+/=", alphabet.this):
            if new_char == default_char:
                continue
            find, replace = (new_char, default_char) if reverse else (default_char, new_char)
            result = exp.Replace(
                this=result,
                expression=exp.Literal.string(find),
                replacement=exp.Literal.string(replace),
            )
    return result


def _base64_decode_sql(self: DuckDB.Generator, expression: exp.Expression, to_string: bool) -> str:
    """
    Transpile Snowflake BASE64_DECODE_STRING/BINARY to DuckDB.

    DuckDB's FROM_BASE64() returns a BLOB, so string output is wrapped with
    DECODE(). Custom alphabets are first normalized back to standard base64
    via REPLACE() calls.
    """
    # Normalize a custom alphabet back to the standard '+', '/', '=' characters
    decoded = _apply_base64_alphabet_replacements(
        expression.this, expression.args.get("alphabet"), reverse=True
    )

    # FROM_BASE64 returns BLOB
    decoded = exp.FromBase64(this=decoded)

    return self.sql(exp.Decode(this=decoded) if to_string else decoded)


def _last_day_sql(self: DuckDB.Generator, expression: exp.LastDay) -> str:
    """
    DuckDB's LAST_DAY only supports finding the last day of a month.
    For the other date parts (year, quarter, week) we build equivalent logic.
    """
    date_expr = expression.this
    unit = expression.text("unit")
    unit_upper = unit.upper()

    if not unit or unit_upper == "MONTH":
        # Default behavior - DuckDB's native LAST_DAY
        return self.func("LAST_DAY", date_expr)

    if unit_upper == "YEAR":
        # Last day of year: December 31st of the same year
        return self.sql(
            exp.func(
                "MAKE_DATE",
                exp.func("EXTRACT", "YEAR", date_expr),
                exp.Literal.number(12),
                exp.Literal.number(31),
            )
        )

    if unit_upper == "QUARTER":
        # The last month of the quarter is quarter * 3 (quarter ranges over 1..4);
        # the answer is the last day of that month.
        first_of_last_month = exp.func(
            "MAKE_DATE",
            exp.func("EXTRACT", "YEAR", date_expr),
            exp.Mul(
                this=exp.func("EXTRACT", "QUARTER", date_expr),
                expression=exp.Literal.number(3),
            ),
            exp.Literal.number(1),
        )
        return self.sql(exp.func("LAST_DAY", first_of_last_month))

    if unit_upper == "WEEK":
        # DuckDB DAYOFWEEK: Sunday=0, Monday=1, ..., Saturday=6
        dow = exp.func("EXTRACT", "DAYOFWEEK", date_expr)
        # Days to the last day of the week: (7 - dayofweek) % 7, assuming the
        # week ends on Sunday (Snowflake). Parenthesize for correct precedence.
        days_to_sunday = exp.Mod(
            this=exp.Paren(this=exp.Sub(this=exp.Literal.number(7), expression=dow)),
            expression=exp.Literal.number(7),
        )
        shifted = exp.Add(
            this=date_expr,
            expression=exp.Interval(this=days_to_sunday, unit=exp.var("DAY")),
        )
        return self.sql(exp.cast(shifted, exp.DataType.Type.DATE))

    self.unsupported(f"Unsupported date part '{unit}' in LAST_DAY function")
    return self.function_fallback_sql(expression)


def _is_nanosecond_unit(unit: t.Optional[exp.Expression]) -> bool:
    # NANOSECOND may arrive either as a Var or as a string literal unit
    return isinstance(unit, (exp.Var, exp.Literal)) and unit.name.upper() == "NANOSECOND"


def _handle_nanosecond_diff(
    self: DuckDB.Generator,
    end_time: exp.Expression,
    start_time: exp.Expression,
) -> str:
    """Generate a NANOSECOND diff via EPOCH_NS, since DATE_DIFF doesn't support it."""
    # Build expression tree: EPOCH_NS(end) - EPOCH_NS(start)
    return self.sql(
        exp.Sub(
            this=exp.func("EPOCH_NS", exp.cast(end_time, exp.DataType.Type.TIMESTAMP_NS)),
            expression=exp.func("EPOCH_NS", exp.cast(start_time, exp.DataType.Type.TIMESTAMP_NS)),
        )
    )


def _to_boolean_sql(self: DuckDB.Generator, expression: exp.ToBoolean) -> str:
    """
    Transpile TO_BOOLEAN and TRY_TO_BOOLEAN functions from Snowflake to DuckDB.

    DuckDB's CAST to BOOLEAN supports most of Snowflake's TO_BOOLEAN strings
    except 'on'/'off', so those are handled explicitly with CASE branches.

    For TO_BOOLEAN (safe=False): NaN and INF values cause errors. DuckDB's
    native ERROR() function replicates this with a clear error message.

    For TRY_TO_BOOLEAN (safe=True): DuckDB's TRY_CAST is used instead, which
    returns NULL for invalid inputs rather than raising.
    """
    arg = expression.this

    def upper_varchar() -> exp.Expression:
        # Each CASE branch needs its own copy of UPPER(CAST(arg AS VARCHAR))
        return exp.Upper(this=exp.cast(arg, exp.DataType.Type.VARCHAR))

    case_expr = (
        exp.case()
        # 'on' -> TRUE (case insensitive)
        .when(upper_varchar().eq(exp.Literal.string("ON")), exp.true())
        # 'off' -> FALSE (case insensitive)
        .when(upper_varchar().eq(exp.Literal.string("OFF")), exp.false())
    )

    if expression.args.get("safe", False):
        # TRY_TO_BOOLEAN: handle 'on'/'off' and TRY_CAST everything else
        case_expr = case_expr.else_(exp.func("TRY_CAST", arg, exp.DataType.build("BOOLEAN")))
    else:
        # TO_BOOLEAN: raise on NaN/INF, handle 'on'/'off', regular CAST otherwise
        as_real = exp.func("TRY_CAST", arg, exp.DataType.build("REAL"))
        nan_or_inf = exp.Or(
            this=exp.func("ISNAN", as_real), expression=exp.func("ISINF", as_real)
        )
        case_expr = case_expr.when(
            nan_or_inf,
            exp.func(
                "ERROR",
                exp.Literal.string("TO_BOOLEAN: Non-numeric values NaN and INF are not supported"),
            ),
        ).else_(exp.cast(arg, exp.DataType.Type.BOOLEAN))

    return self.sql(case_expr)


# BigQuery -> DuckDB conversion for the DATE function
def _date_sql(self: DuckDB.Generator, expression: exp.Date) -> str:
    this = expression.this
    zone = self.sql(expression, "zone")

    if zone:
        # BigQuery considers "this" to be at UTC, converts it to the specified
        # time zone and then keeps only the DATE part. To mimic that, we:
        #   (1) Cast to TIMESTAMP to remove DuckDB's local tz
        #   (2) Apply consecutive AtTimeZone calls for the UTC -> zone conversion
        at_utc = exp.AtTimeZone(
            this=exp.cast(this, exp.DataType.Type.TIMESTAMP),
            zone=exp.Literal.string("UTC"),
        )
        this = exp.AtTimeZone(this=at_utc, zone=zone)

    return self.sql(exp.cast(expression=this, to=exp.DataType.Type.DATE))
# BigQuery -> DuckDB conversion for the TIME_DIFF function
def _timediff_sql(self: DuckDB.Generator, expression: exp.TimeDiff) -> str:
    unit = expression.unit

    if _is_nanosecond_unit(unit):
        return _handle_nanosecond_diff(self, expression.expression, expression.this)

    this = exp.cast(expression.this, exp.DataType.Type.TIME)
    expr = exp.cast(expression.expression, exp.DataType.Type.TIME)

    # Although the two dialects share similar signatures, BQ seems to inverse the
    # sign of the result, so the start/end time operands are flipped
    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)


def _date_delta_to_binary_interval_op(
    cast: bool = True,
) -> t.Callable[[DuckDB.Generator, DATETIME_DELTA], str]:
    """
    DuckDB override to handle:
      1. NANOSECOND operations (DuckDB doesn't support INTERVAL ... NANOSECOND)
      2. Float/decimal interval values (DuckDB INTERVAL requires integers)
    """
    base_impl = date_delta_to_binary_interval_op(cast=cast)

    def _duckdb_date_delta_sql(self: DuckDB.Generator, expression: DATETIME_DELTA) -> str:
        unit = expression.unit
        interval_value = expression.expression

        if _is_nanosecond_unit(unit):
            # No INTERVAL ... NANOSECOND in DuckDB: do the math on EPOCH_NS instead
            if isinstance(interval_value, exp.Interval):
                interval_value = interval_value.this

            ts_ns = exp.cast(expression.this, exp.DataType.Type.TIMESTAMP_NS)
            return self.sql(
                exp.func(
                    "MAKE_TIMESTAMP_NS",
                    exp.Add(this=exp.func("EPOCH_NS", ts_ns), expression=interval_value),
                )
            )

        if not interval_value or isinstance(interval_value, exp.Interval):
            return base_impl(self, expression)

        # DuckDB INTERVAL requires integer expressions, so round float/decimal values
        if interval_value.is_type(*exp.DataType.REAL_TYPES):
            expression.set("expression", exp.cast(exp.func("ROUND", interval_value), "INT"))

        return base_impl(self, expression)

    return _duckdb_date_delta_sql


def _array_insert_sql(self: DuckDB.Generator, expression: exp.ArrayInsert) -> str:
    """
    Transpile ARRAY_INSERT to DuckDB using LIST_CONCAT and slicing.

    Handles:
      - 0-based and 1-based indexing (normalizes to 0-based for calculations)
      - Negative position conversion (requires the array length)
      - NULL propagation (source dialects return NULL, DuckDB would create a
        single-element array)
      - Assumes the position is within bounds per user constraint

    Note: All dialects that support ARRAY_INSERT (Snowflake, Spark, Databricks)
    have ARRAY_FUNCS_PROPAGATES_NULLS=True, so the source is always assumed to
    propagate NULLs.

    Args:
        expression: The ArrayInsert expression to transpile.

    Returns:
        SQL string implementing ARRAY_INSERT behavior.
    """
    arr = expression.this
    position = expression.args.get("position")
    element = expression.expression
    element_array = exp.Array(expressions=[element])
    index_offset = expression.args.get("offset", 0)

    if not position or not position.is_int:
        self.unsupported("ARRAY_INSERT can only be transpiled with a literal position")
        return self.func("ARRAY_INSERT", arr, position, element)

    pos = position.to_py()

    # Normalize one-based indexing to zero-based for slice calculations.
    # Spark (1-based) -> Snowflake (0-based):
    #   Positive: pos=1 -> pos=0 (subtract 1)
    #   Negative: pos=-2 -> pos=-1 (add 1)
    # e.g. Spark array_insert([a,b,c], -2, d) -> [a,b,d,c], same as Snowflake pos=-1
    if pos > 0:
        pos -= index_offset
    elif pos < 0:
        pos += index_offset

    def sliced(start: exp.Expression, end: t.Optional[exp.Expression] = None) -> exp.Bracket:
        # arr[start:end] (or arr[start:]) using DuckDB's 1-based inclusive slicing
        slice_args: t.Dict[str, exp.Expression] = {"this": start}
        if end is not None:
            slice_args["expression"] = end
        return exp.Bracket(this=arr, expressions=[exp.Slice(**slice_args)])

    if pos == 0:
        # Insert at the beginning
        concat_args = [element_array, arr]
    elif pos > 0:
        # Positive position: LIST_CONCAT(arr[1:pos], [elem], arr[pos+1:]),
        # mapping the 0-based position onto DuckDB's 1-based slicing
        concat_args = [
            sliced(exp.Literal.number(1), exp.Literal.number(pos)),
            element_array,
            sliced(exp.Literal.number(pos + 1)),
        ]
    else:
        # Negative position: arr[1:LEN(arr)+pos], [elem], arr[LEN(arr)+pos+1:]
        # pos=-1 means insert before the last element
        split_at = exp.Length(this=arr) + exp.Literal.number(pos)
        concat_args = [
            sliced(exp.Literal.number(1), split_at),
            element_array,
            sliced(split_at + exp.Literal.number(1)),
        ]

    # All dialects that support ARRAY_INSERT propagate NULLs, so wrap in
    # CASE WHEN arr IS NULL THEN NULL ELSE LIST_CONCAT(...) END
    return self.sql(
        exp.If(
            this=exp.Is(this=arr, expression=exp.Null()),
            true=exp.Null(),
            false=self.func("LIST_CONCAT", *concat_args),
        )
    )


def _array_remove_at_sql(self: DuckDB.Generator, expression: exp.ArrayRemoveAt) -> str:
    """
    Transpile ARRAY_REMOVE_AT to DuckDB using LIST_CONCAT and slicing.

    Handles:
      - Positive positions (0-based indexing)
      - Negative positions (from the end of the array)
      - NULL propagation (Snowflake returns NULL for a NULL array; DuckDB
        doesn't auto-propagate)
      - Only literal integer positions (non-literals remain untranspiled)

    Transpilation patterns:
      - pos=0 (first): arr[2:]
      - pos>0 (middle): LIST_CONCAT(arr[1:p], arr[p+2:])
      - pos=-1 (last): arr[1:LEN(arr)-1]
      - pos<-1: LIST_CONCAT(arr[1:LEN(arr)+p], arr[LEN(arr)+p+2:])

    All wrapped in: CASE WHEN arr IS NULL THEN NULL ELSE ... END

    Args:
        expression: The ArrayRemoveAt expression to transpile.

    Returns:
        SQL string implementing ARRAY_REMOVE_AT behavior.
    """
    arr = expression.this
    position = expression.args.get("position")

    if not position or not position.is_int:
        self.unsupported("ARRAY_REMOVE_AT can only be transpiled with a literal position")
        return self.func("ARRAY_REMOVE_AT", arr, position)

    pos = position.to_py()

    def sliced(start: exp.Expression, end: t.Optional[exp.Expression] = None) -> exp.Bracket:
        # arr[start:end] (or arr[start:]) using DuckDB's 1-based inclusive slicing
        slice_args: t.Dict[str, exp.Expression] = {"this": start}
        if end is not None:
            slice_args["expression"] = end
        return exp.Bracket(this=arr, expressions=[exp.Slice(**slice_args)])

    if pos == 0:
        # Remove the first element: arr[2:]
        result_expr: exp.Expression | str = sliced(exp.Literal.number(2))
    elif pos > 0:
        # Remove at positive position: LIST_CONCAT(arr[1:pos], arr[pos+2:])
        result_expr = self.func(
            "LIST_CONCAT",
            sliced(exp.Literal.number(1), exp.Literal.number(pos)),
            sliced(exp.Literal.number(pos + 2)),
        )
    elif pos == -1:
        # Remove the last element: arr[1:LEN(arr)-1] (simpler than the general case)
        result_expr = sliced(
            exp.Literal.number(1), exp.Length(this=arr) + exp.Literal.number(-1)
        )
    else:
        # Remove at negative position: LIST_CONCAT(arr[1:LEN(arr)+pos], arr[LEN(arr)+pos+2:])
        split_at = exp.Length(this=arr) + exp.Literal.number(pos)
        result_expr = self.func(
            "LIST_CONCAT",
            sliced(exp.Literal.number(1), split_at),
            sliced(split_at + exp.Literal.number(2)),
        )

    # Snowflake's ARRAY_FUNCS_PROPAGATES_NULLS=True, so wrap in a NULL check:
    # CASE WHEN arr IS NULL THEN NULL ELSE result_expr END
    return self.sql(
        exp.If(this=exp.Is(this=arr, expression=exp.Null()), true=exp.Null(), false=result_expr)
    )


@unsupported_args(("expression", "DuckDB's ARRAY_SORT does not support a comparator."))
def _array_sort_sql(self: DuckDB.Generator, expression: exp.ArraySort) -> str:
    return self.func("ARRAY_SORT", expression.this)


def _sort_array_sql(self: DuckDB.Generator, expression: exp.SortArray) -> str:
    # An explicit descending sort maps to ARRAY_REVERSE_SORT; anything else to ARRAY_SORT
    descending = expression.args.get("asc") == exp.false()
    return self.func("ARRAY_REVERSE_SORT" if descending else "ARRAY_SORT", expression.this)


def _array_contains_sql(self: DuckDB.Generator, expression: exp.ArrayContains) -> str:
    haystack = expression.this
    needle = expression.expression

    contains = self.func("ARRAY_CONTAINS", haystack, needle)

    if expression.args.get("check_null"):
        # When the needle is NULL: TRUE if the array itself holds a NULL
        # (its size differs from its non-NULL count), otherwise NULL via NULLIF(FALSE, FALSE)
        null_in_array = exp.Nullif(
            this=exp.NEQ(
                this=exp.ArraySize(this=haystack),
                expression=exp.func("LIST_COUNT", haystack),
            ),
            expression=exp.false(),
        )
        return self.sql(
            exp.If(this=needle.is_(exp.Null()), true=null_in_array, false=contains)
        )

    return contains


def _build_sort_array_desc(args: t.List) -> exp.Expression:
    return exp.SortArray(this=seq_get(args, 0), asc=exp.false())


def _build_array_prepend(args: t.List) -> exp.Expression:
    # ARRAY_PREPEND(elem, arr) -> ArrayPrepend(this=arr, expression=elem)
    return exp.ArrayPrepend(this=seq_get(args, 1), expression=seq_get(args, 0))


def _build_date_diff(args: t.List) -> exp.Expression:
    return exp.DateDiff(this=seq_get(args, 2), expression=seq_get(args, 1), unit=seq_get(args, 0))


def _build_generate_series(end_exclusive: bool = False) -> t.Callable[[t.List], exp.GenerateSeries]:
    def _builder(args: t.List) -> exp.GenerateSeries:
        # See https://duckdb.org/docs/sql/functions/nested.html#range-functions
        if len(args) == 1:
            # DuckDB uses 0 as the default series start when it's omitted
            args.insert(0, exp.Literal.number("0"))

        gen_series = exp.GenerateSeries.from_arg_list(args)
        gen_series.set("is_end_exclusive", end_exclusive)

        return gen_series

    return _builder
def _build_make_timestamp(args: t.List) -> exp.Expression:
    # Single-argument MAKE_TIMESTAMP takes microseconds since the epoch
    if len(args) == 1:
        return exp.UnixToTime(this=seq_get(args, 0), scale=exp.UnixToTime.MICROS)

    return exp.TimestampFromParts(
        year=seq_get(args, 0),
        month=seq_get(args, 1),
        day=seq_get(args, 2),
        hour=seq_get(args, 3),
        min=seq_get(args, 4),
        sec=seq_get(args, 5),
    )


def _show_parser(*args: t.Any, **kwargs: t.Any) -> t.Callable[[DuckDB.Parser], exp.Show]:
    # Defer to the parser's DuckDB SHOW handling with the given arguments pre-bound
    def _parse(self: DuckDB.Parser) -> exp.Show:
        return self._parse_show_duckdb(*args, **kwargs)

    return _parse


def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
    ancestor_cast = expression.find_ancestor(exp.Cast, exp.Select)
    ancestor_cast = None if isinstance(ancestor_cast, exp.Select) else ancestor_cast

    # An empty struct cast is emitted as MAP() since DuckDB can't parse {}
    if not expression.expressions:
        if isinstance(ancestor_cast, exp.Cast) and ancestor_cast.to.is_type(exp.DataType.Type.MAP):
            return "MAP()"

    args: t.List[str] = []

    # BigQuery allows inline construction such as "STRUCT<a STRING, b INTEGER>('str', 1)",
    # which is canonicalized to "ROW('str', 1) AS STRUCT(a TEXT, b INT)" in DuckDB.
    # The transformation to ROW takes place when:
    #   1. The STRUCT itself does not have proper fields (key := value) as a
    #      "proper" STRUCT would, and
    #   2. A cast to STRUCT / ARRAY of STRUCTs is found
    is_bq_inline_struct = (
        (expression.find(exp.PropertyEQ) is None)
        and ancestor_cast
        and any(
            casted_type.is_type(exp.DataType.Type.STRUCT)
            for casted_type in ancestor_cast.find_all(exp.DataType)
        )
    )

    for i, expr in enumerate(expression.expressions):
        is_property_eq = isinstance(expr, exp.PropertyEQ)
        value = expr.expression if is_property_eq else expr

        if is_bq_inline_struct:
            args.append(self.sql(value))
            continue

        if isinstance(expr.this, exp.Identifier):
            key = self.sql(exp.Literal.string(expr.name))
        elif is_property_eq:
            key = self.sql(expr.this)
        else:
            # Positional fields get synthetic _<index> names
            key = self.sql(exp.Literal.string(f"_{i}"))

        args.append(f"{key}: {self.sql(value)}")

    csv_args = ", ".join(args)

    return f"ROW({csv_args})" if is_bq_inline_struct else f"{{{csv_args}}}"


def _datatype_sql(self: DuckDB.Generator, expression: exp.DataType) -> str:
    if expression.is_type("array"):
        return f"{self.expressions(expression, flat=True)}[{self.expressions(expression, key='values', flat=True)}]"

    # Modifiers are not supported for TIME, [TIME | TIMESTAMP] WITH TIME ZONE
    if expression.is_type(
        exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ, exp.DataType.Type.TIMESTAMPTZ
    ):
        return expression.this.value

    return self.datatype_sql(expression)


def _json_format_sql(self: DuckDB.Generator, expression: exp.JSONFormat) -> str:
    sql = self.func("TO_JSON", expression.this, expression.args.get("options"))
    return f"CAST({sql} AS TEXT)"


def _build_seq_expression(base: exp.Expression, byte_width: int, signed: bool) -> exp.Expression:
    """Build a SEQ expression with the given base, byte width, and signedness."""
    bits = byte_width * 8
    max_val = exp.Literal.number(2**bits)

    if signed:
        half = exp.Literal.number(2 ** (bits - 1))
        return exp.replace_placeholders(
            DuckDB.Generator.SEQ_SIGNED.copy(), base=base, max_val=max_val, half=half
        )
    return exp.replace_placeholders(
        DuckDB.Generator.SEQ_UNSIGNED.copy(), base=base, max_val=max_val
    )


def _seq_to_range_in_generator(expression: exp.Expression) -> exp.Expression:
    """
    Transform SEQ functions to `range` column references inside a GENERATOR context.

    When GENERATOR(ROWCOUNT => N) becomes RANGE(N) in DuckDB, it produces a column
    named `range` with values 0, 1, ..., N-1. SEQ functions produce the same
    sequence, so they are replaced with `range % max_val` to avoid nested window
    function issues.
    """
    if not isinstance(expression, exp.Select):
        return expression

    from_ = expression.args.get("from_")
    if not (
        from_
        and isinstance(from_.this, exp.TableFromRows)
        and isinstance(from_.this.this, exp.Generator)
    ):
        return expression

    def replace_seq(node: exp.Expression) -> exp.Expression:
        if isinstance(node, (exp.Seq1, exp.Seq2, exp.Seq4, exp.Seq8)):
            width = _SEQ_BYTE_WIDTH[type(node)]
            return _build_seq_expression(exp.column("range"), width, signed=node.name == "1")
        return node

    return expression.transform(replace_seq, copy=False)


def _seq_sql(self: DuckDB.Generator, expression: exp.Func, byte_width: int) -> str:
    """
    Transpile Snowflake SEQ1/SEQ2/SEQ4/SEQ8 to DuckDB.

    Generates monotonically increasing integers starting from 0.
    The signed argument (0 or 1) affects wrap-around behavior:
      - Unsigned (0): wraps at 2^(bits) - 1
      - Signed (1): wraps at 2^(bits-1) - 1, then goes negative
    """
    # Warn if SEQ appears in a restricted context (Select stops the ancestor
    # search at the current scope)
    ancestor = expression.find_ancestor(*_SEQ_RESTRICTED)
    if ancestor and (
        (not isinstance(ancestor, (exp.Order, exp.Select)))
        or (isinstance(ancestor, exp.Order) and isinstance(ancestor.parent, exp.Window))
    ):
        self.unsupported("SEQ in restricted context is not supported - use CTE or subquery")

    return self.sql(
        _build_seq_expression(_SEQ_BASE.copy(), byte_width, signed=expression.name == "1")
    )


def _unix_to_time_sql(self: DuckDB.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this
    target_type = expression.args.get("target_type")

    # NTZ targets need a naive timestamp interpreted at UTC
    is_ntz = target_type and target_type.this in (
        exp.DataType.Type.TIMESTAMP,
        exp.DataType.Type.TIMESTAMPNTZ,
    )

    if scale == exp.UnixToTime.MILLIS:
        # EPOCH_MS already returns TIMESTAMP (naive, UTC)
        return self.func("EPOCH_MS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        # MAKE_TIMESTAMP already returns TIMESTAMP (naive, UTC)
        return self.func("MAKE_TIMESTAMP", timestamp)

    # Remaining scales: divide down to seconds and use TO_TIMESTAMP
    if scale not in (None, exp.UnixToTime.SECONDS):
        timestamp = exp.Div(this=timestamp, expression=exp.func("POW", 10, scale))

    result: exp.Expression = exp.Anonymous(this="TO_TIMESTAMP", expressions=[timestamp])

    if is_ntz:
        result = exp.AtTimeZone(this=result, zone=exp.Literal.string("UTC"))

    return self.sql(result)


WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In, exp.Not)


def _arrow_json_extract_sql(self: DuckDB.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    # Parenthesize the arrow extraction when it is embedded in operators that
    # would otherwise bind differently
    arrow_sql = arrow_json_extract_sql(self, expression)
    if not expression.same_parent and isinstance(
        expression.parent, WRAPPED_JSON_EXTRACT_EXPRESSIONS
    ):
        arrow_sql = self.wrap(arrow_sql)
    return arrow_sql
def _implicit_datetime_cast(
    arg: t.Optional[exp.Expression], type: exp.DataType.Type = exp.DataType.Type.DATE
) -> t.Optional[exp.Expression]:
    """Cast a string literal to DATE/TIMESTAMP(TZ), inferring the type from its contents."""
    if isinstance(arg, exp.Literal) and arg.is_string:
        ts = arg.name
        if type == exp.DataType.Type.DATE and ":" in ts:
            # The literal carries a time component; use TIMESTAMPTZ when a
            # timezone offset is present, plain TIMESTAMP otherwise
            type = (
                exp.DataType.Type.TIMESTAMPTZ
                if TIMEZONE_PATTERN.search(ts)
                else exp.DataType.Type.TIMESTAMP
            )

        arg = exp.cast(arg, type)

    return arg


def _week_unit_to_dow(unit: t.Optional[exp.Expression]) -> t.Optional[int]:
    """
    Compute the Monday-based day shift to align DATE_DIFF('WEEK', ...) coming
    from other dialects, e.g. BigQuery's WEEK(<day>) or ISOWEEK unit parts.

    Args:
        unit: The unit expression (Var for WEEK/ISOWEEK or WeekStart)

    Returns:
        The ISO 8601 day number (Monday=1, Sunday=7 etc) or None if not a week
        unit or if the day is dynamic (not a constant).

    Examples:
        "WEEK(SUNDAY)" -> 7
        "WEEK(MONDAY)" -> 1
        "ISOWEEK" -> 1
    """
    # Plain Var units denoting Monday-based weeks. Previously this was the
    # substring test `unit.name.upper() in "ISOWEEK"`, which also matched
    # fragments such as "ISO", "EEK" and even the empty string; an explicit
    # membership test keeps only the two intended names.
    if isinstance(unit, exp.Var) and unit.name.upper() in ("WEEK", "ISOWEEK"):
        return 1

    # Handle WeekStart expressions with an explicit day
    if isinstance(unit, exp.WeekStart):
        return WEEK_START_DAY_TO_DOW.get(unit.name.upper())

    return None


def _build_week_trunc_expression(date_expr: exp.Expression, start_dow: int) -> exp.Expression:
    """
    Build a DATE_TRUNC expression for week boundaries with a custom start day.

    DuckDB's DATE_TRUNC('WEEK', ...) aligns weeks to Monday (ISO standard).
    To align to a different start day, the date is shifted before truncating.

    Shift formula: Sunday (7) gets +1, every other day gets (1 - start_dow):
        Monday (1):  shift = 0  (no shift needed)
        Tuesday (2): shift = -1 (shift back 1 day) ...
        Sunday (7):  shift = +1 (shift forward 1 day, wraps to the next Monday-based week)

    Args:
        date_expr: The date expression to truncate
        start_dow: ISO 8601 day-of-week number of the week start (Monday=1, ..., Sunday=7)
    """
    shift_days = 1 if start_dow == 7 else 1 - start_dow

    if shift_days == 0:
        # Monday-based weeks need no adjustment
        shifted_date = date_expr
    else:
        # Shift the date to align week boundaries with the desired start day
        shifted_date = exp.DateAdd(
            this=date_expr,
            expression=exp.Interval(this=exp.Literal.string(str(shift_days)), unit=exp.var("DAY")),
        )

    return exp.DateTrunc(unit=exp.var("WEEK"), this=shifted_date)


def _date_diff_sql(self: DuckDB.Generator, expression: exp.DateDiff) -> str:
    unit = expression.unit

    if _is_nanosecond_unit(unit):
        # DATE_DIFF has no NANOSECOND support; fall back to EPOCH_NS subtraction
        return _handle_nanosecond_diff(self, expression.this, expression.expression)

    this = _implicit_datetime_cast(expression.this)
    expr = _implicit_datetime_cast(expression.expression)

    # DuckDB's WEEK diff does not respect Monday crossings (week boundaries); it
    # computes (end_day - start_day) / 7:
    #   SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-17' AS DATE)) --> 0 (Monday crossed)
    #   SELECT DATE_DIFF('WEEK', CAST('2024-12-13' AS DATE), CAST('2024-12-20' AS DATE)) --> 1 (7 days difference)
    # Whereas for other units such as MONTH it does respect boundaries:
    #   SELECT DATE_DIFF('MONTH', CAST('2024-11-30' AS DATE), CAST('2024-12-01' AS DATE)) --> 1 (Month crossed)
    date_part_boundary = expression.args.get("date_part_boundary")

    # Extract the week start day; None if the day is dynamic (column/placeholder)
    week_start = _week_unit_to_dow(unit)
    if date_part_boundary and week_start and this and expr:
        expression.set("unit", exp.Literal.string("WEEK"))

        # Truncate both dates to week boundaries to respect input dialect semantics
        this = _build_week_trunc_expression(this, week_start)
        expr = _build_week_trunc_expression(expr, week_start)

    return self.func("DATE_DIFF", unit_to_str(expression), expr, this)


def _generate_datetime_array_sql(
    self: DuckDB.Generator, expression: t.Union[exp.GenerateDateArray, exp.GenerateTimestampArray]
) -> str:
    is_generate_date_array = isinstance(expression, exp.GenerateDateArray)

    type = exp.DataType.Type.DATE if is_generate_date_array else exp.DataType.Type.TIMESTAMP
    start = _implicit_datetime_cast(expression.args.get("start"), type=type)
    end = _implicit_datetime_cast(expression.args.get("end"), type=type)

    # BQ's GENERATE_DATE_ARRAY & GENERATE_TIMESTAMP_ARRAY become DuckDB's GENERATE_SERIES
    gen_series: t.Union[exp.GenerateSeries, exp.Cast] = exp.GenerateSeries(
        start=start, end=end, step=expression.args.get("step")
    )

    if is_generate_date_array:
        # GENERATE_SERIES yields a TIMESTAMP array, so to match BQ's semantics
        # for GENERATE_DATE_ARRAY it must be cast back to a DATE array
        gen_series = exp.cast(gen_series, exp.DataType.build("ARRAY<DATE>"))

    return self.sql(gen_series)


def _json_extract_value_array_sql(
    self: DuckDB.Generator, expression: exp.JSONValueArray | exp.JSONExtractArray
) -> str:
    # JSONValueArray yields strings; JSONExtractArray yields raw JSON values
    json_extract = exp.JSONExtract(this=expression.this, expression=expression.expression)
    data_type = "ARRAY<STRING>" if isinstance(expression, exp.JSONValueArray) else "ARRAY<JSON>"
    return self.sql(exp.cast(json_extract, to=exp.DataType.build(data_type)))


def _cast_to_varchar(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    # Leave text-typed and unknown-typed expressions untouched
    if arg and arg.type and not arg.is_type(*exp.DataType.TEXT_TYPES, exp.DataType.Type.UNKNOWN):
        return exp.cast(arg, exp.DataType.Type.VARCHAR)
    return arg


def _cast_to_boolean(arg: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    if arg and not arg.is_type(exp.DataType.Type.BOOLEAN):
        return exp.cast(arg, exp.DataType.Type.BOOLEAN)
    return arg


def _is_binary(arg: exp.Expression) -> bool:
    return arg.is_type(
        exp.DataType.Type.BINARY,
        exp.DataType.Type.VARBINARY,
        exp.DataType.Type.BLOB,
    )


def _gen_with_cast_to_blob(
    self: DuckDB.Generator, expression: exp.Expression, result_sql: str
) -> str:
    # Re-cast binary-typed results back to BLOB so the output type is preserved
    if _is_binary(expression):
        blob = exp.DataType.build("BLOB", dialect="duckdb")
        result_sql = self.sql(exp.Cast(this=result_sql, to=blob))
    return result_sql


def _cast_to_bit(arg: exp.Expression) -> exp.Expression:
    if not _is_binary(arg):
        return arg

    # Hex strings must round-trip through UNHEX before the BIT cast
    if isinstance(arg, exp.HexString):
        arg = exp.Unhex(this=exp.Literal.string(arg.this))

    return exp.cast(arg, exp.DataType.Type.BIT)


def _prepare_binary_bitwise_args(expression: exp.Binary) -> None:
    # DuckDB's bitwise operators need BIT operands, so cast binary-typed sides in place
    if _is_binary(expression.this):
        expression.set("this", _cast_to_bit(expression.this))
    if _is_binary(expression.expression):
        expression.set("expression", _cast_to_bit(expression.expression))

    Formulas:
    - NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
    - PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1

    Supports both literal and non-literal day names:
    - Literal: Direct lookup (e.g., 'Monday' → 1)
    - Non-literal: CASE statement for runtime evaluation

    Examples:
        NEXT_DAY('2024-01-01' (Monday), 'Monday')
        → (1 - 1 + 6) % 7 + 1 = 6 % 7 + 1 = 7 days → 2024-01-08

        PREVIOUS_DAY('2024-01-15' (Monday), 'Friday')
        → (1 - 5 + 6) % 7 + 1 = 2 % 7 + 1 = 3 days → 2024-01-12
    """
    date_expr = expression.this
    day_name_expr = expression.expression

    # Build ISODOW call for current day of week
    isodow_call = exp.func("ISODOW", date_expr)

    # Determine target day of week
    if isinstance(day_name_expr, exp.Literal):
        # Literal day name: lookup target_dow directly
        # NOTE: prefix matching (startswith) also resolves abbreviated day names, e.g. 'Mon'
        day_name_str = day_name_expr.name.upper()
        matching_day = next(
            (day for day in WEEK_START_DAY_TO_DOW if day.startswith(day_name_str)), None
        )
        if matching_day:
            target_dow: exp.Expression = exp.Literal.number(WEEK_START_DAY_TO_DOW[matching_day])
        else:
            # Unrecognized day name, use fallback
            return self.function_fallback_sql(expression)
    else:
        # Non-literal day name: build CASE statement for runtime mapping
        # The first two characters are enough to distinguish all seven weekday names
        upper_day_name = exp.Upper(this=day_name_expr)
        target_dow = exp.Case(
            ifs=[
                exp.If(
                    this=exp.func(
                        "STARTS_WITH", upper_day_name.copy(), exp.Literal.string(day[:2])
                    ),
                    true=exp.Literal.number(dow_num),
                )
                for day, dow_num in WEEK_START_DAY_TO_DOW.items()
            ]
        )

    # Calculate days offset and apply interval based on direction
    if isinstance(expression, exp.NextDay):
        # NEXT_DAY: (target_dow - current_dow + 6) % 7 + 1
        days_offset = exp.paren(target_dow - isodow_call + 6, copy=False) % 7 + 1
        date_with_offset = date_expr + exp.Interval(this=days_offset, unit=exp.var("DAY"))
    else:  # exp.PreviousDay
        # PREVIOUS_DAY: (current_dow - target_dow + 6) % 7 + 1
        days_offset = exp.paren(isodow_call - target_dow + 6, copy=False) % 7 + 1
        date_with_offset = date_expr - exp.Interval(this=days_offset, unit=exp.var("DAY"))

    # Build final: CAST(date_with_offset AS DATE)
    return self.sql(exp.cast(date_with_offset, exp.DataType.Type.DATE))


def _anyvalue_sql(self: DuckDB.Generator, expression: exp.AnyValue) -> str:
    # Transform ANY_VALUE(expr HAVING MAX/MIN having_expr) to ARG_MAX_NULL/ARG_MIN_NULL
    having = expression.this
    if isinstance(having, exp.HavingMax):
        func_name = "ARG_MAX_NULL" if having.args.get("max") else "ARG_MIN_NULL"
        return self.func(func_name, having.this, having.expression)
    return self.function_fallback_sql(expression)


def _bitwise_agg_sql(
    self: DuckDB.Generator,
    expression: t.Union[exp.BitwiseOrAgg, exp.BitwiseAndAgg, exp.BitwiseXorAgg],
) -> str:
    """
    DuckDB's bitwise aggregate functions only accept integer types.
    For other types:
    - DECIMAL/STRING: Use CAST(arg AS INT) to convert directly, will round to nearest int
    - FLOAT/DOUBLE: Use ROUND(arg)::INT to round to nearest integer, required due to float precision loss
    """
    # Pick the DuckDB aggregate name matching the expression class
    if isinstance(expression, exp.BitwiseOrAgg):
        func_name = "BIT_OR"
    elif isinstance(expression, exp.BitwiseAndAgg):
        func_name = "BIT_AND"
    else:  # exp.BitwiseXorAgg
        func_name = "BIT_XOR"

    arg = expression.this

    # Annotate lazily so the type-based casts below can be decided
    if not arg.type:
        from sqlglot.optimizer.annotate_types import annotate_types

        arg = annotate_types(arg, dialect=self.dialect)

    if arg.is_type(*exp.DataType.REAL_TYPES, *exp.DataType.TEXT_TYPES):
        if arg.is_type(*exp.DataType.FLOAT_TYPES):
            # float types need to be rounded first due to precision loss
            arg = exp.func("ROUND", arg)

        arg = exp.cast(arg, exp.DataType.Type.INT)

    return self.func(func_name, arg)


def _literal_sql_with_ws_chr(self: DuckDB.Generator, literal: str) -> str:
    # DuckDB does not support \uXXXX escapes, so we must use CHR() instead of replacing them directly
    if not any(ch in WS_CONTROL_CHARS_TO_DUCK for ch in literal):
        return self.sql(exp.Literal.string(literal))

    # Split the literal into alternating runs of control vs. ordinary characters,
    # emitting CHR(code) per control char and a plain string literal per ordinary run
    sql_segments: t.List[str] = []
    for is_ws_control, group in groupby(literal, key=lambda ch: ch in WS_CONTROL_CHARS_TO_DUCK):
        if is_ws_control:
            for ch in group:
                duckdb_char_code = WS_CONTROL_CHARS_TO_DUCK[ch]
                sql_segments.append(self.func("CHR", exp.Literal.number(str(duckdb_char_code))))
        else:
            sql_segments.append(self.sql(exp.Literal.string("".join(group))))

    # Concatenated segments are wrapped in parens so the whole thing stays one expression
    sql = " || ".join(sql_segments)
    return sql if len(sql_segments) == 1 else f"({sql})"


def _escape_regex_metachars(
    self: DuckDB.Generator, delimiters: t.Optional[exp.Expression], delimiters_sql: str
) -> str:
    r"""
    Escapes regex metacharacters \ - ^ [ ] for use in character classes regex
    expressions.

    Literal strings are escaped at transpile time, expressions handled with REPLACE() calls.
    """
    if not delimiters:
        return delimiters_sql

    if delimiters.is_string:
        # Escape statically and re-emit, routing control chars through CHR() calls
        literal_value = delimiters.this
        escaped_literal = "".join(REGEX_ESCAPE_REPLACEMENTS.get(ch, ch) for ch in literal_value)
        return _literal_sql_with_ws_chr(self, escaped_literal)

    # Non-literal delimiters: build nested REPLACE() calls, one per metacharacter
    escaped_sql = delimiters_sql
    for raw, escaped in REGEX_ESCAPE_REPLACEMENTS.items():
        escaped_sql = self.func(
            "REPLACE",
            escaped_sql,
            self.sql(exp.Literal.string(raw)),
            self.sql(exp.Literal.string(escaped)),
        )

    return escaped_sql


def _build_capitalization_sql(
    self: DuckDB.Generator,
    value_to_split: str,
    delimiters_sql: str,
) -> str:
    # empty string delimiter --> treat value as one word, no need to split
    if delimiters_sql == "''":
        return f"UPPER(LEFT({value_to_split}, 1)) || LOWER(SUBSTRING({value_to_split}, 2))"

    delim_regex_sql = f"CONCAT('[', {delimiters_sql}, ']')"
    split_regex_sql = f"CONCAT('([', {delimiters_sql}, ']+|[^', {delimiters_sql}, ']+)')"

    # REGEXP_EXTRACT_ALL produces a list of string segments, alternating between delimiter and non-delimiter segments.
    # We do not know whether the first segment is a delimiter or not, so we check the first character of the string
    # with REGEXP_MATCHES. If the first char is a delimiter, we capitalize even list indexes, otherwise capitalize odd.
    return self.func(
        "ARRAY_TO_STRING",
        exp.case()
        .when(
            f"REGEXP_MATCHES(LEFT({value_to_split}, 1), {delim_regex_sql})",
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 0 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        )
        .else_(
            self.func(
                "LIST_TRANSFORM",
                self.func("REGEXP_EXTRACT_ALL", value_to_split, split_regex_sql),
                "(seg, idx) -> CASE WHEN idx % 2 = 1 THEN UPPER(LEFT(seg, 1)) || LOWER(SUBSTRING(seg, 2)) ELSE seg END",
            ),
        ),
        "''",
    )


def _initcap_sql(self: DuckDB.Generator, expression: exp.Initcap) -> str:
    this_sql = self.sql(expression, "this")
    delimiters = expression.args.get("expression")
    if delimiters is None:
        # fallback for manually created exp.Initcap w/o delimiters arg
        delimiters = exp.Literal.string(self.dialect.INITCAP_DEFAULT_DELIMITER_CHARS)
    delimiters_sql = self.sql(delimiters)

    # Delimiters are embedded in regex character classes, so metacharacters must be escaped
    escaped_delimiters_sql = _escape_regex_metachars(self, delimiters, delimiters_sql)

    return _build_capitalization_sql(self, this_sql, escaped_delimiters_sql)


def _boolxor_agg_sql(self: DuckDB.Generator, expression: exp.BoolxorAgg) -> str:
    """
    Snowflake's `BOOLXOR_AGG(col)` returns TRUE if exactly one input in `col` is TRUE, FALSE otherwise;
    Since DuckDB does not have a mapping function, we mimic the behavior by generating `COUNT_IF(col) = 1`.

    DuckDB's COUNT_IF strictly requires boolean inputs, so cast if not already boolean.
    """
    return self.sql(
        exp.EQ(
            this=exp.CountIf(this=_cast_to_boolean(expression.this)),
            expression=exp.Literal.number(1),
        )
    )


def _bitshift_sql(
    self: DuckDB.Generator, expression: exp.BitwiseLeftShift | exp.BitwiseRightShift
) -> str:
    """
    Transform bitshift expressions for DuckDB by injecting BIT/INT128 casts.

    DuckDB's bitwise shift operators don't work with BLOB/BINARY types, so we cast
    them to BIT for the operation, then cast the result back to the original type.

    Note: Assumes type annotation has been applied with the source dialect.
    """
    operator = "<<" if isinstance(expression, exp.BitwiseLeftShift) else ">>"
    result_is_blob = False
    this = expression.this

    if _is_binary(this):
        # Shift on BIT, remember to cast the result back to BLOB below
        result_is_blob = True
        expression.set("this", exp.cast(this, exp.DataType.Type.BIT))
    elif expression.args.get("requires_int128"):
        this.replace(exp.cast(this, exp.DataType.Type.INT128))

    result_sql = self.binary(expression, operator)

    # Wrap in parentheses if parent is a bitwise operator to "fix" DuckDB precedence issue
    # DuckDB parses: a << b | c << d as (a << b | c) << d
    if isinstance(expression.parent, exp.Binary):
        result_sql = self.sql(exp.Paren(this=result_sql))

    if result_is_blob:
        result_sql = self.sql(
            exp.Cast(this=result_sql, to=exp.DataType.build("BLOB", dialect="duckdb"))
        )

    return result_sql


def _scale_rounding_sql(
    self: DuckDB.Generator,
    expression: exp.Expression,
    rounding_func: type[exp.Expression],
) -> str | None:
    """
    Handle scale parameter transformation for rounding functions.

    DuckDB doesn't support the scale parameter for certain functions (e.g., FLOOR, CEIL),
    so we transform: FUNC(x, n) to ROUND(FUNC(x * 10^n) / 10^n, n)

    Args:
        self: The DuckDB generator instance
        expression: The expression to transform (must have 'this', 'decimals', and 'to' args)
        rounding_func: The rounding function class to use in the transformation

    Returns:
        The transformed SQL string if decimals parameter exists, None otherwise
    """
    decimals = expression.args.get("decimals")

    if decimals is None or expression.args.get("to") is not None:
        return None

    this = expression.this
    if isinstance(this, exp.Binary):
        # Parenthesize so the multiplication by 10^n binds correctly
        this = exp.Paren(this=this)

    # Non-integer scale values are cast to INT before being used as an exponent
    n_int = decimals
    if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
        n_int = exp.cast(decimals, exp.DataType.Type.INT)

    pow_ = exp.Pow(this=exp.Literal.number("10"), expression=n_int)
    rounded = rounding_func(this=exp.Mul(this=this, expression=pow_))
    result = exp.Div(this=rounded, expression=pow_.copy())

    return self.round_sql(
        exp.Round(this=result, decimals=decimals, casts_non_integer_decimals=True)
    )


def _ceil_floor(self: DuckDB.Generator, expression: exp.Floor | exp.Ceil) -> str:
    # Use the scale-aware transformation when a decimals arg is present; else default rendering
    scaled_sql = _scale_rounding_sql(self, expression, type(expression))
    if scaled_sql is not None:
        return scaled_sql
    return self.ceil_floor(expression)


def _regr_val_sql(
    self: DuckDB.Generator,
    expression: exp.RegrValx | exp.RegrValy,
) -> str:
    """
    Transpile Snowflake's REGR_VALX/REGR_VALY to DuckDB equivalent.

    REGR_VALX(y, x) returns NULL if y is NULL; otherwise returns x.
    REGR_VALY(y, x) returns NULL if x is NULL; otherwise returns y.
    """
    from sqlglot.optimizer.annotate_types import annotate_types

    y = expression.this
    x = expression.expression

    # Determine which argument to check for NULL and which to return based on expression type
    if isinstance(expression, exp.RegrValx):
        # REGR_VALX: check y for NULL, return x
        check_for_null = y
        return_value = x
        return_value_attr = "expression"
    else:
        # REGR_VALY: check x for NULL, return y
        check_for_null = x
        return_value = y
        return_value_attr = "this"

    # Get the type from the return argument
    result_type = return_value.type

    # If no type info, annotate the expression to infer types
    if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
        try:
            annotated = annotate_types(expression.copy(), dialect=self.dialect)
            result_type = getattr(annotated, return_value_attr).type
        except Exception:
            # Best-effort only: fall through to the DOUBLE default below
            pass

    # Default to DOUBLE for regression functions if type still unknown
    if not result_type or result_type.this == exp.DataType.Type.UNKNOWN:
        result_type = exp.DataType.build("DOUBLE")

    # Cast NULL to the same type as return_value to avoid DuckDB type inference issues
    typed_null = exp.Cast(this=exp.Null(), to=result_type)

    return self.sql(
        exp.If(
            this=exp.Is(this=check_for_null.copy(), expression=exp.Null()),
            true=typed_null,
            false=return_value.copy(),
        )
    )


def _maybe_corr_null_to_false(
    expression: t.Union[exp.Filter, exp.Window, exp.Corr],
) -> t.Optional[t.Union[exp.Filter, exp.Window, exp.Corr]]:
    # Unwrap FILTER/OVER wrappers to reach the underlying CORR node
    corr = expression
    while isinstance(corr, (exp.Window, exp.Filter)):
        corr = corr.this

    if not isinstance(corr, exp.Corr) or not corr.args.get("null_on_zero_variance"):
        return None

    # Mutates the CORR node in place, then returns the (possibly wrapped) expression
    corr.set("null_on_zero_variance", False)
    return expression


def _date_from_parts_sql(self, expression:
exp.DateFromParts) -> str:
    """
    Snowflake's DATE_FROM_PARTS allows out-of-range values for the month and day input.
    E.g., larger values (month=13, day=100), zero-values (month=0, day=0), negative values (month=-13, day=-100).

    DuckDB's MAKE_DATE does not support out-of-range values, but DuckDB's INTERVAL type does.

    We convert to date arithmetic:
    DATE_FROM_PARTS(year, month, day)
    - MAKE_DATE(year, 1, 1) + INTERVAL (month-1) MONTH + INTERVAL (day-1) DAY
    """
    year_expr = expression.args.get("year")
    month_expr = expression.args.get("month")
    day_expr = expression.args.get("day")

    if expression.args.get("allow_overflow"):
        # Anchor at Jan 1 of the given year, then add (month-1) months and (day-1) days
        base_date: exp.Expression = exp.func(
            "MAKE_DATE", year_expr, exp.Literal.number(1), exp.Literal.number(1)
        )

        if month_expr:
            base_date = base_date + exp.Interval(this=month_expr - 1, unit=exp.var("MONTH"))

        if day_expr:
            base_date = base_date + exp.Interval(this=day_expr - 1, unit=exp.var("DAY"))

        return self.sql(exp.cast(expression=base_date, to=exp.DataType.Type.DATE))

    return self.func("MAKE_DATE", year_expr, month_expr, day_expr)


def _round_arg(arg: exp.Expression, round_input: t.Optional[bool] = None) -> exp.Expression:
    # When round_input is set, numeric inputs are rounded to the nearest integer first
    if round_input:
        return exp.func("ROUND", arg, exp.Literal.number(0))
    return arg


def _boolnot_sql(self: DuckDB.Generator, expression: exp.Boolnot) -> str:
    # BOOLNOT(a) --> NOT (a), with optional input rounding
    arg = _round_arg(expression.this, expression.args.get("round_input"))
    return self.sql(exp.not_(exp.paren(arg)))


def _booland_sql(self: DuckDB.Generator, expression: exp.Booland) -> str:
    # BOOLAND(a, b) --> ((a) AND (b)), with optional input rounding
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(exp.paren(exp.and_(exp.paren(left), exp.paren(right), wrap=False)))


def _boolor_sql(self: DuckDB.Generator, expression: exp.Boolor) -> str:
    # BOOLOR(a, b) --> ((a) OR (b)), with optional input rounding
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(exp.paren(exp.or_(exp.paren(left), exp.paren(right), wrap=False)))


def _xor_sql(self: DuckDB.Generator, expression: exp.Xor) -> str:
    # XOR(a, b) --> (a AND NOT b) OR (NOT a AND b), with optional input rounding
    round_input = expression.args.get("round_input")
    left = _round_arg(expression.this, round_input)
    right = _round_arg(expression.expression, round_input)
    return self.sql(
        exp.or_(
            exp.paren(exp.and_(left.copy(), exp.paren(right.not_()), wrap=False)),
            exp.paren(exp.and_(exp.paren(left.not_()), right.copy(), wrap=False)),
            wrap=False,
        )
    )


def _explode_to_unnest_sql(self: DuckDB.Generator, expression: exp.Lateral) -> str:
    """Handle LATERAL VIEW EXPLODE/INLINE conversion to UNNEST for DuckDB."""
    explode = expression.this

    if isinstance(explode, exp.Inline):
        # For INLINE, create CROSS JOIN LATERAL (SELECT UNNEST(..., max_depth => 2))
        # Build the UNNEST call with DuckDB-style named parameter
        unnest_expr = exp.Unnest(
            expressions=[
                explode.this,
                exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2)),
            ]
        )
        select_expr = exp.Select(expressions=[unnest_expr]).subquery()

        alias_expr = expression.args.get("alias")
        if alias_expr and not alias_expr.this:
            # we need to provide a table name if not present
            alias_expr.set("this", exp.to_identifier(f"_u_{expression.index}"))

        transformed_lateral_expr = exp.Lateral(this=select_expr, alias=alias_expr)
        cross_join_lateral_expr = exp.Join(this=transformed_lateral_expr, kind="CROSS")

        return self.sql(cross_join_lateral_expr)

    # For other cases, use the standard conversion
    return explode_to_unnest_sql(self, expression)


def _sha_sql(
    self:
DuckDB.Generator, 1435 expression: exp.Expression, 1436 hash_func: str, 1437 is_binary: bool = False, 1438) -> str: 1439 arg = expression.this 1440 1441 # For SHA2 variants, check digest length (DuckDB only supports SHA256) 1442 if hash_func == "SHA256": 1443 length = expression.text("length") or "256" 1444 if length != "256": 1445 self.unsupported("DuckDB only supports SHA256 hashing algorithm.") 1446 1447 # Cast if type is incompatible with DuckDB 1448 if ( 1449 arg.type 1450 and arg.type.this != exp.DataType.Type.UNKNOWN 1451 and not arg.is_type(*exp.DataType.TEXT_TYPES) 1452 and not _is_binary(arg) 1453 ): 1454 arg = exp.cast(arg, exp.DataType.Type.VARCHAR) 1455 1456 result = self.func(hash_func, arg) 1457 return self.func("UNHEX", result) if is_binary else result 1458 1459 1460class DuckDB(Dialect): 1461 NULL_ORDERING = "nulls_are_last" 1462 SUPPORTS_USER_DEFINED_TYPES = True 1463 SAFE_DIVISION = True 1464 INDEX_OFFSET = 1 1465 CONCAT_COALESCE = True 1466 SUPPORTS_ORDER_BY_ALL = True 1467 SUPPORTS_FIXED_SIZE_ARRAYS = True 1468 STRICT_JSON_PATH_SYNTAX = False 1469 NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True 1470 1471 # https://duckdb.org/docs/sql/introduction.html#creating-a-new-table 1472 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 1473 1474 DATE_PART_MAPPING = { 1475 **Dialect.DATE_PART_MAPPING, 1476 "DAYOFWEEKISO": "ISODOW", 1477 } 1478 1479 EXPRESSION_METADATA = EXPRESSION_METADATA.copy() 1480 1481 DATE_PART_MAPPING.pop("WEEKDAY") 1482 1483 INVERSE_TIME_MAPPING = { 1484 "%e": "%-d", # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d) 1485 "%:z": "%z", # In DuckDB %z can represent ±HH:MM, ±HHMM, or ±HH. 
1486 "%-z": "%z", 1487 "%f_zero": "%n", 1488 "%f_one": "%n", 1489 "%f_two": "%n", 1490 "%f_three": "%g", 1491 "%f_four": "%n", 1492 "%f_five": "%n", 1493 "%f_seven": "%n", 1494 "%f_eight": "%n", 1495 "%f_nine": "%n", 1496 } 1497 1498 def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 1499 if isinstance(path, exp.Literal): 1500 # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`. 1501 # Additionally, it allows accessing the back of lists using the `[#-i]` syntax. 1502 # This check ensures we'll avoid trying to parse these as JSON paths, which can 1503 # either result in a noisy warning or in an invalid representation of the path. 1504 path_text = path.name 1505 if path_text.startswith("/") or "[#" in path_text: 1506 return path 1507 1508 return super().to_json_path(path) 1509 1510 class Tokenizer(tokens.Tokenizer): 1511 BYTE_STRINGS = [("e'", "'"), ("E'", "'")] 1512 BYTE_STRING_ESCAPES = ["'", "\\"] 1513 HEREDOC_STRINGS = ["$"] 1514 1515 HEREDOC_TAG_IS_IDENTIFIER = True 1516 HEREDOC_STRING_ALTERNATIVE = TokenType.PARAMETER 1517 1518 KEYWORDS = { 1519 **tokens.Tokenizer.KEYWORDS, 1520 "//": TokenType.DIV, 1521 "**": TokenType.DSTAR, 1522 "^@": TokenType.CARET_AT, 1523 "@>": TokenType.AT_GT, 1524 "<@": TokenType.LT_AT, 1525 "ATTACH": TokenType.ATTACH, 1526 "BINARY": TokenType.VARBINARY, 1527 "BITSTRING": TokenType.BIT, 1528 "BPCHAR": TokenType.TEXT, 1529 "CHAR": TokenType.TEXT, 1530 "DATETIME": TokenType.TIMESTAMPNTZ, 1531 "DETACH": TokenType.DETACH, 1532 "FORCE": TokenType.FORCE, 1533 "INSTALL": TokenType.INSTALL, 1534 "INT8": TokenType.BIGINT, 1535 "LOGICAL": TokenType.BOOLEAN, 1536 "MACRO": TokenType.FUNCTION, 1537 "ONLY": TokenType.ONLY, 1538 "PIVOT_WIDER": TokenType.PIVOT, 1539 "POSITIONAL": TokenType.POSITIONAL, 1540 "RESET": TokenType.COMMAND, 1541 "ROW": TokenType.STRUCT, 1542 "SIGNED": TokenType.INT, 1543 "STRING": TokenType.TEXT, 1544 "SUMMARIZE": TokenType.SUMMARIZE, 1545 "TIMESTAMP": 
TokenType.TIMESTAMPNTZ, 1546 "TIMESTAMP_S": TokenType.TIMESTAMP_S, 1547 "TIMESTAMP_MS": TokenType.TIMESTAMP_MS, 1548 "TIMESTAMP_NS": TokenType.TIMESTAMP_NS, 1549 "TIMESTAMP_US": TokenType.TIMESTAMP, 1550 "UBIGINT": TokenType.UBIGINT, 1551 "UINTEGER": TokenType.UINT, 1552 "USMALLINT": TokenType.USMALLINT, 1553 "UTINYINT": TokenType.UTINYINT, 1554 "VARCHAR": TokenType.TEXT, 1555 } 1556 KEYWORDS.pop("/*+") 1557 1558 SINGLE_TOKENS = { 1559 **tokens.Tokenizer.SINGLE_TOKENS, 1560 "$": TokenType.PARAMETER, 1561 } 1562 1563 COMMANDS = tokens.Tokenizer.COMMANDS - {TokenType.SHOW} 1564 1565 class Parser(parser.Parser): 1566 MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS = True 1567 1568 BITWISE = parser.Parser.BITWISE.copy() 1569 BITWISE.pop(TokenType.CARET) 1570 1571 RANGE_PARSERS = { 1572 **parser.Parser.RANGE_PARSERS, 1573 TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps), 1574 TokenType.CARET_AT: binary_range_parser(exp.StartsWith), 1575 TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch), 1576 } 1577 1578 EXPONENT = { 1579 **parser.Parser.EXPONENT, 1580 TokenType.CARET: exp.Pow, 1581 TokenType.DSTAR: exp.Pow, 1582 } 1583 1584 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "STRUCT_PACK"} 1585 1586 SHOW_PARSERS = { 1587 "TABLES": _show_parser("TABLES"), 1588 "ALL TABLES": _show_parser("ALL TABLES"), 1589 } 1590 1591 FUNCTIONS = { 1592 **parser.Parser.FUNCTIONS, 1593 "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)), 1594 "ARRAY_PREPEND": _build_array_prepend, 1595 "ARRAY_REVERSE_SORT": _build_sort_array_desc, 1596 "ARRAY_SORT": exp.SortArray.from_arg_list, 1597 "BIT_AND": exp.BitwiseAndAgg.from_arg_list, 1598 "BIT_OR": exp.BitwiseOrAgg.from_arg_list, 1599 "BIT_XOR": exp.BitwiseXorAgg.from_arg_list, 1600 "CURRENT_LOCALTIMESTAMP": exp.Localtimestamp.from_arg_list, 1601 "DATEDIFF": _build_date_diff, 1602 "DATE_DIFF": _build_date_diff, 1603 "DATE_TRUNC": date_trunc_to_time, 1604 "DATETRUNC": date_trunc_to_time, 1605 
"DECODE": lambda args: exp.Decode( 1606 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1607 ), 1608 "EDITDIST3": exp.Levenshtein.from_arg_list, 1609 "ENCODE": lambda args: exp.Encode( 1610 this=seq_get(args, 0), charset=exp.Literal.string("utf-8") 1611 ), 1612 "EPOCH": exp.TimeToUnix.from_arg_list, 1613 "EPOCH_MS": lambda args: exp.UnixToTime( 1614 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 1615 ), 1616 "GENERATE_SERIES": _build_generate_series(), 1617 "GET_CURRENT_TIME": exp.CurrentTime.from_arg_list, 1618 "GET_BIT": lambda args: exp.Getbit( 1619 this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True 1620 ), 1621 "JARO_WINKLER_SIMILARITY": exp.JarowinklerSimilarity.from_arg_list, 1622 "JSON": exp.ParseJSON.from_arg_list, 1623 "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract), 1624 "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar), 1625 "LIST_APPEND": exp.ArrayAppend.from_arg_list, 1626 "LIST_CONCAT": parser.build_array_concat, 1627 "LIST_CONTAINS": exp.ArrayContains.from_arg_list, 1628 "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list, 1629 "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list, 1630 "LIST_FILTER": exp.ArrayFilter.from_arg_list, 1631 "LIST_HAS": exp.ArrayContains.from_arg_list, 1632 "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list, 1633 "LIST_MAX": exp.ArrayMax.from_arg_list, 1634 "LIST_MIN": exp.ArrayMin.from_arg_list, 1635 "LIST_PREPEND": _build_array_prepend, 1636 "LIST_REVERSE_SORT": _build_sort_array_desc, 1637 "LIST_SORT": exp.SortArray.from_arg_list, 1638 "LIST_TRANSFORM": exp.Transform.from_arg_list, 1639 "LIST_VALUE": lambda args: exp.Array(expressions=args), 1640 "MAKE_DATE": exp.DateFromParts.from_arg_list, 1641 "MAKE_TIME": exp.TimeFromParts.from_arg_list, 1642 "MAKE_TIMESTAMP": _build_make_timestamp, 1643 "QUANTILE_CONT": exp.PercentileCont.from_arg_list, 1644 "QUANTILE_DISC": exp.PercentileDisc.from_arg_list, 1645 "RANGE": 
_build_generate_series(end_exclusive=True), 1646 "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract), 1647 "REGEXP_EXTRACT_ALL": build_regexp_extract(exp.RegexpExtractAll), 1648 "REGEXP_MATCHES": exp.RegexpLike.from_arg_list, 1649 "REGEXP_REPLACE": lambda args: exp.RegexpReplace( 1650 this=seq_get(args, 0), 1651 expression=seq_get(args, 1), 1652 replacement=seq_get(args, 2), 1653 modifiers=seq_get(args, 3), 1654 single_replace=True, 1655 ), 1656 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 1657 "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"), 1658 "STRING_SPLIT": exp.Split.from_arg_list, 1659 "STRING_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1660 "STRING_TO_ARRAY": exp.Split.from_arg_list, 1661 "STRPTIME": build_formatted_time(exp.StrToTime, "duckdb"), 1662 "STRUCT_PACK": exp.Struct.from_arg_list, 1663 "STR_SPLIT": exp.Split.from_arg_list, 1664 "STR_SPLIT_REGEX": exp.RegexpSplit.from_arg_list, 1665 "TODAY": exp.CurrentDate.from_arg_list, 1666 "TIME_BUCKET": exp.DateBin.from_arg_list, 1667 "TO_TIMESTAMP": exp.UnixToTime.from_arg_list, 1668 "UNNEST": exp.Explode.from_arg_list, 1669 "VERSION": exp.CurrentVersion.from_arg_list, 1670 "XOR": binary_from_function(exp.BitwiseXor), 1671 } 1672 1673 FUNCTIONS.pop("DATE_SUB") 1674 FUNCTIONS.pop("GLOB") 1675 1676 FUNCTION_PARSERS = { 1677 **parser.Parser.FUNCTION_PARSERS, 1678 **dict.fromkeys( 1679 ("GROUP_CONCAT", "LISTAGG", "STRINGAGG"), lambda self: self._parse_string_agg() 1680 ), 1681 } 1682 FUNCTION_PARSERS.pop("DECODE") 1683 1684 NO_PAREN_FUNCTION_PARSERS = { 1685 **parser.Parser.NO_PAREN_FUNCTION_PARSERS, 1686 "MAP": lambda self: self._parse_map(), 1687 "@": lambda self: exp.Abs(this=self._parse_bitwise()), 1688 } 1689 1690 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 1691 TokenType.SEMI, 1692 TokenType.ANTI, 1693 } 1694 1695 PLACEHOLDER_PARSERS = { 1696 **parser.Parser.PLACEHOLDER_PARSERS, 1697 TokenType.PARAMETER: lambda self: ( 1698 
                self.expression(exp.Placeholder, this=self._prev.text)
                if self._match(TokenType.NUMBER) or self._match_set(self.ID_VAR_TOKENS)
                else None
            ),
        }

        TYPE_CONVERTERS = {
            # Bare DECIMAL defaults to DECIMAL(18, 3) in DuckDB:
            # https://duckdb.org/docs/sql/data_types/numeric
            exp.DataType.Type.DECIMAL: build_default_decimal_type(precision=18, scale=3),
            # All character types collapse to TEXT:
            # https://duckdb.org/docs/sql/data_types/text
            exp.DataType.Type.TEXT: lambda dtype: exp.DataType.build("TEXT"),
        }

        # DuckDB-specific top-level statements
        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ATTACH: lambda self: self._parse_attach_detach(),
            TokenType.DETACH: lambda self: self._parse_attach_detach(is_attach=False),
            TokenType.FORCE: lambda self: self._parse_force(),
            TokenType.INSTALL: lambda self: self._parse_install(),
            TokenType.SHOW: lambda self: self._parse_show(),
        }

        # SET VARIABLE name = value
        SET_PARSERS = {
            **parser.Parser.SET_PARSERS,
            "VARIABLE": lambda self: self._parse_set_item_assignment("VARIABLE"),
        }

        def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]:
            """Parse a lambda, supporting DuckDB's `LAMBDA x, y: expr` colon syntax
            in addition to the inherited arrow syntax."""
            index = self._index
            if not self._match_text_seq("LAMBDA"):
                return super()._parse_lambda(alias=alias)

            expressions = self._parse_csv(self._parse_lambda_arg)
            if not self._match(TokenType.COLON):
                # Not the colon variant after all; backtrack to before LAMBDA
                self._retreat(index)
                return None

            this = self._replace_lambda(self._parse_assignment(), expressions)
            return self.expression(exp.Lambda, this=this, expressions=expressions, colon=True)

        def _parse_expression(self) -> t.Optional[exp.Expression]:
            """Parse a projection expression, handling DuckDB prefix aliases."""
            # DuckDB supports prefix aliases, e.g. foo: 1
            if self._next and self._next.token_type == TokenType.COLON:
                alias = self._parse_id_var(tokens=self.ALIAS_TOKENS)
                self._match(TokenType.COLON)
                comments = self._prev_comments or []

                this = self._parse_assignment()
                if isinstance(this, exp.Expression):
                    # Moves the comment next to the alias in `alias: expr /* comment */`
                    comments += this.pop_comments() or []

                return self.expression(exp.Alias, comments=comments, this=this, alias=alias)

            return super()._parse_expression()

        def _parse_table(
            self,
            schema: bool = False,
            joins: bool = False,
            alias_tokens: t.Optional[t.Collection[TokenType]] = None,
            parse_bracket: bool = False,
            is_db_reference: bool = False,
            parse_partition: bool = False,
            consume_pipe: bool = False,
        ) -> t.Optional[exp.Expression]:
            """Parse a table reference, handling DuckDB prefix aliases."""
            # DuckDB supports prefix aliases, e.g. FROM foo: bar
            if self._next and self._next.token_type == TokenType.COLON:
                alias = self._parse_table_alias(
                    alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS
                )
                self._match(TokenType.COLON)
                comments = self._prev_comments or []
            else:
                alias = None
                comments = []

            table = super()._parse_table(
                schema=schema,
                joins=joins,
                alias_tokens=alias_tokens,
                parse_bracket=parse_bracket,
                is_db_reference=is_db_reference,
                parse_partition=parse_partition,
            )
            if isinstance(table, exp.Expression) and isinstance(alias, exp.TableAlias):
                # Moves the comment next to the alias in `alias: table /* comment */`
                comments += table.pop_comments() or []
                alias.comments = alias.pop_comments() + comments
                table.set("alias", alias)

            return table

        def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]:
            """Parse USING SAMPLE, filling in DuckDB's default sampling method.

            https://duckdb.org/docs/sql/samples.html
            """
            sample = super()._parse_table_sample(as_modifier=as_modifier)
            if sample and not sample.args.get("method"):
                # Fixed-size samples default to RESERVOIR, percentage samples to SYSTEM
                if sample.args.get("size"):
                    sample.set("method", exp.var("RESERVOIR"))
                else:
                    sample.set("method", exp.var("SYSTEM"))

            return sample

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            """Parse bracket subscripts, tagging pre-1.2 map indexing semantics."""
            bracket = super()._parse_bracket(this)

            if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket):
                # Before DuckDB 1.2, map[key] returned a single-element list:
                # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
                bracket.set("returns_list_for_maps", True)

            return bracket

        def _parse_map(self) -> exp.ToMap | exp.Map:
            """Parse MAP {...} (exp.ToMap) or MAP([keys], [values]) (exp.Map)."""
            if self._match(TokenType.L_BRACE, advance=False):
                return self.expression(exp.ToMap, this=self._parse_bracket())

            args = self._parse_wrapped_csv(self._parse_assignment)
            return self.expression(exp.Map, keys=seq_get(args, 0), values=seq_get(args, 1))

        def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]:
            """STRUCT members are parsed as regular field definitions."""
            return self._parse_field_def()

        def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]:
            """Compute PIVOT output column names; multiple aggs get DuckDB-style names."""
            if len(aggregations) == 1:
                return super()._pivot_column_names(aggregations)
            return pivot_column_names(aggregations, dialect="duckdb")

        def _parse_attach_detach(self, is_attach: bool = True) -> exp.Attach | exp.Detach:
            """Parse ATTACH [DATABASE] ... [(options)] or DETACH [DATABASE] ...."""

            def _parse_attach_option() -> exp.AttachOption:
                # Each option is `name [value]`, e.g. (READ_ONLY, TYPE SQLITE)
                return self.expression(
                    exp.AttachOption,
                    this=self._parse_var(any_token=True),
                    expression=self._parse_field(any_token=True),
                )

            self._match(TokenType.DATABASE)
            exists = self._parse_exists(not_=is_attach)
            this = self._parse_alias(self._parse_primary_or_var(), explicit=True)

            if self._match(TokenType.L_PAREN, advance=False):
                expressions = self._parse_wrapped_csv(_parse_attach_option)
            else:
                expressions = None

            return (
                self.expression(exp.Attach, this=this, exists=exists, expressions=expressions)
                if is_attach
                else self.expression(exp.Detach, this=this, exists=exists)
            )

        def _parse_show_duckdb(self, this: str) -> exp.Show:
            """Parse the remainder of a SHOW statement."""
            return self.expression(exp.Show, this=this)

        def _parse_force(self) -> exp.Install | exp.Command:
            """Parse FORCE INSTALL; anything else after FORCE falls back to a command."""
            # FORCE can only be followed by INSTALL or CHECKPOINT
            # In the case of CHECKPOINT, we fallback
            if not self._match(TokenType.INSTALL):
                return self._parse_as_command(self._prev)

            return self._parse_install(force=True)

        def _parse_install(self, force: bool = False) -> exp.Install:
            """Parse [FORCE] INSTALL extension [FROM repository]."""
            return self.expression(
                exp.Install,
                this=self._parse_id_var(),
                from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None,
                force=force,
            )

        def _parse_primary(self) -> t.Optional[exp.Expression]:
            """Parse primaries, supporting `#N` positional column references."""
            if self._match_pair(TokenType.HASH, TokenType.NUMBER):
                return exp.PositionalColumn(this=exp.Literal.number(self._prev.text))

            return super()._parse_primary()

    class Generator(generator.Generator):
        PARAMETER_TOKEN = "$"
        NAMED_PLACEHOLDER_TOKEN = "$"
        JOIN_HINTS = False
        TABLE_HINTS = False
        QUERY_HINTS = False
        LIMIT_FETCH = "LIMIT"
        STRUCT_DELIMITER = ("(", ")")
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        SEMI_ANTI_JOIN_WITH_SIDE = False
        TABLESAMPLE_KEYWORDS = "USING SAMPLE"
        TABLESAMPLE_SEED_KEYWORD = "REPEATABLE"
        LAST_DAY_SUPPORTS_DATE_PART = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_BRACKETED_KEY_SUPPORTED = False
        SUPPORTS_CREATE_TABLE_LIKE = False
        MULTI_ARG_DISTINCT = False
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        SUPPORTS_WINDOW_EXCLUDE = True
        COPY_HAS_INTO_KEYWORD = False
        STAR_EXCEPT = "EXCLUDE"
        PAD_FILL_PATTERN_IS_REQUIRED = True
        ARRAY_SIZE_DIM_REQUIRED = False
        NORMALIZE_EXTRACT_DATE_PARTS = True
        SUPPORTS_LIKE_QUANTIFIERS = False
        SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True

        # Maps sqlglot expression types to DuckDB SQL generation callables
        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.AnyValue: _anyvalue_sql,
            exp.ApproxDistinct: approx_count_distinct_sql,
            exp.Boolnot: _boolnot_sql,
            exp.Booland: _booland_sql,
            exp.Boolor: _boolor_sql,
            exp.Array: transforms.preprocess(
                [transforms.inherit_struct_field_names],
                generator=inline_array_unless_query,
            ),
            exp.ArrayAppend: array_append_sql("LIST_APPEND"),
            exp.ArrayCompact: array_compact_sql,
            exp.ArrayConstructCompact: lambda self, e: self.sql(
                exp.ArrayCompact(this=exp.Array(expressions=e.expressions))
            ),
            exp.ArrayConcat: array_concat_sql("LIST_CONCAT"),
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: rename_func("LIST_FILTER"),
            exp.ArrayInsert: _array_insert_sql,
            exp.ArrayRemoveAt: _array_remove_at_sql,
            exp.ArrayRemove: remove_from_array_using_filter,
            exp.ArraySort: _array_sort_sql,
            exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True),
            exp.ArraySum: rename_func("LIST_SUM"),
            exp.ArrayMax: rename_func("LIST_MAX"),
            exp.ArrayMin: rename_func("LIST_MIN"),
            exp.ArrayUniqueAgg: lambda self, e: self.func(
                "LIST", exp.Distinct(expressions=[e.this])
            ),
            exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False),
            exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True),
            exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"),
            exp.BitwiseAndAgg: _bitwise_agg_sql,
            exp.BitwiseCount: rename_func("BIT_COUNT"),
            exp.BitwiseLeftShift: _bitshift_sql,
            exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"),
            exp.BitwiseOrAgg: _bitwise_agg_sql,
            exp.BitwiseRightShift: _bitshift_sql,
            exp.BitwiseXorAgg: _bitwise_agg_sql,
            exp.ByteLength: lambda self, e: self.func("OCTET_LENGTH", e.this),
            exp.CommentColumnConstraint: no_comment_column_constraint_sql,
            exp.Corr: lambda self, e: self._corr_sql(e),
            exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"),
            exp.CurrentTime: lambda *_: "CURRENT_TIME",
            exp.CurrentSchemas: lambda self, e: self.func(
                "current_schemas", e.this if e.this else exp.true()
            ),
            exp.CurrentTimestamp: lambda self, e: self.sql(
                exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC"))
            )
            if e.args.get("sysdate")
            else "CURRENT_TIMESTAMP",
            exp.CurrentVersion: rename_func("version"),
            exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"),
            exp.DayOfMonth: rename_func("DAYOFMONTH"),
            exp.DayOfWeek: rename_func("DAYOFWEEK"),
            exp.DayOfWeekIso: rename_func("ISODOW"),
            exp.DayOfYear: rename_func("DAYOFYEAR"),
            exp.Dayname: lambda self, e: (
                self.func("STRFTIME", e.this, exp.Literal.string("%a"))
                if e.args.get("abbreviated")
                else self.func("DAYNAME", e.this)
            ),
            exp.Monthname: lambda self, e: (
                self.func("STRFTIME", e.this, exp.Literal.string("%b"))
                if e.args.get("abbreviated")
                else self.func("MONTHNAME", e.this)
            ),
            exp.DataType: _datatype_sql,
            exp.Date: _date_sql,
            exp.DateAdd: _date_delta_to_binary_interval_op(),
            exp.DateFromParts: _date_from_parts_sql,
            exp.DateSub: _date_delta_to_binary_interval_op(),
            exp.DateDiff: _date_diff_sql,
            exp.DateStrToDate: datestrtodate_sql,
            exp.Datetime: no_datetime_sql,
            exp.DatetimeDiff: _date_diff_sql,
            exp.DatetimeSub: _date_delta_to_binary_interval_op(),
            exp.DatetimeAdd: _date_delta_to_binary_interval_op(),
            exp.DateToDi: lambda self,
            e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)",
            exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False),
            exp.DiToDate: lambda self,
            e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)",
            exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False),
            exp.EqualNull: lambda self, e: self.sql(
                exp.NullSafeEQ(this=e.this, expression=e.expression)
            ),
            exp.EuclideanDistance: rename_func("LIST_DISTANCE"),
            exp.GenerateDateArray: _generate_datetime_array_sql,
            exp.GenerateTimestampArray: _generate_datetime_array_sql,
            exp.Getbit: getbit_sql,
            exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False),
            exp.Explode: rename_func("UNNEST"),
            exp.IntDiv: lambda self, e: self.binary(e, "//"),
            exp.IsInf: rename_func("ISINF"),
            exp.IsNan: rename_func("ISNAN"),
            exp.IsNullValue: lambda self, e: self.sql(
                exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL"))
            ),
            exp.IsArray: lambda self, e: self.sql(
                exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY"))
            ),
            exp.Ceil: _ceil_floor,
            exp.Floor: _ceil_floor,
            exp.JarowinklerSimilarity: jarowinkler_similarity("JARO_WINKLER_SIMILARITY"),
            exp.JSONBExists: rename_func("JSON_EXISTS"),
            exp.JSONExtract: _arrow_json_extract_sql,
            exp.JSONExtractArray: _json_extract_value_array_sql,
            exp.JSONFormat: _json_format_sql,
            exp.JSONValueArray: _json_extract_value_array_sql,
            exp.Lateral: _explode_to_unnest_sql,
            exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)),
            exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)),
            exp.Select: transforms.preprocess([_seq_to_range_in_generator]),
            exp.Seq1: lambda self, e: _seq_sql(self, e, 1),
            exp.Seq2: lambda self, e: _seq_sql(self, e, 2),
            exp.Seq4: lambda self, e: _seq_sql(self, e, 4),
            exp.Seq8: lambda self, e: _seq_sql(self, e, 8),
            exp.BoolxorAgg: _boolxor_agg_sql,
            exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "),
            exp.Initcap: _initcap_sql,
            exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)),
            exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"),
            exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True),
            exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"),
            exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True),
            exp.MonthsBetween: months_between_sql,
            exp.NextDay: _day_navigation_sql,
            exp.PercentileCont: rename_func("QUANTILE_CONT"),
            exp.PercentileDisc: rename_func("QUANTILE_DISC"),
            # DuckDB doesn't allow qualified columns inside of PIVOT expressions.
            # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62
            exp.Pivot: transforms.preprocess([transforms.unqualify_columns]),
            exp.PreviousDay: _day_navigation_sql,
            exp.RegexpILike: lambda self, e: self.func(
                "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i")
            ),
            exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"),
            exp.RegrValx: _regr_val_sql,
            exp.RegrValy: _regr_val_sql,
            exp.Return: lambda self, e: self.sql(e, "this"),
            exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "",
            exp.Rand: rename_func("RANDOM"),
            exp.Split: rename_func("STR_SPLIT"),
            exp.SortArray: _sort_array_sql,
            exp.StrPosition: strposition_sql,
            exp.StrToUnix: lambda self, e: self.func(
                "EPOCH", self.func("STRPTIME", e.this, self.format_time(e))
            ),
            exp.Struct: _struct_sql,
            exp.Transform: rename_func("LIST_TRANSFORM"),
            exp.TimeAdd: _date_delta_to_binary_interval_op(),
            exp.TimeSub: _date_delta_to_binary_interval_op(),
            exp.Time: no_time_sql,
            exp.TimeDiff: _timediff_sql,
            exp.Timestamp: no_timestamp_sql,
            exp.TimestampAdd: _date_delta_to_binary_interval_op(),
            exp.TimestampDiff: lambda self, e: self.func(
                "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this
            ),
            exp.TimestampSub: _date_delta_to_binary_interval_op(),
            exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.TimeStrToUnix: lambda self, e: self.func(
                "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP)
            ),
            exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)),
            exp.ToBoolean: _to_boolean_sql,
            exp.TimeToUnix: rename_func("EPOCH"),
            exp.TsOrDiToDi: lambda self,
            e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)",
            exp.TsOrDsAdd: _date_delta_to_binary_interval_op(),
            exp.TsOrDsDiff: lambda self, e: self.func(
                "DATE_DIFF",
                f"'{e.args.get('unit') or 'DAY'}'",
                exp.cast(e.expression, exp.DataType.Type.TIMESTAMP),
                exp.cast(e.this, exp.DataType.Type.TIMESTAMP),
            ),
            exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)),
            exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)),
            exp.UnixSeconds: lambda self, e: self.sql(
                exp.cast(
                    self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT
                )
            ),
            exp.UnixToStr: lambda self, e: self.func(
                "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e)
            ),
            exp.DatetimeTrunc: lambda self, e: self.func(
                "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DataType.Type.DATETIME)
            ),
            exp.UnixToTime: _unix_to_time_sql,
            exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)",
            exp.VariancePop: rename_func("VAR_POP"),
            exp.WeekOfYear: rename_func("WEEKOFYEAR"),
            exp.YearOfWeek: lambda self, e: self.sql(
                exp.Extract(
                    this=exp.Var(this="ISOYEAR"),
                    expression=e.this,
                )
            ),
            exp.YearOfWeekIso: lambda self, e: self.sql(
                exp.Extract(
                    this=exp.Var(this="ISOYEAR"),
                    expression=e.this,
                )
            ),
            exp.Xor: _xor_sql,
            exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"),
            exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"),
            exp.DateBin: rename_func("TIME_BUCKET"),
            exp.LastDay: _last_day_sql,
        }

        # JSON path syntax DuckDB can render natively
        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
            exp.JSONPathWildcard,
        }

        # Maps sqlglot data types to DuckDB type names
        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BINARY: "BLOB",
            exp.DataType.Type.BPCHAR: "TEXT",
            exp.DataType.Type.CHAR: "TEXT",
            exp.DataType.Type.DATETIME: "TIMESTAMP",
            exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)",
            exp.DataType.Type.FLOAT: "REAL",
            exp.DataType.Type.JSONB: "JSON",
            exp.DataType.Type.NCHAR: "TEXT",
            exp.DataType.Type.NVARCHAR: "TEXT",
            exp.DataType.Type.UINT: "UINTEGER",
            exp.DataType.Type.VARBINARY: "BLOB",
            exp.DataType.Type.ROWVERSION: "BLOB",
            exp.DataType.Type.VARCHAR: "TEXT",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ",
            exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S",
            exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS",
            exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS",
            exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)",
        }

        # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77
        RESERVED_KEYWORDS = {
            "array",
            "analyse",
            "union",
            "all",
            "when",
            "in_p",
            "default",
            "create_p",
            "window",
            "asymmetric",
            "to",
            "else",
            "localtime",
            "from",
            "end_p",
            "select",
            "current_date",
            "foreign",
            "with",
            "grant",
            "session_user",
            "or",
            "except",
            "references",
            "fetch",
            "limit",
            "group_p",
            "leading",
            "into",
            "collate",
            "offset",
            "do",
            "then",
            "localtimestamp",
            "check_p",
            "lateral_p",
            "current_role",
            "where",
            "asc_p",
            "placing",
            "desc_p",
            "user",
            "unique",
            "initially",
            "column",
            "both",
            "some",
            "as",
            "any",
            "only",
            "deferrable",
            "null_p",
            "current_time",
            "true_p",
            "table",
            "case",
            "trailing",
            "variadic",
            "for",
            "on",
            "distinct",
            "false_p",
            "not",
            "constraint",
            "current_timestamp",
            "returning",
            "primary",
            "intersect",
            "having",
            "analyze",
            "current_user",
            "and",
            "cast",
            "symmetric",
            "using",
            "order",
            "current_catalog",
        }

        UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren)

        # DuckDB doesn't generally support CREATE TABLE .. properties
        # https://duckdb.org/docs/sql/statements/create_table.html
        PROPERTIES_LOCATION = {
            prop: exp.Properties.Location.UNSUPPORTED
            for prop in generator.Generator.PROPERTIES_LOCATION
        }

        # There are a few exceptions (e.g.
        # temporary tables) which are supported or
        # can be transpiled to DuckDB, so we explicitly override them accordingly
        PROPERTIES_LOCATION[exp.LikeProperty] = exp.Properties.Location.POST_SCHEMA
        PROPERTIES_LOCATION[exp.TemporaryProperty] = exp.Properties.Location.POST_CREATE
        PROPERTIES_LOCATION[exp.ReturnsProperty] = exp.Properties.Location.POST_ALIAS
        PROPERTIES_LOCATION[exp.SequenceProperties] = exp.Properties.Location.POST_EXPRESSION

        # Window functions whose IGNORE/RESPECT NULLS modifier DuckDB accepts
        IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = (
            exp.FirstValue,
            exp.Lag,
            exp.LastValue,
            exp.Lead,
            exp.NthValue,
        )

        # Template for ZIPF transpilation - placeholders get replaced with actual parameters
        ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            WITH rand AS (SELECT :random_expr AS r),
            weights AS (
                SELECT i, 1.0 / POWER(i, :s) AS w
                FROM RANGE(1, :n + 1) AS t(i)
            ),
            cdf AS (
                SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p
                FROM weights
            )
            SELECT MIN(i)
            FROM cdf
            WHERE p >= (SELECT r FROM rand)
            """
        )

        # Template for NORMAL transpilation using Box-Muller transform
        # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2))
        NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse(
            ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))"
        )

        # Template for generating a seeded pseudo-random value in [0, 1) from a hash
        SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse(
            "(ABS(HASH(:seed)) % 1000000) / 1000000.0"
        )

        # Template for generating signed and unsigned SEQ values within a specified range
        SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(":base % :max_val")
        SEQ_SIGNED: exp.Expression = exp.maybe_parse(
            "(CASE WHEN :base % :max_val >= :half "
            "THEN :base % :max_val - :max_val "
            "ELSE :base % :max_val END)"
        )

        # Template for MAP_CAT transpilation - Snowflake semantics:
        # 1. Returns NULL if either input is NULL
        # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k]))
        # 3. Filters out entries with NULL values from the result
        MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            CASE
                WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL
                ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM(
                    LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))),
                    __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k]))
                ), __x -> __x.value IS NOT NULL))
            END
            """
        )

        # Mappings for EXTRACT/DATE_PART transpilation
        # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes
        EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = {
            "WEEKISO": ("%V", "INTEGER"),
            "YEAROFWEEK": ("%G", "INTEGER"),
            "YEAROFWEEKISO": ("%G", "INTEGER"),
            "NANOSECOND": ("%n", "BIGINT"),
        }

        # Maps epoch-based specifiers to DuckDB epoch functions
        EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = {
            "EPOCH_SECOND": "EPOCH",
            "EPOCH_MILLISECOND": "EPOCH_MS",
            "EPOCH_MICROSECOND": "EPOCH_US",
            "EPOCH_NANOSECOND": "EPOCH_NS",
        }

        # Template for BITMAP_CONSTRUCT_AGG transpilation
        #
        # BACKGROUND:
        # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap.
        # Supports values in range 0-32767, this version returns NULL if any value is out of range
        # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg
        # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts
        #
        # Snowflake uses two different formats based on the number of unique values:
        #
        # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes
        #   Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003)
        #   Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes
        #   Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex)
        #                                 count v1   v2   v3   pad
        #
        # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes
        #   Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes
        #   Bytes 10+: Each value as 2-byte little-endian integer (no padding)
        #   Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500
        #                                  hdr  ----9 zero bytes---- v1   v2   v3   v4   v5
        #
        # TEMPLATE STRUCTURE
        #
        # Phase 1 - Innermost subquery: Data preparation
        #   SELECT LIST_SORT(...) AS l
        #   - Aggregates all input values into a list, remove NULLs, duplicates and sorts
        #   Result: Clean, sorted list of unique non-null integers stored as 'l'
        #
        # Phase 2 - Middle subquery: Hex string construction
        #   LIST_TRANSFORM(...)
        #   - Converts each integer to 2-byte little-endian hex representation
        #   - & 255 extracts low byte, >> 8 extracts high byte
        #   - LIST_REDUCE: Concatenates all hex pairs into single string 'h'
        #   Result: Hex string of all values
        #
        # Phase 3 - Outer SELECT: Final bitmap assembly
        #   LENGTH(l) < 5:
        #   - Small format: 2-byte count (big-endian via %04X) + values + zero padding
        #   LENGTH(l) >= 5:
        #   - Large format: Fixed 10-byte header + values (no padding needed)
        #   Result: Complete binary bitmap as BLOB
        #
        BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            SELECT CASE
                WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL
                WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL
                WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2))
                ELSE UNHEX('08000000000000000000' || h)
            END
            FROM (
                SELECT l, COALESCE(LIST_REDUCE(
                    LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)),
                    (__a, __b) -> __a || __b, ''
                ), '') AS h
                FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l)
            )
            """
        )

        # Template for RANDSTR transpilation - placeholders get replaced with actual parameters
        RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse(
            f"""
            SELECT LISTAGG(
                SUBSTRING(
                    '{RANDSTR_CHAR_POOL}',
                    1 + CAST(FLOOR(random_value * 62) AS INT),
                    1
                ),
                ''
            )
            FROM (
                SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value
                FROM RANGE(:length) AS t(i)
            )
            """,
        )

        # Template for MINHASH transpilation
        # Computes k minimum hash values across aggregated data using DuckDB list functions
        # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1}
        MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1)
            FROM (
                SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h
                FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed)
            )
            """,
        )

        # Template for MINHASH_COMBINE transpilation
        # Combines multiple minhash signatures by taking element-wise minimum
        MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1)
            FROM (
                SELECT
                    pos AS idx,
                    MIN(val) AS min_h
                FROM
                    UNNEST(LIST(:expr)) AS _(sig),
                    UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos)
                GROUP BY pos
            )
            """,
        )

        # Template for APPROXIMATE_SIMILARITY transpilation
        # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree
        APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*)
            FROM (
                SELECT pos, COUNT(DISTINCT h) AS num_distinct
                FROM (
                    SELECT h, pos
                    FROM UNNEST(LIST(:expr)) AS _(sig),
                    UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos)
                )
                GROUP BY pos
            )
            """,
        )

        # Template for ARRAYS_ZIP transpilation
        # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest
        # Uses RANGE + indexing to match Snowflake behavior
        ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            CASE WHEN :null_check THEN NULL
            WHEN :all_empty_check THEN [:empty_struct]
            ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct)
            END
            """,
        )

        # ARRAY_EXCEPT with bag semantics: N - M occurrences via cumulative counting
        # 0-based indices in
template (SQLGlot internal), converted to 1-based for DuckDB 2467 # IS NOT DISTINCT FROM for NULL-safe element comparison 2468 ARRAY_EXCEPT_TEMPLATE: exp.Expression = exp.maybe_parse( 2469 """ 2470 CASE 2471 WHEN :source IS NULL OR :exclude IS NULL THEN NULL 2472 ELSE LIST_TRANSFORM( 2473 LIST_FILTER( 2474 LIST_ZIP(:source, GENERATE_SERIES(1, LEN(:source))), 2475 pair -> ( 2476 LEN(LIST_FILTER(:source[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0])) 2477 > LEN(LIST_FILTER(:exclude, e -> e IS NOT DISTINCT FROM pair[0])) 2478 ) 2479 ), 2480 pair -> pair[0] 2481 ) 2482 END 2483 """, 2484 ) 2485 2486 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2487 """ 2488 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2489 2490 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2491 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2492 2493 For 'END' kind, add the interval to get the end of the slice. 2494 For DATE type with 'END', cast result back to DATE to preserve type. 
2495 """ 2496 date_expr = expression.this 2497 slice_length = expression.expression 2498 unit = expression.unit 2499 kind = expression.text("kind").upper() 2500 2501 # Create INTERVAL expression: INTERVAL 'N' UNIT 2502 interval_expr = exp.Interval(this=slice_length, unit=unit) 2503 2504 # Create base time_bucket expression 2505 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2506 2507 # Check if we need the end of the slice (default is start) 2508 if not kind == "END": 2509 # For 'START', return time_bucket directly 2510 return self.sql(time_bucket_expr) 2511 2512 # For 'END', add the interval to get end of slice 2513 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2514 2515 # If input is DATE type, cast result back to DATE to preserve type 2516 # DuckDB converts DATE to TIMESTAMP when adding intervals 2517 if date_expr.is_type(exp.DataType.Type.DATE): 2518 return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE)) 2519 2520 return self.sql(add_expr) 2521 2522 def bitmapbucketnumber_sql( 2523 self: DuckDB.Generator, expression: exp.BitmapBucketNumber 2524 ) -> str: 2525 """ 2526 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2527 2528 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2529 - Each bucket covers 32,768 values 2530 - Bucket numbering starts at 1 2531 - Formula: ((value - 1) // 32768) + 1 for positive values 2532 2533 For non-positive values (0 and negative), we use value // 32768 to avoid 2534 producing bucket 0 or positive bucket IDs for negative inputs. 
2535 """ 2536 value = expression.this 2537 2538 positive_formula = ((value - 1) // 32768) + 1 2539 non_positive_formula = value // 32768 2540 2541 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2542 case_expr = ( 2543 exp.case() 2544 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2545 .else_(non_positive_formula) 2546 ) 2547 return self.sql(case_expr) 2548 2549 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2550 """ 2551 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2552 2553 Snowflake's BITMAP_BIT_POSITION behavior: 2554 - For n <= 0: returns ABS(n) % 32768 2555 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2556 """ 2557 this = expression.this 2558 2559 return self.sql( 2560 exp.Mod( 2561 this=exp.Paren( 2562 this=exp.If( 2563 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2564 true=this - exp.Literal.number(1), 2565 false=exp.Abs(this=this), 2566 ) 2567 ), 2568 expression=MAX_BIT_POSITION, 2569 ) 2570 ) 2571 2572 def bitmapconstructagg_sql( 2573 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2574 ) -> str: 2575 """ 2576 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2577 Uses a pre-parsed template with placeholders replaced by expression nodes. 
2578 2579 Snowflake bitmap format: 2580 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2581 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2582 """ 2583 arg = expression.this 2584 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2585 2586 def nthvalue_sql(self: DuckDB.Generator, expression: exp.NthValue) -> str: 2587 from_first = expression.args.get("from_first", True) 2588 if not from_first: 2589 self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end ") 2590 2591 return self.function_fallback_sql(expression) 2592 2593 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2594 """ 2595 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2596 Uses a pre-parsed template with placeholders replaced by expression nodes. 2597 2598 RANDSTR(length, generator) generates a random string of specified length. 
2599 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2600 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2601 - No generator: Use default seed value 2602 """ 2603 length = expression.this 2604 generator = expression.args.get("generator") 2605 2606 if generator: 2607 if isinstance(generator, exp.Rand): 2608 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2609 seed_value = generator.this or generator 2610 else: 2611 # Const/int or other expression - use as seed directly 2612 seed_value = generator 2613 else: 2614 # No generator specified, use default seed (arbitrary but deterministic) 2615 seed_value = exp.Literal.number(RANDSTR_SEED) 2616 2617 replacements = {"seed": seed_value, "length": length} 2618 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2619 2620 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2621 """ 2622 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2623 Uses a pre-parsed template with placeholders replaced by expression nodes. 
2624 """ 2625 s = expression.this 2626 n = expression.args["elementcount"] 2627 gen = expression.args["gen"] 2628 2629 if not isinstance(gen, exp.Rand): 2630 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2631 random_expr: exp.Expression = exp.Div( 2632 this=exp.Paren( 2633 this=exp.Mod( 2634 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2635 expression=exp.Literal.number(1000000), 2636 ) 2637 ), 2638 expression=exp.Literal.number(1000000.0), 2639 ) 2640 else: 2641 # Use RANDOM() for non-deterministic output 2642 random_expr = exp.Rand() 2643 2644 replacements = {"s": s, "n": n, "random_expr": random_expr} 2645 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2646 2647 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2648 """ 2649 TO_BINARY and TRY_TO_BINARY transpilation: 2650 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2651 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2652 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2653 2654 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2655 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2656 """ 2657 value = expression.this 2658 format_arg = expression.args.get("format") 2659 is_safe = expression.args.get("safe") 2660 is_binary = _is_binary(expression) 2661 2662 if not format_arg and not is_binary: 2663 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2664 return self.func(func_name, value) 2665 2666 # Snowflake defaults to HEX encoding when no format is specified 2667 fmt = format_arg.name.upper() if format_arg else "HEX" 2668 2669 if fmt in ("UTF-8", "UTF8"): 2670 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2671 result = self.func("ENCODE", value) 2672 elif fmt == "BASE64": 2673 result = self.func("FROM_BASE64", value) 2674 elif fmt == "HEX": 2675 result = self.func("UNHEX", value) 2676 else: 2677 if is_safe: 2678 return self.sql(exp.null()) 2679 
else: 2680 self.unsupported(f"format {fmt} is not supported") 2681 result = self.func("TO_BINARY", value) 2682 return f"TRY({result})" if is_safe else result 2683 2684 def _greatest_least_sql( 2685 self: DuckDB.Generator, expression: exp.Greatest | exp.Least 2686 ) -> str: 2687 """ 2688 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 2689 2690 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2691 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2692 """ 2693 # Get all arguments 2694 all_args = [expression.this, *expression.expressions] 2695 fallback_sql = self.function_fallback_sql(expression) 2696 2697 if expression.args.get("ignore_nulls"): 2698 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2699 return self.sql(fallback_sql) 2700 2701 # return NULL if any argument is NULL 2702 case_expr = exp.case().when( 2703 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2704 exp.null(), 2705 copy=False, 2706 ) 2707 case_expr.set("default", fallback_sql) 2708 return self.sql(case_expr) 2709 2710 def generator_sql(self, expression: exp.Generator) -> str: 2711 # Transpile Snowflake GENERATOR to DuckDB range() 2712 rowcount = expression.args.get("rowcount") 2713 time_limit = expression.args.get("time_limit") 2714 2715 if time_limit: 2716 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2717 2718 if not rowcount: 2719 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2720 return self.func("range", exp.Literal.number(0)) 2721 2722 return self.func("range", rowcount) 2723 2724 def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str: 2725 return self._greatest_least_sql(expression) 2726 2727 def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str: 2728 return self._greatest_least_sql(expression) 2729 2730 def lambda_sql( 2731 self, expression: exp.Lambda, 
arrow_sep: str = "->", wrap: bool = True 2732 ) -> str: 2733 if expression.args.get("colon"): 2734 prefix = "LAMBDA " 2735 arrow_sep = ":" 2736 wrap = False 2737 else: 2738 prefix = "" 2739 2740 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2741 return f"{prefix}{lambda_sql}" 2742 2743 def show_sql(self, expression: exp.Show) -> str: 2744 return f"SHOW {expression.name}" 2745 2746 def install_sql(self, expression: exp.Install) -> str: 2747 force = "FORCE " if expression.args.get("force") else "" 2748 this = self.sql(expression, "this") 2749 from_clause = expression.args.get("from_") 2750 from_clause = f" FROM {from_clause}" if from_clause else "" 2751 return f"{force}INSTALL {this}{from_clause}" 2752 2753 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2754 self.unsupported( 2755 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. " 2756 ) 2757 return self.function_fallback_sql(expression) 2758 2759 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2760 return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ)) 2761 2762 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2763 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2764 target_type = expression.args.get("target_type") 2765 needs_tz = target_type and target_type.this in ( 2766 exp.DataType.Type.TIMESTAMPLTZ, 2767 exp.DataType.Type.TIMESTAMPTZ, 2768 ) 2769 2770 if expression.args.get("safe"): 2771 formatted_time = self.format_time(expression) 2772 cast_type = ( 2773 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2774 ) 2775 return self.sql( 2776 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2777 ) 2778 2779 base_sql = str_to_time_sql(self, expression) 2780 if needs_tz: 2781 return self.sql( 2782 exp.cast( 2783 base_sql, 2784 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2785 ) 2786 ) 2787 return base_sql 
    def strtodate_sql(self, expression: exp.StrToDate) -> str:
        """Render STR_TO_DATE as CAST(STRPTIME(...) AS DATE); TRY_STRPTIME when safe."""
        formatted_time = self.format_time(expression)
        function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME"
        return self.sql(
            exp.cast(
                self.func(function_name, expression.this, formatted_time),
                exp.DataType(this=exp.DataType.Type.DATE),
            )
        )

    def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str:
        """Convert a timestamp-or-string value to TIME, honoring an optional format and safe mode."""
        this = expression.this
        time_format = self.format_time(expression)
        safe = expression.args.get("safe")
        time_type = exp.DataType.build("TIME", dialect="duckdb")
        # TryCast yields NULL instead of raising on bad input
        cast_expr = exp.TryCast if safe else exp.Cast

        if time_format:
            func_name = "TRY_STRPTIME" if safe else "STRPTIME"
            strptime = exp.Anonymous(this=func_name, expressions=[this, time_format])
            return self.sql(cast_expr(this=strptime, to=time_type))

        # Already TIME-typed (or a nested conversion): no cast needed
        if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME):
            return self.sql(this)

        return self.sql(cast_expr(this=this, to=time_type))

    def currentdate_sql(self, expression: exp.CurrentDate) -> str:
        """CURRENT_DATE, or — when a timezone argument is given — the date in that zone."""
        if not expression.this:
            return "CURRENT_DATE"

        expr = exp.Cast(
            this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this),
            to=exp.DataType(this=exp.DataType.Type.DATE),
        )
        return self.sql(expr)

    def parsejson_sql(self, expression: exp.ParseJSON) -> str:
        """Render PARSE_JSON as JSON(arg); safe mode returns NULL for invalid JSON."""
        arg = expression.this
        if expression.args.get("safe"):
            return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null()))
        return self.func("JSON", arg)

    @unsupported_args("decimals")
    def trunc_sql(self, expression: exp.Trunc) -> str:
        # DuckDB's TRUNC takes no precision argument, hence decimals is unsupported
        return self.func("TRUNC", expression.this)

    def normal_sql(self, expression: exp.Normal) -> str:
        """
        Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.

        Uses the Box-Muller transform via NORMAL_TEMPLATE.
        """
        mean = expression.this
        stddev = expression.args["stddev"]
        gen: exp.Expression = expression.args["gen"]

        # Build two uniform random values [0, 1) for Box-Muller transform
        if isinstance(gen, exp.Rand) and gen.this is None:
            u1: exp.Expression = exp.Rand()
            u2: exp.Expression = exp.Rand()
        else:
            # Seeded: derive two values using HASH with different inputs
            seed = gen.this if isinstance(gen, exp.Rand) else gen
            u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed)
            u2 = exp.replace_placeholders(
                self.SEEDED_RANDOM_TEMPLATE,
                seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)),
            )

        replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2}
        return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))

    def uniform_sql(self, expression: exp.Uniform) -> str:
        """
        Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.

        UNIFORM returns a random value in [min, max]:
        - Integer result if both min and max are integers
        - Float result if either min or max is a float
        """
        min_val = expression.this
        max_val = expression.expression
        gen = expression.args.get("gen")

        # Determine if result should be integer (both bounds are integers).
        # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT
        is_int_result = min_val.is_int and max_val.is_int

        # Build the random value expression [0, 1)
        if not isinstance(gen, exp.Rand):
            # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0
            random_expr: exp.Expression = exp.Div(
                this=exp.Paren(
                    this=exp.Mod(
                        this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])),
                        expression=exp.Literal.number(1000000),
                    )
                ),
                expression=exp.Literal.number(1000000.0),
            )
        else:
            random_expr = exp.Rand()

        # Build: min + random * (max - min [+ 1 for int])
        range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val)
        if is_int_result:
            # +1 makes the integer upper bound inclusive after FLOOR
            range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1))

        result: exp.Expression = exp.Add(
            this=min_val,
            expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)),
        )

        if is_int_result:
            result = exp.Cast(
                this=exp.Floor(this=result),
                to=exp.DataType.build("BIGINT"),
            )

        return self.sql(result)

    def timefromparts_sql(self, expression: exp.TimeFromParts) -> str:
        """Render TIME_FROM_PARTS, emulating Snowflake's component overflow via INTERVAL math."""
        nano = expression.args.get("nano")
        overflow = expression.args.get("overflow")

        # Snowflake's TIME_FROM_PARTS supports overflow
        if overflow:
            hour = expression.args["hour"]
            minute = expression.args["min"]
            sec = expression.args["sec"]

            # Check if values are within normal ranges - use MAKE_TIME for efficiency
            if not nano and all(arg.is_int for arg in [hour, minute, sec]):
                try:
                    h_val = hour.to_py()
                    m_val = minute.to_py()
                    s_val = sec.to_py()
                    if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59:
                        return rename_func("MAKE_TIME")(self, expression)
                except ValueError:
                    # Non-literal component; fall through to INTERVAL arithmetic
                    pass

            # Overflow or nanoseconds detected - use INTERVAL arithmetic
            if nano:
                # pop() detaches nano so it isn't rendered as a separate argument
                sec = sec + nano.pop() / exp.Literal.number(1000000000.0)

            total_seconds = (
                hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec
            )

            return self.sql(
                exp.Add(
                    this=exp.Cast(
                        this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME")
                    ),
                    expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")),
                )
            )

        # Default: MAKE_TIME
        if nano:
            # Fold nanoseconds into the seconds argument (MAKE_TIME has no nano param)
            expression.set(
                "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0)
            )

        return rename_func("MAKE_TIME")(self, expression)

    def extract_sql(self, expression: exp.Extract) -> str:
        """
        Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.

        DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND,
        EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
        """
        this = expression.this
        datetime_expr = expression.expression

        # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB
        # because Snowflake applies server timezone while DuckDB uses local timezone
        if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ):
            self.unsupported(
                "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences"
            )

        part_name = this.name.upper()

        if part_name in self.EXTRACT_STRFTIME_MAPPINGS:
            fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name]

            # Problem: strftime doesn't accept TIME and there's no NANOSECOND function
            # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000
            is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type(
                exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
            )

            if is_nano_time:
                self.unsupported(
                    "Parameter NANOSECOND is not supported with TIME type in DuckDB"
                )
                return self.sql(
                    exp.cast(
                        exp.Mul(
                            this=exp.Extract(
                                this=exp.var("MICROSECOND"), expression=datetime_expr
                            ),
                            expression=exp.Literal.number(1000),
                        ),
                        exp.DataType.build(cast_type, dialect="duckdb"),
                    )
                )

            # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
            strftime_input = datetime_expr
            if part_name == "NANOSECOND":
                strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS)

            return self.sql(
                exp.cast(
                    exp.Anonymous(
                        this="STRFTIME",
                        expressions=[strftime_input, exp.Literal.string(fmt)],
                    ),
                    exp.DataType.build(cast_type, dialect="duckdb"),
                )
            )

        if part_name in self.EXTRACT_EPOCH_MAPPINGS:
            func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
            result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr])
            # EPOCH returns float, cast to BIGINT for integer result
            if part_name == "EPOCH_SECOND":
                result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb"))
            return self.sql(result)

        return super().extract_sql(expression)

    def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
        """Render TIMESTAMP_FROM_PARTS for both the (date, time) and per-component forms."""
        # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
        date_expr = expression.this
        time_expr = expression.expression

        if date_expr is not None and time_expr is not None:
            # In DuckDB, DATE + TIME produces TIMESTAMP
            return self.sql(exp.Add(this=date_expr, expression=time_expr))

        # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
        sec = expression.args.get("sec")
        if sec is None:
            # This shouldn't happen with valid input, but handle gracefully
            return rename_func("MAKE_TIMESTAMP")(self, expression)

        # Fold sub-second components into the seconds argument, detaching them
        # so rename_func only passes args MAKE_TIMESTAMP accepts
        milli = expression.args.get("milli")
        if milli is not None:
            sec += milli.pop() / exp.Literal.number(1000.0)

        nano = expression.args.get("nano")
        if nano is not None:
            sec += nano.pop() / exp.Literal.number(1000000000.0)

        if milli or nano:
            expression.set("sec", sec)

        return rename_func("MAKE_TIMESTAMP")(self, expression)

    @unsupported_args("nano")
    def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str:
        # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)
        return f"CAST({timestamp} AS TIMESTAMPTZ)"

    @unsupported_args("nano")
    def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str:
        # Extract zone before popping
        zone = expression.args.get("zone")
        # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if zone:
            zone = zone.pop()

        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)

        if zone:
            # Use AT TIME ZONE to apply the explicit timezone
            return f"{timestamp} AT TIME ZONE {self.sql(zone)}"

        return timestamp

    def tablesample_sql(
        self,
        expression: exp.TableSample,
        tablesample_keyword: t.Optional[str] = None,
    ) -> str:
        """Render TABLESAMPLE, forcing reservoir sampling for discrete row counts."""
        if not isinstance(expression.parent, exp.Select):
            # This sample clause only applies to a single source, not the entire resulting relation
            tablesample_keyword = "TABLESAMPLE"

        if expression.args.get("size"):
            method = expression.args.get("method")
            if method and method.name.upper() != "RESERVOIR":
                self.unsupported(
                    f"Sampling method {method} is not supported with a discrete sample count, "
                    "defaulting to reservoir sampling"
                )
                expression.set("method", exp.var("RESERVOIR"))

        return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)

    def columndef_sql(self, expression: exp.ColumnDef, sep: str = " ") -> str:
        # UDF parameters are rendered as bare names, without their type
        if isinstance(expression.parent, exp.UserDefinedFunction):
            return self.sql(expression, "this")
        return super().columndef_sql(expression, sep)

    def join_sql(self, expression: exp.Join) -> str:
        """Render JOIN, adapting condition-less joins to DuckDB's requirements."""
        if (
            not expression.args.get("using")
            and not expression.args.get("on")
            and not expression.method
            and (expression.kind in ("", "INNER", "OUTER"))
        ):
            # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause
            # DuckDB doesn't, but we can just add a dummy ON clause that is always true
            if isinstance(expression.this, exp.Unnest):
                return super().join_sql(expression.on(exp.true()))

            # Otherwise degrade to a plain (cross-style) join
            expression.set("side", None)
            expression.set("kind", None)

        return super().join_sql(expression)

    def generateseries_sql(self, expression: exp.GenerateSeries) -> str:
        # GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b)
        if expression.args.get("is_end_exclusive"):
            return rename_func("RANGE")(self, expression)

        return self.function_fallback_sql(expression)

    def countif_sql(self, expression: exp.CountIf) -> str:
        # COUNT_IF is native from DuckDB 1.2; older versions need SUM(CASE ...)
        if self.dialect.version >= (1, 2):
            return self.function_fallback_sql(expression)

        # https://github.com/tobymao/sqlglot/pull/4749
        return count_if_to_sum(self, expression)

    def bracket_sql(self, expression: exp.Bracket) -> str:
        """Render bracket indexing, emulating pre-1.2 DuckDB MAP access semantics."""
        if self.dialect.version >= (1, 2):
            return super().bracket_sql(expression)

        # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
        this = expression.this
        if isinstance(this, exp.Array):
            # Array literals must be parenthesized before indexing
            this.replace(exp.paren(this))

        bracket = super().bracket_sql(expression)

        if not expression.args.get("returns_list_for_maps"):
            if not this.type:
                from sqlglot.optimizer.annotate_types import annotate_types

                this = annotate_types(this, dialect=self.dialect)

            # Pre-1.2, MAP[key] returned a list; unwrap the single element
            if this.is_type(exp.DataType.Type.MAP):
                bracket = f"({bracket})[1]"

        return bracket

    def withingroup_sql(self, expression: exp.WithinGroup) -> str:
        """Render WITHIN GROUP, folding ORDER BY into ARRAY_AGG as DuckDB requires."""
        func = expression.this

        # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP
        # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
        if isinstance(func, exp.ArrayAgg):
            if not isinstance(order := expression.expression, exp.Order):
                return self.sql(func)

            # Save the original column for FILTER clause (before wrapping with Order)
            original_this = func.this

            # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order
            # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
            func.set(
                "this",
                exp.Order(
                    this=func.this.copy(),
                    expressions=order.expressions,
                ),
            )

            # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed
            # Use original_this (not the Order-wrapped version) for the FILTER condition
            array_agg_sql = self.function_fallback_sql(func)
            return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)

        # For other functions (like PERCENTILES), use existing logic
        expression_sql = self.sql(expression, "expression")

        if isinstance(func, exp.PERCENTILES):
            # Make the order key the first arg and slide the fraction to the right
            # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
            order_col = expression.find(exp.Ordered)
            if order_col:
                func.set("expression", func.this)
                func.set("this", order_col.this)

        this = self.sql(expression, "this").rstrip(")")

        return f"{this}{expression_sql})"

    def length_sql(self, expression: exp.Length) -> str:
        """Render LENGTH, dispatching to OCTET_LENGTH semantics for binary inputs."""
        arg = expression.this

        # Dialects like BQ and Snowflake also accept binary values as args, so
        # DDB will attempt to infer the type or resort to case/when resolution
        if not expression.args.get("binary") or arg.is_string:
            return self.func("LENGTH", arg)

        if not arg.type:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.is_type(*exp.DataType.TEXT_TYPES):
            return self.func("LENGTH", arg)

        # We need these casts to make duckdb's static type checker happy
        blob = exp.cast(arg, exp.DataType.Type.VARBINARY)
        varchar = exp.cast(arg, exp.DataType.Type.VARCHAR)

        # Type is unknown at transpile time: resolve BLOB vs text at runtime
        case = (
            exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg]))
            .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob))
            .else_(exp.Anonymous(this="LENGTH", expressions=[varchar]))
        )
        return self.sql(case)

    def _validate_regexp_flags(
        self, flags: t.Optional[exp.Expression], supported_flags: str
    ) -> t.Optional[str]:
        """
        Validate and filter regexp flags for DuckDB compatibility.

        Args:
            flags: The flags expression to validate
            supported_flags: String of supported flags (e.g., "ims", "cims").
                Only these flags will be returned.

        Returns:
            Validated/filtered flag string, or None if no valid flags remain
        """
        if not isinstance(flags, exp.Expression):
            return None

        if not flags.is_string:
            self.unsupported("Non-literal regexp flags are not fully supported in DuckDB")
            return None

        flag_str = flags.this
        unsupported = set(flag_str) - set(supported_flags)

        if unsupported:
            self.unsupported(
                f"Regexp flags {sorted(unsupported)} are not supported in this context"
            )

        # Keep only the supported flags, preserving their original order
        flag_str = "".join(f for f in flag_str if f in supported_flags)
        return flag_str if flag_str else None

    def regexpcount_sql(self, expression: exp.RegexpCount) -> str:
        """Render REGEXP_COUNT as LENGTH(REGEXP_EXTRACT_ALL(...)) with flags inlined in the pattern."""
        this = expression.this
        pattern = expression.expression
        position = expression.args.get("position")
        parameters = expression.args.get("parameters")

        # Validate flags - only "ims" flags are supported for embedded patterns
        validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")

        if position:
            # Start matching from the given 1-based position
            this = exp.Substring(this=this, start=position)

        # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument)
        if validated_flags:
            pattern = exp.Concat(
                expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]
            )

        # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character
        result = (
            exp.case()
            .when(
                exp.EQ(this=pattern, expression=exp.Literal.string("")),
                exp.Literal.number(0),
            )
            .else_(
                exp.Length(
                    this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])
                )
            )
        )

        return self.sql(result)

    def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str:
        """Render REGEXP_REPLACE, emulating position/occurrence arguments where possible."""
        subject = expression.this
        pattern = expression.expression
        replacement = expression.args.get("replacement") or exp.Literal.string("")
        position = expression.args.get("position")
        occurrence = expression.args.get("occurrence")
        modifiers = expression.args.get("modifiers")

        validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or ""

        # Handle occurrence (only literals supported)
        if occurrence and not occurrence.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal occurrence")
        else:
            occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0
            if occurrence > 1:
                self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported")
            # flag duckdb to do either all or none, single_replace check is for duckdb round trip
            elif (
                occurrence == 0
                and "g" not in validated_flags
                and not expression.args.get("single_replace")
            ):
                validated_flags += "g"

        # Handle position (only literals supported)
        prefix = None
        if position and not position.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal position")
        elif position and position.is_int and position.to_py() > 1:
            # Split the subject: the part before `position` is passed through untouched
            pos = position.to_py()
            prefix = exp.Substring(
                this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1)
            )
            subject = exp.Substring(this=subject, start=exp.Literal.number(pos))

        result: exp.Expression = exp.Anonymous(
            this="REGEXP_REPLACE",
            expressions=[
                subject,
                pattern,
                replacement,
                exp.Literal.string(validated_flags) if validated_flags else None,
            ],
        )

        if prefix:
            result = exp.Concat(expressions=[prefix, result])

        return self.sql(result)

    def regexplike_sql(self, expression: exp.RegexpLike) -> str:
        """Render REGEXP_LIKE via REGEXP_MATCHES, anchoring the pattern for full-match semantics."""
        this = expression.this
        pattern = expression.expression
        flag = expression.args.get("flag")

        if not expression.args.get("full_match"):
            return self.func("REGEXP_MATCHES", this, pattern, flag)

        # DuckDB REGEXP_MATCHES supports: c, i, m, s (but not 'e')
        validated_flags = self._validate_regexp_flags(flag, supported_flags="cims")

        # Wrap the pattern as ^(pattern)$ to force a full match
        anchored_pattern = exp.Concat(
            expressions=[
                exp.Literal.string("^("),
                exp.Paren(this=pattern),
                exp.Literal.string(")$"),
            ]
        )

        if validated_flags:
            flag = exp.Literal.string(validated_flags)

        return self.func("REGEXP_MATCHES", this, anchored_pattern, flag)

    @unsupported_args("ins_cost", "del_cost", "sub_cost")
    def levenshtein_sql(self, expression: exp.Levenshtein) -> str:
        """Render LEVENSHTEIN, capping the result at max_dist when one is given."""
        this = expression.this
        expr = expression.expression
        max_dist = expression.args.get("max_dist")

        if max_dist is None:
            return self.func("LEVENSHTEIN", this, expr)

        # Emulate Snowflake semantics: if distance > max_dist, return max_dist
        levenshtein = exp.Levenshtein(this=this, expression=expr)
        return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))

    def pad_sql(self, expression: exp.Pad) -> str:
        """
        Handle RPAD/LPAD for VARCHAR and BINARY types.

        For VARCHAR: Delegate to parent class
        For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input)))
        """
        string_arg = expression.this
        fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ")

        if _is_binary(string_arg) or _is_binary(fill_arg):
            length_arg = expression.expression
            is_left = expression.args.get("is_left")

            input_len = exp.ByteLength(this=string_arg)
            chars_needed = length_arg - input_len
            # GREATEST(0, ...) prevents a negative repeat count when input is already long enough
            pad_count = exp.Greatest(
                this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True
            )
            repeat_expr = exp.Repeat(this=fill_arg, times=pad_count)

            # LPAD puts the padding before the input; RPAD after
            left, right = string_arg, repeat_expr
            if is_left:
                left, right = right, left

            result = exp.DPipe(this=left, expression=right)
            return self.sql(result)

        # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED)
        return super().pad_sql(expression)

    def minhash_sql(self, expression: exp.Minhash) -> str:
        """Render MINHASH via the MINHASH_TEMPLATE subquery; single-expression form only."""
        k = expression.this
        exprs = expression.expressions

        if len(exprs) != 1 or isinstance(exprs[0], exp.Star):
            self.unsupported(
                "MINHASH with multiple expressions or * requires manual query restructuring"
            )
            return self.func("MINHASH", k, *exprs)

        expr = exprs[0]
        result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k)
        return f"({self.sql(result)})"

    def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str:
        """Render MINHASH_COMBINE via the MINHASH_COMBINE_TEMPLATE subquery."""
        expr = expression.this
        result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr)
        return f"({self.sql(result)})"

    def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str:
        """Render APPROXIMATE_SIMILARITY via the APPROXIMATE_SIMILARITY_TEMPLATE subquery."""
        expr = expression.this
        result = exp.replace_placeholders(
            self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr
        )
        return f"({self.sql(result)})"

        def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str:
            """Render ARRAY_DISTINCT via LIST_DISTINCT, optionally restoring a NULL entry.

            When `check_null` is set, a NULL element is appended back whenever the
            input array's size differs from LIST_COUNT — presumably because
            LIST_DISTINCT/LIST_COUNT drop NULLs (verify against DuckDB docs).
            """
            arr = expression.this
            func = self.func("LIST_DISTINCT", arr)

            if expression.args.get("check_null"):
                add_null_to_array = exp.func(
                    "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null()
                )
                return self.sql(
                    exp.If(
                        this=exp.NEQ(
                            this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr)
                        ),
                        true=add_null_to_array,
                        false=func,
                    )
                )

            return func

        def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str:
            """Render ARRAY_EXCEPT by filling the class-level template with source/exclude."""
            source = expression.this
            exclude = expression.expression

            replacements = {"source": source, "exclude": exclude}
            return self.sql(exp.replace_placeholders(self.ARRAY_EXCEPT_TEMPLATE, **replacements))

        def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
            """Render ARRAYS_ZIP by building struct fields `$1..$n` and filling the zip template."""
            args = expression.expressions

            if not args:
                # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
                return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))

            # Build placeholder values for template
            lengths = [exp.Length(this=arg) for arg in args]
            max_len = (
                lengths[0]
                if len(lengths) == 1
                else exp.Greatest(this=lengths[0], expressions=lengths[1:])
            )

            # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...}
            empty_struct = exp.func(
                "STRUCT",
                *[
                    exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
                    for i in range(len(args))
                ],
            )

            # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
            # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax
            index = exp.column("__i") + 1
            transform_struct = exp.func(
                "STRUCT",
                *[
                    exp.PropertyEQ(
                        this=exp.Literal.string(f"${i + 1}"),
                        expression=exp.func("COALESCE", arg, exp.array())[index],
                    )
                    for i, arg in enumerate(args)
                ],
            )

            result = exp.replace_placeholders(
                self.ARRAYS_ZIP_TEMPLATE.copy(),
                null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
                all_empty_check=exp.and_(
                    *[
                        exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
                        for arg in args
                    ]
                ),
                empty_struct=empty_struct,
                max_len=max_len,
                transform_struct=transform_struct,
            )
            return self.sql(result)

        def lower_sql(self, expression: exp.Lower) -> str:
            # Force a VARCHAR argument, then restore a BLOB result type when required
            result_sql = self.func("LOWER", _cast_to_varchar(expression.this))
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def upper_sql(self, expression: exp.Upper) -> str:
            # Same VARCHAR-in / optional-BLOB-out dance as lower_sql
            result_sql = self.func("UPPER", _cast_to_varchar(expression.this))
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def reverse_sql(self, expression: exp.Reverse) -> str:
            # Same VARCHAR-in / optional-BLOB-out dance as lower_sql
            result_sql = self.func("REVERSE", _cast_to_varchar(expression.this))
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def base64encode_sql(self, expression: exp.Base64Encode) -> str:
            # DuckDB TO_BASE64 requires BLOB input
            # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
            # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
            result = expression.this

            # Check if input is a string type - ENCODE only accepts VARCHAR
            if result.is_type(*exp.DataType.TEXT_TYPES):
                result = exp.Encode(this=result)

            result = exp.ToBase64(this=result)

            max_line_length = expression.args.get("max_line_length")
            alphabet = expression.args.get("alphabet")

            # Handle custom alphabet by replacing standard chars with custom ones
            result = _apply_base64_alphabet_replacements(result, alphabet)

            # Handle max_line_length by inserting newlines every N characters
            line_length = (
                t.cast(int, max_line_length.to_py())
                if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
                else 0
            )
            if line_length > 0:
                newline = exp.Chr(expressions=[exp.Literal.number(10)])
                # REGEXP_REPLACE inserts a newline after every `line_length` chars;
                # TRIM(TRAILING) strips the newline a full final chunk would leave behind
                result = exp.Trim(
                    this=exp.RegexpReplace(
                        this=result,
                        expression=exp.Literal.string(f"(.{{{line_length}}})"),
                        replacement=exp.Concat(
                            expressions=[exp.Literal.string("\\1"), newline.copy()]
                        ),
                    ),
                    expression=newline,
                    position="TRAILING",
                )

            return self.sql(result)

        def replace_sql(self, expression: exp.Replace) -> str:
            """Render REPLACE with all three arguments coerced to VARCHAR."""
            result_sql = self.func(
                "REPLACE",
                _cast_to_varchar(expression.this),
                _cast_to_varchar(expression.expression),
                _cast_to_varchar(expression.args.get("replacement")),
            )
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def _bitwise_op(self, expression: exp.Binary, op: str) -> str:
            """Shared renderer for binary bitwise operators (args prepared, BLOB cast restored)."""
            _prepare_binary_bitwise_args(expression)
            result_sql = self.binary(expression, op)
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str:
            # DuckDB spells bitwise XOR as the XOR() function rather than an operator
            _prepare_binary_bitwise_args(expression)
            result_sql = self.func("XOR", expression.this, expression.expression)
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
            this = expression.this
            key = expression.args.get("key")
            key_sql = key.name if isinstance(key, exp.Expression) else ""
            value_sql = self.sql(expression, "value")

            kv_sql = f"{key_sql} := {value_sql}"

            # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake
            # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB
            if isinstance(this, exp.Struct) and not this.expressions:
                return self.func("STRUCT_PACK", kv_sql)

            return self.func("STRUCT_INSERT", this, kv_sql)

        def mapcat_sql(self, expression: exp.MapCat) -> str:
            """Render MAP_CAT by filling the class-level template with both maps."""
            result = exp.replace_placeholders(
                self.MAPCAT_TEMPLATE.copy(),
                map1=expression.this,
                map2=expression.expression,
            )
            return self.sql(result)

        def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str:
            # NOTE(review): MAP_KEYS is applied to args["key"] while `this` is passed as the
            # probe value — confirm this matches exp.MapContainsKey's argument layout.
            return self.func(
                "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this
            )

        def startswith_sql(self, expression: exp.StartsWith) -> str:
            """Render STARTS_WITH with both arguments coerced to VARCHAR."""
            return self.func(
                "STARTS_WITH",
                _cast_to_varchar(expression.this),
                _cast_to_varchar(expression.expression),
            )

        def space_sql(self, expression: exp.Space) -> str:
            # DuckDB's REPEAT requires BIGINT for the count parameter
            return self.sql(
                exp.Repeat(
                    this=exp.Literal.string(" "),
                    times=exp.cast(expression.this, exp.DataType.Type.BIGINT),
                )
            )

        def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
            # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE)
            if isinstance(expression.this, exp.Generator):
                # Preserve alias, joins, and other table-level args
                table = exp.Table(
                    this=expression.this,
                    alias=expression.args.get("alias"),
                    joins=expression.args.get("joins"),
                )
                return self.sql(table)

            return super().tablefromrows_sql(expression)

        def unnest_sql(self, expression: exp.Unnest) -> str:
            """Render UNNEST; BigQuery-style nested-array explosion gets max_depth => 2 in a subquery."""
            explode_array = expression.args.get("explode_array")
            if explode_array:
                # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct
                # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))"
                expression.expressions.append(
                    exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2))
                )

                # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB
                alias = expression.args.get("alias")
                if isinstance(alias, exp.TableAlias):
                    expression.set("alias", None)
                    if alias.columns:
                        alias = exp.TableAlias(this=seq_get(alias.columns, 0))

                unnest_sql = super().unnest_sql(expression)
                select = exp.Select(expressions=[unnest_sql]).subquery(alias)
                return self.sql(select)

            return super().unnest_sql(expression)

        def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
            this = expression.this

            if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
                # DuckDB should render IGNORE NULLS only for the general-purpose
                # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...)
                return super().ignorenulls_sql(expression)

            # FIRST(... IGNORE NULLS) is equivalent to ANY_VALUE in DuckDB
            if isinstance(this, exp.First):
                this = exp.AnyValue(this=this.this)

            if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
                self.unsupported("IGNORE NULLS is not supported for non-window functions.")

            return self.sql(this)

        def respectnulls_sql(self, expression: exp.RespectNulls) -> str:
            if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
                # DuckDB should render RESPECT NULLS only for the general-purpose
                # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...)
                return super().respectnulls_sql(expression)

            self.unsupported("RESPECT NULLS is not supported for non-window functions.")
            return self.sql(expression, "this")

        def arraytostring_sql(self, expression: exp.ArrayToString) -> str:
            """Render ARRAY_TO_STRING; a NULL-replacement value is applied via LIST_TRANSFORM."""
            this = self.sql(expression, "this")
            null_text = self.sql(expression, "null")

            if null_text:
                this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))"

            return self.func("ARRAY_TO_STRING", this, expression.expression)

        def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str:
            """Shared renderer for REGEXP_EXTRACT / REGEXP_EXTRACT_ALL with flag, group,
            position and occurrence handling."""
            this = expression.this
            group = expression.args.get("group")
            params = expression.args.get("parameters")
            position = expression.args.get("position")
            occurrence = expression.args.get("occurrence")
            null_if_pos_overflow = expression.args.get("null_if_pos_overflow")

            # Handle Snowflake's 'e' flag: it enables capture group extraction
            # In DuckDB, this is controlled by the group parameter directly
            if params and params.is_string and "e" in params.name:
                params = exp.Literal.string(params.name.replace("e", ""))

            validated_flags = self._validate_regexp_flags(params, supported_flags="cims")

            # Strip default group when no following params (DuckDB default is same as group=0)
            if (
                not validated_flags
                and group
                and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP)
            ):
                group = None
            flags_expr = exp.Literal.string(validated_flags) if validated_flags else None

            # use substring to handle position argument
            if position and (not position.is_int or position.to_py() > 1):
                this = exp.Substring(this=this, start=position)

                if null_if_pos_overflow:
                    # SUBSTRING past the end yields '' — NULLIF converts that to NULL
                    this = exp.Nullif(this=this, expression=exp.Literal.string(""))

            is_extract_all = isinstance(expression, exp.RegexpExtractAll)
            non_single_occurrence = occurrence and (not occurrence.is_int or occurrence.to_py() > 1)

            if is_extract_all or non_single_occurrence:
                name = "REGEXP_EXTRACT_ALL"
            else:
                name = "REGEXP_EXTRACT"

            result: exp.Expression = exp.Anonymous(
                this=name, expressions=[this, expression.expression, group, flags_expr]
            )

            # Array slicing for REGEXP_EXTRACT_ALL with occurrence
            if is_extract_all and non_single_occurrence:
                result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)])
            # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1
            elif non_single_occurrence:
                result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence])

            return self.sql(result)

        def regexpextract_sql(self, expression: exp.RegexpExtract) -> str:
            return self._regexp_extract_sql(expression)

        def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str:
            return self._regexp_extract_sql(expression)

        def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str:
            """Emulate REGEXP_INSTR by summing the lengths of the splits and matches that
            precede the requested occurrence."""
            this = expression.this
            pattern = expression.expression
            position = expression.args.get("position")
            orig_occ = expression.args.get("occurrence")
            occurrence = orig_occ or exp.Literal.number(1)
            option = expression.args.get("option")
            parameters = expression.args.get("parameters")

            # Inline supported flags into the pattern as (?ims...) since the emulation
            # calls functions that take no separate flags argument
            validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")
            if validated_flags:
                pattern = exp.Concat(
                    expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]
                )

            # Handle starting position offset
            pos_offset: exp.Expression = exp.Literal.number(0)
            if position and (not position.is_int or position.to_py() > 1):
                this = exp.Substring(this=this, start=position)
                pos_offset = position - exp.Literal.number(1)

            # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x)))
            def sum_lengths(func_name: str, end: exp.Expression) -> exp.Expression:
                lst = exp.Bracket(
                    this=exp.Anonymous(this=func_name, expressions=[this, pattern]),
                    expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)],
                    offset=1,
                )
                transform = exp.Anonymous(
                    this="LIST_TRANSFORM",
                    expressions=[
                        lst,
                        exp.Lambda(
                            this=exp.Length(this=exp.to_identifier("x")),
                            expressions=[exp.to_identifier("x")],
                        ),
                    ],
                )
                # COALESCE(..., 0) so an empty slice contributes nothing instead of NULL
                return exp.Coalesce(
                    this=exp.Anonymous(this="LIST_SUM", expressions=[transform]),
                    expressions=[exp.Literal.number(0)],
                )

            # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset
            base_pos: exp.Expression = (
                exp.Literal.number(1)
                + sum_lengths("STRING_SPLIT_REGEX", occurrence)
                + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1))
                + pos_offset
            )

            # option=1: add match length for end position
            if option and option.is_int and option.to_py() == 1:
                match_at_occ = exp.Bracket(
                    this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]),
                    expressions=[occurrence],
                    offset=1,
                )
                base_pos = base_pos + exp.Coalesce(
                    this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)]
                )

            # NULL checks for all provided arguments
            # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST
            null_args = [
                expression.this,
                expression.expression,
                position,
                orig_occ,
                option,
                parameters,
            ]
            null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg]

            matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])

            return self.sql(
                exp.case()
                .when(exp.or_(*null_checks), exp.Null())
                .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0))
                .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0))
                .else_(base_pos)
            )

        @unsupported_args("culture")
        def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
            """Render TO_CHAR-style number formatting via DuckDB FORMAT; integer scales only."""
            fmt = expression.args.get("format")
            if fmt and fmt.is_int:
                # e.g. scale 2 -> FORMAT('{:,.2f}', x)
                return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this)

            self.unsupported("Only integer formats are supported by NumberToStr")
            return self.function_fallback_sql(expression)

        def autoincrementcolumnconstraint_sql(self, _) -> str:
            self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB")
            return ""

        def aliases_sql(self, expression: exp.Aliases) -> str:
            # POSEXPLODE consumes its own aliases, so bypass the default Aliases rendering
            this = expression.this
            if isinstance(this, exp.Posexplode):
                return self.posexplode_sql(this)

            return super().aliases_sql(expression)

        def posexplode_sql(self, expression: exp.Posexplode) -> str:
            """Translate Spark's POSEXPLODE to GENERATE_SUBSCRIPTS(...) - 1, UNNEST(...)."""
            this = expression.this
            parent = expression.parent

            # The default Spark aliases are "pos" and "col", unless specified otherwise
            pos, col = exp.to_identifier("pos"), exp.to_identifier("col")

            if isinstance(parent, exp.Aliases):
                # Column case: SELECT POSEXPLODE(col) [AS (a, b)]
                pos, col = parent.expressions
            elif isinstance(parent, exp.Table):
                # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)]
                alias = parent.args.get("alias")
                if alias:
                    pos, col = alias.columns or [pos, col]
                    alias.pop()

            # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS
            # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS
            unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col))
            gen_subscripts = self.sql(
                exp.Alias(
                    this=exp.Anonymous(
                        this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)]
                    )
                    - exp.Literal.number(1),
                    alias=pos,
                )
            )

            posexplode_sql = self.format_args(gen_subscripts, unnest_sql)

            if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)):
                # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...))
                return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql])))

            return posexplode_sql

        def addmonths_sql(self, expression: exp.AddMonths) -> str:
            """
            Handles three key issues:
            1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
            2. End-of-month preservation: If input is last day of month, result is last day of result month
            3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
            """
            from sqlglot.optimizer.annotate_types import annotate_types

            this = expression.this
            if not this.type:
                this = annotate_types(this, dialect=self.dialect)

            if this.is_type(*exp.DataType.TEXT_TYPES):
                this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP))

            # Detect float/decimal months to apply rounding (Snowflake behavior)
            # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
            months_expr = expression.expression
            if not months_expr.type:
                months_expr = annotate_types(months_expr, dialect=self.dialect)

            # Build interval or to_months expression based on type
            # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT))
            interval_or_to_months = (
                exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
                if months_expr.is_type(
                    exp.DataType.Type.FLOAT,
                    exp.DataType.Type.DOUBLE,
                    exp.DataType.Type.DECIMAL,
                )
                # Integer case: standard INTERVAL N MONTH syntax
                else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
            )

            date_add_expr = exp.Add(this=this, expression=interval_or_to_months)

            # Apply end-of-month preservation if Snowflake flag is set
            # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
            preserve_eom = expression.args.get("preserve_end_of_month")
            result_expr = (
                exp.case()
                .when(
                    exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
                    exp.func("LAST_DAY", date_add_expr),
                )
                .else_(date_add_expr)
                if preserve_eom
                else date_add_expr
            )

            # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE
            # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type)
            # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ
            # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
            if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ):
                return self.sql(exp.Cast(this=result_expr, to=this.type))
            return self.sql(result_expr)

        def format_sql(self, expression: exp.Format) -> str:
            # Only the trivial single-'%s' case maps onto DuckDB's FORMAT('{}', arg)
            if expression.name.lower() == "%s" and len(expression.expressions) == 1:
                return self.func("FORMAT", "'{}'", expression.expressions[0])

            return self.function_fallback_sql(expression)

        def hexstring_sql(
            self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None
        ) -> str:
            # UNHEX('FF') correctly produces blob \xFF in DuckDB
            return super().hexstring_sql(expression, binary_function_repr="UNHEX")

        def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
            """Render DATE_TRUNC, casting back to the input's temporal type when requested."""
            unit = unit_to_str(expression)
            date = expression.this
            result = self.func("DATE_TRUNC", unit, date)

            if (
                expression.args.get("input_type_preserved")
                and date.is_type(*exp.DataType.TEMPORAL_TYPES)
                and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE))
            ):
                return self.sql(exp.Cast(this=result, to=date.type))

            return result

        def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
            unit = unit_to_str(expression)
            zone = expression.args.get("zone")
            timestamp = expression.this
            date_unit = is_date_unit(unit)

            if date_unit and zone:
                # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
                # Double AT TIME ZONE needed for BigQuery compatibility:
                # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
                # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component)
                timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
                result_sql = self.func("DATE_TRUNC", unit, timestamp)
                return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))

            result = self.func("DATE_TRUNC", unit, timestamp)
            if expression.args.get("input_type_preserved"):
                if timestamp.type and timestamp.is_type(
                    exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
                ):
                    # TIME has no date component, so anchor it to a dummy DATE before truncating,
                    # then cast back to the original TIME type
                    dummy_date = exp.Cast(
                        this=exp.Literal.string("1970-01-01"),
                        to=exp.DataType(this=exp.DataType.Type.DATE),
                    )
                    date_time = exp.Add(this=dummy_date, expression=timestamp)
                    result = self.func("DATE_TRUNC", unit, date_time)
                    return self.sql(exp.Cast(this=result, to=timestamp.type))

                if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
                    date_unit and timestamp.is_type(exp.DataType.Type.DATE)
                ):
                    return self.sql(exp.Cast(this=result, to=timestamp.type))

            return result

        def trim_sql(self, expression: exp.Trim) -> str:
            """Render TRIM with both the target and the trim characters coerced to VARCHAR."""
            expression.this.replace(_cast_to_varchar(expression.this))
            if expression.expression:
                expression.expression.replace(_cast_to_varchar(expression.expression))

            result_sql = super().trim_sql(expression)
            return _gen_with_cast_to_blob(self, expression, result_sql)

        def round_sql(self, expression: exp.Round) -> str:
            """Render ROUND, mapping half-even rounding modes to ROUND_EVEN."""
            this = expression.this
            decimals = expression.args.get("decimals")
            truncate = expression.args.get("truncate")

            # DuckDB requires the scale (decimals) argument to be an INT
            # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
            if decimals is not None and expression.args.get("casts_non_integer_decimals"):
                if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
                    decimals = exp.cast(decimals, exp.DataType.Type.INT)

            func = "ROUND"
            if truncate:
                # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
                if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
                    func = "ROUND_EVEN"
                    truncate = None
                # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
                elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
                    truncate = None

            return self.func(func, this, decimals, truncate)

        def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
            result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile"))

            # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type
            if expression.is_type(*exp.DataType.REAL_TYPES):
                result = f"CAST({result} AS DOUBLE)"

            return result

        def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
            """
            BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
            dividing the input distribution into n equal-sized buckets.

            Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
            does not document the specific algorithm used so results may differ. DuckDB does not
            support RESPECT NULLS.
            """
            this = expression.this
            if isinstance(this, exp.Distinct):
                # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
                if len(this.expressions) < 2:
                    self.unsupported("APPROX_QUANTILES requires a bucket count argument")
                    return self.function_fallback_sql(expression)
                num_quantiles_expr = this.expressions[1].pop()
            else:
                num_quantiles_expr = expression.expression

            if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
                self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
                return self.function_fallback_sql(expression)

            num_quantiles = t.cast(int, num_quantiles_expr.to_py())
            if num_quantiles <= 0:
                self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
                return self.function_fallback_sql(expression)

            # n buckets -> the n+1 boundary quantiles 0/n, 1/n, ..., n/n
            quantiles = [
                exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
                for i in range(num_quantiles + 1)
            ]

            return self.sql(
                exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
            )

        def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str:
            # scalar_only: extract via JSON_VALUE first so non-scalar values yield NULL
            if expression.args.get("scalar_only"):
                expression = exp.JSONExtractScalar(
                    this=rename_func("JSON_VALUE")(self, expression), expression="'$'"
                )
            return _arrow_json_extract_sql(self, expression)

        def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
            """Render ~x, casting BINARY operands to BIT and restoring a BLOB result."""
            this = expression.this

            if _is_binary(this):
                expression.type = exp.DataType.build("BINARY")

                arg = _cast_to_bit(this)

                # Parenthesize a negated operand so ~ binds to the whole expression
                if isinstance(this, exp.Neg):
                    arg = exp.Paren(this=arg)

                expression.set("this", arg)

            result_sql = f"~{self.sql(expression, 'this')}"

            return _gen_with_cast_to_blob(self, expression, result_sql)

        def window_sql(self, expression: exp.Window) -> str:
            # CORR (possibly behind a FILTER) needs the NaN-to-NULL treatment
            this = expression.this
            if isinstance(this, exp.Corr) or (
                isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr)
            ):
                return self._corr_sql(expression)

            return super().window_sql(expression)

        def filter_sql(self, expression: exp.Filter) -> str:
            if isinstance(expression.this, exp.Corr):
                return self._corr_sql(expression)

            return super().filter_sql(expression)

        def _corr_sql(
            self,
            expression: t.Union[exp.Filter, exp.Window, exp.Corr],
        ) -> str:
            """Render CORR; when null_on_zero_variance is set, a NaN result is mapped to NULL."""
            if isinstance(expression, exp.Corr) and not expression.args.get(
                "null_on_zero_variance"
            ):
                return self.func("CORR", expression.this, expression.expression)

            corr_expr = _maybe_corr_null_to_false(expression)
            if corr_expr is None:
                # Helper declined the rewrite: fall back to default rendering
                if isinstance(expression, exp.Window):
                    return super().window_sql(expression)
                if isinstance(expression, exp.Filter):
                    return super().filter_sql(expression)
                corr_expr = expression  # make mypy happy

            return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
class DuckDB(Dialect):
    # Dialect capability flags
    NULL_ORDERING = "nulls_are_last"
    SUPPORTS_USER_DEFINED_TYPES = True
    SAFE_DIVISION = True
    INDEX_OFFSET = 1  # DuckDB arrays/lists are 1-indexed
    CONCAT_COALESCE = True
    SUPPORTS_ORDER_BY_ALL = True
    SUPPORTS_FIXED_SIZE_ARRAYS = True
    STRICT_JSON_PATH_SYNTAX = False
    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True

    # https://duckdb.org/docs/sql/introduction.html#creating-a-new-table
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    DATE_PART_MAPPING = {
        **Dialect.DATE_PART_MAPPING,
        "DAYOFWEEKISO": "ISODOW",
    }

    # Copy so dialect-local mutations don't leak into the shared module-level mapping
    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    # WEEKDAY is removed from the inherited mapping for DuckDB
    DATE_PART_MAPPING.pop("WEEKDAY")

    INVERSE_TIME_MAPPING = {
        "%e": "%-d",  # BigQuery's space-padded day (%e) -> DuckDB's no-padding day (%-d)
        "%:z": "%z",  # In DuckDB %z can represent ±HH:MM, ±HHMM, or ±HH.
        "%-z": "%z",
        "%f_zero": "%n",
        "%f_one": "%n",
        "%f_two": "%n",
        "%f_three": "%g",
        "%f_four": "%n",
        "%f_five": "%n",
        "%f_seven": "%n",
        "%f_eight": "%n",
        "%f_nine": "%n",
    }

    def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
        """Return *path* unparsed when it uses DuckDB-specific JSON pointer syntax."""
        if isinstance(path, exp.Literal):
            # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`.
            # Additionally, it allows accessing the back of lists using the `[#-i]` syntax.
            # This check ensures we'll avoid trying to parse these as JSON paths, which can
            # either result in a noisy warning or in an invalid representation of the path.
            path_text = path.name
            if path_text.startswith("/") or "[#" in path_text:
                return path

        return super().to_json_path(path)

    class Tokenizer(tokens.Tokenizer):
        BYTE_STRINGS = [("e'", "'"), ("E'", "'")]
        BYTE_STRING_ESCAPES = ["'", "\\"]
        HEREDOC_STRINGS = ["$"]

        HEREDOC_TAG_IS_IDENTIFIER = True
        HEREDOC_STRING_ALTERNATIVE = TokenType.PARAMETER

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "//": TokenType.DIV,
            "**": TokenType.DSTAR,
            "^@": TokenType.CARET_AT,
            "@>": TokenType.AT_GT,
            "<@": TokenType.LT_AT,
            "ATTACH": TokenType.ATTACH,
            "BINARY": TokenType.VARBINARY,
            "BITSTRING": TokenType.BIT,
            "BPCHAR": TokenType.TEXT,
            "CHAR": TokenType.TEXT,
            "DATETIME": TokenType.TIMESTAMPNTZ,
            "DETACH": TokenType.DETACH,
            "FORCE": TokenType.FORCE,
            "INSTALL": TokenType.INSTALL,
            "INT8": TokenType.BIGINT,
            "LOGICAL": TokenType.BOOLEAN,
            "MACRO": TokenType.FUNCTION,
            "ONLY": TokenType.ONLY,
            "PIVOT_WIDER": TokenType.PIVOT,
            "POSITIONAL": TokenType.POSITIONAL,
            "RESET": TokenType.COMMAND,
            "ROW": TokenType.STRUCT,
            "SIGNED": TokenType.INT,
            "STRING": TokenType.TEXT,
            "SUMMARIZE": TokenType.SUMMARIZE,
            "TIMESTAMP": TokenType.TIMESTAMPNTZ,
            "TIMESTAMP_S": TokenType.TIMESTAMP_S,
            "TIMESTAMP_MS": TokenType.TIMESTAMP_MS,
            "TIMESTAMP_NS": TokenType.TIMESTAMP_NS,
            "TIMESTAMP_US": TokenType.TIMESTAMP,
            "UBIGINT": TokenType.UBIGINT,
            "UINTEGER": TokenType.UINT,
            "USMALLINT": TokenType.USMALLINT,
            "UTINYINT": TokenType.UTINYINT,
            "VARCHAR": TokenType.TEXT,
        }
        # DuckDB has no /*+ hint comments
        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.PARAMETER,
        }

        COMMANDS = tokens.Tokenizer.COMMANDS - {TokenType.SHOW}

    class Parser(parser.Parser):
        MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS = True

        # ^ is exponentiation in DuckDB, not bitwise XOR
        BITWISE = parser.Parser.BITWISE.copy()
        BITWISE.pop(TokenType.CARET)

        RANGE_PARSERS = {
            **parser.Parser.RANGE_PARSERS,
            TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps),
            TokenType.CARET_AT: binary_range_parser(exp.StartsWith),
            TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch),
        }

        EXPONENT = {
            **parser.Parser.EXPONENT,
            TokenType.CARET: exp.Pow,
            TokenType.DSTAR: exp.Pow,
        }

        FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "STRUCT_PACK"}

        SHOW_PARSERS = {
            "TABLES": _show_parser("TABLES"),
            "ALL TABLES": _show_parser("ALL TABLES"),
        }

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)),
            "ARRAY_PREPEND": _build_array_prepend,
            "ARRAY_REVERSE_SORT": _build_sort_array_desc,
            "ARRAY_SORT": exp.SortArray.from_arg_list,
            "BIT_AND": exp.BitwiseAndAgg.from_arg_list,
            "BIT_OR": exp.BitwiseOrAgg.from_arg_list,
            "BIT_XOR": exp.BitwiseXorAgg.from_arg_list,
            "CURRENT_LOCALTIMESTAMP": exp.Localtimestamp.from_arg_list,
            "DATEDIFF": _build_date_diff,
            "DATE_DIFF": _build_date_diff,
            "DATE_TRUNC": date_trunc_to_time,
            "DATETRUNC": date_trunc_to_time,
            "DECODE": lambda args: exp.Decode(
                this=seq_get(args, 0), charset=exp.Literal.string("utf-8")
            ),
            "EDITDIST3": exp.Levenshtein.from_arg_list,
            "ENCODE": lambda args: exp.Encode(
                this=seq_get(args, 0), charset=exp.Literal.string("utf-8")
            ),
            "EPOCH": exp.TimeToUnix.from_arg_list,
            "EPOCH_MS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "GENERATE_SERIES": _build_generate_series(),
            "GET_CURRENT_TIME": exp.CurrentTime.from_arg_list,
            "GET_BIT": lambda args: exp.Getbit(
                this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True
            ),
            "JARO_WINKLER_SIMILARITY": exp.JarowinklerSimilarity.from_arg_list,
            "JSON": exp.ParseJSON.from_arg_list,
            "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar),
            "LIST_APPEND": exp.ArrayAppend.from_arg_list,
            "LIST_CONCAT": parser.build_array_concat,
            "LIST_CONTAINS": exp.ArrayContains.from_arg_list,
            "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list,
            "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list,
            "LIST_FILTER": exp.ArrayFilter.from_arg_list,
            "LIST_HAS": exp.ArrayContains.from_arg_list,
            "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list,
            "LIST_MAX": exp.ArrayMax.from_arg_list,
            "LIST_MIN": exp.ArrayMin.from_arg_list,
            "LIST_PREPEND": _build_array_prepend,
            "LIST_REVERSE_SORT": _build_sort_array_desc,
            "LIST_SORT": exp.SortArray.from_arg_list,
            "LIST_TRANSFORM": exp.Transform.from_arg_list,
            "LIST_VALUE": lambda args: exp.Array(expressions=args),
            "MAKE_DATE": exp.DateFromParts.from_arg_list,
            "MAKE_TIME": exp.TimeFromParts.from_arg_list,
            "MAKE_TIMESTAMP": _build_make_timestamp,
            "QUANTILE_CONT": exp.PercentileCont.from_arg_list,
            "QUANTILE_DISC": exp.PercentileDisc.from_arg_list,
            "RANGE": _build_generate_series(end_exclusive=True),
            "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": build_regexp_extract(exp.RegexpExtractAll),
            "REGEXP_MATCHES": exp.RegexpLike.from_arg_list,
            "REGEXP_REPLACE": lambda args: exp.RegexpReplace(
                this=seq_get(args, 0),
                expression=seq_get(args, 1),
                replacement=seq_get(args, 2),
                modifiers=seq_get(args, 3),
                single_replace=True,
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"),
            "STRING_SPLIT": exp.Split.from_arg_list,
            "STRING_SPLIT_REGEX": exp.RegexpSplit.from_arg_list,
            "STRING_TO_ARRAY": exp.Split.from_arg_list,
            "STRPTIME": build_formatted_time(exp.StrToTime, "duckdb"),
            "STRUCT_PACK": exp.Struct.from_arg_list,
            "STR_SPLIT": exp.Split.from_arg_list,
            "STR_SPLIT_REGEX": exp.RegexpSplit.from_arg_list,
            "TODAY": exp.CurrentDate.from_arg_list,
            "TIME_BUCKET": exp.DateBin.from_arg_list,
            "TO_TIMESTAMP": exp.UnixToTime.from_arg_list,
            "UNNEST": exp.Explode.from_arg_list,
            "VERSION": exp.CurrentVersion.from_arg_list,
            "XOR": binary_from_function(exp.BitwiseXor),
        }

        FUNCTIONS.pop("DATE_SUB")
        FUNCTIONS.pop("GLOB")

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            **dict.fromkeys(
                ("GROUP_CONCAT", "LISTAGG", "STRINGAGG"), lambda self: self._parse_string_agg()
            ),
        }
        FUNCTION_PARSERS.pop("DECODE")

        NO_PAREN_FUNCTION_PARSERS = {
            **parser.Parser.NO_PAREN_FUNCTION_PARSERS,
            "MAP": lambda self: self._parse_map(),
            "@": lambda self: exp.Abs(this=self._parse_bitwise()),
        }

        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
            TokenType.SEMI,
            TokenType.ANTI,
        }

        PLACEHOLDER_PARSERS = {
            **parser.Parser.PLACEHOLDER_PARSERS,
            # `$` followed by a number or identifier parses as a placeholder ($1, $name)
            TokenType.PARAMETER: lambda self: (
                self.expression(exp.Placeholder, this=self._prev.text)
                if self._match(TokenType.NUMBER) or self._match_set(self.ID_VAR_TOKENS)
                else None
            ),
        }

        TYPE_CONVERTERS = {
            # https://duckdb.org/docs/sql/data_types/numeric
            exp.DataType.Type.DECIMAL: build_default_decimal_type(precision=18, scale=3),
            # https://duckdb.org/docs/sql/data_types/text
            exp.DataType.Type.TEXT: lambda dtype: exp.DataType.build("TEXT"),
        }

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ATTACH: lambda self: self._parse_attach_detach(),
            TokenType.DETACH: lambda self: self._parse_attach_detach(is_attach=False),
            TokenType.FORCE: lambda self: self._parse_force(),
            TokenType.INSTALL: lambda self: self._parse_install(),
            TokenType.SHOW: lambda self: self._parse_show(),
        }

        SET_PARSERS = {
            **parser.Parser.SET_PARSERS,
            "VARIABLE": lambda self: self._parse_set_item_assignment("VARIABLE"),
        }

        def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]:
            """Parse DuckDB's explicit `LAMBDA x, y: expr` syntax, falling back to
            the base parser (arrow-style lambdas) when the LAMBDA keyword is absent."""
            index = self._index
            if not self._match_text_seq("LAMBDA"):
                return super()._parse_lambda(alias=alias)

            expressions = self._parse_csv(self._parse_lambda_arg)
            if not self._match(TokenType.COLON):
                # Not actually a lambda -- rewind to where we started
                self._retreat(index)
                return None

            this = self._replace_lambda(self._parse_assignment(), expressions)
            # colon=True records the LAMBDA-keyword form so it can round-trip
            return self.expression(exp.Lambda, this=this, expressions=expressions, colon=True)

        def _parse_expression(self) -> t.Optional[exp.Expression]:
            # DuckDB supports prefix aliases, e.g. foo: 1
            if self._next and self._next.token_type == TokenType.COLON:
                alias = self._parse_id_var(tokens=self.ALIAS_TOKENS)
                self._match(TokenType.COLON)
                comments = self._prev_comments or []

                this = self._parse_assignment()
                if isinstance(this, exp.Expression):
                    # Moves the comment next to the alias in `alias: expr /* comment */`
                    comments += this.pop_comments() or []

                return self.expression(exp.Alias, comments=comments, this=this, alias=alias)

            return super()._parse_expression()

        def _parse_table(
            self,
            schema: bool = False,
            joins: bool = False,
            alias_tokens: t.Optional[t.Collection[TokenType]] = None,
            parse_bracket: bool = False,
            is_db_reference: bool = False,
            parse_partition: bool = False,
            consume_pipe: bool = False,
        ) -> t.Optional[exp.Expression]:
            # DuckDB supports prefix aliases, e.g. FROM foo: bar
            if self._next and self._next.token_type == TokenType.COLON:
                alias = self._parse_table_alias(
                    alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS
                )
                self._match(TokenType.COLON)
                comments = self._prev_comments or []
            else:
                alias = None
                comments = []

            # NOTE(review): consume_pipe is accepted for signature compatibility but
            # not forwarded to super() -- confirm this is intentional
            table = super()._parse_table(
                schema=schema,
                joins=joins,
                alias_tokens=alias_tokens,
                parse_bracket=parse_bracket,
                is_db_reference=is_db_reference,
                parse_partition=parse_partition,
            )
            if isinstance(table, exp.Expression) and isinstance(alias, exp.TableAlias):
                # Moves the comment next to the alias in `alias: table /* comment */`
                comments += table.pop_comments() or []
                alias.comments = alias.pop_comments() + comments
                table.set("alias", alias)

            return table

        def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]:
            # https://duckdb.org/docs/sql/samples.html
            # Fill in DuckDB's implicit default sampling methods so the AST is explicit:
            # fixed-size samples default to RESERVOIR, percentage samples to SYSTEM
            sample = super()._parse_table_sample(as_modifier=as_modifier)
            if sample and not sample.args.get("method"):
                if sample.args.get("size"):
                    sample.set("method", exp.var("RESERVOIR"))
                else:
                    sample.set("method", exp.var("SYSTEM"))

            return sample

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket):
                # Before DuckDB 1.2, map[key] returned a single-element list
                # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
                bracket.set("returns_list_for_maps", True)

            return bracket

        def _parse_map(self) -> exp.ToMap | exp.Map:
            """Parse `MAP {k: v, ...}` (struct-literal form) or `MAP([keys], [values])`."""
            if self._match(TokenType.L_BRACE, advance=False):
                return self.expression(exp.ToMap, this=self._parse_bracket())

            args = self._parse_wrapped_csv(self._parse_assignment)
            return self.expression(exp.Map, keys=seq_get(args, 0), values=seq_get(args, 1))
        def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]:
            # STRUCT fields are parsed as regular column definitions
            return self._parse_field_def()

        def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]:
            # With multiple aggregations, DuckDB's pivot column naming is derived
            # by the shared helper instead of the base parser
            if len(aggregations) == 1:
                return super()._pivot_column_names(aggregations)
            return pivot_column_names(aggregations, dialect="duckdb")

        def _parse_attach_detach(self, is_attach: bool = True) -> exp.Attach | exp.Detach:
            """Parse ATTACH [DATABASE] [IF NOT EXISTS] <target> [AS alias] [(opts)]
            or DETACH [DATABASE] [IF EXISTS] <target>."""

            def _parse_attach_option() -> exp.AttachOption:
                return self.expression(
                    exp.AttachOption,
                    this=self._parse_var(any_token=True),
                    expression=self._parse_field(any_token=True),
                )

            self._match(TokenType.DATABASE)
            # ATTACH uses IF NOT EXISTS, DETACH uses IF EXISTS
            exists = self._parse_exists(not_=is_attach)
            this = self._parse_alias(self._parse_primary_or_var(), explicit=True)

            if self._match(TokenType.L_PAREN, advance=False):
                expressions = self._parse_wrapped_csv(_parse_attach_option)
            else:
                expressions = None

            return (
                self.expression(exp.Attach, this=this, exists=exists, expressions=expressions)
                if is_attach
                else self.expression(exp.Detach, this=this, exists=exists)
            )

        def _parse_show_duckdb(self, this: str) -> exp.Show:
            return self.expression(exp.Show, this=this)

        def _parse_force(self) -> exp.Install | exp.Command:
            # FORCE can only be followed by INSTALL or CHECKPOINT
            # In the case of CHECKPOINT, we fallback
            if not self._match(TokenType.INSTALL):
                return self._parse_as_command(self._prev)

            return self._parse_install(force=True)

        def _parse_install(self, force: bool = False) -> exp.Install:
            # INSTALL <extension> [FROM <repository>]
            return self.expression(
                exp.Install,
                this=self._parse_id_var(),
                from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None,
                force=force,
            )

        def _parse_primary(self) -> t.Optional[exp.Expression]:
            # #N references the N-th select column positionally
            if self._match_pair(TokenType.HASH, TokenType.NUMBER):
                return exp.PositionalColumn(this=exp.Literal.number(self._prev.text))

            return super()._parse_primary()

    class Generator(generator.Generator):
        """DuckDB SQL generator: feature flags, transform table and type mappings."""

        PARAMETER_TOKEN = "$"
        NAMED_PLACEHOLDER_TOKEN = "$"
        JOIN_HINTS = False
        TABLE_HINTS = False
        QUERY_HINTS = False
        LIMIT_FETCH = "LIMIT"
        STRUCT_DELIMITER = ("(", ")")
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        SEMI_ANTI_JOIN_WITH_SIDE = False
        TABLESAMPLE_KEYWORDS = "USING SAMPLE"
        TABLESAMPLE_SEED_KEYWORD = "REPEATABLE"
        LAST_DAY_SUPPORTS_DATE_PART = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_BRACKETED_KEY_SUPPORTED = False
        SUPPORTS_CREATE_TABLE_LIKE = False
        MULTI_ARG_DISTINCT = False
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        SUPPORTS_WINDOW_EXCLUDE = True
        COPY_HAS_INTO_KEYWORD = False
        STAR_EXCEPT = "EXCLUDE"
        PAD_FILL_PATTERN_IS_REQUIRED = True
        ARRAY_SIZE_DIM_REQUIRED = False
        NORMALIZE_EXTRACT_DATE_PARTS = True
        SUPPORTS_LIKE_QUANTIFIERS = False
        SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True

        # Maps sqlglot expression nodes to DuckDB SQL renderers
        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.AnyValue: _anyvalue_sql,
            exp.ApproxDistinct: approx_count_distinct_sql,
            exp.Boolnot: _boolnot_sql,
            exp.Booland: _booland_sql,
            exp.Boolor: _boolor_sql,
            exp.Array: transforms.preprocess(
                [transforms.inherit_struct_field_names],
                generator=inline_array_unless_query,
            ),
            exp.ArrayAppend: array_append_sql("LIST_APPEND"),
            exp.ArrayCompact: array_compact_sql,
            exp.ArrayConstructCompact: lambda self, e: self.sql(
                exp.ArrayCompact(this=exp.Array(expressions=e.expressions))
            ),
            exp.ArrayConcat: array_concat_sql("LIST_CONCAT"),
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: rename_func("LIST_FILTER"),
            exp.ArrayInsert: _array_insert_sql,
            exp.ArrayRemoveAt: _array_remove_at_sql,
            exp.ArrayRemove: remove_from_array_using_filter,
            exp.ArraySort: _array_sort_sql,
            exp.ArrayPrepend: array_append_sql("LIST_PREPEND", swap_params=True),
            exp.ArraySum: rename_func("LIST_SUM"),
            exp.ArrayMax: rename_func("LIST_MAX"),
            exp.ArrayMin: rename_func("LIST_MIN"),
            exp.ArrayUniqueAgg: lambda self, e: self.func(
                "LIST", exp.Distinct(expressions=[e.this])
            ),
            exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False),
            exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True),
            exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"),
            exp.BitwiseAndAgg: _bitwise_agg_sql,
            exp.BitwiseCount: rename_func("BIT_COUNT"),
            exp.BitwiseLeftShift: _bitshift_sql,
            exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"),
            exp.BitwiseOrAgg: _bitwise_agg_sql,
            exp.BitwiseRightShift: _bitshift_sql,
            exp.BitwiseXorAgg: _bitwise_agg_sql,
            exp.ByteLength: lambda self, e: self.func("OCTET_LENGTH", e.this),
            exp.CommentColumnConstraint: no_comment_column_constraint_sql,
            exp.Corr: lambda self, e: self._corr_sql(e),
            exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"),
            exp.CurrentTime: lambda *_: "CURRENT_TIME",
            exp.CurrentSchemas: lambda self, e: self.func(
                "current_schemas", e.this if e.this else exp.true()
            ),
            # SYSDATE-style timestamps are rendered as CURRENT_TIMESTAMP in UTC
            exp.CurrentTimestamp: lambda self, e: self.sql(
                exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC"))
            )
            if e.args.get("sysdate")
            else "CURRENT_TIMESTAMP",
            exp.CurrentVersion: rename_func("version"),
            exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"),
            exp.DayOfMonth: rename_func("DAYOFMONTH"),
            exp.DayOfWeek: rename_func("DAYOFWEEK"),
            exp.DayOfWeekIso: rename_func("ISODOW"),
            exp.DayOfYear: rename_func("DAYOFYEAR"),
            # Abbreviated day/month names are produced via STRFTIME (%a / %b)
            exp.Dayname: lambda self, e: (
                self.func("STRFTIME", e.this, exp.Literal.string("%a"))
                if e.args.get("abbreviated")
                else self.func("DAYNAME", e.this)
            ),
            exp.Monthname: lambda self, e: (
                self.func("STRFTIME", e.this, exp.Literal.string("%b"))
                if e.args.get("abbreviated")
                else self.func("MONTHNAME", e.this)
            ),
            exp.DataType: _datatype_sql,
            exp.Date: _date_sql,
            exp.DateAdd: _date_delta_to_binary_interval_op(),
            exp.DateFromParts: _date_from_parts_sql,
            exp.DateSub: _date_delta_to_binary_interval_op(),
            exp.DateDiff: _date_diff_sql,
            exp.DateStrToDate: datestrtodate_sql,
            exp.Datetime: no_datetime_sql,
            exp.DatetimeDiff: _date_diff_sql,
            exp.DatetimeSub: _date_delta_to_binary_interval_op(),
            exp.DatetimeAdd: _date_delta_to_binary_interval_op(),
            exp.DateToDi: lambda self,
            e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)",
            exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False),
            exp.DiToDate: lambda self,
            e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)",
            exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False),
            exp.EqualNull: lambda self, e: self.sql(
                exp.NullSafeEQ(this=e.this, expression=e.expression)
            ),
            exp.EuclideanDistance: rename_func("LIST_DISTANCE"),
            exp.GenerateDateArray: _generate_datetime_array_sql,
            exp.GenerateTimestampArray: _generate_datetime_array_sql,
            exp.Getbit: getbit_sql,
            exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False),
            exp.Explode: rename_func("UNNEST"),
            exp.IntDiv: lambda self, e: self.binary(e, "//"),
            exp.IsInf: rename_func("ISINF"),
            exp.IsNan: rename_func("ISNAN"),
            # JSON type predicates are expressed via JSON_TYPE(...) = '<TYPE>'
            exp.IsNullValue: lambda self, e: self.sql(
                exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL"))
            ),
            exp.IsArray: lambda self, e: self.sql(
                exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY"))
            ),
            exp.Ceil: _ceil_floor,
            exp.Floor: _ceil_floor,
            exp.JarowinklerSimilarity: jarowinkler_similarity("JARO_WINKLER_SIMILARITY"),
            exp.JSONBExists: rename_func("JSON_EXISTS"),
            exp.JSONExtract: _arrow_json_extract_sql,
            exp.JSONExtractArray: _json_extract_value_array_sql,
            exp.JSONFormat: _json_format_sql,
            exp.JSONValueArray: _json_extract_value_array_sql,
            exp.Lateral: _explode_to_unnest_sql,
            exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)),
            exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)),
            exp.Select: transforms.preprocess([_seq_to_range_in_generator]),
            exp.Seq1: lambda self, e: _seq_sql(self, e, 1),
            exp.Seq2: lambda self, e: _seq_sql(self, e, 2),
            exp.Seq4: lambda self, e: _seq_sql(self, e, 4),
            exp.Seq8: lambda self, e: _seq_sql(self, e, 8),
            exp.BoolxorAgg: _boolxor_agg_sql,
            exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "),
            exp.Initcap: _initcap_sql,
            exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)),
            exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"),
            exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True),
            exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"),
            exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True),
            exp.MonthsBetween: months_between_sql,
            exp.NextDay: _day_navigation_sql,
            exp.PercentileCont: rename_func("QUANTILE_CONT"),
            exp.PercentileDisc: rename_func("QUANTILE_DISC"),
            # DuckDB doesn't allow qualified columns inside of PIVOT expressions.
            # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62
            exp.Pivot: transforms.preprocess([transforms.unqualify_columns]),
            exp.PreviousDay: _day_navigation_sql,
            exp.RegexpILike: lambda self, e: self.func(
                "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i")
            ),
            exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"),
            exp.RegrValx: _regr_val_sql,
            exp.RegrValy: _regr_val_sql,
            exp.Return: lambda self, e: self.sql(e, "this"),
            exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "",
            exp.Rand: rename_func("RANDOM"),
            exp.Split: rename_func("STR_SPLIT"),
            exp.SortArray: _sort_array_sql,
            exp.StrPosition: strposition_sql,
            exp.StrToUnix: lambda self, e: self.func(
                "EPOCH", self.func("STRPTIME", e.this, self.format_time(e))
            ),
            exp.Struct: _struct_sql,
            exp.Transform: rename_func("LIST_TRANSFORM"),
            exp.TimeAdd: _date_delta_to_binary_interval_op(),
            exp.TimeSub: _date_delta_to_binary_interval_op(),
            exp.Time: no_time_sql,
            exp.TimeDiff: _timediff_sql,
            exp.Timestamp: no_timestamp_sql,
            exp.TimestampAdd: _date_delta_to_binary_interval_op(),
            exp.TimestampDiff: lambda self, e: self.func(
                "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this
            ),
            exp.TimestampSub: _date_delta_to_binary_interval_op(),
            exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.TimeStrToUnix: lambda self, e: self.func(
                "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP)
            ),
            exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)),
            exp.ToBoolean: _to_boolean_sql,
            exp.TimeToUnix: rename_func("EPOCH"),
            exp.TsOrDiToDi: lambda self,
            e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 8) AS INT)",
            exp.TsOrDsAdd: _date_delta_to_binary_interval_op(),
            exp.TsOrDsDiff: lambda self, e: self.func(
                "DATE_DIFF",
                f"'{e.args.get('unit') or 'DAY'}'",
                exp.cast(e.expression, exp.DataType.Type.TIMESTAMP),
                exp.cast(e.this, exp.DataType.Type.TIMESTAMP),
            ),
            exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)),
            exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)),
            exp.UnixSeconds: lambda self, e: self.sql(
                exp.cast(
                    self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT
                )
            ),
            exp.UnixToStr: lambda self, e: self.func(
                "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e)
            ),
            exp.DatetimeTrunc: lambda self, e: self.func(
                "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DataType.Type.DATETIME)
            ),
            exp.UnixToTime: _unix_to_time_sql,
            exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)",
            exp.VariancePop: rename_func("VAR_POP"),
            exp.WeekOfYear: rename_func("WEEKOFYEAR"),
            # Both ISO and non-ISO year-of-week map to EXTRACT(ISOYEAR ...)
            exp.YearOfWeek: lambda self, e: self.sql(
                exp.Extract(
                    this=exp.Var(this="ISOYEAR"),
                    expression=e.this,
                )
            ),
            exp.YearOfWeekIso: lambda self, e: self.sql(
                exp.Extract(
                    this=exp.Var(this="ISOYEAR"),
                    expression=e.this,
                )
            ),
            exp.Xor: _xor_sql,
            exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"),
            exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"),
            exp.DateBin: rename_func("TIME_BUCKET"),
            exp.LastDay: _last_day_sql,
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
            exp.JSONPathWildcard,
        }

        # Maps sqlglot types to the DuckDB type names used when generating SQL
        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BINARY: "BLOB",
            exp.DataType.Type.BPCHAR: "TEXT",
            exp.DataType.Type.CHAR: "TEXT",
            exp.DataType.Type.DATETIME: "TIMESTAMP",
            exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)",
            exp.DataType.Type.FLOAT: "REAL",
            exp.DataType.Type.JSONB: "JSON",
            exp.DataType.Type.NCHAR: "TEXT",
            exp.DataType.Type.NVARCHAR: "TEXT",
            exp.DataType.Type.UINT: "UINTEGER",
            exp.DataType.Type.VARBINARY: "BLOB",
            exp.DataType.Type.ROWVERSION: "BLOB",
            exp.DataType.Type.VARCHAR: "TEXT",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ",
            exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S",
            exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS",
            exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS",
            exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)",
        }

        # Identifiers that must be quoted when generated; mirrors DuckDB's libpg_query list
        # (the *_p suffixes come verbatim from that grammar file)
        # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77
        RESERVED_KEYWORDS = {
            "array",
            "analyse",
            "union",
            "all",
            "when",
            "in_p",
            "default",
            "create_p",
            "window",
            "asymmetric",
            "to",
            "else",
            "localtime",
            "from",
            "end_p",
            "select",
            "current_date",
            "foreign",
            "with",
            "grant",
            "session_user",
            "or",
            "except",
            "references",
            "fetch",
            "limit",
            "group_p",
            "leading",
            "into",
            "collate",
            "offset",
            "do",
            "then",
            "localtimestamp",
            "check_p",
            "lateral_p",
            "current_role",
            "where",
            "asc_p",
            "placing",
            "desc_p",
            "user",
            "unique",
            "initially",
            "column",
            "both",
            "some",
            "as",
            "any",
            "only",
            "deferrable",
            "null_p",
            "current_time",
            "true_p",
            "table",
            "case",
            "trailing",
            "variadic",
            "for",
            "on",
            "distinct",
            "false_p",
            "not",
            "constraint",
            "current_timestamp",
            "returning",
            "primary",
            "intersect",
            "having",
            "analyze",
            "current_user",
            "and",
            "cast",
            "symmetric",
            "using",
            "order",
            "current_catalog",
        }

        # Interval values that don't need quoting, e.g. INTERVAL 1 DAY
        UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren)

        # DuckDB doesn't generally support CREATE TABLE .. properties
        # https://duckdb.org/docs/sql/statements/create_table.html
        PROPERTIES_LOCATION = {
            prop: exp.Properties.Location.UNSUPPORTED
            for prop in generator.Generator.PROPERTIES_LOCATION
        }

        # There are a few exceptions (e.g. temporary tables) which are supported or
        # can be transpiled to DuckDB, so we explicitly override them accordingly
        PROPERTIES_LOCATION[exp.LikeProperty] = exp.Properties.Location.POST_SCHEMA
        PROPERTIES_LOCATION[exp.TemporaryProperty] = exp.Properties.Location.POST_CREATE
        PROPERTIES_LOCATION[exp.ReturnsProperty] = exp.Properties.Location.POST_ALIAS
        PROPERTIES_LOCATION[exp.SequenceProperties] = exp.Properties.Location.POST_EXPRESSION

        # Window functions for which IGNORE/RESPECT NULLS is rendered inside the call
        IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = (
            exp.FirstValue,
            exp.Lag,
            exp.LastValue,
            exp.Lead,
            exp.NthValue,
        )

        # Template for ZIPF transpilation - placeholders get replaced with actual parameters.
        # Samples from a Zipf distribution by inverting the cumulative weight function.
        ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            WITH rand AS (SELECT :random_expr AS r),
            weights AS (
                SELECT i, 1.0 / POWER(i, :s) AS w
                FROM RANGE(1, :n + 1) AS t(i)
            ),
            cdf AS (
                SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p
                FROM weights
            )
            SELECT MIN(i)
            FROM cdf
            WHERE p >= (SELECT r FROM rand)
            """
        )

        # Template for NORMAL transpilation using Box-Muller transform
        # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2))
        # GREATEST(:u1, 1e-10) guards LN() against a zero uniform sample
        NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse(
            ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))"
        )

        # Template for generating a seeded pseudo-random value in [0, 1) from a hash
        SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse(
            "(ABS(HASH(:seed)) % 1000000) / 1000000.0"
        )

        # Template for generating signed and unsigned SEQ values within a specified range.
        # The signed variant wraps values >= :half around into the negative range
        # (two's-complement style).
        SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(":base % :max_val")
        SEQ_SIGNED: exp.Expression = exp.maybe_parse(
            "(CASE WHEN :base % :max_val >= :half "
            "THEN :base % :max_val - :max_val "
            "ELSE :base % :max_val END)"
        )

        # Template for MAP_CAT transpilation - Snowflake semantics:
        # 1. Returns NULL if either input is NULL
        # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k]))
        # 3. Filters out entries with NULL values from the result
        MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse(
            """
            CASE
                WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL
                ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM(
                    LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))),
                    __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k]))
                ), __x -> __x.value IS NOT NULL))
            END
            """
        )

        # Mappings for EXTRACT/DATE_PART transpilation
        # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes
        # (format code, result type to cast the strftime output to)
        EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = {
            "WEEKISO": ("%V", "INTEGER"),
            "YEAROFWEEK": ("%G", "INTEGER"),
            "YEAROFWEEKISO": ("%G", "INTEGER"),
            "NANOSECOND": ("%n", "BIGINT"),
        }

        # Maps epoch-based specifiers to DuckDB epoch functions
        EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = {
            "EPOCH_SECOND": "EPOCH",
            "EPOCH_MILLISECOND": "EPOCH_MS",
            "EPOCH_MICROSECOND": "EPOCH_US",
            "EPOCH_NANOSECOND": "EPOCH_NS",
        }

        # Template for BITMAP_CONSTRUCT_AGG transpilation
        #
        # BACKGROUND:
        # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap.
        # Supports values in range 0-32767, this version returns NULL if any value is out of range
        # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg
        # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts
        #
        # Snowflake uses two different formats based on the number of unique values:
        #
        # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes
        #   Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003)
        #   Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes
        #   Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex)
        #                                 count v1   v2   v3   pad
        #
        # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes
        #   Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes
        #   Bytes 10+: Each value as 2-byte little-endian integer (no padding)
        #   Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500
        #                                   hdr ----9 zero bytes---- v1   v2   v3   v4   v5
        #
        # TEMPLATE STRUCTURE
        #
        # Phase 1 - Innermost subquery: Data preparation
        #   SELECT LIST_SORT(...) AS l
        #   - Aggregates all input values into a list, remove NULLs, duplicates and sorts
        #   Result: Clean, sorted list of unique non-null integers stored as 'l'
        #
        # Phase 2 - Middle subquery: Hex string construction
        #   LIST_TRANSFORM(...)
2359 # - Converts each integer to 2-byte little-endian hex representation 2360 # - & 255 extracts low byte, >> 8 extracts high byte 2361 # - LIST_REDUCE: Concatenates all hex pairs into single string 'h' 2362 # Result: Hex string of all values 2363 # 2364 # Phase 3 - Outer SELECT: Final bitmap assembly 2365 # LENGTH(l) < 5: 2366 # - Small format: 2-byte count (big-endian via %04X) + values + zero padding 2367 # LENGTH(l) >= 5: 2368 # - Large format: Fixed 10-byte header + values (no padding needed) 2369 # Result: Complete binary bitmap as BLOB 2370 # 2371 BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse( 2372 """ 2373 SELECT CASE 2374 WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL 2375 WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL 2376 WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2)) 2377 ELSE UNHEX('08000000000000000000' || h) 2378 END 2379 FROM ( 2380 SELECT l, COALESCE(LIST_REDUCE( 2381 LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)), 2382 (__a, __b) -> __a || __b, '' 2383 ), '') AS h 2384 FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l) 2385 ) 2386 """ 2387 ) 2388 2389 # Template for RANDSTR transpilation - placeholders get replaced with actual parameters 2390 RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse( 2391 f""" 2392 SELECT LISTAGG( 2393 SUBSTRING( 2394 '{RANDSTR_CHAR_POOL}', 2395 1 + CAST(FLOOR(random_value * 62) AS INT), 2396 1 2397 ), 2398 '' 2399 ) 2400 FROM ( 2401 SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value 2402 FROM RANGE(:length) AS t(i) 2403 ) 2404 """, 2405 ) 2406 2407 # Template for MINHASH transpilation 2408 # Computes k minimum hash values across aggregated data using DuckDB list functions 2409 # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1} 2410 MINHASH_TEMPLATE: exp.Expression = 
exp.maybe_parse( 2411 """ 2412 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1) 2413 FROM ( 2414 SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h 2415 FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed) 2416 ) 2417 """, 2418 ) 2419 2420 # Template for MINHASH_COMBINE transpilation 2421 # Combines multiple minhash signatures by taking element-wise minimum 2422 MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse( 2423 """ 2424 SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1) 2425 FROM ( 2426 SELECT 2427 pos AS idx, 2428 MIN(val) AS min_h 2429 FROM 2430 UNNEST(LIST(:expr)) AS _(sig), 2431 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos) 2432 GROUP BY pos 2433 ) 2434 """, 2435 ) 2436 2437 # Template for APPROXIMATE_SIMILARITY transpilation 2438 # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree 2439 APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse( 2440 """ 2441 SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) 2442 FROM ( 2443 SELECT pos, COUNT(DISTINCT h) AS num_distinct 2444 FROM ( 2445 SELECT h, pos 2446 FROM UNNEST(LIST(:expr)) AS _(sig), 2447 UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos) 2448 ) 2449 GROUP BY pos 2450 ) 2451 """, 2452 ) 2453 2454 # Template for ARRAYS_ZIP transpilation 2455 # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest 2456 # Uses RANGE + indexing to match Snowflake behavior 2457 ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse( 2458 """ 2459 CASE WHEN :null_check THEN NULL 2460 WHEN :all_empty_check THEN [:empty_struct] 2461 ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct) 2462 END 2463 """, 2464 ) 2465 2466 # ARRAY_EXCEPT with bag semantics: N - M occurrences via cumulative counting 2467 # 0-based indices in 
template (SQLGlot internal), converted to 1-based for DuckDB 2468 # IS NOT DISTINCT FROM for NULL-safe element comparison 2469 ARRAY_EXCEPT_TEMPLATE: exp.Expression = exp.maybe_parse( 2470 """ 2471 CASE 2472 WHEN :source IS NULL OR :exclude IS NULL THEN NULL 2473 ELSE LIST_TRANSFORM( 2474 LIST_FILTER( 2475 LIST_ZIP(:source, GENERATE_SERIES(1, LEN(:source))), 2476 pair -> ( 2477 LEN(LIST_FILTER(:source[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0])) 2478 > LEN(LIST_FILTER(:exclude, e -> e IS NOT DISTINCT FROM pair[0])) 2479 ) 2480 ), 2481 pair -> pair[0] 2482 ) 2483 END 2484 """, 2485 ) 2486 2487 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2488 """ 2489 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2490 2491 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2492 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2493 2494 For 'END' kind, add the interval to get the end of the slice. 2495 For DATE type with 'END', cast result back to DATE to preserve type. 
            """
            date_expr = expression.this
            slice_length = expression.expression
            unit = expression.unit
            # Optional 4th Snowflake arg: 'START' (default) or 'END'
            kind = expression.text("kind").upper()

            # Create INTERVAL expression: INTERVAL 'N' UNIT
            interval_expr = exp.Interval(this=slice_length, unit=unit)

            # Create base time_bucket expression
            time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr)

            # Check if we need the end of the slice (default is start)
            if not kind == "END":
                # For 'START', return time_bucket directly
                return self.sql(time_bucket_expr)

            # For 'END', add the interval to get end of slice
            # copy() so the interval node isn't shared between two parents in the AST
            add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy())

            # If input is DATE type, cast result back to DATE to preserve type
            # DuckDB converts DATE to TIMESTAMP when adding intervals
            if date_expr.is_type(exp.DataType.Type.DATE):
                return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE))

            return self.sql(add_expr)

        def bitmapbucketnumber_sql(
            self: DuckDB.Generator, expression: exp.BitmapBucketNumber
        ) -> str:
            """
            Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.

            Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
            - Each bucket covers 32,768 values
            - Bucket numbering starts at 1
            - Formula: ((value - 1) // 32768) + 1 for positive values

            For non-positive values (0 and negative), we use value // 32768 to avoid
            producing bucket 0 or positive bucket IDs for negative inputs.
2536 """ 2537 value = expression.this 2538 2539 positive_formula = ((value - 1) // 32768) + 1 2540 non_positive_formula = value // 32768 2541 2542 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2543 case_expr = ( 2544 exp.case() 2545 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2546 .else_(non_positive_formula) 2547 ) 2548 return self.sql(case_expr) 2549 2550 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2551 """ 2552 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2553 2554 Snowflake's BITMAP_BIT_POSITION behavior: 2555 - For n <= 0: returns ABS(n) % 32768 2556 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2557 """ 2558 this = expression.this 2559 2560 return self.sql( 2561 exp.Mod( 2562 this=exp.Paren( 2563 this=exp.If( 2564 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2565 true=this - exp.Literal.number(1), 2566 false=exp.Abs(this=this), 2567 ) 2568 ), 2569 expression=MAX_BIT_POSITION, 2570 ) 2571 ) 2572 2573 def bitmapconstructagg_sql( 2574 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2575 ) -> str: 2576 """ 2577 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2578 Uses a pre-parsed template with placeholders replaced by expression nodes. 
2579 2580 Snowflake bitmap format: 2581 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2582 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2583 """ 2584 arg = expression.this 2585 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})" 2586 2587 def nthvalue_sql(self: DuckDB.Generator, expression: exp.NthValue) -> str: 2588 from_first = expression.args.get("from_first", True) 2589 if not from_first: 2590 self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end ") 2591 2592 return self.function_fallback_sql(expression) 2593 2594 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2595 """ 2596 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2597 Uses a pre-parsed template with placeholders replaced by expression nodes. 2598 2599 RANDSTR(length, generator) generates a random string of specified length. 
2600 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2601 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2602 - No generator: Use default seed value 2603 """ 2604 length = expression.this 2605 generator = expression.args.get("generator") 2606 2607 if generator: 2608 if isinstance(generator, exp.Rand): 2609 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2610 seed_value = generator.this or generator 2611 else: 2612 # Const/int or other expression - use as seed directly 2613 seed_value = generator 2614 else: 2615 # No generator specified, use default seed (arbitrary but deterministic) 2616 seed_value = exp.Literal.number(RANDSTR_SEED) 2617 2618 replacements = {"seed": seed_value, "length": length} 2619 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})" 2620 2621 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2622 """ 2623 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2624 Uses a pre-parsed template with placeholders replaced by expression nodes. 
2625 """ 2626 s = expression.this 2627 n = expression.args["elementcount"] 2628 gen = expression.args["gen"] 2629 2630 if not isinstance(gen, exp.Rand): 2631 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2632 random_expr: exp.Expression = exp.Div( 2633 this=exp.Paren( 2634 this=exp.Mod( 2635 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2636 expression=exp.Literal.number(1000000), 2637 ) 2638 ), 2639 expression=exp.Literal.number(1000000.0), 2640 ) 2641 else: 2642 # Use RANDOM() for non-deterministic output 2643 random_expr = exp.Rand() 2644 2645 replacements = {"s": s, "n": n, "random_expr": random_expr} 2646 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})" 2647 2648 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2649 """ 2650 TO_BINARY and TRY_TO_BINARY transpilation: 2651 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2652 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2653 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2654 2655 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2656 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2657 """ 2658 value = expression.this 2659 format_arg = expression.args.get("format") 2660 is_safe = expression.args.get("safe") 2661 is_binary = _is_binary(expression) 2662 2663 if not format_arg and not is_binary: 2664 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2665 return self.func(func_name, value) 2666 2667 # Snowflake defaults to HEX encoding when no format is specified 2668 fmt = format_arg.name.upper() if format_arg else "HEX" 2669 2670 if fmt in ("UTF-8", "UTF8"): 2671 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2672 result = self.func("ENCODE", value) 2673 elif fmt == "BASE64": 2674 result = self.func("FROM_BASE64", value) 2675 elif fmt == "HEX": 2676 result = self.func("UNHEX", value) 2677 else: 2678 if is_safe: 2679 return self.sql(exp.null()) 2680 
else: 2681 self.unsupported(f"format {fmt} is not supported") 2682 result = self.func("TO_BINARY", value) 2683 return f"TRY({result})" if is_safe else result 2684 2685 def _greatest_least_sql( 2686 self: DuckDB.Generator, expression: exp.Greatest | exp.Least 2687 ) -> str: 2688 """ 2689 Handle GREATEST/LEAST functions with dialect-aware NULL behavior. 2690 2691 - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL 2692 - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value 2693 """ 2694 # Get all arguments 2695 all_args = [expression.this, *expression.expressions] 2696 fallback_sql = self.function_fallback_sql(expression) 2697 2698 if expression.args.get("ignore_nulls"): 2699 # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs) 2700 return self.sql(fallback_sql) 2701 2702 # return NULL if any argument is NULL 2703 case_expr = exp.case().when( 2704 exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False), 2705 exp.null(), 2706 copy=False, 2707 ) 2708 case_expr.set("default", fallback_sql) 2709 return self.sql(case_expr) 2710 2711 def generator_sql(self, expression: exp.Generator) -> str: 2712 # Transpile Snowflake GENERATOR to DuckDB range() 2713 rowcount = expression.args.get("rowcount") 2714 time_limit = expression.args.get("time_limit") 2715 2716 if time_limit: 2717 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2718 2719 if not rowcount: 2720 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2721 return self.func("range", exp.Literal.number(0)) 2722 2723 return self.func("range", rowcount) 2724 2725 def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str: 2726 return self._greatest_least_sql(expression) 2727 2728 def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str: 2729 return self._greatest_least_sql(expression) 2730 2731 def lambda_sql( 2732 self, expression: exp.Lambda, 
arrow_sep: str = "->", wrap: bool = True 2733 ) -> str: 2734 if expression.args.get("colon"): 2735 prefix = "LAMBDA " 2736 arrow_sep = ":" 2737 wrap = False 2738 else: 2739 prefix = "" 2740 2741 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2742 return f"{prefix}{lambda_sql}" 2743 2744 def show_sql(self, expression: exp.Show) -> str: 2745 return f"SHOW {expression.name}" 2746 2747 def install_sql(self, expression: exp.Install) -> str: 2748 force = "FORCE " if expression.args.get("force") else "" 2749 this = self.sql(expression, "this") 2750 from_clause = expression.args.get("from_") 2751 from_clause = f" FROM {from_clause}" if from_clause else "" 2752 return f"{force}INSTALL {this}{from_clause}" 2753 2754 def approxtopk_sql(self, expression: exp.ApproxTopK) -> str: 2755 self.unsupported( 2756 "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. " 2757 ) 2758 return self.function_fallback_sql(expression) 2759 2760 def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str: 2761 return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ)) 2762 2763 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2764 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2765 target_type = expression.args.get("target_type") 2766 needs_tz = target_type and target_type.this in ( 2767 exp.DataType.Type.TIMESTAMPLTZ, 2768 exp.DataType.Type.TIMESTAMPTZ, 2769 ) 2770 2771 if expression.args.get("safe"): 2772 formatted_time = self.format_time(expression) 2773 cast_type = ( 2774 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2775 ) 2776 return self.sql( 2777 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2778 ) 2779 2780 base_sql = str_to_time_sql(self, expression) 2781 if needs_tz: 2782 return self.sql( 2783 exp.cast( 2784 base_sql, 2785 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2786 ) 2787 ) 2788 return base_sql 

        def strtodate_sql(self, expression: exp.StrToDate) -> str:
            """Render STR_TO_DATE as (TRY_)STRPTIME cast to DATE."""
            formatted_time = self.format_time(expression)
            # TRY_STRPTIME yields NULL on parse failure instead of raising
            function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME"
            return self.sql(
                exp.cast(
                    self.func(function_name, expression.this, formatted_time),
                    exp.DataType(this=exp.DataType.Type.DATE),
                )
            )

        def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str:
            """Render TS_OR_DS_TO_TIME: parse with (TRY_)STRPTIME when a format is
            given, otherwise (TRY_)CAST the argument to TIME."""
            this = expression.this
            time_format = self.format_time(expression)
            safe = expression.args.get("safe")
            time_type = exp.DataType.build("TIME", dialect="duckdb")
            cast_expr = exp.TryCast if safe else exp.Cast

            if time_format:
                func_name = "TRY_STRPTIME" if safe else "STRPTIME"
                strptime = exp.Anonymous(this=func_name, expressions=[this, time_format])
                return self.sql(cast_expr(this=strptime, to=time_type))

            # Already TIME-typed (or a nested TS_OR_DS_TO_TIME): no cast needed
            if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME):
                return self.sql(this)

            return self.sql(cast_expr(this=this, to=time_type))

        def currentdate_sql(self, expression: exp.CurrentDate) -> str:
            """Render CURRENT_DATE; a timezone argument becomes
            CAST(CURRENT_TIMESTAMP AT TIME ZONE tz AS DATE)."""
            if not expression.this:
                return "CURRENT_DATE"

            expr = exp.Cast(
                this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this),
                to=exp.DataType(this=exp.DataType.Type.DATE),
            )
            return self.sql(expr)

        def parsejson_sql(self, expression: exp.ParseJSON) -> str:
            """Render PARSE_JSON as JSON(arg); the safe variant returns NULL for
            invalid input by guarding with JSON_VALID."""
            arg = expression.this
            if expression.args.get("safe"):
                return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null()))
            return self.func("JSON", arg)

        @unsupported_args("decimals")
        def trunc_sql(self, expression: exp.Trunc) -> str:
            # DuckDB's TRUNC takes no scale argument, hence "decimals" is unsupported
            return self.func("TRUNC", expression.this)

        def normal_sql(self, expression: exp.Normal) -> str:
            """
            Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.

            Uses the Box-Muller transform via NORMAL_TEMPLATE.
            """
            mean = expression.this
            stddev = expression.args["stddev"]
            gen: exp.Expression = expression.args["gen"]

            # Build two uniform random values [0, 1) for Box-Muller transform
            if isinstance(gen, exp.Rand) and gen.this is None:
                u1: exp.Expression = exp.Rand()
                u2: exp.Expression = exp.Rand()
            else:
                # Seeded: derive two values using HASH with different inputs
                seed = gen.this if isinstance(gen, exp.Rand) else gen
                u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed)
                # seed + 1 produces a second, distinct deterministic stream
                u2 = exp.replace_placeholders(
                    self.SEEDED_RANDOM_TEMPLATE,
                    seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)),
                )

            replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2}
            return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))

        def uniform_sql(self, expression: exp.Uniform) -> str:
            """
            Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.

            UNIFORM returns a random value in [min, max]:
            - Integer result if both min and max are integers
            - Float result if either min or max is a float
            """
            min_val = expression.this
            max_val = expression.expression
            gen = expression.args.get("gen")

            # Determine if result should be integer (both bounds are integers).
2876 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2877 is_int_result = min_val.is_int and max_val.is_int 2878 2879 # Build the random value expression [0, 1) 2880 if not isinstance(gen, exp.Rand): 2881 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2882 random_expr: exp.Expression = exp.Div( 2883 this=exp.Paren( 2884 this=exp.Mod( 2885 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2886 expression=exp.Literal.number(1000000), 2887 ) 2888 ), 2889 expression=exp.Literal.number(1000000.0), 2890 ) 2891 else: 2892 random_expr = exp.Rand() 2893 2894 # Build: min + random * (max - min [+ 1 for int]) 2895 range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val) 2896 if is_int_result: 2897 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2898 2899 result: exp.Expression = exp.Add( 2900 this=min_val, 2901 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2902 ) 2903 2904 if is_int_result: 2905 result = exp.Cast( 2906 this=exp.Floor(this=result), 2907 to=exp.DataType.build("BIGINT"), 2908 ) 2909 2910 return self.sql(result) 2911 2912 def timefromparts_sql(self, expression: exp.TimeFromParts) -> str: 2913 nano = expression.args.get("nano") 2914 overflow = expression.args.get("overflow") 2915 2916 # Snowflake's TIME_FROM_PARTS supports overflow 2917 if overflow: 2918 hour = expression.args["hour"] 2919 minute = expression.args["min"] 2920 sec = expression.args["sec"] 2921 2922 # Check if values are within normal ranges - use MAKE_TIME for efficiency 2923 if not nano and all(arg.is_int for arg in [hour, minute, sec]): 2924 try: 2925 h_val = hour.to_py() 2926 m_val = minute.to_py() 2927 s_val = sec.to_py() 2928 if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59: 2929 return rename_func("MAKE_TIME")(self, expression) 2930 except ValueError: 2931 pass 2932 2933 # Overflow or nanoseconds detected - use INTERVAL arithmetic 2934 if nano: 2935 sec = sec + 
nano.pop() / exp.Literal.number(1000000000.0) 2936 2937 total_seconds = ( 2938 hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec 2939 ) 2940 2941 return self.sql( 2942 exp.Add( 2943 this=exp.Cast( 2944 this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME") 2945 ), 2946 expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")), 2947 ) 2948 ) 2949 2950 # Default: MAKE_TIME 2951 if nano: 2952 expression.set( 2953 "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0) 2954 ) 2955 2956 return rename_func("MAKE_TIME")(self, expression) 2957 2958 def extract_sql(self, expression: exp.Extract) -> str: 2959 """ 2960 Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported. 2961 2962 DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, 2963 EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND 2964 """ 2965 this = expression.this 2966 datetime_expr = expression.expression 2967 2968 # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB 2969 # because Snowflake applies server timezone while DuckDB uses local timezone 2970 if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ): 2971 self.unsupported( 2972 "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences" 2973 ) 2974 2975 part_name = this.name.upper() 2976 2977 if part_name in self.EXTRACT_STRFTIME_MAPPINGS: 2978 fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name] 2979 2980 # Problem: strftime doesn't accept TIME and there's no NANOSECOND function 2981 # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000 2982 is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type( 2983 exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ 2984 ) 2985 2986 if is_nano_time: 2987 self.unsupported( 2988 "Parameter NANOSECOND is not supported with TIME type in DuckDB" 
                    )
                    # Fall back to MICROSECOND * 1000, cast to the mapped result type
                    return self.sql(
                        exp.cast(
                            exp.Mul(
                                this=exp.Extract(
                                    this=exp.var("MICROSECOND"), expression=datetime_expr
                                ),
                                expression=exp.Literal.number(1000),
                            ),
                            exp.DataType.build(cast_type, dialect="duckdb"),
                        )
                    )

                # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
                strftime_input = datetime_expr
                if part_name == "NANOSECOND":
                    strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS)

                return self.sql(
                    exp.cast(
                        exp.Anonymous(
                            this="STRFTIME",
                            expressions=[strftime_input, exp.Literal.string(fmt)],
                        ),
                        exp.DataType.build(cast_type, dialect="duckdb"),
                    )
                )

            if part_name in self.EXTRACT_EPOCH_MAPPINGS:
                func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
                result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr])
                # EPOCH returns float, cast to BIGINT for integer result
                if part_name == "EPOCH_SECOND":
                    result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb"))
                return self.sql(result)

            # Natively supported specifiers fall through to the default EXTRACT
            return super().extract_sql(expression)

        def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
            """
            Transpile TIMESTAMP_FROM_PARTS to DuckDB.

            Supports both the (date_expr, time_expr) form and the component-based
            (year, month, day, ...) form.
            """
            # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
            date_expr = expression.this
            time_expr = expression.expression

            if date_expr is not None and time_expr is not None:
                # In DuckDB, DATE + TIME produces TIMESTAMP
                return self.sql(exp.Add(this=date_expr, expression=time_expr))

            # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
3037 sec = expression.args.get("sec") 3038 if sec is None: 3039 # This shouldn't happen with valid input, but handle gracefully 3040 return rename_func("MAKE_TIMESTAMP")(self, expression) 3041 3042 milli = expression.args.get("milli") 3043 if milli is not None: 3044 sec += milli.pop() / exp.Literal.number(1000.0) 3045 3046 nano = expression.args.get("nano") 3047 if nano is not None: 3048 sec += nano.pop() / exp.Literal.number(1000000000.0) 3049 3050 if milli or nano: 3051 expression.set("sec", sec) 3052 3053 return rename_func("MAKE_TIMESTAMP")(self, expression) 3054 3055 @unsupported_args("nano") 3056 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 3057 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 3058 if nano := expression.args.get("nano"): 3059 nano.pop() 3060 3061 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 3062 return f"CAST({timestamp} AS TIMESTAMPTZ)" 3063 3064 @unsupported_args("nano") 3065 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 3066 # Extract zone before popping 3067 zone = expression.args.get("zone") 3068 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 3069 if zone: 3070 zone = zone.pop() 3071 3072 if nano := expression.args.get("nano"): 3073 nano.pop() 3074 3075 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 3076 3077 if zone: 3078 # Use AT TIME ZONE to apply the explicit timezone 3079 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 3080 3081 return timestamp 3082 3083 def tablesample_sql( 3084 self, 3085 expression: exp.TableSample, 3086 tablesample_keyword: t.Optional[str] = None, 3087 ) -> str: 3088 if not isinstance(expression.parent, exp.Select): 3089 # This sample clause only applies to a single source, not the entire resulting relation 3090 tablesample_keyword = "TABLESAMPLE" 3091 3092 if expression.args.get("size"): 3093 method = expression.args.get("method") 3094 if method and 
method.name.upper() != "RESERVOIR": 3095 self.unsupported( 3096 f"Sampling method {method} is not supported with a discrete sample count, " 3097 "defaulting to reservoir sampling" 3098 ) 3099 expression.set("method", exp.var("RESERVOIR")) 3100 3101 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword) 3102 3103 def columndef_sql(self, expression: exp.ColumnDef, sep: str = " ") -> str: 3104 if isinstance(expression.parent, exp.UserDefinedFunction): 3105 return self.sql(expression, "this") 3106 return super().columndef_sql(expression, sep) 3107 3108 def join_sql(self, expression: exp.Join) -> str: 3109 if ( 3110 not expression.args.get("using") 3111 and not expression.args.get("on") 3112 and not expression.method 3113 and (expression.kind in ("", "INNER", "OUTER")) 3114 ): 3115 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 3116 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 3117 if isinstance(expression.this, exp.Unnest): 3118 return super().join_sql(expression.on(exp.true())) 3119 3120 expression.set("side", None) 3121 expression.set("kind", None) 3122 3123 return super().join_sql(expression) 3124 3125 def generateseries_sql(self, expression: exp.GenerateSeries) -> str: 3126 # GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b) 3127 if expression.args.get("is_end_exclusive"): 3128 return rename_func("RANGE")(self, expression) 3129 3130 return self.function_fallback_sql(expression) 3131 3132 def countif_sql(self, expression: exp.CountIf) -> str: 3133 if self.dialect.version >= (1, 2): 3134 return self.function_fallback_sql(expression) 3135 3136 # https://github.com/tobymao/sqlglot/pull/4749 3137 return count_if_to_sum(self, expression) 3138 3139 def bracket_sql(self, expression: exp.Bracket) -> str: 3140 if self.dialect.version >= (1, 2): 3141 return super().bracket_sql(expression) 3142 3143 # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes 
            this = expression.this
            # Pre-1.2 DuckDB: wrap array literals in parens before indexing them
            if isinstance(this, exp.Array):
                this.replace(exp.paren(this))

            bracket = super().bracket_sql(expression)

            if not expression.args.get("returns_list_for_maps"):
                if not this.type:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    this = annotate_types(this, dialect=self.dialect)

                # Map access returned a single-element list pre-1.2: take element 1
                if this.is_type(exp.DataType.Type.MAP):
                    bracket = f"({bracket})[1]"

            return bracket

        def withingroup_sql(self, expression: exp.WithinGroup) -> str:
            """Render WITHIN GROUP, rewriting it for functions DuckDB expects
            in a different shape (ARRAY_AGG, ordered-set aggregates)."""
            func = expression.this

            # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP
            # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
            if isinstance(func, exp.ArrayAgg):
                if not isinstance(order := expression.expression, exp.Order):
                    return self.sql(func)

                # Save the original column for FILTER clause (before wrapping with Order)
                original_this = func.this

                # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order
                # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
                func.set(
                    "this",
                    exp.Order(
                        this=func.this.copy(),
                        expressions=order.expressions,
                    ),
                )

                # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed
                # Use original_this (not the Order-wrapped version) for the FILTER condition
                array_agg_sql = self.function_fallback_sql(func)
                return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)

            # For other functions (like PERCENTILES), use existing logic
            expression_sql = self.sql(expression, "expression")

            if isinstance(func, exp.PERCENTILES):
                # Make the order key the first arg and slide the fraction to the right
                # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
                order_col = expression.find(exp.Ordered)
                if order_col:
                    func.set("expression", func.this)
                    func.set("this", order_col.this)

            # Splice the WITHIN GROUP clause inside the function's closing paren
            this = self.sql(expression, "this").rstrip(")")

            return f"{this}{expression_sql})"

        def length_sql(self, expression: exp.Length) -> str:
            """Render LENGTH; possibly-binary args are resolved via type
            annotation or a runtime TYPEOF dispatch to a byte length."""
            arg = expression.this

            # Dialects like BQ and Snowflake also accept binary values as args, so
            # DDB will attempt to infer the type or resort to case/when resolution
            if not expression.args.get("binary") or arg.is_string:
                return self.func("LENGTH", arg)

            if not arg.type:
                from sqlglot.optimizer.annotate_types import annotate_types

                arg = annotate_types(arg, dialect=self.dialect)

            if arg.is_type(*exp.DataType.TEXT_TYPES):
                return self.func("LENGTH", arg)

            # We need these casts to make duckdb's static type checker happy
            blob = exp.cast(arg, exp.DataType.Type.VARBINARY)
            varchar = exp.cast(arg, exp.DataType.Type.VARCHAR)

            # Runtime dispatch: BLOB -> byte length, anything else -> LENGTH of VARCHAR
            case = (
                exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg]))
                .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob))
                .else_(exp.Anonymous(this="LENGTH", expressions=[varchar]))
            )
            return self.sql(case)

        def _validate_regexp_flags(
            self, flags: t.Optional[exp.Expression], supported_flags: str
        ) -> t.Optional[str]:
            """
            Validate and filter regexp flags for DuckDB compatibility.

            Args:
                flags: The flags expression to validate
                supported_flags: String of supported flags (e.g., "ims", "cims").
                    Only these flags will be returned.
3240 3241 Returns: 3242 Validated/filtered flag string, or None if no valid flags remain 3243 """ 3244 if not isinstance(flags, exp.Expression): 3245 return None 3246 3247 if not flags.is_string: 3248 self.unsupported("Non-literal regexp flags are not fully supported in DuckDB") 3249 return None 3250 3251 flag_str = flags.this 3252 unsupported = set(flag_str) - set(supported_flags) 3253 3254 if unsupported: 3255 self.unsupported( 3256 f"Regexp flags {sorted(unsupported)} are not supported in this context" 3257 ) 3258 3259 flag_str = "".join(f for f in flag_str if f in supported_flags) 3260 return flag_str if flag_str else None 3261 3262 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3263 this = expression.this 3264 pattern = expression.expression 3265 position = expression.args.get("position") 3266 parameters = expression.args.get("parameters") 3267 3268 # Validate flags - only "ims" flags are supported for embedded patterns 3269 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3270 3271 if position: 3272 this = exp.Substring(this=this, start=position) 3273 3274 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3275 if validated_flags: 3276 pattern = exp.Concat( 3277 expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern] 3278 ) 3279 3280 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3281 result = ( 3282 exp.case() 3283 .when( 3284 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3285 exp.Literal.number(0), 3286 ) 3287 .else_( 3288 exp.Length( 3289 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3290 ) 3291 ) 3292 ) 3293 3294 return self.sql(result) 3295 3296 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3297 subject = expression.this 3298 pattern = expression.expression 3299 replacement = expression.args.get("replacement") or exp.Literal.string("") 3300 position = 
expression.args.get("position") 3301 occurrence = expression.args.get("occurrence") 3302 modifiers = expression.args.get("modifiers") 3303 3304 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3305 3306 # Handle occurrence (only literals supported) 3307 if occurrence and not occurrence.is_int: 3308 self.unsupported("REGEXP_REPLACE with non-literal occurrence") 3309 else: 3310 occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0 3311 if occurrence > 1: 3312 self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported") 3313 # flag duckdb to do either all or none, single_replace check is for duckdb round trip 3314 elif ( 3315 occurrence == 0 3316 and "g" not in validated_flags 3317 and not expression.args.get("single_replace") 3318 ): 3319 validated_flags += "g" 3320 3321 # Handle position (only literals supported) 3322 prefix = None 3323 if position and not position.is_int: 3324 self.unsupported("REGEXP_REPLACE with non-literal position") 3325 elif position and position.is_int and position.to_py() > 1: 3326 pos = position.to_py() 3327 prefix = exp.Substring( 3328 this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1) 3329 ) 3330 subject = exp.Substring(this=subject, start=exp.Literal.number(pos)) 3331 3332 result: exp.Expression = exp.Anonymous( 3333 this="REGEXP_REPLACE", 3334 expressions=[ 3335 subject, 3336 pattern, 3337 replacement, 3338 exp.Literal.string(validated_flags) if validated_flags else None, 3339 ], 3340 ) 3341 3342 if prefix: 3343 result = exp.Concat(expressions=[prefix, result]) 3344 3345 return self.sql(result) 3346 3347 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3348 this = expression.this 3349 pattern = expression.expression 3350 flag = expression.args.get("flag") 3351 3352 if not expression.args.get("full_match"): 3353 return self.func("REGEXP_MATCHES", this, pattern, flag) 3354 3355 # DuckDB REGEXP_MATCHES supports: c, i, m, s (but 
not 'e') 3356 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3357 3358 anchored_pattern = exp.Concat( 3359 expressions=[ 3360 exp.Literal.string("^("), 3361 exp.Paren(this=pattern), 3362 exp.Literal.string(")$"), 3363 ] 3364 ) 3365 3366 if validated_flags: 3367 flag = exp.Literal.string(validated_flags) 3368 3369 return self.func("REGEXP_MATCHES", this, anchored_pattern, flag) 3370 3371 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3372 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3373 this = expression.this 3374 expr = expression.expression 3375 max_dist = expression.args.get("max_dist") 3376 3377 if max_dist is None: 3378 return self.func("LEVENSHTEIN", this, expr) 3379 3380 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3381 levenshtein = exp.Levenshtein(this=this, expression=expr) 3382 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3383 3384 def pad_sql(self, expression: exp.Pad) -> str: 3385 """ 3386 Handle RPAD/LPAD for VARCHAR and BINARY types. 
3387 3388 For VARCHAR: Delegate to parent class 3389 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3390 """ 3391 string_arg = expression.this 3392 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3393 3394 if _is_binary(string_arg) or _is_binary(fill_arg): 3395 length_arg = expression.expression 3396 is_left = expression.args.get("is_left") 3397 3398 input_len = exp.ByteLength(this=string_arg) 3399 chars_needed = length_arg - input_len 3400 pad_count = exp.Greatest( 3401 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3402 ) 3403 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3404 3405 left, right = string_arg, repeat_expr 3406 if is_left: 3407 left, right = right, left 3408 3409 result = exp.DPipe(this=left, expression=right) 3410 return self.sql(result) 3411 3412 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3413 return super().pad_sql(expression) 3414 3415 def minhash_sql(self, expression: exp.Minhash) -> str: 3416 k = expression.this 3417 exprs = expression.expressions 3418 3419 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3420 self.unsupported( 3421 "MINHASH with multiple expressions or * requires manual query restructuring" 3422 ) 3423 return self.func("MINHASH", k, *exprs) 3424 3425 expr = exprs[0] 3426 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3427 return f"({self.sql(result)})" 3428 3429 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3430 expr = expression.this 3431 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3432 return f"({self.sql(result)})" 3433 3434 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3435 expr = expression.this 3436 result = exp.replace_placeholders( 3437 self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr 3438 ) 3439 return f"({self.sql(result)})" 3440 3441 
    def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str:
        # DuckDB's LIST_DISTINCT drops NULL elements. When `check_null` is set we detect
        # whether the input contained a NULL (total size vs. LIST_COUNT, which ignores
        # NULLs) and re-append a single NULL to the deduplicated array if so.
        arr = expression.this
        func = self.func("LIST_DISTINCT", arr)

        if expression.args.get("check_null"):
            add_null_to_array = exp.func(
                "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null()
            )
            return self.sql(
                exp.If(
                    this=exp.NEQ(
                        this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr)
                    ),
                    true=add_null_to_array,
                    false=func,
                )
            )

        return func

    def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str:
        # Expand the dialect's ARRAY_EXCEPT template with the source/exclude arrays.
        source = expression.this
        exclude = expression.expression

        replacements = {"source": source, "exclude": exclude}
        return self.sql(exp.replace_placeholders(self.ARRAY_EXCEPT_TEMPLATE, **replacements))

    def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
        # Emulates Snowflake's ARRAYS_ZIP via the ARRAYS_ZIP_TEMPLATE: zips N arrays
        # into an array of structs keyed '$1'..'$N', padding shorter arrays with NULLs.
        args = expression.expressions

        if not args:
            # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
            return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))

        # Build placeholder values for template
        lengths = [exp.Length(this=arg) for arg in args]
        max_len = (
            lengths[0]
            if len(lengths) == 1
            else exp.Greatest(this=lengths[0], expressions=lengths[1:])
        )

        # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...}
        empty_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
                for i in range(len(args))
            ],
        )

        # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
        # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax
        index = exp.column("__i") + 1
        transform_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(
                    this=exp.Literal.string(f"${i + 1}"),
                    expression=exp.func("COALESCE", arg, exp.array())[index],
                )
                for i, arg in enumerate(args)
            ],
        )

        result = exp.replace_placeholders(
            self.ARRAYS_ZIP_TEMPLATE.copy(),
            null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
            all_empty_check=exp.and_(
                *[
                    exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
                    for arg in args
                ]
            ),
            empty_struct=empty_struct,
            max_len=max_len,
            transform_struct=transform_struct,
        )
        return self.sql(result)

    def lower_sql(self, expression: exp.Lower) -> str:
        # Cast BLOB-ish input to VARCHAR for the string function, then cast the
        # result back to BLOB when the source expression was binary.
        result_sql = self.func("LOWER", _cast_to_varchar(expression.this))
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def upper_sql(self, expression: exp.Upper) -> str:
        # Same VARCHAR round-trip as lower_sql.
        result_sql = self.func("UPPER", _cast_to_varchar(expression.this))
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def reverse_sql(self, expression: exp.Reverse) -> str:
        # Same VARCHAR round-trip as lower_sql.
        result_sql = self.func("REVERSE", _cast_to_varchar(expression.this))
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def base64encode_sql(self, expression: exp.Base64Encode) -> str:
        # DuckDB TO_BASE64 requires BLOB input
        # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
        # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
        result = expression.this

        # Check if input is a string type - ENCODE only accepts VARCHAR
        if result.is_type(*exp.DataType.TEXT_TYPES):
            result = exp.Encode(this=result)

        result = exp.ToBase64(this=result)

        max_line_length = expression.args.get("max_line_length")
        alphabet = expression.args.get("alphabet")

        # Handle custom alphabet by replacing standard chars with custom ones
        result = _apply_base64_alphabet_replacements(result, alphabet)

        # Handle max_line_length by inserting newlines every N characters
        line_length = (
            t.cast(int, max_line_length.to_py())
            if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
            else 0
        )
        if line_length > 0:
            newline = exp.Chr(expressions=[exp.Literal.number(10)])
            result = exp.Trim(
                this=exp.RegexpReplace(
                    this=result,
                    expression=exp.Literal.string(f"(.{{{line_length}}})"),
                    replacement=exp.Concat(
                        expressions=[exp.Literal.string("\\1"), newline.copy()]
                    ),
                ),
                expression=newline,
                position="TRAILING",
            )

        return self.sql(result)

    def replace_sql(self, expression: exp.Replace) -> str:
        # All three arguments are coerced to VARCHAR; result is cast back to BLOB
        # when the original expression was binary.
        result_sql = self.func(
            "REPLACE",
            _cast_to_varchar(expression.this),
            _cast_to_varchar(expression.expression),
            _cast_to_varchar(expression.args.get("replacement")),
        )
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def _bitwise_op(self, expression: exp.Binary, op: str) -> str:
        # Shared path for binary bitwise operators: normalize operand types, render
        # the infix operator, then restore a BLOB result type when applicable.
        _prepare_binary_bitwise_args(expression)
        result_sql = self.binary(expression, op)
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str:
        # DuckDB spells bitwise XOR as the XOR() function rather than an operator.
        _prepare_binary_bitwise_args(expression)
        result_sql = self.func("XOR", expression.this, expression.expression)
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
        this = expression.this
        key = expression.args.get("key")
        key_sql = key.name if isinstance(key, exp.Expression) else ""
        value_sql = self.sql(expression, "value")

        kv_sql = f"{key_sql} := {value_sql}"

        # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake
        # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB
        if isinstance(this, exp.Struct) and not this.expressions:
            return self.func("STRUCT_PACK", kv_sql)

        return self.func("STRUCT_INSERT", this, kv_sql)

    def mapcat_sql(self, expression: exp.MapCat) -> str:
        # Expand the dialect's MAPCAT template with both input maps.
        result = exp.replace_placeholders(
            self.MAPCAT_TEMPLATE.copy(),
            map1=expression.this,
            map2=expression.expression,
        )
        return self.sql(result)

    def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str:
        # NOTE(review): MAP_KEYS receives args["key"] and `this` is the probe value —
        # confirm this matches exp.MapContainsKey's argument layout (this=key, key=map).
        return self.func(
            "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this
        )

    def startswith_sql(self, expression: exp.StartsWith) -> str:
        # Coerce both operands to VARCHAR so BLOB inputs are accepted.
        return self.func(
            "STARTS_WITH",
            _cast_to_varchar(expression.this),
            _cast_to_varchar(expression.expression),
        )

    def space_sql(self, expression: exp.Space) -> str:
        # DuckDB's REPEAT requires BIGINT for the count parameter
        return self.sql(
            exp.Repeat(
                this=exp.Literal.string(" "),
                times=exp.cast(expression.this, exp.DataType.Type.BIGINT),
            )
        )

    def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
        # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE)
        if isinstance(expression.this, exp.Generator):
            # Preserve alias, joins, and other table-level args
            table = exp.Table(
                this=expression.this,
                alias=expression.args.get("alias"),
                joins=expression.args.get("joins"),
            )
            return self.sql(table)

        return super().tablefromrows_sql(expression)

    def unnest_sql(self, expression: exp.Unnest) -> str:
        explode_array = expression.args.get("explode_array")
        if explode_array:
            # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct
            # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))"
            expression.expressions.append(
                exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2))
            )

            # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB
            alias = expression.args.get("alias")
            if isinstance(alias, exp.TableAlias):
                expression.set("alias", None)
                if alias.columns:
                    alias = exp.TableAlias(this=seq_get(alias.columns, 0))

            unnest_sql = super().unnest_sql(expression)
            select = exp.Select(expressions=[unnest_sql]).subquery(alias)
            return self.sql(select)

        return super().unnest_sql(expression)

    def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
        this = expression.this

        if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
            # DuckDB should render IGNORE NULLS only for the general-purpose
            # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...)
            return super().ignorenulls_sql(expression)

        # FIRST(... IGNORE NULLS) is approximated with ANY_VALUE, which skips NULLs.
        if isinstance(this, exp.First):
            this = exp.AnyValue(this=this.this)

        if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
            self.unsupported("IGNORE NULLS is not supported for non-window functions.")

        return self.sql(this)

    def respectnulls_sql(self, expression: exp.RespectNulls) -> str:
        if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
            # DuckDB should render RESPECT NULLS only for the general-purpose
            # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...)
            return super().respectnulls_sql(expression)

        self.unsupported("RESPECT NULLS is not supported for non-window functions.")
        return self.sql(expression, "this")

    def arraytostring_sql(self, expression: exp.ArrayToString) -> str:
        # A NULL-replacement argument is emulated by COALESCE-ing each element first,
        # since DuckDB's ARRAY_TO_STRING has no third argument for it.
        this = self.sql(expression, "this")
        null_text = self.sql(expression, "null")

        if null_text:
            this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))"

        return self.func("ARRAY_TO_STRING", this, expression.expression)

    def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str:
        """Shared lowering for REGEXP_EXTRACT / REGEXP_EXTRACT_ALL, emulating
        position/occurrence/flag semantics of source dialects on top of DuckDB."""
        this = expression.this
        group = expression.args.get("group")
        params = expression.args.get("parameters")
        position = expression.args.get("position")
        occurrence = expression.args.get("occurrence")
        null_if_pos_overflow = expression.args.get("null_if_pos_overflow")

        # Handle Snowflake's 'e' flag: it enables capture group extraction
        # In DuckDB, this is controlled by the group parameter directly
        if params and params.is_string and "e" in params.name:
            params = exp.Literal.string(params.name.replace("e", ""))

        validated_flags = self._validate_regexp_flags(params, supported_flags="cims")

        # Strip default group when no following params (DuckDB default is same as group=0)
        if (
            not validated_flags
            and group
            and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP)
        ):
            group = None
        flags_expr = exp.Literal.string(validated_flags) if validated_flags else None

        # use substring to handle position argument
        if position and (not position.is_int or position.to_py() > 1):
            this = exp.Substring(this=this, start=position)

        if null_if_pos_overflow:
            this = exp.Nullif(this=this, expression=exp.Literal.string(""))

        is_extract_all = isinstance(expression, exp.RegexpExtractAll)
        non_single_occurrence = occurrence and (not occurrence.is_int or occurrence.to_py() > 1)

        if is_extract_all or non_single_occurrence:
            name = "REGEXP_EXTRACT_ALL"
        else:
            name = "REGEXP_EXTRACT"

        result: exp.Expression = exp.Anonymous(
            this=name, expressions=[this, expression.expression, group, flags_expr]
        )

        # Array slicing for REGEXP_EXTRACT_ALL with occurrence
        if is_extract_all and non_single_occurrence:
            result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)])
        # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1
        elif non_single_occurrence:
            result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence])

        return self.sql(result)

    def regexpextract_sql(self, expression: exp.RegexpExtract) -> str:
        return self._regexp_extract_sql(expression)

    def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str:
        return self._regexp_extract_sql(expression)

    def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str:
        """Emulate REGEXP_INSTR (e.g. Snowflake/Oracle) on DuckDB, which has no direct
        equivalent: the match position is reconstructed from the lengths of the
        regex-split pieces and the preceding matches."""
        this = expression.this
        pattern = expression.expression
        position = expression.args.get("position")
        orig_occ = expression.args.get("occurrence")
        occurrence = orig_occ or exp.Literal.number(1)
        option = expression.args.get("option")
        parameters = expression.args.get("parameters")

        # Inline supported flags into the pattern as a (?ims) prefix.
        validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")
        if validated_flags:
            pattern = exp.Concat(
                expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]
            )

        # Handle starting position offset
        pos_offset: exp.Expression = exp.Literal.number(0)
        if position and (not position.is_int or position.to_py() > 1):
            this = exp.Substring(this=this, start=position)
            pos_offset = position - exp.Literal.number(1)

        # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x)))
        def sum_lengths(func_name: str, end: exp.Expression) -> exp.Expression:
            lst = exp.Bracket(
                this=exp.Anonymous(this=func_name, expressions=[this, pattern]),
                expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)],
                offset=1,
            )
            transform = exp.Anonymous(
                this="LIST_TRANSFORM",
                expressions=[
                    lst,
                    exp.Lambda(
                        this=exp.Length(this=exp.to_identifier("x")),
                        expressions=[exp.to_identifier("x")],
                    ),
                ],
            )
            return exp.Coalesce(
                this=exp.Anonymous(this="LIST_SUM", expressions=[transform]),
                expressions=[exp.Literal.number(0)],
            )

        # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset
        base_pos: exp.Expression = (
            exp.Literal.number(1)
            + sum_lengths("STRING_SPLIT_REGEX", occurrence)
            + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1))
            + pos_offset
        )

        # option=1: add match length for end position
        if option and option.is_int and option.to_py() == 1:
            match_at_occ = exp.Bracket(
                this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]),
                expressions=[occurrence],
                offset=1,
            )
            base_pos = base_pos + exp.Coalesce(
                this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)]
            )

        # NULL checks for all provided arguments
        # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST
        null_args = [
            expression.this,
            expression.expression,
            position,
            orig_occ,
            option,
            parameters,
        ]
        null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg]

        matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])

        return self.sql(
            exp.case()
            .when(exp.or_(*null_checks), exp.Null())
            .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0))
            .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0))
            .else_(base_pos)
        )

    @unsupported_args("culture")
    def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
        # Only integer precision formats map onto DuckDB's fmt-style FORMAT spec.
        fmt = expression.args.get("format")
        if fmt and fmt.is_int:
            return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this)

        self.unsupported("Only integer formats are supported by NumberToStr")
        return self.function_fallback_sql(expression)

    def autoincrementcolumnconstraint_sql(self, _) -> str:
        self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB")
        return ""

    def aliases_sql(self, expression: exp.Aliases) -> str:
        # POSEXPLODE handles its own aliases (pos/col), so bypass the default rendering.
        this = expression.this
        if isinstance(this, exp.Posexplode):
            return self.posexplode_sql(this)

        return super().aliases_sql(expression)

    def posexplode_sql(self, expression: exp.Posexplode) -> str:
        this = expression.this
        parent = expression.parent

        # The default Spark aliases are "pos" and "col", unless specified otherwise
        pos, col = exp.to_identifier("pos"), exp.to_identifier("col")

        if isinstance(parent, exp.Aliases):
            # Column case: SELECT POSEXPLODE(col) [AS (a, b)]
            pos, col = parent.expressions
        elif isinstance(parent, exp.Table):
            # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)]
            alias = parent.args.get("alias")
            if alias:
                pos, col = alias.columns or [pos, col]
                alias.pop()

        # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS
        # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS
        unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col))
        gen_subscripts = self.sql(
            exp.Alias(
                this=exp.Anonymous(
                    this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)]
                )
                - exp.Literal.number(1),
                alias=pos,
            )
        )

        posexplode_sql = self.format_args(gen_subscripts, unnest_sql)

        if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)):
            # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...))
            return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql])))

        return posexplode_sql

    def addmonths_sql(self, expression: exp.AddMonths) -> str:
        """
        Handles three key issues:
        1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
        2. End-of-month preservation: If input is last day of month, result is last day of result month
        3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
        """
        from sqlglot.optimizer.annotate_types import annotate_types

        this = expression.this
        if not this.type:
            this = annotate_types(this, dialect=self.dialect)

        if this.is_type(*exp.DataType.TEXT_TYPES):
            this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP))

        # Detect float/decimal months to apply rounding (Snowflake behavior)
        # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
        months_expr = expression.expression
        if not months_expr.type:
            months_expr = annotate_types(months_expr, dialect=self.dialect)

        # Build interval or to_months expression based on type
        # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT))
        interval_or_to_months = (
            exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
            if months_expr.is_type(
                exp.DataType.Type.FLOAT,
                exp.DataType.Type.DOUBLE,
                exp.DataType.Type.DECIMAL,
            )
            # Integer case: standard INTERVAL N MONTH syntax
            else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
        )

        date_add_expr = exp.Add(this=this, expression=interval_or_to_months)

        # Apply end-of-month preservation if Snowflake flag is set
        # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
        preserve_eom = expression.args.get("preserve_end_of_month")
        result_expr = (
            exp.case()
            .when(
                exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
                exp.func("LAST_DAY", date_add_expr),
            )
            .else_(date_add_expr)
            if preserve_eom
            else date_add_expr
        )

        # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE
        # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type)
        # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ
        # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
        if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ):
            return self.sql(exp.Cast(this=result_expr, to=this.type))
        return self.sql(result_expr)

    def format_sql(self, expression: exp.Format) -> str:
        # Only the trivial single-argument '%s' format maps onto DuckDB's '{}'.
        if expression.name.lower() == "%s" and len(expression.expressions) == 1:
            return self.func("FORMAT", "'{}'", expression.expressions[0])

        return self.function_fallback_sql(expression)

    def hexstring_sql(
        self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None
    ) -> str:
        # UNHEX('FF') correctly produces blob \xFF in DuckDB
        return super().hexstring_sql(expression, binary_function_repr="UNHEX")

    def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
        unit = unit_to_str(expression)
        date = expression.this
        result = self.func("DATE_TRUNC", unit, date)

        # Cast back to the input type when the source dialect preserves it, except
        # DATE truncated by a date unit, which already yields a DATE in DuckDB.
        if (
            expression.args.get("input_type_preserved")
            and date.is_type(*exp.DataType.TEMPORAL_TYPES)
            and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE))
        ):
            return self.sql(exp.Cast(this=result, to=date.type))

        return result

    def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
        unit = unit_to_str(expression)
        zone = expression.args.get("zone")
        timestamp = expression.this
        date_unit = is_date_unit(unit)

        if date_unit and zone:
            # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
            # Double AT TIME ZONE needed for BigQuery compatibility:
            # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
            # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component)
            timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
            result_sql = self.func("DATE_TRUNC", unit, timestamp)
            return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))

        result = self.func("DATE_TRUNC", unit, timestamp)
        if expression.args.get("input_type_preserved"):
            if timestamp.type and timestamp.is_type(
                exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
            ):
                # DATE_TRUNC doesn't accept a bare TIME: anchor it to a dummy date,
                # truncate, then cast back to the original TIME type.
                dummy_date = exp.Cast(
                    this=exp.Literal.string("1970-01-01"),
                    to=exp.DataType(this=exp.DataType.Type.DATE),
                )
                date_time = exp.Add(this=dummy_date, expression=timestamp)
                result = self.func("DATE_TRUNC", unit, date_time)
                return self.sql(exp.Cast(this=result, to=timestamp.type))

            if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
                date_unit and timestamp.is_type(exp.DataType.Type.DATE)
            ):
                return self.sql(exp.Cast(this=result, to=timestamp.type))

        return result

    def trim_sql(self, expression: exp.Trim) -> str:
        # In-place replace: coerce the trimmed value (and the optional trim characters)
        # to VARCHAR before rendering, then restore a BLOB result when applicable.
        expression.this.replace(_cast_to_varchar(expression.this))
        if expression.expression:
            expression.expression.replace(_cast_to_varchar(expression.expression))

        result_sql = super().trim_sql(expression)
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def round_sql(self, expression: exp.Round) -> str:
        this = expression.this
        decimals = expression.args.get("decimals")
        truncate = expression.args.get("truncate")

        # DuckDB requires the scale (decimals) argument to be an INT
        # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
        if decimals is not None and expression.args.get("casts_non_integer_decimals"):
            if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
                decimals = exp.cast(decimals, exp.DataType.Type.INT)

        func = "ROUND"
        if truncate:
            # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
            if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
                func = "ROUND_EVEN"
                truncate = None
            # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
            elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
                truncate = None

        return self.func(func, this, decimals, truncate)

    def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
        result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile"))

        # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type
        if expression.is_type(*exp.DataType.REAL_TYPES):
            result = f"CAST({result} AS DOUBLE)"

        return result

    def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
        """
        BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
        dividing the input distribution into n equal-sized buckets.

        Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
        does not document the specific algorithm used so results may differ. DuckDB does not
        support RESPECT NULLS.
        """
        this = expression.this
        if isinstance(this, exp.Distinct):
            # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
            if len(this.expressions) < 2:
                self.unsupported("APPROX_QUANTILES requires a bucket count argument")
                return self.function_fallback_sql(expression)
            num_quantiles_expr = this.expressions[1].pop()
        else:
            num_quantiles_expr = expression.expression

        if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
            self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
            return self.function_fallback_sql(expression)

        num_quantiles = t.cast(int, num_quantiles_expr.to_py())
        if num_quantiles <= 0:
            self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
            return self.function_fallback_sql(expression)

        # Decimal keeps the i/n boundaries exact in the generated literal list.
        quantiles = [
            exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
            for i in range(num_quantiles + 1)
        ]

        return self.sql(
            exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
        )

    def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str:
        # scalar_only: wrap in JSON_VALUE first so non-scalar results become NULL.
        if expression.args.get("scalar_only"):
            expression = exp.JSONExtractScalar(
                this=rename_func("JSON_VALUE")(self, expression), expression="'$'"
            )
        return _arrow_json_extract_sql(self, expression)

    def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
        this = expression.this

        if _is_binary(this):
            expression.type = exp.DataType.build("BINARY")

        arg = _cast_to_bit(this)

        # Parenthesize negated operands so `~-x` doesn't render ambiguously.
        if isinstance(this, exp.Neg):
            arg = exp.Paren(this=arg)

        expression.set("this", arg)

        result_sql = f"~{self.sql(expression, 'this')}"

        return _gen_with_cast_to_blob(self, expression, result_sql)

    def window_sql(self, expression: exp.Window) -> str:
        # CORR windows (optionally behind a FILTER) need the zero-variance NULL handling.
        this = expression.this
        if isinstance(this, exp.Corr) or (
            isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr)
        ):
            return self._corr_sql(expression)

        return super().window_sql(expression)

    def filter_sql(self, expression: exp.Filter) -> str:
        if isinstance(expression.this, exp.Corr):
            return self._corr_sql(expression)

        return super().filter_sql(expression)

    def _corr_sql(
        self,
        expression: t.Union[exp.Filter, exp.Window, exp.Corr],
    ) -> str:
        # DuckDB's CORR returns NaN where some dialects return NULL (zero variance);
        # wrap the expression in CASE WHEN ISNAN(...) THEN NULL when requested.
        if isinstance(expression, exp.Corr) and not expression.args.get(
            "null_on_zero_variance"
        ):
            return self.func("CORR", expression.this, expression.expression)

        corr_expr = _maybe_corr_null_to_false(expression)
        if corr_expr is None:
            if isinstance(expression, exp.Window):
                return super().window_sql(expression)
            if isinstance(expression, exp.Filter):
                return super().filter_sql(expression)
            corr_expr = expression  # make mypy happy

        return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
Default NULL ordering method to use if not explicitly set.
Possible values: "nulls_are_small", "nulls_are_large", "nulls_are_last"
A NULL arg in CONCAT yields NULL by default, but in some dialects it yields an empty string.
Whether ORDER BY ALL is supported (expands to all the selected columns) as in DuckDB, Spark3/Databricks
Whether expressions such as x::INT[5] should be parsed as fixed-size array defs/casts e.g. in DuckDB. In dialects which don't support fixed size arrays such as Snowflake, this should be interpreted as a subscript/index operator.
Whether failing to parse a JSON path expression using the JSONPath dialect will log a warning.
Whether number literals can include underscores for better readability
Specifies the strategy according to which identifiers should be normalized.
1499 def to_json_path(self, path: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 1500 if isinstance(path, exp.Literal): 1501 # DuckDB also supports the JSON pointer syntax, where every path starts with a `/`. 1502 # Additionally, it allows accessing the back of lists using the `[#-i]` syntax. 1503 # This check ensures we'll avoid trying to parse these as JSON paths, which can 1504 # either result in a noisy warning or in an invalid representation of the path. 1505 path_text = path.name 1506 if path_text.startswith("/") or "[#" in path_text: 1507 return path 1508 1509 return super().to_json_path(path)
Mapping of an escaped sequence (e.g. the two-character sequence "\n") to its unescaped version (the corresponding control character, e.g. a newline).
    class Tokenizer(tokens.Tokenizer):
        """DuckDB tokenizer: escape strings, dollar heredocs, and keyword/type aliases."""

        # e'...' / E'...' strings support backslash escape sequences.
        BYTE_STRINGS = [("e'", "'"), ("E'", "'")]
        BYTE_STRING_ESCAPES = ["'", "\\"]
        # Dollar-quoted strings, e.g. $tag$ ... $tag$.
        HEREDOC_STRINGS = ["$"]

        HEREDOC_TAG_IS_IDENTIFIER = True
        # A `$...` that doesn't open a heredoc falls back to a parameter token.
        HEREDOC_STRING_ALTERNATIVE = TokenType.PARAMETER

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "//": TokenType.DIV,
            "**": TokenType.DSTAR,
            "^@": TokenType.CARET_AT,
            "@>": TokenType.AT_GT,
            "<@": TokenType.LT_AT,
            "ATTACH": TokenType.ATTACH,
            "BINARY": TokenType.VARBINARY,
            "BITSTRING": TokenType.BIT,
            "BPCHAR": TokenType.TEXT,
            "CHAR": TokenType.TEXT,
            "DATETIME": TokenType.TIMESTAMPNTZ,
            "DETACH": TokenType.DETACH,
            "FORCE": TokenType.FORCE,
            "INSTALL": TokenType.INSTALL,
            "INT8": TokenType.BIGINT,
            "LOGICAL": TokenType.BOOLEAN,
            "MACRO": TokenType.FUNCTION,
            "ONLY": TokenType.ONLY,
            "PIVOT_WIDER": TokenType.PIVOT,
            "POSITIONAL": TokenType.POSITIONAL,
            "RESET": TokenType.COMMAND,
            "ROW": TokenType.STRUCT,
            "SIGNED": TokenType.INT,
            "STRING": TokenType.TEXT,
            "SUMMARIZE": TokenType.SUMMARIZE,
            "TIMESTAMP": TokenType.TIMESTAMPNTZ,
            "TIMESTAMP_S": TokenType.TIMESTAMP_S,
            "TIMESTAMP_MS": TokenType.TIMESTAMP_MS,
            "TIMESTAMP_NS": TokenType.TIMESTAMP_NS,
            "TIMESTAMP_US": TokenType.TIMESTAMP,
            "UBIGINT": TokenType.UBIGINT,
            "UINTEGER": TokenType.UINT,
            "USMALLINT": TokenType.USMALLINT,
            "UTINYINT": TokenType.UTINYINT,
            "VARCHAR": TokenType.TEXT,
        }
        # DuckDB has no `/*+ ... */` optimizer-hint comments.
        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.PARAMETER,
        }

        # SHOW is parsed as a real statement rather than an opaque command.
        COMMANDS = tokens.Tokenizer.COMMANDS - {TokenType.SHOW}
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- STRING_ESCAPES
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- IDENTIFIER_ESCAPES
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- tokenize
- sql
- size
- tokens
class Parser(parser.Parser):
    """DuckDB-specific parser: overrides token/operator tables and adds parsing
    for DuckDB syntax such as LAMBDA expressions, prefix aliases (``alias: expr``),
    MAP literals, ATTACH/DETACH, INSTALL, SHOW, and ``#N`` positional columns."""

    # DuckDB MAP keys may be arbitrary expressions, not only literals.
    MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS = True

    # In DuckDB, `^` is exponentiation (see EXPONENT below), so it is removed
    # from the bitwise operator table inherited from the base parser.
    BITWISE = parser.Parser.BITWISE.copy()
    BITWISE.pop(TokenType.CARET)

    RANGE_PARSERS = {
        **parser.Parser.RANGE_PARSERS,
        # `&&` -> array overlap, `^@` -> starts-with, `~` -> full regex match
        TokenType.DAMP: binary_range_parser(exp.ArrayOverlaps),
        TokenType.CARET_AT: binary_range_parser(exp.StartsWith),
        TokenType.TILDE: binary_range_parser(exp.RegexpFullMatch),
    }

    # Both `^` and `**` parse as power operators.
    EXPONENT = {
        **parser.Parser.EXPONENT,
        TokenType.CARET: exp.Pow,
        TokenType.DSTAR: exp.Pow,
    }

    FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "STRUCT_PACK"}

    SHOW_PARSERS = {
        "TABLES": _show_parser("TABLES"),
        "ALL TABLES": _show_parser("ALL TABLES"),
    }

    # Maps DuckDB function names to sqlglot expression builders.
    FUNCTIONS = {
        **parser.Parser.FUNCTIONS,
        # DuckDB's ANY_VALUE ignores NULLs, so wrap the AST node accordingly
        "ANY_VALUE": lambda args: exp.IgnoreNulls(this=exp.AnyValue.from_arg_list(args)),
        "ARRAY_PREPEND": _build_array_prepend,
        "ARRAY_REVERSE_SORT": _build_sort_array_desc,
        "ARRAY_SORT": exp.SortArray.from_arg_list,
        "BIT_AND": exp.BitwiseAndAgg.from_arg_list,
        "BIT_OR": exp.BitwiseOrAgg.from_arg_list,
        "BIT_XOR": exp.BitwiseXorAgg.from_arg_list,
        "CURRENT_LOCALTIMESTAMP": exp.Localtimestamp.from_arg_list,
        "DATEDIFF": _build_date_diff,
        "DATE_DIFF": _build_date_diff,
        "DATE_TRUNC": date_trunc_to_time,
        "DATETRUNC": date_trunc_to_time,
        # DuckDB's DECODE/ENCODE convert between BLOB and UTF-8 text; only the
        # first argument is kept and the charset is pinned to utf-8.
        "DECODE": lambda args: exp.Decode(
            this=seq_get(args, 0), charset=exp.Literal.string("utf-8")
        ),
        "EDITDIST3": exp.Levenshtein.from_arg_list,
        "ENCODE": lambda args: exp.Encode(
            this=seq_get(args, 0), charset=exp.Literal.string("utf-8")
        ),
        "EPOCH": exp.TimeToUnix.from_arg_list,
        "EPOCH_MS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
        ),
        "GENERATE_SERIES": _build_generate_series(),
        "GET_CURRENT_TIME": exp.CurrentTime.from_arg_list,
        # DuckDB's GET_BIT counts bit 0 from the most significant bit
        "GET_BIT": lambda args: exp.Getbit(
            this=seq_get(args, 0), expression=seq_get(args, 1), zero_is_msb=True
        ),
        "JARO_WINKLER_SIMILARITY": exp.JarowinklerSimilarity.from_arg_list,
        "JSON": exp.ParseJSON.from_arg_list,
        "JSON_EXTRACT_PATH": parser.build_extract_json_with_path(exp.JSONExtract),
        "JSON_EXTRACT_STRING": parser.build_extract_json_with_path(exp.JSONExtractScalar),
        "LIST_APPEND": exp.ArrayAppend.from_arg_list,
        "LIST_CONCAT": parser.build_array_concat,
        "LIST_CONTAINS": exp.ArrayContains.from_arg_list,
        "LIST_COSINE_DISTANCE": exp.CosineDistance.from_arg_list,
        "LIST_DISTANCE": exp.EuclideanDistance.from_arg_list,
        "LIST_FILTER": exp.ArrayFilter.from_arg_list,
        "LIST_HAS": exp.ArrayContains.from_arg_list,
        "LIST_HAS_ANY": exp.ArrayOverlaps.from_arg_list,
        "LIST_MAX": exp.ArrayMax.from_arg_list,
        "LIST_MIN": exp.ArrayMin.from_arg_list,
        "LIST_PREPEND": _build_array_prepend,
        "LIST_REVERSE_SORT": _build_sort_array_desc,
        "LIST_SORT": exp.SortArray.from_arg_list,
        "LIST_TRANSFORM": exp.Transform.from_arg_list,
        "LIST_VALUE": lambda args: exp.Array(expressions=args),
        "MAKE_DATE": exp.DateFromParts.from_arg_list,
        "MAKE_TIME": exp.TimeFromParts.from_arg_list,
        "MAKE_TIMESTAMP": _build_make_timestamp,
        "QUANTILE_CONT": exp.PercentileCont.from_arg_list,
        "QUANTILE_DISC": exp.PercentileDisc.from_arg_list,
        # RANGE excludes its end bound, unlike GENERATE_SERIES
        "RANGE": _build_generate_series(end_exclusive=True),
        "REGEXP_EXTRACT": build_regexp_extract(exp.RegexpExtract),
        "REGEXP_EXTRACT_ALL": build_regexp_extract(exp.RegexpExtractAll),
        "REGEXP_MATCHES": exp.RegexpLike.from_arg_list,
        # DuckDB's REGEXP_REPLACE replaces only the first match by default
        "REGEXP_REPLACE": lambda args: exp.RegexpReplace(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            replacement=seq_get(args, 2),
            modifiers=seq_get(args, 3),
            single_replace=True,
        ),
        "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
        "STRFTIME": build_formatted_time(exp.TimeToStr, "duckdb"),
        "STRING_SPLIT": exp.Split.from_arg_list,
        "STRING_SPLIT_REGEX": exp.RegexpSplit.from_arg_list,
        "STRING_TO_ARRAY": exp.Split.from_arg_list,
        "STRPTIME": build_formatted_time(exp.StrToTime, "duckdb"),
        "STRUCT_PACK": exp.Struct.from_arg_list,
        "STR_SPLIT": exp.Split.from_arg_list,
        "STR_SPLIT_REGEX": exp.RegexpSplit.from_arg_list,
        "TODAY": exp.CurrentDate.from_arg_list,
        "TIME_BUCKET": exp.DateBin.from_arg_list,
        "TO_TIMESTAMP": exp.UnixToTime.from_arg_list,
        "UNNEST": exp.Explode.from_arg_list,
        "VERSION": exp.CurrentVersion.from_arg_list,
        "XOR": binary_from_function(exp.BitwiseXor),
    }

    # These names must not get the base parser's builtin treatment in DuckDB.
    FUNCTIONS.pop("DATE_SUB")
    FUNCTIONS.pop("GLOB")

    FUNCTION_PARSERS = {
        **parser.Parser.FUNCTION_PARSERS,
        # All three spellings share the string-aggregation parser
        **dict.fromkeys(
            ("GROUP_CONCAT", "LISTAGG", "STRINGAGG"), lambda self: self._parse_string_agg()
        ),
    }
    # DECODE is a plain function here (see FUNCTIONS), not special-cased syntax.
    FUNCTION_PARSERS.pop("DECODE")

    NO_PAREN_FUNCTION_PARSERS = {
        **parser.Parser.NO_PAREN_FUNCTION_PARSERS,
        "MAP": lambda self: self._parse_map(),
        # Prefix `@expr` is absolute value in DuckDB
        "@": lambda self: exp.Abs(this=self._parse_bitwise()),
    }

    # SEMI/ANTI are join keywords in DuckDB, so they can't be table aliases.
    TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
        TokenType.SEMI,
        TokenType.ANTI,
    }

    PLACEHOLDER_PARSERS = {
        **parser.Parser.PLACEHOLDER_PARSERS,
        # `$1` / `$name` style placeholders: PARAMETER must be followed by a
        # number or an identifier-like token, otherwise it's not a placeholder
        TokenType.PARAMETER: lambda self: (
            self.expression(exp.Placeholder, this=self._prev.text)
            if self._match(TokenType.NUMBER) or self._match_set(self.ID_VAR_TOKENS)
            else None
        ),
    }

    TYPE_CONVERTERS = {
        # https://duckdb.org/docs/sql/data_types/numeric
        exp.DataType.Type.DECIMAL: build_default_decimal_type(precision=18, scale=3),
        # https://duckdb.org/docs/sql/data_types/text
        exp.DataType.Type.TEXT: lambda dtype: exp.DataType.build("TEXT"),
    }

    STATEMENT_PARSERS = {
        **parser.Parser.STATEMENT_PARSERS,
        TokenType.ATTACH: lambda self: self._parse_attach_detach(),
        TokenType.DETACH: lambda self: self._parse_attach_detach(is_attach=False),
        TokenType.FORCE: lambda self: self._parse_force(),
        TokenType.INSTALL: lambda self: self._parse_install(),
        TokenType.SHOW: lambda self: self._parse_show(),
    }

    SET_PARSERS = {
        **parser.Parser.SET_PARSERS,
        "VARIABLE": lambda self: self._parse_set_item_assignment("VARIABLE"),
    }

    def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]:
        """Parse DuckDB's `LAMBDA args : body` syntax, falling back to the
        base parser's lambda handling when the LAMBDA keyword is absent."""
        index = self._index
        if not self._match_text_seq("LAMBDA"):
            return super()._parse_lambda(alias=alias)

        expressions = self._parse_csv(self._parse_lambda_arg)
        if not self._match(TokenType.COLON):
            # Not actually a lambda - rewind so LAMBDA can be re-parsed as an id
            self._retreat(index)
            return None

        this = self._replace_lambda(self._parse_assignment(), expressions)
        return self.expression(exp.Lambda, this=this, expressions=expressions, colon=True)

    def _parse_expression(self) -> t.Optional[exp.Expression]:
        # DuckDB supports prefix aliases, e.g. foo: 1
        if self._next and self._next.token_type == TokenType.COLON:
            alias = self._parse_id_var(tokens=self.ALIAS_TOKENS)
            self._match(TokenType.COLON)
            comments = self._prev_comments or []

            this = self._parse_assignment()
            if isinstance(this, exp.Expression):
                # Moves the comment next to the alias in `alias: expr /* comment */`
                comments += this.pop_comments() or []

            return self.expression(exp.Alias, comments=comments, this=this, alias=alias)

        return super()._parse_expression()

    def _parse_table(
        self,
        schema: bool = False,
        joins: bool = False,
        alias_tokens: t.Optional[t.Collection[TokenType]] = None,
        parse_bracket: bool = False,
        is_db_reference: bool = False,
        parse_partition: bool = False,
        consume_pipe: bool = False,
    ) -> t.Optional[exp.Expression]:
        # DuckDB supports prefix aliases, e.g. FROM foo: bar
        if self._next and self._next.token_type == TokenType.COLON:
            alias = self._parse_table_alias(
                alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS
            )
            self._match(TokenType.COLON)
            comments = self._prev_comments or []
        else:
            alias = None
            comments = []

        # NOTE(review): `consume_pipe` is accepted but not forwarded to
        # super()._parse_table() - confirm this is intentional.
        table = super()._parse_table(
            schema=schema,
            joins=joins,
            alias_tokens=alias_tokens,
            parse_bracket=parse_bracket,
            is_db_reference=is_db_reference,
            parse_partition=parse_partition,
        )
        if isinstance(table, exp.Expression) and isinstance(alias, exp.TableAlias):
            # Moves the comment next to the alias in `alias: table /* comment */`
            comments += table.pop_comments() or []
            alias.comments = alias.pop_comments() + comments
            table.set("alias", alias)

        return table

    def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]:
        # https://duckdb.org/docs/sql/samples.html
        sample = super()._parse_table_sample(as_modifier=as_modifier)
        if sample and not sample.args.get("method"):
            # Fill in DuckDB's implicit default sampling method: RESERVOIR
            # when a row count is given, SYSTEM when a percentage is given.
            if sample.args.get("size"):
                sample.set("method", exp.var("RESERVOIR"))
            else:
                sample.set("method", exp.var("SYSTEM"))

        return sample

    def _parse_bracket(
        self, this: t.Optional[exp.Expression] = None
    ) -> t.Optional[exp.Expression]:
        bracket = super()._parse_bracket(this)

        if self.dialect.version < (1, 2) and isinstance(bracket, exp.Bracket):
            # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
            bracket.set("returns_list_for_maps", True)

        return bracket

    def _parse_map(self) -> exp.ToMap | exp.Map:
        """Parse either the `MAP {k: v, ...}` literal form (ToMap) or the
        `MAP(keys, values)` two-list constructor form (Map)."""
        if self._match(TokenType.L_BRACE, advance=False):
            return self.expression(exp.ToMap, this=self._parse_bracket())

        args = self._parse_wrapped_csv(self._parse_assignment)
        return self.expression(exp.Map, keys=seq_get(args, 0), values=seq_get(args, 1))

    def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]:
        return self._parse_field_def()

    def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]:
        # A single aggregation keeps the default naming; multiple aggregations
        # get DuckDB-specific generated column names.
        if len(aggregations) == 1:
            return super()._pivot_column_names(aggregations)
        return pivot_column_names(aggregations, dialect="duckdb")

    def _parse_attach_detach(self, is_attach: bool = True) -> exp.Attach | exp.Detach:
        """Parse ATTACH [DATABASE] [IF NOT EXISTS] ... [(opts)] and
        DETACH [DATABASE] [IF EXISTS] ... statements."""

        def _parse_attach_option() -> exp.AttachOption:
            return self.expression(
                exp.AttachOption,
                this=self._parse_var(any_token=True),
                expression=self._parse_field(any_token=True),
            )

        self._match(TokenType.DATABASE)
        # ATTACH uses IF NOT EXISTS, DETACH uses IF EXISTS
        exists = self._parse_exists(not_=is_attach)
        this = self._parse_alias(self._parse_primary_or_var(), explicit=True)

        if self._match(TokenType.L_PAREN, advance=False):
            expressions = self._parse_wrapped_csv(_parse_attach_option)
        else:
            expressions = None

        return (
            self.expression(exp.Attach, this=this, exists=exists, expressions=expressions)
            if is_attach
            else self.expression(exp.Detach, this=this, exists=exists)
        )

    def _parse_show_duckdb(self, this: str) -> exp.Show:
        return self.expression(exp.Show, this=this)

    def _parse_force(self) -> exp.Install | exp.Command:
        # FORCE can only be followed by INSTALL or CHECKPOINT
        # In the case of CHECKPOINT, we fallback
        if not self._match(TokenType.INSTALL):
            return self._parse_as_command(self._prev)

        return self._parse_install(force=True)

    def _parse_install(self, force: bool = False) -> exp.Install:
        """Parse [FORCE] INSTALL <extension> [FROM <repository>]."""
        return self.expression(
            exp.Install,
            this=self._parse_id_var(),
            from_=self._parse_var_or_string() if self._match(TokenType.FROM) else None,
            force=force,
        )

    def _parse_primary(self) -> t.Optional[exp.Expression]:
        # `#N` refers to the N-th column positionally
        if self._match_pair(TokenType.HASH, TokenType.NUMBER):
            return exp.PositionalColumn(this=exp.Literal.number(self._prev.text))

        return super()._parse_primary()
A Parser consumes the list of tokens produced by the Tokenizer and builds a parsed syntax tree (AST) from it.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- NESTED_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- TRIGGER_EVENTS
- ALTERABLES
- ALIAS_TOKENS
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- TERM
- FACTOR
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- PROPERTY_PARSERS
- CONSTRAINT_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- INVALID_FUNC_NAME_TOKENS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- QUERY_MODIFIER_TOKENS
- TYPE_LITERAL_PARSERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- TRIGGER_TIMING
- TRIGGER_DEFERRABLE
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- SET_ASSIGNMENT_DELIMITERS
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- PREFIXED_PIVOT_COLUMNS
- IDENTIFY_PIVOT_STRINGS
- LOG_DEFAULTS_TO_LN
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- SUPPORTS_IMPLICIT_UNNEST
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ALTER_TABLE_PARTITIONS
- JOINS_HAVE_EQUAL_PRECEDENCE
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- ADD_JOIN_ON_TRUE
- SUPPORTS_OMITTED_INTERVAL_SPAN_UNIT
- raise_error
- validate_expression
- reset
- errors
- error_level
- error_message_context
- max_errors
- dialect
- sql
- parse
- parse_into
- check_errors
- expression
- parse_set_operation
- build_cast
1877 class Generator(generator.Generator): 1878 PARAMETER_TOKEN = "$" 1879 NAMED_PLACEHOLDER_TOKEN = "$" 1880 JOIN_HINTS = False 1881 TABLE_HINTS = False 1882 QUERY_HINTS = False 1883 LIMIT_FETCH = "LIMIT" 1884 STRUCT_DELIMITER = ("(", ")") 1885 RENAME_TABLE_WITH_DB = False 1886 NVL2_SUPPORTED = False 1887 SEMI_ANTI_JOIN_WITH_SIDE = False 1888 TABLESAMPLE_KEYWORDS = "USING SAMPLE" 1889 TABLESAMPLE_SEED_KEYWORD = "REPEATABLE" 1890 LAST_DAY_SUPPORTS_DATE_PART = False 1891 JSON_KEY_VALUE_PAIR_SEP = "," 1892 IGNORE_NULLS_IN_FUNC = True 1893 JSON_PATH_BRACKETED_KEY_SUPPORTED = False 1894 SUPPORTS_CREATE_TABLE_LIKE = False 1895 MULTI_ARG_DISTINCT = False 1896 CAN_IMPLEMENT_ARRAY_ANY = True 1897 SUPPORTS_TO_NUMBER = False 1898 SUPPORTS_WINDOW_EXCLUDE = True 1899 COPY_HAS_INTO_KEYWORD = False 1900 STAR_EXCEPT = "EXCLUDE" 1901 PAD_FILL_PATTERN_IS_REQUIRED = True 1902 ARRAY_SIZE_DIM_REQUIRED = False 1903 NORMALIZE_EXTRACT_DATE_PARTS = True 1904 SUPPORTS_LIKE_QUANTIFIERS = False 1905 SET_ASSIGNMENT_REQUIRES_VARIABLE_KEYWORD = True 1906 1907 TRANSFORMS = { 1908 **generator.Generator.TRANSFORMS, 1909 exp.AnyValue: _anyvalue_sql, 1910 exp.ApproxDistinct: approx_count_distinct_sql, 1911 exp.Boolnot: _boolnot_sql, 1912 exp.Booland: _booland_sql, 1913 exp.Boolor: _boolor_sql, 1914 exp.Array: transforms.preprocess( 1915 [transforms.inherit_struct_field_names], 1916 generator=inline_array_unless_query, 1917 ), 1918 exp.ArrayAppend: array_append_sql("LIST_APPEND"), 1919 exp.ArrayCompact: array_compact_sql, 1920 exp.ArrayConstructCompact: lambda self, e: self.sql( 1921 exp.ArrayCompact(this=exp.Array(expressions=e.expressions)) 1922 ), 1923 exp.ArrayConcat: array_concat_sql("LIST_CONCAT"), 1924 exp.ArrayContains: _array_contains_sql, 1925 exp.ArrayFilter: rename_func("LIST_FILTER"), 1926 exp.ArrayInsert: _array_insert_sql, 1927 exp.ArrayRemoveAt: _array_remove_at_sql, 1928 exp.ArrayRemove: remove_from_array_using_filter, 1929 exp.ArraySort: _array_sort_sql, 1930 exp.ArrayPrepend: 
array_append_sql("LIST_PREPEND", swap_params=True), 1931 exp.ArraySum: rename_func("LIST_SUM"), 1932 exp.ArrayMax: rename_func("LIST_MAX"), 1933 exp.ArrayMin: rename_func("LIST_MIN"), 1934 exp.ArrayUniqueAgg: lambda self, e: self.func( 1935 "LIST", exp.Distinct(expressions=[e.this]) 1936 ), 1937 exp.Base64DecodeBinary: lambda self, e: _base64_decode_sql(self, e, to_string=False), 1938 exp.Base64DecodeString: lambda self, e: _base64_decode_sql(self, e, to_string=True), 1939 exp.BitwiseAnd: lambda self, e: self._bitwise_op(e, "&"), 1940 exp.BitwiseAndAgg: _bitwise_agg_sql, 1941 exp.BitwiseCount: rename_func("BIT_COUNT"), 1942 exp.BitwiseLeftShift: _bitshift_sql, 1943 exp.BitwiseOr: lambda self, e: self._bitwise_op(e, "|"), 1944 exp.BitwiseOrAgg: _bitwise_agg_sql, 1945 exp.BitwiseRightShift: _bitshift_sql, 1946 exp.BitwiseXorAgg: _bitwise_agg_sql, 1947 exp.ByteLength: lambda self, e: self.func("OCTET_LENGTH", e.this), 1948 exp.CommentColumnConstraint: no_comment_column_constraint_sql, 1949 exp.Corr: lambda self, e: self._corr_sql(e), 1950 exp.CosineDistance: rename_func("LIST_COSINE_DISTANCE"), 1951 exp.CurrentTime: lambda *_: "CURRENT_TIME", 1952 exp.CurrentSchemas: lambda self, e: self.func( 1953 "current_schemas", e.this if e.this else exp.true() 1954 ), 1955 exp.CurrentTimestamp: lambda self, e: self.sql( 1956 exp.AtTimeZone(this=exp.var("CURRENT_TIMESTAMP"), zone=exp.Literal.string("UTC")) 1957 ) 1958 if e.args.get("sysdate") 1959 else "CURRENT_TIMESTAMP", 1960 exp.CurrentVersion: rename_func("version"), 1961 exp.Localtime: unsupported_args("this")(lambda *_: "LOCALTIME"), 1962 exp.DayOfMonth: rename_func("DAYOFMONTH"), 1963 exp.DayOfWeek: rename_func("DAYOFWEEK"), 1964 exp.DayOfWeekIso: rename_func("ISODOW"), 1965 exp.DayOfYear: rename_func("DAYOFYEAR"), 1966 exp.Dayname: lambda self, e: ( 1967 self.func("STRFTIME", e.this, exp.Literal.string("%a")) 1968 if e.args.get("abbreviated") 1969 else self.func("DAYNAME", e.this) 1970 ), 1971 exp.Monthname: lambda self, 
e: ( 1972 self.func("STRFTIME", e.this, exp.Literal.string("%b")) 1973 if e.args.get("abbreviated") 1974 else self.func("MONTHNAME", e.this) 1975 ), 1976 exp.DataType: _datatype_sql, 1977 exp.Date: _date_sql, 1978 exp.DateAdd: _date_delta_to_binary_interval_op(), 1979 exp.DateFromParts: _date_from_parts_sql, 1980 exp.DateSub: _date_delta_to_binary_interval_op(), 1981 exp.DateDiff: _date_diff_sql, 1982 exp.DateStrToDate: datestrtodate_sql, 1983 exp.Datetime: no_datetime_sql, 1984 exp.DatetimeDiff: _date_diff_sql, 1985 exp.DatetimeSub: _date_delta_to_binary_interval_op(), 1986 exp.DatetimeAdd: _date_delta_to_binary_interval_op(), 1987 exp.DateToDi: lambda self, 1988 e: f"CAST(STRFTIME({self.sql(e, 'this')}, {DuckDB.DATEINT_FORMAT}) AS INT)", 1989 exp.Decode: lambda self, e: encode_decode_sql(self, e, "DECODE", replace=False), 1990 exp.DiToDate: lambda self, 1991 e: f"CAST(STRPTIME(CAST({self.sql(e, 'this')} AS TEXT), {DuckDB.DATEINT_FORMAT}) AS DATE)", 1992 exp.Encode: lambda self, e: encode_decode_sql(self, e, "ENCODE", replace=False), 1993 exp.EqualNull: lambda self, e: self.sql( 1994 exp.NullSafeEQ(this=e.this, expression=e.expression) 1995 ), 1996 exp.EuclideanDistance: rename_func("LIST_DISTANCE"), 1997 exp.GenerateDateArray: _generate_datetime_array_sql, 1998 exp.GenerateTimestampArray: _generate_datetime_array_sql, 1999 exp.Getbit: getbit_sql, 2000 exp.GroupConcat: lambda self, e: groupconcat_sql(self, e, within_group=False), 2001 exp.Explode: rename_func("UNNEST"), 2002 exp.IntDiv: lambda self, e: self.binary(e, "//"), 2003 exp.IsInf: rename_func("ISINF"), 2004 exp.IsNan: rename_func("ISNAN"), 2005 exp.IsNullValue: lambda self, e: self.sql( 2006 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("NULL")) 2007 ), 2008 exp.IsArray: lambda self, e: self.sql( 2009 exp.func("JSON_TYPE", e.this).eq(exp.Literal.string("ARRAY")) 2010 ), 2011 exp.Ceil: _ceil_floor, 2012 exp.Floor: _ceil_floor, 2013 exp.JarowinklerSimilarity: 
jarowinkler_similarity("JARO_WINKLER_SIMILARITY"), 2014 exp.JSONBExists: rename_func("JSON_EXISTS"), 2015 exp.JSONExtract: _arrow_json_extract_sql, 2016 exp.JSONExtractArray: _json_extract_value_array_sql, 2017 exp.JSONFormat: _json_format_sql, 2018 exp.JSONValueArray: _json_extract_value_array_sql, 2019 exp.Lateral: _explode_to_unnest_sql, 2020 exp.LogicalOr: lambda self, e: self.func("BOOL_OR", _cast_to_boolean(e.this)), 2021 exp.LogicalAnd: lambda self, e: self.func("BOOL_AND", _cast_to_boolean(e.this)), 2022 exp.Select: transforms.preprocess([_seq_to_range_in_generator]), 2023 exp.Seq1: lambda self, e: _seq_sql(self, e, 1), 2024 exp.Seq2: lambda self, e: _seq_sql(self, e, 2), 2025 exp.Seq4: lambda self, e: _seq_sql(self, e, 4), 2026 exp.Seq8: lambda self, e: _seq_sql(self, e, 8), 2027 exp.BoolxorAgg: _boolxor_agg_sql, 2028 exp.MakeInterval: lambda self, e: no_make_interval_sql(self, e, sep=" "), 2029 exp.Initcap: _initcap_sql, 2030 exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)), 2031 exp.SHA: lambda self, e: _sha_sql(self, e, "SHA1"), 2032 exp.SHA1Digest: lambda self, e: _sha_sql(self, e, "SHA1", is_binary=True), 2033 exp.SHA2: lambda self, e: _sha_sql(self, e, "SHA256"), 2034 exp.SHA2Digest: lambda self, e: _sha_sql(self, e, "SHA256", is_binary=True), 2035 exp.MonthsBetween: months_between_sql, 2036 exp.NextDay: _day_navigation_sql, 2037 exp.PercentileCont: rename_func("QUANTILE_CONT"), 2038 exp.PercentileDisc: rename_func("QUANTILE_DISC"), 2039 # DuckDB doesn't allow qualified columns inside of PIVOT expressions. 
2040 # See: https://github.com/duckdb/duckdb/blob/671faf92411182f81dce42ac43de8bfb05d9909e/src/planner/binder/tableref/bind_pivot.cpp#L61-L62 2041 exp.Pivot: transforms.preprocess([transforms.unqualify_columns]), 2042 exp.PreviousDay: _day_navigation_sql, 2043 exp.RegexpILike: lambda self, e: self.func( 2044 "REGEXP_MATCHES", e.this, e.expression, exp.Literal.string("i") 2045 ), 2046 exp.RegexpSplit: rename_func("STR_SPLIT_REGEX"), 2047 exp.RegrValx: _regr_val_sql, 2048 exp.RegrValy: _regr_val_sql, 2049 exp.Return: lambda self, e: self.sql(e, "this"), 2050 exp.ReturnsProperty: lambda self, e: "TABLE" if isinstance(e.this, exp.Schema) else "", 2051 exp.Rand: rename_func("RANDOM"), 2052 exp.Split: rename_func("STR_SPLIT"), 2053 exp.SortArray: _sort_array_sql, 2054 exp.StrPosition: strposition_sql, 2055 exp.StrToUnix: lambda self, e: self.func( 2056 "EPOCH", self.func("STRPTIME", e.this, self.format_time(e)) 2057 ), 2058 exp.Struct: _struct_sql, 2059 exp.Transform: rename_func("LIST_TRANSFORM"), 2060 exp.TimeAdd: _date_delta_to_binary_interval_op(), 2061 exp.TimeSub: _date_delta_to_binary_interval_op(), 2062 exp.Time: no_time_sql, 2063 exp.TimeDiff: _timediff_sql, 2064 exp.Timestamp: no_timestamp_sql, 2065 exp.TimestampAdd: _date_delta_to_binary_interval_op(), 2066 exp.TimestampDiff: lambda self, e: self.func( 2067 "DATE_DIFF", exp.Literal.string(e.unit), e.expression, e.this 2068 ), 2069 exp.TimestampSub: _date_delta_to_binary_interval_op(), 2070 exp.TimeStrToDate: lambda self, e: self.sql(exp.cast(e.this, exp.DataType.Type.DATE)), 2071 exp.TimeStrToTime: timestrtotime_sql, 2072 exp.TimeStrToUnix: lambda self, e: self.func( 2073 "EPOCH", exp.cast(e.this, exp.DataType.Type.TIMESTAMP) 2074 ), 2075 exp.TimeToStr: lambda self, e: self.func("STRFTIME", e.this, self.format_time(e)), 2076 exp.ToBoolean: _to_boolean_sql, 2077 exp.TimeToUnix: rename_func("EPOCH"), 2078 exp.TsOrDiToDi: lambda self, 2079 e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS TEXT), '-', ''), 1, 
8) AS INT)", 2080 exp.TsOrDsAdd: _date_delta_to_binary_interval_op(), 2081 exp.TsOrDsDiff: lambda self, e: self.func( 2082 "DATE_DIFF", 2083 f"'{e.args.get('unit') or 'DAY'}'", 2084 exp.cast(e.expression, exp.DataType.Type.TIMESTAMP), 2085 exp.cast(e.this, exp.DataType.Type.TIMESTAMP), 2086 ), 2087 exp.UnixMicros: lambda self, e: self.func("EPOCH_US", _implicit_datetime_cast(e.this)), 2088 exp.UnixMillis: lambda self, e: self.func("EPOCH_MS", _implicit_datetime_cast(e.this)), 2089 exp.UnixSeconds: lambda self, e: self.sql( 2090 exp.cast( 2091 self.func("EPOCH", _implicit_datetime_cast(e.this)), exp.DataType.Type.BIGINT 2092 ) 2093 ), 2094 exp.UnixToStr: lambda self, e: self.func( 2095 "STRFTIME", self.func("TO_TIMESTAMP", e.this), self.format_time(e) 2096 ), 2097 exp.DatetimeTrunc: lambda self, e: self.func( 2098 "DATE_TRUNC", unit_to_str(e), exp.cast(e.this, exp.DataType.Type.DATETIME) 2099 ), 2100 exp.UnixToTime: _unix_to_time_sql, 2101 exp.UnixToTimeStr: lambda self, e: f"CAST(TO_TIMESTAMP({self.sql(e, 'this')}) AS TEXT)", 2102 exp.VariancePop: rename_func("VAR_POP"), 2103 exp.WeekOfYear: rename_func("WEEKOFYEAR"), 2104 exp.YearOfWeek: lambda self, e: self.sql( 2105 exp.Extract( 2106 this=exp.Var(this="ISOYEAR"), 2107 expression=e.this, 2108 ) 2109 ), 2110 exp.YearOfWeekIso: lambda self, e: self.sql( 2111 exp.Extract( 2112 this=exp.Var(this="ISOYEAR"), 2113 expression=e.this, 2114 ) 2115 ), 2116 exp.Xor: _xor_sql, 2117 exp.JSONObjectAgg: rename_func("JSON_GROUP_OBJECT"), 2118 exp.JSONBObjectAgg: rename_func("JSON_GROUP_OBJECT"), 2119 exp.DateBin: rename_func("TIME_BUCKET"), 2120 exp.LastDay: _last_day_sql, 2121 } 2122 2123 SUPPORTED_JSON_PATH_PARTS = { 2124 exp.JSONPathKey, 2125 exp.JSONPathRoot, 2126 exp.JSONPathSubscript, 2127 exp.JSONPathWildcard, 2128 } 2129 2130 TYPE_MAPPING = { 2131 **generator.Generator.TYPE_MAPPING, 2132 exp.DataType.Type.BINARY: "BLOB", 2133 exp.DataType.Type.BPCHAR: "TEXT", 2134 exp.DataType.Type.CHAR: "TEXT", 2135 
exp.DataType.Type.DATETIME: "TIMESTAMP", 2136 exp.DataType.Type.DECFLOAT: "DECIMAL(38, 5)", 2137 exp.DataType.Type.FLOAT: "REAL", 2138 exp.DataType.Type.JSONB: "JSON", 2139 exp.DataType.Type.NCHAR: "TEXT", 2140 exp.DataType.Type.NVARCHAR: "TEXT", 2141 exp.DataType.Type.UINT: "UINTEGER", 2142 exp.DataType.Type.VARBINARY: "BLOB", 2143 exp.DataType.Type.ROWVERSION: "BLOB", 2144 exp.DataType.Type.VARCHAR: "TEXT", 2145 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMPTZ", 2146 exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP", 2147 exp.DataType.Type.TIMESTAMP_S: "TIMESTAMP_S", 2148 exp.DataType.Type.TIMESTAMP_MS: "TIMESTAMP_MS", 2149 exp.DataType.Type.TIMESTAMP_NS: "TIMESTAMP_NS", 2150 exp.DataType.Type.BIGDECIMAL: "DECIMAL(38, 5)", 2151 } 2152 2153 # https://github.com/duckdb/duckdb/blob/ff7f24fd8e3128d94371827523dae85ebaf58713/third_party/libpg_query/grammar/keywords/reserved_keywords.list#L1-L77 2154 RESERVED_KEYWORDS = { 2155 "array", 2156 "analyse", 2157 "union", 2158 "all", 2159 "when", 2160 "in_p", 2161 "default", 2162 "create_p", 2163 "window", 2164 "asymmetric", 2165 "to", 2166 "else", 2167 "localtime", 2168 "from", 2169 "end_p", 2170 "select", 2171 "current_date", 2172 "foreign", 2173 "with", 2174 "grant", 2175 "session_user", 2176 "or", 2177 "except", 2178 "references", 2179 "fetch", 2180 "limit", 2181 "group_p", 2182 "leading", 2183 "into", 2184 "collate", 2185 "offset", 2186 "do", 2187 "then", 2188 "localtimestamp", 2189 "check_p", 2190 "lateral_p", 2191 "current_role", 2192 "where", 2193 "asc_p", 2194 "placing", 2195 "desc_p", 2196 "user", 2197 "unique", 2198 "initially", 2199 "column", 2200 "both", 2201 "some", 2202 "as", 2203 "any", 2204 "only", 2205 "deferrable", 2206 "null_p", 2207 "current_time", 2208 "true_p", 2209 "table", 2210 "case", 2211 "trailing", 2212 "variadic", 2213 "for", 2214 "on", 2215 "distinct", 2216 "false_p", 2217 "not", 2218 "constraint", 2219 "current_timestamp", 2220 "returning", 2221 "primary", 2222 "intersect", 2223 "having", 2224 "analyze", 
2225 "current_user", 2226 "and", 2227 "cast", 2228 "symmetric", 2229 "using", 2230 "order", 2231 "current_catalog", 2232 } 2233 2234 UNWRAPPED_INTERVAL_VALUES = (exp.Literal, exp.Paren) 2235 2236 # DuckDB doesn't generally support CREATE TABLE .. properties 2237 # https://duckdb.org/docs/sql/statements/create_table.html 2238 PROPERTIES_LOCATION = { 2239 prop: exp.Properties.Location.UNSUPPORTED 2240 for prop in generator.Generator.PROPERTIES_LOCATION 2241 } 2242 2243 # There are a few exceptions (e.g. temporary tables) which are supported or 2244 # can be transpiled to DuckDB, so we explicitly override them accordingly 2245 PROPERTIES_LOCATION[exp.LikeProperty] = exp.Properties.Location.POST_SCHEMA 2246 PROPERTIES_LOCATION[exp.TemporaryProperty] = exp.Properties.Location.POST_CREATE 2247 PROPERTIES_LOCATION[exp.ReturnsProperty] = exp.Properties.Location.POST_ALIAS 2248 PROPERTIES_LOCATION[exp.SequenceProperties] = exp.Properties.Location.POST_EXPRESSION 2249 2250 IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS = ( 2251 exp.FirstValue, 2252 exp.Lag, 2253 exp.LastValue, 2254 exp.Lead, 2255 exp.NthValue, 2256 ) 2257 2258 # Template for ZIPF transpilation - placeholders get replaced with actual parameters 2259 ZIPF_TEMPLATE: exp.Expression = exp.maybe_parse( 2260 """ 2261 WITH rand AS (SELECT :random_expr AS r), 2262 weights AS ( 2263 SELECT i, 1.0 / POWER(i, :s) AS w 2264 FROM RANGE(1, :n + 1) AS t(i) 2265 ), 2266 cdf AS ( 2267 SELECT i, SUM(w) OVER (ORDER BY i) / SUM(w) OVER () AS p 2268 FROM weights 2269 ) 2270 SELECT MIN(i) 2271 FROM cdf 2272 WHERE p >= (SELECT r FROM rand) 2273 """ 2274 ) 2275 2276 # Template for NORMAL transpilation using Box-Muller transform 2277 # mean + (stddev * sqrt(-2 * ln(u1)) * cos(2 * pi * u2)) 2278 NORMAL_TEMPLATE: exp.Expression = exp.maybe_parse( 2279 ":mean + (:stddev * SQRT(-2 * LN(GREATEST(:u1, 1e-10))) * COS(2 * PI() * :u2))" 2280 ) 2281 2282 # Template for generating a seeded pseudo-random value in [0, 1) from a hash 2283 
SEEDED_RANDOM_TEMPLATE: exp.Expression = exp.maybe_parse( 2284 "(ABS(HASH(:seed)) % 1000000) / 1000000.0" 2285 ) 2286 2287 # Template for generating signed and unsigned SEQ values within a specified range 2288 SEQ_UNSIGNED: exp.Expression = exp.maybe_parse(":base % :max_val") 2289 SEQ_SIGNED: exp.Expression = exp.maybe_parse( 2290 "(CASE WHEN :base % :max_val >= :half " 2291 "THEN :base % :max_val - :max_val " 2292 "ELSE :base % :max_val END)" 2293 ) 2294 2295 # Template for MAP_CAT transpilation - Snowflake semantics: 2296 # 1. Returns NULL if either input is NULL 2297 # 2. For duplicate keys, prefers non-NULL value (COALESCE(m2[k], m1[k])) 2298 # 3. Filters out entries with NULL values from the result 2299 MAPCAT_TEMPLATE: exp.Expression = exp.maybe_parse( 2300 """ 2301 CASE 2302 WHEN :map1 IS NULL OR :map2 IS NULL THEN NULL 2303 ELSE MAP_FROM_ENTRIES(LIST_FILTER(LIST_TRANSFORM( 2304 LIST_DISTINCT(LIST_CONCAT(MAP_KEYS(:map1), MAP_KEYS(:map2))), 2305 __k -> STRUCT_PACK(key := __k, value := COALESCE(:map2[__k], :map1[__k])) 2306 ), __x -> __x.value IS NOT NULL)) 2307 END 2308 """ 2309 ) 2310 2311 # Mappings for EXTRACT/DATE_PART transpilation 2312 # Maps Snowflake specifiers unsupported in DuckDB to strftime format codes 2313 EXTRACT_STRFTIME_MAPPINGS: t.Dict[str, t.Tuple[str, str]] = { 2314 "WEEKISO": ("%V", "INTEGER"), 2315 "YEAROFWEEK": ("%G", "INTEGER"), 2316 "YEAROFWEEKISO": ("%G", "INTEGER"), 2317 "NANOSECOND": ("%n", "BIGINT"), 2318 } 2319 2320 # Maps epoch-based specifiers to DuckDB epoch functions 2321 EXTRACT_EPOCH_MAPPINGS: t.Dict[str, str] = { 2322 "EPOCH_SECOND": "EPOCH", 2323 "EPOCH_MILLISECOND": "EPOCH_MS", 2324 "EPOCH_MICROSECOND": "EPOCH_US", 2325 "EPOCH_NANOSECOND": "EPOCH_NS", 2326 } 2327 2328 # Template for BITMAP_CONSTRUCT_AGG transpilation 2329 # 2330 # BACKGROUND: 2331 # Snowflake's BITMAP_CONSTRUCT_AGG aggregates integers into a compact binary bitmap. 
2332 # Supports values in range 0-32767, this version returns NULL if any value is out of range 2333 # See: https://docs.snowflake.com/en/sql-reference/functions/bitmap_construct_agg 2334 # See: https://docs.snowflake.com/en/user-guide/querying-bitmaps-for-distinct-counts 2335 # 2336 # Snowflake uses two different formats based on the number of unique values: 2337 # 2338 # Format 1 - Small bitmap (< 5 unique values): Length of 10 bytes 2339 # Bytes 0-1: Count of values as 2-byte big-endian integer (e.g., 3 values = 0x0003) 2340 # Bytes 2-9: Up to 4 values, each as 2-byte little-endian integers, zero-padded to 8 bytes 2341 # Example: Values [1, 2, 3] -> 0x0003 0100 0200 0300 0000 (hex) 2342 # count v1 v2 v3 pad 2343 # 2344 # Format 2 - Large bitmap (>= 5 unique values): Length of 10 + (2 * count) bytes 2345 # Bytes 0-9: Fixed header 0x08 followed by 9 zero bytes 2346 # Bytes 10+: Each value as 2-byte little-endian integer (no padding) 2347 # Example: Values [1,2,3,4,5] -> 0x08 00000000 00000000 00 0100 0200 0300 0400 0500 2348 # hdr ----9 zero bytes---- v1 v2 v3 v4 v5 2349 # 2350 # TEMPLATE STRUCTURE 2351 # 2352 # Phase 1 - Innermost subquery: Data preparation 2353 # SELECT LIST_SORT(...) AS l 2354 # - Aggregates all input values into a list, remove NULLs, duplicates and sorts 2355 # Result: Clean, sorted list of unique non-null integers stored as 'l' 2356 # 2357 # Phase 2 - Middle subquery: Hex string construction 2358 # LIST_TRANSFORM(...) 
    #   - Converts each integer to 2-byte little-endian hex representation
    #   - & 255 extracts low byte, >> 8 extracts high byte
    #   - LIST_REDUCE: Concatenates all hex pairs into single string 'h'
    #   Result: Hex string of all values
    #
    # Phase 3 - Outer SELECT: Final bitmap assembly
    #   LENGTH(l) < 5:
    #   - Small format: 2-byte count (big-endian via %04X) + values + zero padding
    #   LENGTH(l) >= 5:
    #   - Large format: Fixed 10-byte header + values (no padding needed)
    #   Result: Complete binary bitmap as BLOB
    #
    BITMAP_CONSTRUCT_AGG_TEMPLATE: exp.Expression = exp.maybe_parse(
        """
        SELECT CASE
            WHEN l IS NULL OR LENGTH(l) = 0 THEN NULL
            WHEN LENGTH(l) != LENGTH(LIST_FILTER(l, __v -> __v BETWEEN 0 AND 32767)) THEN NULL
            WHEN LENGTH(l) < 5 THEN UNHEX(PRINTF('%04X', LENGTH(l)) || h || REPEAT('00', GREATEST(0, 4 - LENGTH(l)) * 2))
            ELSE UNHEX('08000000000000000000' || h)
        END
        FROM (
            SELECT l, COALESCE(LIST_REDUCE(
                LIST_TRANSFORM(l, __x -> PRINTF('%02X%02X', CAST(__x AS INT) & 255, (CAST(__x AS INT) >> 8) & 255)),
                (__a, __b) -> __a || __b, ''
            ), '') AS h
            FROM (SELECT LIST_SORT(LIST_DISTINCT(LIST(:arg) FILTER(NOT :arg IS NULL))) AS l)
        )
        """
    )

    # Template for RANDSTR transpilation - placeholders get replaced with actual parameters
    RANDSTR_TEMPLATE: exp.Expression = exp.maybe_parse(
        f"""
        SELECT LISTAGG(
            SUBSTRING(
                '{RANDSTR_CHAR_POOL}',
                1 + CAST(FLOOR(random_value * 62) AS INT),
                1
            ),
            ''
        )
        FROM (
            SELECT (ABS(HASH(i + :seed)) % 1000) / 1000.0 AS random_value
            FROM RANGE(:length) AS t(i)
        )
        """,
    )

    # Template for MINHASH transpilation
    # Computes k minimum hash values across aggregated data using DuckDB list functions
    # Returns JSON matching Snowflake format: {"state": [...], "type": "minhash", "version": 1}
    MINHASH_TEMPLATE: exp.Expression = exp.maybe_parse(
        """
        SELECT JSON_OBJECT('state', LIST(min_h ORDER BY seed), 'type', 'minhash', 'version', 1)
        FROM (
            SELECT seed, LIST_MIN(LIST_TRANSFORM(vals, __v -> HASH(CAST(__v AS VARCHAR) || CAST(seed AS VARCHAR)))) AS min_h
            FROM (SELECT LIST(:expr) AS vals), RANGE(0, :k) AS t(seed)
        )
        """,
    )

    # Template for MINHASH_COMBINE transpilation
    # Combines multiple minhash signatures by taking element-wise minimum
    MINHASH_COMBINE_TEMPLATE: exp.Expression = exp.maybe_parse(
        """
        SELECT JSON_OBJECT('state', LIST(min_h ORDER BY idx), 'type', 'minhash', 'version', 1)
        FROM (
            SELECT
                pos AS idx,
                MIN(val) AS min_h
            FROM
                UNNEST(LIST(:expr)) AS _(sig),
                UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS t(val, pos)
            GROUP BY pos
        )
        """,
    )

    # Template for APPROXIMATE_SIMILARITY transpilation
    # Computes multi-way Jaccard similarity: fraction of positions where ALL signatures agree
    APPROXIMATE_SIMILARITY_TEMPLATE: exp.Expression = exp.maybe_parse(
        """
        SELECT CAST(SUM(CASE WHEN num_distinct = 1 THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*)
        FROM (
            SELECT pos, COUNT(DISTINCT h) AS num_distinct
            FROM (
                SELECT h, pos
                FROM UNNEST(LIST(:expr)) AS _(sig),
                UNNEST(CAST(sig -> 'state' AS UBIGINT[])) WITH ORDINALITY AS s(h, pos)
            )
            GROUP BY pos
        )
        """,
    )

    # Template for ARRAYS_ZIP transpilation
    # Snowflake pads to longest array; DuckDB LIST_ZIP truncates to shortest
    # Uses RANGE + indexing to match Snowflake behavior
    ARRAYS_ZIP_TEMPLATE: exp.Expression = exp.maybe_parse(
        """
        CASE WHEN :null_check THEN NULL
        WHEN :all_empty_check THEN [:empty_struct]
        ELSE LIST_TRANSFORM(RANGE(0, :max_len), __i -> :transform_struct)
        END
        """,
    )

    # ARRAY_EXCEPT with bag semantics: N - M occurrences via cumulative counting
    # 0-based indices in template (SQLGlot internal), converted to 1-based for DuckDB
    # IS NOT DISTINCT FROM for NULL-safe element comparison
    ARRAY_EXCEPT_TEMPLATE: exp.Expression = exp.maybe_parse(
        """
        CASE
            WHEN :source IS NULL OR :exclude IS NULL THEN NULL
            ELSE LIST_TRANSFORM(
                LIST_FILTER(
                    LIST_ZIP(:source, GENERATE_SERIES(1, LEN(:source))),
                    pair -> (
                        LEN(LIST_FILTER(:source[1:pair[1]], e -> e IS NOT DISTINCT FROM pair[0]))
                        > LEN(LIST_FILTER(:exclude, e -> e IS NOT DISTINCT FROM pair[0]))
                    )
                ),
                pair -> pair[0]
            )
        END
        """,
    )

    def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str:
        """
        Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.

        Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END'])
        DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)

        For 'END' kind, add the interval to get the end of the slice.
        For DATE type with 'END', cast result back to DATE to preserve type.
        """
        date_expr = expression.this
        slice_length = expression.expression
        unit = expression.unit
        kind = expression.text("kind").upper()

        # Create INTERVAL expression: INTERVAL 'N' UNIT
        interval_expr = exp.Interval(this=slice_length, unit=unit)

        # Create base time_bucket expression
        time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr)

        # Check if we need the end of the slice (default is start)
        if not kind == "END":
            # For 'START', return time_bucket directly
            return self.sql(time_bucket_expr)

        # For 'END', add the interval to get end of slice
        add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy())

        # If input is DATE type, cast result back to DATE to preserve type
        # DuckDB converts DATE to TIMESTAMP when adding intervals
        if date_expr.is_type(exp.DataType.Type.DATE):
            return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE))

        return self.sql(add_expr)

    def bitmapbucketnumber_sql(
        self: DuckDB.Generator, expression: exp.BitmapBucketNumber
    ) -> str:
        """
        Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.

        Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
        - Each bucket covers 32,768 values
        - Bucket numbering starts at 1
        - Formula: ((value - 1) // 32768) + 1 for positive values

        For non-positive values (0 and negative), we use value // 32768 to avoid
        producing bucket 0 or positive bucket IDs for negative inputs.
        """
        value = expression.this

        # Bucket formulas built as sqlglot arithmetic over the input expression
        positive_formula = ((value - 1) // 32768) + 1
        non_positive_formula = value // 32768

        # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END
        case_expr = (
            exp.case()
            .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula)
            .else_(non_positive_formula)
        )
        return self.sql(case_expr)

    def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str:
        """
        Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.

        Snowflake's BITMAP_BIT_POSITION behavior:
        - For n <= 0: returns ABS(n) % 32768
        - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
        """
        this = expression.this

        # Emit (IF(n > 0, n - 1, ABS(n))) % MAX_BIT_POSITION
        return self.sql(
            exp.Mod(
                this=exp.Paren(
                    this=exp.If(
                        this=exp.GT(this=this, expression=exp.Literal.number(0)),
                        true=this - exp.Literal.number(1),
                        false=exp.Abs(this=this),
                    )
                ),
                expression=MAX_BIT_POSITION,
            )
        )

    def bitmapconstructagg_sql(
        self: DuckDB.Generator, expression: exp.BitmapConstructAgg
    ) -> str:
        """
        Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent.
        Uses a pre-parsed template with placeholders replaced by expression nodes.

        Snowflake bitmap format:
        - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
        - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
        """
        arg = expression.this
        # Wrap in parens: the template is a full SELECT used as a scalar subquery
        return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"

    def nthvalue_sql(self: DuckDB.Generator, expression: exp.NthValue) -> str:
        # DuckDB has no FROM LAST equivalent for NTH_VALUE; warn and fall through
        from_first = expression.args.get("from_first", True)
        if not from_first:
            self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end ")

        return self.function_fallback_sql(expression)

    def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str:
        """
        Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random.
        Uses a pre-parsed template with placeholders replaced by expression nodes.

        RANDSTR(length, generator) generates a random string of specified length.
        - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
        - With RANDOM(): Use RANDOM() in the hash for non-deterministic output
        - No generator: Use default seed value
        """
        length = expression.this
        generator = expression.args.get("generator")

        if generator:
            if isinstance(generator, exp.Rand):
                # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself
                seed_value = generator.this or generator
            else:
                # Const/int or other expression - use as seed directly
                seed_value = generator
        else:
            # No generator specified, use default seed (arbitrary but deterministic)
            seed_value = exp.Literal.number(RANDSTR_SEED)

        replacements = {"seed": seed_value, "length": length}
        return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"

    def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str:
        """
        Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling.
        Uses a pre-parsed template with placeholders replaced by expression nodes.
        """
        s = expression.this
        n = expression.args["elementcount"]
        gen = expression.args["gen"]

        if not isinstance(gen, exp.Rand):
            # (ABS(HASH(seed)) % 1000000) / 1000000.0
            random_expr: exp.Expression = exp.Div(
                this=exp.Paren(
                    this=exp.Mod(
                        this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])),
                        expression=exp.Literal.number(1000000),
                    )
                ),
                expression=exp.Literal.number(1000000.0),
            )
        else:
            # Use RANDOM() for non-deterministic output
            random_expr = exp.Rand()

        replacements = {"s": s, "n": n, "random_expr": random_expr}
        return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})"

    def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str:
        """
        TO_BINARY and TRY_TO_BINARY transpilation:
        - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50')
        - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST')
        - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==')

        For TRY_TO_BINARY (safe=True), wrap with TRY():
        - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid'))
        """
        value = expression.this
        format_arg = expression.args.get("format")
        is_safe = expression.args.get("safe")
        is_binary = _is_binary(expression)

        if not format_arg and not is_binary:
            func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY"
            return self.func(func_name, value)

        # Snowflake defaults to HEX encoding when no format is specified
        fmt = format_arg.name.upper() if format_arg else "HEX"

        if fmt in ("UTF-8", "UTF8"):
            # DuckDB ENCODE always uses UTF-8, no charset parameter needed
            result = self.func("ENCODE", value)
        elif fmt == "BASE64":
            result = self.func("FROM_BASE64", value)
        elif fmt == "HEX":
            result = self.func("UNHEX", value)
        else:
            if is_safe:
                return self.sql(exp.null())
            else:
                self.unsupported(f"format {fmt} is not supported")
                result = self.func("TO_BINARY", value)
        return f"TRY({result})" if is_safe else result

    def _greatest_least_sql(
        self: DuckDB.Generator, expression: exp.Greatest | exp.Least
    ) -> str:
        """
        Handle GREATEST/LEAST functions with dialect-aware NULL behavior.

        - If ignore_nulls=False (BigQuery-style): return NULL if any argument is NULL
        - If ignore_nulls=True (DuckDB/PostgreSQL-style): ignore NULLs, return greatest/least non-NULL value
        """
        # Get all arguments
        all_args = [expression.this, *expression.expressions]
        fallback_sql = self.function_fallback_sql(expression)

        if expression.args.get("ignore_nulls"):
            # DuckDB/PostgreSQL behavior: use native GREATEST/LEAST (ignores NULLs)
            return self.sql(fallback_sql)

        # return NULL if any argument is NULL
        case_expr = exp.case().when(
            exp.or_(*[arg.is_(exp.null()) for arg in all_args], copy=False),
            exp.null(),
            copy=False,
        )
        case_expr.set("default", fallback_sql)
        return self.sql(case_expr)

    def generator_sql(self, expression: exp.Generator) -> str:
        # Transpile Snowflake GENERATOR to DuckDB range()
        rowcount = expression.args.get("rowcount")
        time_limit = expression.args.get("time_limit")

        if time_limit:
            self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB")

        if not rowcount:
            # Emit an empty range so the output is still valid SQL
            self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB")
            return self.func("range", exp.Literal.number(0))

        return self.func("range", rowcount)

    def greatest_sql(self: DuckDB.Generator, expression: exp.Greatest) -> str:
        return self._greatest_least_sql(expression)

    def least_sql(self: DuckDB.Generator, expression: exp.Least) -> str:
        return self._greatest_least_sql(expression)

    def lambda_sql(
        self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True
    ) -> str:
        # DuckDB's alternative lambda syntax: LAMBDA x: x + 1 (colon instead of arrow)
        if expression.args.get("colon"):
            prefix = "LAMBDA "
            arrow_sep = ":"
            wrap = False
        else:
            prefix = ""

        lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap)
        return f"{prefix}{lambda_sql}"

    def show_sql(self, expression: exp.Show) -> str:
        return f"SHOW {expression.name}"

    def install_sql(self, expression: exp.Install) -> str:
        force = "FORCE " if expression.args.get("force") else ""
        this = self.sql(expression, "this")
        from_clause = expression.args.get("from_")
        from_clause = f" FROM {from_clause}" if from_clause else ""
        return f"{force}INSTALL {this}{from_clause}"

    def approxtopk_sql(self, expression: exp.ApproxTopK) -> str:
        self.unsupported(
            "APPROX_TOP_K cannot be transpiled to DuckDB due to incompatible return types. "
        )
        return self.function_fallback_sql(expression)

    def fromiso8601timestamp_sql(self, expression: exp.FromISO8601Timestamp) -> str:
        return self.sql(exp.cast(expression.this, exp.DataType.Type.TIMESTAMPTZ))

    def strtotime_sql(self, expression: exp.StrToTime) -> str:
        # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants)
        target_type = expression.args.get("target_type")
        needs_tz = target_type and target_type.this in (
            exp.DataType.Type.TIMESTAMPLTZ,
            exp.DataType.Type.TIMESTAMPTZ,
        )

        if expression.args.get("safe"):
            # Safe variant: TRY_STRPTIME instead of STRPTIME
            formatted_time = self.format_time(expression)
            cast_type = (
                exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP
            )
            return self.sql(
                exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type)
            )

        base_sql = str_to_time_sql(self, expression)
        if needs_tz:
            return self.sql(
                exp.cast(
                    base_sql,
                    exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ),
                )
            )
        return base_sql

    def strtodate_sql(self, expression: exp.StrToDate) -> str:
        # STRPTIME (or TRY_STRPTIME for the safe variant) parses to a timestamp; cast to DATE
        formatted_time = self.format_time(expression)
        function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME"
        return self.sql(
            exp.cast(
                self.func(function_name, expression.this, formatted_time),
                exp.DataType(this=exp.DataType.Type.DATE),
            )
        )

    def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str:
        this = expression.this
        time_format = self.format_time(expression)
        safe = expression.args.get("safe")
        time_type = exp.DataType.build("TIME", dialect="duckdb")
        # Safe variant emits TRY_CAST / TRY_STRPTIME instead of CAST / STRPTIME
        cast_expr = exp.TryCast if safe else exp.Cast

        if time_format:
            func_name = "TRY_STRPTIME" if safe else "STRPTIME"
            strptime = exp.Anonymous(this=func_name, expressions=[this, time_format])
            return self.sql(cast_expr(this=strptime, to=time_type))

        # Already a TIME value - no conversion needed
        if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME):
            return self.sql(this)

        return self.sql(cast_expr(this=this, to=time_type))

    def currentdate_sql(self, expression: exp.CurrentDate) -> str:
        if not expression.this:
            return "CURRENT_DATE"

        # CURRENT_DATE with an explicit timezone: take "now" in that zone and cast to DATE
        expr = exp.Cast(
            this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this),
            to=exp.DataType(this=exp.DataType.Type.DATE),
        )
        return self.sql(expr)

    def parsejson_sql(self, expression: exp.ParseJSON) -> str:
        arg = expression.this
        # Safe mode: emit NULL when the input is not valid JSON instead of erroring
        if expression.args.get("safe"):
            return self.sql(exp.case().when(exp.func("json_valid", arg), arg).else_(exp.null()))
        return self.func("JSON", arg)

    @unsupported_args("decimals")
    def trunc_sql(self, expression: exp.Trunc) -> str:
        return self.func("TRUNC", expression.this)

    def normal_sql(self, expression: exp.Normal) -> str:
        """
        Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.

        Uses the Box-Muller transform via NORMAL_TEMPLATE.
        """
        mean = expression.this
        stddev = expression.args["stddev"]
        gen: exp.Expression = expression.args["gen"]

        # Build two uniform random values [0, 1) for Box-Muller transform
        if isinstance(gen, exp.Rand) and gen.this is None:
            u1: exp.Expression = exp.Rand()
            u2: exp.Expression = exp.Rand()
        else:
            # Seeded: derive two values using HASH with different inputs
            seed = gen.this if isinstance(gen, exp.Rand) else gen
            u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed)
            u2 = exp.replace_placeholders(
                self.SEEDED_RANDOM_TEMPLATE,
                seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)),
            )

        replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2}
        return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))

    def uniform_sql(self, expression: exp.Uniform) -> str:
        """
        Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.

        UNIFORM returns a random value in [min, max]:
        - Integer result if both min and max are integers
        - Float result if either min or max is a float
        """
        min_val = expression.this
        max_val = expression.expression
        gen = expression.args.get("gen")

        # Determine if result should be integer (both bounds are integers).
        # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT
        is_int_result = min_val.is_int and max_val.is_int

        # Build the random value expression [0, 1)
        if not isinstance(gen, exp.Rand):
            # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0
            random_expr: exp.Expression = exp.Div(
                this=exp.Paren(
                    this=exp.Mod(
                        this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])),
                        expression=exp.Literal.number(1000000),
                    )
                ),
                expression=exp.Literal.number(1000000.0),
            )
        else:
            random_expr = exp.Rand()

        # Build: min + random * (max - min [+ 1 for int])
        range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val)
        if is_int_result:
            range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1))

        result: exp.Expression = exp.Add(
            this=min_val,
            expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)),
        )

        if is_int_result:
            # FLOOR + CAST so integer bounds produce an integer result
            result = exp.Cast(
                this=exp.Floor(this=result),
                to=exp.DataType.build("BIGINT"),
            )

        return self.sql(result)

    def timefromparts_sql(self, expression: exp.TimeFromParts) -> str:
        nano = expression.args.get("nano")
        overflow = expression.args.get("overflow")

        # Snowflake's TIME_FROM_PARTS supports overflow
        if overflow:
            hour = expression.args["hour"]
            minute = expression.args["min"]
            sec = expression.args["sec"]

            # Check if values are within normal ranges - use MAKE_TIME for efficiency
            if not nano and all(arg.is_int for arg in [hour, minute, sec]):
                try:
                    h_val = hour.to_py()
                    m_val = minute.to_py()
                    s_val = sec.to_py()
                    if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59:
                        return rename_func("MAKE_TIME")(self, expression)
                except ValueError:
                    pass

            # Overflow or nanoseconds detected - use INTERVAL arithmetic
            if nano:
                # Fold nanoseconds into the seconds term (pop detaches the arg)
                sec = sec + nano.pop() / exp.Literal.number(1000000000.0)

            total_seconds = (
                hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec
            )

            # TIME '00:00:00' + INTERVAL total_seconds SECOND
            return self.sql(
                exp.Add(
                    this=exp.Cast(
                        this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME")
                    ),
                    expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")),
                )
            )

        # Default: MAKE_TIME
        if nano:
            expression.set(
                "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0)
            )

        return rename_func("MAKE_TIME")(self, expression)

    def extract_sql(self, expression: exp.Extract) -> str:
        """
        Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.

        DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND,
        EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
        """
        this = expression.this
        datetime_expr = expression.expression

        # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB
        # because Snowflake applies server timezone while DuckDB uses local timezone
        if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ):
            self.unsupported(
                "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences"
            )

        part_name = this.name.upper()

        if part_name in self.EXTRACT_STRFTIME_MAPPINGS:
            fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name]

            # Problem: strftime doesn't accept TIME and there's no NANOSECOND function
            # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000
            is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type(
                exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
            )

            if is_nano_time:
                self.unsupported(
                    "Parameter NANOSECOND is not supported with TIME type in DuckDB"
                )
                return self.sql(
                    exp.cast(
                        exp.Mul(
                            this=exp.Extract(
                                this=exp.var("MICROSECOND"), expression=datetime_expr
                            ),
                            expression=exp.Literal.number(1000),
                        ),
                        exp.DataType.build(cast_type, dialect="duckdb"),
                    )
                )

            # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
            strftime_input = datetime_expr
            if part_name == "NANOSECOND":
                strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS)

            return self.sql(
                exp.cast(
                    exp.Anonymous(
                        this="STRFTIME",
                        expressions=[strftime_input, exp.Literal.string(fmt)],
                    ),
                    exp.DataType.build(cast_type, dialect="duckdb"),
                )
            )

        if part_name in self.EXTRACT_EPOCH_MAPPINGS:
            func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
            result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr])
            # EPOCH returns float, cast to BIGINT for integer result
            if part_name == "EPOCH_SECOND":
                result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb"))
            return self.sql(result)

        return super().extract_sql(expression)

    def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
        # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
        date_expr = expression.this
        time_expr = expression.expression

        if date_expr is not None and time_expr is not None:
            # In DuckDB, DATE + TIME produces TIMESTAMP
            return self.sql(exp.Add(this=date_expr, expression=time_expr))

        # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
        sec = expression.args.get("sec")
        if sec is None:
            # This shouldn't happen with valid input, but handle gracefully
            return rename_func("MAKE_TIMESTAMP")(self, expression)

        # Fold fractional-second components into `sec`; pop() detaches them so they
        # are not also emitted as separate MAKE_TIMESTAMP arguments
        milli = expression.args.get("milli")
        if milli is not None:
            sec += milli.pop() / exp.Literal.number(1000.0)

        nano = expression.args.get("nano")
        if nano is not None:
            sec += nano.pop() / exp.Literal.number(1000000000.0)

        if milli or nano:
            expression.set("sec", sec)

        return rename_func("MAKE_TIMESTAMP")(self, expression)

    @unsupported_args("nano")
    def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str:
        # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)
        return f"CAST({timestamp} AS TIMESTAMPTZ)"

    @unsupported_args("nano")
    def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str:
        # Extract zone before popping
        zone = expression.args.get("zone")
        # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts
        if zone:
            zone = zone.pop()

        if nano := expression.args.get("nano"):
            nano.pop()

        timestamp = rename_func("MAKE_TIMESTAMP")(self, expression)

        if zone:
            # Use AT TIME ZONE to apply the explicit timezone
            return f"{timestamp} AT TIME ZONE {self.sql(zone)}"

        return timestamp

    def tablesample_sql(
        self,
        expression: exp.TableSample,
        tablesample_keyword: t.Optional[str] = None,
    ) -> str:
        if not isinstance(expression.parent, exp.Select):
            # This sample clause only applies to a single source, not the entire resulting relation
            tablesample_keyword = "TABLESAMPLE"

        if expression.args.get("size"):
            method = expression.args.get("method")
            if method and method.name.upper() != "RESERVOIR":
                self.unsupported(
                    f"Sampling method {method} is not supported with a discrete sample count, "
                    "defaulting to reservoir sampling"
                )
                expression.set("method", exp.var("RESERVOIR"))

        return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)

    def columndef_sql(self, expression: exp.ColumnDef, sep: str = " ") -> str:
        # Inside a UDF signature, emit only the column name
        if isinstance(expression.parent, exp.UserDefinedFunction):
            return self.sql(expression, "this")
        return super().columndef_sql(expression, sep)

    def join_sql(self, expression: exp.Join) -> str:
        if (
            not expression.args.get("using")
            and not expression.args.get("on")
            and not expression.method
            and (expression.kind in ("", "INNER", "OUTER"))
        ):
            # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause
            # DuckDB doesn't, but we can just add a dummy ON clause that is always true
            if isinstance(expression.this, exp.Unnest):
                return super().join_sql(expression.on(exp.true()))

            expression.set("side", None)
            expression.set("kind", None)

        return super().join_sql(expression)

    def generateseries_sql(self, expression: exp.GenerateSeries) -> str:
        # GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b)
        if expression.args.get("is_end_exclusive"):
            return rename_func("RANGE")(self, expression)

        return self.function_fallback_sql(expression)

    def countif_sql(self, expression: exp.CountIf) -> str:
        # COUNT_IF is native from DuckDB 1.2 onwards; emulate with SUM before that
        if self.dialect.version >= (1, 2):
            return self.function_fallback_sql(expression)

        # https://github.com/tobymao/sqlglot/pull/4749
        return count_if_to_sum(self, expression)

    def bracket_sql(self, expression: exp.Bracket) -> str:
        if self.dialect.version >= (1, 2):
            return super().bracket_sql(expression)

        # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
        this = expression.this
        if isinstance(this, exp.Array):
            this.replace(exp.paren(this))

        bracket = super().bracket_sql(expression)

        if not expression.args.get("returns_list_for_maps"):
            if not this.type:
                from sqlglot.optimizer.annotate_types import annotate_types

                this = annotate_types(this, dialect=self.dialect)

            if this.is_type(exp.DataType.Type.MAP):
                # Map bracket access on DuckDB < 1.2: unwrap the single-element list
                bracket = f"({bracket})[1]"

        return bracket

    def withingroup_sql(self, expression: exp.WithinGroup) -> str:
        func = expression.this

        # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP
        # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
        if isinstance(func, exp.ArrayAgg):
            if not isinstance(order := expression.expression, exp.Order):
                return self.sql(func)

            # Save the original column for FILTER clause (before wrapping with Order)
            original_this = func.this

            # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order
            # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
            func.set(
                "this",
                exp.Order(
                    this=func.this.copy(),
                    expressions=order.expressions,
                ),
            )

            # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed
            # Use original_this (not the Order-wrapped version) for the FILTER condition
            array_agg_sql = self.function_fallback_sql(func)
            return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)

        # For other functions (like PERCENTILES), use existing logic
        expression_sql = self.sql(expression, "expression")

        if isinstance(func, exp.PERCENTILES):
            # Make the order key the first arg and slide the fraction to the right
            # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
            order_col = expression.find(exp.Ordered)
            if order_col:
                func.set("expression", func.this)
                func.set("this", order_col.this)

        this = self.sql(expression, "this").rstrip(")")

        return f"{this}{expression_sql})"

    def length_sql(self, expression: exp.Length) -> str:
        arg = expression.this

        # Dialects like BQ and Snowflake also accept binary values as args, so
        # DDB will attempt to infer the type or resort to case/when resolution
        if not expression.args.get("binary") or arg.is_string:
            return self.func("LENGTH", arg)

        if not arg.type:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.is_type(*exp.DataType.TEXT_TYPES):
            return self.func("LENGTH", arg)

        # We need these casts to make duckdb's static type checker happy
        blob = exp.cast(arg, exp.DataType.Type.VARBINARY)
        varchar = exp.cast(arg, exp.DataType.Type.VARCHAR)

        # Resolve BLOB vs text at runtime via TYPEOF when static inference failed
        case = (
            exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg]))
            .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob))
            .else_(exp.Anonymous(this="LENGTH", expressions=[varchar]))
        )
        return self.sql(case)

    def _validate_regexp_flags(
        self, flags: t.Optional[exp.Expression], supported_flags: str
    ) -> t.Optional[str]:
        """
        Validate and filter regexp flags for DuckDB compatibility.

        Args:
            flags: The flags expression to validate
            supported_flags: String of supported flags (e.g., "ims", "cims").
                Only these flags will be returned.
3240 3241 Returns: 3242 Validated/filtered flag string, or None if no valid flags remain 3243 """ 3244 if not isinstance(flags, exp.Expression): 3245 return None 3246 3247 if not flags.is_string: 3248 self.unsupported("Non-literal regexp flags are not fully supported in DuckDB") 3249 return None 3250 3251 flag_str = flags.this 3252 unsupported = set(flag_str) - set(supported_flags) 3253 3254 if unsupported: 3255 self.unsupported( 3256 f"Regexp flags {sorted(unsupported)} are not supported in this context" 3257 ) 3258 3259 flag_str = "".join(f for f in flag_str if f in supported_flags) 3260 return flag_str if flag_str else None 3261 3262 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3263 this = expression.this 3264 pattern = expression.expression 3265 position = expression.args.get("position") 3266 parameters = expression.args.get("parameters") 3267 3268 # Validate flags - only "ims" flags are supported for embedded patterns 3269 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3270 3271 if position: 3272 this = exp.Substring(this=this, start=position) 3273 3274 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3275 if validated_flags: 3276 pattern = exp.Concat( 3277 expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern] 3278 ) 3279 3280 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3281 result = ( 3282 exp.case() 3283 .when( 3284 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3285 exp.Literal.number(0), 3286 ) 3287 .else_( 3288 exp.Length( 3289 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3290 ) 3291 ) 3292 ) 3293 3294 return self.sql(result) 3295 3296 def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str: 3297 subject = expression.this 3298 pattern = expression.expression 3299 replacement = expression.args.get("replacement") or exp.Literal.string("") 3300 position = 
expression.args.get("position") 3301 occurrence = expression.args.get("occurrence") 3302 modifiers = expression.args.get("modifiers") 3303 3304 validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or "" 3305 3306 # Handle occurrence (only literals supported) 3307 if occurrence and not occurrence.is_int: 3308 self.unsupported("REGEXP_REPLACE with non-literal occurrence") 3309 else: 3310 occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0 3311 if occurrence > 1: 3312 self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported") 3313 # flag duckdb to do either all or none, single_replace check is for duckdb round trip 3314 elif ( 3315 occurrence == 0 3316 and "g" not in validated_flags 3317 and not expression.args.get("single_replace") 3318 ): 3319 validated_flags += "g" 3320 3321 # Handle position (only literals supported) 3322 prefix = None 3323 if position and not position.is_int: 3324 self.unsupported("REGEXP_REPLACE with non-literal position") 3325 elif position and position.is_int and position.to_py() > 1: 3326 pos = position.to_py() 3327 prefix = exp.Substring( 3328 this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1) 3329 ) 3330 subject = exp.Substring(this=subject, start=exp.Literal.number(pos)) 3331 3332 result: exp.Expression = exp.Anonymous( 3333 this="REGEXP_REPLACE", 3334 expressions=[ 3335 subject, 3336 pattern, 3337 replacement, 3338 exp.Literal.string(validated_flags) if validated_flags else None, 3339 ], 3340 ) 3341 3342 if prefix: 3343 result = exp.Concat(expressions=[prefix, result]) 3344 3345 return self.sql(result) 3346 3347 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3348 this = expression.this 3349 pattern = expression.expression 3350 flag = expression.args.get("flag") 3351 3352 if not expression.args.get("full_match"): 3353 return self.func("REGEXP_MATCHES", this, pattern, flag) 3354 3355 # DuckDB REGEXP_MATCHES supports: c, i, m, s (but 
not 'e') 3356 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3357 3358 anchored_pattern = exp.Concat( 3359 expressions=[ 3360 exp.Literal.string("^("), 3361 exp.Paren(this=pattern), 3362 exp.Literal.string(")$"), 3363 ] 3364 ) 3365 3366 if validated_flags: 3367 flag = exp.Literal.string(validated_flags) 3368 3369 return self.func("REGEXP_MATCHES", this, anchored_pattern, flag) 3370 3371 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3372 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3373 this = expression.this 3374 expr = expression.expression 3375 max_dist = expression.args.get("max_dist") 3376 3377 if max_dist is None: 3378 return self.func("LEVENSHTEIN", this, expr) 3379 3380 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3381 levenshtein = exp.Levenshtein(this=this, expression=expr) 3382 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist])) 3383 3384 def pad_sql(self, expression: exp.Pad) -> str: 3385 """ 3386 Handle RPAD/LPAD for VARCHAR and BINARY types. 
3387 3388 For VARCHAR: Delegate to parent class 3389 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3390 """ 3391 string_arg = expression.this 3392 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3393 3394 if _is_binary(string_arg) or _is_binary(fill_arg): 3395 length_arg = expression.expression 3396 is_left = expression.args.get("is_left") 3397 3398 input_len = exp.ByteLength(this=string_arg) 3399 chars_needed = length_arg - input_len 3400 pad_count = exp.Greatest( 3401 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3402 ) 3403 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3404 3405 left, right = string_arg, repeat_expr 3406 if is_left: 3407 left, right = right, left 3408 3409 result = exp.DPipe(this=left, expression=right) 3410 return self.sql(result) 3411 3412 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3413 return super().pad_sql(expression) 3414 3415 def minhash_sql(self, expression: exp.Minhash) -> str: 3416 k = expression.this 3417 exprs = expression.expressions 3418 3419 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3420 self.unsupported( 3421 "MINHASH with multiple expressions or * requires manual query restructuring" 3422 ) 3423 return self.func("MINHASH", k, *exprs) 3424 3425 expr = exprs[0] 3426 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3427 return f"({self.sql(result)})" 3428 3429 def minhashcombine_sql(self, expression: exp.MinhashCombine) -> str: 3430 expr = expression.this 3431 result = exp.replace_placeholders(self.MINHASH_COMBINE_TEMPLATE.copy(), expr=expr) 3432 return f"({self.sql(result)})" 3433 3434 def approximatesimilarity_sql(self, expression: exp.ApproximateSimilarity) -> str: 3435 expr = expression.this 3436 result = exp.replace_placeholders( 3437 self.APPROXIMATE_SIMILARITY_TEMPLATE.copy(), expr=expr 3438 ) 3439 return f"({self.sql(result)})" 3440 3441 
    def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str:
        """Generate LIST_DISTINCT, optionally re-appending a NULL the source had."""
        arr = expression.this
        func = self.func("LIST_DISTINCT", arr)

        if expression.args.get("check_null"):
            # DuckDB's LIST_DISTINCT drops NULLs; if the source contained any
            # (detected by comparing raw size vs LIST_COUNT), append one back.
            add_null_to_array = exp.func(
                "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null()
            )
            return self.sql(
                exp.If(
                    this=exp.NEQ(
                        this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr)
                    ),
                    true=add_null_to_array,
                    false=func,
                )
            )

        return func

    def arrayexcept_sql(self, expression: exp.ArrayExcept) -> str:
        """Expand ARRAY_EXCEPT via the pre-parsed ARRAY_EXCEPT_TEMPLATE."""
        source = expression.this
        exclude = expression.expression

        replacements = {"source": source, "exclude": exclude}
        return self.sql(exp.replace_placeholders(self.ARRAY_EXCEPT_TEMPLATE, **replacements))

    def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
        """Emulate ARRAYS_ZIP by transforming over the longest input array."""
        args = expression.expressions

        if not args:
            # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
            return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))

        # Build placeholder values for template
        lengths = [exp.Length(this=arg) for arg in args]
        max_len = (
            lengths[0]
            if len(lengths) == 1
            else exp.Greatest(this=lengths[0], expressions=lengths[1:])
        )

        # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...}
        empty_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
                for i in range(len(args))
            ],
        )

        # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
        # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax
        index = exp.column("__i") + 1
        transform_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(
                    this=exp.Literal.string(f"${i + 1}"),
                    expression=exp.func("COALESCE", arg, exp.array())[index],
                )
                for i, arg in enumerate(args)
            ],
        )

        result = exp.replace_placeholders(
            self.ARRAYS_ZIP_TEMPLATE.copy(),
            null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
            all_empty_check=exp.and_(
                *[
                    exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
                    for arg in args
                ]
            ),
            empty_struct=empty_struct,
            max_len=max_len,
            transform_struct=transform_struct,
        )
        return self.sql(result)

    def lower_sql(self, expression: exp.Lower) -> str:
        result_sql = self.func("LOWER", _cast_to_varchar(expression.this))
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def upper_sql(self, expression: exp.Upper) -> str:
        result_sql = self.func("UPPER", _cast_to_varchar(expression.this))
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def reverse_sql(self, expression: exp.Reverse) -> str:
        result_sql = self.func("REVERSE", _cast_to_varchar(expression.this))
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def base64encode_sql(self, expression: exp.Base64Encode) -> str:
        # DuckDB TO_BASE64 requires BLOB input
        # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
        # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
        result = expression.this

        # Check if input is a string type - ENCODE only accepts VARCHAR
        if result.is_type(*exp.DataType.TEXT_TYPES):
            result = exp.Encode(this=result)

        result = exp.ToBase64(this=result)

        max_line_length = expression.args.get("max_line_length")
        alphabet = expression.args.get("alphabet")

        # Handle custom alphabet by replacing standard chars with custom ones
        result = _apply_base64_alphabet_replacements(result, alphabet)

        # Handle max_line_length by inserting newlines every N characters
        line_length = (
            t.cast(int, max_line_length.to_py())
            if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
            else 0
        )
        if line_length > 0:
            newline = exp.Chr(expressions=[exp.Literal.number(10)])
            # Insert '\n' after every `line_length` chars, then trim the
            # trailing newline a final full-width group would leave behind
            result = exp.Trim(
                this=exp.RegexpReplace(
                    this=result,
                    expression=exp.Literal.string(f"(.{{{line_length}}})"),
                    replacement=exp.Concat(
                        expressions=[exp.Literal.string("\\1"), newline.copy()]
                    ),
                ),
                expression=newline,
                position="TRAILING",
            )

        return self.sql(result)

    def replace_sql(self, expression: exp.Replace) -> str:
        result_sql = self.func(
            "REPLACE",
            _cast_to_varchar(expression.this),
            _cast_to_varchar(expression.expression),
            _cast_to_varchar(expression.args.get("replacement")),
        )
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def _bitwise_op(self, expression: exp.Binary, op: str) -> str:
        """Shared path for binary bitwise operators on BIT-cast operands."""
        _prepare_binary_bitwise_args(expression)
        result_sql = self.binary(expression, op)
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def bitwisexor_sql(self, expression: exp.BitwiseXor) -> str:
        # DuckDB spells bitwise XOR as the XOR() function, not an operator
        _prepare_binary_bitwise_args(expression)
        result_sql = self.func("XOR", expression.this, expression.expression)
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
        this = expression.this
        key = expression.args.get("key")
        key_sql = key.name if isinstance(key, exp.Expression) else ""
        value_sql = self.sql(expression, "value")

        kv_sql = f"{key_sql} := {value_sql}"

        # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value) from Snowflake
        # then we can generate STRUCT_PACK which will build it since STRUCT_INSERT({}, key := value) is not valid DuckDB
        if isinstance(this, exp.Struct) and not this.expressions:
            return self.func("STRUCT_PACK", kv_sql)

        return self.func("STRUCT_INSERT", this, kv_sql)

    def mapcat_sql(self, expression: exp.MapCat) -> str:
        """Expand MAP_CAT via the pre-parsed MAPCAT_TEMPLATE."""
        result = exp.replace_placeholders(
            self.MAPCAT_TEMPLATE.copy(),
            map1=expression.this,
            map2=expression.expression,
        )
        return self.sql(result)

    def mapcontainskey_sql(self, expression: exp.MapContainsKey) -> str:
        return self.func(
            "ARRAY_CONTAINS", exp.func("MAP_KEYS", expression.args["key"]), expression.this
        )

    def startswith_sql(self, expression: exp.StartsWith) -> str:
        return self.func(
            "STARTS_WITH",
            _cast_to_varchar(expression.this),
            _cast_to_varchar(expression.expression),
        )

    def space_sql(self, expression: exp.Space) -> str:
        # DuckDB's REPEAT requires BIGINT for the count parameter
        return self.sql(
            exp.Repeat(
                this=exp.Literal.string(" "),
                times=exp.cast(expression.this, exp.DataType.Type.BIGINT),
            )
        )

    def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
        # For GENERATOR, unwrap TABLE() - just emit the Generator (becomes RANGE)
        if isinstance(expression.this, exp.Generator):
            # Preserve alias, joins, and other table-level args
            table = exp.Table(
                this=expression.this,
                alias=expression.args.get("alias"),
                joins=expression.args.get("joins"),
            )
            return self.sql(table)

        return super().tablefromrows_sql(expression)

    def unnest_sql(self, expression: exp.Unnest) -> str:
        explode_array = expression.args.get("explode_array")
        if explode_array:
            # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct
            # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))"
            expression.expressions.append(
                exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2))
            )

            # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB
            alias = expression.args.get("alias")
            if isinstance(alias, exp.TableAlias):
                expression.set("alias", None)
                if alias.columns:
                    alias = exp.TableAlias(this=seq_get(alias.columns, 0))

            unnest_sql = super().unnest_sql(expression)
            select = exp.Select(expressions=[unnest_sql]).subquery(alias)
            return self.sql(select)

        return super().unnest_sql(expression)

    def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
        this = expression.this

        if isinstance(this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
            # DuckDB should render IGNORE NULLS only for the general-purpose
            # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...)
            return super().ignorenulls_sql(expression)

        if isinstance(this, exp.First):
            this = exp.AnyValue(this=this.this)

        if not isinstance(this, (exp.AnyValue, exp.ApproxQuantiles)):
            self.unsupported("IGNORE NULLS is not supported for non-window functions.")

        return self.sql(this)

    def respectnulls_sql(self, expression: exp.RespectNulls) -> str:
        if isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
            # DuckDB should render RESPECT NULLS only for the general-purpose
            # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...)
            return super().respectnulls_sql(expression)

        self.unsupported("RESPECT NULLS is not supported for non-window functions.")
        return self.sql(expression, "this")

    def arraytostring_sql(self, expression: exp.ArrayToString) -> str:
        this = self.sql(expression, "this")
        null_text = self.sql(expression, "null")

        if null_text:
            # Substitute the null-replacement text before joining, since
            # ARRAY_TO_STRING itself skips NULL elements
            this = f"LIST_TRANSFORM({this}, x -> COALESCE(x, {null_text}))"

        return self.func("ARRAY_TO_STRING", this, expression.expression)

    def _regexp_extract_sql(self, expression: exp.RegexpExtract | exp.RegexpExtractAll) -> str:
        """Shared lowering for REGEXP_EXTRACT / REGEXP_EXTRACT_ALL with
        position, occurrence, group and flag arguments."""
        this = expression.this
        group = expression.args.get("group")
        params = expression.args.get("parameters")
        position = expression.args.get("position")
        occurrence = expression.args.get("occurrence")
        null_if_pos_overflow = expression.args.get("null_if_pos_overflow")

        # Handle Snowflake's 'e' flag: it enables capture group extraction
        # In DuckDB, this is controlled by the group parameter directly
        if params and params.is_string and "e" in params.name:
            params = exp.Literal.string(params.name.replace("e", ""))

        validated_flags = self._validate_regexp_flags(params, supported_flags="cims")

        # Strip default group when no following params (DuckDB default is same as group=0)
        if (
            not validated_flags
            and group
            and group.name == str(self.dialect.REGEXP_EXTRACT_DEFAULT_GROUP)
        ):
            group = None
        flags_expr = exp.Literal.string(validated_flags) if validated_flags else None

        # use substring to handle position argument
        if position and (not position.is_int or position.to_py() > 1):
            this = exp.Substring(this=this, start=position)

        if null_if_pos_overflow:
            this = exp.Nullif(this=this, expression=exp.Literal.string(""))

        is_extract_all = isinstance(expression, exp.RegexpExtractAll)
        non_single_occurrence = occurrence and (
            not occurrence.is_int or occurrence.to_py() > 1
        )

        if is_extract_all or non_single_occurrence:
            name = "REGEXP_EXTRACT_ALL"
        else:
            name = "REGEXP_EXTRACT"

        result: exp.Expression = exp.Anonymous(
            this=name, expressions=[this, expression.expression, group, flags_expr]
        )

        # Array slicing for REGEXP_EXTRACT_ALL with occurrence
        if is_extract_all and non_single_occurrence:
            result = exp.Bracket(this=result, expressions=[exp.Slice(this=occurrence)])
        # ARRAY_EXTRACT for REGEXP_EXTRACT with occurrence > 1
        elif non_single_occurrence:
            result = exp.Anonymous(this="ARRAY_EXTRACT", expressions=[result, occurrence])

        return self.sql(result)

    def regexpextract_sql(self, expression: exp.RegexpExtract) -> str:
        return self._regexp_extract_sql(expression)

    def regexpextractall_sql(self, expression: exp.RegexpExtractAll) -> str:
        return self._regexp_extract_sql(expression)

    def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str:
        """Emulate REGEXP_INSTR by summing the lengths of splits and matches
        preceding the requested occurrence."""
        this = expression.this
        pattern = expression.expression
        position = expression.args.get("position")
        orig_occ = expression.args.get("occurrence")
        occurrence = orig_occ or exp.Literal.number(1)
        option = expression.args.get("option")
        parameters = expression.args.get("parameters")

        validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")
        if validated_flags:
            # Inline flags as a (?ims) prefix since the DuckDB helpers used
            # below take no flag argument
            pattern = exp.Concat(
                expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]
            )

        # Handle starting position offset
        pos_offset: exp.Expression = exp.Literal.number(0)
        if position and (not position.is_int or position.to_py() > 1):
            this = exp.Substring(this=this, start=position)
            pos_offset = position - exp.Literal.number(1)

        # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x)))
        def sum_lengths(func_name: str, end: exp.Expression) -> exp.Expression:
            lst = exp.Bracket(
                this=exp.Anonymous(this=func_name, expressions=[this, pattern]),
                expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)],
                offset=1,
            )
            transform = exp.Anonymous(
                this="LIST_TRANSFORM",
                expressions=[
                    lst,
                    exp.Lambda(
                        this=exp.Length(this=exp.to_identifier("x")),
                        expressions=[exp.to_identifier("x")],
                    ),
                ],
            )
            return exp.Coalesce(
                this=exp.Anonymous(this="LIST_SUM", expressions=[transform]),
                expressions=[exp.Literal.number(0)],
            )

        # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset
        base_pos: exp.Expression = (
            exp.Literal.number(1)
            + sum_lengths("STRING_SPLIT_REGEX", occurrence)
            + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1))
            + pos_offset
        )

        # option=1: add match length for end position
        if option and option.is_int and option.to_py() == 1:
            match_at_occ = exp.Bracket(
                this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]),
                expressions=[occurrence],
                offset=1,
            )
            base_pos = base_pos + exp.Coalesce(
                this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)]
            )

        # NULL checks for all provided arguments
        # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST
        null_args = [
            expression.this,
            expression.expression,
            position,
            orig_occ,
            option,
            parameters,
        ]
        null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg]

        matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])

        return self.sql(
            exp.case()
            .when(exp.or_(*null_checks), exp.Null())
            .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0))
            .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0))
            .else_(base_pos)
        )

    @unsupported_args("culture")
    def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
        fmt = expression.args.get("format")
        if fmt and fmt.is_int:
            return self.func("FORMAT", f"'{{:,.{fmt.name}f}}'", expression.this)

        self.unsupported("Only integer formats are supported by NumberToStr")
        return self.function_fallback_sql(expression)

    def autoincrementcolumnconstraint_sql(self, _) -> str:
        self.unsupported("The AUTOINCREMENT column constraint is not supported by DuckDB")
        return ""

    def aliases_sql(self, expression: exp.Aliases) -> str:
        this = expression.this
        if isinstance(this, exp.Posexplode):
            return self.posexplode_sql(this)

        return super().aliases_sql(expression)

    def posexplode_sql(self, expression: exp.Posexplode) -> str:
        this = expression.this
        parent = expression.parent

        # The default Spark aliases are "pos" and "col", unless specified otherwise
        pos, col = exp.to_identifier("pos"), exp.to_identifier("col")

        if isinstance(parent, exp.Aliases):
            # Column case: SELECT POSEXPLODE(col) [AS (a, b)]
            pos, col = parent.expressions
        elif isinstance(parent, exp.Table):
            # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)]
            alias = parent.args.get("alias")
            if alias:
                pos, col = alias.columns or [pos, col]
                alias.pop()

        # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS
        # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS
        unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col))
        gen_subscripts = self.sql(
            exp.Alias(
                this=exp.Anonymous(
                    this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)]
                )
                - exp.Literal.number(1),
                alias=pos,
            )
        )

        posexplode_sql = self.format_args(gen_subscripts, unnest_sql)

        if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)):
            # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...))
            return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql])))

        return posexplode_sql

    def addmonths_sql(self, expression: exp.AddMonths) -> str:
        """
        Handles three key issues:
        1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
        2. End-of-month preservation: If input is last day of month, result is last day of result month
        3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
        """
        from sqlglot.optimizer.annotate_types import annotate_types

        this = expression.this
        if not this.type:
            this = annotate_types(this, dialect=self.dialect)

        if this.is_type(*exp.DataType.TEXT_TYPES):
            this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP))

        # Detect float/decimal months to apply rounding (Snowflake behavior)
        # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
        months_expr = expression.expression
        if not months_expr.type:
            months_expr = annotate_types(months_expr, dialect=self.dialect)

        # Build interval or to_months expression based on type
        # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT))
        interval_or_to_months = (
            exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
            if months_expr.is_type(
                exp.DataType.Type.FLOAT,
                exp.DataType.Type.DOUBLE,
                exp.DataType.Type.DECIMAL,
            )
            # Integer case: standard INTERVAL N MONTH syntax
            else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
        )

        date_add_expr = exp.Add(this=this, expression=interval_or_to_months)

        # Apply end-of-month preservation if Snowflake flag is set
        # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
        preserve_eom = expression.args.get("preserve_end_of_month")
        result_expr = (
            exp.case()
            .when(
                exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
                exp.func("LAST_DAY", date_add_expr),
            )
            .else_(date_add_expr)
            if preserve_eom
            else date_add_expr
        )

        # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE
        # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type)
        # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ
        # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
        if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ):
            return self.sql(exp.Cast(this=result_expr, to=this.type))
        return self.sql(result_expr)

    def format_sql(self, expression: exp.Format) -> str:
        if expression.name.lower() == "%s" and len(expression.expressions) == 1:
            return self.func("FORMAT", "'{}'", expression.expressions[0])

        return self.function_fallback_sql(expression)

    def hexstring_sql(
        self, expression: exp.HexString, binary_function_repr: t.Optional[str] = None
    ) -> str:
        # UNHEX('FF') correctly produces blob \xFF in DuckDB
        return super().hexstring_sql(expression, binary_function_repr="UNHEX")

    def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
        unit = unit_to_str(expression)
        date = expression.this
        result = self.func("DATE_TRUNC", unit, date)

        if (
            expression.args.get("input_type_preserved")
            and date.is_type(*exp.DataType.TEMPORAL_TYPES)
            and not (is_date_unit(unit) and date.is_type(exp.DataType.Type.DATE))
        ):
            # Cast back since DuckDB's DATE_TRUNC may widen the result type
            return self.sql(exp.Cast(this=result, to=date.type))

        return result

    def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
        unit = unit_to_str(expression)
        zone = expression.args.get("zone")
        timestamp = expression.this
        date_unit = is_date_unit(unit)

        if date_unit and zone:
            # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
            # Double AT TIME ZONE needed for BigQuery compatibility:
            # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
            # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component)
            timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
            result_sql = self.func("DATE_TRUNC", unit, timestamp)
            return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))

        result = self.func("DATE_TRUNC", unit, timestamp)
        if expression.args.get("input_type_preserved"):
            if timestamp.type and timestamp.is_type(
                exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
            ):
                # TIME inputs: anchor to a dummy date so DATE_TRUNC applies,
                # then cast back to the original TIME type
                dummy_date = exp.Cast(
                    this=exp.Literal.string("1970-01-01"),
                    to=exp.DataType(this=exp.DataType.Type.DATE),
                )
                date_time = exp.Add(this=dummy_date, expression=timestamp)
                result = self.func("DATE_TRUNC", unit, date_time)
                return self.sql(exp.Cast(this=result, to=timestamp.type))

            if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
                date_unit and timestamp.is_type(exp.DataType.Type.DATE)
            ):
                return self.sql(exp.Cast(this=result, to=timestamp.type))

        return result

    def trim_sql(self, expression: exp.Trim) -> str:
        expression.this.replace(_cast_to_varchar(expression.this))
        if expression.expression:
            expression.expression.replace(_cast_to_varchar(expression.expression))

        result_sql = super().trim_sql(expression)
        return _gen_with_cast_to_blob(self, expression, result_sql)

    def round_sql(self, expression: exp.Round) -> str:
        this = expression.this
        decimals = expression.args.get("decimals")
        truncate = expression.args.get("truncate")

        # DuckDB requires the scale (decimals) argument to be an INT
        # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
        if decimals is not None and expression.args.get("casts_non_integer_decimals"):
            if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
                decimals = exp.cast(decimals, exp.DataType.Type.INT)

        func = "ROUND"
        if truncate:
            # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
            if truncate.this in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
                func = "ROUND_EVEN"
                truncate = None
            # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
            elif truncate.this in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
                truncate = None

        return self.func(func, this, decimals, truncate)

    def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
        result = self.func("APPROX_QUANTILE", expression.this, expression.args.get("quantile"))

        # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type
        if expression.is_type(*exp.DataType.REAL_TYPES):
            result = f"CAST({result} AS DOUBLE)"

        return result

    def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
        """
        BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
        dividing the input distribution into n equal-sized buckets.

        Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
        does not document the specific algorithm used so results may differ. DuckDB does not
        support RESPECT NULLS.
        """
        this = expression.this
        if isinstance(this, exp.Distinct):
            # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
            if len(this.expressions) < 2:
                self.unsupported("APPROX_QUANTILES requires a bucket count argument")
                return self.function_fallback_sql(expression)
            num_quantiles_expr = this.expressions[1].pop()
        else:
            num_quantiles_expr = expression.expression

        if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
            self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
            return self.function_fallback_sql(expression)

        num_quantiles = t.cast(int, num_quantiles_expr.to_py())
        if num_quantiles <= 0:
            self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
            return self.function_fallback_sql(expression)

        # n buckets -> n+1 evenly spaced quantile points from 0 to 1
        quantiles = [
            exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
            for i in range(num_quantiles + 1)
        ]

        return self.sql(
            exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
        )

    def jsonextractscalar_sql(self, expression: exp.JSONExtractScalar) -> str:
        if expression.args.get("scalar_only"):
            expression = exp.JSONExtractScalar(
                this=rename_func("JSON_VALUE")(self, expression), expression="'$'"
            )
        return _arrow_json_extract_sql(self, expression)

    def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
        this = expression.this

        if _is_binary(this):
            expression.type = exp.DataType.build("BINARY")

        arg = _cast_to_bit(this)

        # Parenthesize negations so ~-x does not parse ambiguously
        if isinstance(this, exp.Neg):
            arg = exp.Paren(this=arg)

        expression.set("this", arg)

        result_sql = f"~{self.sql(expression, 'this')}"

        return _gen_with_cast_to_blob(self, expression, result_sql)

    def window_sql(self, expression: exp.Window) -> str:
        this = expression.this
        if isinstance(this, exp.Corr) or (
            isinstance(this, exp.Filter) and isinstance(this.this, exp.Corr)
        ):
            return self._corr_sql(expression)

        return super().window_sql(expression)

    def filter_sql(self, expression: exp.Filter) -> str:
        if isinstance(expression.this, exp.Corr):
            return self._corr_sql(expression)

        return super().filter_sql(expression)

    def _corr_sql(
        self,
        expression: t.Union[exp.Filter, exp.Window, exp.Corr],
    ) -> str:
        """Wrap CORR in a NaN -> NULL guard when null_on_zero_variance is set."""
        if isinstance(expression, exp.Corr) and not expression.args.get(
            "null_on_zero_variance"
        ):
            return self.func("CORR", expression.this, expression.expression)

        corr_expr = _maybe_corr_null_to_false(expression)
        if corr_expr is None:
            if isinstance(expression, exp.Window):
                return super().window_sql(expression)
            if isinstance(expression, exp.Filter):
                return super().filter_sql(expression)
            corr_expr = expression  # make mypy happy

        return self.sql(exp.case().when(exp.IsNan(this=corr_expr), exp.null()).else_(corr_expr))
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True: Always quote except for specials cases. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a `WHERE` clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
2487 def timeslice_sql(self: DuckDB.Generator, expression: exp.TimeSlice) -> str: 2488 """ 2489 Transform Snowflake's TIME_SLICE to DuckDB's time_bucket. 2490 2491 Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) 2492 DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr) 2493 2494 For 'END' kind, add the interval to get the end of the slice. 2495 For DATE type with 'END', cast result back to DATE to preserve type. 2496 """ 2497 date_expr = expression.this 2498 slice_length = expression.expression 2499 unit = expression.unit 2500 kind = expression.text("kind").upper() 2501 2502 # Create INTERVAL expression: INTERVAL 'N' UNIT 2503 interval_expr = exp.Interval(this=slice_length, unit=unit) 2504 2505 # Create base time_bucket expression 2506 time_bucket_expr = exp.func("time_bucket", interval_expr, date_expr) 2507 2508 # Check if we need the end of the slice (default is start) 2509 if not kind == "END": 2510 # For 'START', return time_bucket directly 2511 return self.sql(time_bucket_expr) 2512 2513 # For 'END', add the interval to get end of slice 2514 add_expr = exp.Add(this=time_bucket_expr, expression=interval_expr.copy()) 2515 2516 # If input is DATE type, cast result back to DATE to preserve type 2517 # DuckDB converts DATE to TIMESTAMP when adding intervals 2518 if date_expr.is_type(exp.DataType.Type.DATE): 2519 return self.sql(exp.cast(add_expr, exp.DataType.Type.DATE)) 2520 2521 return self.sql(add_expr)
Transform Snowflake's TIME_SLICE to DuckDB's time_bucket.
Snowflake: TIME_SLICE(date_expr, slice_length, 'UNIT' [, 'START'|'END']) DuckDB: time_bucket(INTERVAL 'slice_length' UNIT, date_expr)
For 'END' kind, add the interval to get the end of the slice. For DATE type with 'END', cast result back to DATE to preserve type.
2523 def bitmapbucketnumber_sql( 2524 self: DuckDB.Generator, expression: exp.BitmapBucketNumber 2525 ) -> str: 2526 """ 2527 Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent. 2528 2529 Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where: 2530 - Each bucket covers 32,768 values 2531 - Bucket numbering starts at 1 2532 - Formula: ((value - 1) // 32768) + 1 for positive values 2533 2534 For non-positive values (0 and negative), we use value // 32768 to avoid 2535 producing bucket 0 or positive bucket IDs for negative inputs. 2536 """ 2537 value = expression.this 2538 2539 positive_formula = ((value - 1) // 32768) + 1 2540 non_positive_formula = value // 32768 2541 2542 # CASE WHEN value > 0 THEN ((value - 1) // 32768) + 1 ELSE value // 32768 END 2543 case_expr = ( 2544 exp.case() 2545 .when(exp.GT(this=value, expression=exp.Literal.number(0)), positive_formula) 2546 .else_(non_positive_formula) 2547 ) 2548 return self.sql(case_expr)
Transpile BITMAP_BUCKET_NUMBER function from Snowflake to DuckDB equivalent.
Snowflake's BITMAP_BUCKET_NUMBER returns a 1-based bucket identifier where:
- Each bucket covers 32,768 values
- Bucket numbering starts at 1
- Formula: ((value - 1) // 32768) + 1 for positive values
For non-positive values (0 and negative), we use value // 32768 to avoid producing bucket 0 or positive bucket IDs for negative inputs.
2550 def bitmapbitposition_sql(self: DuckDB.Generator, expression: exp.BitmapBitPosition) -> str: 2551 """ 2552 Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression. 2553 2554 Snowflake's BITMAP_BIT_POSITION behavior: 2555 - For n <= 0: returns ABS(n) % 32768 2556 - For n > 0: returns (n - 1) % 32768 (maximum return value is 32767) 2557 """ 2558 this = expression.this 2559 2560 return self.sql( 2561 exp.Mod( 2562 this=exp.Paren( 2563 this=exp.If( 2564 this=exp.GT(this=this, expression=exp.Literal.number(0)), 2565 true=this - exp.Literal.number(1), 2566 false=exp.Abs(this=this), 2567 ) 2568 ), 2569 expression=MAX_BIT_POSITION, 2570 ) 2571 )
Transpile Snowflake's BITMAP_BIT_POSITION to DuckDB CASE expression.
Snowflake's BITMAP_BIT_POSITION behavior:
- For n <= 0: returns ABS(n) % 32768
- For n > 0: returns (n - 1) % 32768 (maximum return value is 32767)
2573 def bitmapconstructagg_sql( 2574 self: DuckDB.Generator, expression: exp.BitmapConstructAgg 2575 ) -> str: 2576 """ 2577 Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. 2578 Uses a pre-parsed template with placeholders replaced by expression nodes. 2579 2580 Snowflake bitmap format: 2581 - Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes 2582 - Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian) 2583 """ 2584 arg = expression.this 2585 return f"({self.sql(exp.replace_placeholders(self.BITMAP_CONSTRUCT_AGG_TEMPLATE, arg=arg))})"
Transpile Snowflake's BITMAP_CONSTRUCT_AGG to DuckDB equivalent. Uses a pre-parsed template with placeholders replaced by expression nodes.
Snowflake bitmap format:
- Small (< 5 unique values): 2-byte count (big-endian) + values (little-endian) + padding to 10 bytes
- Large (>= 5 unique values): 10-byte header (0x08 + 9 zeros) + values (little-endian)
2587 def nthvalue_sql(self: DuckDB.Generator, expression: exp.NthValue) -> str: 2588 from_first = expression.args.get("from_first", True) 2589 if not from_first: 2590 self.unsupported("DuckDB's NTH_VALUE doesn't support starting from the end ") 2591 2592 return self.function_fallback_sql(expression)
2594 def randstr_sql(self: DuckDB.Generator, expression: exp.Randstr) -> str: 2595 """ 2596 Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. 2597 Uses a pre-parsed template with placeholders replaced by expression nodes. 2598 2599 RANDSTR(length, generator) generates a random string of specified length. 2600 - With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result) 2601 - With RANDOM(): Use RANDOM() in the hash for non-deterministic output 2602 - No generator: Use default seed value 2603 """ 2604 length = expression.this 2605 generator = expression.args.get("generator") 2606 2607 if generator: 2608 if isinstance(generator, exp.Rand): 2609 # If it's RANDOM(), use its seed if available, otherwise use RANDOM() itself 2610 seed_value = generator.this or generator 2611 else: 2612 # Const/int or other expression - use as seed directly 2613 seed_value = generator 2614 else: 2615 # No generator specified, use default seed (arbitrary but deterministic) 2616 seed_value = exp.Literal.number(RANDSTR_SEED) 2617 2618 replacements = {"seed": seed_value, "length": length} 2619 return f"({self.sql(exp.replace_placeholders(self.RANDSTR_TEMPLATE, **replacements))})"
Transpile Snowflake's RANDSTR to DuckDB equivalent using deterministic hash-based random. Uses a pre-parsed template with placeholders replaced by expression nodes.
RANDSTR(length, generator) generates a random string of specified length.
- With numeric seed: Use HASH(i + seed) for deterministic output (same seed = same result)
- With RANDOM(): Use RANDOM() in the hash for non-deterministic output
- No generator: Use default seed value
2621 def zipf_sql(self: DuckDB.Generator, expression: exp.Zipf) -> str: 2622 """ 2623 Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. 2624 Uses a pre-parsed template with placeholders replaced by expression nodes. 2625 """ 2626 s = expression.this 2627 n = expression.args["elementcount"] 2628 gen = expression.args["gen"] 2629 2630 if not isinstance(gen, exp.Rand): 2631 # (ABS(HASH(seed)) % 1000000) / 1000000.0 2632 random_expr: exp.Expression = exp.Div( 2633 this=exp.Paren( 2634 this=exp.Mod( 2635 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen.copy()])), 2636 expression=exp.Literal.number(1000000), 2637 ) 2638 ), 2639 expression=exp.Literal.number(1000000.0), 2640 ) 2641 else: 2642 # Use RANDOM() for non-deterministic output 2643 random_expr = exp.Rand() 2644 2645 replacements = {"s": s, "n": n, "random_expr": random_expr} 2646 return f"({self.sql(exp.replace_placeholders(self.ZIPF_TEMPLATE, **replacements))})"
Transpile Snowflake's ZIPF to DuckDB using CDF-based inverse sampling. Uses a pre-parsed template with placeholders replaced by expression nodes.
2648 def tobinary_sql(self: DuckDB.Generator, expression: exp.ToBinary) -> str: 2649 """ 2650 TO_BINARY and TRY_TO_BINARY transpilation: 2651 - 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50') 2652 - 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST') 2653 - 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==') 2654 2655 For TRY_TO_BINARY (safe=True), wrap with TRY(): 2656 - 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid')) 2657 """ 2658 value = expression.this 2659 format_arg = expression.args.get("format") 2660 is_safe = expression.args.get("safe") 2661 is_binary = _is_binary(expression) 2662 2663 if not format_arg and not is_binary: 2664 func_name = "TRY_TO_BINARY" if is_safe else "TO_BINARY" 2665 return self.func(func_name, value) 2666 2667 # Snowflake defaults to HEX encoding when no format is specified 2668 fmt = format_arg.name.upper() if format_arg else "HEX" 2669 2670 if fmt in ("UTF-8", "UTF8"): 2671 # DuckDB ENCODE always uses UTF-8, no charset parameter needed 2672 result = self.func("ENCODE", value) 2673 elif fmt == "BASE64": 2674 result = self.func("FROM_BASE64", value) 2675 elif fmt == "HEX": 2676 result = self.func("UNHEX", value) 2677 else: 2678 if is_safe: 2679 return self.sql(exp.null()) 2680 else: 2681 self.unsupported(f"format {fmt} is not supported") 2682 result = self.func("TO_BINARY", value) 2683 return f"TRY({result})" if is_safe else result
TO_BINARY and TRY_TO_BINARY transpilation:
- 'HEX': TO_BINARY('48454C50', 'HEX') → UNHEX('48454C50')
- 'UTF-8': TO_BINARY('TEST', 'UTF-8') → ENCODE('TEST')
- 'BASE64': TO_BINARY('SEVMUA==', 'BASE64') → FROM_BASE64('SEVMUA==')
For TRY_TO_BINARY (safe=True), wrap with TRY():
- 'HEX': TRY_TO_BINARY('invalid', 'HEX') → TRY(UNHEX('invalid'))
2711 def generator_sql(self, expression: exp.Generator) -> str: 2712 # Transpile Snowflake GENERATOR to DuckDB range() 2713 rowcount = expression.args.get("rowcount") 2714 time_limit = expression.args.get("time_limit") 2715 2716 if time_limit: 2717 self.unsupported("GENERATOR TIMELIMIT parameter is not supported in DuckDB") 2718 2719 if not rowcount: 2720 self.unsupported("GENERATOR without ROWCOUNT is not supported in DuckDB") 2721 return self.func("range", exp.Literal.number(0)) 2722 2723 return self.func("range", rowcount)
2731 def lambda_sql( 2732 self, expression: exp.Lambda, arrow_sep: str = "->", wrap: bool = True 2733 ) -> str: 2734 if expression.args.get("colon"): 2735 prefix = "LAMBDA " 2736 arrow_sep = ":" 2737 wrap = False 2738 else: 2739 prefix = "" 2740 2741 lambda_sql = super().lambda_sql(expression, arrow_sep=arrow_sep, wrap=wrap) 2742 return f"{prefix}{lambda_sql}"
2747 def install_sql(self, expression: exp.Install) -> str: 2748 force = "FORCE " if expression.args.get("force") else "" 2749 this = self.sql(expression, "this") 2750 from_clause = expression.args.get("from_") 2751 from_clause = f" FROM {from_clause}" if from_clause else "" 2752 return f"{force}INSTALL {this}{from_clause}"
2763 def strtotime_sql(self, expression: exp.StrToTime) -> str: 2764 # Check if target_type requires TIMESTAMPTZ (for LTZ/TZ variants) 2765 target_type = expression.args.get("target_type") 2766 needs_tz = target_type and target_type.this in ( 2767 exp.DataType.Type.TIMESTAMPLTZ, 2768 exp.DataType.Type.TIMESTAMPTZ, 2769 ) 2770 2771 if expression.args.get("safe"): 2772 formatted_time = self.format_time(expression) 2773 cast_type = ( 2774 exp.DataType.Type.TIMESTAMPTZ if needs_tz else exp.DataType.Type.TIMESTAMP 2775 ) 2776 return self.sql( 2777 exp.cast(self.func("TRY_STRPTIME", expression.this, formatted_time), cast_type) 2778 ) 2779 2780 base_sql = str_to_time_sql(self, expression) 2781 if needs_tz: 2782 return self.sql( 2783 exp.cast( 2784 base_sql, 2785 exp.DataType(this=exp.DataType.Type.TIMESTAMPTZ), 2786 ) 2787 ) 2788 return base_sql
2790 def strtodate_sql(self, expression: exp.StrToDate) -> str: 2791 formatted_time = self.format_time(expression) 2792 function_name = "STRPTIME" if not expression.args.get("safe") else "TRY_STRPTIME" 2793 return self.sql( 2794 exp.cast( 2795 self.func(function_name, expression.this, formatted_time), 2796 exp.DataType(this=exp.DataType.Type.DATE), 2797 ) 2798 )
2800 def tsordstotime_sql(self, expression: exp.TsOrDsToTime) -> str: 2801 this = expression.this 2802 time_format = self.format_time(expression) 2803 safe = expression.args.get("safe") 2804 time_type = exp.DataType.build("TIME", dialect="duckdb") 2805 cast_expr = exp.TryCast if safe else exp.Cast 2806 2807 if time_format: 2808 func_name = "TRY_STRPTIME" if safe else "STRPTIME" 2809 strptime = exp.Anonymous(this=func_name, expressions=[this, time_format]) 2810 return self.sql(cast_expr(this=strptime, to=time_type)) 2811 2812 if isinstance(this, exp.TsOrDsToTime) or this.is_type(exp.DataType.Type.TIME): 2813 return self.sql(this) 2814 2815 return self.sql(cast_expr(this=this, to=time_type))
2817 def currentdate_sql(self, expression: exp.CurrentDate) -> str: 2818 if not expression.this: 2819 return "CURRENT_DATE" 2820 2821 expr = exp.Cast( 2822 this=exp.AtTimeZone(this=exp.CurrentTimestamp(), zone=expression.this), 2823 to=exp.DataType(this=exp.DataType.Type.DATE), 2824 ) 2825 return self.sql(expr)
2837 def normal_sql(self, expression: exp.Normal) -> str: 2838 """ 2839 Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB. 2840 2841 Uses the Box-Muller transform via NORMAL_TEMPLATE. 2842 """ 2843 mean = expression.this 2844 stddev = expression.args["stddev"] 2845 gen: exp.Expression = expression.args["gen"] 2846 2847 # Build two uniform random values [0, 1) for Box-Muller transform 2848 if isinstance(gen, exp.Rand) and gen.this is None: 2849 u1: exp.Expression = exp.Rand() 2850 u2: exp.Expression = exp.Rand() 2851 else: 2852 # Seeded: derive two values using HASH with different inputs 2853 seed = gen.this if isinstance(gen, exp.Rand) else gen 2854 u1 = exp.replace_placeholders(self.SEEDED_RANDOM_TEMPLATE, seed=seed) 2855 u2 = exp.replace_placeholders( 2856 self.SEEDED_RANDOM_TEMPLATE, 2857 seed=exp.Add(this=seed.copy(), expression=exp.Literal.number(1)), 2858 ) 2859 2860 replacements = {"mean": mean, "stddev": stddev, "u1": u1, "u2": u2} 2861 return self.sql(exp.replace_placeholders(self.NORMAL_TEMPLATE, **replacements))
Transpile Snowflake's NORMAL(mean, stddev, gen) to DuckDB.
Uses the Box-Muller transform via NORMAL_TEMPLATE.
2863 def uniform_sql(self, expression: exp.Uniform) -> str: 2864 """ 2865 Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB. 2866 2867 UNIFORM returns a random value in [min, max]: 2868 - Integer result if both min and max are integers 2869 - Float result if either min or max is a float 2870 """ 2871 min_val = expression.this 2872 max_val = expression.expression 2873 gen = expression.args.get("gen") 2874 2875 # Determine if result should be integer (both bounds are integers). 2876 # We do this to emulate Snowflake's behavior, INT -> INT, FLOAT -> FLOAT 2877 is_int_result = min_val.is_int and max_val.is_int 2878 2879 # Build the random value expression [0, 1) 2880 if not isinstance(gen, exp.Rand): 2881 # Seed value: (ABS(HASH(seed)) % 1000000) / 1000000.0 2882 random_expr: exp.Expression = exp.Div( 2883 this=exp.Paren( 2884 this=exp.Mod( 2885 this=exp.Abs(this=exp.Anonymous(this="HASH", expressions=[gen])), 2886 expression=exp.Literal.number(1000000), 2887 ) 2888 ), 2889 expression=exp.Literal.number(1000000.0), 2890 ) 2891 else: 2892 random_expr = exp.Rand() 2893 2894 # Build: min + random * (max - min [+ 1 for int]) 2895 range_expr: exp.Expression = exp.Sub(this=max_val, expression=min_val) 2896 if is_int_result: 2897 range_expr = exp.Add(this=range_expr, expression=exp.Literal.number(1)) 2898 2899 result: exp.Expression = exp.Add( 2900 this=min_val, 2901 expression=exp.Mul(this=random_expr, expression=exp.Paren(this=range_expr)), 2902 ) 2903 2904 if is_int_result: 2905 result = exp.Cast( 2906 this=exp.Floor(this=result), 2907 to=exp.DataType.build("BIGINT"), 2908 ) 2909 2910 return self.sql(result)
Transpile Snowflake's UNIFORM(min, max, gen) to DuckDB.
UNIFORM returns a random value in [min, max]:
- Integer result if both min and max are integers
- Float result if either min or max is a float
    def timefromparts_sql(self, expression: exp.TimeFromParts) -> str:
        """Transpile TIME_FROM_PARTS to MAKE_TIME, with interval arithmetic for overflow.

        With overflow requested and out-of-range (or non-literal) components, the
        time is built as '00:00:00'::TIME + INTERVAL total_seconds SECOND instead
        of MAKE_TIME. Nanoseconds are folded into the seconds term either way.
        """
        nano = expression.args.get("nano")
        overflow = expression.args.get("overflow")

        # Snowflake's TIME_FROM_PARTS supports overflow
        if overflow:
            hour = expression.args["hour"]
            minute = expression.args["min"]
            sec = expression.args["sec"]

            # Check if values are within normal ranges - use MAKE_TIME for efficiency
            if not nano and all(arg.is_int for arg in [hour, minute, sec]):
                try:
                    h_val = hour.to_py()
                    m_val = minute.to_py()
                    s_val = sec.to_py()
                    if 0 <= h_val <= 23 and 0 <= m_val <= 59 and 0 <= s_val <= 59:
                        return rename_func("MAKE_TIME")(self, expression)
                except ValueError:
                    # to_py() may fail for non-literal components; fall through to intervals
                    pass

            # Overflow or nanoseconds detected - use INTERVAL arithmetic
            if nano:
                # Fold nanoseconds into the seconds term (pop() detaches nano from the args)
                sec = sec + nano.pop() / exp.Literal.number(1000000000.0)

            total_seconds = (
                hour * exp.Literal.number(3600) + minute * exp.Literal.number(60) + sec
            )

            return self.sql(
                exp.Add(
                    this=exp.Cast(
                        this=exp.Literal.string("00:00:00"), to=exp.DataType.build("TIME")
                    ),
                    expression=exp.Interval(this=total_seconds, unit=exp.var("SECOND")),
                )
            )

        # Default: MAKE_TIME
        if nano:
            # Fold nanoseconds into the seconds argument before renaming the function
            expression.set(
                "sec", expression.args["sec"] + nano.pop() / exp.Literal.number(1000000000.0)
            )

        return rename_func("MAKE_TIME")(self, expression)
    def extract_sql(self, expression: exp.Extract) -> str:
        """
        Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.

        DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND,
        EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
        """
        this = expression.this
        datetime_expr = expression.expression

        # TIMESTAMPTZ extractions may produce different results between Snowflake and DuckDB
        # because Snowflake applies server timezone while DuckDB uses local timezone
        if datetime_expr.is_type(exp.DataType.Type.TIMESTAMPTZ, exp.DataType.Type.TIMESTAMPLTZ):
            self.unsupported(
                "EXTRACT from TIMESTAMPTZ / TIMESTAMPLTZ may produce different results due to timezone handling differences"
            )

        part_name = this.name.upper()

        # Specifiers DuckDB lacks are emulated via STRFTIME + cast
        if part_name in self.EXTRACT_STRFTIME_MAPPINGS:
            fmt, cast_type = self.EXTRACT_STRFTIME_MAPPINGS[part_name]

            # Problem: strftime doesn't accept TIME and there's no NANOSECOND function
            # So, for NANOSECOND with TIME, fallback to MICROSECOND * 1000
            is_nano_time = part_name == "NANOSECOND" and datetime_expr.is_type(
                exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
            )

            if is_nano_time:
                self.unsupported(
                    "Parameter NANOSECOND is not supported with TIME type in DuckDB"
                )
                return self.sql(
                    exp.cast(
                        exp.Mul(
                            this=exp.Extract(
                                this=exp.var("MICROSECOND"), expression=datetime_expr
                            ),
                            expression=exp.Literal.number(1000),
                        ),
                        exp.DataType.build(cast_type, dialect="duckdb"),
                    )
                )

            # For NANOSECOND, cast to TIMESTAMP_NS to preserve nanosecond precision
            strftime_input = datetime_expr
            if part_name == "NANOSECOND":
                strftime_input = exp.cast(datetime_expr, exp.DataType.Type.TIMESTAMP_NS)

            return self.sql(
                exp.cast(
                    exp.Anonymous(
                        this="STRFTIME",
                        expressions=[strftime_input, exp.Literal.string(fmt)],
                    ),
                    exp.DataType.build(cast_type, dialect="duckdb"),
                )
            )

        # EPOCH_* specifiers map to dedicated DuckDB functions
        if part_name in self.EXTRACT_EPOCH_MAPPINGS:
            func_name = self.EXTRACT_EPOCH_MAPPINGS[part_name]
            result: exp.Expression = exp.Anonymous(this=func_name, expressions=[datetime_expr])
            # EPOCH returns float, cast to BIGINT for integer result
            if part_name == "EPOCH_SECOND":
                result = exp.cast(result, exp.DataType.build("BIGINT", dialect="duckdb"))
            return self.sql(result)

        # Everything else is supported natively - use the default EXTRACT rendering
        return super().extract_sql(expression)
Transpile EXTRACT/DATE_PART for DuckDB, handling specifiers not natively supported.
DuckDB doesn't support: WEEKISO, YEAROFWEEK, YEAROFWEEKISO, NANOSECOND, EPOCH_SECOND (as integer), EPOCH_MILLISECOND, EPOCH_MICROSECOND, EPOCH_NANOSECOND
    def timestampfromparts_sql(self, expression: exp.TimestampFromParts) -> str:
        """Transpile TIMESTAMP_FROM_PARTS to DATE + TIME addition or MAKE_TIMESTAMP."""
        # Check if this is the date/time expression form: TIMESTAMP_FROM_PARTS(date_expr, time_expr)
        date_expr = expression.this
        time_expr = expression.expression

        if date_expr is not None and time_expr is not None:
            # In DuckDB, DATE + TIME produces TIMESTAMP
            return self.sql(exp.Add(this=date_expr, expression=time_expr))

        # Component-based form: TIMESTAMP_FROM_PARTS(year, month, day, hour, minute, second, ...)
        sec = expression.args.get("sec")
        if sec is None:
            # This shouldn't happen with valid input, but handle gracefully
            return rename_func("MAKE_TIMESTAMP")(self, expression)

        # Fold fractional-second components into the seconds argument; pop() detaches
        # them so rename_func doesn't forward args MAKE_TIMESTAMP can't accept
        milli = expression.args.get("milli")
        if milli is not None:
            sec += milli.pop() / exp.Literal.number(1000.0)

        nano = expression.args.get("nano")
        if nano is not None:
            sec += nano.pop() / exp.Literal.number(1000000000.0)

        if milli or nano:
            expression.set("sec", sec)

        return rename_func("MAKE_TIMESTAMP")(self, expression)
3055 @unsupported_args("nano") 3056 def timestampltzfromparts_sql(self, expression: exp.TimestampLtzFromParts) -> str: 3057 # Pop nano so rename_func only passes args that MAKE_TIMESTAMP accepts 3058 if nano := expression.args.get("nano"): 3059 nano.pop() 3060 3061 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 3062 return f"CAST({timestamp} AS TIMESTAMPTZ)"
3064 @unsupported_args("nano") 3065 def timestamptzfromparts_sql(self, expression: exp.TimestampTzFromParts) -> str: 3066 # Extract zone before popping 3067 zone = expression.args.get("zone") 3068 # Pop zone and nano so rename_func only passes args that MAKE_TIMESTAMP accepts 3069 if zone: 3070 zone = zone.pop() 3071 3072 if nano := expression.args.get("nano"): 3073 nano.pop() 3074 3075 timestamp = rename_func("MAKE_TIMESTAMP")(self, expression) 3076 3077 if zone: 3078 # Use AT TIME ZONE to apply the explicit timezone 3079 return f"{timestamp} AT TIME ZONE {self.sql(zone)}" 3080 3081 return timestamp
3083 def tablesample_sql( 3084 self, 3085 expression: exp.TableSample, 3086 tablesample_keyword: t.Optional[str] = None, 3087 ) -> str: 3088 if not isinstance(expression.parent, exp.Select): 3089 # This sample clause only applies to a single source, not the entire resulting relation 3090 tablesample_keyword = "TABLESAMPLE" 3091 3092 if expression.args.get("size"): 3093 method = expression.args.get("method") 3094 if method and method.name.upper() != "RESERVOIR": 3095 self.unsupported( 3096 f"Sampling method {method} is not supported with a discrete sample count, " 3097 "defaulting to reservoir sampling" 3098 ) 3099 expression.set("method", exp.var("RESERVOIR")) 3100 3101 return super().tablesample_sql(expression, tablesample_keyword=tablesample_keyword)
3108 def join_sql(self, expression: exp.Join) -> str: 3109 if ( 3110 not expression.args.get("using") 3111 and not expression.args.get("on") 3112 and not expression.method 3113 and (expression.kind in ("", "INNER", "OUTER")) 3114 ): 3115 # Some dialects support `LEFT/INNER JOIN UNNEST(...)` without an explicit ON clause 3116 # DuckDB doesn't, but we can just add a dummy ON clause that is always true 3117 if isinstance(expression.this, exp.Unnest): 3118 return super().join_sql(expression.on(exp.true())) 3119 3120 expression.set("side", None) 3121 expression.set("kind", None) 3122 3123 return super().join_sql(expression)
    def bracket_sql(self, expression: exp.Bracket) -> str:
        """Generate bracket subscripts, adapting to pre-1.2 DuckDB indexing semantics."""
        if self.dialect.version >= (1, 2):
            return super().bracket_sql(expression)

        # https://duckdb.org/2025/02/05/announcing-duckdb-120.html#breaking-changes
        this = expression.this
        if isinstance(this, exp.Array):
            # Wrap array literals in parens before subscripting (pre-1.2 behavior)
            this.replace(exp.paren(this))

        bracket = super().bracket_sql(expression)

        if not expression.args.get("returns_list_for_maps"):
            if not this.type:
                # Lazily annotate types only when needed to classify the subject
                from sqlglot.optimizer.annotate_types import annotate_types

                this = annotate_types(this, dialect=self.dialect)

            if this.is_type(exp.DataType.Type.MAP):
                # Pre-1.2 map subscripts return a single-element list; unwrap element 1
                bracket = f"({bracket})[1]"

        return bracket
    def withingroup_sql(self, expression: exp.WithinGroup) -> str:
        """Transpile WITHIN GROUP clauses into DuckDB-compatible function forms."""
        func = expression.this

        # For ARRAY_AGG, DuckDB requires ORDER BY inside the function, not in WITHIN GROUP
        # Transform: ARRAY_AGG(x) WITHIN GROUP (ORDER BY y) -> ARRAY_AGG(x ORDER BY y)
        if isinstance(func, exp.ArrayAgg):
            if not isinstance(order := expression.expression, exp.Order):
                # No ORDER BY clause: the wrapper adds nothing, emit the function alone
                return self.sql(func)

            # Save the original column for FILTER clause (before wrapping with Order)
            original_this = func.this

            # Move ORDER BY inside ARRAY_AGG by wrapping its argument with Order
            # ArrayAgg.this should become Order(this=ArrayAgg.this, expressions=order.expressions)
            func.set(
                "this",
                exp.Order(
                    this=func.this.copy(),
                    expressions=order.expressions,
                ),
            )

            # Generate the ARRAY_AGG function with ORDER BY and add FILTER clause if needed
            # Use original_this (not the Order-wrapped version) for the FILTER condition
            array_agg_sql = self.function_fallback_sql(func)
            return self._add_arrayagg_null_filter(array_agg_sql, func, original_this)

        # For other functions (like PERCENTILES), use existing logic
        expression_sql = self.sql(expression, "expression")

        if isinstance(func, exp.PERCENTILES):
            # Make the order key the first arg and slide the fraction to the right
            # https://duckdb.org/docs/sql/aggregates#ordered-set-aggregate-functions
            order_col = expression.find(exp.Ordered)
            if order_col:
                func.set("expression", func.this)
                func.set("this", order_col.this)

        # Strip the function's closing paren, splice in the WITHIN GROUP body, re-close
        this = self.sql(expression, "this").rstrip(")")

        return f"{this}{expression_sql})"
3203 def length_sql(self, expression: exp.Length) -> str: 3204 arg = expression.this 3205 3206 # Dialects like BQ and Snowflake also accept binary values as args, so 3207 # DDB will attempt to infer the type or resort to case/when resolution 3208 if not expression.args.get("binary") or arg.is_string: 3209 return self.func("LENGTH", arg) 3210 3211 if not arg.type: 3212 from sqlglot.optimizer.annotate_types import annotate_types 3213 3214 arg = annotate_types(arg, dialect=self.dialect) 3215 3216 if arg.is_type(*exp.DataType.TEXT_TYPES): 3217 return self.func("LENGTH", arg) 3218 3219 # We need these casts to make duckdb's static type checker happy 3220 blob = exp.cast(arg, exp.DataType.Type.VARBINARY) 3221 varchar = exp.cast(arg, exp.DataType.Type.VARCHAR) 3222 3223 case = ( 3224 exp.case(exp.Anonymous(this="TYPEOF", expressions=[arg])) 3225 .when(exp.Literal.string("BLOB"), exp.ByteLength(this=blob)) 3226 .else_(exp.Anonymous(this="LENGTH", expressions=[varchar])) 3227 ) 3228 return self.sql(case)
3262 def regexpcount_sql(self, expression: exp.RegexpCount) -> str: 3263 this = expression.this 3264 pattern = expression.expression 3265 position = expression.args.get("position") 3266 parameters = expression.args.get("parameters") 3267 3268 # Validate flags - only "ims" flags are supported for embedded patterns 3269 validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims") 3270 3271 if position: 3272 this = exp.Substring(this=this, start=position) 3273 3274 # Embed flags in pattern (REGEXP_EXTRACT_ALL doesn't support flags argument) 3275 if validated_flags: 3276 pattern = exp.Concat( 3277 expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern] 3278 ) 3279 3280 # Handle empty pattern: Snowflake returns 0, DuckDB would match between every character 3281 result = ( 3282 exp.case() 3283 .when( 3284 exp.EQ(this=pattern, expression=exp.Literal.string("")), 3285 exp.Literal.number(0), 3286 ) 3287 .else_( 3288 exp.Length( 3289 this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]) 3290 ) 3291 ) 3292 ) 3293 3294 return self.sql(result)
    def regexpreplace_sql(self, expression: exp.RegexpReplace) -> str:
        """Transpile REGEXP_REPLACE, emulating Snowflake's position/occurrence arguments."""
        subject = expression.this
        pattern = expression.expression
        replacement = expression.args.get("replacement") or exp.Literal.string("")
        position = expression.args.get("position")
        occurrence = expression.args.get("occurrence")
        modifiers = expression.args.get("modifiers")

        validated_flags = self._validate_regexp_flags(modifiers, supported_flags="cimsg") or ""

        # Handle occurrence (only literals supported)
        if occurrence and not occurrence.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal occurrence")
        else:
            # Occurrence 0 means "replace all"; only occurrence 0 or 1 can be expressed
            occurrence = occurrence.to_py() if occurrence and occurrence.is_int else 0
            if occurrence > 1:
                self.unsupported(f"REGEXP_REPLACE occurrence={occurrence} not supported")
            # flag duckdb to do either all or none, single_replace check is for duckdb round trip
            elif (
                occurrence == 0
                and "g" not in validated_flags
                and not expression.args.get("single_replace")
            ):
                validated_flags += "g"

        # Handle position (only literals supported)
        prefix = None
        if position and not position.is_int:
            self.unsupported("REGEXP_REPLACE with non-literal position")
        elif position and position.is_int and position.to_py() > 1:
            # Matching starts at `position`: keep the untouched prefix and only
            # run the replacement over the tail substring
            pos = position.to_py()
            prefix = exp.Substring(
                this=subject, start=exp.Literal.number(1), length=exp.Literal.number(pos - 1)
            )
            subject = exp.Substring(this=subject, start=exp.Literal.number(pos))

        result: exp.Expression = exp.Anonymous(
            this="REGEXP_REPLACE",
            expressions=[
                subject,
                pattern,
                replacement,
                exp.Literal.string(validated_flags) if validated_flags else None,
            ],
        )

        if prefix:
            # Re-attach the skipped prefix in front of the replaced tail
            result = exp.Concat(expressions=[prefix, result])

        return self.sql(result)
3347 def regexplike_sql(self, expression: exp.RegexpLike) -> str: 3348 this = expression.this 3349 pattern = expression.expression 3350 flag = expression.args.get("flag") 3351 3352 if not expression.args.get("full_match"): 3353 return self.func("REGEXP_MATCHES", this, pattern, flag) 3354 3355 # DuckDB REGEXP_MATCHES supports: c, i, m, s (but not 'e') 3356 validated_flags = self._validate_regexp_flags(flag, supported_flags="cims") 3357 3358 anchored_pattern = exp.Concat( 3359 expressions=[ 3360 exp.Literal.string("^("), 3361 exp.Paren(this=pattern), 3362 exp.Literal.string(")$"), 3363 ] 3364 ) 3365 3366 if validated_flags: 3367 flag = exp.Literal.string(validated_flags) 3368 3369 return self.func("REGEXP_MATCHES", this, anchored_pattern, flag)
3371 @unsupported_args("ins_cost", "del_cost", "sub_cost") 3372 def levenshtein_sql(self, expression: exp.Levenshtein) -> str: 3373 this = expression.this 3374 expr = expression.expression 3375 max_dist = expression.args.get("max_dist") 3376 3377 if max_dist is None: 3378 return self.func("LEVENSHTEIN", this, expr) 3379 3380 # Emulate Snowflake semantics: if distance > max_dist, return max_dist 3381 levenshtein = exp.Levenshtein(this=this, expression=expr) 3382 return self.sql(exp.Least(this=levenshtein, expressions=[max_dist]))
3384 def pad_sql(self, expression: exp.Pad) -> str: 3385 """ 3386 Handle RPAD/LPAD for VARCHAR and BINARY types. 3387 3388 For VARCHAR: Delegate to parent class 3389 For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input))) 3390 """ 3391 string_arg = expression.this 3392 fill_arg = expression.args.get("fill_pattern") or exp.Literal.string(" ") 3393 3394 if _is_binary(string_arg) or _is_binary(fill_arg): 3395 length_arg = expression.expression 3396 is_left = expression.args.get("is_left") 3397 3398 input_len = exp.ByteLength(this=string_arg) 3399 chars_needed = length_arg - input_len 3400 pad_count = exp.Greatest( 3401 this=exp.Literal.number(0), expressions=[chars_needed], ignore_nulls=True 3402 ) 3403 repeat_expr = exp.Repeat(this=fill_arg, times=pad_count) 3404 3405 left, right = string_arg, repeat_expr 3406 if is_left: 3407 left, right = right, left 3408 3409 result = exp.DPipe(this=left, expression=right) 3410 return self.sql(result) 3411 3412 # For VARCHAR: Delegate to parent class (handles PAD_FILL_PATTERN_IS_REQUIRED) 3413 return super().pad_sql(expression)
Handle RPAD/LPAD for VARCHAR and BINARY types.
For VARCHAR: Delegate to parent class For BINARY: Lower to: input || REPEAT(pad, GREATEST(0, target_len - OCTET_LENGTH(input)))
3415 def minhash_sql(self, expression: exp.Minhash) -> str: 3416 k = expression.this 3417 exprs = expression.expressions 3418 3419 if len(exprs) != 1 or isinstance(exprs[0], exp.Star): 3420 self.unsupported( 3421 "MINHASH with multiple expressions or * requires manual query restructuring" 3422 ) 3423 return self.func("MINHASH", k, *exprs) 3424 3425 expr = exprs[0] 3426 result = exp.replace_placeholders(self.MINHASH_TEMPLATE.copy(), expr=expr, k=k) 3427 return f"({self.sql(result)})"
3441 def arraydistinct_sql(self, expression: exp.ArrayDistinct) -> str: 3442 arr = expression.this 3443 func = self.func("LIST_DISTINCT", arr) 3444 3445 if expression.args.get("check_null"): 3446 add_null_to_array = exp.func( 3447 "LIST_APPEND", exp.func("LIST_DISTINCT", exp.ArrayCompact(this=arr)), exp.Null() 3448 ) 3449 return self.sql( 3450 exp.If( 3451 this=exp.NEQ( 3452 this=exp.ArraySize(this=arr), expression=exp.func("LIST_COUNT", arr) 3453 ), 3454 true=add_null_to_array, 3455 false=func, 3456 ) 3457 ) 3458 3459 return func
    def arrayszip_sql(self, expression: exp.ArraysZip) -> str:
        """Transpile Snowflake's ARRAYS_ZIP to DuckDB via ARRAYS_ZIP_TEMPLATE."""
        args = expression.expressions

        if not args:
            # Return [{}] - using MAP([], []) since DuckDB can't represent empty structs
            return self.sql(exp.array(exp.Map(keys=exp.array(), values=exp.array())))

        # Build placeholder values for template
        lengths = [exp.Length(this=arg) for arg in args]
        max_len = (
            lengths[0]
            if len(lengths) == 1
            else exp.Greatest(this=lengths[0], expressions=lengths[1:])
        )

        # Empty struct with same schema: {'$1': NULL, '$2': NULL, ...}
        empty_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(this=exp.Literal.string(f"${i + 1}"), expression=exp.Null())
                for i in range(len(args))
            ],
        )

        # Struct for transform: {'$1': COALESCE(arr1, [])[__i + 1], ...}
        # COALESCE wrapping handles NULL arrays - prevents invalid NULL[i] syntax
        index = exp.column("__i") + 1
        transform_struct = exp.func(
            "STRUCT",
            *[
                exp.PropertyEQ(
                    this=exp.Literal.string(f"${i + 1}"),
                    expression=exp.func("COALESCE", arg, exp.array())[index],
                )
                for i, arg in enumerate(args)
            ],
        )

        # Fill the template: NULL propagation check, all-empty short circuit,
        # the fallback empty struct, the loop bound and the per-index struct
        result = exp.replace_placeholders(
            self.ARRAYS_ZIP_TEMPLATE.copy(),
            null_check=exp.or_(*[arg.is_(exp.Null()) for arg in args]),
            all_empty_check=exp.and_(
                *[
                    exp.EQ(this=exp.Length(this=arg), expression=exp.Literal.number(0))
                    for arg in args
                ]
            ),
            empty_struct=empty_struct,
            max_len=max_len,
            transform_struct=transform_struct,
        )
        return self.sql(result)
def base64encode_sql(self, expression: exp.Base64Encode) -> str:
    """Render BASE64_ENCODE as DuckDB's TO_BASE64, emulating optional Snowflake args.

    Supports the `alphabet` arg (character substitution on the encoded output) and
    `max_line_length` (newline insertion every N characters).
    """
    # DuckDB TO_BASE64 requires BLOB input
    # Snowflake BASE64_ENCODE accepts both VARCHAR and BINARY - for VARCHAR it implicitly
    # encodes UTF-8 bytes. We add ENCODE unless the input is a binary type.
    result = expression.this

    # Check if input is a string type - ENCODE only accepts VARCHAR
    if result.is_type(*exp.DataType.TEXT_TYPES):
        result = exp.Encode(this=result)

    result = exp.ToBase64(this=result)

    max_line_length = expression.args.get("max_line_length")
    alphabet = expression.args.get("alphabet")

    # Handle custom alphabet by replacing standard chars with custom ones
    result = _apply_base64_alphabet_replacements(result, alphabet)

    # Handle max_line_length by inserting newlines every N characters
    # (only applied when the arg is a literal integer; otherwise treated as 0 = disabled)
    line_length = (
        t.cast(int, max_line_length.to_py())
        if isinstance(max_line_length, exp.Literal) and max_line_length.is_number
        else 0
    )
    if line_length > 0:
        newline = exp.Chr(expressions=[exp.Literal.number(10)])
        # REGEXP_REPLACE appends '\n' after every N-char group; TRIM(TRAILING)
        # removes the extra newline left after the final group
        result = exp.Trim(
            this=exp.RegexpReplace(
                this=result,
                expression=exp.Literal.string(f"(.{{{line_length}}})"),
                replacement=exp.Concat(
                    expressions=[exp.Literal.string("\\1"), newline.copy()]
                ),
            ),
            expression=newline,
            position="TRAILING",
        )

    return self.sql(result)
def replace_sql(self, expression: exp.Replace) -> str:
    """Render REPLACE with VARCHAR-coerced arguments, casting the result back to BLOB when required."""
    operands = (
        expression.this,
        expression.expression,
        expression.args.get("replacement"),
    )
    replace_call = self.func("REPLACE", *(_cast_to_varchar(arg) for arg in operands))
    return _gen_with_cast_to_blob(self, expression, replace_call)
def objectinsert_sql(self, expression: exp.ObjectInsert) -> str:
    """Render OBJECT_INSERT as STRUCT_INSERT, or STRUCT_PACK when the input struct is empty."""
    struct_arg = expression.this
    key = expression.args.get("key")
    key_sql = key.name if isinstance(key, exp.Expression) else ""
    kv_sql = f"{key_sql} := {self.sql(expression, 'value')}"

    # If the input struct is empty e.g. transpiling OBJECT_INSERT(OBJECT_CONSTRUCT(), key, value)
    # from Snowflake, build the struct with STRUCT_PACK instead, since
    # STRUCT_INSERT({}, key := value) is not valid DuckDB
    if isinstance(struct_arg, exp.Struct) and not struct_arg.expressions:
        return self.func("STRUCT_PACK", kv_sql)

    return self.func("STRUCT_INSERT", struct_arg, kv_sql)
def tablefromrows_sql(self, expression: exp.TableFromRows) -> str:
    """Unwrap TABLE(GENERATOR(...)) into a bare table expression; defer everything else to super()."""
    inner = expression.this
    if not isinstance(inner, exp.Generator):
        return super().tablefromrows_sql(expression)

    # For GENERATOR, drop the TABLE() wrapper and emit just the Generator (becomes RANGE),
    # carrying over the alias and joins from the wrapper
    unwrapped = exp.Table(
        this=inner,
        alias=expression.args.get("alias"),
        joins=expression.args.get("joins"),
    )
    return self.sql(unwrapped)
def unnest_sql(self, expression: exp.Unnest) -> str:
    """Render UNNEST, emulating BigQuery's array-exploding UNNEST when flagged.

    NOTE: mutates `expression` in place (appends the max_depth kwarg and clears
    the alias) before delegating to super().
    """
    explode_array = expression.args.get("explode_array")
    if explode_array:
        # In BigQuery, UNNESTing a nested array leads to explosion of the top-level array & struct
        # This is transpiled to DDB by transforming "FROM UNNEST(...)" to "FROM (SELECT UNNEST(..., max_depth => 2))"
        expression.expressions.append(
            exp.Kwarg(this=exp.var("max_depth"), expression=exp.Literal.number(2))
        )

        # If BQ's UNNEST is aliased, we transform it from a column alias to a table alias in DDB
        alias = expression.args.get("alias")
        if isinstance(alias, exp.TableAlias):
            expression.set("alias", None)
            if alias.columns:
                # Only the first column alias is carried over as the table alias
                alias = exp.TableAlias(this=seq_get(alias.columns, 0))

        unnest_sql = super().unnest_sql(expression)
        # Wrap in a subquery so the UNNEST appears in a SELECT projection
        select = exp.Select(expressions=[unnest_sql]).subquery(alias)
        return self.sql(select)

    return super().unnest_sql(expression)
def ignorenulls_sql(self, expression: exp.IgnoreNulls) -> str:
    """Emit IGNORE NULLS only where DuckDB accepts it; otherwise rewrite or warn and strip it."""
    inner = expression.this

    # DuckDB should render IGNORE NULLS only for the general-purpose
    # window functions that accept it e.g. FIRST_VALUE(... IGNORE NULLS) OVER (...)
    if isinstance(inner, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
        return super().ignorenulls_sql(expression)

    # FIRST(... IGNORE NULLS) is mapped onto ANY_VALUE instead
    if isinstance(inner, exp.First):
        inner = exp.AnyValue(this=inner.this)

    if not isinstance(inner, (exp.AnyValue, exp.ApproxQuantiles)):
        self.unsupported("IGNORE NULLS is not supported for non-window functions.")

    return self.sql(inner)
def respectnulls_sql(self, expression: exp.RespectNulls) -> str:
    """Emit RESPECT NULLS only for the window functions that accept it; otherwise warn and drop it."""
    if not isinstance(expression.this, self.IGNORE_RESPECT_NULLS_WINDOW_FUNCTIONS):
        self.unsupported("RESPECT NULLS is not supported for non-window functions.")
        return self.sql(expression, "this")

    # DuckDB should render RESPECT NULLS only for the general-purpose
    # window functions that accept it e.g. FIRST_VALUE(... RESPECT NULLS) OVER (...)
    return super().respectnulls_sql(expression)
def arraytostring_sql(self, expression: exp.ArrayToString) -> str:
    """Render ARRAY_TO_STRING, substituting NULL elements with the optional replacement text."""
    array_sql = self.sql(expression, "this")
    null_replacement = self.sql(expression, "null")

    # When a NULL-replacement text is supplied, rewrite NULL elements up front
    # via LIST_TRANSFORM + COALESCE before joining
    if null_replacement:
        array_sql = f"LIST_TRANSFORM({array_sql}, x -> COALESCE(x, {null_replacement}))"

    return self.func("ARRAY_TO_STRING", array_sql, expression.expression)
def regexpinstr_sql(self, expression: exp.RegexpInstr) -> str:
    """Emulate REGEXP_INSTR in DuckDB, which has no native equivalent.

    The match position is reconstructed as:
      1 + sum(lengths of split segments before the occurrence)
        + sum(lengths of earlier matches)
        + starting-position offset
    With option=1 the matched length is added so the end position is returned.
    Returns NULL if any argument is NULL, and 0 for an empty pattern or when
    there are fewer matches than the requested occurrence.
    """
    this = expression.this
    pattern = expression.expression
    position = expression.args.get("position")
    orig_occ = expression.args.get("occurrence")
    occurrence = orig_occ or exp.Literal.number(1)
    option = expression.args.get("option")
    parameters = expression.args.get("parameters")

    # Inline supported flags (i, m, s) into the pattern as (?flags)
    validated_flags = self._validate_regexp_flags(parameters, supported_flags="ims")
    if validated_flags:
        pattern = exp.Concat(
            expressions=[exp.Literal.string(f"(?{validated_flags})"), pattern]
        )

    # Handle starting position offset
    pos_offset: exp.Expression = exp.Literal.number(0)
    if position and (not position.is_int or position.to_py() > 1):
        this = exp.Substring(this=this, start=position)
        pos_offset = position - exp.Literal.number(1)

    # Helper: LIST_SUM(LIST_TRANSFORM(list[1:end], x -> LENGTH(x)))
    def sum_lengths(func_name: str, end: exp.Expression) -> exp.Expression:
        lst = exp.Bracket(
            this=exp.Anonymous(this=func_name, expressions=[this, pattern]),
            expressions=[exp.Slice(this=exp.Literal.number(1), expression=end)],
            offset=1,
        )
        transform = exp.Anonymous(
            this="LIST_TRANSFORM",
            expressions=[
                lst,
                exp.Lambda(
                    this=exp.Length(this=exp.to_identifier("x")),
                    expressions=[exp.to_identifier("x")],
                ),
            ],
        )
        # COALESCE to 0 so an empty slice contributes nothing to the position
        return exp.Coalesce(
            this=exp.Anonymous(this="LIST_SUM", expressions=[transform]),
            expressions=[exp.Literal.number(0)],
        )

    # Position = 1 + sum(split_lengths[1:occ]) + sum(match_lengths[1:occ-1]) + offset
    base_pos: exp.Expression = (
        exp.Literal.number(1)
        + sum_lengths("STRING_SPLIT_REGEX", occurrence)
        + sum_lengths("REGEXP_EXTRACT_ALL", occurrence - exp.Literal.number(1))
        + pos_offset
    )

    # option=1: add match length for end position
    if option and option.is_int and option.to_py() == 1:
        match_at_occ = exp.Bracket(
            this=exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern]),
            expressions=[occurrence],
            offset=1,
        )
        base_pos = base_pos + exp.Coalesce(
            this=exp.Length(this=match_at_occ), expressions=[exp.Literal.number(0)]
        )

    # NULL checks for all provided arguments
    # .copy() is used strictly because .is_() alters the node's parent pointer, mutating the parsed AST
    null_args = [
        expression.this,
        expression.expression,
        position,
        orig_occ,
        option,
        parameters,
    ]
    null_checks = [arg.copy().is_(exp.Null()) for arg in null_args if arg]

    matches = exp.Anonymous(this="REGEXP_EXTRACT_ALL", expressions=[this, pattern])

    return self.sql(
        exp.case()
        .when(exp.or_(*null_checks), exp.Null())
        .when(pattern.copy().eq(exp.Literal.string("")), exp.Literal.number(0))
        .when(exp.Length(this=matches) < occurrence, exp.Literal.number(0))
        .else_(base_pos)
    )
@unsupported_args("culture")
def numbertostr_sql(self, expression: exp.NumberToStr) -> str:
    """Translate NumberToStr to DuckDB's FORMAT with a thousands-separated fixed-point spec."""
    scale = expression.args.get("format")

    if not (scale and scale.is_int):
        self.unsupported("Only integer formats are supported by NumberToStr")
        return self.function_fallback_sql(expression)

    # e.g. a scale of 2 produces FORMAT('{:,.2f}', x)
    return self.func("FORMAT", f"'{{:,.{scale.name}f}}'", expression.this)
def posexplode_sql(self, expression: exp.Posexplode) -> str:
    """Translate Spark's POSEXPLODE into UNNEST + GENERATE_SUBSCRIPTS.

    NOTE: may mutate the parent table's alias (alias.pop()) since the column
    aliases are absorbed into the generated projection.
    """
    this = expression.this
    parent = expression.parent

    # The default Spark aliases are "pos" and "col", unless specified otherwise
    pos, col = exp.to_identifier("pos"), exp.to_identifier("col")

    if isinstance(parent, exp.Aliases):
        # Column case: SELECT POSEXPLODE(col) [AS (a, b)]
        pos, col = parent.expressions
    elif isinstance(parent, exp.Table):
        # Table case: SELECT * FROM POSEXPLODE(col) [AS (a, b)]
        alias = parent.args.get("alias")
        if alias:
            pos, col = alias.columns or [pos, col]
            # Remove the table alias - its columns are now projection aliases
            alias.pop()

    # Translate POSEXPLODE to UNNEST + GENERATE_SUBSCRIPTS
    # Note: In Spark pos is 0-indexed, but in DuckDB it's 1-indexed, so we subtract 1 from GENERATE_SUBSCRIPTS
    unnest_sql = self.sql(exp.Unnest(expressions=[this], alias=col))
    gen_subscripts = self.sql(
        exp.Alias(
            this=exp.Anonymous(
                this="GENERATE_SUBSCRIPTS", expressions=[this, exp.Literal.number(1)]
            )
            - exp.Literal.number(1),
            alias=pos,
        )
    )

    posexplode_sql = self.format_args(gen_subscripts, unnest_sql)

    if isinstance(parent, exp.From) or (parent and isinstance(parent.parent, exp.From)):
        # SELECT * FROM POSEXPLODE(col) -> SELECT * FROM (SELECT GENERATE_SUBSCRIPTS(...), UNNEST(...))
        return self.sql(exp.Subquery(this=exp.Select(expressions=[posexplode_sql])))

    return posexplode_sql
def addmonths_sql(self, expression: exp.AddMonths) -> str:
    """
    Render ADD_MONTHS as DATE + INTERVAL arithmetic in DuckDB.

    Handles three key issues:
    1. Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
    2. End-of-month preservation: If input is last day of month, result is last day of result month
       (enabled by the `preserve_end_of_month` arg)
    3. Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
    """
    from sqlglot.optimizer.annotate_types import annotate_types

    this = expression.this
    # Only annotate when type info is missing - annotation is needed for the casts below
    if not this.type:
        this = annotate_types(this, dialect=self.dialect)

    if this.is_type(*exp.DataType.TEXT_TYPES):
        this = exp.Cast(this=this, to=exp.DataType(this=exp.DataType.Type.TIMESTAMP))

    # Detect float/decimal months to apply rounding (Snowflake behavior)
    # DuckDB INTERVAL syntax doesn't support non-integer expressions, so use TO_MONTHS
    months_expr = expression.expression
    if not months_expr.type:
        months_expr = annotate_types(months_expr, dialect=self.dialect)

    # Build interval or to_months expression based on type
    # Float/decimal case: Round and use TO_MONTHS(CAST(ROUND(value) AS INT))
    interval_or_to_months = (
        exp.func("TO_MONTHS", exp.cast(exp.func("ROUND", months_expr), "INT"))
        if months_expr.is_type(
            exp.DataType.Type.FLOAT,
            exp.DataType.Type.DOUBLE,
            exp.DataType.Type.DECIMAL,
        )
        # Integer case: standard INTERVAL N MONTH syntax
        else exp.Interval(this=months_expr, unit=exp.var("MONTH"))
    )

    date_add_expr = exp.Add(this=this, expression=interval_or_to_months)

    # Apply end-of-month preservation if Snowflake flag is set
    # CASE WHEN LAST_DAY(date) = date THEN LAST_DAY(result) ELSE result END
    preserve_eom = expression.args.get("preserve_end_of_month")
    result_expr = (
        exp.case()
        .when(
            exp.EQ(this=exp.func("LAST_DAY", this), expression=this),
            exp.func("LAST_DAY", date_add_expr),
        )
        .else_(date_add_expr)
        if preserve_eom
        else date_add_expr
    )

    # DuckDB's DATE_ADD function returns TIMESTAMP/DATETIME by default, even when the input is DATE
    # To match for example Snowflake's ADD_MONTHS behavior (which preserves the input type)
    # We need to cast the result back to the original type when the input is DATE or TIMESTAMPTZ
    # Example: ADD_MONTHS('2023-01-31'::date, 1) should return DATE, not TIMESTAMP
    if this.is_type(exp.DataType.Type.DATE, exp.DataType.Type.TIMESTAMPTZ):
        return self.sql(exp.Cast(this=result_expr, to=this.type))
    return self.sql(result_expr)
Handles three key issues:
- Float/decimal months: e.g., Snowflake rounds, whereas DuckDB INTERVAL requires integers
- End-of-month preservation: If input is last day of month, result is last day of result month
- Type preservation: Maintains DATE/TIMESTAMPTZ types (DuckDB defaults to TIMESTAMP)
def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
    """Render DATE_TRUNC, optionally casting the result back to the input's temporal type."""
    unit = unit_to_str(expression)
    operand = expression.this
    truncated = self.func("DATE_TRUNC", unit, operand)

    # When the source dialect preserves the input type, cast back - except when the
    # input is already a DATE truncated to a date-sized unit, where no cast is needed
    if expression.args.get("input_type_preserved") and operand.is_type(
        *exp.DataType.TEMPORAL_TYPES
    ):
        if not (is_date_unit(unit) and operand.is_type(exp.DataType.Type.DATE)):
            return self.sql(exp.Cast(this=truncated, to=operand.type))

    return truncated
def timestamptrunc_sql(self, expression: exp.TimestampTrunc) -> str:
    """Render TIMESTAMP_TRUNC as DATE_TRUNC, handling timezone and type preservation.

    With a zone and a date-sized unit, emulates BigQuery's timezone-aware
    truncation; with `input_type_preserved`, casts the result back to the
    input's type (TIME inputs are shifted onto a dummy date first, since
    DATE_TRUNC needs a date component).
    """
    unit = unit_to_str(expression)
    zone = expression.args.get("zone")
    timestamp = expression.this
    date_unit = is_date_unit(unit)

    if date_unit and zone:
        # BigQuery's TIMESTAMP_TRUNC with timezone truncates in the target timezone and returns as UTC.
        # Double AT TIME ZONE needed for BigQuery compatibility:
        # 1. First AT TIME ZONE: ensures truncation happens in the target timezone
        # 2. Second AT TIME ZONE: converts the DATE result back to TIMESTAMPTZ (preserving time component)
        timestamp = exp.AtTimeZone(this=timestamp, zone=zone)
        result_sql = self.func("DATE_TRUNC", unit, timestamp)
        return self.sql(exp.AtTimeZone(this=result_sql, zone=zone))

    result = self.func("DATE_TRUNC", unit, timestamp)
    if expression.args.get("input_type_preserved"):
        if timestamp.type and timestamp.is_type(
            exp.DataType.Type.TIME, exp.DataType.Type.TIMETZ
        ):
            # TIME has no date component; add it onto 1970-01-01 so DATE_TRUNC
            # works, then cast back to the original TIME type
            dummy_date = exp.Cast(
                this=exp.Literal.string("1970-01-01"),
                to=exp.DataType(this=exp.DataType.Type.DATE),
            )
            date_time = exp.Add(this=dummy_date, expression=timestamp)
            result = self.func("DATE_TRUNC", unit, date_time)
            return self.sql(exp.Cast(this=result, to=timestamp.type))

        # Cast back to the input type unless the input is a DATE truncated to a
        # date-sized unit, which is already the right type
        if timestamp.is_type(*exp.DataType.TEMPORAL_TYPES) and not (
            date_unit and timestamp.is_type(exp.DataType.Type.DATE)
        ):
            return self.sql(exp.Cast(this=result, to=timestamp.type))

    return result
def trim_sql(self, expression: exp.Trim) -> str:
    """Render TRIM with VARCHAR-coerced arguments, casting the result back to BLOB when required.

    NOTE: uses in-place replace() so the casts are visible to super().trim_sql,
    which renders from the (mutated) expression tree.
    """
    expression.this.replace(_cast_to_varchar(expression.this))
    if expression.expression:
        expression.expression.replace(_cast_to_varchar(expression.expression))

    result_sql = super().trim_sql(expression)
    return _gen_with_cast_to_blob(self, expression, result_sql)
def round_sql(self, expression: exp.Round) -> str:
    """Render ROUND/ROUND_EVEN, normalizing the rounding-mode and scale arguments."""
    value = expression.this
    decimals = expression.args.get("decimals")
    rounding_mode = expression.args.get("truncate")

    # DuckDB requires the scale (decimals) argument to be an INT
    # Some dialects (e.g., Snowflake) allow non-integer scales and cast to an integer internally
    if decimals is not None and expression.args.get("casts_non_integer_decimals"):
        if not (decimals.is_int or decimals.is_type(*exp.DataType.INTEGER_TYPES)):
            decimals = exp.cast(decimals, exp.DataType.Type.INT)

    func_name = "ROUND"
    if rounding_mode:
        mode = rounding_mode.this
        # BigQuery uses ROUND_HALF_EVEN; Snowflake uses HALF_TO_EVEN
        if mode in ("ROUND_HALF_EVEN", "HALF_TO_EVEN"):
            func_name = "ROUND_EVEN"
            rounding_mode = None
        # BigQuery uses ROUND_HALF_AWAY_FROM_ZERO; Snowflake uses HALF_AWAY_FROM_ZERO
        elif mode in ("ROUND_HALF_AWAY_FROM_ZERO", "HALF_AWAY_FROM_ZERO"):
            rounding_mode = None

    return self.func(func_name, value, decimals, rounding_mode)
def approxquantile_sql(self, expression: exp.ApproxQuantile) -> str:
    """Render APPROX_QUANTILE, casting to DOUBLE when a real-valued result is expected."""
    quantile_sql = self.func(
        "APPROX_QUANTILE", expression.this, expression.args.get("quantile")
    )

    # DuckDB returns integers for APPROX_QUANTILE, cast to DOUBLE if the expected type is a real type
    if expression.is_type(*exp.DataType.REAL_TYPES):
        return f"CAST({quantile_sql} AS DOUBLE)"

    return quantile_sql
def approxquantiles_sql(self, expression: exp.ApproxQuantiles) -> str:
    """
    BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values
    dividing the input distribution into n equal-sized buckets.

    Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery
    does not document the specific algorithm used so results may differ. DuckDB does not
    support RESPECT NULLS.

    Falls back to the default rendering when the bucket count is not a positive
    integer literal.
    """
    this = expression.this
    if isinstance(this, exp.Distinct):
        # APPROX_QUANTILES requires 2 args and DISTINCT node grabs both
        if len(this.expressions) < 2:
            self.unsupported("APPROX_QUANTILES requires a bucket count argument")
            return self.function_fallback_sql(expression)
        # pop() detaches the bucket count from the DISTINCT node (mutates the AST)
        num_quantiles_expr = this.expressions[1].pop()
    else:
        num_quantiles_expr = expression.expression

    if not isinstance(num_quantiles_expr, exp.Literal) or not num_quantiles_expr.is_int:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    num_quantiles = t.cast(int, num_quantiles_expr.to_py())
    if num_quantiles <= 0:
        self.unsupported("APPROX_QUANTILES bucket count must be a positive integer")
        return self.function_fallback_sql(expression)

    # n+1 evenly spaced quantile points: 0/n, 1/n, ..., n/n
    # Decimal keeps the literals exact (e.g. 0.25 rather than a binary-float repr)
    quantiles = [
        exp.Literal.number(Decimal(i) / Decimal(num_quantiles))
        for i in range(num_quantiles + 1)
    ]

    return self.sql(
        exp.ApproxQuantile(this=this, quantile=exp.Array(expressions=quantiles))
    )
BigQuery's APPROX_QUANTILES(expr, n) returns an array of n+1 approximate quantile values dividing the input distribution into n equal-sized buckets.
Both BigQuery and DuckDB use approximate algorithms for quantile estimation, but BigQuery does not document the specific algorithm used so results may differ. DuckDB does not support RESPECT NULLS.
def bitwisenot_sql(self, expression: exp.BitwiseNot) -> str:
    """Render bitwise NOT (~), coercing the operand to BIT and casting back to BLOB when needed.

    NOTE: mutates `expression` in place (sets its type and replaces its "this" arg)
    so that _gen_with_cast_to_blob sees the updated node.
    """
    this = expression.this

    # Mark the result as BINARY when the operand is binary, so the final
    # _gen_with_cast_to_blob call knows to cast back
    if _is_binary(this):
        expression.type = exp.DataType.build("BINARY")

        arg = _cast_to_bit(this)

        # Parenthesize a negation so ~ binds to the whole operand
        if isinstance(this, exp.Neg):
            arg = exp.Paren(this=arg)

        expression.set("this", arg)

    result_sql = f"~{self.sql(expression, 'this')}"

    return _gen_with_cast_to_blob(self, expression, result_sql)
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_ONLY_LITERALS
- GROUPINGS_SEP
- INDEX_ON
- INOUT_SEPARATOR
- DIRECTED_JOINS
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_WITH_METHOD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- LIKE_PROPERTY_INSIDE_SCHEMA
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- TRY_SUPPORTED
- SUPPORTS_UESCAPE
- UNICODE_SUBSTITUTE
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- SUPPORTS_BETWEEN_FLAGS
- MATCH_AGAINST_TABLE_PREFIX
- DECLARE_DEFAULT_ASSIGNMENT
- UPDATE_STATEMENT_SUPPORTS_FROM
- STAR_EXCLUDE_REQUIRES_DERIVED_TABLE
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- pseudocolumn_sql
- columnposition_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- inoutcolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- triggerproperties_sql
- triggerreferencing_sql
- triggerevent_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- rollupindex_sql
- rollupproperty_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- booland_sql
- boolor_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- prewhere_sql
- where_sql
- partition_by_sql
- windowspec_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwiseor_sql
- bitwiserightshift_sql
- cast_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- eq_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- is_sql
- like_sql
- ilike_sql
- match_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- sub_sql
- trycast_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- mltranslate_sql
- mlforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- rand_sql
- changes_sql
- summarize_sql
- explodinggenerateseries_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- slice_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql
- directorystage_sql
- uuid_sql
- initcap_sql
- localtime_sql
- localtimestamp_sql
- weekstart_sql
- chr_sql
- block_sql
- storedprocedure_sql
- ifblock_sql
- whileblock_sql
- execute_sql
- executesql_sql