sqlglot.dialects.clickhouse
1from __future__ import annotations 2import typing as t 3import datetime 4from sqlglot import exp, generator, parser, tokens 5from sqlglot._typing import E 6from sqlglot.dialects.dialect import ( 7 Dialect, 8 NormalizationStrategy, 9 arg_max_or_min_no_count, 10 build_date_delta, 11 build_formatted_time, 12 inline_array_sql, 13 json_extract_segments, 14 json_path_key_only_name, 15 length_or_char_length_sql, 16 no_pivot_sql, 17 build_json_extract_path, 18 rename_func, 19 remove_from_array_using_filter, 20 sha256_sql, 21 strposition_sql, 22 var_map_sql, 23 timestamptrunc_sql, 24 unit_to_var, 25 trim_sql, 26) 27from sqlglot.generator import Generator 28from sqlglot.helper import is_int, seq_get 29from sqlglot.tokens import Token, TokenType 30from sqlglot.generator import unsupported_args 31 32DATEΤΙΜΕ_DELTA = t.Union[exp.DateAdd, exp.DateDiff, exp.DateSub, exp.TimestampSub, exp.TimestampAdd] 33 34 35def _build_datetime_format( 36 expr_type: t.Type[E], 37) -> t.Callable[[t.List], E]: 38 def _builder(args: t.List) -> E: 39 expr = build_formatted_time(expr_type, "clickhouse")(args) 40 41 timezone = seq_get(args, 2) 42 if timezone: 43 expr.set("zone", timezone) 44 45 return expr 46 47 return _builder 48 49 50def _unix_to_time_sql(self: ClickHouse.Generator, expression: exp.UnixToTime) -> str: 51 scale = expression.args.get("scale") 52 timestamp = expression.this 53 54 if scale in (None, exp.UnixToTime.SECONDS): 55 return self.func("fromUnixTimestamp", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 56 if scale == exp.UnixToTime.MILLIS: 57 return self.func("fromUnixTimestamp64Milli", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 58 if scale == exp.UnixToTime.MICROS: 59 return self.func("fromUnixTimestamp64Micro", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 60 if scale == exp.UnixToTime.NANOS: 61 return self.func("fromUnixTimestamp64Nano", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 62 63 return self.func( 64 "fromUnixTimestamp", 65 exp.cast( 66 exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT 67 ), 68 ) 69 70 71def _lower_func(sql: str) -> str: 72 index = sql.index("(") 73 return sql[:index].lower() + sql[index:] 74 75 76def _quantile_sql(self: ClickHouse.Generator, expression: exp.Quantile) -> str: 77 quantile = expression.args["quantile"] 78 args = f"({self.sql(expression, 'this')})" 79 80 if isinstance(quantile, exp.Array): 81 func = self.func("quantiles", *quantile) 82 else: 83 func = self.func("quantile", quantile) 84 85 return func + args 86 87 88def _build_count_if(args: t.List) -> exp.CountIf | exp.CombinedAggFunc: 89 if len(args) == 1: 90 return exp.CountIf(this=seq_get(args, 0)) 91 92 return exp.CombinedAggFunc(this="countIf", expressions=args) 93 94 95def _build_str_to_date(args: t.List) -> exp.Cast | exp.Anonymous: 96 if len(args) == 3: 97 return exp.Anonymous(this="STR_TO_DATE", expressions=args) 98 99 strtodate = exp.StrToDate.from_arg_list(args) 100 return exp.cast(strtodate, exp.DataType.build(exp.DataType.Type.DATETIME)) 101 102 103def _datetime_delta_sql(name: str) -> t.Callable[[Generator, DATEΤΙΜΕ_DELTA], str]: 104 def _delta_sql(self: Generator, expression: DATEΤΙΜΕ_DELTA) -> str: 105 if not expression.unit: 106 return rename_func(name)(self, expression) 107 108 return self.func( 109 name, 110 unit_to_var(expression), 111 expression.expression, 112 expression.this, 113 expression.args.get("zone"), 114 ) 115 116 return _delta_sql 117 118 119def _timestrtotime_sql(self: ClickHouse.Generator, expression: exp.TimeStrToTime): 120 ts = expression.this 121 122 tz = expression.args.get("zone") 123 if tz and isinstance(ts, exp.Literal): 124 # Clickhouse will not accept timestamps that include a UTC offset, so we must remove them. 125 # The first step to removing is parsing the string with `datetime.datetime.fromisoformat`. 126 # 127 # In python <3.11, `fromisoformat()` can only parse timestamps of millisecond (3 digit) 128 # or microsecond (6 digit) precision. It will error if passed any other number of fractional 129 # digits, so we extract the fractional seconds and pad to 6 digits before parsing. 130 ts_string = ts.name.strip() 131 132 # separate [date and time] from [fractional seconds and UTC offset] 133 ts_parts = ts_string.split(".") 134 if len(ts_parts) == 2: 135 # separate fractional seconds and UTC offset 136 offset_sep = "+" if "+" in ts_parts[1] else "-" 137 ts_frac_parts = ts_parts[1].split(offset_sep) 138 num_frac_parts = len(ts_frac_parts) 139 140 # pad to 6 digits if fractional seconds present 141 ts_frac_parts[0] = ts_frac_parts[0].ljust(6, "0") 142 ts_string = "".join( 143 [ 144 ts_parts[0], # date and time 145 ".", 146 ts_frac_parts[0], # fractional seconds 147 offset_sep if num_frac_parts > 1 else "", 148 ts_frac_parts[1] if num_frac_parts > 1 else "", # utc offset (if present) 149 ] 150 ) 151 152 # return literal with no timezone, eg turn '2020-01-01 12:13:14-08:00' into '2020-01-01 12:13:14' 153 # this is because Clickhouse encodes the timezone as a data type parameter and throws an error if 154 # it's part of the timestamp string 155 ts_without_tz = ( 156 datetime.datetime.fromisoformat(ts_string).replace(tzinfo=None).isoformat(sep=" ") 157 ) 158 ts = exp.Literal.string(ts_without_tz) 159 160 # Non-nullable DateTime64 with microsecond precision 161 expressions = [exp.DataTypeParam(this=tz)] if tz else [] 162 datatype = exp.DataType.build( 163 exp.DataType.Type.DATETIME64, 164 expressions=[exp.DataTypeParam(this=exp.Literal.number(6)), *expressions], 165 nullable=False, 166 ) 167 168 return self.sql(exp.cast(ts, datatype, dialect=self.dialect)) 169 170 171def _map_sql(self: ClickHouse.Generator, expression: exp.Map | exp.VarMap) -> str: 172 if not (expression.parent and expression.parent.arg_key == "settings"): 173 return _lower_func(var_map_sql(self, expression)) 174 175 keys = expression.args.get("keys") 176 values = expression.args.get("values") 177 178 if not isinstance(keys, exp.Array) or not isinstance(values, exp.Array): 179 self.unsupported("Cannot convert array columns into map.") 180 return "" 181 182 args = [] 183 for key, value in zip(keys.expressions, values.expressions): 184 args.append(f"{self.sql(key)}: {self.sql(value)}") 185 186 csv_args = ", ".join(args) 187 188 return f"{{{csv_args}}}" 189 190 191class ClickHouse(Dialect): 192 INDEX_OFFSET = 1 193 NORMALIZE_FUNCTIONS: bool | str = False 194 NULL_ORDERING = "nulls_are_last" 195 SUPPORTS_USER_DEFINED_TYPES = False 196 SAFE_DIVISION = True 197 LOG_BASE_FIRST: t.Optional[bool] = None 198 FORCE_EARLY_ALIAS_REF_EXPANSION = True 199 PRESERVE_ORIGINAL_NAMES = True 200 NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True 201 IDENTIFIERS_CAN_START_WITH_DIGIT = True 202 HEX_STRING_IS_INTEGER_TYPE = True 203 204 # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779 205 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE 206 207 UNESCAPED_SEQUENCES = { 208 "\\0": "\0", 209 } 210 211 CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"} 212 213 SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = { 214 exp.Except: False, 215 exp.Intersect: False, 216 exp.Union: None, 217 } 218 219 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 220 # Clickhouse allows VALUES to have an embedded structure e.g: 221 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 222 # In this case, we don't want to qualify the columns 223 values = expression.expressions[0].expressions 224 225 structure = ( 226 values[0] 227 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 228 else None 229 ) 230 if structure: 231 # Split each column definition into the column name e.g: 232 # 'person String, place String' -> ['person', 'place'] 233 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 234 column_aliases = [ 235 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 236 ] 237 else: 238 # Default column aliases in CH are "c1", "c2", etc. 239 column_aliases = [ 240 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 241 ] 242 243 return column_aliases 244 245 class Tokenizer(tokens.Tokenizer): 246 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 247 IDENTIFIERS = ['"', "`"] 248 IDENTIFIER_ESCAPES = ["\\"] 249 STRING_ESCAPES = ["'", "\\"] 250 BIT_STRINGS = [("0b", "")] 251 HEX_STRINGS = [("0x", ""), ("0X", "")] 252 HEREDOC_STRINGS = ["$"] 253 254 KEYWORDS = { 255 **tokens.Tokenizer.KEYWORDS, 256 ".:": TokenType.DOTCOLON, 257 "ATTACH": TokenType.COMMAND, 258 "DATE32": TokenType.DATE32, 259 "DATETIME64": TokenType.DATETIME64, 260 "DICTIONARY": TokenType.DICTIONARY, 261 "DYNAMIC": TokenType.DYNAMIC, 262 "ENUM8": TokenType.ENUM8, 263 "ENUM16": TokenType.ENUM16, 264 "EXCHANGE": TokenType.COMMAND, 265 "FINAL": TokenType.FINAL, 266 "FIXEDSTRING": TokenType.FIXEDSTRING, 267 "FLOAT32": TokenType.FLOAT, 268 "FLOAT64": TokenType.DOUBLE, 269 "GLOBAL": TokenType.GLOBAL, 270 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 271 "MAP": TokenType.MAP, 272 "NESTED": TokenType.NESTED, 273 "NOTHING": TokenType.NOTHING, 274 "SAMPLE": TokenType.TABLE_SAMPLE, 275 "TUPLE": TokenType.STRUCT, 276 "UINT16": TokenType.USMALLINT, 277 "UINT32": TokenType.UINT, 278 "UINT64": TokenType.UBIGINT, 279 "UINT8": TokenType.UTINYINT, 280 "IPV4": TokenType.IPV4, 281 "IPV6": TokenType.IPV6, 282 "POINT": TokenType.POINT, 283 "RING": TokenType.RING, 284 "LINESTRING": TokenType.LINESTRING, 285 "MULTILINESTRING": TokenType.MULTILINESTRING, 286 "POLYGON": TokenType.POLYGON, 287 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 288 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 289 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 290 "SYSTEM": TokenType.COMMAND, 291 "PREWHERE": TokenType.PREWHERE, 292 } 293 KEYWORDS.pop("/*+") 294 295 SINGLE_TOKENS = { 296 **tokens.Tokenizer.SINGLE_TOKENS, 297 "$": TokenType.HEREDOC_STRING, 298 } 299 300 class Parser(parser.Parser): 301 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 302 # * select x from t1 union all select x from t2 limit 1; 303 # * select x from t1 union all (select x from t2 limit 1); 304 MODIFIERS_ATTACHED_TO_SET_OP = False 305 INTERVAL_SPANS = False 306 OPTIONAL_ALIAS_TOKEN_CTE = False 307 JOINS_HAVE_EQUAL_PRECEDENCE = True 308 309 FUNCTIONS = { 310 **parser.Parser.FUNCTIONS, 311 "ANY": exp.AnyValue.from_arg_list, 312 "ARRAYSUM": exp.ArraySum.from_arg_list, 313 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 314 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 315 "COUNTIF": _build_count_if, 316 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 317 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 318 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 319 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 320 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 321 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 322 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 323 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 324 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 325 "JSONEXTRACTSTRING": build_json_extract_path( 326 exp.JSONExtractScalar, zero_based_indexing=False 327 ), 328 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 329 "L2Distance": exp.EuclideanDistance.from_arg_list, 330 "MAP": parser.build_var_map, 331 "MATCH": exp.RegexpLike.from_arg_list, 332 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 333 "RANDCANONICAL": exp.Rand.from_arg_list, 334 "STR_TO_DATE": _build_str_to_date, 335 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 336 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 337 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 338 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 339 "UNIQ": exp.ApproxDistinct.from_arg_list, 340 "XOR": lambda args: exp.Xor(expressions=args), 341 "MD5": exp.MD5Digest.from_arg_list, 342 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 343 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 344 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 345 "TOTYPENAME": exp.Typeof.from_arg_list, 346 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 347 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 348 } 349 FUNCTIONS.pop("TRANSFORM") 350 FUNCTIONS.pop("APPROX_TOP_SUM") 351 352 AGG_FUNCTIONS = { 353 "count", 354 "min", 355 "max", 356 "sum", 357 "avg", 358 "any", 359 "stddevPop", 360 "stddevSamp", 361 "varPop", 362 "varSamp", 363 "corr", 364 "covarPop", 365 "covarSamp", 366 "entropy", 367 "exponentialMovingAverage", 368 "intervalLengthSum", 369 "kolmogorovSmirnovTest", 370 "mannWhitneyUTest", 371 "median", 372 "rankCorr", 373 "sumKahan", 374 "studentTTest", 375 "welchTTest", 376 "anyHeavy", 377 "anyLast", 378 "boundingRatio", 379 "first_value", 380 "last_value", 381 "argMin", 382 "argMax", 383 "avgWeighted", 384 "topK", 385 "approx_top_sum", 386 "topKWeighted", 387 "deltaSum", 388 "deltaSumTimestamp", 389 "groupArray", 390 "groupArrayLast", 391 "groupUniqArray", 392 "groupArrayInsertAt", 393 "groupArrayMovingAvg", 394 "groupArrayMovingSum", 395 "groupArraySample", 396 "groupBitAnd", 397 "groupBitOr", 398 "groupBitXor", 399 "groupBitmap", 400 "groupBitmapAnd", 401 "groupBitmapOr", 402 "groupBitmapXor", 403 "sumWithOverflow", 404 "sumMap", 405 "minMap", 406 "maxMap", 407 "skewSamp", 408 "skewPop", 409 "kurtSamp", 410 "kurtPop", 411 "uniq", 412 "uniqExact", 413 "uniqCombined", 414 "uniqCombined64", 415 "uniqHLL12", 416 "uniqTheta", 417 "quantile", 418 "quantiles", 419 "quantileExact", 420 "quantilesExact", 421 "quantileExactLow", 422 "quantilesExactLow", 423 "quantileExactHigh", 424 "quantilesExactHigh", 425 "quantileExactWeighted", 426 "quantilesExactWeighted", 427 "quantileTiming", 428 "quantilesTiming", 429 "quantileTimingWeighted", 430 "quantilesTimingWeighted", 431 "quantileDeterministic", 432 "quantilesDeterministic", 433 "quantileTDigest", 434 "quantilesTDigest", 435 "quantileTDigestWeighted", 436 "quantilesTDigestWeighted", 437 "quantileBFloat16", 438 "quantilesBFloat16", 439 "quantileBFloat16Weighted", 440 "quantilesBFloat16Weighted", 441 "simpleLinearRegression", 442 "stochasticLinearRegression", 443 "stochasticLogisticRegression", 444 "categoricalInformationValue", 445 "contingency", 446 "cramersV", 447 "cramersVBiasCorrected", 448 "theilsU", 449 "maxIntersections", 450 "maxIntersectionsPosition", 451 "meanZTest", 452 "quantileInterpolatedWeighted", 453 "quantilesInterpolatedWeighted", 454 "quantileGK", 455 "quantilesGK", 456 "sparkBar", 457 "sumCount", 458 "largestTriangleThreeBuckets", 459 "histogram", 460 "sequenceMatch", 461 "sequenceCount", 462 "windowFunnel", 463 "retention", 464 "uniqUpTo", 465 "sequenceNextNode", 466 "exponentialTimeDecayedAvg", 467 } 468 469 AGG_FUNCTIONS_SUFFIXES = [ 470 "If", 471 "Array", 472 "ArrayIf", 473 "Map", 474 "SimpleState", 475 "State", 476 "Merge", 477 "MergeState", 478 "ForEach", 479 "Distinct", 480 "OrDefault", 481 "OrNull", 482 "Resample", 483 "ArgMin", 484 "ArgMax", 485 ] 486 487 FUNC_TOKENS = { 488 *parser.Parser.FUNC_TOKENS, 489 TokenType.AND, 490 TokenType.OR, 491 TokenType.SET, 492 } 493 494 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 495 496 ID_VAR_TOKENS = { 497 *parser.Parser.ID_VAR_TOKENS, 498 TokenType.LIKE, 499 } 500 501 AGG_FUNC_MAPPING = ( 502 lambda functions, suffixes: { 503 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 504 } 505 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 506 507 FUNCTION_PARSERS = { 508 **parser.Parser.FUNCTION_PARSERS, 509 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 510 "QUANTILE": lambda self: self._parse_quantile(), 511 "MEDIAN": lambda self: self._parse_quantile(), 512 "COLUMNS": lambda self: self._parse_columns(), 513 "TUPLE": lambda self: exp.Struct.from_arg_list(self._parse_function_args(alias=True)), 514 } 515 516 FUNCTION_PARSERS.pop("MATCH") 517 518 PROPERTY_PARSERS = { 519 **parser.Parser.PROPERTY_PARSERS, 520 "ENGINE": lambda self: self._parse_engine_property(), 521 } 522 PROPERTY_PARSERS.pop("DYNAMIC") 523 524 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 525 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 526 527 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 528 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 529 530 RANGE_PARSERS = { 531 **parser.Parser.RANGE_PARSERS, 532 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 533 } 534 535 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 536 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 537 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 538 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 539 540 JOIN_KINDS = { 541 *parser.Parser.JOIN_KINDS, 542 TokenType.ANY, 543 TokenType.ASOF, 544 TokenType.ARRAY, 545 } 546 547 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 548 TokenType.ANY, 549 TokenType.ARRAY, 550 TokenType.FINAL, 551 TokenType.FORMAT, 552 TokenType.SETTINGS, 553 } 554 555 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 556 TokenType.FORMAT, 557 } 558 559 LOG_DEFAULTS_TO_LN = True 560 561 QUERY_MODIFIER_PARSERS = { 562 **parser.Parser.QUERY_MODIFIER_PARSERS, 563 TokenType.SETTINGS: lambda self: ( 564 "settings", 565 self._advance() or self._parse_csv(self._parse_assignment), 566 ), 567 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 568 } 569 570 CONSTRAINT_PARSERS = { 571 **parser.Parser.CONSTRAINT_PARSERS, 572 "INDEX": lambda self: self._parse_index_constraint(), 573 "CODEC": lambda self: self._parse_compress(), 574 } 575 576 ALTER_PARSERS = { 577 **parser.Parser.ALTER_PARSERS, 578 "REPLACE": lambda self: self._parse_alter_table_replace(), 579 } 580 581 SCHEMA_UNNAMED_CONSTRAINTS = { 582 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 583 "INDEX", 584 } 585 586 PLACEHOLDER_PARSERS = { 587 **parser.Parser.PLACEHOLDER_PARSERS, 588 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 589 } 590 591 def _parse_engine_property(self) -> exp.EngineProperty: 592 self._match(TokenType.EQ) 593 return self.expression( 594 exp.EngineProperty, 595 this=self._parse_field(any_token=True, anonymous_func=True), 596 ) 597 598 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 599 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 600 return self._parse_lambda() 601 602 def _parse_types( 603 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 604 ) -> t.Optional[exp.Expression]: 605 dtype = super()._parse_types( 606 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 607 ) 608 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 609 # Mark every type as non-nullable which is ClickHouse's default, unless it's 610 # already marked as nullable. This marker helps us transpile types from other 611 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 612 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 613 # fail in ClickHouse without the `Nullable` type constructor. 614 dtype.set("nullable", False) 615 616 return dtype 617 618 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 619 index = self._index 620 this = self._parse_bitwise() 621 if self._match(TokenType.FROM): 622 self._retreat(index) 623 return super()._parse_extract() 624 625 # We return Anonymous here because extract and regexpExtract have different semantics, 626 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 627 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 628 # 629 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 630 self._match(TokenType.COMMA) 631 return self.expression( 632 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 633 ) 634 635 def _parse_assignment(self) -> t.Optional[exp.Expression]: 636 this = super()._parse_assignment() 637 638 if self._match(TokenType.PLACEHOLDER): 639 return self.expression( 640 exp.If, 641 this=this, 642 true=self._parse_assignment(), 643 false=self._match(TokenType.COLON) and self._parse_assignment(), 644 ) 645 646 return this 647 648 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 649 """ 650 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 651 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 652 """ 653 index = self._index 654 655 this = self._parse_id_var() 656 self._match(TokenType.COLON) 657 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 658 self._match_text_seq("IDENTIFIER") and "Identifier" 659 ) 660 661 if not kind: 662 self._retreat(index) 663 return None 664 elif not self._match(TokenType.R_BRACE): 665 self.raise_error("Expecting }") 666 667 if isinstance(this, exp.Identifier) and not this.quoted: 668 this = exp.var(this.name) 669 670 return self.expression(exp.Placeholder, this=this, kind=kind) 671 672 def _parse_bracket( 673 self, this: t.Optional[exp.Expression] = None 674 ) -> t.Optional[exp.Expression]: 675 l_brace = self._match(TokenType.L_BRACE, advance=False) 676 bracket = super()._parse_bracket(this) 677 678 if l_brace and isinstance(bracket, exp.Struct): 679 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 680 for expression in bracket.expressions: 681 if not isinstance(expression, exp.PropertyEQ): 682 break 683 684 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 685 varmap.args["values"].append("expressions", expression.expression) 686 687 return varmap 688 689 return bracket 690 691 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 692 this = super()._parse_in(this) 693 this.set("is_global", is_global) 694 return this 695 696 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 697 is_negated = self._match(TokenType.NOT) 698 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 699 return self.expression(exp.Not, this=this) if is_negated else this 700 701 def _parse_table( 702 self, 703 schema: bool = False, 704 joins: bool = False, 705 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 706 parse_bracket: bool = False, 707 is_db_reference: bool = False, 708 parse_partition: bool = False, 709 consume_pipe: bool = False, 710 ) -> t.Optional[exp.Expression]: 711 this = super()._parse_table( 712 schema=schema, 713 joins=joins, 714 alias_tokens=alias_tokens, 715 parse_bracket=parse_bracket, 716 is_db_reference=is_db_reference, 717 ) 718 719 if isinstance(this, exp.Table): 720 inner = this.this 721 alias = this.args.get("alias") 722 723 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 724 alias.set("columns", [exp.to_identifier("generate_series")]) 725 726 if self._match(TokenType.FINAL): 727 this = self.expression(exp.Final, this=this) 728 729 return this 730 731 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 732 return super()._parse_position(haystack_first=True) 733 734 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 735 def _parse_cte(self) -> t.Optional[exp.CTE]: 736 # WITH <identifier> AS <subquery expression> 737 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 738 739 if not cte: 740 # WITH <expression> AS <identifier> 741 cte = self.expression( 742 exp.CTE, 743 this=self._parse_assignment(), 744 alias=self._parse_table_alias(), 745 scalar=True, 746 ) 747 748 return cte 749 750 def _parse_join_parts( 751 self, 752 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 753 is_global = self._match(TokenType.GLOBAL) and self._prev 754 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 755 756 if kind_pre: 757 kind = self._match_set(self.JOIN_KINDS) and self._prev 758 side = self._match_set(self.JOIN_SIDES) and self._prev 759 return is_global, side, kind 760 761 return ( 762 is_global, 763 self._match_set(self.JOIN_SIDES) and self._prev, 764 self._match_set(self.JOIN_KINDS) and self._prev, 765 ) 766 767 def _parse_join( 768 self, skip_join_token: bool = False, parse_bracket: bool = False 769 ) -> t.Optional[exp.Join]: 770 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 771 if join: 772 join.set("global", join.args.pop("method", None)) 773 774 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 775 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 776 if join.kind == "ARRAY": 777 for table in join.find_all(exp.Table): 778 table.replace(table.to_column()) 779 780 return join 781 782 def _parse_function( 783 self, 784 functions: t.Optional[t.Dict[str, t.Callable]] = None, 785 anonymous: bool = False, 786 optional_parens: bool = True, 787 any_token: bool = False, 788 ) -> t.Optional[exp.Expression]: 789 expr = super()._parse_function( 790 functions=functions, 791 anonymous=anonymous, 792 optional_parens=optional_parens, 793 any_token=any_token, 794 ) 795 796 func = expr.this if isinstance(expr, exp.Window) else expr 797 798 # Aggregate functions can be split in 2 parts: <func_name><suffix> 799 parts = ( 800 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 801 ) 802 803 if parts: 804 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 805 params = self._parse_func_params(anon_func) 806 807 kwargs = { 808 "this": anon_func.this, 809 "expressions": anon_func.expressions, 810 } 811 if parts[1]: 812 exp_class: t.Type[exp.Expression] = ( 813 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 814 ) 815 else: 816 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 817 818 kwargs["exp_class"] = exp_class 819 if params: 820 kwargs["params"] = params 821 822 func = self.expression(**kwargs) 823 824 if isinstance(expr, exp.Window): 825 # The window's func was parsed as Anonymous in base parser, fix its 826 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 827 expr.set("this", func) 828 elif params: 829 # Params have blocked super()._parse_function() from parsing the following window 830 # (if that exists) as they're standing between the function call and the window spec 831 expr = self._parse_window(func) 832 else: 833 expr = func 834 835 return expr 836 837 def _parse_func_params( 838 self, this: t.Optional[exp.Func] = None 839 ) -> t.Optional[t.List[exp.Expression]]: 840 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 841 return self._parse_csv(self._parse_lambda) 842 843 if self._match(TokenType.L_PAREN): 844 params = self._parse_csv(self._parse_lambda) 845 self._match_r_paren(this) 846 return params 847 848 return None 849 850 def _parse_quantile(self) -> exp.Quantile: 851 this = self._parse_lambda() 852 params = self._parse_func_params() 853 if params: 854 return self.expression(exp.Quantile, this=params[0], quantile=this) 855 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 856 857 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 858 return super()._parse_wrapped_id_vars(optional=True) 859 860 def _parse_primary_key( 861 self, wrapped_optional: bool = False, in_props: bool = False 862 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 863 return super()._parse_primary_key( 864 wrapped_optional=wrapped_optional or in_props, in_props=in_props 865 ) 866 867 def _parse_on_property(self) -> t.Optional[exp.Expression]: 868 index = self._index 869 if self._match_text_seq("CLUSTER"): 870 this = self._parse_string() or self._parse_id_var() 871 if this: 872 return self.expression(exp.OnCluster, this=this) 873 else: 874 self._retreat(index) 875 return None 876 877 def _parse_index_constraint( 878 self, kind: t.Optional[str] = None 879 ) -> exp.IndexColumnConstraint: 880 # INDEX name1 expr TYPE type1(args) GRANULARITY value 881 this = self._parse_id_var() 882 expression = self._parse_assignment() 883 884 index_type = self._match_text_seq("TYPE") and ( 885 self._parse_function() or self._parse_var() 886 ) 887 888 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 889 890 return self.expression( 891 exp.IndexColumnConstraint, 892 this=this, 893 expression=expression, 894 index_type=index_type, 895 granularity=granularity, 896 ) 897 898 def _parse_partition(self) -> t.Optional[exp.Partition]: 899 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 900 if not self._match(TokenType.PARTITION): 901 return None 902 903 if self._match_text_seq("ID"): 904 # Corresponds to the PARTITION ID <string_value> syntax 905 expressions: t.List[exp.Expression] = [ 906 self.expression(exp.PartitionId, this=self._parse_string()) 907 ] 908 else: 909 expressions = self._parse_expressions() 910 911 return self.expression(exp.Partition, expressions=expressions) 912 913 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 914 partition = self._parse_partition() 915 916 if not partition or not self._match(TokenType.FROM): 917 return None 918 919 return self.expression( 920 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 921 ) 922 923 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 924 if not self._match_text_seq("PROJECTION"): 925 return None 926 927 return self.expression( 928 exp.ProjectionDef, 929 this=self._parse_id_var(), 930 expression=self._parse_wrapped(self._parse_statement), 931 ) 932 933 def _parse_constraint(self) -> t.Optional[exp.Expression]: 934 return super()._parse_constraint() or self._parse_projection_def() 935 936 def _parse_alias( 937 self, this: t.Optional[exp.Expression], explicit: bool = False 938 ) -> t.Optional[exp.Expression]: 939 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 940 # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias 941 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 942 return this 943 944 return super()._parse_alias(this=this, explicit=explicit) 945 946 def _parse_expression(self) -> t.Optional[exp.Expression]: 947 this = super()._parse_expression() 948 949 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 950 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 951 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 952 self._match(TokenType.R_PAREN) 953 954 return this 955 956 def _parse_columns(self) -> exp.Expression: 957 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 958 959 while self._next and self._match_text_seq(")", "APPLY", "("): 960 self._match(TokenType.R_PAREN) 961 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 962 return this 963 964 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 965 value = super()._parse_value(values=values) 966 if not value: 967 return None 968 969 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 970 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 971 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 972 # but the final result is not altered by the extra parentheses. 973 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 974 expressions = value.expressions 975 if values and not isinstance(expressions[-1], exp.Tuple): 976 value.set( 977 "expressions", 978 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 979 ) 980 981 return value 982 983 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 984 # ClickHouse allows custom expressions as partition key 985 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 986 return self.expression( 987 exp.PartitionedByProperty, 988 this=self._parse_assignment(), 989 ) 990 991 class Generator(generator.Generator): 992 QUERY_HINTS = False 993 STRUCT_DELIMITER = ("(", ")") 994 NVL2_SUPPORTED = False 995 TABLESAMPLE_REQUIRES_PARENS = False 996 TABLESAMPLE_SIZE_IS_ROWS = False 997 TABLESAMPLE_KEYWORDS = "SAMPLE" 998 LAST_DAY_SUPPORTS_DATE_PART = False 999 CAN_IMPLEMENT_ARRAY_ANY = True 1000 SUPPORTS_TO_NUMBER = False 1001 JOIN_HINTS = False 1002 TABLE_HINTS = False 1003 GROUPINGS_SEP = "" 1004 SET_OP_MODIFIERS = False 1005 ARRAY_SIZE_NAME = "LENGTH" 1006 WRAP_DERIVED_VALUES = False 1007 1008 STRING_TYPE_MAPPING = { 1009 exp.DataType.Type.BLOB: "String", 1010 exp.DataType.Type.CHAR: "String", 1011 exp.DataType.Type.LONGBLOB: "String", 1012 exp.DataType.Type.LONGTEXT: "String", 1013 exp.DataType.Type.MEDIUMBLOB: "String", 1014 exp.DataType.Type.MEDIUMTEXT: "String", 1015 exp.DataType.Type.TINYBLOB: "String", 1016 exp.DataType.Type.TINYTEXT: "String", 1017 exp.DataType.Type.TEXT: "String", 1018 exp.DataType.Type.VARBINARY: "String", 1019 exp.DataType.Type.VARCHAR: "String", 1020 } 1021 1022 SUPPORTED_JSON_PATH_PARTS = { 1023 exp.JSONPathKey, 1024 exp.JSONPathRoot, 1025 exp.JSONPathSubscript, 1026 } 1027 1028 TYPE_MAPPING = { 1029 **generator.Generator.TYPE_MAPPING, 1030 **STRING_TYPE_MAPPING, 1031 exp.DataType.Type.ARRAY: "Array", 1032 exp.DataType.Type.BOOLEAN: "Bool", 1033 exp.DataType.Type.BIGINT: "Int64", 1034 exp.DataType.Type.DATE32: "Date32", 1035 exp.DataType.Type.DATETIME: "DateTime", 1036 exp.DataType.Type.DATETIME2: "DateTime", 1037 exp.DataType.Type.SMALLDATETIME: "DateTime", 1038 exp.DataType.Type.DATETIME64: "DateTime64", 1039 exp.DataType.Type.DECIMAL: "Decimal", 1040 exp.DataType.Type.DECIMAL32: "Decimal32", 1041 exp.DataType.Type.DECIMAL64: "Decimal64", 1042 exp.DataType.Type.DECIMAL128: "Decimal128", 1043 exp.DataType.Type.DECIMAL256: "Decimal256", 1044 exp.DataType.Type.TIMESTAMP: "DateTime", 1045 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1046 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1047 exp.DataType.Type.DOUBLE: "Float64", 1048 exp.DataType.Type.ENUM: "Enum", 1049 exp.DataType.Type.ENUM8: "Enum8", 1050 exp.DataType.Type.ENUM16: "Enum16", 1051 exp.DataType.Type.FIXEDSTRING: "FixedString", 1052 exp.DataType.Type.FLOAT: "Float32", 1053 exp.DataType.Type.INT: "Int32", 1054 exp.DataType.Type.MEDIUMINT: "Int32", 1055 exp.DataType.Type.INT128: "Int128", 1056 exp.DataType.Type.INT256: "Int256", 1057 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1058 exp.DataType.Type.MAP: "Map", 1059 exp.DataType.Type.NESTED: "Nested", 1060 exp.DataType.Type.NOTHING: "Nothing", 1061 exp.DataType.Type.SMALLINT: "Int16", 1062 exp.DataType.Type.STRUCT: "Tuple", 1063 exp.DataType.Type.TINYINT: "Int8", 1064 exp.DataType.Type.UBIGINT: "UInt64", 1065 exp.DataType.Type.UINT: "UInt32", 1066 exp.DataType.Type.UINT128: "UInt128", 1067 exp.DataType.Type.UINT256: "UInt256", 1068 exp.DataType.Type.USMALLINT: "UInt16", 1069 exp.DataType.Type.UTINYINT: "UInt8", 1070 exp.DataType.Type.IPV4: "IPv4", 1071 exp.DataType.Type.IPV6: "IPv6", 1072 exp.DataType.Type.POINT: "Point", 1073 exp.DataType.Type.RING: "Ring", 1074 exp.DataType.Type.LINESTRING: "LineString", 1075 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1076 exp.DataType.Type.POLYGON: "Polygon", 1077 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1078 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1079 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1080 exp.DataType.Type.DYNAMIC: "Dynamic", 1081 } 1082 1083 TRANSFORMS = { 1084 **generator.Generator.TRANSFORMS, 1085 exp.AnyValue: rename_func("any"), 1086 exp.ApproxDistinct: rename_func("uniq"), 1087 exp.ArrayConcat: rename_func("arrayConcat"), 1088 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1089 exp.ArrayRemove: remove_from_array_using_filter, 1090 exp.ArrayReverse: rename_func("arrayReverse"), 1091 exp.ArraySlice: rename_func("arraySlice"), 1092 exp.ArraySum: rename_func("arraySum"), 1093 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1094 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1095 exp.Array: inline_array_sql, 1096 exp.CastToStrType: rename_func("CAST"), 1097 exp.CountIf: rename_func("countIf"), 1098 exp.CosineDistance: rename_func("cosineDistance"), 1099 exp.CompressColumnConstraint: lambda self, 1100 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1101 exp.ComputedColumnConstraint: lambda self, 1102 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1103 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1104 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1105 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1106 exp.DateStrToDate: rename_func("toDate"), 1107 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1108 exp.Explode: rename_func("arrayJoin"), 1109 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1110 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1111 exp.IsNan: rename_func("isNaN"), 1112 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1113 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1114 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1115 exp.JSONPathKey: json_path_key_only_name, 1116 exp.JSONPathRoot: lambda *_: "", 1117 exp.Length: length_or_char_length_sql, 1118 exp.Map: _map_sql, 1119 exp.Median: rename_func("median"), 1120 exp.Nullif: rename_func("nullIf"), 1121 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1122 exp.Pivot: no_pivot_sql, 1123 exp.Quantile: _quantile_sql, 1124 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1125 exp.Rand: rename_func("randCanonical"), 1126 exp.StartsWith: rename_func("startsWith"), 1127 exp.Struct: rename_func("tuple"), 1128 exp.EndsWith: rename_func("endsWith"), 1129 exp.EuclideanDistance: rename_func("L2Distance"), 1130 exp.StrPosition: lambda self, e: strposition_sql( 1131 self, 1132 e, 1133 func_name="POSITION", 1134 supports_position=True, 1135 use_ansi_position=False, 1136 ), 1137 exp.TimeToStr: lambda self, e: self.func( 1138 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1139 ), 1140 exp.TimeStrToTime: _timestrtotime_sql, 1141 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1142 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1143 exp.Typeof: rename_func("toTypeName"), 1144 exp.VarMap: _map_sql, 1145 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1146 exp.MD5Digest: rename_func("MD5"), 1147 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1148 exp.SHA: rename_func("SHA1"), 1149 exp.SHA2: sha256_sql, 1150 exp.UnixToTime: _unix_to_time_sql, 1151 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1152 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1153 exp.Variance: rename_func("varSamp"), 1154 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1155 exp.Stddev: rename_func("stddevSamp"), 1156 exp.Chr: rename_func("CHAR"), 1157 exp.Lag: lambda self, e: self.func( 1158 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1159 ), 1160 exp.Lead: lambda self, e: self.func( 1161 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1162 ), 1163 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1164 rename_func("editDistance") 1165 ), 1166 exp.ParseDatetime: rename_func("parseDateTime"), 1167 } 1168 1169 PROPERTIES_LOCATION = { 1170 **generator.Generator.PROPERTIES_LOCATION, 1171 exp.OnCluster: exp.Properties.Location.POST_NAME, 1172 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1173 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1174 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1175 } 1176 1177 # There's no list in docs, but it can be found in Clickhouse code 1178 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1179 ON_CLUSTER_TARGETS = { 1180 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1181 "DATABASE", 1182 "TABLE", 1183 "VIEW", 1184 "DICTIONARY", 1185 "INDEX", 1186 "FUNCTION", 1187 "NAMED COLLECTION", 1188 } 1189 1190 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1191 NON_NULLABLE_TYPES = { 1192 exp.DataType.Type.ARRAY, 1193 exp.DataType.Type.MAP, 1194 exp.DataType.Type.STRUCT, 1195 exp.DataType.Type.POINT, 1196 exp.DataType.Type.RING, 1197 exp.DataType.Type.LINESTRING, 1198 exp.DataType.Type.MULTILINESTRING, 1199 exp.DataType.Type.POLYGON, 1200 exp.DataType.Type.MULTIPOLYGON, 1201 } 1202 1203 def offset_sql(self, expression: exp.Offset) -> str: 1204 offset = super().offset_sql(expression) 1205 1206 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1207 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1208 parent = expression.parent 1209 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1210 offset = f"{offset} ROWS" 1211 1212 return offset 1213 1214 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1215 strtodate_sql = self.function_fallback_sql(expression) 1216 1217 if not isinstance(expression.parent, exp.Cast): 1218 # StrToDate returns DATEs in other dialects (eg. postgres), so 1219 # this branch aims to improve the transpilation to clickhouse 1220 return self.cast_sql(exp.cast(expression, "DATE")) 1221 1222 return strtodate_sql 1223 1224 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1225 this = expression.this 1226 1227 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1228 return self.sql(this) 1229 1230 return super().cast_sql(expression, safe_prefix=safe_prefix) 1231 1232 def trycast_sql(self, expression: exp.TryCast) -> str: 1233 dtype = expression.to 1234 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1235 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1236 dtype.set("nullable", True) 1237 1238 return super().cast_sql(expression) 1239 1240 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1241 this = self.json_path_part(expression.this) 1242 return str(int(this) + 1) if is_int(this) else this 1243 1244 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1245 return f"AS {self.sql(expression, 'this')}" 1246 1247 def _any_to_has( 1248 self, 1249 expression: exp.EQ | exp.NEQ, 1250 default: t.Callable[[t.Any], str], 1251 prefix: str = "", 1252 ) -> str: 1253 if isinstance(expression.left, exp.Any): 1254 arr = expression.left 1255 this = expression.right 1256 elif isinstance(expression.right, exp.Any): 1257 arr = expression.right 1258 this = expression.left 1259 else: 1260 return default(expression) 1261 1262 return prefix + self.func("has", arr.this.unnest(), this) 1263 1264 def eq_sql(self, expression: exp.EQ) -> str: 1265 return self._any_to_has(expression, super().eq_sql) 1266 1267 def neq_sql(self, expression: exp.NEQ) -> str: 1268 return self._any_to_has(expression, super().neq_sql, "NOT ") 1269 1270 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1271 # Manually add a flag to make the search case-insensitive 1272 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1273 return self.func("match", expression.this, regex) 1274 1275 def datatype_sql(self, expression: exp.DataType) -> str: 1276 # String is the standard ClickHouse type, every other variant is just an alias. 1277 # Additionally, any supplied length parameter will be ignored. 1278 # 1279 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1280 if expression.this in self.STRING_TYPE_MAPPING: 1281 dtype = "String" 1282 else: 1283 dtype = super().datatype_sql(expression) 1284 1285 # This section changes the type to `Nullable(...)` if the following conditions hold: 1286 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1287 # and change their semantics 1288 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1289 # constraint: "Type of Map key must be a type, that can be represented by integer or 1290 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1291 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1292 parent = expression.parent 1293 nullable = expression.args.get("nullable") 1294 if nullable is True or ( 1295 nullable is None 1296 and not ( 1297 isinstance(parent, exp.DataType) 1298 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1299 and expression.index in (None, 0) 1300 ) 1301 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1302 ): 1303 dtype = f"Nullable({dtype})" 1304 1305 return dtype 1306 1307 def cte_sql(self, expression: exp.CTE) -> str: 1308 if expression.args.get("scalar"): 1309 this = self.sql(expression, "this") 1310 alias = self.sql(expression, "alias") 1311 return f"{this} AS {alias}" 1312 1313 return super().cte_sql(expression) 1314 1315 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1316 return super().after_limit_modifiers(expression) + [ 1317 ( 1318 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1319 if expression.args.get("settings") 1320 else "" 1321 ), 1322 ( 1323 self.seg("FORMAT ") + self.sql(expression, "format") 1324 if expression.args.get("format") 1325 else "" 1326 ), 1327 ] 1328 1329 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1330 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1331 1332 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1333 return f"ON CLUSTER {self.sql(expression, 'this')}" 1334 1335 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1336 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1337 exp.Properties.Location.POST_NAME 1338 ): 1339 this_name = self.sql( 1340 expression.this if isinstance(expression.this, exp.Schema) else expression, 1341 "this", 1342 ) 1343 this_properties = " ".join( 1344 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1345 ) 1346 this_schema = self.schema_columns_sql(expression.this) 1347 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1348 1349 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1350 1351 return super().createable_sql(expression, locations) 1352 1353 def create_sql(self, expression: exp.Create) -> str: 1354 # The comment property comes last in CTAS statements, i.e. after the query 1355 query = expression.expression 1356 if isinstance(query, exp.Query): 1357 comment_prop = expression.find(exp.SchemaCommentProperty) 1358 if comment_prop: 1359 comment_prop.pop() 1360 query.replace(exp.paren(query)) 1361 else: 1362 comment_prop = None 1363 1364 create_sql = super().create_sql(expression) 1365 1366 comment_sql = self.sql(comment_prop) 1367 comment_sql = f" {comment_sql}" if comment_sql else "" 1368 1369 return f"{create_sql}{comment_sql}" 1370 1371 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1372 this = self.indent(self.sql(expression, "this")) 1373 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1374 1375 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1376 this = self.sql(expression, "this") 1377 this = f" {this}" if this else "" 1378 expr = self.sql(expression, "expression") 1379 expr = f" {expr}" if expr else "" 1380 index_type = self.sql(expression, "index_type") 1381 index_type = f" TYPE {index_type}" if index_type else "" 1382 granularity = self.sql(expression, "granularity") 1383 granularity = f" GRANULARITY {granularity}" if granularity else "" 1384 1385 return f"INDEX{this}{expr}{index_type}{granularity}" 1386 1387 def partition_sql(self, expression: exp.Partition) -> str: 1388 return f"PARTITION {self.expressions(expression, flat=True)}" 1389 1390 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1391 return f"ID {self.sql(expression.this)}" 1392 1393 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1394 return ( 1395 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1396 ) 1397 1398 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1399 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1400 1401 def is_sql(self, expression: exp.Is) -> str: 1402 is_sql = super().is_sql(expression) 1403 1404 if isinstance(expression.parent, exp.Not): 1405 # value IS NOT NULL -> NOT (value IS NULL) 1406 is_sql = self.wrap(is_sql) 1407 1408 return is_sql 1409 1410 def in_sql(self, expression: exp.In) -> str: 1411 in_sql = super().in_sql(expression) 1412 1413 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1414 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1415 1416 return in_sql 1417 1418 def not_sql(self, expression: exp.Not) -> str: 1419 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1420 # let `GLOBAL IN` child interpose `NOT` 1421 return self.sql(expression, "this") 1422 1423 return super().not_sql(expression) 1424 1425 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1426 # If the VALUES clause contains tuples of expressions, we need to treat it 1427 # as a table since Clickhouse will automatically alias it as such. 1428 alias = expression.args.get("alias") 1429 1430 if alias and alias.args.get("columns") and expression.expressions: 1431 values = expression.expressions[0].expressions 1432 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1433 else: 1434 values_as_table = True 1435 1436 return super().values_sql(expression, values_as_table=values_as_table)
192class ClickHouse(Dialect): 193 INDEX_OFFSET = 1 194 NORMALIZE_FUNCTIONS: bool | str = False 195 NULL_ORDERING = "nulls_are_last" 196 SUPPORTS_USER_DEFINED_TYPES = False 197 SAFE_DIVISION = True 198 LOG_BASE_FIRST: t.Optional[bool] = None 199 FORCE_EARLY_ALIAS_REF_EXPANSION = True 200 PRESERVE_ORIGINAL_NAMES = True 201 NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True 202 IDENTIFIERS_CAN_START_WITH_DIGIT = True 203 HEX_STRING_IS_INTEGER_TYPE = True 204 205 # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779 206 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE 207 208 UNESCAPED_SEQUENCES = { 209 "\\0": "\0", 210 } 211 212 CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"} 213 214 SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = { 215 exp.Except: False, 216 exp.Intersect: False, 217 exp.Union: None, 218 } 219 220 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 221 # Clickhouse allows VALUES to have an embedded structure e.g: 222 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 223 # In this case, we don't want to qualify the columns 224 values = expression.expressions[0].expressions 225 226 structure = ( 227 values[0] 228 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 229 else None 230 ) 231 if structure: 232 # Split each column definition into the column name e.g: 233 # 'person String, place String' -> ['person', 'place'] 234 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 235 column_aliases = [ 236 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 237 ] 238 else: 239 # Default column aliases in CH are "c1", "c2", etc. 240 column_aliases = [ 241 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 242 ] 243 244 return column_aliases 245 246 class Tokenizer(tokens.Tokenizer): 247 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 248 IDENTIFIERS = ['"', "`"] 249 IDENTIFIER_ESCAPES = ["\\"] 250 STRING_ESCAPES = ["'", "\\"] 251 BIT_STRINGS = [("0b", "")] 252 HEX_STRINGS = [("0x", ""), ("0X", "")] 253 HEREDOC_STRINGS = ["$"] 254 255 KEYWORDS = { 256 **tokens.Tokenizer.KEYWORDS, 257 ".:": TokenType.DOTCOLON, 258 "ATTACH": TokenType.COMMAND, 259 "DATE32": TokenType.DATE32, 260 "DATETIME64": TokenType.DATETIME64, 261 "DICTIONARY": TokenType.DICTIONARY, 262 "DYNAMIC": TokenType.DYNAMIC, 263 "ENUM8": TokenType.ENUM8, 264 "ENUM16": TokenType.ENUM16, 265 "EXCHANGE": TokenType.COMMAND, 266 "FINAL": TokenType.FINAL, 267 "FIXEDSTRING": TokenType.FIXEDSTRING, 268 "FLOAT32": TokenType.FLOAT, 269 "FLOAT64": TokenType.DOUBLE, 270 "GLOBAL": TokenType.GLOBAL, 271 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 272 "MAP": TokenType.MAP, 273 "NESTED": TokenType.NESTED, 274 "NOTHING": TokenType.NOTHING, 275 "SAMPLE": TokenType.TABLE_SAMPLE, 276 "TUPLE": TokenType.STRUCT, 277 "UINT16": TokenType.USMALLINT, 278 "UINT32": TokenType.UINT, 279 "UINT64": TokenType.UBIGINT, 280 "UINT8": TokenType.UTINYINT, 281 "IPV4": TokenType.IPV4, 282 "IPV6": TokenType.IPV6, 283 "POINT": TokenType.POINT, 284 "RING": TokenType.RING, 285 "LINESTRING": TokenType.LINESTRING, 286 "MULTILINESTRING": TokenType.MULTILINESTRING, 287 "POLYGON": TokenType.POLYGON, 288 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 289 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 290 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 291 "SYSTEM": TokenType.COMMAND, 292 "PREWHERE": TokenType.PREWHERE, 293 } 294 KEYWORDS.pop("/*+") 295 296 SINGLE_TOKENS = { 297 **tokens.Tokenizer.SINGLE_TOKENS, 298 "$": TokenType.HEREDOC_STRING, 299 } 300 301 class Parser(parser.Parser): 302 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 303 # * select x from t1 union all select x from t2 limit 1; 304 # * select x from t1 union all (select x from t2 limit 1); 305 MODIFIERS_ATTACHED_TO_SET_OP = False 306 INTERVAL_SPANS = False 307 OPTIONAL_ALIAS_TOKEN_CTE = False 308 JOINS_HAVE_EQUAL_PRECEDENCE = True 309 310 FUNCTIONS = { 311 **parser.Parser.FUNCTIONS, 312 "ANY": exp.AnyValue.from_arg_list, 313 "ARRAYSUM": exp.ArraySum.from_arg_list, 314 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 315 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 316 "COUNTIF": _build_count_if, 317 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 318 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 319 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 320 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 321 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 322 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 323 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 324 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 325 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 326 "JSONEXTRACTSTRING": build_json_extract_path( 327 exp.JSONExtractScalar, zero_based_indexing=False 328 ), 329 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 330 "L2Distance": exp.EuclideanDistance.from_arg_list, 331 "MAP": parser.build_var_map, 332 "MATCH": exp.RegexpLike.from_arg_list, 333 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 334 "RANDCANONICAL": exp.Rand.from_arg_list, 335 "STR_TO_DATE": _build_str_to_date, 336 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 337 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 338 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 339 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 340 "UNIQ": exp.ApproxDistinct.from_arg_list, 341 "XOR": lambda args: exp.Xor(expressions=args), 342 "MD5": exp.MD5Digest.from_arg_list, 343 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 344 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 345 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 346 "TOTYPENAME": exp.Typeof.from_arg_list, 347 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 348 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 349 } 350 FUNCTIONS.pop("TRANSFORM") 351 FUNCTIONS.pop("APPROX_TOP_SUM") 352 353 AGG_FUNCTIONS = { 354 "count", 355 "min", 356 "max", 357 "sum", 358 "avg", 359 "any", 360 "stddevPop", 361 "stddevSamp", 362 "varPop", 363 "varSamp", 364 "corr", 365 "covarPop", 366 "covarSamp", 367 "entropy", 368 "exponentialMovingAverage", 369 "intervalLengthSum", 370 "kolmogorovSmirnovTest", 371 "mannWhitneyUTest", 372 "median", 373 "rankCorr", 374 "sumKahan", 375 "studentTTest", 376 "welchTTest", 377 "anyHeavy", 378 "anyLast", 379 "boundingRatio", 380 "first_value", 381 "last_value", 382 "argMin", 383 "argMax", 384 "avgWeighted", 385 "topK", 386 "approx_top_sum", 387 "topKWeighted", 388 "deltaSum", 389 "deltaSumTimestamp", 390 "groupArray", 391 "groupArrayLast", 392 "groupUniqArray", 393 "groupArrayInsertAt", 394 "groupArrayMovingAvg", 395 "groupArrayMovingSum", 396 "groupArraySample", 397 "groupBitAnd", 398 "groupBitOr", 399 "groupBitXor", 400 "groupBitmap", 401 "groupBitmapAnd", 402 "groupBitmapOr", 403 "groupBitmapXor", 404 "sumWithOverflow", 405 "sumMap", 406 "minMap", 407 "maxMap", 408 "skewSamp", 409 "skewPop", 410 "kurtSamp", 411 "kurtPop", 412 "uniq", 413 "uniqExact", 414 "uniqCombined", 415 "uniqCombined64", 416 "uniqHLL12", 417 "uniqTheta", 418 "quantile", 419 "quantiles", 420 "quantileExact", 421 "quantilesExact", 422 "quantileExactLow", 423 "quantilesExactLow", 424 "quantileExactHigh", 425 "quantilesExactHigh", 426 "quantileExactWeighted", 427 "quantilesExactWeighted", 428 "quantileTiming", 429 "quantilesTiming", 430 "quantileTimingWeighted", 431 "quantilesTimingWeighted", 432 "quantileDeterministic", 433 "quantilesDeterministic", 434 "quantileTDigest", 435 "quantilesTDigest", 436 "quantileTDigestWeighted", 437 "quantilesTDigestWeighted", 438 "quantileBFloat16", 439 "quantilesBFloat16", 440 "quantileBFloat16Weighted", 441 "quantilesBFloat16Weighted", 442 "simpleLinearRegression", 443 "stochasticLinearRegression", 444 "stochasticLogisticRegression", 445 "categoricalInformationValue", 446 "contingency", 447 "cramersV", 448 "cramersVBiasCorrected", 449 "theilsU", 450 "maxIntersections", 451 "maxIntersectionsPosition", 452 "meanZTest", 453 "quantileInterpolatedWeighted", 454 "quantilesInterpolatedWeighted", 455 "quantileGK", 456 "quantilesGK", 457 "sparkBar", 458 "sumCount", 459 "largestTriangleThreeBuckets", 460 "histogram", 461 "sequenceMatch", 462 "sequenceCount", 463 "windowFunnel", 464 "retention", 465 "uniqUpTo", 466 "sequenceNextNode", 467 "exponentialTimeDecayedAvg", 468 } 469 470 AGG_FUNCTIONS_SUFFIXES = [ 471 "If", 472 "Array", 473 "ArrayIf", 474 "Map", 475 "SimpleState", 476 "State", 477 "Merge", 478 "MergeState", 479 "ForEach", 480 "Distinct", 481 "OrDefault", 482 "OrNull", 483 "Resample", 484 "ArgMin", 485 "ArgMax", 486 ] 487 488 FUNC_TOKENS = { 489 *parser.Parser.FUNC_TOKENS, 490 TokenType.AND, 491 TokenType.OR, 492 TokenType.SET, 493 } 494 495 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 496 497 ID_VAR_TOKENS = { 498 *parser.Parser.ID_VAR_TOKENS, 499 TokenType.LIKE, 500 } 501 502 AGG_FUNC_MAPPING = ( 503 lambda functions, suffixes: { 504 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 505 } 506 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 507 508 FUNCTION_PARSERS = { 509 **parser.Parser.FUNCTION_PARSERS, 510 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 511 "QUANTILE": lambda self: self._parse_quantile(), 512 "MEDIAN": lambda self: self._parse_quantile(), 513 "COLUMNS": lambda self: self._parse_columns(), 514 "TUPLE": lambda self: exp.Struct.from_arg_list(self._parse_function_args(alias=True)), 515 } 516 517 FUNCTION_PARSERS.pop("MATCH") 518 519 PROPERTY_PARSERS = { 520 **parser.Parser.PROPERTY_PARSERS, 521 "ENGINE": lambda self: self._parse_engine_property(), 522 } 523 PROPERTY_PARSERS.pop("DYNAMIC") 524 525 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 526 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 527 528 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 529 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 530 531 RANGE_PARSERS = { 532 **parser.Parser.RANGE_PARSERS, 533 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 534 } 535 536 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 537 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 538 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 539 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 540 541 JOIN_KINDS = { 542 *parser.Parser.JOIN_KINDS, 543 TokenType.ANY, 544 TokenType.ASOF, 545 TokenType.ARRAY, 546 } 547 548 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 549 TokenType.ANY, 550 TokenType.ARRAY, 551 TokenType.FINAL, 552 TokenType.FORMAT, 553 TokenType.SETTINGS, 554 } 555 556 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 557 TokenType.FORMAT, 558 } 559 560 LOG_DEFAULTS_TO_LN = True 561 562 QUERY_MODIFIER_PARSERS = { 563 **parser.Parser.QUERY_MODIFIER_PARSERS, 564 TokenType.SETTINGS: lambda self: ( 565 "settings", 566 self._advance() or self._parse_csv(self._parse_assignment), 567 ), 568 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 569 } 570 571 CONSTRAINT_PARSERS = { 572 **parser.Parser.CONSTRAINT_PARSERS, 573 "INDEX": lambda self: self._parse_index_constraint(), 574 "CODEC": lambda self: self._parse_compress(), 575 } 576 577 ALTER_PARSERS = { 578 **parser.Parser.ALTER_PARSERS, 579 "REPLACE": lambda self: self._parse_alter_table_replace(), 580 } 581 582 SCHEMA_UNNAMED_CONSTRAINTS = { 583 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 584 "INDEX", 585 } 586 587 PLACEHOLDER_PARSERS = { 588 **parser.Parser.PLACEHOLDER_PARSERS, 589 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 590 } 591 592 def _parse_engine_property(self) -> exp.EngineProperty: 593 self._match(TokenType.EQ) 594 return self.expression( 595 exp.EngineProperty, 596 this=self._parse_field(any_token=True, anonymous_func=True), 597 ) 598 599 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 600 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 601 return self._parse_lambda() 602 603 def _parse_types( 604 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 605 ) -> t.Optional[exp.Expression]: 606 dtype = super()._parse_types( 607 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 608 ) 609 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 610 # Mark every type as non-nullable which is ClickHouse's default, unless it's 611 # already marked as nullable. This marker helps us transpile types from other 612 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 613 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 614 # fail in ClickHouse without the `Nullable` type constructor. 615 dtype.set("nullable", False) 616 617 return dtype 618 619 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 620 index = self._index 621 this = self._parse_bitwise() 622 if self._match(TokenType.FROM): 623 self._retreat(index) 624 return super()._parse_extract() 625 626 # We return Anonymous here because extract and regexpExtract have different semantics, 627 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 628 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 629 # 630 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 631 self._match(TokenType.COMMA) 632 return self.expression( 633 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 634 ) 635 636 def _parse_assignment(self) -> t.Optional[exp.Expression]: 637 this = super()._parse_assignment() 638 639 if self._match(TokenType.PLACEHOLDER): 640 return self.expression( 641 exp.If, 642 this=this, 643 true=self._parse_assignment(), 644 false=self._match(TokenType.COLON) and self._parse_assignment(), 645 ) 646 647 return this 648 649 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 650 """ 651 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 652 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 653 """ 654 index = self._index 655 656 this = self._parse_id_var() 657 self._match(TokenType.COLON) 658 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 659 self._match_text_seq("IDENTIFIER") and "Identifier" 660 ) 661 662 if not kind: 663 self._retreat(index) 664 return None 665 elif not self._match(TokenType.R_BRACE): 666 self.raise_error("Expecting }") 667 668 if isinstance(this, exp.Identifier) and not this.quoted: 669 this = exp.var(this.name) 670 671 return self.expression(exp.Placeholder, this=this, kind=kind) 672 673 def _parse_bracket( 674 self, this: t.Optional[exp.Expression] = None 675 ) -> t.Optional[exp.Expression]: 676 l_brace = self._match(TokenType.L_BRACE, advance=False) 677 bracket = super()._parse_bracket(this) 678 679 if l_brace and isinstance(bracket, exp.Struct): 680 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 681 for expression in bracket.expressions: 682 if not isinstance(expression, exp.PropertyEQ): 683 break 684 685 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 686 varmap.args["values"].append("expressions", expression.expression) 687 688 return varmap 689 690 return bracket 691 692 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 693 this = super()._parse_in(this) 694 this.set("is_global", is_global) 695 return this 696 697 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 698 is_negated = self._match(TokenType.NOT) 699 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 700 return self.expression(exp.Not, this=this) if is_negated else this 701 702 def _parse_table( 703 self, 704 schema: bool = False, 705 joins: bool = False, 706 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 707 parse_bracket: bool = False, 708 is_db_reference: bool = False, 709 parse_partition: bool = False, 710 consume_pipe: bool = False, 711 ) -> t.Optional[exp.Expression]: 712 this = super()._parse_table( 713 schema=schema, 714 joins=joins, 715 alias_tokens=alias_tokens, 716 parse_bracket=parse_bracket, 717 is_db_reference=is_db_reference, 718 ) 719 720 if isinstance(this, exp.Table): 721 inner = this.this 722 alias = this.args.get("alias") 723 724 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 725 alias.set("columns", [exp.to_identifier("generate_series")]) 726 727 if self._match(TokenType.FINAL): 728 this = self.expression(exp.Final, this=this) 729 730 return this 731 732 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 733 return super()._parse_position(haystack_first=True) 734 735 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 736 def _parse_cte(self) -> t.Optional[exp.CTE]: 737 # WITH <identifier> AS <subquery expression> 738 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 739 740 if not cte: 741 # WITH <expression> AS <identifier> 742 cte = self.expression( 743 exp.CTE, 744 this=self._parse_assignment(), 745 alias=self._parse_table_alias(), 746 scalar=True, 747 ) 748 749 return cte 750 751 def _parse_join_parts( 752 self, 753 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 754 is_global = self._match(TokenType.GLOBAL) and self._prev 755 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 756 757 if kind_pre: 758 kind = self._match_set(self.JOIN_KINDS) and self._prev 759 side = self._match_set(self.JOIN_SIDES) and self._prev 760 return is_global, side, kind 761 762 return ( 763 is_global, 764 self._match_set(self.JOIN_SIDES) and self._prev, 765 self._match_set(self.JOIN_KINDS) and self._prev, 766 ) 767 768 def _parse_join( 769 self, skip_join_token: bool = False, parse_bracket: bool = False 770 ) -> t.Optional[exp.Join]: 771 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 772 if join: 773 join.set("global", join.args.pop("method", None)) 774 775 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 776 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 777 if join.kind == "ARRAY": 778 for table in join.find_all(exp.Table): 779 table.replace(table.to_column()) 780 781 return join 782 783 def _parse_function( 784 self, 785 functions: t.Optional[t.Dict[str, t.Callable]] = None, 786 anonymous: bool = False, 787 optional_parens: bool = True, 788 any_token: bool = False, 789 ) -> t.Optional[exp.Expression]: 790 expr = super()._parse_function( 791 functions=functions, 792 anonymous=anonymous, 793 optional_parens=optional_parens, 794 any_token=any_token, 795 ) 796 797 func = expr.this if isinstance(expr, exp.Window) else expr 798 799 # Aggregate functions can be split in 2 parts: <func_name><suffix> 800 parts = ( 801 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 802 ) 803 804 if parts: 805 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 806 params = self._parse_func_params(anon_func) 807 808 kwargs = { 809 "this": anon_func.this, 810 "expressions": anon_func.expressions, 811 } 812 if parts[1]: 813 exp_class: t.Type[exp.Expression] = ( 814 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 815 ) 816 else: 817 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 818 819 kwargs["exp_class"] = exp_class 820 if params: 821 kwargs["params"] = params 822 823 func = self.expression(**kwargs) 824 825 if isinstance(expr, exp.Window): 826 # The window's func was parsed as Anonymous in base parser, fix its 827 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 828 expr.set("this", func) 829 elif params: 830 # Params have blocked super()._parse_function() from parsing the following window 831 # (if that exists) as they're standing between the function call and the window spec 832 expr = self._parse_window(func) 833 else: 834 expr = func 835 836 return expr 837 838 def _parse_func_params( 839 self, this: t.Optional[exp.Func] = None 840 ) -> t.Optional[t.List[exp.Expression]]: 841 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 842 return self._parse_csv(self._parse_lambda) 843 844 if self._match(TokenType.L_PAREN): 845 params = self._parse_csv(self._parse_lambda) 846 self._match_r_paren(this) 847 return params 848 849 return None 850 851 def _parse_quantile(self) -> exp.Quantile: 852 this = self._parse_lambda() 853 params = self._parse_func_params() 854 if params: 855 return self.expression(exp.Quantile, this=params[0], quantile=this) 856 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 857 858 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 859 return super()._parse_wrapped_id_vars(optional=True) 860 861 def _parse_primary_key( 862 self, wrapped_optional: bool = False, in_props: bool = False 863 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 864 return super()._parse_primary_key( 865 wrapped_optional=wrapped_optional or in_props, in_props=in_props 866 ) 867 868 def _parse_on_property(self) -> t.Optional[exp.Expression]: 869 index = self._index 870 if self._match_text_seq("CLUSTER"): 871 this = self._parse_string() or self._parse_id_var() 872 if this: 873 return self.expression(exp.OnCluster, this=this) 874 else: 875 self._retreat(index) 876 return None 877 878 def _parse_index_constraint( 879 self, kind: t.Optional[str] = None 880 ) -> exp.IndexColumnConstraint: 881 # INDEX name1 expr TYPE type1(args) GRANULARITY value 882 this = self._parse_id_var() 883 expression = self._parse_assignment() 884 885 index_type = self._match_text_seq("TYPE") and ( 886 self._parse_function() or self._parse_var() 887 ) 888 889 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 890 891 return self.expression( 892 exp.IndexColumnConstraint, 893 this=this, 894 expression=expression, 895 index_type=index_type, 896 granularity=granularity, 897 ) 898 899 def _parse_partition(self) -> t.Optional[exp.Partition]: 900 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 901 if not self._match(TokenType.PARTITION): 902 return None 903 904 if self._match_text_seq("ID"): 905 # Corresponds to the PARTITION ID <string_value> syntax 906 expressions: t.List[exp.Expression] = [ 907 self.expression(exp.PartitionId, this=self._parse_string()) 908 ] 909 else: 910 expressions = self._parse_expressions() 911 912 return self.expression(exp.Partition, expressions=expressions) 913 914 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 915 partition = self._parse_partition() 916 917 if not partition or not self._match(TokenType.FROM): 918 return None 919 920 return self.expression( 921 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 922 ) 923 924 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 925 if not self._match_text_seq("PROJECTION"): 926 return None 927 928 return self.expression( 929 exp.ProjectionDef, 930 this=self._parse_id_var(), 931 expression=self._parse_wrapped(self._parse_statement), 932 ) 933 934 def _parse_constraint(self) -> t.Optional[exp.Expression]: 935 return super()._parse_constraint() or self._parse_projection_def() 936 937 def _parse_alias( 938 self, this: t.Optional[exp.Expression], explicit: bool = False 939 ) -> t.Optional[exp.Expression]: 940 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 941 # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias 942 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 943 return this 944 945 return super()._parse_alias(this=this, explicit=explicit) 946 947 def _parse_expression(self) -> t.Optional[exp.Expression]: 948 this = super()._parse_expression() 949 950 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 951 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 952 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 953 self._match(TokenType.R_PAREN) 954 955 return this 956 957 def _parse_columns(self) -> exp.Expression: 958 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 959 960 while self._next and self._match_text_seq(")", "APPLY", "("): 961 self._match(TokenType.R_PAREN) 962 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 963 return this 964 965 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 966 value = super()._parse_value(values=values) 967 if not value: 968 return None 969 970 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 971 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 972 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 973 # but the final result is not altered by the extra parentheses. 974 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 975 expressions = value.expressions 976 if values and not isinstance(expressions[-1], exp.Tuple): 977 value.set( 978 "expressions", 979 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 980 ) 981 982 return value 983 984 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 985 # ClickHouse allows custom expressions as partition key 986 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 987 return self.expression( 988 exp.PartitionedByProperty, 989 this=self._parse_assignment(), 990 ) 991 992 class Generator(generator.Generator): 993 QUERY_HINTS = False 994 STRUCT_DELIMITER = ("(", ")") 995 NVL2_SUPPORTED = False 996 TABLESAMPLE_REQUIRES_PARENS = False 997 TABLESAMPLE_SIZE_IS_ROWS = False 998 TABLESAMPLE_KEYWORDS = "SAMPLE" 999 LAST_DAY_SUPPORTS_DATE_PART = False 1000 CAN_IMPLEMENT_ARRAY_ANY = True 1001 SUPPORTS_TO_NUMBER = False 1002 JOIN_HINTS = False 1003 TABLE_HINTS = False 1004 GROUPINGS_SEP = "" 1005 SET_OP_MODIFIERS = False 1006 ARRAY_SIZE_NAME = "LENGTH" 1007 WRAP_DERIVED_VALUES = False 1008 1009 STRING_TYPE_MAPPING = { 1010 exp.DataType.Type.BLOB: "String", 1011 exp.DataType.Type.CHAR: "String", 1012 exp.DataType.Type.LONGBLOB: "String", 1013 exp.DataType.Type.LONGTEXT: "String", 1014 exp.DataType.Type.MEDIUMBLOB: "String", 1015 exp.DataType.Type.MEDIUMTEXT: "String", 1016 exp.DataType.Type.TINYBLOB: "String", 1017 exp.DataType.Type.TINYTEXT: "String", 1018 exp.DataType.Type.TEXT: "String", 1019 exp.DataType.Type.VARBINARY: "String", 1020 exp.DataType.Type.VARCHAR: "String", 1021 } 1022 1023 SUPPORTED_JSON_PATH_PARTS = { 1024 exp.JSONPathKey, 1025 exp.JSONPathRoot, 1026 exp.JSONPathSubscript, 1027 } 1028 1029 TYPE_MAPPING = { 1030 **generator.Generator.TYPE_MAPPING, 1031 **STRING_TYPE_MAPPING, 1032 exp.DataType.Type.ARRAY: "Array", 1033 exp.DataType.Type.BOOLEAN: "Bool", 1034 exp.DataType.Type.BIGINT: "Int64", 1035 exp.DataType.Type.DATE32: "Date32", 1036 exp.DataType.Type.DATETIME: "DateTime", 1037 exp.DataType.Type.DATETIME2: "DateTime", 1038 exp.DataType.Type.SMALLDATETIME: "DateTime", 1039 exp.DataType.Type.DATETIME64: "DateTime64", 1040 exp.DataType.Type.DECIMAL: "Decimal", 1041 exp.DataType.Type.DECIMAL32: "Decimal32", 1042 exp.DataType.Type.DECIMAL64: "Decimal64", 1043 exp.DataType.Type.DECIMAL128: "Decimal128", 1044 exp.DataType.Type.DECIMAL256: "Decimal256", 1045 exp.DataType.Type.TIMESTAMP: "DateTime", 1046 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1047 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1048 exp.DataType.Type.DOUBLE: "Float64", 1049 exp.DataType.Type.ENUM: "Enum", 1050 exp.DataType.Type.ENUM8: "Enum8", 1051 exp.DataType.Type.ENUM16: "Enum16", 1052 exp.DataType.Type.FIXEDSTRING: "FixedString", 1053 exp.DataType.Type.FLOAT: "Float32", 1054 exp.DataType.Type.INT: "Int32", 1055 exp.DataType.Type.MEDIUMINT: "Int32", 1056 exp.DataType.Type.INT128: "Int128", 1057 exp.DataType.Type.INT256: "Int256", 1058 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1059 exp.DataType.Type.MAP: "Map", 1060 exp.DataType.Type.NESTED: "Nested", 1061 exp.DataType.Type.NOTHING: "Nothing", 1062 exp.DataType.Type.SMALLINT: "Int16", 1063 exp.DataType.Type.STRUCT: "Tuple", 1064 exp.DataType.Type.TINYINT: "Int8", 1065 exp.DataType.Type.UBIGINT: "UInt64", 1066 exp.DataType.Type.UINT: "UInt32", 1067 exp.DataType.Type.UINT128: "UInt128", 1068 exp.DataType.Type.UINT256: "UInt256", 1069 exp.DataType.Type.USMALLINT: "UInt16", 1070 exp.DataType.Type.UTINYINT: "UInt8", 1071 exp.DataType.Type.IPV4: "IPv4", 1072 exp.DataType.Type.IPV6: "IPv6", 1073 exp.DataType.Type.POINT: "Point", 1074 exp.DataType.Type.RING: "Ring", 1075 exp.DataType.Type.LINESTRING: "LineString", 1076 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1077 exp.DataType.Type.POLYGON: "Polygon", 1078 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1079 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1080 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1081 exp.DataType.Type.DYNAMIC: "Dynamic", 1082 } 1083 1084 TRANSFORMS = { 1085 **generator.Generator.TRANSFORMS, 1086 exp.AnyValue: rename_func("any"), 1087 exp.ApproxDistinct: rename_func("uniq"), 1088 exp.ArrayConcat: rename_func("arrayConcat"), 1089 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1090 exp.ArrayRemove: remove_from_array_using_filter, 1091 exp.ArrayReverse: rename_func("arrayReverse"), 1092 exp.ArraySlice: rename_func("arraySlice"), 1093 exp.ArraySum: rename_func("arraySum"), 1094 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1095 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1096 exp.Array: inline_array_sql, 1097 exp.CastToStrType: rename_func("CAST"), 1098 exp.CountIf: rename_func("countIf"), 1099 exp.CosineDistance: rename_func("cosineDistance"), 1100 exp.CompressColumnConstraint: lambda self, 1101 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1102 exp.ComputedColumnConstraint: lambda self, 1103 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1104 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1105 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1106 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1107 exp.DateStrToDate: rename_func("toDate"), 1108 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1109 exp.Explode: rename_func("arrayJoin"), 1110 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1111 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1112 exp.IsNan: rename_func("isNaN"), 1113 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1114 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1115 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1116 exp.JSONPathKey: json_path_key_only_name, 1117 exp.JSONPathRoot: lambda *_: "", 1118 exp.Length: length_or_char_length_sql, 1119 exp.Map: _map_sql, 1120 exp.Median: rename_func("median"), 1121 exp.Nullif: rename_func("nullIf"), 1122 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1123 exp.Pivot: no_pivot_sql, 1124 exp.Quantile: _quantile_sql, 1125 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1126 exp.Rand: rename_func("randCanonical"), 1127 exp.StartsWith: rename_func("startsWith"), 1128 exp.Struct: rename_func("tuple"), 1129 exp.EndsWith: rename_func("endsWith"), 1130 exp.EuclideanDistance: rename_func("L2Distance"), 1131 exp.StrPosition: lambda self, e: strposition_sql( 1132 self, 1133 e, 1134 func_name="POSITION", 1135 supports_position=True, 1136 use_ansi_position=False, 1137 ), 1138 exp.TimeToStr: lambda self, e: self.func( 1139 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1140 ), 1141 exp.TimeStrToTime: _timestrtotime_sql, 1142 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1143 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1144 exp.Typeof: rename_func("toTypeName"), 1145 exp.VarMap: _map_sql, 1146 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1147 exp.MD5Digest: rename_func("MD5"), 1148 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1149 exp.SHA: rename_func("SHA1"), 1150 exp.SHA2: sha256_sql, 1151 exp.UnixToTime: _unix_to_time_sql, 1152 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1153 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1154 exp.Variance: rename_func("varSamp"), 1155 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1156 exp.Stddev: rename_func("stddevSamp"), 1157 exp.Chr: rename_func("CHAR"), 1158 exp.Lag: lambda self, e: self.func( 1159 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1160 ), 1161 exp.Lead: lambda self, e: self.func( 1162 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1163 ), 1164 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1165 rename_func("editDistance") 1166 ), 1167 exp.ParseDatetime: rename_func("parseDateTime"), 1168 } 1169 1170 PROPERTIES_LOCATION = { 1171 **generator.Generator.PROPERTIES_LOCATION, 1172 exp.OnCluster: exp.Properties.Location.POST_NAME, 1173 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1174 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1175 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1176 } 1177 1178 # There's no list in docs, but it can be found in Clickhouse code 1179 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1180 ON_CLUSTER_TARGETS = { 1181 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1182 "DATABASE", 1183 "TABLE", 1184 "VIEW", 1185 "DICTIONARY", 1186 "INDEX", 1187 "FUNCTION", 1188 "NAMED COLLECTION", 1189 } 1190 1191 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1192 NON_NULLABLE_TYPES = { 1193 exp.DataType.Type.ARRAY, 1194 exp.DataType.Type.MAP, 1195 exp.DataType.Type.STRUCT, 1196 exp.DataType.Type.POINT, 1197 exp.DataType.Type.RING, 1198 exp.DataType.Type.LINESTRING, 1199 exp.DataType.Type.MULTILINESTRING, 1200 exp.DataType.Type.POLYGON, 1201 exp.DataType.Type.MULTIPOLYGON, 1202 } 1203 1204 def offset_sql(self, expression: exp.Offset) -> str: 1205 offset = super().offset_sql(expression) 1206 1207 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1208 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1209 parent = expression.parent 1210 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1211 offset = f"{offset} ROWS" 1212 1213 return offset 1214 1215 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1216 strtodate_sql = self.function_fallback_sql(expression) 1217 1218 if not isinstance(expression.parent, exp.Cast): 1219 # StrToDate returns DATEs in other dialects (eg. postgres), so 1220 # this branch aims to improve the transpilation to clickhouse 1221 return self.cast_sql(exp.cast(expression, "DATE")) 1222 1223 return strtodate_sql 1224 1225 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1226 this = expression.this 1227 1228 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1229 return self.sql(this) 1230 1231 return super().cast_sql(expression, safe_prefix=safe_prefix) 1232 1233 def trycast_sql(self, expression: exp.TryCast) -> str: 1234 dtype = expression.to 1235 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1236 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1237 dtype.set("nullable", True) 1238 1239 return super().cast_sql(expression) 1240 1241 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1242 this = self.json_path_part(expression.this) 1243 return str(int(this) + 1) if is_int(this) else this 1244 1245 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1246 return f"AS {self.sql(expression, 'this')}" 1247 1248 def _any_to_has( 1249 self, 1250 expression: exp.EQ | exp.NEQ, 1251 default: t.Callable[[t.Any], str], 1252 prefix: str = "", 1253 ) -> str: 1254 if isinstance(expression.left, exp.Any): 1255 arr = expression.left 1256 this = expression.right 1257 elif isinstance(expression.right, exp.Any): 1258 arr = expression.right 1259 this = expression.left 1260 else: 1261 return default(expression) 1262 1263 return prefix + self.func("has", arr.this.unnest(), this) 1264 1265 def eq_sql(self, expression: exp.EQ) -> str: 1266 return self._any_to_has(expression, super().eq_sql) 1267 1268 def neq_sql(self, expression: exp.NEQ) -> str: 1269 return self._any_to_has(expression, super().neq_sql, "NOT ") 1270 1271 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1272 # Manually add a flag to make the search case-insensitive 1273 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1274 return self.func("match", expression.this, regex) 1275 1276 def datatype_sql(self, expression: exp.DataType) -> str: 1277 # String is the standard ClickHouse type, every other variant is just an alias. 1278 # Additionally, any supplied length parameter will be ignored. 1279 # 1280 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1281 if expression.this in self.STRING_TYPE_MAPPING: 1282 dtype = "String" 1283 else: 1284 dtype = super().datatype_sql(expression) 1285 1286 # This section changes the type to `Nullable(...)` if the following conditions hold: 1287 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1288 # and change their semantics 1289 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1290 # constraint: "Type of Map key must be a type, that can be represented by integer or 1291 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1292 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1293 parent = expression.parent 1294 nullable = expression.args.get("nullable") 1295 if nullable is True or ( 1296 nullable is None 1297 and not ( 1298 isinstance(parent, exp.DataType) 1299 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1300 and expression.index in (None, 0) 1301 ) 1302 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1303 ): 1304 dtype = f"Nullable({dtype})" 1305 1306 return dtype 1307 1308 def cte_sql(self, expression: exp.CTE) -> str: 1309 if expression.args.get("scalar"): 1310 this = self.sql(expression, "this") 1311 alias = self.sql(expression, "alias") 1312 return f"{this} AS {alias}" 1313 1314 return super().cte_sql(expression) 1315 1316 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1317 return super().after_limit_modifiers(expression) + [ 1318 ( 1319 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1320 if expression.args.get("settings") 1321 else "" 1322 ), 1323 ( 1324 self.seg("FORMAT ") + self.sql(expression, "format") 1325 if expression.args.get("format") 1326 else "" 1327 ), 1328 ] 1329 1330 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1331 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1332 1333 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1334 return f"ON CLUSTER {self.sql(expression, 'this')}" 1335 1336 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1337 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1338 exp.Properties.Location.POST_NAME 1339 ): 1340 this_name = self.sql( 1341 expression.this if isinstance(expression.this, exp.Schema) else expression, 1342 "this", 1343 ) 1344 this_properties = " ".join( 1345 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1346 ) 1347 this_schema = self.schema_columns_sql(expression.this) 1348 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1349 1350 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1351 1352 return super().createable_sql(expression, locations) 1353 1354 def create_sql(self, expression: exp.Create) -> str: 1355 # The comment property comes last in CTAS statements, i.e. after the query 1356 query = expression.expression 1357 if isinstance(query, exp.Query): 1358 comment_prop = expression.find(exp.SchemaCommentProperty) 1359 if comment_prop: 1360 comment_prop.pop() 1361 query.replace(exp.paren(query)) 1362 else: 1363 comment_prop = None 1364 1365 create_sql = super().create_sql(expression) 1366 1367 comment_sql = self.sql(comment_prop) 1368 comment_sql = f" {comment_sql}" if comment_sql else "" 1369 1370 return f"{create_sql}{comment_sql}" 1371 1372 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1373 this = self.indent(self.sql(expression, "this")) 1374 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1375 1376 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1377 this = self.sql(expression, "this") 1378 this = f" {this}" if this else "" 1379 expr = self.sql(expression, "expression") 1380 expr = f" {expr}" if expr else "" 1381 index_type = self.sql(expression, "index_type") 1382 index_type = f" TYPE {index_type}" if index_type else "" 1383 granularity = self.sql(expression, "granularity") 1384 granularity = f" GRANULARITY {granularity}" if granularity else "" 1385 1386 return f"INDEX{this}{expr}{index_type}{granularity}" 1387 1388 def partition_sql(self, expression: exp.Partition) -> str: 1389 return f"PARTITION {self.expressions(expression, flat=True)}" 1390 1391 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1392 return f"ID {self.sql(expression.this)}" 1393 1394 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1395 return ( 1396 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1397 ) 1398 1399 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1400 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1401 1402 def is_sql(self, expression: exp.Is) -> str: 1403 is_sql = super().is_sql(expression) 1404 1405 if isinstance(expression.parent, exp.Not): 1406 # value IS NOT NULL -> NOT (value IS NULL) 1407 is_sql = self.wrap(is_sql) 1408 1409 return is_sql 1410 1411 def in_sql(self, expression: exp.In) -> str: 1412 in_sql = super().in_sql(expression) 1413 1414 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1415 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1416 1417 return in_sql 1418 1419 def not_sql(self, expression: exp.Not) -> str: 1420 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1421 # let `GLOBAL IN` child interpose `NOT` 1422 return self.sql(expression, "this") 1423 1424 return super().not_sql(expression) 1425 1426 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1427 # If the VALUES clause contains tuples of expressions, we need to treat it 1428 # as a table since Clickhouse will automatically alias it as such. 1429 alias = expression.args.get("alias") 1430 1431 if alias and alias.args.get("columns") and expression.expressions: 1432 values = expression.expressions[0].expressions 1433 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1434 else: 1435 values_as_table = True 1436 1437 return super().values_sql(expression, values_as_table=values_as_table)
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Default NULL
ordering method to use if not explicitly set.
Possible values: "nulls_are_small"
, "nulls_are_large"
, "nulls_are_last"
Whether the base comes first in the LOG
function.
Possible values: True
, False
, None
(two arguments are not supported by LOG
)
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS ( SELECT 1 AS id, 2 AS my_id ) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id, HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except: - BigQuery, which will forward the alias to GROUP BY + HAVING clauses i.e it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1" - Clickhouse, which will forward the alias across the query i.e it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether the name of the function should be preserved inside the node's metadata, can be useful for roundtripping deprecated vs new functions that share an AST node e.g JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery
Whether number literals can include underscores for better readability
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Specifies the strategy according to which identifiers should be normalized.
Mapping of an escaped sequence (\n
) to its unescaped version (
).
Helper for dialects that use a different name for the same creatable kind. For example, the Clickhouse equivalent of CREATE SCHEMA is CREATE DATABASE.
Whether a set operation uses DISTINCT by default. This is None
when either DISTINCT
or ALL
must be explicitly specified.
220 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 221 # Clickhouse allows VALUES to have an embedded structure e.g: 222 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 223 # In this case, we don't want to qualify the columns 224 values = expression.expressions[0].expressions 225 226 structure = ( 227 values[0] 228 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 229 else None 230 ) 231 if structure: 232 # Split each column definition into the column name e.g: 233 # 'person String, place String' -> ['person', 'place'] 234 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 235 column_aliases = [ 236 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 237 ] 238 else: 239 # Default column aliases in CH are "c1", "c2", etc. 240 column_aliases = [ 241 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 242 ] 243 244 return column_aliases
246 class Tokenizer(tokens.Tokenizer): 247 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 248 IDENTIFIERS = ['"', "`"] 249 IDENTIFIER_ESCAPES = ["\\"] 250 STRING_ESCAPES = ["'", "\\"] 251 BIT_STRINGS = [("0b", "")] 252 HEX_STRINGS = [("0x", ""), ("0X", "")] 253 HEREDOC_STRINGS = ["$"] 254 255 KEYWORDS = { 256 **tokens.Tokenizer.KEYWORDS, 257 ".:": TokenType.DOTCOLON, 258 "ATTACH": TokenType.COMMAND, 259 "DATE32": TokenType.DATE32, 260 "DATETIME64": TokenType.DATETIME64, 261 "DICTIONARY": TokenType.DICTIONARY, 262 "DYNAMIC": TokenType.DYNAMIC, 263 "ENUM8": TokenType.ENUM8, 264 "ENUM16": TokenType.ENUM16, 265 "EXCHANGE": TokenType.COMMAND, 266 "FINAL": TokenType.FINAL, 267 "FIXEDSTRING": TokenType.FIXEDSTRING, 268 "FLOAT32": TokenType.FLOAT, 269 "FLOAT64": TokenType.DOUBLE, 270 "GLOBAL": TokenType.GLOBAL, 271 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 272 "MAP": TokenType.MAP, 273 "NESTED": TokenType.NESTED, 274 "NOTHING": TokenType.NOTHING, 275 "SAMPLE": TokenType.TABLE_SAMPLE, 276 "TUPLE": TokenType.STRUCT, 277 "UINT16": TokenType.USMALLINT, 278 "UINT32": TokenType.UINT, 279 "UINT64": TokenType.UBIGINT, 280 "UINT8": TokenType.UTINYINT, 281 "IPV4": TokenType.IPV4, 282 "IPV6": TokenType.IPV6, 283 "POINT": TokenType.POINT, 284 "RING": TokenType.RING, 285 "LINESTRING": TokenType.LINESTRING, 286 "MULTILINESTRING": TokenType.MULTILINESTRING, 287 "POLYGON": TokenType.POLYGON, 288 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 289 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 290 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 291 "SYSTEM": TokenType.COMMAND, 292 "PREWHERE": TokenType.PREWHERE, 293 } 294 KEYWORDS.pop("/*+") 295 296 SINGLE_TOKENS = { 297 **tokens.Tokenizer.SINGLE_TOKENS, 298 "$": TokenType.HEREDOC_STRING, 299 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BYTE_STRINGS
- RAW_STRINGS
- UNICODE_STRINGS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
301 class Parser(parser.Parser): 302 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 303 # * select x from t1 union all select x from t2 limit 1; 304 # * select x from t1 union all (select x from t2 limit 1); 305 MODIFIERS_ATTACHED_TO_SET_OP = False 306 INTERVAL_SPANS = False 307 OPTIONAL_ALIAS_TOKEN_CTE = False 308 JOINS_HAVE_EQUAL_PRECEDENCE = True 309 310 FUNCTIONS = { 311 **parser.Parser.FUNCTIONS, 312 "ANY": exp.AnyValue.from_arg_list, 313 "ARRAYSUM": exp.ArraySum.from_arg_list, 314 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 315 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 316 "COUNTIF": _build_count_if, 317 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 318 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 319 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 320 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 321 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 322 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 323 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 324 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 325 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 326 "JSONEXTRACTSTRING": build_json_extract_path( 327 exp.JSONExtractScalar, zero_based_indexing=False 328 ), 329 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 330 "L2Distance": exp.EuclideanDistance.from_arg_list, 331 "MAP": parser.build_var_map, 332 "MATCH": exp.RegexpLike.from_arg_list, 333 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 334 "RANDCANONICAL": exp.Rand.from_arg_list, 335 "STR_TO_DATE": _build_str_to_date, 336 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 337 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 338 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 339 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 340 "UNIQ": exp.ApproxDistinct.from_arg_list, 341 "XOR": lambda args: exp.Xor(expressions=args), 342 "MD5": exp.MD5Digest.from_arg_list, 343 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 344 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 345 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 346 "TOTYPENAME": exp.Typeof.from_arg_list, 347 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 348 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 349 } 350 FUNCTIONS.pop("TRANSFORM") 351 FUNCTIONS.pop("APPROX_TOP_SUM") 352 353 AGG_FUNCTIONS = { 354 "count", 355 "min", 356 "max", 357 "sum", 358 "avg", 359 "any", 360 "stddevPop", 361 "stddevSamp", 362 "varPop", 363 "varSamp", 364 "corr", 365 "covarPop", 366 "covarSamp", 367 "entropy", 368 "exponentialMovingAverage", 369 "intervalLengthSum", 370 "kolmogorovSmirnovTest", 371 "mannWhitneyUTest", 372 "median", 373 "rankCorr", 374 "sumKahan", 375 "studentTTest", 376 "welchTTest", 377 "anyHeavy", 378 "anyLast", 379 "boundingRatio", 380 "first_value", 381 "last_value", 382 "argMin", 383 "argMax", 384 "avgWeighted", 385 "topK", 386 "approx_top_sum", 387 "topKWeighted", 388 "deltaSum", 389 "deltaSumTimestamp", 390 "groupArray", 391 "groupArrayLast", 392 "groupUniqArray", 393 "groupArrayInsertAt", 394 "groupArrayMovingAvg", 395 "groupArrayMovingSum", 396 "groupArraySample", 397 "groupBitAnd", 398 "groupBitOr", 399 "groupBitXor", 400 "groupBitmap", 401 "groupBitmapAnd", 402 "groupBitmapOr", 403 "groupBitmapXor", 404 "sumWithOverflow", 405 "sumMap", 406 "minMap", 407 "maxMap", 408 "skewSamp", 409 "skewPop", 410 "kurtSamp", 411 "kurtPop", 412 "uniq", 413 "uniqExact", 414 "uniqCombined", 415 "uniqCombined64", 416 "uniqHLL12", 417 "uniqTheta", 418 "quantile", 419 "quantiles", 420 "quantileExact", 421 "quantilesExact", 422 "quantileExactLow", 423 "quantilesExactLow", 424 "quantileExactHigh", 425 "quantilesExactHigh", 426 "quantileExactWeighted", 427 "quantilesExactWeighted", 428 "quantileTiming", 429 "quantilesTiming", 430 "quantileTimingWeighted", 431 "quantilesTimingWeighted", 432 "quantileDeterministic", 433 "quantilesDeterministic", 434 "quantileTDigest", 435 "quantilesTDigest", 436 "quantileTDigestWeighted", 437 "quantilesTDigestWeighted", 438 "quantileBFloat16", 439 "quantilesBFloat16", 440 "quantileBFloat16Weighted", 441 "quantilesBFloat16Weighted", 442 "simpleLinearRegression", 443 "stochasticLinearRegression", 444 "stochasticLogisticRegression", 445 "categoricalInformationValue", 446 "contingency", 447 "cramersV", 448 "cramersVBiasCorrected", 449 "theilsU", 450 "maxIntersections", 451 "maxIntersectionsPosition", 452 "meanZTest", 453 "quantileInterpolatedWeighted", 454 "quantilesInterpolatedWeighted", 455 "quantileGK", 456 "quantilesGK", 457 "sparkBar", 458 "sumCount", 459 "largestTriangleThreeBuckets", 460 "histogram", 461 "sequenceMatch", 462 "sequenceCount", 463 "windowFunnel", 464 "retention", 465 "uniqUpTo", 466 "sequenceNextNode", 467 "exponentialTimeDecayedAvg", 468 } 469 470 AGG_FUNCTIONS_SUFFIXES = [ 471 "If", 472 "Array", 473 "ArrayIf", 474 "Map", 475 "SimpleState", 476 "State", 477 "Merge", 478 "MergeState", 479 "ForEach", 480 "Distinct", 481 "OrDefault", 482 "OrNull", 483 "Resample", 484 "ArgMin", 485 "ArgMax", 486 ] 487 488 FUNC_TOKENS = { 489 *parser.Parser.FUNC_TOKENS, 490 TokenType.AND, 491 TokenType.OR, 492 TokenType.SET, 493 } 494 495 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 496 497 ID_VAR_TOKENS = { 498 *parser.Parser.ID_VAR_TOKENS, 499 TokenType.LIKE, 500 } 501 502 AGG_FUNC_MAPPING = ( 503 lambda functions, suffixes: { 504 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 505 } 506 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 507 508 FUNCTION_PARSERS = { 509 **parser.Parser.FUNCTION_PARSERS, 510 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 511 "QUANTILE": lambda self: self._parse_quantile(), 512 "MEDIAN": lambda self: self._parse_quantile(), 513 "COLUMNS": lambda self: self._parse_columns(), 514 "TUPLE": lambda self: exp.Struct.from_arg_list(self._parse_function_args(alias=True)), 515 } 516 517 FUNCTION_PARSERS.pop("MATCH") 518 519 PROPERTY_PARSERS = { 520 **parser.Parser.PROPERTY_PARSERS, 521 "ENGINE": lambda self: self._parse_engine_property(), 522 } 523 PROPERTY_PARSERS.pop("DYNAMIC") 524 525 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 526 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 527 528 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 529 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 530 531 RANGE_PARSERS = { 532 **parser.Parser.RANGE_PARSERS, 533 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 534 } 535 536 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 537 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 538 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 539 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 540 541 JOIN_KINDS = { 542 *parser.Parser.JOIN_KINDS, 543 TokenType.ANY, 544 TokenType.ASOF, 545 TokenType.ARRAY, 546 } 547 548 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 549 TokenType.ANY, 550 TokenType.ARRAY, 551 TokenType.FINAL, 552 TokenType.FORMAT, 553 TokenType.SETTINGS, 554 } 555 556 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 557 TokenType.FORMAT, 558 } 559 560 LOG_DEFAULTS_TO_LN = True 561 562 QUERY_MODIFIER_PARSERS = { 563 **parser.Parser.QUERY_MODIFIER_PARSERS, 564 TokenType.SETTINGS: lambda self: ( 565 "settings", 566 self._advance() or self._parse_csv(self._parse_assignment), 567 ), 568 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 569 } 570 571 CONSTRAINT_PARSERS = { 572 **parser.Parser.CONSTRAINT_PARSERS, 573 "INDEX": lambda self: self._parse_index_constraint(), 574 "CODEC": lambda self: self._parse_compress(), 575 } 576 577 ALTER_PARSERS = { 578 **parser.Parser.ALTER_PARSERS, 579 "REPLACE": lambda self: self._parse_alter_table_replace(), 580 } 581 582 SCHEMA_UNNAMED_CONSTRAINTS = { 583 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 584 "INDEX", 585 } 586 587 PLACEHOLDER_PARSERS = { 588 **parser.Parser.PLACEHOLDER_PARSERS, 589 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 590 } 591 592 def _parse_engine_property(self) -> exp.EngineProperty: 593 self._match(TokenType.EQ) 594 return self.expression( 595 exp.EngineProperty, 596 this=self._parse_field(any_token=True, anonymous_func=True), 597 ) 598 599 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 600 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 601 return self._parse_lambda() 602 603 def _parse_types( 604 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 605 ) -> t.Optional[exp.Expression]: 606 dtype = super()._parse_types( 607 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 608 ) 609 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 610 # Mark every type as non-nullable which is ClickHouse's default, unless it's 611 # already marked as nullable. This marker helps us transpile types from other 612 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 613 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 614 # fail in ClickHouse without the `Nullable` type constructor. 615 dtype.set("nullable", False) 616 617 return dtype 618 619 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 620 index = self._index 621 this = self._parse_bitwise() 622 if self._match(TokenType.FROM): 623 self._retreat(index) 624 return super()._parse_extract() 625 626 # We return Anonymous here because extract and regexpExtract have different semantics, 627 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 628 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 629 # 630 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 631 self._match(TokenType.COMMA) 632 return self.expression( 633 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 634 ) 635 636 def _parse_assignment(self) -> t.Optional[exp.Expression]: 637 this = super()._parse_assignment() 638 639 if self._match(TokenType.PLACEHOLDER): 640 return self.expression( 641 exp.If, 642 this=this, 643 true=self._parse_assignment(), 644 false=self._match(TokenType.COLON) and self._parse_assignment(), 645 ) 646 647 return this 648 649 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 650 """ 651 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 652 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 653 """ 654 index = self._index 655 656 this = self._parse_id_var() 657 self._match(TokenType.COLON) 658 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 659 self._match_text_seq("IDENTIFIER") and "Identifier" 660 ) 661 662 if not kind: 663 self._retreat(index) 664 return None 665 elif not self._match(TokenType.R_BRACE): 666 self.raise_error("Expecting }") 667 668 if isinstance(this, exp.Identifier) and not this.quoted: 669 this = exp.var(this.name) 670 671 return self.expression(exp.Placeholder, this=this, kind=kind) 672 673 def _parse_bracket( 674 self, this: t.Optional[exp.Expression] = None 675 ) -> t.Optional[exp.Expression]: 676 l_brace = self._match(TokenType.L_BRACE, advance=False) 677 bracket = super()._parse_bracket(this) 678 679 if l_brace and isinstance(bracket, exp.Struct): 680 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 681 for expression in bracket.expressions: 682 if not isinstance(expression, exp.PropertyEQ): 683 break 684 685 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 686 varmap.args["values"].append("expressions", expression.expression) 687 688 return varmap 689 690 return bracket 691 692 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 693 this = super()._parse_in(this) 694 this.set("is_global", is_global) 695 return this 696 697 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 698 is_negated = self._match(TokenType.NOT) 699 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 700 return self.expression(exp.Not, this=this) if is_negated else this 701 702 def _parse_table( 703 self, 704 schema: bool = False, 705 joins: bool = False, 706 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 707 parse_bracket: bool = False, 708 is_db_reference: bool = False, 709 parse_partition: bool = False, 710 consume_pipe: bool = False, 711 ) -> t.Optional[exp.Expression]: 712 this = super()._parse_table( 713 schema=schema, 714 joins=joins, 715 alias_tokens=alias_tokens, 716 parse_bracket=parse_bracket, 717 is_db_reference=is_db_reference, 718 ) 719 720 if isinstance(this, exp.Table): 721 inner = this.this 722 alias = this.args.get("alias") 723 724 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 725 alias.set("columns", [exp.to_identifier("generate_series")]) 726 727 if self._match(TokenType.FINAL): 728 this = self.expression(exp.Final, this=this) 729 730 return this 731 732 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 733 return super()._parse_position(haystack_first=True) 734 735 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 736 def _parse_cte(self) -> t.Optional[exp.CTE]: 737 # WITH <identifier> AS <subquery expression> 738 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 739 740 if not cte: 741 # WITH <expression> AS <identifier> 742 cte = self.expression( 743 exp.CTE, 744 this=self._parse_assignment(), 745 alias=self._parse_table_alias(), 746 scalar=True, 747 ) 748 749 return cte 750 751 def _parse_join_parts( 752 self, 753 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 754 is_global = self._match(TokenType.GLOBAL) and self._prev 755 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 756 757 if kind_pre: 758 kind = self._match_set(self.JOIN_KINDS) and self._prev 759 side = self._match_set(self.JOIN_SIDES) and self._prev 760 return is_global, side, kind 761 762 return ( 763 is_global, 764 self._match_set(self.JOIN_SIDES) and self._prev, 765 self._match_set(self.JOIN_KINDS) and self._prev, 766 ) 767 768 def _parse_join( 769 self, skip_join_token: bool = False, parse_bracket: bool = False 770 ) -> t.Optional[exp.Join]: 771 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 772 if join: 773 join.set("global", join.args.pop("method", None)) 774 775 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 776 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 777 if join.kind == "ARRAY": 778 for table in join.find_all(exp.Table): 779 table.replace(table.to_column()) 780 781 return join 782 783 def _parse_function( 784 self, 785 functions: t.Optional[t.Dict[str, t.Callable]] = None, 786 anonymous: bool = False, 787 optional_parens: bool = True, 788 any_token: bool = False, 789 ) -> t.Optional[exp.Expression]: 790 expr = super()._parse_function( 791 functions=functions, 792 anonymous=anonymous, 793 optional_parens=optional_parens, 794 any_token=any_token, 795 ) 796 797 func = expr.this if isinstance(expr, exp.Window) else expr 798 799 # Aggregate functions can be split in 2 parts: <func_name><suffix> 800 parts = ( 801 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 802 ) 803 804 if parts: 805 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 806 params = self._parse_func_params(anon_func) 807 808 kwargs = { 809 "this": anon_func.this, 810 "expressions": anon_func.expressions, 811 } 812 if parts[1]: 813 exp_class: t.Type[exp.Expression] = ( 814 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 815 ) 816 else: 817 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 818 819 kwargs["exp_class"] = exp_class 820 if params: 821 kwargs["params"] = params 822 823 func = self.expression(**kwargs) 824 825 if isinstance(expr, exp.Window): 826 # The window's func was parsed as Anonymous in base parser, fix its 827 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 828 expr.set("this", func) 829 elif params: 830 # Params have blocked super()._parse_function() from parsing the following window 831 # (if that exists) as they're standing between the function call and the window spec 832 expr = self._parse_window(func) 833 else: 834 expr = func 835 836 return expr 837 838 def _parse_func_params( 839 self, this: t.Optional[exp.Func] = None 840 ) -> t.Optional[t.List[exp.Expression]]: 841 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 842 return self._parse_csv(self._parse_lambda) 843 844 if self._match(TokenType.L_PAREN): 845 params = self._parse_csv(self._parse_lambda) 846 self._match_r_paren(this) 847 return params 848 849 return None 850 851 def _parse_quantile(self) -> exp.Quantile: 852 this = self._parse_lambda() 853 params = self._parse_func_params() 854 if params: 855 return self.expression(exp.Quantile, this=params[0], quantile=this) 856 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 857 858 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 859 return super()._parse_wrapped_id_vars(optional=True) 860 861 def _parse_primary_key( 862 self, wrapped_optional: bool = False, in_props: bool = False 863 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 864 return super()._parse_primary_key( 865 wrapped_optional=wrapped_optional or in_props, in_props=in_props 866 ) 867 868 def _parse_on_property(self) -> t.Optional[exp.Expression]: 869 index = self._index 870 if self._match_text_seq("CLUSTER"): 871 this = self._parse_string() or self._parse_id_var() 872 if this: 873 return self.expression(exp.OnCluster, this=this) 874 else: 875 self._retreat(index) 876 return None 877 878 def _parse_index_constraint( 879 self, kind: t.Optional[str] = None 880 ) -> exp.IndexColumnConstraint: 881 # INDEX name1 expr TYPE type1(args) GRANULARITY value 882 this = self._parse_id_var() 883 expression = self._parse_assignment() 884 885 index_type = self._match_text_seq("TYPE") and ( 886 self._parse_function() or self._parse_var() 887 ) 888 889 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 890 891 return self.expression( 892 exp.IndexColumnConstraint, 893 this=this, 894 expression=expression, 895 index_type=index_type, 896 granularity=granularity, 897 ) 898 899 def _parse_partition(self) -> t.Optional[exp.Partition]: 900 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 901 if not self._match(TokenType.PARTITION): 902 return None 903 904 if self._match_text_seq("ID"): 905 # Corresponds to the PARTITION ID <string_value> syntax 906 expressions: t.List[exp.Expression] = [ 907 self.expression(exp.PartitionId, this=self._parse_string()) 908 ] 909 else: 910 expressions = self._parse_expressions() 911 912 return self.expression(exp.Partition, expressions=expressions) 913 914 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 915 partition = self._parse_partition() 916 917 if not partition or not self._match(TokenType.FROM): 918 return None 919 920 return self.expression( 921 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 922 ) 923 924 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 925 if not self._match_text_seq("PROJECTION"): 926 return None 927 928 return self.expression( 929 exp.ProjectionDef, 930 this=self._parse_id_var(), 931 expression=self._parse_wrapped(self._parse_statement), 932 ) 933 934 def _parse_constraint(self) -> t.Optional[exp.Expression]: 935 return super()._parse_constraint() or self._parse_projection_def() 936 937 def _parse_alias( 938 self, this: t.Optional[exp.Expression], explicit: bool = False 939 ) -> t.Optional[exp.Expression]: 940 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 941 # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias 942 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 943 return this 944 945 return super()._parse_alias(this=this, explicit=explicit) 946 947 def _parse_expression(self) -> t.Optional[exp.Expression]: 948 this = super()._parse_expression() 949 950 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 951 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 952 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 953 self._match(TokenType.R_PAREN) 954 955 return this 956 957 def _parse_columns(self) -> exp.Expression: 958 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 959 960 while self._next and self._match_text_seq(")", "APPLY", "("): 961 self._match(TokenType.R_PAREN) 962 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 963 return this 964 965 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 966 value = super()._parse_value(values=values) 967 if not value: 968 return None 969 970 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 971 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 972 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 973 # but the final result is not altered by the extra parentheses. 974 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 975 expressions = value.expressions 976 if values and not isinstance(expressions[-1], exp.Tuple): 977 value.set( 978 "expressions", 979 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 980 ) 981 982 return value 983 984 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 985 # ClickHouse allows custom expressions as partition key 986 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 987 return self.expression( 988 exp.PartitionedByProperty, 989 this=self._parse_assignment(), 990 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- NESTED_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_HINTS
- LAMBDAS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- STATEMENT_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_ALTER_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- PREFIXED_PIVOT_COLUMNS
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- SUPPORTS_IMPLICIT_UNNEST
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- ALTER_RENAME_REQUIRES_COLUMN
- ALTER_TABLE_PARTITIONS
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- ADD_JOIN_ON_TRUE
- SUPPORTS_OMITTED_INTERVAL_SPAN_UNIT
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
992 class Generator(generator.Generator): 993 QUERY_HINTS = False 994 STRUCT_DELIMITER = ("(", ")") 995 NVL2_SUPPORTED = False 996 TABLESAMPLE_REQUIRES_PARENS = False 997 TABLESAMPLE_SIZE_IS_ROWS = False 998 TABLESAMPLE_KEYWORDS = "SAMPLE" 999 LAST_DAY_SUPPORTS_DATE_PART = False 1000 CAN_IMPLEMENT_ARRAY_ANY = True 1001 SUPPORTS_TO_NUMBER = False 1002 JOIN_HINTS = False 1003 TABLE_HINTS = False 1004 GROUPINGS_SEP = "" 1005 SET_OP_MODIFIERS = False 1006 ARRAY_SIZE_NAME = "LENGTH" 1007 WRAP_DERIVED_VALUES = False 1008 1009 STRING_TYPE_MAPPING = { 1010 exp.DataType.Type.BLOB: "String", 1011 exp.DataType.Type.CHAR: "String", 1012 exp.DataType.Type.LONGBLOB: "String", 1013 exp.DataType.Type.LONGTEXT: "String", 1014 exp.DataType.Type.MEDIUMBLOB: "String", 1015 exp.DataType.Type.MEDIUMTEXT: "String", 1016 exp.DataType.Type.TINYBLOB: "String", 1017 exp.DataType.Type.TINYTEXT: "String", 1018 exp.DataType.Type.TEXT: "String", 1019 exp.DataType.Type.VARBINARY: "String", 1020 exp.DataType.Type.VARCHAR: "String", 1021 } 1022 1023 SUPPORTED_JSON_PATH_PARTS = { 1024 exp.JSONPathKey, 1025 exp.JSONPathRoot, 1026 exp.JSONPathSubscript, 1027 } 1028 1029 TYPE_MAPPING = { 1030 **generator.Generator.TYPE_MAPPING, 1031 **STRING_TYPE_MAPPING, 1032 exp.DataType.Type.ARRAY: "Array", 1033 exp.DataType.Type.BOOLEAN: "Bool", 1034 exp.DataType.Type.BIGINT: "Int64", 1035 exp.DataType.Type.DATE32: "Date32", 1036 exp.DataType.Type.DATETIME: "DateTime", 1037 exp.DataType.Type.DATETIME2: "DateTime", 1038 exp.DataType.Type.SMALLDATETIME: "DateTime", 1039 exp.DataType.Type.DATETIME64: "DateTime64", 1040 exp.DataType.Type.DECIMAL: "Decimal", 1041 exp.DataType.Type.DECIMAL32: "Decimal32", 1042 exp.DataType.Type.DECIMAL64: "Decimal64", 1043 exp.DataType.Type.DECIMAL128: "Decimal128", 1044 exp.DataType.Type.DECIMAL256: "Decimal256", 1045 exp.DataType.Type.TIMESTAMP: "DateTime", 1046 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1047 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1048 exp.DataType.Type.DOUBLE: "Float64", 1049 exp.DataType.Type.ENUM: "Enum", 1050 exp.DataType.Type.ENUM8: "Enum8", 1051 exp.DataType.Type.ENUM16: "Enum16", 1052 exp.DataType.Type.FIXEDSTRING: "FixedString", 1053 exp.DataType.Type.FLOAT: "Float32", 1054 exp.DataType.Type.INT: "Int32", 1055 exp.DataType.Type.MEDIUMINT: "Int32", 1056 exp.DataType.Type.INT128: "Int128", 1057 exp.DataType.Type.INT256: "Int256", 1058 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1059 exp.DataType.Type.MAP: "Map", 1060 exp.DataType.Type.NESTED: "Nested", 1061 exp.DataType.Type.NOTHING: "Nothing", 1062 exp.DataType.Type.SMALLINT: "Int16", 1063 exp.DataType.Type.STRUCT: "Tuple", 1064 exp.DataType.Type.TINYINT: "Int8", 1065 exp.DataType.Type.UBIGINT: "UInt64", 1066 exp.DataType.Type.UINT: "UInt32", 1067 exp.DataType.Type.UINT128: "UInt128", 1068 exp.DataType.Type.UINT256: "UInt256", 1069 exp.DataType.Type.USMALLINT: "UInt16", 1070 exp.DataType.Type.UTINYINT: "UInt8", 1071 exp.DataType.Type.IPV4: "IPv4", 1072 exp.DataType.Type.IPV6: "IPv6", 1073 exp.DataType.Type.POINT: "Point", 1074 exp.DataType.Type.RING: "Ring", 1075 exp.DataType.Type.LINESTRING: "LineString", 1076 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1077 exp.DataType.Type.POLYGON: "Polygon", 1078 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1079 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1080 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1081 exp.DataType.Type.DYNAMIC: "Dynamic", 1082 } 1083 1084 TRANSFORMS = { 1085 **generator.Generator.TRANSFORMS, 1086 exp.AnyValue: rename_func("any"), 1087 exp.ApproxDistinct: rename_func("uniq"), 1088 exp.ArrayConcat: rename_func("arrayConcat"), 1089 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1090 exp.ArrayRemove: remove_from_array_using_filter, 1091 exp.ArrayReverse: rename_func("arrayReverse"), 1092 exp.ArraySlice: rename_func("arraySlice"), 1093 exp.ArraySum: rename_func("arraySum"), 1094 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1095 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1096 exp.Array: inline_array_sql, 1097 exp.CastToStrType: rename_func("CAST"), 1098 exp.CountIf: rename_func("countIf"), 1099 exp.CosineDistance: rename_func("cosineDistance"), 1100 exp.CompressColumnConstraint: lambda self, 1101 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1102 exp.ComputedColumnConstraint: lambda self, 1103 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1104 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1105 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1106 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1107 exp.DateStrToDate: rename_func("toDate"), 1108 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1109 exp.Explode: rename_func("arrayJoin"), 1110 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1111 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1112 exp.IsNan: rename_func("isNaN"), 1113 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1114 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1115 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1116 exp.JSONPathKey: json_path_key_only_name, 1117 exp.JSONPathRoot: lambda *_: "", 1118 exp.Length: length_or_char_length_sql, 1119 exp.Map: _map_sql, 1120 exp.Median: rename_func("median"), 1121 exp.Nullif: rename_func("nullIf"), 1122 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1123 exp.Pivot: no_pivot_sql, 1124 exp.Quantile: _quantile_sql, 1125 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1126 exp.Rand: rename_func("randCanonical"), 1127 exp.StartsWith: rename_func("startsWith"), 1128 exp.Struct: rename_func("tuple"), 1129 exp.EndsWith: rename_func("endsWith"), 1130 exp.EuclideanDistance: rename_func("L2Distance"), 1131 exp.StrPosition: lambda self, e: strposition_sql( 1132 self, 1133 e, 1134 func_name="POSITION", 1135 supports_position=True, 1136 use_ansi_position=False, 1137 ), 1138 exp.TimeToStr: lambda self, e: self.func( 1139 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1140 ), 1141 exp.TimeStrToTime: _timestrtotime_sql, 1142 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1143 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1144 exp.Typeof: rename_func("toTypeName"), 1145 exp.VarMap: _map_sql, 1146 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1147 exp.MD5Digest: rename_func("MD5"), 1148 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1149 exp.SHA: rename_func("SHA1"), 1150 exp.SHA2: sha256_sql, 1151 exp.UnixToTime: _unix_to_time_sql, 1152 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1153 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1154 exp.Variance: rename_func("varSamp"), 1155 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1156 exp.Stddev: rename_func("stddevSamp"), 1157 exp.Chr: rename_func("CHAR"), 1158 exp.Lag: lambda self, e: self.func( 1159 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1160 ), 1161 exp.Lead: lambda self, e: self.func( 1162 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1163 ), 1164 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1165 rename_func("editDistance") 1166 ), 1167 exp.ParseDatetime: rename_func("parseDateTime"), 1168 } 1169 1170 PROPERTIES_LOCATION = { 1171 **generator.Generator.PROPERTIES_LOCATION, 1172 exp.OnCluster: exp.Properties.Location.POST_NAME, 1173 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1174 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1175 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1176 } 1177 1178 # There's no list in docs, but it can be found in Clickhouse code 1179 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1180 ON_CLUSTER_TARGETS = { 1181 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1182 "DATABASE", 1183 "TABLE", 1184 "VIEW", 1185 "DICTIONARY", 1186 "INDEX", 1187 "FUNCTION", 1188 "NAMED COLLECTION", 1189 } 1190 1191 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1192 NON_NULLABLE_TYPES = { 1193 exp.DataType.Type.ARRAY, 1194 exp.DataType.Type.MAP, 1195 exp.DataType.Type.STRUCT, 1196 exp.DataType.Type.POINT, 1197 exp.DataType.Type.RING, 1198 exp.DataType.Type.LINESTRING, 1199 exp.DataType.Type.MULTILINESTRING, 1200 exp.DataType.Type.POLYGON, 1201 exp.DataType.Type.MULTIPOLYGON, 1202 } 1203 1204 def offset_sql(self, expression: exp.Offset) -> str: 1205 offset = super().offset_sql(expression) 1206 1207 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1208 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1209 parent = expression.parent 1210 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1211 offset = f"{offset} ROWS" 1212 1213 return offset 1214 1215 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1216 strtodate_sql = self.function_fallback_sql(expression) 1217 1218 if not isinstance(expression.parent, exp.Cast): 1219 # StrToDate returns DATEs in other dialects (eg. postgres), so 1220 # this branch aims to improve the transpilation to clickhouse 1221 return self.cast_sql(exp.cast(expression, "DATE")) 1222 1223 return strtodate_sql 1224 1225 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1226 this = expression.this 1227 1228 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1229 return self.sql(this) 1230 1231 return super().cast_sql(expression, safe_prefix=safe_prefix) 1232 1233 def trycast_sql(self, expression: exp.TryCast) -> str: 1234 dtype = expression.to 1235 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1236 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1237 dtype.set("nullable", True) 1238 1239 return super().cast_sql(expression) 1240 1241 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1242 this = self.json_path_part(expression.this) 1243 return str(int(this) + 1) if is_int(this) else this 1244 1245 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1246 return f"AS {self.sql(expression, 'this')}" 1247 1248 def _any_to_has( 1249 self, 1250 expression: exp.EQ | exp.NEQ, 1251 default: t.Callable[[t.Any], str], 1252 prefix: str = "", 1253 ) -> str: 1254 if isinstance(expression.left, exp.Any): 1255 arr = expression.left 1256 this = expression.right 1257 elif isinstance(expression.right, exp.Any): 1258 arr = expression.right 1259 this = expression.left 1260 else: 1261 return default(expression) 1262 1263 return prefix + self.func("has", arr.this.unnest(), this) 1264 1265 def eq_sql(self, expression: exp.EQ) -> str: 1266 return self._any_to_has(expression, super().eq_sql) 1267 1268 def neq_sql(self, expression: exp.NEQ) -> str: 1269 return self._any_to_has(expression, super().neq_sql, "NOT ") 1270 1271 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1272 # Manually add a flag to make the search case-insensitive 1273 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1274 return self.func("match", expression.this, regex) 1275 1276 def datatype_sql(self, expression: exp.DataType) -> str: 1277 # String is the standard ClickHouse type, every other variant is just an alias. 1278 # Additionally, any supplied length parameter will be ignored. 1279 # 1280 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1281 if expression.this in self.STRING_TYPE_MAPPING: 1282 dtype = "String" 1283 else: 1284 dtype = super().datatype_sql(expression) 1285 1286 # This section changes the type to `Nullable(...)` if the following conditions hold: 1287 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1288 # and change their semantics 1289 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1290 # constraint: "Type of Map key must be a type, that can be represented by integer or 1291 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1292 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1293 parent = expression.parent 1294 nullable = expression.args.get("nullable") 1295 if nullable is True or ( 1296 nullable is None 1297 and not ( 1298 isinstance(parent, exp.DataType) 1299 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1300 and expression.index in (None, 0) 1301 ) 1302 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1303 ): 1304 dtype = f"Nullable({dtype})" 1305 1306 return dtype 1307 1308 def cte_sql(self, expression: exp.CTE) -> str: 1309 if expression.args.get("scalar"): 1310 this = self.sql(expression, "this") 1311 alias = self.sql(expression, "alias") 1312 return f"{this} AS {alias}" 1313 1314 return super().cte_sql(expression) 1315 1316 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1317 return super().after_limit_modifiers(expression) + [ 1318 ( 1319 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1320 if expression.args.get("settings") 1321 else "" 1322 ), 1323 ( 1324 self.seg("FORMAT ") + self.sql(expression, "format") 1325 if expression.args.get("format") 1326 else "" 1327 ), 1328 ] 1329 1330 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1331 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1332 1333 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1334 return f"ON CLUSTER {self.sql(expression, 'this')}" 1335 1336 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1337 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1338 exp.Properties.Location.POST_NAME 1339 ): 1340 this_name = self.sql( 1341 expression.this if isinstance(expression.this, exp.Schema) else expression, 1342 "this", 1343 ) 1344 this_properties = " ".join( 1345 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1346 ) 1347 this_schema = self.schema_columns_sql(expression.this) 1348 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1349 1350 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1351 1352 return super().createable_sql(expression, locations) 1353 1354 def create_sql(self, expression: exp.Create) -> str: 1355 # The comment property comes last in CTAS statements, i.e. after the query 1356 query = expression.expression 1357 if isinstance(query, exp.Query): 1358 comment_prop = expression.find(exp.SchemaCommentProperty) 1359 if comment_prop: 1360 comment_prop.pop() 1361 query.replace(exp.paren(query)) 1362 else: 1363 comment_prop = None 1364 1365 create_sql = super().create_sql(expression) 1366 1367 comment_sql = self.sql(comment_prop) 1368 comment_sql = f" {comment_sql}" if comment_sql else "" 1369 1370 return f"{create_sql}{comment_sql}" 1371 1372 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1373 this = self.indent(self.sql(expression, "this")) 1374 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1375 1376 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1377 this = self.sql(expression, "this") 1378 this = f" {this}" if this else "" 1379 expr = self.sql(expression, "expression") 1380 expr = f" {expr}" if expr else "" 1381 index_type = self.sql(expression, "index_type") 1382 index_type = f" TYPE {index_type}" if index_type else "" 1383 granularity = self.sql(expression, "granularity") 1384 granularity = f" GRANULARITY {granularity}" if granularity else "" 1385 1386 return f"INDEX{this}{expr}{index_type}{granularity}" 1387 1388 def partition_sql(self, expression: exp.Partition) -> str: 1389 return f"PARTITION {self.expressions(expression, flat=True)}" 1390 1391 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1392 return f"ID {self.sql(expression.this)}" 1393 1394 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1395 return ( 1396 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1397 ) 1398 1399 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1400 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1401 1402 def is_sql(self, expression: exp.Is) -> str: 1403 is_sql = super().is_sql(expression) 1404 1405 if isinstance(expression.parent, exp.Not): 1406 # value IS NOT NULL -> NOT (value IS NULL) 1407 is_sql = self.wrap(is_sql) 1408 1409 return is_sql 1410 1411 def in_sql(self, expression: exp.In) -> str: 1412 in_sql = super().in_sql(expression) 1413 1414 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1415 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1416 1417 return in_sql 1418 1419 def not_sql(self, expression: exp.Not) -> str: 1420 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1421 # let `GLOBAL IN` child interpose `NOT` 1422 return self.sql(expression, "this") 1423 1424 return super().not_sql(expression) 1425 1426 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1427 # If the VALUES clause contains tuples of expressions, we need to treat it 1428 # as a table since Clickhouse will automatically alias it as such. 1429 alias = expression.args.get("alias") 1430 1431 if alias and alias.args.get("columns") and expression.expressions: 1432 values = expression.expressions[0].expressions 1433 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1434 else: 1435 values_as_table = True 1436 1437 return super().values_sql(expression, values_as_table=values_as_table)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the
indentation of subqueries and filters under a
WHERE
clause. Default: 2. - normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
1204 def offset_sql(self, expression: exp.Offset) -> str: 1205 offset = super().offset_sql(expression) 1206 1207 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1208 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1209 parent = expression.parent 1210 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1211 offset = f"{offset} ROWS" 1212 1213 return offset
1215 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1216 strtodate_sql = self.function_fallback_sql(expression) 1217 1218 if not isinstance(expression.parent, exp.Cast): 1219 # StrToDate returns DATEs in other dialects (eg. postgres), so 1220 # this branch aims to improve the transpilation to clickhouse 1221 return self.cast_sql(exp.cast(expression, "DATE")) 1222 1223 return strtodate_sql
1225 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1226 this = expression.this 1227 1228 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1229 return self.sql(this) 1230 1231 return super().cast_sql(expression, safe_prefix=safe_prefix)
1233 def trycast_sql(self, expression: exp.TryCast) -> str: 1234 dtype = expression.to 1235 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1236 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1237 dtype.set("nullable", True) 1238 1239 return super().cast_sql(expression)
1276 def datatype_sql(self, expression: exp.DataType) -> str: 1277 # String is the standard ClickHouse type, every other variant is just an alias. 1278 # Additionally, any supplied length parameter will be ignored. 1279 # 1280 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1281 if expression.this in self.STRING_TYPE_MAPPING: 1282 dtype = "String" 1283 else: 1284 dtype = super().datatype_sql(expression) 1285 1286 # This section changes the type to `Nullable(...)` if the following conditions hold: 1287 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1288 # and change their semantics 1289 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1290 # constraint: "Type of Map key must be a type, that can be represented by integer or 1291 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1292 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1293 parent = expression.parent 1294 nullable = expression.args.get("nullable") 1295 if nullable is True or ( 1296 nullable is None 1297 and not ( 1298 isinstance(parent, exp.DataType) 1299 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1300 and expression.index in (None, 0) 1301 ) 1302 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1303 ): 1304 dtype = f"Nullable({dtype})" 1305 1306 return dtype
1316 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1317 return super().after_limit_modifiers(expression) + [ 1318 ( 1319 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1320 if expression.args.get("settings") 1321 else "" 1322 ), 1323 ( 1324 self.seg("FORMAT ") + self.sql(expression, "format") 1325 if expression.args.get("format") 1326 else "" 1327 ), 1328 ]
1336 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1337 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1338 exp.Properties.Location.POST_NAME 1339 ): 1340 this_name = self.sql( 1341 expression.this if isinstance(expression.this, exp.Schema) else expression, 1342 "this", 1343 ) 1344 this_properties = " ".join( 1345 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1346 ) 1347 this_schema = self.schema_columns_sql(expression.this) 1348 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1349 1350 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1351 1352 return super().createable_sql(expression, locations)
1354 def create_sql(self, expression: exp.Create) -> str: 1355 # The comment property comes last in CTAS statements, i.e. after the query 1356 query = expression.expression 1357 if isinstance(query, exp.Query): 1358 comment_prop = expression.find(exp.SchemaCommentProperty) 1359 if comment_prop: 1360 comment_prop.pop() 1361 query.replace(exp.paren(query)) 1362 else: 1363 comment_prop = None 1364 1365 create_sql = super().create_sql(expression) 1366 1367 comment_sql = self.sql(comment_prop) 1368 comment_sql = f" {comment_sql}" if comment_sql else "" 1369 1370 return f"{create_sql}{comment_sql}"
1376 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1377 this = self.sql(expression, "this") 1378 this = f" {this}" if this else "" 1379 expr = self.sql(expression, "expression") 1380 expr = f" {expr}" if expr else "" 1381 index_type = self.sql(expression, "index_type") 1382 index_type = f" TYPE {index_type}" if index_type else "" 1383 granularity = self.sql(expression, "granularity") 1384 granularity = f" GRANULARITY {granularity}" if granularity else "" 1385 1386 return f"INDEX{this}{expr}{index_type}{granularity}"
1426 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1427 # If the VALUES clause contains tuples of expressions, we need to treat it 1428 # as a table since Clickhouse will automatically alias it as such. 1429 alias = expression.args.get("alias") 1430 1431 if alias and alias.args.get("columns") and expression.expressions: 1432 values = expression.expressions[0].expressions 1433 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1434 else: 1435 values_as_table = True 1436 1437 return super().values_sql(expression, values_as_table=values_as_table)
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- IGNORE_NULLS_IN_FUNC
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_FETCH
- LIMIT_ONLY_LITERALS
- RENAME_TABLE_WITH_DB
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- JSON_KEY_VALUE_PAIR_SEP
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SUPPORTS_WINDOW_EXCLUDE
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- UNICODE_SUBSTITUTE
- STAR_EXCEPT
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- MATCH_AGAINST_TABLE_PREFIX
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- PARAMETER_TOKEN
- NAMED_PLACEHOLDER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- RESERVED_KEYWORDS
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- subquery_sql
- qualify_sql
- unnest_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- bracket_sql
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- mltranslate_sql
- mlforecast_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- install_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql
- modelattribute_sql
- directorystage_sql