sqlglot.dialects.clickhouse
from __future__ import annotations

import typing as t
import datetime

from sqlglot import exp, generator, parser, tokens
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    arg_max_or_min_no_count,
    build_date_delta,
    build_formatted_time,
    inline_array_sql,
    json_extract_segments,
    json_path_key_only_name,
    length_or_char_length_sql,
    no_pivot_sql,
    build_json_extract_path,
    rename_func,
    remove_from_array_using_filter,
    sha256_sql,
    strposition_sql,
    var_map_sql,
    timestamptrunc_sql,
    unit_to_var,
    trim_sql,
)
from sqlglot.generator import Generator
from sqlglot.helper import is_int, seq_get
from sqlglot.tokens import Token, TokenType
from sqlglot.generator import unsupported_args

DATETIME_DELTA = t.Union[
    exp.DateAdd, exp.DateDiff, exp.DateSub, exp.TimestampSub, exp.TimestampAdd
]


def _build_datetime_format(
    expr_type: t.Type[E],
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        expr = build_formatted_time(expr_type, "clickhouse")(args)

        timezone = seq_get(args, 2)
        if timezone:
            expr.set("zone", timezone)

        return expr

    return _builder


def _unix_to_time_sql(self: ClickHouse.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("fromUnixTimestamp", exp.cast(timestamp, exp.DataType.Type.BIGINT))
    if scale == exp.UnixToTime.MILLIS:
        return self.func("fromUnixTimestamp64Milli", exp.cast(timestamp, exp.DataType.Type.BIGINT))
    if scale == exp.UnixToTime.MICROS:
        return self.func("fromUnixTimestamp64Micro", exp.cast(timestamp, exp.DataType.Type.BIGINT))
    if scale == exp.UnixToTime.NANOS:
        return self.func("fromUnixTimestamp64Nano", exp.cast(timestamp, exp.DataType.Type.BIGINT))

    return self.func(
        "fromUnixTimestamp",
        exp.cast(
            exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
        ),
    )


def _lower_func(sql: str) -> str:
    index = sql.index("(")
    return sql[:index].lower() + sql[index:]


def _quantile_sql(self: ClickHouse.Generator, expression: exp.Quantile) -> str:
    quantile = expression.args["quantile"]
    args = f"({self.sql(expression, 'this')})"

    if isinstance(quantile, exp.Array):
        func = self.func("quantiles", *quantile)
    else:
        func = self.func("quantile", quantile)

    return func + args


def _build_count_if(args: t.List) -> exp.CountIf | exp.CombinedAggFunc:
    if len(args) == 1:
        return exp.CountIf(this=seq_get(args, 0))

    return exp.CombinedAggFunc(this="countIf", expressions=args)


def _build_str_to_date(args: t.List) -> exp.Cast | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="STR_TO_DATE", expressions=args)

    strtodate = exp.StrToDate.from_arg_list(args)
    return exp.cast(strtodate, exp.DataType.build(exp.DataType.Type.DATETIME))


def _datetime_delta_sql(name: str) -> t.Callable[[Generator, DATETIME_DELTA], str]:
    def _delta_sql(self: Generator, expression: DATETIME_DELTA) -> str:
        if not expression.unit:
            return rename_func(name)(self, expression)

        return self.func(
            name,
            unit_to_var(expression),
            expression.expression,
            expression.this,
            expression.args.get("zone"),
        )

    return _delta_sql


def _timestrtotime_sql(self: ClickHouse.Generator, expression: exp.TimeStrToTime):
    ts = expression.this

    tz = expression.args.get("zone")
    if tz and isinstance(ts, exp.Literal):
        # Clickhouse will not accept timestamps that include a UTC offset, so we must remove them.
        # The first step to removing is parsing the string with `datetime.datetime.fromisoformat`.
        #
        # In python <3.11, `fromisoformat()` can only parse timestamps of millisecond (3 digit)
        # or microsecond (6 digit) precision. It will error if passed any other number of fractional
        # digits, so we extract the fractional seconds and pad to 6 digits before parsing.
        ts_string = ts.name.strip()

        # separate [date and time] from [fractional seconds and UTC offset]
        ts_parts = ts_string.split(".")
        if len(ts_parts) == 2:
            # separate fractional seconds and UTC offset
            offset_sep = "+" if "+" in ts_parts[1] else "-"
            ts_frac_parts = ts_parts[1].split(offset_sep)
            num_frac_parts = len(ts_frac_parts)

            # pad to 6 digits if fractional seconds present
            ts_frac_parts[0] = ts_frac_parts[0].ljust(6, "0")
            ts_string = "".join(
                [
                    ts_parts[0],  # date and time
                    ".",
                    ts_frac_parts[0],  # fractional seconds
                    offset_sep if num_frac_parts > 1 else "",
                    ts_frac_parts[1] if num_frac_parts > 1 else "",  # utc offset (if present)
                ]
            )

        # return literal with no timezone, eg turn '2020-01-01 12:13:14-08:00' into '2020-01-01 12:13:14'
        # this is because Clickhouse encodes the timezone as a data type parameter and throws an error if
        # it's part of the timestamp string
        ts_without_tz = (
            datetime.datetime.fromisoformat(ts_string).replace(tzinfo=None).isoformat(sep=" ")
        )
        ts = exp.Literal.string(ts_without_tz)

    # Non-nullable DateTime64 with microsecond precision
    expressions = [exp.DataTypeParam(this=tz)] if tz else []
    datatype = exp.DataType.build(
        exp.DataType.Type.DATETIME64,
        expressions=[exp.DataTypeParam(this=exp.Literal.number(6)), *expressions],
        nullable=False,
    )

    return self.sql(exp.cast(ts, datatype, dialect=self.dialect))


def _map_sql(self: ClickHouse.Generator, expression: exp.Map | exp.VarMap) -> str:
    if not (expression.parent and expression.parent.arg_key == "settings"):
        return _lower_func(var_map_sql(self, expression))

    keys = expression.args.get("keys")
    values = expression.args.get("values")

    if not isinstance(keys, exp.Array) or not isinstance(values, exp.Array):
        self.unsupported("Cannot convert array columns into map.")
        return ""

    args = []
    for key, value in zip(keys.expressions, values.expressions):
        args.append(f"{self.sql(key)}: {self.sql(value)}")

    csv_args = ", ".join(args)

    return f"{{{csv_args}}}"
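
# A small illustrative sketch (not part of the module) of the fractional-second
# padding performed by `_timestrtotime_sql` above, using a hypothetical input
# literal: on Python <3.11 `fromisoformat` only accepts 3- or 6-digit fractions,
# so the fraction is padded to 6 digits before parsing, and the offset is
# dropped afterwards.
#
#     >>> import datetime
#     >>> date_time, rest = "2020-01-01 12:13:14.12-08:00".split(".")
#     >>> frac, offset = rest.split("-")
#     >>> padded = f"{date_time}.{frac.ljust(6, '0')}-{offset}"
#     >>> datetime.datetime.fromisoformat(padded).replace(tzinfo=None).isoformat(sep=" ")
#     '2020-01-01 12:13:14.120000'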


class ClickHouse(Dialect):
    INDEX_OFFSET = 1
    NORMALIZE_FUNCTIONS: bool | str = False
    NULL_ORDERING = "nulls_are_last"
    SUPPORTS_USER_DEFINED_TYPES = False
    SAFE_DIVISION = True
    LOG_BASE_FIRST: t.Optional[bool] = None
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE

    UNESCAPED_SEQUENCES = {
        "\\0": "\0",
    }

    CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"}

    SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = {
        exp.Except: False,
        exp.Intersect: False,
        exp.Union: None,
    }

    def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]:
        # Clickhouse allows VALUES to have an embedded structure e.g:
        # VALUES('person String, place String', ('Noah', 'Paris'), ...)
        # In this case, we don't want to qualify the columns
        values = expression.expressions[0].expressions

        structure = (
            values[0]
            if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple))
            else None
        )
        if structure:
            # Split each column definition into the column name e.g:
            # 'person String, place String' -> ['person', 'place']
            structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")]
            column_aliases = [
                exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs
            ]
        else:
            # Default column aliases in CH are "c1", "c2", etc.
            column_aliases = [
                exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions))
            ]

        return column_aliases

    class Tokenizer(tokens.Tokenizer):
        COMMENTS = ["--", "#", "#!", ("/*", "*/")]
        IDENTIFIERS = ['"', "`"]
        IDENTIFIER_ESCAPES = ["\\"]
        STRING_ESCAPES = ["'", "\\"]
        BIT_STRINGS = [("0b", "")]
        HEX_STRINGS = [("0x", ""), ("0X", "")]
        HEREDOC_STRINGS = ["$"]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            ".:": TokenType.DOTCOLON,
            "ATTACH": TokenType.COMMAND,
            "DATE32": TokenType.DATE32,
            "DATETIME64": TokenType.DATETIME64,
            "DICTIONARY": TokenType.DICTIONARY,
            "DYNAMIC": TokenType.DYNAMIC,
            "ENUM8": TokenType.ENUM8,
            "ENUM16": TokenType.ENUM16,
            "EXCHANGE": TokenType.COMMAND,
            "FINAL": TokenType.FINAL,
            "FIXEDSTRING": TokenType.FIXEDSTRING,
            "FLOAT32": TokenType.FLOAT,
            "FLOAT64": TokenType.DOUBLE,
            "GLOBAL": TokenType.GLOBAL,
            "LOWCARDINALITY": TokenType.LOWCARDINALITY,
            "MAP": TokenType.MAP,
            "NESTED": TokenType.NESTED,
            "NOTHING": TokenType.NOTHING,
            "SAMPLE": TokenType.TABLE_SAMPLE,
            "TUPLE": TokenType.STRUCT,
            "UINT16": TokenType.USMALLINT,
            "UINT32": TokenType.UINT,
            "UINT64": TokenType.UBIGINT,
            "UINT8": TokenType.UTINYINT,
            "IPV4": TokenType.IPV4,
            "IPV6": TokenType.IPV6,
            "POINT": TokenType.POINT,
            "RING": TokenType.RING,
            "LINESTRING": TokenType.LINESTRING,
            "MULTILINESTRING": TokenType.MULTILINESTRING,
            "POLYGON": TokenType.POLYGON,
            "MULTIPOLYGON": TokenType.MULTIPOLYGON,
            "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION,
            "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION,
            "SYSTEM": TokenType.COMMAND,
            "PREWHERE": TokenType.PREWHERE,
        }
        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.HEREDOC_STRING,
        }

    class Parser(parser.Parser):
        # Tested in ClickHouse's playground, it seems that the following two queries do the same thing
        # * select x from t1 union all select x from t2 limit 1;
        # * select x from t1 union all (select x from t2 limit 1);
        MODIFIERS_ATTACHED_TO_SET_OP = False
        INTERVAL_SPANS = False
        OPTIONAL_ALIAS_TOKEN_CTE = False
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "ANY": exp.AnyValue.from_arg_list,
            "ARRAYSUM": exp.ArraySum.from_arg_list,
            "ARRAYREVERSE": exp.ArrayReverse.from_arg_list,
            "ARRAYSLICE": exp.ArraySlice.from_arg_list,
            "COUNTIF": _build_count_if,
            "COSINEDISTANCE": exp.CosineDistance.from_arg_list,
            "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None),
            "DATEADD": build_date_delta(exp.DateAdd, default_unit=None),
            "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True),
            "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True),
            "DATE_FORMAT": _build_datetime_format(exp.TimeToStr),
            "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None),
            "DATESUB": build_date_delta(exp.DateSub, default_unit=None),
            "FORMATDATETIME": _build_datetime_format(exp.TimeToStr),
            "JSONEXTRACTSTRING": build_json_extract_path(
                exp.JSONExtractScalar, zero_based_indexing=False
            ),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "L2Distance": exp.EuclideanDistance.from_arg_list,
            "MAP": parser.build_var_map,
            "MATCH": exp.RegexpLike.from_arg_list,
            "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime),
            "RANDCANONICAL": exp.Rand.from_arg_list,
            "STR_TO_DATE": _build_str_to_date,
            "TUPLE": exp.Struct.from_arg_list,
            "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None),
            "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None),
            "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None),
            "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None),
            "UNIQ": exp.ApproxDistinct.from_arg_list,
            "XOR": lambda args: exp.Xor(expressions=args),
            "MD5": exp.MD5Digest.from_arg_list,
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list,
            "TOTYPENAME": exp.Typeof.from_arg_list,
            "EDITDISTANCE": exp.Levenshtein.from_arg_list,
            "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list,
        }
        FUNCTIONS.pop("TRANSFORM")
        FUNCTIONS.pop("APPROX_TOP_SUM")

        AGG_FUNCTIONS = {
            "count",
            "min",
            "max",
            "sum",
            "avg",
            "any",
            "stddevPop",
            "stddevSamp",
            "varPop",
            "varSamp",
            "corr",
            "covarPop",
            "covarSamp",
            "entropy",
            "exponentialMovingAverage",
            "intervalLengthSum",
            "kolmogorovSmirnovTest",
            "mannWhitneyUTest",
            "median",
            "rankCorr",
            "sumKahan",
            "studentTTest",
            "welchTTest",
            "anyHeavy",
            "anyLast",
            "boundingRatio",
            "first_value",
            "last_value",
            "argMin",
            "argMax",
            "avgWeighted",
            "topK",
            "approx_top_sum",
            "topKWeighted",
            "deltaSum",
            "deltaSumTimestamp",
            "groupArray",
            "groupArrayLast",
            "groupUniqArray",
            "groupArrayInsertAt",
            "groupArrayMovingAvg",
            "groupArrayMovingSum",
            "groupArraySample",
            "groupBitAnd",
            "groupBitOr",
            "groupBitXor",
            "groupBitmap",
            "groupBitmapAnd",
            "groupBitmapOr",
            "groupBitmapXor",
            "sumWithOverflow",
            "sumMap",
            "minMap",
            "maxMap",
            "skewSamp",
            "skewPop",
            "kurtSamp",
            "kurtPop",
            "uniq",
            "uniqExact",
            "uniqCombined",
            "uniqCombined64",
            "uniqHLL12",
            "uniqTheta",
            "quantile",
            "quantiles",
            "quantileExact",
            "quantilesExact",
            "quantileExactLow",
            "quantilesExactLow",
            "quantileExactHigh",
            "quantilesExactHigh",
            "quantileExactWeighted",
            "quantilesExactWeighted",
            "quantileTiming",
            "quantilesTiming",
            "quantileTimingWeighted",
            "quantilesTimingWeighted",
            "quantileDeterministic",
            "quantilesDeterministic",
            "quantileTDigest",
            "quantilesTDigest",
            "quantileTDigestWeighted",
            "quantilesTDigestWeighted",
            "quantileBFloat16",
            "quantilesBFloat16",
            "quantileBFloat16Weighted",
            "quantilesBFloat16Weighted",
            "simpleLinearRegression",
            "stochasticLinearRegression",
            "stochasticLogisticRegression",
            "categoricalInformationValue",
            "contingency",
            "cramersV",
            "cramersVBiasCorrected",
            "theilsU",
            "maxIntersections",
            "maxIntersectionsPosition",
            "meanZTest",
            "quantileInterpolatedWeighted",
            "quantilesInterpolatedWeighted",
            "quantileGK",
            "quantilesGK",
            "sparkBar",
            "sumCount",
            "largestTriangleThreeBuckets",
            "histogram",
            "sequenceMatch",
            "sequenceCount",
            "windowFunnel",
            "retention",
            "uniqUpTo",
            "sequenceNextNode",
            "exponentialTimeDecayedAvg",
        }

        AGG_FUNCTIONS_SUFFIXES = [
            "If",
            "Array",
            "ArrayIf",
            "Map",
            "SimpleState",
            "State",
            "Merge",
            "MergeState",
            "ForEach",
            "Distinct",
            "OrDefault",
            "OrNull",
            "Resample",
            "ArgMin",
            "ArgMax",
        ]

        FUNC_TOKENS = {
            *parser.Parser.FUNC_TOKENS,
            TokenType.AND,
            TokenType.OR,
            TokenType.SET,
        }

        RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT}

        ID_VAR_TOKENS = {
            *parser.Parser.ID_VAR_TOKENS,
            TokenType.LIKE,
        }

        AGG_FUNC_MAPPING = (
            lambda functions, suffixes: {
                f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions
            }
        )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES)

        FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"}

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()),
            "QUANTILE": lambda self: self._parse_quantile(),
            "MEDIAN": lambda self: self._parse_quantile(),
            "COLUMNS": lambda self: self._parse_columns(),
        }

        FUNCTION_PARSERS.pop("MATCH")

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "ENGINE": lambda self: self._parse_engine_property(),
        }
        PROPERTY_PARSERS.pop("DYNAMIC")

        NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy()
        NO_PAREN_FUNCTION_PARSERS.pop("ANY")

        NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy()
        NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP)

        RANGE_PARSERS = {
            **parser.Parser.RANGE_PARSERS,
            TokenType.GLOBAL: lambda self, this: self._parse_global_in(this),
        }

        # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to
        # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler.
        COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy()
        COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER)

        JOIN_KINDS = {
            *parser.Parser.JOIN_KINDS,
            TokenType.ANY,
            TokenType.ASOF,
            TokenType.ARRAY,
        }

        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
            TokenType.ANY,
            TokenType.ARRAY,
            TokenType.FINAL,
            TokenType.FORMAT,
            TokenType.SETTINGS,
        }

        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {
            TokenType.FORMAT,
        }

        LOG_DEFAULTS_TO_LN = True

        QUERY_MODIFIER_PARSERS = {
            **parser.Parser.QUERY_MODIFIER_PARSERS,
            TokenType.SETTINGS: lambda self: (
                "settings",
                self._advance() or self._parse_csv(self._parse_assignment),
            ),
            TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "INDEX": lambda self: self._parse_index_constraint(),
            "CODEC": lambda self: self._parse_compress(),
        }

        ALTER_PARSERS = {
            **parser.Parser.ALTER_PARSERS,
            "REPLACE": lambda self: self._parse_alter_table_replace(),
        }

        SCHEMA_UNNAMED_CONSTRAINTS = {
            *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS,
            "INDEX",
        }

        PLACEHOLDER_PARSERS = {
            **parser.Parser.PLACEHOLDER_PARSERS,
            TokenType.L_BRACE: lambda self: self._parse_query_parameter(),
        }

        def _parse_engine_property(self) -> exp.EngineProperty:
            self._match(TokenType.EQ)
            return self.expression(
                exp.EngineProperty,
                this=self._parse_field(any_token=True, anonymous_func=True),
            )

        # https://clickhouse.com/docs/en/sql-reference/statements/create/function
        def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]:
            return self._parse_lambda()

        def _parse_types(
            self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True
        ) -> t.Optional[exp.Expression]:
            dtype = super()._parse_types(
                check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
            )
            if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True:
                # Mark every type as non-nullable which is ClickHouse's default, unless it's
                # already marked as nullable. This marker helps us transpile types from other
                # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))`
                # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would
                # fail in ClickHouse without the `Nullable` type constructor.
                dtype.set("nullable", False)

            return dtype

        def _parse_extract(self) -> exp.Extract | exp.Anonymous:
            index = self._index
            this = self._parse_bitwise()
            if self._match(TokenType.FROM):
                self._retreat(index)
                return super()._parse_extract()

            # We return Anonymous here because extract and regexpExtract have different semantics,
            # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g.,
            # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`.
            #
            # TODO: can we somehow convert the former into an equivalent `regexpExtract` call?
            self._match(TokenType.COMMA)
            return self.expression(
                exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()]
            )

        def _parse_assignment(self) -> t.Optional[exp.Expression]:
            this = super()._parse_assignment()

            if self._match(TokenType.PLACEHOLDER):
                return self.expression(
                    exp.If,
                    this=this,
                    true=self._parse_assignment(),
                    false=self._match(TokenType.COLON) and self._parse_assignment(),
                )

            return this

        def _parse_query_parameter(self) -> t.Optional[exp.Expression]:
            """
            Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier}
            https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters
            """
            index = self._index

            this = self._parse_id_var()
            self._match(TokenType.COLON)
            kind = self._parse_types(check_func=False, allow_identifiers=False) or (
                self._match_text_seq("IDENTIFIER") and "Identifier"
            )

            if not kind:
                self._retreat(index)
                return None
            elif not self._match(TokenType.R_BRACE):
                self.raise_error("Expecting }")

            if isinstance(this, exp.Identifier) and not this.quoted:
                this = exp.var(this.name)

            return self.expression(exp.Placeholder, this=this, kind=kind)

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            l_brace = self._match(TokenType.L_BRACE, advance=False)
            bracket = super()._parse_bracket(this)

            if l_brace and isinstance(bracket, exp.Struct):
                varmap = exp.VarMap(keys=exp.Array(), values=exp.Array())
                for expression in bracket.expressions:
                    if not isinstance(expression, exp.PropertyEQ):
                        break

                    varmap.args["keys"].append("expressions", exp.Literal.string(expression.name))
                    varmap.args["values"].append("expressions", expression.expression)

                return varmap

            return bracket

        def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In:
            this = super()._parse_in(this)
            this.set("is_global", is_global)
            return this

        def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In:
            is_negated = self._match(TokenType.NOT)
            this = self._match(TokenType.IN) and self._parse_in(this, is_global=True)
            return self.expression(exp.Not, this=this) if is_negated else this

        def _parse_table(
            self,
            schema: bool = False,
            joins: bool = False,
            alias_tokens: t.Optional[t.Collection[TokenType]] = None,
            parse_bracket: bool = False,
            is_db_reference: bool = False,
            parse_partition: bool = False,
            consume_pipe: bool = False,
        ) -> t.Optional[exp.Expression]:
            this = super()._parse_table(
                schema=schema,
                joins=joins,
                alias_tokens=alias_tokens,
                parse_bracket=parse_bracket,
                is_db_reference=is_db_reference,
            )

            if isinstance(this, exp.Table):
                inner = this.this
                alias = this.args.get("alias")

                if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns:
                    alias.set("columns", [exp.to_identifier("generate_series")])

            if self._match(TokenType.FINAL):
                this = self.expression(exp.Final, this=this)

            return this

        def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition:
            return super()._parse_position(haystack_first=True)

        # https://clickhouse.com/docs/en/sql-reference/statements/select/with/
        def _parse_cte(self) -> t.Optional[exp.CTE]:
            # WITH <identifier> AS <subquery expression>
            cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte)

            if not cte:
                # WITH <expression> AS <identifier>
                cte = self.expression(
                    exp.CTE,
                    this=self._parse_assignment(),
                    alias=self._parse_table_alias(),
                    scalar=True,
                )

            return cte

        def _parse_join_parts(
            self,
        ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]:
            is_global = self._match(TokenType.GLOBAL) and self._prev
            kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev

            if kind_pre:
                kind = self._match_set(self.JOIN_KINDS) and self._prev
                side = self._match_set(self.JOIN_SIDES) and self._prev
                return is_global, side, kind

            return (
                is_global,
                self._match_set(self.JOIN_SIDES) and self._prev,
                self._match_set(self.JOIN_KINDS) and self._prev,
            )

        def _parse_join(
            self, skip_join_token: bool = False, parse_bracket: bool = False
        ) -> t.Optional[exp.Join]:
            join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True)
            if join:
                join.set("global", join.args.pop("method", None))

                # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table`
                # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join
                if join.kind == "ARRAY":
                    for table in join.find_all(exp.Table):
                        table.replace(table.to_column())

            return join

        def _parse_function(
            self,
            functions: t.Optional[t.Dict[str, t.Callable]] = None,
            anonymous: bool = False,
            optional_parens: bool = True,
            any_token: bool = False,
        ) -> t.Optional[exp.Expression]:
            expr = super()._parse_function(
                functions=functions,
                anonymous=anonymous,
                optional_parens=optional_parens,
                any_token=any_token,
            )

            func = expr.this if isinstance(expr, exp.Window) else expr

            # Aggregate functions can be split in 2 parts: <func_name><suffix>
            parts = (
                self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None
            )

            if parts:
                anon_func: exp.Anonymous = t.cast(exp.Anonymous, func)
                params = self._parse_func_params(anon_func)

                kwargs = {
                    "this": anon_func.this,
                    "expressions": anon_func.expressions,
                }
                if parts[1]:
                    exp_class: t.Type[exp.Expression] = (
                        exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc
                    )
                else:
                    exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc

                kwargs["exp_class"] = exp_class
                if params:
                    kwargs["params"] = params

                func = self.expression(**kwargs)

                if isinstance(expr, exp.Window):
                    # The window's func was parsed as Anonymous in base parser, fix its
                    # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc
                    expr.set("this", func)
                elif params:
                    # Params have blocked super()._parse_function() from parsing the following window
                    # (if that exists) as they're standing between the function call and the window spec
                    expr = self._parse_window(func)
                else:
                    expr = func

            return expr

        def _parse_func_params(
            self, this: t.Optional[exp.Func] = None
        ) -> t.Optional[t.List[exp.Expression]]:
            if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN):
                return self._parse_csv(self._parse_lambda)

            if self._match(TokenType.L_PAREN):
                params = self._parse_csv(self._parse_lambda)
                self._match_r_paren(this)
                return params

            return None

        def _parse_quantile(self) -> exp.Quantile:
            this = self._parse_lambda()
            params = self._parse_func_params()
            if params:
                return self.expression(exp.Quantile, this=params[0], quantile=this)
            return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5))

        def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]:
            return super()._parse_wrapped_id_vars(optional=True)

        def _parse_primary_key(
            self, wrapped_optional: bool = False, in_props: bool = False
        ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey:
            return super()._parse_primary_key(
                wrapped_optional=wrapped_optional or in_props, in_props=in_props
            )

        def _parse_on_property(self) -> t.Optional[exp.Expression]:
            index = self._index
            if self._match_text_seq("CLUSTER"):
                this = self._parse_string() or self._parse_id_var()
                if this:
                    return self.expression(exp.OnCluster, this=this)
                else:
                    self._retreat(index)
            return None

        def _parse_index_constraint(
            self, kind: t.Optional[str] = None
        ) -> exp.IndexColumnConstraint:
            # INDEX name1 expr TYPE type1(args) GRANULARITY value
            this = self._parse_id_var()
            expression = self._parse_assignment()

            index_type = self._match_text_seq("TYPE") and (
                self._parse_function() or self._parse_var()
            )

            granularity = self._match_text_seq("GRANULARITY") and self._parse_term()

            return self.expression(
                exp.IndexColumnConstraint,
                this=this,
                expression=expression,
                index_type=index_type,
                granularity=granularity,
            )

        def _parse_partition(self) -> t.Optional[exp.Partition]:
            # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression
            if not self._match(TokenType.PARTITION):
                return None

            if self._match_text_seq("ID"):
                # Corresponds to the PARTITION ID <string_value> syntax
                expressions: t.List[exp.Expression] = [
                    self.expression(exp.PartitionId, this=self._parse_string())
                ]
            else:
                expressions = self._parse_expressions()

            return self.expression(exp.Partition, expressions=expressions)

        def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]:
            partition = self._parse_partition()

            if not partition or not self._match(TokenType.FROM):
                return None

            return self.expression(
                exp.ReplacePartition, expression=partition, source=self._parse_table_parts()
            )

        def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]:
            if not self._match_text_seq("PROJECTION"):
                return None

            return self.expression(
                exp.ProjectionDef,
                this=self._parse_id_var(),
                expression=self._parse_wrapped(self._parse_statement),
            )

        def _parse_constraint(self) -> t.Optional[exp.Expression]:
            return super()._parse_constraint() or self._parse_projection_def()

        def _parse_alias(
            self, this: t.Optional[exp.Expression], explicit: bool = False
        ) -> t.Optional[exp.Expression]:
            # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, so "APPLY" shouldn't
            # be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias
            if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False):
                return this

            return super()._parse_alias(this=this, explicit=explicit)

        def _parse_expression(self) -> t.Optional[exp.Expression]:
            this = super()._parse_expression()

            # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]" modifier
            while self._match_pair(TokenType.APPLY, TokenType.L_PAREN):
                this = exp.Apply(this=this, expression=self._parse_var(any_token=True))
                self._match(TokenType.R_PAREN)

            return this

        def _parse_columns(self) -> exp.Expression:
            this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda())

            while self._next and self._match_text_seq(")", "APPLY", "("):
                self._match(TokenType.R_PAREN)
                this = exp.Apply(this=this, expression=self._parse_var(any_token=True))
            return this

        def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]:
            value = super()._parse_value(values=values)
            if not value:
                return None

            # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast
            # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one.
            # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics),
            # but the final result is not altered by the extra parentheses.
            # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression
            expressions = value.expressions
            if values and not isinstance(expressions[-1], exp.Tuple):
                value.set(
                    "expressions",
                    [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions],
                )

            return value

        def _parse_partitioned_by(self) -> exp.PartitionedByProperty:
            # ClickHouse allows custom expressions as partition key
            # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key
            return self.expression(
                exp.PartitionedByProperty,
                this=self._parse_assignment(),
            )

    class Generator(generator.Generator):
        QUERY_HINTS = False
        STRUCT_DELIMITER = ("(", ")")
        NVL2_SUPPORTED = False
        TABLESAMPLE_REQUIRES_PARENS = False
        TABLESAMPLE_SIZE_IS_ROWS = False
        TABLESAMPLE_KEYWORDS = "SAMPLE"
        LAST_DAY_SUPPORTS_DATE_PART = False
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        JOIN_HINTS = False
        TABLE_HINTS = False
        GROUPINGS_SEP = ""
        SET_OP_MODIFIERS = False
        ARRAY_SIZE_NAME = "LENGTH"
        WRAP_DERIVED_VALUES = False

        STRING_TYPE_MAPPING = {
            exp.DataType.Type.BLOB: "String",
            exp.DataType.Type.CHAR: "String",
            exp.DataType.Type.LONGBLOB: "String",
            exp.DataType.Type.LONGTEXT: "String",
            exp.DataType.Type.MEDIUMBLOB: "String",
            exp.DataType.Type.MEDIUMTEXT: "String",
            exp.DataType.Type.TINYBLOB: "String",
            exp.DataType.Type.TINYTEXT: "String",
            exp.DataType.Type.TEXT: "String",
            exp.DataType.Type.VARBINARY: "String",
            exp.DataType.Type.VARCHAR: "String",
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            **STRING_TYPE_MAPPING,
            exp.DataType.Type.ARRAY: "Array",
            exp.DataType.Type.BOOLEAN: "Bool",
            exp.DataType.Type.BIGINT: "Int64",
            exp.DataType.Type.DATE32: "Date32",
            exp.DataType.Type.DATETIME: "DateTime",
            exp.DataType.Type.DATETIME2: "DateTime",
            exp.DataType.Type.SMALLDATETIME: "DateTime",
            exp.DataType.Type.DATETIME64: "DateTime64",
            exp.DataType.Type.DECIMAL: "Decimal",
            exp.DataType.Type.DECIMAL32: "Decimal32",
            exp.DataType.Type.DECIMAL64: "Decimal64",
            exp.DataType.Type.DECIMAL128: "Decimal128",
            exp.DataType.Type.DECIMAL256: "Decimal256",
            exp.DataType.Type.TIMESTAMP: "DateTime",
            exp.DataType.Type.TIMESTAMPNTZ: "DateTime",
            exp.DataType.Type.TIMESTAMPTZ: "DateTime",
            exp.DataType.Type.DOUBLE: "Float64",
            exp.DataType.Type.ENUM: "Enum",
            exp.DataType.Type.ENUM8: "Enum8",
            exp.DataType.Type.ENUM16: "Enum16",
            exp.DataType.Type.FIXEDSTRING: "FixedString",
            exp.DataType.Type.FLOAT: "Float32",
            exp.DataType.Type.INT: "Int32",
            exp.DataType.Type.MEDIUMINT: "Int32",
            exp.DataType.Type.INT128: "Int128",
            exp.DataType.Type.INT256: "Int256",
            exp.DataType.Type.LOWCARDINALITY: "LowCardinality",
            exp.DataType.Type.MAP: "Map",
            exp.DataType.Type.NESTED: "Nested",
            exp.DataType.Type.NOTHING: "Nothing",
            exp.DataType.Type.SMALLINT: "Int16",
            exp.DataType.Type.STRUCT: "Tuple",
            exp.DataType.Type.TINYINT: "Int8",
            exp.DataType.Type.UBIGINT: "UInt64",
            exp.DataType.Type.UINT: "UInt32",
            exp.DataType.Type.UINT128: "UInt128",
            exp.DataType.Type.UINT256: "UInt256",
            exp.DataType.Type.USMALLINT: "UInt16",
            exp.DataType.Type.UTINYINT: "UInt8",
            exp.DataType.Type.IPV4: "IPv4",
            exp.DataType.Type.IPV6: "IPv6",
            exp.DataType.Type.POINT: "Point",
            exp.DataType.Type.RING: "Ring",
            exp.DataType.Type.LINESTRING: "LineString",
            exp.DataType.Type.MULTILINESTRING: "MultiLineString",
            exp.DataType.Type.POLYGON: "Polygon",
            exp.DataType.Type.MULTIPOLYGON: "MultiPolygon",
            exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction",
            exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction",
            exp.DataType.Type.DYNAMIC: "Dynamic",
        }

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.AnyValue: rename_func("any"),
            exp.ApproxDistinct: rename_func("uniq"),
            exp.ArrayConcat: rename_func("arrayConcat"),
            exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this),
            exp.ArrayRemove: remove_from_array_using_filter,
            exp.ArrayReverse: rename_func("arrayReverse"),
            exp.ArraySlice: rename_func("arraySlice"),
            exp.ArraySum: rename_func("arraySum"),
            exp.ArgMax: arg_max_or_min_no_count("argMax"),
            exp.ArgMin: arg_max_or_min_no_count("argMin"),
            exp.Array: inline_array_sql,
            exp.CastToStrType: rename_func("CAST"),
            exp.CountIf: rename_func("countIf"),
            exp.CosineDistance: rename_func("cosineDistance"),
            exp.CompressColumnConstraint: lambda self,
            e: f"CODEC({self.expressions(e, key='this', flat=True)})",
            exp.ComputedColumnConstraint: lambda self,
            e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}",
            exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"),
            exp.DateAdd: _datetime_delta_sql("DATE_ADD"),
            exp.DateDiff: _datetime_delta_sql("DATE_DIFF"),
            exp.DateStrToDate: rename_func("toDate"),
            exp.DateSub: _datetime_delta_sql("DATE_SUB"),
            exp.Explode: rename_func("arrayJoin"),
            exp.FarmFingerprint: rename_func("farmFingerprint64"),
            exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL",
            exp.IsNan: rename_func("isNaN"),
            exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}",
            exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False),
            exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False),
            exp.JSONPathKey: json_path_key_only_name,
            exp.JSONPathRoot: lambda *_: "",
            exp.Length: length_or_char_length_sql,
            exp.Map: _map_sql,
            exp.Median: rename_func("median"),
            exp.Nullif: rename_func("nullIf"),
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.Pivot: no_pivot_sql,
            exp.Quantile: _quantile_sql,
            exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression),
            exp.Rand: rename_func("randCanonical"),
            exp.StartsWith: rename_func("startsWith"),
            exp.EndsWith: rename_func("endsWith"),
            exp.EuclideanDistance: rename_func("L2Distance"),
            exp.StrPosition: lambda self, e: strposition_sql(
                self,
                e,
                func_name="POSITION",
                supports_position=True,
                use_ansi_position=False,
            ),
            exp.TimeToStr: lambda self, e: self.func(
                "formatDateTime", e.this, self.format_time(e), e.args.get("zone")
            ),
            exp.TimeStrToTime: _timestrtotime_sql,
            exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"),
            exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"),
            exp.Typeof: rename_func("toTypeName"),
            exp.VarMap: _map_sql,
            exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions),
            exp.MD5Digest: rename_func("MD5"),
            exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.UnixToTime: _unix_to_time_sql,
            exp.TimestampTrunc: timestamptrunc_sql(zone=True),
            exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"),
            exp.Variance: rename_func("varSamp"),
            exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
            exp.Stddev: rename_func("stddevSamp"),
            exp.Chr: rename_func("CHAR"),
            exp.Lag: lambda self, e: self.func(
                "lagInFrame", e.this, e.args.get("offset"), e.args.get("default")
            ),
            exp.Lead: lambda self, e: self.func(
                "leadInFrame", e.this, e.args.get("offset"), e.args.get("default")
            ),
            exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")(
                rename_func("editDistance")
            ),
            exp.ParseDatetime: rename_func("parseDateTime"),
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.OnCluster: exp.Properties.Location.POST_NAME,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.ToTableProperty: exp.Properties.Location.POST_NAME,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # There's no list in docs, but it can be found in Clickhouse code
        # see `ClickHouse/src/Parsers/ParserCreate*.cpp`
        ON_CLUSTER_TARGETS = {
            "SCHEMA",  # Transpiled CREATE SCHEMA may have OnCluster property set
            "DATABASE",
            "TABLE",
            "VIEW",
            "DICTIONARY",
            "INDEX",
            "FUNCTION",
            "NAMED COLLECTION",
        }

        # https://clickhouse.com/docs/en/sql-reference/data-types/nullable
        NON_NULLABLE_TYPES = {
            exp.DataType.Type.ARRAY,
            exp.DataType.Type.MAP,
            exp.DataType.Type.STRUCT,
            exp.DataType.Type.POINT,
            exp.DataType.Type.RING,
            exp.DataType.Type.LINESTRING,
            exp.DataType.Type.MULTILINESTRING,
            exp.DataType.Type.POLYGON,
            exp.DataType.Type.MULTIPOLYGON,
        }

        def offset_sql(self, expression: exp.Offset) -> str:
            offset = super().offset_sql(expression)

            # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword
            # https://clickhouse.com/docs/sql-reference/statements/select/offset
            parent = expression.parent
            if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch):
                offset = f"{offset} ROWS"

            return offset

        def strtodate_sql(self, expression: exp.StrToDate) -> str:
            strtodate_sql = self.function_fallback_sql(expression)

            if not isinstance(expression.parent, exp.Cast):
                # StrToDate returns DATEs in other dialects (eg. postgres), so
                # this branch aims to improve the transpilation to clickhouse
                return self.cast_sql(exp.cast(expression, "DATE"))

            return strtodate_sql

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"):
                return self.sql(this)

            return super().cast_sql(expression, safe_prefix=safe_prefix)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            dtype = expression.to
            if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True):
                # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T)
                dtype.set("nullable", True)

            return super().cast_sql(expression)

        def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str:
            this = self.json_path_part(expression.this)
            return str(int(this) + 1) if is_int(this) else this

        def likeproperty_sql(self, expression: exp.LikeProperty) -> str:
            return f"AS {self.sql(expression, 'this')}"

        def _any_to_has(
            self,
            expression: exp.EQ | exp.NEQ,
            default: t.Callable[[t.Any], str],
            prefix: str = "",
        ) -> str:
            if isinstance(expression.left, exp.Any):
                arr = expression.left
                this = expression.right
            elif isinstance(expression.right, exp.Any):
                arr = expression.right
                this = expression.left
            else:
                return default(expression)

            return prefix + self.func("has", arr.this.unnest(), this)

        def eq_sql(self, expression: exp.EQ) -> str:
            return self._any_to_has(expression, super().eq_sql)

        def neq_sql(self, expression: exp.NEQ) -> str:
            return self._any_to_has(expression, super().neq_sql, "NOT ")

        def regexpilike_sql(self, expression: exp.RegexpILike) -> str:
            # Manually add a flag to make the search case-insensitive
            regex = self.func("CONCAT", "'(?i)'", expression.expression)
            return self.func("match", expression.this, regex)
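
        # An illustrative note (not part of the module): `_any_to_has` above
        # rewrites comparisons against ANY(array) into ClickHouse's has()
        # function, so e.g. `1 = ANY(arr)` is expected to generate
        # `has(arr, 1)` and `1 <> ANY(arr)` to generate `NOT has(arr, 1)`.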

        def datatype_sql(self, expression: exp.DataType) -> str:
            # String is the standard ClickHouse type, every other variant is just an alias.
            # Additionally, any supplied length parameter will be ignored.
            #
            # https://clickhouse.com/docs/en/sql-reference/data-types/string
            if expression.this in self.STRING_TYPE_MAPPING:
                dtype = "String"
            else:
                dtype = super().datatype_sql(expression)

            # This section changes the type to `Nullable(...)` if the following conditions hold:
            # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable`
            #   and change their semantics
            # - It's not the key type of a `Map`. This is because ClickHouse enforces the following
            #   constraint: "Type of Map key must be a type, that can be represented by integer or
            #   String or FixedString (possibly LowCardinality) or UUID or IPv6"
            # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type
            parent = expression.parent
            nullable = expression.args.get("nullable")
            if nullable is True or (
                nullable is None
                and not (
                    isinstance(parent, exp.DataType)
                    and parent.is_type(exp.DataType.Type.MAP, check_nullable=True)
                    and expression.index in (None, 0)
                )
                and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True)
            ):
                dtype = f"Nullable({dtype})"

            return dtype

        def cte_sql(self, expression: exp.CTE) -> str:
            if expression.args.get("scalar"):
                this = self.sql(expression, "this")
                alias = self.sql(expression, "alias")
                return f"{this} AS {alias}"

            return super().cte_sql(expression)

        def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]:
            return super().after_limit_modifiers(expression) + [
                (
                    self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True)
                    if expression.args.get("settings")
                    else ""
                ),
                (
                    self.seg("FORMAT ") + self.sql(expression, "format")
                    if expression.args.get("format")
                    else ""
                ),
            ]

        def placeholder_sql(self, expression: exp.Placeholder) -> str:
            return f"{{{expression.name}: {self.sql(expression, 'kind')}}}"

        def oncluster_sql(self, expression: exp.OnCluster) -> str:
            return f"ON CLUSTER {self.sql(expression, 'this')}"

        def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str:
            if expression.kind in self.ON_CLUSTER_TARGETS and locations.get(
                exp.Properties.Location.POST_NAME
            ):
                this_name = self.sql(
                    expression.this if isinstance(expression.this, exp.Schema) else expression,
                    "this",
                )
                this_properties = " ".join(
                    [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]]
                )
                this_schema = self.schema_columns_sql(expression.this)
                this_schema = f"{self.sep()}{this_schema}" if this_schema else ""

                return f"{this_name}{self.sep()}{this_properties}{this_schema}"

            return super().createable_sql(expression, locations)

        def create_sql(self, expression: exp.Create) -> str:
            # The comment property comes last in CTAS statements, i.e. after the query
            query = expression.expression
            if isinstance(query, exp.Query):
                comment_prop = expression.find(exp.SchemaCommentProperty)
                if comment_prop:
                    comment_prop.pop()
                query.replace(exp.paren(query))
            else:
                comment_prop = None

            create_sql = super().create_sql(expression)

            comment_sql = self.sql(comment_prop)
            comment_sql = f" {comment_sql}" if comment_sql else ""

            return f"{create_sql}{comment_sql}"

        def prewhere_sql(self, expression: exp.PreWhere) -> str:
            this = self.indent(self.sql(expression, "this"))
            return f"{self.seg('PREWHERE')}{self.sep()}{this}"

        def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str:
            this = self.sql(expression, "this")
            this = f" {this}" if this else ""
            expr = self.sql(expression, "expression")
            expr = f" {expr}" if expr else ""
            index_type = self.sql(expression, "index_type")
            index_type = f" TYPE {index_type}" if index_type else ""
            granularity = self.sql(expression, "granularity")
            granularity = f" GRANULARITY {granularity}" if granularity else ""

            return f"INDEX{this}{expr}{index_type}{granularity}"

        def partition_sql(self, expression: exp.Partition) -> str:
            return f"PARTITION {self.expressions(expression, flat=True)}"

        def partitionid_sql(self, expression: exp.PartitionId) -> str:
            return f"ID {self.sql(expression.this)}"

        def replacepartition_sql(self, expression: exp.ReplacePartition) -> str:
            return (
                f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}"
            )

        def projectiondef_sql(self, expression: exp.ProjectionDef) -> str:
            return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}"

        def is_sql(self, expression: exp.Is) -> str:
            is_sql = super().is_sql(expression)

            if isinstance(expression.parent, exp.Not):
                # value IS NOT NULL -> NOT (value IS NULL)
                is_sql = self.wrap(is_sql)

            return is_sql

        def in_sql(self, expression: exp.In) -> str:
            in_sql = super().in_sql(expression)

            if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"):
                in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1)

            return in_sql

        def not_sql(self, expression: exp.Not) -> str:
            if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"):
                # let `GLOBAL IN` child interpose `NOT`
                return self.sql(expression, "this")

            return super().not_sql(expression)

        def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str:
            # If the VALUES clause contains tuples of expressions, we need to treat it
            # as a table since Clickhouse will automatically alias it as such.
            alias = expression.args.get("alias")

            if alias and alias.args.get("columns") and expression.expressions:
                values = expression.expressions[0].expressions
                values_as_table = any(isinstance(value, exp.Tuple) for value in values)
            else:
                values_as_table = True

            return super().values_sql(expression, values_as_table=values_as_table)
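
# A short usage sketch (not part of the module): because `Parser._parse_types`
# marks types from other dialects as non-nullable and `Generator.datatype_sql`
# wraps nullable types in `Nullable(...)`, transpiling a plain TEXT cast to
# ClickHouse is expected to produce an explicitly nullable String:
#
#     >>> import sqlglot
#     >>> sqlglot.transpile("CAST(x AS TEXT)", write="clickhouse")[0]
#     'CAST(x AS Nullable(String))'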
parser) and 2) it makes parsing the ternary op simpler. 540 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 541 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 542 543 JOIN_KINDS = { 544 *parser.Parser.JOIN_KINDS, 545 TokenType.ANY, 546 TokenType.ASOF, 547 TokenType.ARRAY, 548 } 549 550 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 551 TokenType.ANY, 552 TokenType.ARRAY, 553 TokenType.FINAL, 554 TokenType.FORMAT, 555 TokenType.SETTINGS, 556 } 557 558 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 559 TokenType.FORMAT, 560 } 561 562 LOG_DEFAULTS_TO_LN = True 563 564 QUERY_MODIFIER_PARSERS = { 565 **parser.Parser.QUERY_MODIFIER_PARSERS, 566 TokenType.SETTINGS: lambda self: ( 567 "settings", 568 self._advance() or self._parse_csv(self._parse_assignment), 569 ), 570 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 571 } 572 573 CONSTRAINT_PARSERS = { 574 **parser.Parser.CONSTRAINT_PARSERS, 575 "INDEX": lambda self: self._parse_index_constraint(), 576 "CODEC": lambda self: self._parse_compress(), 577 } 578 579 ALTER_PARSERS = { 580 **parser.Parser.ALTER_PARSERS, 581 "REPLACE": lambda self: self._parse_alter_table_replace(), 582 } 583 584 SCHEMA_UNNAMED_CONSTRAINTS = { 585 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 586 "INDEX", 587 } 588 589 PLACEHOLDER_PARSERS = { 590 **parser.Parser.PLACEHOLDER_PARSERS, 591 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 592 } 593 594 def _parse_engine_property(self) -> exp.EngineProperty: 595 self._match(TokenType.EQ) 596 return self.expression( 597 exp.EngineProperty, 598 this=self._parse_field(any_token=True, anonymous_func=True), 599 ) 600 601 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 602 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 603 return self._parse_lambda() 604 605 def _parse_types( 606 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 607 ) -> t.Optional[exp.Expression]: 608 dtype = super()._parse_types( 609 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 610 ) 611 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 612 # Mark every type as non-nullable which is ClickHouse's default, unless it's 613 # already marked as nullable. This marker helps us transpile types from other 614 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 615 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 616 # fail in ClickHouse without the `Nullable` type constructor. 617 dtype.set("nullable", False) 618 619 return dtype 620 621 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 622 index = self._index 623 this = self._parse_bitwise() 624 if self._match(TokenType.FROM): 625 self._retreat(index) 626 return super()._parse_extract() 627 628 # We return Anonymous here because extract and regexpExtract have different semantics, 629 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 630 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 631 # 632 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 
633 self._match(TokenType.COMMA) 634 return self.expression( 635 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 636 ) 637 638 def _parse_assignment(self) -> t.Optional[exp.Expression]: 639 this = super()._parse_assignment() 640 641 if self._match(TokenType.PLACEHOLDER): 642 return self.expression( 643 exp.If, 644 this=this, 645 true=self._parse_assignment(), 646 false=self._match(TokenType.COLON) and self._parse_assignment(), 647 ) 648 649 return this 650 651 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 652 """ 653 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 654 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 655 """ 656 index = self._index 657 658 this = self._parse_id_var() 659 self._match(TokenType.COLON) 660 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 661 self._match_text_seq("IDENTIFIER") and "Identifier" 662 ) 663 664 if not kind: 665 self._retreat(index) 666 return None 667 elif not self._match(TokenType.R_BRACE): 668 self.raise_error("Expecting }") 669 670 if isinstance(this, exp.Identifier) and not this.quoted: 671 this = exp.var(this.name) 672 673 return self.expression(exp.Placeholder, this=this, kind=kind) 674 675 def _parse_bracket( 676 self, this: t.Optional[exp.Expression] = None 677 ) -> t.Optional[exp.Expression]: 678 l_brace = self._match(TokenType.L_BRACE, advance=False) 679 bracket = super()._parse_bracket(this) 680 681 if l_brace and isinstance(bracket, exp.Struct): 682 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 683 for expression in bracket.expressions: 684 if not isinstance(expression, exp.PropertyEQ): 685 break 686 687 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 688 varmap.args["values"].append("expressions", expression.expression) 689 690 return varmap 691 692 return bracket 693 694 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 695 this = super()._parse_in(this) 696 this.set("is_global", is_global) 697 return this 698 699 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 700 is_negated = self._match(TokenType.NOT) 701 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 702 return self.expression(exp.Not, this=this) if is_negated else this 703 704 def _parse_table( 705 self, 706 schema: bool = False, 707 joins: bool = False, 708 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 709 parse_bracket: bool = False, 710 is_db_reference: bool = False, 711 parse_partition: bool = False, 712 consume_pipe: bool = False, 713 ) -> t.Optional[exp.Expression]: 714 this = super()._parse_table( 715 schema=schema, 716 joins=joins, 717 alias_tokens=alias_tokens, 718 parse_bracket=parse_bracket, 719 is_db_reference=is_db_reference, 720 ) 721 722 if isinstance(this, exp.Table): 723 inner = this.this 724 alias = this.args.get("alias") 725 726 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 727 alias.set("columns", [exp.to_identifier("generate_series")]) 728 729 if self._match(TokenType.FINAL): 730 this = self.expression(exp.Final, this=this) 731 732 return this 733 734 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 735 return super()._parse_position(haystack_first=True) 736 737 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 738 def _parse_cte(self) -> t.Optional[exp.CTE]: 739 # WITH <identifier> AS 
<subquery expression> 740 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 741 742 if not cte: 743 # WITH <expression> AS <identifier> 744 cte = self.expression( 745 exp.CTE, 746 this=self._parse_assignment(), 747 alias=self._parse_table_alias(), 748 scalar=True, 749 ) 750 751 return cte 752 753 def _parse_join_parts( 754 self, 755 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 756 is_global = self._match(TokenType.GLOBAL) and self._prev 757 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 758 759 if kind_pre: 760 kind = self._match_set(self.JOIN_KINDS) and self._prev 761 side = self._match_set(self.JOIN_SIDES) and self._prev 762 return is_global, side, kind 763 764 return ( 765 is_global, 766 self._match_set(self.JOIN_SIDES) and self._prev, 767 self._match_set(self.JOIN_KINDS) and self._prev, 768 ) 769 770 def _parse_join( 771 self, skip_join_token: bool = False, parse_bracket: bool = False 772 ) -> t.Optional[exp.Join]: 773 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 774 if join: 775 join.set("global", join.args.pop("method", None)) 776 777 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 778 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 779 if join.kind == "ARRAY": 780 for table in join.find_all(exp.Table): 781 table.replace(table.to_column()) 782 783 return join 784 785 def _parse_function( 786 self, 787 functions: t.Optional[t.Dict[str, t.Callable]] = None, 788 anonymous: bool = False, 789 optional_parens: bool = True, 790 any_token: bool = False, 791 ) -> t.Optional[exp.Expression]: 792 expr = super()._parse_function( 793 functions=functions, 794 anonymous=anonymous, 795 optional_parens=optional_parens, 796 any_token=any_token, 797 ) 798 799 func = expr.this if isinstance(expr, exp.Window) else expr 800 801 # Aggregate functions can be split in 2 parts: <func_name><suffix> 802 parts = ( 803 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 804 ) 805 806 if parts: 807 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 808 params = self._parse_func_params(anon_func) 809 810 kwargs = { 811 "this": anon_func.this, 812 "expressions": anon_func.expressions, 813 } 814 if parts[1]: 815 exp_class: t.Type[exp.Expression] = ( 816 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 817 ) 818 else: 819 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 820 821 kwargs["exp_class"] = exp_class 822 if params: 823 kwargs["params"] = params 824 825 func = self.expression(**kwargs) 826 827 if isinstance(expr, exp.Window): 828 # The window's func was parsed as Anonymous in base parser, fix its 829 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 830 expr.set("this", func) 831 elif params: 832 # Params have blocked super()._parse_function() from parsing the following window 833 # (if that exists) as they're standing between the function call and the window spec 834 expr = self._parse_window(func) 835 else: 836 expr = func 837 838 return expr 839 840 def _parse_func_params( 841 self, this: t.Optional[exp.Func] = None 842 ) -> t.Optional[t.List[exp.Expression]]: 843 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 844 return self._parse_csv(self._parse_lambda) 845 846 if self._match(TokenType.L_PAREN): 847 params = self._parse_csv(self._parse_lambda) 848 self._match_r_paren(this) 849 return params 850 851 return None 852 853 def _parse_quantile(self) -> 
exp.Quantile: 854 this = self._parse_lambda() 855 params = self._parse_func_params() 856 if params: 857 return self.expression(exp.Quantile, this=params[0], quantile=this) 858 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 859 860 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 861 return super()._parse_wrapped_id_vars(optional=True) 862 863 def _parse_primary_key( 864 self, wrapped_optional: bool = False, in_props: bool = False 865 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 866 return super()._parse_primary_key( 867 wrapped_optional=wrapped_optional or in_props, in_props=in_props 868 ) 869 870 def _parse_on_property(self) -> t.Optional[exp.Expression]: 871 index = self._index 872 if self._match_text_seq("CLUSTER"): 873 this = self._parse_string() or self._parse_id_var() 874 if this: 875 return self.expression(exp.OnCluster, this=this) 876 else: 877 self._retreat(index) 878 return None 879 880 def _parse_index_constraint( 881 self, kind: t.Optional[str] = None 882 ) -> exp.IndexColumnConstraint: 883 # INDEX name1 expr TYPE type1(args) GRANULARITY value 884 this = self._parse_id_var() 885 expression = self._parse_assignment() 886 887 index_type = self._match_text_seq("TYPE") and ( 888 self._parse_function() or self._parse_var() 889 ) 890 891 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 892 893 return self.expression( 894 exp.IndexColumnConstraint, 895 this=this, 896 expression=expression, 897 index_type=index_type, 898 granularity=granularity, 899 ) 900 901 def _parse_partition(self) -> t.Optional[exp.Partition]: 902 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 903 if not self._match(TokenType.PARTITION): 904 return None 905 906 if self._match_text_seq("ID"): 907 # Corresponds to the PARTITION ID <string_value> syntax 908 expressions: t.List[exp.Expression] = [ 909 self.expression(exp.PartitionId, this=self._parse_string()) 910 ] 911 else: 912 expressions = self._parse_expressions() 913 914 return self.expression(exp.Partition, expressions=expressions) 915 916 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 917 partition = self._parse_partition() 918 919 if not partition or not self._match(TokenType.FROM): 920 return None 921 922 return self.expression( 923 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 924 ) 925 926 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 927 if not self._match_text_seq("PROJECTION"): 928 return None 929 930 return self.expression( 931 exp.ProjectionDef, 932 this=self._parse_id_var(), 933 expression=self._parse_wrapped(self._parse_statement), 934 ) 935 936 def _parse_constraint(self) -> t.Optional[exp.Expression]: 937 return super()._parse_constraint() or self._parse_projection_def() 938 939 def _parse_alias( 940 self, this: t.Optional[exp.Expression], explicit: bool = False 941 ) -> t.Optional[exp.Expression]: 942 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 943 # so "APPLY" shouldn't be parsed as <expr>'s alias. 
However, "SELECT <expr> apply" is a valid alias 944 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 945 return this 946 947 return super()._parse_alias(this=this, explicit=explicit) 948 949 def _parse_expression(self) -> t.Optional[exp.Expression]: 950 this = super()._parse_expression() 951 952 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 953 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 954 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 955 self._match(TokenType.R_PAREN) 956 957 return this 958 959 def _parse_columns(self) -> exp.Expression: 960 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 961 962 while self._next and self._match_text_seq(")", "APPLY", "("): 963 self._match(TokenType.R_PAREN) 964 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 965 return this 966 967 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 968 value = super()._parse_value(values=values) 969 if not value: 970 return None 971 972 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 973 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 974 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 975 # but the final result is not altered by the extra parentheses. 976 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 977 expressions = value.expressions 978 if values and not isinstance(expressions[-1], exp.Tuple): 979 value.set( 980 "expressions", 981 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 982 ) 983 984 return value 985 986 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 987 # ClickHouse allows custom expressions as partition key 988 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 989 return self.expression( 990 exp.PartitionedByProperty, 991 this=self._parse_assignment(), 992 ) 993 994 class Generator(generator.Generator): 995 QUERY_HINTS = False 996 STRUCT_DELIMITER = ("(", ")") 997 NVL2_SUPPORTED = False 998 TABLESAMPLE_REQUIRES_PARENS = False 999 TABLESAMPLE_SIZE_IS_ROWS = False 1000 TABLESAMPLE_KEYWORDS = "SAMPLE" 1001 LAST_DAY_SUPPORTS_DATE_PART = False 1002 CAN_IMPLEMENT_ARRAY_ANY = True 1003 SUPPORTS_TO_NUMBER = False 1004 JOIN_HINTS = False 1005 TABLE_HINTS = False 1006 GROUPINGS_SEP = "" 1007 SET_OP_MODIFIERS = False 1008 ARRAY_SIZE_NAME = "LENGTH" 1009 WRAP_DERIVED_VALUES = False 1010 1011 STRING_TYPE_MAPPING = { 1012 exp.DataType.Type.BLOB: "String", 1013 exp.DataType.Type.CHAR: "String", 1014 exp.DataType.Type.LONGBLOB: "String", 1015 exp.DataType.Type.LONGTEXT: "String", 1016 exp.DataType.Type.MEDIUMBLOB: "String", 1017 exp.DataType.Type.MEDIUMTEXT: "String", 1018 exp.DataType.Type.TINYBLOB: "String", 1019 exp.DataType.Type.TINYTEXT: "String", 1020 exp.DataType.Type.TEXT: "String", 1021 exp.DataType.Type.VARBINARY: "String", 1022 exp.DataType.Type.VARCHAR: "String", 1023 } 1024 1025 SUPPORTED_JSON_PATH_PARTS = { 1026 exp.JSONPathKey, 1027 exp.JSONPathRoot, 1028 exp.JSONPathSubscript, 1029 } 1030 1031 TYPE_MAPPING = { 1032 **generator.Generator.TYPE_MAPPING, 1033 **STRING_TYPE_MAPPING, 1034 exp.DataType.Type.ARRAY: "Array", 1035 exp.DataType.Type.BOOLEAN: "Bool", 1036 exp.DataType.Type.BIGINT: "Int64", 1037 exp.DataType.Type.DATE32: "Date32", 
1038 exp.DataType.Type.DATETIME: "DateTime", 1039 exp.DataType.Type.DATETIME2: "DateTime", 1040 exp.DataType.Type.SMALLDATETIME: "DateTime", 1041 exp.DataType.Type.DATETIME64: "DateTime64", 1042 exp.DataType.Type.DECIMAL: "Decimal", 1043 exp.DataType.Type.DECIMAL32: "Decimal32", 1044 exp.DataType.Type.DECIMAL64: "Decimal64", 1045 exp.DataType.Type.DECIMAL128: "Decimal128", 1046 exp.DataType.Type.DECIMAL256: "Decimal256", 1047 exp.DataType.Type.TIMESTAMP: "DateTime", 1048 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1049 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1050 exp.DataType.Type.DOUBLE: "Float64", 1051 exp.DataType.Type.ENUM: "Enum", 1052 exp.DataType.Type.ENUM8: "Enum8", 1053 exp.DataType.Type.ENUM16: "Enum16", 1054 exp.DataType.Type.FIXEDSTRING: "FixedString", 1055 exp.DataType.Type.FLOAT: "Float32", 1056 exp.DataType.Type.INT: "Int32", 1057 exp.DataType.Type.MEDIUMINT: "Int32", 1058 exp.DataType.Type.INT128: "Int128", 1059 exp.DataType.Type.INT256: "Int256", 1060 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1061 exp.DataType.Type.MAP: "Map", 1062 exp.DataType.Type.NESTED: "Nested", 1063 exp.DataType.Type.NOTHING: "Nothing", 1064 exp.DataType.Type.SMALLINT: "Int16", 1065 exp.DataType.Type.STRUCT: "Tuple", 1066 exp.DataType.Type.TINYINT: "Int8", 1067 exp.DataType.Type.UBIGINT: "UInt64", 1068 exp.DataType.Type.UINT: "UInt32", 1069 exp.DataType.Type.UINT128: "UInt128", 1070 exp.DataType.Type.UINT256: "UInt256", 1071 exp.DataType.Type.USMALLINT: "UInt16", 1072 exp.DataType.Type.UTINYINT: "UInt8", 1073 exp.DataType.Type.IPV4: "IPv4", 1074 exp.DataType.Type.IPV6: "IPv6", 1075 exp.DataType.Type.POINT: "Point", 1076 exp.DataType.Type.RING: "Ring", 1077 exp.DataType.Type.LINESTRING: "LineString", 1078 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1079 exp.DataType.Type.POLYGON: "Polygon", 1080 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1081 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1082 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1083 exp.DataType.Type.DYNAMIC: "Dynamic", 1084 } 1085 1086 TRANSFORMS = { 1087 **generator.Generator.TRANSFORMS, 1088 exp.AnyValue: rename_func("any"), 1089 exp.ApproxDistinct: rename_func("uniq"), 1090 exp.ArrayConcat: rename_func("arrayConcat"), 1091 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1092 exp.ArrayRemove: remove_from_array_using_filter, 1093 exp.ArrayReverse: rename_func("arrayReverse"), 1094 exp.ArraySlice: rename_func("arraySlice"), 1095 exp.ArraySum: rename_func("arraySum"), 1096 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1097 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1098 exp.Array: inline_array_sql, 1099 exp.CastToStrType: rename_func("CAST"), 1100 exp.CountIf: rename_func("countIf"), 1101 exp.CosineDistance: rename_func("cosineDistance"), 1102 exp.CompressColumnConstraint: lambda self, 1103 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1104 exp.ComputedColumnConstraint: lambda self, 1105 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1106 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1107 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1108 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1109 exp.DateStrToDate: rename_func("toDate"), 1110 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1111 exp.Explode: rename_func("arrayJoin"), 1112 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1113 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1114 exp.IsNan: 
rename_func("isNaN"), 1115 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1116 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1117 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1118 exp.JSONPathKey: json_path_key_only_name, 1119 exp.JSONPathRoot: lambda *_: "", 1120 exp.Length: length_or_char_length_sql, 1121 exp.Map: _map_sql, 1122 exp.Median: rename_func("median"), 1123 exp.Nullif: rename_func("nullIf"), 1124 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1125 exp.Pivot: no_pivot_sql, 1126 exp.Quantile: _quantile_sql, 1127 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1128 exp.Rand: rename_func("randCanonical"), 1129 exp.StartsWith: rename_func("startsWith"), 1130 exp.EndsWith: rename_func("endsWith"), 1131 exp.EuclideanDistance: rename_func("L2Distance"), 1132 exp.StrPosition: lambda self, e: strposition_sql( 1133 self, 1134 e, 1135 func_name="POSITION", 1136 supports_position=True, 1137 use_ansi_position=False, 1138 ), 1139 exp.TimeToStr: lambda self, e: self.func( 1140 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1141 ), 1142 exp.TimeStrToTime: _timestrtotime_sql, 1143 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1144 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1145 exp.Typeof: rename_func("toTypeName"), 1146 exp.VarMap: _map_sql, 1147 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1148 exp.MD5Digest: rename_func("MD5"), 1149 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1150 exp.SHA: rename_func("SHA1"), 1151 exp.SHA2: sha256_sql, 1152 exp.UnixToTime: _unix_to_time_sql, 1153 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1154 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1155 exp.Variance: rename_func("varSamp"), 1156 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1157 exp.Stddev: rename_func("stddevSamp"), 1158 exp.Chr: rename_func("CHAR"), 1159 exp.Lag: lambda self, e: self.func( 1160 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1161 ), 1162 exp.Lead: lambda self, e: self.func( 1163 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1164 ), 1165 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1166 rename_func("editDistance") 1167 ), 1168 exp.ParseDatetime: rename_func("parseDateTime"), 1169 } 1170 1171 PROPERTIES_LOCATION = { 1172 **generator.Generator.PROPERTIES_LOCATION, 1173 exp.OnCluster: exp.Properties.Location.POST_NAME, 1174 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1175 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1176 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1177 } 1178 1179 # There's no list in docs, but it can be found in Clickhouse code 1180 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1181 ON_CLUSTER_TARGETS = { 1182 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1183 "DATABASE", 1184 "TABLE", 1185 "VIEW", 1186 "DICTIONARY", 1187 "INDEX", 1188 "FUNCTION", 1189 "NAMED COLLECTION", 1190 } 1191 1192 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1193 NON_NULLABLE_TYPES = { 1194 exp.DataType.Type.ARRAY, 1195 exp.DataType.Type.MAP, 1196 exp.DataType.Type.STRUCT, 1197 exp.DataType.Type.POINT, 1198 exp.DataType.Type.RING, 1199 exp.DataType.Type.LINESTRING, 1200 
exp.DataType.Type.MULTILINESTRING, 1201 exp.DataType.Type.POLYGON, 1202 exp.DataType.Type.MULTIPOLYGON, 1203 } 1204 1205 def offset_sql(self, expression: exp.Offset) -> str: 1206 offset = super().offset_sql(expression) 1207 1208 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1209 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1210 parent = expression.parent 1211 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1212 offset = f"{offset} ROWS" 1213 1214 return offset 1215 1216 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1217 strtodate_sql = self.function_fallback_sql(expression) 1218 1219 if not isinstance(expression.parent, exp.Cast): 1220 # StrToDate returns DATEs in other dialects (eg. postgres), so 1221 # this branch aims to improve the transpilation to clickhouse 1222 return self.cast_sql(exp.cast(expression, "DATE")) 1223 1224 return strtodate_sql 1225 1226 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1227 this = expression.this 1228 1229 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1230 return self.sql(this) 1231 1232 return super().cast_sql(expression, safe_prefix=safe_prefix) 1233 1234 def trycast_sql(self, expression: exp.TryCast) -> str: 1235 dtype = expression.to 1236 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1237 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1238 dtype.set("nullable", True) 1239 1240 return super().cast_sql(expression) 1241 1242 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1243 this = self.json_path_part(expression.this) 1244 return str(int(this) + 1) if is_int(this) else this 1245 1246 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1247 return f"AS {self.sql(expression, 'this')}" 1248 1249 def _any_to_has( 1250 self, 1251 expression: exp.EQ | exp.NEQ, 1252 default: t.Callable[[t.Any], str], 1253 prefix: str = "", 1254 ) -> str: 1255 if isinstance(expression.left, exp.Any): 1256 arr = expression.left 1257 this = expression.right 1258 elif isinstance(expression.right, exp.Any): 1259 arr = expression.right 1260 this = expression.left 1261 else: 1262 return default(expression) 1263 1264 return prefix + self.func("has", arr.this.unnest(), this) 1265 1266 def eq_sql(self, expression: exp.EQ) -> str: 1267 return self._any_to_has(expression, super().eq_sql) 1268 1269 def neq_sql(self, expression: exp.NEQ) -> str: 1270 return self._any_to_has(expression, super().neq_sql, "NOT ") 1271 1272 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1273 # Manually add a flag to make the search case-insensitive 1274 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1275 return self.func("match", expression.this, regex) 1276 1277 def datatype_sql(self, expression: exp.DataType) -> str: 1278 # String is the standard ClickHouse type, every other variant is just an alias. 1279 # Additionally, any supplied length parameter will be ignored. 
1280 # 1281 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1282 if expression.this in self.STRING_TYPE_MAPPING: 1283 dtype = "String" 1284 else: 1285 dtype = super().datatype_sql(expression) 1286 1287 # This section changes the type to `Nullable(...)` if the following conditions hold: 1288 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1289 # and change their semantics 1290 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1291 # constraint: "Type of Map key must be a type, that can be represented by integer or 1292 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1293 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1294 parent = expression.parent 1295 nullable = expression.args.get("nullable") 1296 if nullable is True or ( 1297 nullable is None 1298 and not ( 1299 isinstance(parent, exp.DataType) 1300 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1301 and expression.index in (None, 0) 1302 ) 1303 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1304 ): 1305 dtype = f"Nullable({dtype})" 1306 1307 return dtype 1308 1309 def cte_sql(self, expression: exp.CTE) -> str: 1310 if expression.args.get("scalar"): 1311 this = self.sql(expression, "this") 1312 alias = self.sql(expression, "alias") 1313 return f"{this} AS {alias}" 1314 1315 return super().cte_sql(expression) 1316 1317 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1318 return super().after_limit_modifiers(expression) + [ 1319 ( 1320 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1321 if expression.args.get("settings") 1322 else "" 1323 ), 1324 ( 1325 self.seg("FORMAT ") + self.sql(expression, "format") 1326 if expression.args.get("format") 1327 else "" 1328 ), 1329 ] 1330 1331 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1332 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1333 1334 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1335 return f"ON CLUSTER {self.sql(expression, 'this')}" 1336 1337 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1338 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1339 exp.Properties.Location.POST_NAME 1340 ): 1341 this_name = self.sql( 1342 expression.this if isinstance(expression.this, exp.Schema) else expression, 1343 "this", 1344 ) 1345 this_properties = " ".join( 1346 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1347 ) 1348 this_schema = self.schema_columns_sql(expression.this) 1349 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1350 1351 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1352 1353 return super().createable_sql(expression, locations) 1354 1355 def create_sql(self, expression: exp.Create) -> str: 1356 # The comment property comes last in CTAS statements, i.e. 
after the query 1357 query = expression.expression 1358 if isinstance(query, exp.Query): 1359 comment_prop = expression.find(exp.SchemaCommentProperty) 1360 if comment_prop: 1361 comment_prop.pop() 1362 query.replace(exp.paren(query)) 1363 else: 1364 comment_prop = None 1365 1366 create_sql = super().create_sql(expression) 1367 1368 comment_sql = self.sql(comment_prop) 1369 comment_sql = f" {comment_sql}" if comment_sql else "" 1370 1371 return f"{create_sql}{comment_sql}" 1372 1373 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1374 this = self.indent(self.sql(expression, "this")) 1375 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1376 1377 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1378 this = self.sql(expression, "this") 1379 this = f" {this}" if this else "" 1380 expr = self.sql(expression, "expression") 1381 expr = f" {expr}" if expr else "" 1382 index_type = self.sql(expression, "index_type") 1383 index_type = f" TYPE {index_type}" if index_type else "" 1384 granularity = self.sql(expression, "granularity") 1385 granularity = f" GRANULARITY {granularity}" if granularity else "" 1386 1387 return f"INDEX{this}{expr}{index_type}{granularity}" 1388 1389 def partition_sql(self, expression: exp.Partition) -> str: 1390 return f"PARTITION {self.expressions(expression, flat=True)}" 1391 1392 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1393 return f"ID {self.sql(expression.this)}" 1394 1395 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1396 return ( 1397 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1398 ) 1399 1400 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1401 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1402 1403 def is_sql(self, expression: exp.Is) -> str: 1404 is_sql = super().is_sql(expression) 1405 1406 if isinstance(expression.parent, exp.Not): 1407 # value IS NOT NULL -> NOT (value IS NULL) 1408 is_sql = self.wrap(is_sql) 1409 1410 return is_sql 1411 1412 def in_sql(self, expression: exp.In) -> str: 1413 in_sql = super().in_sql(expression) 1414 1415 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1416 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1417 1418 return in_sql 1419 1420 def not_sql(self, expression: exp.Not) -> str: 1421 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1422 # let `GLOBAL IN` child interpose `NOT` 1423 return self.sql(expression, "this") 1424 1425 return super().not_sql(expression) 1426 1427 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1428 # If the VALUES clause contains tuples of expressions, we need to treat it 1429 # as a table since Clickhouse will automatically alias it as such. 1430 alias = expression.args.get("alias") 1431 1432 if alias and alias.args.get("columns") and expression.expressions: 1433 values = expression.expressions[0].expressions 1434 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1435 else: 1436 values_as_table = True 1437 1438 return super().values_sql(expression, values_as_table=values_as_table)
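As a quick orientation before the attribute documentation below, here is a minimal usage sketch (assuming only that sqlglot is installed; table and column names are made up) that round-trips a few ClickHouse-specific constructs handled by the Tokenizer, Parser and Generator above:

import sqlglot

queries = [
    "SELECT a FROM tbl FINAL",  # FINAL table modifier -> exp.Final
    "SELECT quantile(0.5)(price) FROM trades",  # parameterized aggregate
    "SELECT x > 1 ? 'big' : 'small' FROM t",  # ternary operator -> exp.If
    "SELECT {abc: UInt32}",  # query parameter -> exp.Placeholder
    "SELECT a FROM tbl SETTINGS max_threads = 4 FORMAT JSONEachRow",
]

for sql in queries:
    # each statement should parse into the dialect-specific nodes noted above
    # and regenerate as equivalent ClickHouse SQL
    print(sqlglot.parse_one(sql, read="clickhouse").sql(dialect="clickhouse"))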
NORMALIZE_FUNCTIONS: Determines how function names are going to be normalized. Possible values: "upper" or True (convert names to uppercase), "lower" (convert names to lowercase), False (disables function name normalization).
NULL_ORDERING: Default NULL ordering method to use if not explicitly set. Possible values: "nulls_are_small", "nulls_are_large", "nulls_are_last".
LOG_BASE_FIRST: Whether the base comes first in the LOG function. Possible values: True, False, None (two arguments are not supported by LOG).
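ClickHouse sets this to None, and its Parser also sets LOG_DEFAULTS_TO_LN = True, so a bare LOG(x) should be read as the natural logarithm. A sketch (names are hypothetical):

import sqlglot
from sqlglot import exp

ast = sqlglot.parse_one("SELECT LOG(price) FROM trades", read="clickhouse")
# with LOG_DEFAULTS_TO_LN, the single-argument LOG should parse as exp.Ln
print(ast.find(exp.Ln) is not None)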
FORCE_EARLY_ALIAS_REF_EXPANSION: Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()). For example:

WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1

In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- Clickhouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
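A sketch of the ClickHouse resolution described above, run through the qualify optimizer:

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id FROM data
WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
"""

# under ClickHouse's early alias expansion, my_id in WHERE/GROUP BY/HAVING
# should resolve to id, per the description above
print(qualify(sqlglot.parse_one(sql, read="clickhouse"), dialect="clickhouse").sql("clickhouse"))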
PRESERVE_ORIGINAL_NAMES: Whether the name of the function should be preserved inside the node's metadata. This can be useful for round-tripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
NUMBERS_CAN_BE_UNDERSCORE_SEPARATED: Whether number literals can include underscores for better readability.
HEX_STRING_IS_INTEGER_TYPE: Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
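Both of these flags are observable at parse time; a minimal sketch (literal values chosen arbitrarily):

import sqlglot

# underscore-separated numbers and 0x-prefixed hex literals both tokenize
# under this dialect, the latter as an integer-typed hex string
print(sqlglot.parse_one("SELECT 1_000_000, 0xFF", read="clickhouse").sql(dialect="clickhouse"))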
NORMALIZATION_STRATEGY: Specifies the strategy according to which identifiers should be normalized.
UNESCAPED_SEQUENCES: Mapping of an escaped sequence (e.g. "\n") to its unescaped version (the literal newline character). For ClickHouse, "\0" maps to the NUL character, as shown in the source above.
CREATABLE_KIND_MAPPING: Helper for dialects that use a different name for the same creatable kind. For example, the Clickhouse equivalent of CREATE SCHEMA is CREATE DATABASE.
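A hedged sketch of the mapping in action (the source dialect here is an arbitrary choice):

import sqlglot

# per CREATABLE_KIND_MAPPING, the generic SCHEMA kind corresponds to
# ClickHouse's DATABASE, so this should come out as CREATE DATABASE s
print(sqlglot.transpile("CREATE SCHEMA s", read="duckdb", write="clickhouse")[0])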
SET_OP_DISTINCT_BY_DEFAULT: Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
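A sketch of the consequence for transpilation (the target dialect is an arbitrary choice):

import sqlglot

# in ClickHouse a bare EXCEPT keeps duplicates (exp.Except: False above), so
# writing to a dialect where EXCEPT is DISTINCT by default should make the
# ALL semantics explicit
print(sqlglot.transpile("SELECT a FROM t1 EXCEPT SELECT a FROM t2", read="clickhouse", write="duckdb")[0])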
301 class Parser(parser.Parser): 302 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 303 # * select x from t1 union all select x from t2 limit 1; 304 # * select x from t1 union all (select x from t2 limit 1); 305 MODIFIERS_ATTACHED_TO_SET_OP = False 306 INTERVAL_SPANS = False 307 OPTIONAL_ALIAS_TOKEN_CTE = False 308 JOINS_HAVE_EQUAL_PRECEDENCE = True 309 310 FUNCTIONS = { 311 **parser.Parser.FUNCTIONS, 312 "ANY": exp.AnyValue.from_arg_list, 313 "ARRAYSUM": exp.ArraySum.from_arg_list, 314 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 315 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 316 "COUNTIF": _build_count_if, 317 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 318 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 319 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 320 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 321 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 322 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 323 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 324 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 325 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 326 "JSONEXTRACTSTRING": build_json_extract_path( 327 exp.JSONExtractScalar, zero_based_indexing=False 328 ), 329 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 330 "L2Distance": exp.EuclideanDistance.from_arg_list, 331 "MAP": parser.build_var_map, 332 "MATCH": exp.RegexpLike.from_arg_list, 333 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 334 "RANDCANONICAL": exp.Rand.from_arg_list, 335 "STR_TO_DATE": _build_str_to_date, 336 "TUPLE": exp.Struct.from_arg_list, 337 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 338 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 339 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 340 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 341 "UNIQ": exp.ApproxDistinct.from_arg_list, 342 "XOR": lambda args: exp.Xor(expressions=args), 343 "MD5": exp.MD5Digest.from_arg_list, 344 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 345 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 346 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 347 "TOTYPENAME": exp.Typeof.from_arg_list, 348 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 349 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 350 } 351 FUNCTIONS.pop("TRANSFORM") 352 FUNCTIONS.pop("APPROX_TOP_SUM") 353 354 AGG_FUNCTIONS = { 355 "count", 356 "min", 357 "max", 358 "sum", 359 "avg", 360 "any", 361 "stddevPop", 362 "stddevSamp", 363 "varPop", 364 "varSamp", 365 "corr", 366 "covarPop", 367 "covarSamp", 368 "entropy", 369 "exponentialMovingAverage", 370 "intervalLengthSum", 371 "kolmogorovSmirnovTest", 372 "mannWhitneyUTest", 373 "median", 374 "rankCorr", 375 "sumKahan", 376 "studentTTest", 377 "welchTTest", 378 "anyHeavy", 379 "anyLast", 380 "boundingRatio", 381 "first_value", 382 "last_value", 383 "argMin", 384 "argMax", 385 "avgWeighted", 386 "topK", 387 "approx_top_sum", 388 "topKWeighted", 389 "deltaSum", 390 "deltaSumTimestamp", 391 "groupArray", 392 "groupArrayLast", 393 "groupUniqArray", 394 "groupArrayInsertAt", 395 "groupArrayMovingAvg", 396 "groupArrayMovingSum", 397 "groupArraySample", 398 "groupBitAnd", 399 "groupBitOr", 400 
"groupBitXor", 401 "groupBitmap", 402 "groupBitmapAnd", 403 "groupBitmapOr", 404 "groupBitmapXor", 405 "sumWithOverflow", 406 "sumMap", 407 "minMap", 408 "maxMap", 409 "skewSamp", 410 "skewPop", 411 "kurtSamp", 412 "kurtPop", 413 "uniq", 414 "uniqExact", 415 "uniqCombined", 416 "uniqCombined64", 417 "uniqHLL12", 418 "uniqTheta", 419 "quantile", 420 "quantiles", 421 "quantileExact", 422 "quantilesExact", 423 "quantileExactLow", 424 "quantilesExactLow", 425 "quantileExactHigh", 426 "quantilesExactHigh", 427 "quantileExactWeighted", 428 "quantilesExactWeighted", 429 "quantileTiming", 430 "quantilesTiming", 431 "quantileTimingWeighted", 432 "quantilesTimingWeighted", 433 "quantileDeterministic", 434 "quantilesDeterministic", 435 "quantileTDigest", 436 "quantilesTDigest", 437 "quantileTDigestWeighted", 438 "quantilesTDigestWeighted", 439 "quantileBFloat16", 440 "quantilesBFloat16", 441 "quantileBFloat16Weighted", 442 "quantilesBFloat16Weighted", 443 "simpleLinearRegression", 444 "stochasticLinearRegression", 445 "stochasticLogisticRegression", 446 "categoricalInformationValue", 447 "contingency", 448 "cramersV", 449 "cramersVBiasCorrected", 450 "theilsU", 451 "maxIntersections", 452 "maxIntersectionsPosition", 453 "meanZTest", 454 "quantileInterpolatedWeighted", 455 "quantilesInterpolatedWeighted", 456 "quantileGK", 457 "quantilesGK", 458 "sparkBar", 459 "sumCount", 460 "largestTriangleThreeBuckets", 461 "histogram", 462 "sequenceMatch", 463 "sequenceCount", 464 "windowFunnel", 465 "retention", 466 "uniqUpTo", 467 "sequenceNextNode", 468 "exponentialTimeDecayedAvg", 469 } 470 471 AGG_FUNCTIONS_SUFFIXES = [ 472 "If", 473 "Array", 474 "ArrayIf", 475 "Map", 476 "SimpleState", 477 "State", 478 "Merge", 479 "MergeState", 480 "ForEach", 481 "Distinct", 482 "OrDefault", 483 "OrNull", 484 "Resample", 485 "ArgMin", 486 "ArgMax", 487 ] 488 489 FUNC_TOKENS = { 490 *parser.Parser.FUNC_TOKENS, 491 TokenType.AND, 492 TokenType.OR, 493 TokenType.SET, 494 } 495 496 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 497 498 ID_VAR_TOKENS = { 499 *parser.Parser.ID_VAR_TOKENS, 500 TokenType.LIKE, 501 } 502 503 AGG_FUNC_MAPPING = ( 504 lambda functions, suffixes: { 505 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 506 } 507 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 508 509 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"} 510 511 FUNCTION_PARSERS = { 512 **parser.Parser.FUNCTION_PARSERS, 513 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 514 "QUANTILE": lambda self: self._parse_quantile(), 515 "MEDIAN": lambda self: self._parse_quantile(), 516 "COLUMNS": lambda self: self._parse_columns(), 517 } 518 519 FUNCTION_PARSERS.pop("MATCH") 520 521 PROPERTY_PARSERS = { 522 **parser.Parser.PROPERTY_PARSERS, 523 "ENGINE": lambda self: self._parse_engine_property(), 524 } 525 PROPERTY_PARSERS.pop("DYNAMIC") 526 527 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 528 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 529 530 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 531 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 532 533 RANGE_PARSERS = { 534 **parser.Parser.RANGE_PARSERS, 535 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 536 } 537 538 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 539 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 
540 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 541 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 542 543 JOIN_KINDS = { 544 *parser.Parser.JOIN_KINDS, 545 TokenType.ANY, 546 TokenType.ASOF, 547 TokenType.ARRAY, 548 } 549 550 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 551 TokenType.ANY, 552 TokenType.ARRAY, 553 TokenType.FINAL, 554 TokenType.FORMAT, 555 TokenType.SETTINGS, 556 } 557 558 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 559 TokenType.FORMAT, 560 } 561 562 LOG_DEFAULTS_TO_LN = True 563 564 QUERY_MODIFIER_PARSERS = { 565 **parser.Parser.QUERY_MODIFIER_PARSERS, 566 TokenType.SETTINGS: lambda self: ( 567 "settings", 568 self._advance() or self._parse_csv(self._parse_assignment), 569 ), 570 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 571 } 572 573 CONSTRAINT_PARSERS = { 574 **parser.Parser.CONSTRAINT_PARSERS, 575 "INDEX": lambda self: self._parse_index_constraint(), 576 "CODEC": lambda self: self._parse_compress(), 577 } 578 579 ALTER_PARSERS = { 580 **parser.Parser.ALTER_PARSERS, 581 "REPLACE": lambda self: self._parse_alter_table_replace(), 582 } 583 584 SCHEMA_UNNAMED_CONSTRAINTS = { 585 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 586 "INDEX", 587 } 588 589 PLACEHOLDER_PARSERS = { 590 **parser.Parser.PLACEHOLDER_PARSERS, 591 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 592 } 593 594 def _parse_engine_property(self) -> exp.EngineProperty: 595 self._match(TokenType.EQ) 596 return self.expression( 597 exp.EngineProperty, 598 this=self._parse_field(any_token=True, anonymous_func=True), 599 ) 600 601 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 602 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 603 return self._parse_lambda() 604 605 def _parse_types( 606 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 607 ) -> t.Optional[exp.Expression]: 608 dtype = super()._parse_types( 609 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 610 ) 611 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 612 # Mark every type as non-nullable which is ClickHouse's default, unless it's 613 # already marked as nullable. This marker helps us transpile types from other 614 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 615 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 616 # fail in ClickHouse without the `Nullable` type constructor. 617 dtype.set("nullable", False) 618 619 return dtype 620 621 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 622 index = self._index 623 this = self._parse_bitwise() 624 if self._match(TokenType.FROM): 625 self._retreat(index) 626 return super()._parse_extract() 627 628 # We return Anonymous here because extract and regexpExtract have different semantics, 629 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 630 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 631 # 632 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 
633 self._match(TokenType.COMMA) 634 return self.expression( 635 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 636 ) 637 638 def _parse_assignment(self) -> t.Optional[exp.Expression]: 639 this = super()._parse_assignment() 640 641 if self._match(TokenType.PLACEHOLDER): 642 return self.expression( 643 exp.If, 644 this=this, 645 true=self._parse_assignment(), 646 false=self._match(TokenType.COLON) and self._parse_assignment(), 647 ) 648 649 return this 650 651 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 652 """ 653 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 654 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 655 """ 656 index = self._index 657 658 this = self._parse_id_var() 659 self._match(TokenType.COLON) 660 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 661 self._match_text_seq("IDENTIFIER") and "Identifier" 662 ) 663 664 if not kind: 665 self._retreat(index) 666 return None 667 elif not self._match(TokenType.R_BRACE): 668 self.raise_error("Expecting }") 669 670 if isinstance(this, exp.Identifier) and not this.quoted: 671 this = exp.var(this.name) 672 673 return self.expression(exp.Placeholder, this=this, kind=kind) 674 675 def _parse_bracket( 676 self, this: t.Optional[exp.Expression] = None 677 ) -> t.Optional[exp.Expression]: 678 l_brace = self._match(TokenType.L_BRACE, advance=False) 679 bracket = super()._parse_bracket(this) 680 681 if l_brace and isinstance(bracket, exp.Struct): 682 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 683 for expression in bracket.expressions: 684 if not isinstance(expression, exp.PropertyEQ): 685 break 686 687 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 688 varmap.args["values"].append("expressions", expression.expression) 689 690 return varmap 691 692 return bracket 693 694 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 695 this = super()._parse_in(this) 696 this.set("is_global", is_global) 697 return this 698 699 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 700 is_negated = self._match(TokenType.NOT) 701 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 702 return self.expression(exp.Not, this=this) if is_negated else this 703 704 def _parse_table( 705 self, 706 schema: bool = False, 707 joins: bool = False, 708 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 709 parse_bracket: bool = False, 710 is_db_reference: bool = False, 711 parse_partition: bool = False, 712 consume_pipe: bool = False, 713 ) -> t.Optional[exp.Expression]: 714 this = super()._parse_table( 715 schema=schema, 716 joins=joins, 717 alias_tokens=alias_tokens, 718 parse_bracket=parse_bracket, 719 is_db_reference=is_db_reference, 720 ) 721 722 if isinstance(this, exp.Table): 723 inner = this.this 724 alias = this.args.get("alias") 725 726 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 727 alias.set("columns", [exp.to_identifier("generate_series")]) 728 729 if self._match(TokenType.FINAL): 730 this = self.expression(exp.Final, this=this) 731 732 return this 733 734 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 735 return super()._parse_position(haystack_first=True) 736 737 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 738 def _parse_cte(self) -> t.Optional[exp.CTE]: 739 # WITH <identifier> AS 
<subquery expression>
            cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte)

            if not cte:
                # WITH <expression> AS <identifier>
                cte = self.expression(
                    exp.CTE,
                    this=self._parse_assignment(),
                    alias=self._parse_table_alias(),
                    scalar=True,
                )

            return cte

        def _parse_join_parts(
            self,
        ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]:
            is_global = self._match(TokenType.GLOBAL) and self._prev
            kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev

            if kind_pre:
                kind = self._match_set(self.JOIN_KINDS) and self._prev
                side = self._match_set(self.JOIN_SIDES) and self._prev
                return is_global, side, kind

            return (
                is_global,
                self._match_set(self.JOIN_SIDES) and self._prev,
                self._match_set(self.JOIN_KINDS) and self._prev,
            )

        def _parse_join(
            self, skip_join_token: bool = False, parse_bracket: bool = False
        ) -> t.Optional[exp.Join]:
            join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True)
            if join:
                join.set("global", join.args.pop("method", None))

                # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table`
                # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join
                if join.kind == "ARRAY":
                    for table in join.find_all(exp.Table):
                        table.replace(table.to_column())

            return join

        def _parse_function(
            self,
            functions: t.Optional[t.Dict[str, t.Callable]] = None,
            anonymous: bool = False,
            optional_parens: bool = True,
            any_token: bool = False,
        ) -> t.Optional[exp.Expression]:
            expr = super()._parse_function(
                functions=functions,
                anonymous=anonymous,
                optional_parens=optional_parens,
                any_token=any_token,
            )

            func = expr.this if isinstance(expr, exp.Window) else expr

            # Aggregate functions can be split in 2 parts: <func_name><suffix>
            parts = (
                self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None
            )

            if parts:
                anon_func: exp.Anonymous = t.cast(exp.Anonymous, func)
                params = self._parse_func_params(anon_func)

                kwargs = {
                    "this": anon_func.this,
                    "expressions": anon_func.expressions,
                }
                if parts[1]:
                    exp_class: t.Type[exp.Expression] = (
                        exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc
                    )
                else:
                    exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc

                kwargs["exp_class"] = exp_class
                if params:
                    kwargs["params"] = params

                func = self.expression(**kwargs)

                if isinstance(expr, exp.Window):
                    # The window's func was parsed as Anonymous in base parser, fix its
                    # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc
                    expr.set("this", func)
                elif params:
                    # Params have blocked super()._parse_function() from parsing the following window
                    # (if that exists) as they're standing between the function call and the window spec
                    expr = self._parse_window(func)
                else:
                    expr = func

            return expr

        def _parse_func_params(
            self, this: t.Optional[exp.Func] = None
        ) -> t.Optional[t.List[exp.Expression]]:
            if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN):
                return self._parse_csv(self._parse_lambda)

            if self._match(TokenType.L_PAREN):
                params = self._parse_csv(self._parse_lambda)
                self._match_r_paren(this)
                return params

            return None

        def _parse_quantile(self) -> exp.Quantile:
            this = self._parse_lambda()
            params = self._parse_func_params()
            if params:
                return self.expression(exp.Quantile, this=params[0], quantile=this)
            return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5))

        def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]:
            return super()._parse_wrapped_id_vars(optional=True)

        def _parse_primary_key(
            self, wrapped_optional: bool = False, in_props: bool = False
        ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey:
            return super()._parse_primary_key(
                wrapped_optional=wrapped_optional or in_props, in_props=in_props
            )

        def _parse_on_property(self) -> t.Optional[exp.Expression]:
            index = self._index
            if self._match_text_seq("CLUSTER"):
                this = self._parse_string() or self._parse_id_var()
                if this:
                    return self.expression(exp.OnCluster, this=this)
                else:
                    self._retreat(index)
            return None

        def _parse_index_constraint(
            self, kind: t.Optional[str] = None
        ) -> exp.IndexColumnConstraint:
            # INDEX name1 expr TYPE type1(args) GRANULARITY value
            this = self._parse_id_var()
            expression = self._parse_assignment()

            index_type = self._match_text_seq("TYPE") and (
                self._parse_function() or self._parse_var()
            )

            granularity = self._match_text_seq("GRANULARITY") and self._parse_term()

            return self.expression(
                exp.IndexColumnConstraint,
                this=this,
                expression=expression,
                index_type=index_type,
                granularity=granularity,
            )

        def _parse_partition(self) -> t.Optional[exp.Partition]:
            # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression
            if not self._match(TokenType.PARTITION):
                return None

            if self._match_text_seq("ID"):
                # Corresponds to the PARTITION ID <string_value> syntax
                expressions: t.List[exp.Expression] = [
                    self.expression(exp.PartitionId, this=self._parse_string())
                ]
            else:
                expressions = self._parse_expressions()

            return self.expression(exp.Partition, expressions=expressions)

        def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]:
            partition = self._parse_partition()

            if not partition or not self._match(TokenType.FROM):
                return None

            return self.expression(
                exp.ReplacePartition, expression=partition, source=self._parse_table_parts()
            )

        def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]:
            if not self._match_text_seq("PROJECTION"):
                return None

            return self.expression(
                exp.ProjectionDef,
                this=self._parse_id_var(),
                expression=self._parse_wrapped(self._parse_statement),
            )

        def _parse_constraint(self) -> t.Optional[exp.Expression]:
            return super()._parse_constraint() or self._parse_projection_def()

        def _parse_alias(
            self, this: t.Optional[exp.Expression], explicit: bool = False
        ) -> t.Optional[exp.Expression]:
            # In ClickHouse "SELECT <expr> APPLY(...)" is a query modifier, so "APPLY"
            # shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias
            if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False):
                return this

            return super()._parse_alias(this=this, explicit=explicit)

        def _parse_expression(self) -> t.Optional[exp.Expression]:
            this = super()._parse_expression()

            # ClickHouse allows "SELECT <expr> [APPLY(func)] [...]" modifier
            while self._match_pair(TokenType.APPLY, TokenType.L_PAREN):
                this = exp.Apply(this=this, expression=self._parse_var(any_token=True))
                self._match(TokenType.R_PAREN)

            return this

        def _parse_columns(self) -> exp.Expression:
            this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda())

            while self._next and self._match_text_seq(")", "APPLY", "("):
                self._match(TokenType.R_PAREN)
                this = exp.Apply(this=this, expression=self._parse_var(any_token=True))
            return this

        def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]:
            value = super()._parse_value(values=values)
            if not value:
                return None

            # In ClickHouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in
            # contrast to other dialects. For this case, we canonicalize the values into a tuple-of-tuples
            # AST if it's not already one. In INSERT INTO statements the same clause actually references
            # multiple columns (opposite semantics), but the final result is not altered by the extra
            # parentheses. Note: ClickHouse allows VALUES([structure], value, ...) so the branch checks
            # for the last expression
            expressions = value.expressions
            if values and not isinstance(expressions[-1], exp.Tuple):
                value.set(
                    "expressions",
                    [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions],
                )

            return value

        def _parse_partitioned_by(self) -> exp.PartitionedByProperty:
            # ClickHouse allows custom expressions as partition key
            # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key
            return self.expression(
                exp.PartitionedByProperty,
                this=self._parse_assignment(),
            )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
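As a usage sketch (not part of this module), these arguments can be supplied through sqlglot's top-level helpers, which forward extra keyword arguments to the dialect's Parser:

```python
import sqlglot
from sqlglot.errors import ErrorLevel

# Extra kwargs such as error_level are forwarded to ClickHouse.Parser
expression = sqlglot.parse_one(
    "SELECT countIf(x > 1) FROM t",
    read="clickhouse",
    error_level=ErrorLevel.RAISE,
)
print(expression.sql("clickhouse"))  # SELECT countIf(x > 1) FROM t
```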
    class Generator(generator.Generator):
        QUERY_HINTS = False
        STRUCT_DELIMITER = ("(", ")")
        NVL2_SUPPORTED = False
        TABLESAMPLE_REQUIRES_PARENS = False
        TABLESAMPLE_SIZE_IS_ROWS = False
        TABLESAMPLE_KEYWORDS = "SAMPLE"
        LAST_DAY_SUPPORTS_DATE_PART = False
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        JOIN_HINTS = False
        TABLE_HINTS = False
        GROUPINGS_SEP = ""
        SET_OP_MODIFIERS = False
        ARRAY_SIZE_NAME = "LENGTH"
        WRAP_DERIVED_VALUES = False

        STRING_TYPE_MAPPING = {
            exp.DataType.Type.BLOB: "String",
            exp.DataType.Type.CHAR: "String",
            exp.DataType.Type.LONGBLOB: "String",
            exp.DataType.Type.LONGTEXT: "String",
            exp.DataType.Type.MEDIUMBLOB: "String",
            exp.DataType.Type.MEDIUMTEXT: "String",
            exp.DataType.Type.TINYBLOB: "String",
            exp.DataType.Type.TINYTEXT: "String",
            exp.DataType.Type.TEXT: "String",
            exp.DataType.Type.VARBINARY: "String",
            exp.DataType.Type.VARCHAR: "String",
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            **STRING_TYPE_MAPPING,
            exp.DataType.Type.ARRAY: "Array",
            exp.DataType.Type.BOOLEAN: "Bool",
            exp.DataType.Type.BIGINT: "Int64",
            exp.DataType.Type.DATE32: "Date32",
            exp.DataType.Type.DATETIME: "DateTime",
            exp.DataType.Type.DATETIME2: "DateTime",
            exp.DataType.Type.SMALLDATETIME: "DateTime",
            exp.DataType.Type.DATETIME64: "DateTime64",
            exp.DataType.Type.DECIMAL: "Decimal",
            exp.DataType.Type.DECIMAL32: "Decimal32",
            exp.DataType.Type.DECIMAL64: "Decimal64",
            exp.DataType.Type.DECIMAL128: "Decimal128",
            exp.DataType.Type.DECIMAL256: "Decimal256",
            exp.DataType.Type.TIMESTAMP: "DateTime",
            exp.DataType.Type.TIMESTAMPNTZ: "DateTime",
            exp.DataType.Type.TIMESTAMPTZ: "DateTime",
            exp.DataType.Type.DOUBLE: "Float64",
            exp.DataType.Type.ENUM: "Enum",
            exp.DataType.Type.ENUM8: "Enum8",
            exp.DataType.Type.ENUM16: "Enum16",
            exp.DataType.Type.FIXEDSTRING: "FixedString",
            exp.DataType.Type.FLOAT: "Float32",
            exp.DataType.Type.INT: "Int32",
            exp.DataType.Type.MEDIUMINT: "Int32",
            exp.DataType.Type.INT128: "Int128",
            exp.DataType.Type.INT256: "Int256",
            exp.DataType.Type.LOWCARDINALITY: "LowCardinality",
            exp.DataType.Type.MAP: "Map",
            exp.DataType.Type.NESTED: "Nested",
            exp.DataType.Type.NOTHING: "Nothing",
            exp.DataType.Type.SMALLINT: "Int16",
            exp.DataType.Type.STRUCT: "Tuple",
            exp.DataType.Type.TINYINT: "Int8",
            exp.DataType.Type.UBIGINT: "UInt64",
            exp.DataType.Type.UINT: "UInt32",
            exp.DataType.Type.UINT128: "UInt128",
            exp.DataType.Type.UINT256: "UInt256",
            exp.DataType.Type.USMALLINT: "UInt16",
            exp.DataType.Type.UTINYINT: "UInt8",
            exp.DataType.Type.IPV4: "IPv4",
            exp.DataType.Type.IPV6: "IPv6",
            exp.DataType.Type.POINT: "Point",
            exp.DataType.Type.RING: "Ring",
            exp.DataType.Type.LINESTRING: "LineString",
            exp.DataType.Type.MULTILINESTRING: "MultiLineString",
            exp.DataType.Type.POLYGON: "Polygon",
            exp.DataType.Type.MULTIPOLYGON: "MultiPolygon",
            exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction",
            exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction",
            exp.DataType.Type.DYNAMIC: "Dynamic",
        }

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.AnyValue: rename_func("any"),
            exp.ApproxDistinct: rename_func("uniq"),
            exp.ArrayConcat: rename_func("arrayConcat"),
            exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this),
            exp.ArrayRemove: remove_from_array_using_filter,
            exp.ArrayReverse: rename_func("arrayReverse"),
            exp.ArraySlice: rename_func("arraySlice"),
            exp.ArraySum: rename_func("arraySum"),
            exp.ArgMax: arg_max_or_min_no_count("argMax"),
            exp.ArgMin: arg_max_or_min_no_count("argMin"),
            exp.Array: inline_array_sql,
            exp.CastToStrType: rename_func("CAST"),
            exp.CountIf: rename_func("countIf"),
            exp.CosineDistance: rename_func("cosineDistance"),
            exp.CompressColumnConstraint: lambda self,
            e: f"CODEC({self.expressions(e, key='this', flat=True)})",
            exp.ComputedColumnConstraint: lambda self,
            e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}",
            exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"),
            exp.DateAdd: _datetime_delta_sql("DATE_ADD"),
            exp.DateDiff: _datetime_delta_sql("DATE_DIFF"),
            exp.DateStrToDate: rename_func("toDate"),
            exp.DateSub: _datetime_delta_sql("DATE_SUB"),
            exp.Explode: rename_func("arrayJoin"),
            exp.FarmFingerprint: rename_func("farmFingerprint64"),
            exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL",
            exp.IsNan: rename_func("isNaN"),
            exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}",
            exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False),
            exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False),
            exp.JSONPathKey: json_path_key_only_name,
            exp.JSONPathRoot: lambda *_: "",
            exp.Length: length_or_char_length_sql,
            exp.Map: _map_sql,
            exp.Median: rename_func("median"),
            exp.Nullif: rename_func("nullIf"),
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.Pivot: no_pivot_sql,
            exp.Quantile: _quantile_sql,
            exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression),
            exp.Rand: rename_func("randCanonical"),
            exp.StartsWith: rename_func("startsWith"),
            exp.EndsWith: rename_func("endsWith"),
            exp.EuclideanDistance: rename_func("L2Distance"),
            exp.StrPosition: lambda self, e: strposition_sql(
                self,
                e,
                func_name="POSITION",
                supports_position=True,
                use_ansi_position=False,
            ),
            exp.TimeToStr: lambda self, e: self.func(
                "formatDateTime", e.this, self.format_time(e), e.args.get("zone")
            ),
            exp.TimeStrToTime: _timestrtotime_sql,
            exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"),
            exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"),
            exp.Typeof: rename_func("toTypeName"),
            exp.VarMap: _map_sql,
            exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions),
            exp.MD5Digest: rename_func("MD5"),
            exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.UnixToTime: _unix_to_time_sql,
            exp.TimestampTrunc: timestamptrunc_sql(zone=True),
            exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"),
            exp.Variance: rename_func("varSamp"),
            exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
            exp.Stddev: rename_func("stddevSamp"),
            exp.Chr: rename_func("CHAR"),
            exp.Lag: lambda self, e: self.func(
                "lagInFrame", e.this, e.args.get("offset"), e.args.get("default")
            ),
            exp.Lead: lambda self, e: self.func(
                "leadInFrame", e.this, e.args.get("offset"), e.args.get("default")
            ),
            exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")(
                rename_func("editDistance")
            ),
            exp.ParseDatetime: rename_func("parseDateTime"),
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.OnCluster: exp.Properties.Location.POST_NAME,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.ToTableProperty: exp.Properties.Location.POST_NAME,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # There's no list in docs, but it can be found in ClickHouse code
        # see `ClickHouse/src/Parsers/ParserCreate*.cpp`
        ON_CLUSTER_TARGETS = {
            "SCHEMA",  # Transpiled CREATE SCHEMA may have OnCluster property set
            "DATABASE",
            "TABLE",
            "VIEW",
            "DICTIONARY",
            "INDEX",
            "FUNCTION",
            "NAMED COLLECTION",
        }

        # https://clickhouse.com/docs/en/sql-reference/data-types/nullable
        NON_NULLABLE_TYPES = {
            exp.DataType.Type.ARRAY,
            exp.DataType.Type.MAP,
            exp.DataType.Type.STRUCT,
            exp.DataType.Type.POINT,
            exp.DataType.Type.RING,
            exp.DataType.Type.LINESTRING,
            exp.DataType.Type.MULTILINESTRING,
            exp.DataType.Type.POLYGON,
            exp.DataType.Type.MULTIPOLYGON,
        }

        def offset_sql(self, expression: exp.Offset) -> str:
            offset = super().offset_sql(expression)

            # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword
            # https://clickhouse.com/docs/sql-reference/statements/select/offset
            parent = expression.parent
            if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch):
                offset = f"{offset} ROWS"

            return offset

        def strtodate_sql(self, expression: exp.StrToDate) -> str:
            strtodate_sql = self.function_fallback_sql(expression)

            if not isinstance(expression.parent, exp.Cast):
                # StrToDate returns DATEs in other dialects (e.g. Postgres), so
                # this branch aims to improve the transpilation to ClickHouse
                return self.cast_sql(exp.cast(expression, "DATE"))

            return strtodate_sql

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"):
                return self.sql(this)

            return super().cast_sql(expression, safe_prefix=safe_prefix)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            dtype = expression.to
            if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True):
                # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T)
                dtype.set("nullable", True)

            return super().cast_sql(expression)

        def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str:
            this = self.json_path_part(expression.this)
            return str(int(this) + 1) if is_int(this) else this

        def likeproperty_sql(self, expression: exp.LikeProperty) -> str:
            return f"AS {self.sql(expression, 'this')}"

        def _any_to_has(
            self,
            expression: exp.EQ | exp.NEQ,
            default: t.Callable[[t.Any], str],
            prefix: str = "",
        ) -> str:
            if isinstance(expression.left, exp.Any):
                arr = expression.left
                this = expression.right
            elif isinstance(expression.right, exp.Any):
                arr = expression.right
                this = expression.left
            else:
                return default(expression)

            return prefix + self.func("has", arr.this.unnest(), this)

        def eq_sql(self, expression: exp.EQ) -> str:
            return self._any_to_has(expression, super().eq_sql)

        def neq_sql(self, expression: exp.NEQ) -> str:
            return self._any_to_has(expression, super().neq_sql, "NOT ")

        def regexpilike_sql(self, expression: exp.RegexpILike) -> str:
            # Manually add a flag to make the search case-insensitive
            regex = self.func("CONCAT", "'(?i)'", expression.expression)
            return self.func("match", expression.this, regex)

        def datatype_sql(self, expression: exp.DataType) -> str:
            # String is the standard ClickHouse type, every other variant is just an alias.
            # Additionally, any supplied length parameter will be ignored.
            #
            # https://clickhouse.com/docs/en/sql-reference/data-types/string
            if expression.this in self.STRING_TYPE_MAPPING:
                dtype = "String"
            else:
                dtype = super().datatype_sql(expression)

            # This section changes the type to `Nullable(...)` if the following conditions hold:
            # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable`
            #   and change their semantics
            # - It's not the key type of a `Map`. This is because ClickHouse enforces the following
            #   constraint: "Type of Map key must be a type, that can be represented by integer or
            #   String or FixedString (possibly LowCardinality) or UUID or IPv6"
            # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type
            parent = expression.parent
            nullable = expression.args.get("nullable")
            if nullable is True or (
                nullable is None
                and not (
                    isinstance(parent, exp.DataType)
                    and parent.is_type(exp.DataType.Type.MAP, check_nullable=True)
                    and expression.index in (None, 0)
                )
                and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True)
            ):
                dtype = f"Nullable({dtype})"

            return dtype

        def cte_sql(self, expression: exp.CTE) -> str:
            if expression.args.get("scalar"):
                this = self.sql(expression, "this")
                alias = self.sql(expression, "alias")
                return f"{this} AS {alias}"

            return super().cte_sql(expression)

        def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]:
            return super().after_limit_modifiers(expression) + [
                (
                    self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True)
                    if expression.args.get("settings")
                    else ""
                ),
                (
                    self.seg("FORMAT ") + self.sql(expression, "format")
                    if expression.args.get("format")
                    else ""
                ),
            ]

        def placeholder_sql(self, expression: exp.Placeholder) -> str:
            return f"{{{expression.name}: {self.sql(expression, 'kind')}}}"

        def oncluster_sql(self, expression: exp.OnCluster) -> str:
            return f"ON CLUSTER {self.sql(expression, 'this')}"

        def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str:
            if expression.kind in self.ON_CLUSTER_TARGETS and locations.get(
                exp.Properties.Location.POST_NAME
            ):
                this_name = self.sql(
                    expression.this if isinstance(expression.this, exp.Schema) else expression,
                    "this",
                )
                this_properties = " ".join(
                    [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]]
                )
                this_schema = self.schema_columns_sql(expression.this)
                this_schema = f"{self.sep()}{this_schema}" if this_schema else ""

                return f"{this_name}{self.sep()}{this_properties}{this_schema}"

            return super().createable_sql(expression, locations)

        def create_sql(self, expression: exp.Create) -> str:
            # The comment property comes last in CTAS statements, i.e. after the query
            query = expression.expression
            if isinstance(query, exp.Query):
                comment_prop = expression.find(exp.SchemaCommentProperty)
                if comment_prop:
                    comment_prop.pop()
                    query.replace(exp.paren(query))
            else:
                comment_prop = None

            create_sql = super().create_sql(expression)

            comment_sql = self.sql(comment_prop)
            comment_sql = f" {comment_sql}" if comment_sql else ""

            return f"{create_sql}{comment_sql}"

        def prewhere_sql(self, expression: exp.PreWhere) -> str:
            this = self.indent(self.sql(expression, "this"))
            return f"{self.seg('PREWHERE')}{self.sep()}{this}"

        def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str:
            this = self.sql(expression, "this")
            this = f" {this}" if this else ""
            expr = self.sql(expression, "expression")
            expr = f" {expr}" if expr else ""
            index_type = self.sql(expression, "index_type")
            index_type = f" TYPE {index_type}" if index_type else ""
            granularity = self.sql(expression, "granularity")
            granularity = f" GRANULARITY {granularity}" if granularity else ""

            return f"INDEX{this}{expr}{index_type}{granularity}"

        def partition_sql(self, expression: exp.Partition) -> str:
            return f"PARTITION {self.expressions(expression, flat=True)}"

        def partitionid_sql(self, expression: exp.PartitionId) -> str:
            return f"ID {self.sql(expression.this)}"

        def replacepartition_sql(self, expression: exp.ReplacePartition) -> str:
            return (
                f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}"
            )

        def projectiondef_sql(self, expression: exp.ProjectionDef) -> str:
            return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}"

        def is_sql(self, expression: exp.Is) -> str:
            is_sql = super().is_sql(expression)

            if isinstance(expression.parent, exp.Not):
                # value IS NOT NULL -> NOT (value IS NULL)
                is_sql = self.wrap(is_sql)

            return is_sql

        def in_sql(self, expression: exp.In) -> str:
            in_sql = super().in_sql(expression)

            if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"):
                in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1)

            return in_sql

        def not_sql(self, expression: exp.Not) -> str:
            if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"):
                # let `GLOBAL IN` child interpose `NOT`
                return self.sql(expression, "this")

            return super().not_sql(expression)

        def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str:
            # If the VALUES clause contains tuples of expressions, we need to treat it
            # as a table since ClickHouse will automatically alias it as such.
            alias = expression.args.get("alias")

            if alias and alias.args.get("columns") and expression.expressions:
                values = expression.expressions[0].expressions
                values_as_table = any(isinstance(value, exp.Tuple) for value in values)
            else:
                values_as_table = True

            return super().values_sql(expression, values_as_table=values_as_table)
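As an illustration of the type mapping and Nullable wrapping implemented above, here is a small sketch against sqlglot's public API; the printed output is indicative and may differ slightly across sqlglot versions.

```python
import sqlglot

# String-type aliases collapse to String (length ignored) and nullable columns
# are wrapped, per STRING_TYPE_MAPPING and datatype_sql above
print(sqlglot.transpile("CREATE TABLE t (c VARCHAR(10))", read="mysql", write="clickhouse")[0])
# e.g. CREATE TABLE t (c Nullable(String))

# TRY_CAST is rendered as a cast to a Nullable type, per trycast_sql
print(sqlglot.transpile("SELECT TRY_CAST(x AS INT)", write="clickhouse")[0])
# e.g. SELECT CAST(x AS Nullable(Int32))
```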
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default: ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
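A small usage sketch tying these options together (illustrative; the exact pretty-printed layout depends on the sqlglot version):

```python
import sqlglot

sql = "SELECT number FROM numbers(10) WHERE number > 5 LIMIT 3 SETTINGS max_threads = 4"

# pretty=True exercises the formatting options documented above, while the
# ClickHouse Generator keeps dialect-specific clauses such as SETTINGS
print(sqlglot.transpile(sql, read="clickhouse", write="clickhouse", pretty=True)[0])
```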