sqlglot.dialects.clickhouse
from __future__ import annotations

import typing as t
import datetime

from sqlglot import exp, generator, parser, tokens
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    arg_max_or_min_no_count,
    build_date_delta,
    build_formatted_time,
    inline_array_sql,
    json_extract_segments,
    json_path_key_only_name,
    length_or_char_length_sql,
    no_pivot_sql,
    build_json_extract_path,
    rename_func,
    remove_from_array_using_filter,
    sha256_sql,
    strposition_sql,
    var_map_sql,
    timestamptrunc_sql,
    unit_to_var,
    trim_sql,
)
from sqlglot.generator import Generator
from sqlglot.helper import is_int, seq_get
from sqlglot.tokens import Token, TokenType
from sqlglot.generator import unsupported_args

DATETIME_DELTA = t.Union[
    exp.DateAdd, exp.DateDiff, exp.DateSub, exp.TimestampSub, exp.TimestampAdd
]


def _build_datetime_format(
    expr_type: t.Type[E],
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        expr = build_formatted_time(expr_type, "clickhouse")(args)

        timezone = seq_get(args, 2)
        if timezone:
            expr.set("zone", timezone)

        return expr

    return _builder


def _unix_to_time_sql(self: ClickHouse.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("fromUnixTimestamp", exp.cast(timestamp, exp.DataType.Type.BIGINT))
    if scale == exp.UnixToTime.MILLIS:
        return self.func("fromUnixTimestamp64Milli", exp.cast(timestamp, exp.DataType.Type.BIGINT))
    if scale == exp.UnixToTime.MICROS:
        return self.func("fromUnixTimestamp64Micro", exp.cast(timestamp, exp.DataType.Type.BIGINT))
    if scale == exp.UnixToTime.NANOS:
        return self.func("fromUnixTimestamp64Nano", exp.cast(timestamp, exp.DataType.Type.BIGINT))

    return self.func(
        "fromUnixTimestamp",
        exp.cast(
            exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
        ),
    )


def _lower_func(sql: str) -> str:
    index = sql.index("(")
    return sql[:index].lower() + sql[index:]


def _quantile_sql(self: ClickHouse.Generator, expression: exp.Quantile) -> str:
    quantile = expression.args["quantile"]
    args = f"({self.sql(expression, 'this')})"

    if isinstance(quantile, exp.Array):
        func = self.func("quantiles", *quantile)
    else:
        func = self.func("quantile", quantile)

    return func + args


def _build_count_if(args: t.List) -> exp.CountIf | exp.CombinedAggFunc:
    if len(args) == 1:
        return exp.CountIf(this=seq_get(args, 0))

    return exp.CombinedAggFunc(this="countIf", expressions=args)


def _build_str_to_date(args: t.List) -> exp.Cast | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="STR_TO_DATE", expressions=args)

    strtodate = exp.StrToDate.from_arg_list(args)
    return exp.cast(strtodate, exp.DataType.build(exp.DataType.Type.DATETIME))


def _datetime_delta_sql(name: str) -> t.Callable[[Generator, DATETIME_DELTA], str]:
    def _delta_sql(self: Generator, expression: DATETIME_DELTA) -> str:
        if not expression.unit:
            return rename_func(name)(self, expression)

        return self.func(
            name,
            unit_to_var(expression),
            expression.expression,
            expression.this,
            expression.args.get("zone"),
        )

    return _delta_sql
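

# Illustrative sketch (not part of the module): _datetime_delta_sql above emits
# ClickHouse's unit-first argument order, so a unit-qualified DATE_ADD should
# round-trip unchanged. Assuming only the public sqlglot entry point:
#
#   import sqlglot
#   sqlglot.transpile("SELECT DATE_ADD(DAY, 1, x)", read="clickhouse", write="clickhouse")
#   # expected: ["SELECT DATE_ADD(DAY, 1, x)"]
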

def _timestrtotime_sql(self: ClickHouse.Generator, expression: exp.TimeStrToTime):
    ts = expression.this

    tz = expression.args.get("zone")
    if tz and isinstance(ts, exp.Literal):
        # Clickhouse will not accept timestamps that include a UTC offset, so we must remove them.
        # The first step to removing is parsing the string with `datetime.datetime.fromisoformat`.
        #
        # In python <3.11, `fromisoformat()` can only parse timestamps of millisecond (3 digit)
        # or microsecond (6 digit) precision. It will error if passed any other number of fractional
        # digits, so we extract the fractional seconds and pad to 6 digits before parsing.
        ts_string = ts.name.strip()

        # separate [date and time] from [fractional seconds and UTC offset]
        ts_parts = ts_string.split(".")
        if len(ts_parts) == 2:
            # separate fractional seconds and UTC offset
            offset_sep = "+" if "+" in ts_parts[1] else "-"
            ts_frac_parts = ts_parts[1].split(offset_sep)
            num_frac_parts = len(ts_frac_parts)

            # pad to 6 digits if fractional seconds present
            ts_frac_parts[0] = ts_frac_parts[0].ljust(6, "0")
            ts_string = "".join(
                [
                    ts_parts[0],  # date and time
                    ".",
                    ts_frac_parts[0],  # fractional seconds
                    offset_sep if num_frac_parts > 1 else "",
                    ts_frac_parts[1] if num_frac_parts > 1 else "",  # utc offset (if present)
                ]
            )

        # return literal with no timezone, e.g. turn '2020-01-01 12:13:14-08:00' into '2020-01-01 12:13:14'
        # this is because Clickhouse encodes the timezone as a data type parameter and throws an error if
        # it's part of the timestamp string
        ts_without_tz = (
            datetime.datetime.fromisoformat(ts_string).replace(tzinfo=None).isoformat(sep=" ")
        )
        ts = exp.Literal.string(ts_without_tz)

    # Non-nullable DateTime64 with microsecond precision
    expressions = [exp.DataTypeParam(this=tz)] if tz else []
    datatype = exp.DataType.build(
        exp.DataType.Type.DATETIME64,
        expressions=[exp.DataTypeParam(this=exp.Literal.number(6)), *expressions],
        nullable=False,
    )

    return self.sql(exp.cast(ts, datatype, dialect=self.dialect))


def _map_sql(self: ClickHouse.Generator, expression: exp.Map | exp.VarMap) -> str:
    if not (expression.parent and expression.parent.arg_key == "settings"):
        return _lower_func(var_map_sql(self, expression))

    keys = expression.args.get("keys")
    values = expression.args.get("values")

    if not isinstance(keys, exp.Array) or not isinstance(values, exp.Array):
        self.unsupported("Cannot convert array columns into map.")
        return ""

    args = []
    for key, value in zip(keys.expressions, values.expressions):
        args.append(f"{self.sql(key)}: {self.sql(value)}")

    csv_args = ", ".join(args)

    return f"{{{csv_args}}}"
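

# Illustrative sketch (not part of the module): what _timestrtotime_sql does to a
# literal with a UTC offset and short fractional seconds, per the padding logic above:
#
#   '2020-01-01 12:13:14.1-08:00'
#     -> pad fraction to 6 digits:  '2020-01-01 12:13:14.100000-08:00'
#     -> strip offset for the cast: '2020-01-01 12:13:14.100000'
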

class ClickHouse(Dialect):
    NORMALIZE_FUNCTIONS: bool | str = False
    NULL_ORDERING = "nulls_are_last"
    SUPPORTS_USER_DEFINED_TYPES = False
    SAFE_DIVISION = True
    LOG_BASE_FIRST: t.Optional[bool] = None
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE

    UNESCAPED_SEQUENCES = {
        "\\0": "\0",
    }

    CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"}

    SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = {
        exp.Except: False,
        exp.Intersect: False,
        exp.Union: None,
    }

    def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]:
        # Clickhouse allows VALUES to have an embedded structure e.g:
        # VALUES('person String, place String', ('Noah', 'Paris'), ...)
        # In this case, we don't want to qualify the columns
        values = expression.expressions[0].expressions

        structure = (
            values[0]
            if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple))
            else None
        )
        if structure:
            # Split each column definition into the column name e.g:
            # 'person String, place String' -> ['person', 'place']
            structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")]
            column_aliases = [
                exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs
            ]
        else:
            # Default column aliases in CH are "c1", "c2", etc.
            column_aliases = [
                exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions))
            ]

        return column_aliases

    class Tokenizer(tokens.Tokenizer):
        COMMENTS = ["--", "#", "#!", ("/*", "*/")]
        IDENTIFIERS = ['"', "`"]
        IDENTIFIER_ESCAPES = ["\\"]
        STRING_ESCAPES = ["'", "\\"]
        BIT_STRINGS = [("0b", "")]
        HEX_STRINGS = [("0x", ""), ("0X", "")]
        HEREDOC_STRINGS = ["$"]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            ".:": TokenType.DOTCOLON,
            "ATTACH": TokenType.COMMAND,
            "DATE32": TokenType.DATE32,
            "DATETIME64": TokenType.DATETIME64,
            "DICTIONARY": TokenType.DICTIONARY,
            "DYNAMIC": TokenType.DYNAMIC,
            "ENUM8": TokenType.ENUM8,
            "ENUM16": TokenType.ENUM16,
            "EXCHANGE": TokenType.COMMAND,
            "FINAL": TokenType.FINAL,
            "FIXEDSTRING": TokenType.FIXEDSTRING,
            "FLOAT32": TokenType.FLOAT,
            "FLOAT64": TokenType.DOUBLE,
            "GLOBAL": TokenType.GLOBAL,
            "LOWCARDINALITY": TokenType.LOWCARDINALITY,
            "MAP": TokenType.MAP,
            "NESTED": TokenType.NESTED,
            "NOTHING": TokenType.NOTHING,
            "SAMPLE": TokenType.TABLE_SAMPLE,
            "TUPLE": TokenType.STRUCT,
            "UINT16": TokenType.USMALLINT,
            "UINT32": TokenType.UINT,
            "UINT64": TokenType.UBIGINT,
            "UINT8": TokenType.UTINYINT,
            "IPV4": TokenType.IPV4,
            "IPV6": TokenType.IPV6,
            "POINT": TokenType.POINT,
            "RING": TokenType.RING,
            "LINESTRING": TokenType.LINESTRING,
            "MULTILINESTRING": TokenType.MULTILINESTRING,
            "POLYGON": TokenType.POLYGON,
            "MULTIPOLYGON": TokenType.MULTIPOLYGON,
            "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION,
            "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION,
            "SYSTEM": TokenType.COMMAND,
            "PREWHERE": TokenType.PREWHERE,
        }
        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.HEREDOC_STRING,
        }
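
    # Illustrative sketch (not part of the module): how generate_values_aliases
    # above names VALUES columns:
    #
    #   VALUES('person String, place String', ('Noah', 'Paris'))  -> person, place
    #   VALUES((1, 2), (3, 4))                                    -> c1, c2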
316 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 317 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 318 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 319 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 320 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 321 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 322 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 323 "JSONEXTRACTSTRING": build_json_extract_path( 324 exp.JSONExtractScalar, zero_based_indexing=False 325 ), 326 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 327 "MAP": parser.build_var_map, 328 "MATCH": exp.RegexpLike.from_arg_list, 329 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 330 "RANDCANONICAL": exp.Rand.from_arg_list, 331 "STR_TO_DATE": _build_str_to_date, 332 "TUPLE": exp.Struct.from_arg_list, 333 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 334 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 335 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 336 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 337 "UNIQ": exp.ApproxDistinct.from_arg_list, 338 "XOR": lambda args: exp.Xor(expressions=args), 339 "MD5": exp.MD5Digest.from_arg_list, 340 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 341 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 342 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 343 "TOTYPENAME": exp.Typeof.from_arg_list, 344 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 345 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 346 } 347 FUNCTIONS.pop("TRANSFORM") 348 FUNCTIONS.pop("APPROX_TOP_SUM") 349 350 AGG_FUNCTIONS = { 351 "count", 352 "min", 353 "max", 354 "sum", 355 "avg", 356 "any", 357 "stddevPop", 358 "stddevSamp", 359 "varPop", 360 "varSamp", 361 "corr", 362 "covarPop", 363 "covarSamp", 364 "entropy", 365 "exponentialMovingAverage", 366 "intervalLengthSum", 367 "kolmogorovSmirnovTest", 368 "mannWhitneyUTest", 369 "median", 370 "rankCorr", 371 "sumKahan", 372 "studentTTest", 373 "welchTTest", 374 "anyHeavy", 375 "anyLast", 376 "boundingRatio", 377 "first_value", 378 "last_value", 379 "argMin", 380 "argMax", 381 "avgWeighted", 382 "topK", 383 "approx_top_sum", 384 "topKWeighted", 385 "deltaSum", 386 "deltaSumTimestamp", 387 "groupArray", 388 "groupArrayLast", 389 "groupUniqArray", 390 "groupArrayInsertAt", 391 "groupArrayMovingAvg", 392 "groupArrayMovingSum", 393 "groupArraySample", 394 "groupBitAnd", 395 "groupBitOr", 396 "groupBitXor", 397 "groupBitmap", 398 "groupBitmapAnd", 399 "groupBitmapOr", 400 "groupBitmapXor", 401 "sumWithOverflow", 402 "sumMap", 403 "minMap", 404 "maxMap", 405 "skewSamp", 406 "skewPop", 407 "kurtSamp", 408 "kurtPop", 409 "uniq", 410 "uniqExact", 411 "uniqCombined", 412 "uniqCombined64", 413 "uniqHLL12", 414 "uniqTheta", 415 "quantile", 416 "quantiles", 417 "quantileExact", 418 "quantilesExact", 419 "quantileExactLow", 420 "quantilesExactLow", 421 "quantileExactHigh", 422 "quantilesExactHigh", 423 "quantileExactWeighted", 424 "quantilesExactWeighted", 425 "quantileTiming", 426 "quantilesTiming", 427 "quantileTimingWeighted", 428 "quantilesTimingWeighted", 429 "quantileDeterministic", 430 "quantilesDeterministic", 431 "quantileTDigest", 432 "quantilesTDigest", 433 "quantileTDigestWeighted", 434 "quantilesTDigestWeighted", 435 
"quantileBFloat16", 436 "quantilesBFloat16", 437 "quantileBFloat16Weighted", 438 "quantilesBFloat16Weighted", 439 "simpleLinearRegression", 440 "stochasticLinearRegression", 441 "stochasticLogisticRegression", 442 "categoricalInformationValue", 443 "contingency", 444 "cramersV", 445 "cramersVBiasCorrected", 446 "theilsU", 447 "maxIntersections", 448 "maxIntersectionsPosition", 449 "meanZTest", 450 "quantileInterpolatedWeighted", 451 "quantilesInterpolatedWeighted", 452 "quantileGK", 453 "quantilesGK", 454 "sparkBar", 455 "sumCount", 456 "largestTriangleThreeBuckets", 457 "histogram", 458 "sequenceMatch", 459 "sequenceCount", 460 "windowFunnel", 461 "retention", 462 "uniqUpTo", 463 "sequenceNextNode", 464 "exponentialTimeDecayedAvg", 465 } 466 467 AGG_FUNCTIONS_SUFFIXES = [ 468 "If", 469 "Array", 470 "ArrayIf", 471 "Map", 472 "SimpleState", 473 "State", 474 "Merge", 475 "MergeState", 476 "ForEach", 477 "Distinct", 478 "OrDefault", 479 "OrNull", 480 "Resample", 481 "ArgMin", 482 "ArgMax", 483 ] 484 485 FUNC_TOKENS = { 486 *parser.Parser.FUNC_TOKENS, 487 TokenType.AND, 488 TokenType.OR, 489 TokenType.SET, 490 } 491 492 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 493 494 ID_VAR_TOKENS = { 495 *parser.Parser.ID_VAR_TOKENS, 496 TokenType.LIKE, 497 } 498 499 AGG_FUNC_MAPPING = ( 500 lambda functions, suffixes: { 501 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 502 } 503 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 504 505 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"} 506 507 FUNCTION_PARSERS = { 508 **parser.Parser.FUNCTION_PARSERS, 509 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 510 "QUANTILE": lambda self: self._parse_quantile(), 511 "MEDIAN": lambda self: self._parse_quantile(), 512 "COLUMNS": lambda self: self._parse_columns(), 513 } 514 515 FUNCTION_PARSERS.pop("MATCH") 516 517 PROPERTY_PARSERS = { 518 **parser.Parser.PROPERTY_PARSERS, 519 "ENGINE": lambda self: self._parse_engine_property(), 520 } 521 PROPERTY_PARSERS.pop("DYNAMIC") 522 523 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 524 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 525 526 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 527 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 528 529 RANGE_PARSERS = { 530 **parser.Parser.RANGE_PARSERS, 531 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 532 } 533 534 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 535 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 

        FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"}

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()),
            "QUANTILE": lambda self: self._parse_quantile(),
            "MEDIAN": lambda self: self._parse_quantile(),
            "COLUMNS": lambda self: self._parse_columns(),
        }

        FUNCTION_PARSERS.pop("MATCH")

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "ENGINE": lambda self: self._parse_engine_property(),
        }
        PROPERTY_PARSERS.pop("DYNAMIC")

        NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy()
        NO_PAREN_FUNCTION_PARSERS.pop("ANY")

        NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy()
        NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP)

        RANGE_PARSERS = {
            **parser.Parser.RANGE_PARSERS,
            TokenType.GLOBAL: lambda self, this: self._parse_global_in(this),
        }

        # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to
        # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler.
        COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy()
        COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER)

        JOIN_KINDS = {
            *parser.Parser.JOIN_KINDS,
            TokenType.ANY,
            TokenType.ASOF,
            TokenType.ARRAY,
        }

        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {
            TokenType.ANY,
            TokenType.ARRAY,
            TokenType.FINAL,
            TokenType.FORMAT,
            TokenType.SETTINGS,
        }

        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {
            TokenType.FORMAT,
        }

        LOG_DEFAULTS_TO_LN = True

        QUERY_MODIFIER_PARSERS = {
            **parser.Parser.QUERY_MODIFIER_PARSERS,
            TokenType.SETTINGS: lambda self: (
                "settings",
                self._advance() or self._parse_csv(self._parse_assignment),
            ),
            TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "INDEX": lambda self: self._parse_index_constraint(),
            "CODEC": lambda self: self._parse_compress(),
        }

        ALTER_PARSERS = {
            **parser.Parser.ALTER_PARSERS,
            "REPLACE": lambda self: self._parse_alter_table_replace(),
        }

        SCHEMA_UNNAMED_CONSTRAINTS = {
            *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS,
            "INDEX",
        }

        PLACEHOLDER_PARSERS = {
            **parser.Parser.PLACEHOLDER_PARSERS,
            TokenType.L_BRACE: lambda self: self._parse_query_parameter(),
        }

        def _parse_engine_property(self) -> exp.EngineProperty:
            self._match(TokenType.EQ)
            return self.expression(
                exp.EngineProperty,
                this=self._parse_field(any_token=True, anonymous_func=True),
            )

        # https://clickhouse.com/docs/en/sql-reference/statements/create/function
        def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]:
            return self._parse_lambda()

        def _parse_types(
            self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True
        ) -> t.Optional[exp.Expression]:
            dtype = super()._parse_types(
                check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
            )
            if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True:
                # Mark every type as non-nullable which is ClickHouse's default, unless it's
                # already marked as nullable. This marker helps us transpile types from other
                # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))`
                # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would
                # fail in ClickHouse without the `Nullable` type constructor.
                dtype.set("nullable", False)

            return dtype
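
        # Illustrative sketch (not part of the module): because _parse_types marks
        # types non-nullable by default, nullable-by-default types from other
        # dialects get wrapped explicitly, as the comment above describes:
        #
        #   import sqlglot
        #   sqlglot.transpile("CAST(x AS TEXT)", write="clickhouse")
        #   # expected: ["CAST(x AS Nullable(String))"]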

        def _parse_extract(self) -> exp.Extract | exp.Anonymous:
            index = self._index
            this = self._parse_bitwise()
            if self._match(TokenType.FROM):
                self._retreat(index)
                return super()._parse_extract()

            # We return Anonymous here because extract and regexpExtract have different semantics,
            # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g.,
            # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`.
            #
            # TODO: can we somehow convert the former into an equivalent `regexpExtract` call?
            self._match(TokenType.COMMA)
            return self.expression(
                exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()]
            )

        def _parse_assignment(self) -> t.Optional[exp.Expression]:
            this = super()._parse_assignment()

            if self._match(TokenType.PLACEHOLDER):
                return self.expression(
                    exp.If,
                    this=this,
                    true=self._parse_assignment(),
                    false=self._match(TokenType.COLON) and self._parse_assignment(),
                )

            return this

        def _parse_query_parameter(self) -> t.Optional[exp.Expression]:
            """
            Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier}
            https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters
            """
            index = self._index

            this = self._parse_id_var()
            self._match(TokenType.COLON)
            kind = self._parse_types(check_func=False, allow_identifiers=False) or (
                self._match_text_seq("IDENTIFIER") and "Identifier"
            )

            if not kind:
                self._retreat(index)
                return None
            elif not self._match(TokenType.R_BRACE):
                self.raise_error("Expecting }")

            if isinstance(this, exp.Identifier) and not this.quoted:
                this = exp.var(this.name)

            return self.expression(exp.Placeholder, this=this, kind=kind)
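
        # Illustrative sketch (not part of the module): ClickHouse query parameters
        # parse into exp.Placeholder nodes and are rendered back by placeholder_sql
        # in the Generator below:
        #
        #   SELECT {abc: UInt32}      -> Placeholder(this=abc, kind=UInt32)
        #   FROM {table: Identifier}  -> Placeholder(this=table, kind="Identifier")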

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            l_brace = self._match(TokenType.L_BRACE, advance=False)
            bracket = super()._parse_bracket(this)

            if l_brace and isinstance(bracket, exp.Struct):
                varmap = exp.VarMap(keys=exp.Array(), values=exp.Array())
                for expression in bracket.expressions:
                    if not isinstance(expression, exp.PropertyEQ):
                        break

                    varmap.args["keys"].append("expressions", exp.Literal.string(expression.name))
                    varmap.args["values"].append("expressions", expression.expression)

                return varmap

            return bracket

        def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In:
            this = super()._parse_in(this)
            this.set("is_global", is_global)
            return this

        def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In:
            is_negated = self._match(TokenType.NOT)
            this = self._match(TokenType.IN) and self._parse_in(this, is_global=True)
            return self.expression(exp.Not, this=this) if is_negated else this

        def _parse_table(
            self,
            schema: bool = False,
            joins: bool = False,
            alias_tokens: t.Optional[t.Collection[TokenType]] = None,
            parse_bracket: bool = False,
            is_db_reference: bool = False,
            parse_partition: bool = False,
            consume_pipe: bool = False,
        ) -> t.Optional[exp.Expression]:
            this = super()._parse_table(
                schema=schema,
                joins=joins,
                alias_tokens=alias_tokens,
                parse_bracket=parse_bracket,
                is_db_reference=is_db_reference,
            )

            if isinstance(this, exp.Table):
                inner = this.this
                alias = this.args.get("alias")

                if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns:
                    alias.set("columns", [exp.to_identifier("generate_series")])

            if self._match(TokenType.FINAL):
                this = self.expression(exp.Final, this=this)

            return this

        def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition:
            return super()._parse_position(haystack_first=True)

        # https://clickhouse.com/docs/en/sql-reference/statements/select/with/
        def _parse_cte(self) -> t.Optional[exp.CTE]:
            # WITH <identifier> AS <subquery expression>
            cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte)

            if not cte:
                # WITH <expression> AS <identifier>
                cte = self.expression(
                    exp.CTE,
                    this=self._parse_assignment(),
                    alias=self._parse_table_alias(),
                    scalar=True,
                )

            return cte

        def _parse_join_parts(
            self,
        ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]:
            is_global = self._match(TokenType.GLOBAL) and self._prev
            kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev

            if kind_pre:
                kind = self._match_set(self.JOIN_KINDS) and self._prev
                side = self._match_set(self.JOIN_SIDES) and self._prev
                return is_global, side, kind

            return (
                is_global,
                self._match_set(self.JOIN_SIDES) and self._prev,
                self._match_set(self.JOIN_KINDS) and self._prev,
            )

        def _parse_join(
            self, skip_join_token: bool = False, parse_bracket: bool = False
        ) -> t.Optional[exp.Join]:
            join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True)
            if join:
                join.set("global", join.args.pop("method", None))

                # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table`
                # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join
                if join.kind == "ARRAY":
                    for table in join.find_all(exp.Table):
                        table.replace(table.to_column())

            return join

        def _parse_function(
            self,
            functions: t.Optional[t.Dict[str, t.Callable]] = None,
            anonymous: bool = False,
            optional_parens: bool = True,
            any_token: bool = False,
        ) -> t.Optional[exp.Expression]:
            expr = super()._parse_function(
                functions=functions,
                anonymous=anonymous,
                optional_parens=optional_parens,
                any_token=any_token,
            )

            func = expr.this if isinstance(expr, exp.Window) else expr

            # Aggregate functions can be split in 2 parts: <func_name><suffix>
            parts = (
                self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None
            )

            if parts:
                anon_func: exp.Anonymous = t.cast(exp.Anonymous, func)
                params = self._parse_func_params(anon_func)

                kwargs = {
                    "this": anon_func.this,
                    "expressions": anon_func.expressions,
                }
                if parts[1]:
                    exp_class: t.Type[exp.Expression] = (
                        exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc
                    )
                else:
                    exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc

                kwargs["exp_class"] = exp_class
                if params:
                    kwargs["params"] = params

                func = self.expression(**kwargs)

                if isinstance(expr, exp.Window):
                    # The window's func was parsed as Anonymous in base parser, fix its
                    # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc
                    expr.set("this", func)
                elif params:
                    # Params have blocked super()._parse_function() from parsing the following window
                    # (if that exists) as they're standing between the function call and the window spec
                    expr = self._parse_window(func)
                else:
                    expr = func

            return expr

        def _parse_func_params(
            self, this: t.Optional[exp.Func] = None
        ) -> t.Optional[t.List[exp.Expression]]:
            if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN):
                return self._parse_csv(self._parse_lambda)

            if self._match(TokenType.L_PAREN):
                params = self._parse_csv(self._parse_lambda)
                self._match_r_paren(this)
                return params

            return None
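
        # Illustrative sketch (not part of the module): _parse_func_params handles
        # ClickHouse's parameterized-aggregate call shape, where a parameter list
        # precedes the argument list:
        #
        #   quantile(0.5)(x)  -> exp.Quantile(this=x, quantile=0.5)
        #   topK(10)(x)       -> a ParameterizedAgg keeping both lists separate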

        def _parse_quantile(self) -> exp.Quantile:
            this = self._parse_lambda()
            params = self._parse_func_params()
            if params:
                return self.expression(exp.Quantile, this=params[0], quantile=this)
            return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5))

        def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]:
            return super()._parse_wrapped_id_vars(optional=True)

        def _parse_primary_key(
            self, wrapped_optional: bool = False, in_props: bool = False
        ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey:
            return super()._parse_primary_key(
                wrapped_optional=wrapped_optional or in_props, in_props=in_props
            )

        def _parse_on_property(self) -> t.Optional[exp.Expression]:
            index = self._index
            if self._match_text_seq("CLUSTER"):
                this = self._parse_string() or self._parse_id_var()
                if this:
                    return self.expression(exp.OnCluster, this=this)
                else:
                    self._retreat(index)

            return None

        def _parse_index_constraint(
            self, kind: t.Optional[str] = None
        ) -> exp.IndexColumnConstraint:
            # INDEX name1 expr TYPE type1(args) GRANULARITY value
            this = self._parse_id_var()
            expression = self._parse_assignment()

            index_type = self._match_text_seq("TYPE") and (
                self._parse_function() or self._parse_var()
            )

            granularity = self._match_text_seq("GRANULARITY") and self._parse_term()

            return self.expression(
                exp.IndexColumnConstraint,
                this=this,
                expression=expression,
                index_type=index_type,
                granularity=granularity,
            )

        def _parse_partition(self) -> t.Optional[exp.Partition]:
            # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression
            if not self._match(TokenType.PARTITION):
                return None

            if self._match_text_seq("ID"):
                # Corresponds to the PARTITION ID <string_value> syntax
                expressions: t.List[exp.Expression] = [
                    self.expression(exp.PartitionId, this=self._parse_string())
                ]
            else:
                expressions = self._parse_expressions()

            return self.expression(exp.Partition, expressions=expressions)

        def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]:
            partition = self._parse_partition()

            if not partition or not self._match(TokenType.FROM):
                return None

            return self.expression(
                exp.ReplacePartition, expression=partition, source=self._parse_table_parts()
            )

        def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]:
            if not self._match_text_seq("PROJECTION"):
                return None

            return self.expression(
                exp.ProjectionDef,
                this=self._parse_id_var(),
                expression=self._parse_wrapped(self._parse_statement),
            )

        def _parse_constraint(self) -> t.Optional[exp.Expression]:
            return super()._parse_constraint() or self._parse_projection_def()

        def _parse_alias(
            self, this: t.Optional[exp.Expression], explicit: bool = False
        ) -> t.Optional[exp.Expression]:
            # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier,
            # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias
            if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False):
                return this

            return super()._parse_alias(this=this, explicit=explicit)
However, "SELECT <expr> apply" is a valid alias 940 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 941 return this 942 943 return super()._parse_alias(this=this, explicit=explicit) 944 945 def _parse_expression(self) -> t.Optional[exp.Expression]: 946 this = super()._parse_expression() 947 948 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 949 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 950 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 951 self._match(TokenType.R_PAREN) 952 953 return this 954 955 def _parse_columns(self) -> exp.Expression: 956 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 957 958 while self._next and self._match_text_seq(")", "APPLY", "("): 959 self._match(TokenType.R_PAREN) 960 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 961 return this 962 963 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 964 value = super()._parse_value(values=values) 965 if not value: 966 return None 967 968 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 969 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 970 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 971 # but the final result is not altered by the extra parentheses. 972 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 973 expressions = value.expressions 974 if values and not isinstance(expressions[-1], exp.Tuple): 975 value.set( 976 "expressions", 977 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 978 ) 979 980 return value 981 982 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 983 # ClickHouse allows custom expressions as partition key 984 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 985 return self.expression( 986 exp.PartitionedByProperty, 987 this=self._parse_assignment(), 988 ) 989 990 class Generator(generator.Generator): 991 QUERY_HINTS = False 992 STRUCT_DELIMITER = ("(", ")") 993 NVL2_SUPPORTED = False 994 TABLESAMPLE_REQUIRES_PARENS = False 995 TABLESAMPLE_SIZE_IS_ROWS = False 996 TABLESAMPLE_KEYWORDS = "SAMPLE" 997 LAST_DAY_SUPPORTS_DATE_PART = False 998 CAN_IMPLEMENT_ARRAY_ANY = True 999 SUPPORTS_TO_NUMBER = False 1000 JOIN_HINTS = False 1001 TABLE_HINTS = False 1002 GROUPINGS_SEP = "" 1003 SET_OP_MODIFIERS = False 1004 ARRAY_SIZE_NAME = "LENGTH" 1005 WRAP_DERIVED_VALUES = False 1006 1007 STRING_TYPE_MAPPING = { 1008 exp.DataType.Type.BLOB: "String", 1009 exp.DataType.Type.CHAR: "String", 1010 exp.DataType.Type.LONGBLOB: "String", 1011 exp.DataType.Type.LONGTEXT: "String", 1012 exp.DataType.Type.MEDIUMBLOB: "String", 1013 exp.DataType.Type.MEDIUMTEXT: "String", 1014 exp.DataType.Type.TINYBLOB: "String", 1015 exp.DataType.Type.TINYTEXT: "String", 1016 exp.DataType.Type.TEXT: "String", 1017 exp.DataType.Type.VARBINARY: "String", 1018 exp.DataType.Type.VARCHAR: "String", 1019 } 1020 1021 SUPPORTED_JSON_PATH_PARTS = { 1022 exp.JSONPathKey, 1023 exp.JSONPathRoot, 1024 exp.JSONPathSubscript, 1025 } 1026 1027 TYPE_MAPPING = { 1028 **generator.Generator.TYPE_MAPPING, 1029 **STRING_TYPE_MAPPING, 1030 exp.DataType.Type.ARRAY: "Array", 1031 exp.DataType.Type.BOOLEAN: "Bool", 1032 exp.DataType.Type.BIGINT: "Int64", 1033 exp.DataType.Type.DATE32: "Date32", 1034 

    class Generator(generator.Generator):
        QUERY_HINTS = False
        STRUCT_DELIMITER = ("(", ")")
        NVL2_SUPPORTED = False
        TABLESAMPLE_REQUIRES_PARENS = False
        TABLESAMPLE_SIZE_IS_ROWS = False
        TABLESAMPLE_KEYWORDS = "SAMPLE"
        LAST_DAY_SUPPORTS_DATE_PART = False
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        JOIN_HINTS = False
        TABLE_HINTS = False
        GROUPINGS_SEP = ""
        SET_OP_MODIFIERS = False
        ARRAY_SIZE_NAME = "LENGTH"
        WRAP_DERIVED_VALUES = False

        STRING_TYPE_MAPPING = {
            exp.DataType.Type.BLOB: "String",
            exp.DataType.Type.CHAR: "String",
            exp.DataType.Type.LONGBLOB: "String",
            exp.DataType.Type.LONGTEXT: "String",
            exp.DataType.Type.MEDIUMBLOB: "String",
            exp.DataType.Type.MEDIUMTEXT: "String",
            exp.DataType.Type.TINYBLOB: "String",
            exp.DataType.Type.TINYTEXT: "String",
            exp.DataType.Type.TEXT: "String",
            exp.DataType.Type.VARBINARY: "String",
            exp.DataType.Type.VARCHAR: "String",
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            **STRING_TYPE_MAPPING,
            exp.DataType.Type.ARRAY: "Array",
            exp.DataType.Type.BOOLEAN: "Bool",
            exp.DataType.Type.BIGINT: "Int64",
            exp.DataType.Type.DATE32: "Date32",
            exp.DataType.Type.DATETIME: "DateTime",
            exp.DataType.Type.DATETIME2: "DateTime",
            exp.DataType.Type.SMALLDATETIME: "DateTime",
            exp.DataType.Type.DATETIME64: "DateTime64",
            exp.DataType.Type.DECIMAL: "Decimal",
            exp.DataType.Type.DECIMAL32: "Decimal32",
            exp.DataType.Type.DECIMAL64: "Decimal64",
            exp.DataType.Type.DECIMAL128: "Decimal128",
            exp.DataType.Type.DECIMAL256: "Decimal256",
            exp.DataType.Type.TIMESTAMP: "DateTime",
            exp.DataType.Type.TIMESTAMPNTZ: "DateTime",
            exp.DataType.Type.TIMESTAMPTZ: "DateTime",
            exp.DataType.Type.DOUBLE: "Float64",
            exp.DataType.Type.ENUM: "Enum",
            exp.DataType.Type.ENUM8: "Enum8",
            exp.DataType.Type.ENUM16: "Enum16",
            exp.DataType.Type.FIXEDSTRING: "FixedString",
            exp.DataType.Type.FLOAT: "Float32",
            exp.DataType.Type.INT: "Int32",
            exp.DataType.Type.MEDIUMINT: "Int32",
            exp.DataType.Type.INT128: "Int128",
            exp.DataType.Type.INT256: "Int256",
            exp.DataType.Type.LOWCARDINALITY: "LowCardinality",
            exp.DataType.Type.MAP: "Map",
            exp.DataType.Type.NESTED: "Nested",
            exp.DataType.Type.NOTHING: "Nothing",
            exp.DataType.Type.SMALLINT: "Int16",
            exp.DataType.Type.STRUCT: "Tuple",
            exp.DataType.Type.TINYINT: "Int8",
            exp.DataType.Type.UBIGINT: "UInt64",
            exp.DataType.Type.UINT: "UInt32",
            exp.DataType.Type.UINT128: "UInt128",
            exp.DataType.Type.UINT256: "UInt256",
            exp.DataType.Type.USMALLINT: "UInt16",
            exp.DataType.Type.UTINYINT: "UInt8",
            exp.DataType.Type.IPV4: "IPv4",
            exp.DataType.Type.IPV6: "IPv6",
            exp.DataType.Type.POINT: "Point",
            exp.DataType.Type.RING: "Ring",
            exp.DataType.Type.LINESTRING: "LineString",
            exp.DataType.Type.MULTILINESTRING: "MultiLineString",
            exp.DataType.Type.POLYGON: "Polygon",
            exp.DataType.Type.MULTIPOLYGON: "MultiPolygon",
            exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction",
            exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction",
            exp.DataType.Type.DYNAMIC: "Dynamic",
        }

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.AnyValue: rename_func("any"),
            exp.ApproxDistinct: rename_func("uniq"),
            exp.ArrayConcat: rename_func("arrayConcat"),
            exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this),
            exp.ArrayRemove: remove_from_array_using_filter,
            exp.ArrayReverse: rename_func("arrayReverse"),
            exp.ArraySlice: rename_func("arraySlice"),
            exp.ArraySum: rename_func("arraySum"),
            exp.ArgMax: arg_max_or_min_no_count("argMax"),
            exp.ArgMin: arg_max_or_min_no_count("argMin"),
            exp.Array: inline_array_sql,
            exp.CastToStrType: rename_func("CAST"),
            exp.CountIf: rename_func("countIf"),
            exp.CompressColumnConstraint: lambda self,
            e: f"CODEC({self.expressions(e, key='this', flat=True)})",
            exp.ComputedColumnConstraint: lambda self,
            e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}",
            exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"),
            exp.DateAdd: _datetime_delta_sql("DATE_ADD"),
            exp.DateDiff: _datetime_delta_sql("DATE_DIFF"),
            exp.DateStrToDate: rename_func("toDate"),
            exp.DateSub: _datetime_delta_sql("DATE_SUB"),
            exp.Explode: rename_func("arrayJoin"),
            exp.FarmFingerprint: rename_func("farmFingerprint64"),
            exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL",
            exp.IsNan: rename_func("isNaN"),
            exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}",
            exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False),
            exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False),
            exp.JSONPathKey: json_path_key_only_name,
            exp.JSONPathRoot: lambda *_: "",
            exp.Length: length_or_char_length_sql,
            exp.Map: _map_sql,
            exp.Median: rename_func("median"),
            exp.Nullif: rename_func("nullIf"),
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.Pivot: no_pivot_sql,
            exp.Quantile: _quantile_sql,
            exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression),
            exp.Rand: rename_func("randCanonical"),
            exp.StartsWith: rename_func("startsWith"),
            exp.EndsWith: rename_func("endsWith"),
            exp.StrPosition: lambda self, e: strposition_sql(
                self,
                e,
                func_name="POSITION",
                supports_position=True,
                use_ansi_position=False,
            ),
            exp.TimeToStr: lambda self, e: self.func(
                "formatDateTime", e.this, self.format_time(e), e.args.get("zone")
            ),
            exp.TimeStrToTime: _timestrtotime_sql,
            exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"),
            exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"),
            exp.Typeof: rename_func("toTypeName"),
            exp.VarMap: _map_sql,
            exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions),
            exp.MD5Digest: rename_func("MD5"),
            exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.UnixToTime: _unix_to_time_sql,
            exp.TimestampTrunc: timestamptrunc_sql(zone=True),
            exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"),
            exp.Variance: rename_func("varSamp"),
            exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
            exp.Stddev: rename_func("stddevSamp"),
            exp.Chr: rename_func("CHAR"),
            exp.Lag: lambda self, e: self.func(
                "lagInFrame", e.this, e.args.get("offset"), e.args.get("default")
            ),
            exp.Lead: lambda self, e: self.func(
                "leadInFrame", e.this, e.args.get("offset"), e.args.get("default")
            ),
            exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")(
                rename_func("editDistance")
            ),
            exp.ParseDatetime: rename_func("parseDateTime"),
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.OnCluster: exp.Properties.Location.POST_NAME,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.ToTableProperty: exp.Properties.Location.POST_NAME,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # There's no list in docs, but it can be found in Clickhouse code
        # see `ClickHouse/src/Parsers/ParserCreate*.cpp`
        ON_CLUSTER_TARGETS = {
            "SCHEMA",  # Transpiled CREATE SCHEMA may have OnCluster property set
            "DATABASE",
            "TABLE",
            "VIEW",
            "DICTIONARY",
            "INDEX",
            "FUNCTION",
            "NAMED COLLECTION",
        }

        # https://clickhouse.com/docs/en/sql-reference/data-types/nullable
        NON_NULLABLE_TYPES = {
            exp.DataType.Type.ARRAY,
            exp.DataType.Type.MAP,
            exp.DataType.Type.STRUCT,
            exp.DataType.Type.POINT,
            exp.DataType.Type.RING,
            exp.DataType.Type.LINESTRING,
            exp.DataType.Type.MULTILINESTRING,
            exp.DataType.Type.POLYGON,
            exp.DataType.Type.MULTIPOLYGON,
        }
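
        # Illustrative sketch (not part of the module): one of the TRANSFORMS above
        # in action, assuming the public entry point and that the default reader
        # recognizes STARTS_WITH:
        #
        #   import sqlglot
        #   sqlglot.transpile("SELECT STARTS_WITH(x, 'a')", write="clickhouse")
        #   # expected: ["SELECT startsWith(x, 'a')"]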

        def offset_sql(self, expression: exp.Offset) -> str:
            offset = super().offset_sql(expression)

            # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword
            # https://clickhouse.com/docs/sql-reference/statements/select/offset
            parent = expression.parent
            if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch):
                offset = f"{offset} ROWS"

            return offset

        def strtodate_sql(self, expression: exp.StrToDate) -> str:
            strtodate_sql = self.function_fallback_sql(expression)

            if not isinstance(expression.parent, exp.Cast):
                # StrToDate returns DATEs in other dialects (eg. postgres), so
                # this branch aims to improve the transpilation to clickhouse
                return self.cast_sql(exp.cast(expression, "DATE"))

            return strtodate_sql

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"):
                return self.sql(this)

            return super().cast_sql(expression, safe_prefix=safe_prefix)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            dtype = expression.to
            if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True):
                # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T)
                dtype.set("nullable", True)

            return super().cast_sql(expression)

        def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str:
            this = self.json_path_part(expression.this)
            return str(int(this) + 1) if is_int(this) else this

        def likeproperty_sql(self, expression: exp.LikeProperty) -> str:
            return f"AS {self.sql(expression, 'this')}"

        def _any_to_has(
            self,
            expression: exp.EQ | exp.NEQ,
            default: t.Callable[[t.Any], str],
            prefix: str = "",
        ) -> str:
            if isinstance(expression.left, exp.Any):
                arr = expression.left
                this = expression.right
            elif isinstance(expression.right, exp.Any):
                arr = expression.right
                this = expression.left
            else:
                return default(expression)

            return prefix + self.func("has", arr.this.unnest(), this)

        def eq_sql(self, expression: exp.EQ) -> str:
            return self._any_to_has(expression, super().eq_sql)

        def neq_sql(self, expression: exp.NEQ) -> str:
            return self._any_to_has(expression, super().neq_sql, "NOT ")

        def regexpilike_sql(self, expression: exp.RegexpILike) -> str:
            # Manually add a flag to make the search case-insensitive
            regex = self.func("CONCAT", "'(?i)'", expression.expression)
            return self.func("match", expression.this, regex)
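
        # Illustrative sketch (not part of the module): _any_to_has rewrites ANY
        # comparisons into ClickHouse's has(), per eq_sql/neq_sql above:
        #
        #   x = ANY(arr)   -> has(arr, x)
        #   x <> ANY(arr)  -> NOT has(arr, x)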

        def datatype_sql(self, expression: exp.DataType) -> str:
            # String is the standard ClickHouse type, every other variant is just an alias.
            # Additionally, any supplied length parameter will be ignored.
            #
            # https://clickhouse.com/docs/en/sql-reference/data-types/string
            if expression.this in self.STRING_TYPE_MAPPING:
                dtype = "String"
            else:
                dtype = super().datatype_sql(expression)

            # This section changes the type to `Nullable(...)` if the following conditions hold:
            # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable`
            #   and change their semantics
            # - It's not the key type of a `Map`. This is because ClickHouse enforces the following
            #   constraint: "Type of Map key must be a type, that can be represented by integer or
            #   String or FixedString (possibly LowCardinality) or UUID or IPv6"
            # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type
            parent = expression.parent
            nullable = expression.args.get("nullable")
            if nullable is True or (
                nullable is None
                and not (
                    isinstance(parent, exp.DataType)
                    and parent.is_type(exp.DataType.Type.MAP, check_nullable=True)
                    and expression.index in (None, 0)
                )
                and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True)
            ):
                dtype = f"Nullable({dtype})"

            return dtype

        def cte_sql(self, expression: exp.CTE) -> str:
            if expression.args.get("scalar"):
                this = self.sql(expression, "this")
                alias = self.sql(expression, "alias")
                return f"{this} AS {alias}"

            return super().cte_sql(expression)

        def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]:
            return super().after_limit_modifiers(expression) + [
                (
                    self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True)
                    if expression.args.get("settings")
                    else ""
                ),
                (
                    self.seg("FORMAT ") + self.sql(expression, "format")
                    if expression.args.get("format")
                    else ""
                ),
            ]

        def placeholder_sql(self, expression: exp.Placeholder) -> str:
            return f"{{{expression.name}: {self.sql(expression, 'kind')}}}"

        def oncluster_sql(self, expression: exp.OnCluster) -> str:
            return f"ON CLUSTER {self.sql(expression, 'this')}"

        def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str:
            if expression.kind in self.ON_CLUSTER_TARGETS and locations.get(
                exp.Properties.Location.POST_NAME
            ):
                this_name = self.sql(
                    expression.this if isinstance(expression.this, exp.Schema) else expression,
                    "this",
                )
                this_properties = " ".join(
                    [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]]
                )
                this_schema = self.schema_columns_sql(expression.this)
                this_schema = f"{self.sep()}{this_schema}" if this_schema else ""

                return f"{this_name}{self.sep()}{this_properties}{this_schema}"

            return super().createable_sql(expression, locations)

        def create_sql(self, expression: exp.Create) -> str:
            # The comment property comes last in CTAS statements, i.e. after the query
            query = expression.expression
            if isinstance(query, exp.Query):
                comment_prop = expression.find(exp.SchemaCommentProperty)
                if comment_prop:
                    comment_prop.pop()
                query.replace(exp.paren(query))
            else:
                comment_prop = None

            create_sql = super().create_sql(expression)

            comment_sql = self.sql(comment_prop)
            comment_sql = f" {comment_sql}" if comment_sql else ""

            return f"{create_sql}{comment_sql}"

        def prewhere_sql(self, expression: exp.PreWhere) -> str:
            this = self.indent(self.sql(expression, "this"))
            return f"{self.seg('PREWHERE')}{self.sep()}{this}"

        def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str:
            this = self.sql(expression, "this")
            this = f" {this}" if this else ""
            expr = self.sql(expression, "expression")
            expr = f" {expr}" if expr else ""
            index_type = self.sql(expression, "index_type")
            index_type = f" TYPE {index_type}" if index_type else ""
            granularity = self.sql(expression, "granularity")
            granularity = f" GRANULARITY {granularity}" if granularity else ""

            return f"INDEX{this}{expr}{index_type}{granularity}"

        def partition_sql(self, expression: exp.Partition) -> str:
            return f"PARTITION {self.expressions(expression, flat=True)}"

        def partitionid_sql(self, expression: exp.PartitionId) -> str:
            return f"ID {self.sql(expression.this)}"

        def replacepartition_sql(self, expression: exp.ReplacePartition) -> str:
            return (
                f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}"
            )

        def projectiondef_sql(self, expression: exp.ProjectionDef) -> str:
            return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}"

        def is_sql(self, expression: exp.Is) -> str:
            is_sql = super().is_sql(expression)

            if isinstance(expression.parent, exp.Not):
                # value IS NOT NULL -> NOT (value IS NULL)
                is_sql = self.wrap(is_sql)

            return is_sql

        def in_sql(self, expression: exp.In) -> str:
            in_sql = super().in_sql(expression)

            if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"):
                in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1)

            return in_sql

        def not_sql(self, expression: exp.Not) -> str:
            if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"):
                # let `GLOBAL IN` child interpose `NOT`
                return self.sql(expression, "this")

            return super().not_sql(expression)

        def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str:
            # If the VALUES clause contains tuples of expressions, we need to treat it
            # as a table since Clickhouse will automatically alias it as such.
            alias = expression.args.get("alias")

            if alias and alias.args.get("columns") and expression.expressions:
                values = expression.expressions[0].expressions
                values_as_table = any(isinstance(value, exp.Tuple) for value in values)
            else:
                values_as_table = True

            return super().values_sql(expression, values_as_table=values_as_table)
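

# Illustrative sketch (not part of the module): end-to-end usage of this dialect
# through the documented sqlglot entry points:
#
#   import sqlglot
#
#   # Parse ClickHouse-specific syntax.
#   sqlglot.parse_one("SELECT quantile(0.5)(x) FROM t FINAL", read="clickhouse")
#
#   # Generate ClickHouse SQL from another dialect.
#   sqlglot.transpile("SELECT CAST(x AS TEXT) FROM t", read="postgres", write="clickhouse")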
630 self._match(TokenType.COMMA) 631 return self.expression( 632 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 633 ) 634 635 def _parse_assignment(self) -> t.Optional[exp.Expression]: 636 this = super()._parse_assignment() 637 638 if self._match(TokenType.PLACEHOLDER): 639 return self.expression( 640 exp.If, 641 this=this, 642 true=self._parse_assignment(), 643 false=self._match(TokenType.COLON) and self._parse_assignment(), 644 ) 645 646 return this 647 648 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 649 """ 650 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 651 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 652 """ 653 index = self._index 654 655 this = self._parse_id_var() 656 self._match(TokenType.COLON) 657 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 658 self._match_text_seq("IDENTIFIER") and "Identifier" 659 ) 660 661 if not kind: 662 self._retreat(index) 663 return None 664 elif not self._match(TokenType.R_BRACE): 665 self.raise_error("Expecting }") 666 667 if isinstance(this, exp.Identifier) and not this.quoted: 668 this = exp.var(this.name) 669 670 return self.expression(exp.Placeholder, this=this, kind=kind) 671 672 def _parse_bracket( 673 self, this: t.Optional[exp.Expression] = None 674 ) -> t.Optional[exp.Expression]: 675 l_brace = self._match(TokenType.L_BRACE, advance=False) 676 bracket = super()._parse_bracket(this) 677 678 if l_brace and isinstance(bracket, exp.Struct): 679 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 680 for expression in bracket.expressions: 681 if not isinstance(expression, exp.PropertyEQ): 682 break 683 684 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 685 varmap.args["values"].append("expressions", expression.expression) 686 687 return varmap 688 689 return bracket 690 691 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 692 this = super()._parse_in(this) 693 this.set("is_global", is_global) 694 return this 695 696 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 697 is_negated = self._match(TokenType.NOT) 698 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 699 return self.expression(exp.Not, this=this) if is_negated else this 700 701 def _parse_table( 702 self, 703 schema: bool = False, 704 joins: bool = False, 705 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 706 parse_bracket: bool = False, 707 is_db_reference: bool = False, 708 parse_partition: bool = False, 709 consume_pipe: bool = False, 710 ) -> t.Optional[exp.Expression]: 711 this = super()._parse_table( 712 schema=schema, 713 joins=joins, 714 alias_tokens=alias_tokens, 715 parse_bracket=parse_bracket, 716 is_db_reference=is_db_reference, 717 ) 718 719 if isinstance(this, exp.Table): 720 inner = this.this 721 alias = this.args.get("alias") 722 723 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 724 alias.set("columns", [exp.to_identifier("generate_series")]) 725 726 if self._match(TokenType.FINAL): 727 this = self.expression(exp.Final, this=this) 728 729 return this 730 731 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 732 return super()._parse_position(haystack_first=True) 733 734 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 735 def _parse_cte(self) -> t.Optional[exp.CTE]: 736 # WITH <identifier> AS 
<subquery expression> 737 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 738 739 if not cte: 740 # WITH <expression> AS <identifier> 741 cte = self.expression( 742 exp.CTE, 743 this=self._parse_assignment(), 744 alias=self._parse_table_alias(), 745 scalar=True, 746 ) 747 748 return cte 749 750 def _parse_join_parts( 751 self, 752 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 753 is_global = self._match(TokenType.GLOBAL) and self._prev 754 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 755 756 if kind_pre: 757 kind = self._match_set(self.JOIN_KINDS) and self._prev 758 side = self._match_set(self.JOIN_SIDES) and self._prev 759 return is_global, side, kind 760 761 return ( 762 is_global, 763 self._match_set(self.JOIN_SIDES) and self._prev, 764 self._match_set(self.JOIN_KINDS) and self._prev, 765 ) 766 767 def _parse_join( 768 self, skip_join_token: bool = False, parse_bracket: bool = False 769 ) -> t.Optional[exp.Join]: 770 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 771 if join: 772 join.set("global", join.args.pop("method", None)) 773 774 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 775 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 776 if join.kind == "ARRAY": 777 for table in join.find_all(exp.Table): 778 table.replace(table.to_column()) 779 780 return join 781 782 def _parse_function( 783 self, 784 functions: t.Optional[t.Dict[str, t.Callable]] = None, 785 anonymous: bool = False, 786 optional_parens: bool = True, 787 any_token: bool = False, 788 ) -> t.Optional[exp.Expression]: 789 expr = super()._parse_function( 790 functions=functions, 791 anonymous=anonymous, 792 optional_parens=optional_parens, 793 any_token=any_token, 794 ) 795 796 func = expr.this if isinstance(expr, exp.Window) else expr 797 798 # Aggregate functions can be split in 2 parts: <func_name><suffix> 799 parts = ( 800 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 801 ) 802 803 if parts: 804 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 805 params = self._parse_func_params(anon_func) 806 807 kwargs = { 808 "this": anon_func.this, 809 "expressions": anon_func.expressions, 810 } 811 if parts[1]: 812 exp_class: t.Type[exp.Expression] = ( 813 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 814 ) 815 else: 816 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 817 818 kwargs["exp_class"] = exp_class 819 if params: 820 kwargs["params"] = params 821 822 func = self.expression(**kwargs) 823 824 if isinstance(expr, exp.Window): 825 # The window's func was parsed as Anonymous in base parser, fix its 826 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 827 expr.set("this", func) 828 elif params: 829 # Params have blocked super()._parse_function() from parsing the following window 830 # (if that exists) as they're standing between the function call and the window spec 831 expr = self._parse_window(func) 832 else: 833 expr = func 834 835 return expr 836 837 def _parse_func_params( 838 self, this: t.Optional[exp.Func] = None 839 ) -> t.Optional[t.List[exp.Expression]]: 840 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 841 return self._parse_csv(self._parse_lambda) 842 843 if self._match(TokenType.L_PAREN): 844 params = self._parse_csv(self._parse_lambda) 845 self._match_r_paren(this) 846 return params 847 848 return None 849 850 def _parse_quantile(self) -> 
exp.Quantile: 851 this = self._parse_lambda() 852 params = self._parse_func_params() 853 if params: 854 return self.expression(exp.Quantile, this=params[0], quantile=this) 855 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 856 857 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 858 return super()._parse_wrapped_id_vars(optional=True) 859 860 def _parse_primary_key( 861 self, wrapped_optional: bool = False, in_props: bool = False 862 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 863 return super()._parse_primary_key( 864 wrapped_optional=wrapped_optional or in_props, in_props=in_props 865 ) 866 867 def _parse_on_property(self) -> t.Optional[exp.Expression]: 868 index = self._index 869 if self._match_text_seq("CLUSTER"): 870 this = self._parse_string() or self._parse_id_var() 871 if this: 872 return self.expression(exp.OnCluster, this=this) 873 else: 874 self._retreat(index) 875 return None 876 877 def _parse_index_constraint( 878 self, kind: t.Optional[str] = None 879 ) -> exp.IndexColumnConstraint: 880 # INDEX name1 expr TYPE type1(args) GRANULARITY value 881 this = self._parse_id_var() 882 expression = self._parse_assignment() 883 884 index_type = self._match_text_seq("TYPE") and ( 885 self._parse_function() or self._parse_var() 886 ) 887 888 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 889 890 return self.expression( 891 exp.IndexColumnConstraint, 892 this=this, 893 expression=expression, 894 index_type=index_type, 895 granularity=granularity, 896 ) 897 898 def _parse_partition(self) -> t.Optional[exp.Partition]: 899 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 900 if not self._match(TokenType.PARTITION): 901 return None 902 903 if self._match_text_seq("ID"): 904 # Corresponds to the PARTITION ID <string_value> syntax 905 expressions: t.List[exp.Expression] = [ 906 self.expression(exp.PartitionId, this=self._parse_string()) 907 ] 908 else: 909 expressions = self._parse_expressions() 910 911 return self.expression(exp.Partition, expressions=expressions) 912 913 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 914 partition = self._parse_partition() 915 916 if not partition or not self._match(TokenType.FROM): 917 return None 918 919 return self.expression( 920 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 921 ) 922 923 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 924 if not self._match_text_seq("PROJECTION"): 925 return None 926 927 return self.expression( 928 exp.ProjectionDef, 929 this=self._parse_id_var(), 930 expression=self._parse_wrapped(self._parse_statement), 931 ) 932 933 def _parse_constraint(self) -> t.Optional[exp.Expression]: 934 return super()._parse_constraint() or self._parse_projection_def() 935 936 def _parse_alias( 937 self, this: t.Optional[exp.Expression], explicit: bool = False 938 ) -> t.Optional[exp.Expression]: 939 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 940 # so "APPLY" shouldn't be parsed as <expr>'s alias. 
However, "SELECT <expr> apply" is a valid alias 941 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 942 return this 943 944 return super()._parse_alias(this=this, explicit=explicit) 945 946 def _parse_expression(self) -> t.Optional[exp.Expression]: 947 this = super()._parse_expression() 948 949 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 950 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 951 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 952 self._match(TokenType.R_PAREN) 953 954 return this 955 956 def _parse_columns(self) -> exp.Expression: 957 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 958 959 while self._next and self._match_text_seq(")", "APPLY", "("): 960 self._match(TokenType.R_PAREN) 961 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 962 return this 963 964 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 965 value = super()._parse_value(values=values) 966 if not value: 967 return None 968 969 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 970 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 971 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 972 # but the final result is not altered by the extra parentheses. 973 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 974 expressions = value.expressions 975 if values and not isinstance(expressions[-1], exp.Tuple): 976 value.set( 977 "expressions", 978 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 979 ) 980 981 return value 982 983 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 984 # ClickHouse allows custom expressions as partition key 985 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 986 return self.expression( 987 exp.PartitionedByProperty, 988 this=self._parse_assignment(), 989 ) 990 991 class Generator(generator.Generator): 992 QUERY_HINTS = False 993 STRUCT_DELIMITER = ("(", ")") 994 NVL2_SUPPORTED = False 995 TABLESAMPLE_REQUIRES_PARENS = False 996 TABLESAMPLE_SIZE_IS_ROWS = False 997 TABLESAMPLE_KEYWORDS = "SAMPLE" 998 LAST_DAY_SUPPORTS_DATE_PART = False 999 CAN_IMPLEMENT_ARRAY_ANY = True 1000 SUPPORTS_TO_NUMBER = False 1001 JOIN_HINTS = False 1002 TABLE_HINTS = False 1003 GROUPINGS_SEP = "" 1004 SET_OP_MODIFIERS = False 1005 ARRAY_SIZE_NAME = "LENGTH" 1006 WRAP_DERIVED_VALUES = False 1007 1008 STRING_TYPE_MAPPING = { 1009 exp.DataType.Type.BLOB: "String", 1010 exp.DataType.Type.CHAR: "String", 1011 exp.DataType.Type.LONGBLOB: "String", 1012 exp.DataType.Type.LONGTEXT: "String", 1013 exp.DataType.Type.MEDIUMBLOB: "String", 1014 exp.DataType.Type.MEDIUMTEXT: "String", 1015 exp.DataType.Type.TINYBLOB: "String", 1016 exp.DataType.Type.TINYTEXT: "String", 1017 exp.DataType.Type.TEXT: "String", 1018 exp.DataType.Type.VARBINARY: "String", 1019 exp.DataType.Type.VARCHAR: "String", 1020 } 1021 1022 SUPPORTED_JSON_PATH_PARTS = { 1023 exp.JSONPathKey, 1024 exp.JSONPathRoot, 1025 exp.JSONPathSubscript, 1026 } 1027 1028 TYPE_MAPPING = { 1029 **generator.Generator.TYPE_MAPPING, 1030 **STRING_TYPE_MAPPING, 1031 exp.DataType.Type.ARRAY: "Array", 1032 exp.DataType.Type.BOOLEAN: "Bool", 1033 exp.DataType.Type.BIGINT: "Int64", 1034 exp.DataType.Type.DATE32: "Date32", 1035 
exp.DataType.Type.DATETIME: "DateTime", 1036 exp.DataType.Type.DATETIME2: "DateTime", 1037 exp.DataType.Type.SMALLDATETIME: "DateTime", 1038 exp.DataType.Type.DATETIME64: "DateTime64", 1039 exp.DataType.Type.DECIMAL: "Decimal", 1040 exp.DataType.Type.DECIMAL32: "Decimal32", 1041 exp.DataType.Type.DECIMAL64: "Decimal64", 1042 exp.DataType.Type.DECIMAL128: "Decimal128", 1043 exp.DataType.Type.DECIMAL256: "Decimal256", 1044 exp.DataType.Type.TIMESTAMP: "DateTime", 1045 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1046 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1047 exp.DataType.Type.DOUBLE: "Float64", 1048 exp.DataType.Type.ENUM: "Enum", 1049 exp.DataType.Type.ENUM8: "Enum8", 1050 exp.DataType.Type.ENUM16: "Enum16", 1051 exp.DataType.Type.FIXEDSTRING: "FixedString", 1052 exp.DataType.Type.FLOAT: "Float32", 1053 exp.DataType.Type.INT: "Int32", 1054 exp.DataType.Type.MEDIUMINT: "Int32", 1055 exp.DataType.Type.INT128: "Int128", 1056 exp.DataType.Type.INT256: "Int256", 1057 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1058 exp.DataType.Type.MAP: "Map", 1059 exp.DataType.Type.NESTED: "Nested", 1060 exp.DataType.Type.NOTHING: "Nothing", 1061 exp.DataType.Type.SMALLINT: "Int16", 1062 exp.DataType.Type.STRUCT: "Tuple", 1063 exp.DataType.Type.TINYINT: "Int8", 1064 exp.DataType.Type.UBIGINT: "UInt64", 1065 exp.DataType.Type.UINT: "UInt32", 1066 exp.DataType.Type.UINT128: "UInt128", 1067 exp.DataType.Type.UINT256: "UInt256", 1068 exp.DataType.Type.USMALLINT: "UInt16", 1069 exp.DataType.Type.UTINYINT: "UInt8", 1070 exp.DataType.Type.IPV4: "IPv4", 1071 exp.DataType.Type.IPV6: "IPv6", 1072 exp.DataType.Type.POINT: "Point", 1073 exp.DataType.Type.RING: "Ring", 1074 exp.DataType.Type.LINESTRING: "LineString", 1075 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1076 exp.DataType.Type.POLYGON: "Polygon", 1077 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1078 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1079 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1080 exp.DataType.Type.DYNAMIC: "Dynamic", 1081 } 1082 1083 TRANSFORMS = { 1084 **generator.Generator.TRANSFORMS, 1085 exp.AnyValue: rename_func("any"), 1086 exp.ApproxDistinct: rename_func("uniq"), 1087 exp.ArrayConcat: rename_func("arrayConcat"), 1088 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1089 exp.ArrayRemove: remove_from_array_using_filter, 1090 exp.ArrayReverse: rename_func("arrayReverse"), 1091 exp.ArraySlice: rename_func("arraySlice"), 1092 exp.ArraySum: rename_func("arraySum"), 1093 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1094 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1095 exp.Array: inline_array_sql, 1096 exp.CastToStrType: rename_func("CAST"), 1097 exp.CountIf: rename_func("countIf"), 1098 exp.CompressColumnConstraint: lambda self, 1099 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1100 exp.ComputedColumnConstraint: lambda self, 1101 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1102 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1103 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1104 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1105 exp.DateStrToDate: rename_func("toDate"), 1106 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1107 exp.Explode: rename_func("arrayJoin"), 1108 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1109 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1110 exp.IsNan: rename_func("isNaN"), 1111 exp.JSONCast: lambda self, e: f"{self.sql(e, 
'this')}.:{self.sql(e, 'to')}", 1112 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1113 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1114 exp.JSONPathKey: json_path_key_only_name, 1115 exp.JSONPathRoot: lambda *_: "", 1116 exp.Length: length_or_char_length_sql, 1117 exp.Map: _map_sql, 1118 exp.Median: rename_func("median"), 1119 exp.Nullif: rename_func("nullIf"), 1120 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1121 exp.Pivot: no_pivot_sql, 1122 exp.Quantile: _quantile_sql, 1123 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1124 exp.Rand: rename_func("randCanonical"), 1125 exp.StartsWith: rename_func("startsWith"), 1126 exp.EndsWith: rename_func("endsWith"), 1127 exp.StrPosition: lambda self, e: strposition_sql( 1128 self, 1129 e, 1130 func_name="POSITION", 1131 supports_position=True, 1132 use_ansi_position=False, 1133 ), 1134 exp.TimeToStr: lambda self, e: self.func( 1135 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1136 ), 1137 exp.TimeStrToTime: _timestrtotime_sql, 1138 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1139 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1140 exp.Typeof: rename_func("toTypeName"), 1141 exp.VarMap: _map_sql, 1142 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1143 exp.MD5Digest: rename_func("MD5"), 1144 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1145 exp.SHA: rename_func("SHA1"), 1146 exp.SHA2: sha256_sql, 1147 exp.UnixToTime: _unix_to_time_sql, 1148 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1149 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1150 exp.Variance: rename_func("varSamp"), 1151 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1152 exp.Stddev: rename_func("stddevSamp"), 1153 exp.Chr: rename_func("CHAR"), 1154 exp.Lag: lambda self, e: self.func( 1155 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1156 ), 1157 exp.Lead: lambda self, e: self.func( 1158 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1159 ), 1160 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1161 rename_func("editDistance") 1162 ), 1163 exp.ParseDatetime: rename_func("parseDateTime"), 1164 } 1165 1166 PROPERTIES_LOCATION = { 1167 **generator.Generator.PROPERTIES_LOCATION, 1168 exp.OnCluster: exp.Properties.Location.POST_NAME, 1169 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1170 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1171 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1172 } 1173 1174 # There's no list in docs, but it can be found in Clickhouse code 1175 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1176 ON_CLUSTER_TARGETS = { 1177 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1178 "DATABASE", 1179 "TABLE", 1180 "VIEW", 1181 "DICTIONARY", 1182 "INDEX", 1183 "FUNCTION", 1184 "NAMED COLLECTION", 1185 } 1186 1187 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1188 NON_NULLABLE_TYPES = { 1189 exp.DataType.Type.ARRAY, 1190 exp.DataType.Type.MAP, 1191 exp.DataType.Type.STRUCT, 1192 exp.DataType.Type.POINT, 1193 exp.DataType.Type.RING, 1194 exp.DataType.Type.LINESTRING, 1195 exp.DataType.Type.MULTILINESTRING, 1196 exp.DataType.Type.POLYGON, 1197 exp.DataType.Type.MULTIPOLYGON, 1198 } 1199 1200 def offset_sql(self, 
expression: exp.Offset) -> str: 1201 offset = super().offset_sql(expression) 1202 1203 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1204 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1205 parent = expression.parent 1206 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1207 offset = f"{offset} ROWS" 1208 1209 return offset 1210 1211 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1212 strtodate_sql = self.function_fallback_sql(expression) 1213 1214 if not isinstance(expression.parent, exp.Cast): 1215 # StrToDate returns DATEs in other dialects (eg. postgres), so 1216 # this branch aims to improve the transpilation to clickhouse 1217 return self.cast_sql(exp.cast(expression, "DATE")) 1218 1219 return strtodate_sql 1220 1221 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1222 this = expression.this 1223 1224 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1225 return self.sql(this) 1226 1227 return super().cast_sql(expression, safe_prefix=safe_prefix) 1228 1229 def trycast_sql(self, expression: exp.TryCast) -> str: 1230 dtype = expression.to 1231 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1232 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1233 dtype.set("nullable", True) 1234 1235 return super().cast_sql(expression) 1236 1237 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1238 this = self.json_path_part(expression.this) 1239 return str(int(this) + 1) if is_int(this) else this 1240 1241 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1242 return f"AS {self.sql(expression, 'this')}" 1243 1244 def _any_to_has( 1245 self, 1246 expression: exp.EQ | exp.NEQ, 1247 default: t.Callable[[t.Any], str], 1248 prefix: str = "", 1249 ) -> str: 1250 if isinstance(expression.left, exp.Any): 1251 arr = expression.left 1252 this = expression.right 1253 elif isinstance(expression.right, exp.Any): 1254 arr = expression.right 1255 this = expression.left 1256 else: 1257 return default(expression) 1258 1259 return prefix + self.func("has", arr.this.unnest(), this) 1260 1261 def eq_sql(self, expression: exp.EQ) -> str: 1262 return self._any_to_has(expression, super().eq_sql) 1263 1264 def neq_sql(self, expression: exp.NEQ) -> str: 1265 return self._any_to_has(expression, super().neq_sql, "NOT ") 1266 1267 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1268 # Manually add a flag to make the search case-insensitive 1269 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1270 return self.func("match", expression.this, regex) 1271 1272 def datatype_sql(self, expression: exp.DataType) -> str: 1273 # String is the standard ClickHouse type, every other variant is just an alias. 1274 # Additionally, any supplied length parameter will be ignored. 1275 # 1276 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1277 if expression.this in self.STRING_TYPE_MAPPING: 1278 dtype = "String" 1279 else: 1280 dtype = super().datatype_sql(expression) 1281 1282 # This section changes the type to `Nullable(...)` if the following conditions hold: 1283 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1284 # and change their semantics 1285 # - It's not the key type of a `Map`. 
This is because ClickHouse enforces the following 1286 # constraint: "Type of Map key must be a type, that can be represented by integer or 1287 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1288 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1289 parent = expression.parent 1290 nullable = expression.args.get("nullable") 1291 if nullable is True or ( 1292 nullable is None 1293 and not ( 1294 isinstance(parent, exp.DataType) 1295 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1296 and expression.index in (None, 0) 1297 ) 1298 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1299 ): 1300 dtype = f"Nullable({dtype})" 1301 1302 return dtype 1303 1304 def cte_sql(self, expression: exp.CTE) -> str: 1305 if expression.args.get("scalar"): 1306 this = self.sql(expression, "this") 1307 alias = self.sql(expression, "alias") 1308 return f"{this} AS {alias}" 1309 1310 return super().cte_sql(expression) 1311 1312 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1313 return super().after_limit_modifiers(expression) + [ 1314 ( 1315 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1316 if expression.args.get("settings") 1317 else "" 1318 ), 1319 ( 1320 self.seg("FORMAT ") + self.sql(expression, "format") 1321 if expression.args.get("format") 1322 else "" 1323 ), 1324 ] 1325 1326 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1327 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1328 1329 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1330 return f"ON CLUSTER {self.sql(expression, 'this')}" 1331 1332 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1333 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1334 exp.Properties.Location.POST_NAME 1335 ): 1336 this_name = self.sql( 1337 expression.this if isinstance(expression.this, exp.Schema) else expression, 1338 "this", 1339 ) 1340 this_properties = " ".join( 1341 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1342 ) 1343 this_schema = self.schema_columns_sql(expression.this) 1344 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1345 1346 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1347 1348 return super().createable_sql(expression, locations) 1349 1350 def create_sql(self, expression: exp.Create) -> str: 1351 # The comment property comes last in CTAS statements, i.e. 
after the query 1352 query = expression.expression 1353 if isinstance(query, exp.Query): 1354 comment_prop = expression.find(exp.SchemaCommentProperty) 1355 if comment_prop: 1356 comment_prop.pop() 1357 query.replace(exp.paren(query)) 1358 else: 1359 comment_prop = None 1360 1361 create_sql = super().create_sql(expression) 1362 1363 comment_sql = self.sql(comment_prop) 1364 comment_sql = f" {comment_sql}" if comment_sql else "" 1365 1366 return f"{create_sql}{comment_sql}" 1367 1368 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1369 this = self.indent(self.sql(expression, "this")) 1370 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1371 1372 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1373 this = self.sql(expression, "this") 1374 this = f" {this}" if this else "" 1375 expr = self.sql(expression, "expression") 1376 expr = f" {expr}" if expr else "" 1377 index_type = self.sql(expression, "index_type") 1378 index_type = f" TYPE {index_type}" if index_type else "" 1379 granularity = self.sql(expression, "granularity") 1380 granularity = f" GRANULARITY {granularity}" if granularity else "" 1381 1382 return f"INDEX{this}{expr}{index_type}{granularity}" 1383 1384 def partition_sql(self, expression: exp.Partition) -> str: 1385 return f"PARTITION {self.expressions(expression, flat=True)}" 1386 1387 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1388 return f"ID {self.sql(expression.this)}" 1389 1390 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1391 return ( 1392 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1393 ) 1394 1395 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1396 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1397 1398 def is_sql(self, expression: exp.Is) -> str: 1399 is_sql = super().is_sql(expression) 1400 1401 if isinstance(expression.parent, exp.Not): 1402 # value IS NOT NULL -> NOT (value IS NULL) 1403 is_sql = self.wrap(is_sql) 1404 1405 return is_sql 1406 1407 def in_sql(self, expression: exp.In) -> str: 1408 in_sql = super().in_sql(expression) 1409 1410 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1411 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1412 1413 return in_sql 1414 1415 def not_sql(self, expression: exp.Not) -> str: 1416 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1417 # let `GLOBAL IN` child interpose `NOT` 1418 return self.sql(expression, "this") 1419 1420 return super().not_sql(expression) 1421 1422 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1423 # If the VALUES clause contains tuples of expressions, we need to treat it 1424 # as a table since Clickhouse will automatically alias it as such. 1425 alias = expression.args.get("alias") 1426 1427 if alias and alias.args.get("columns") and expression.expressions: 1428 values = expression.expressions[0].expressions 1429 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1430 else: 1431 values_as_table = True 1432 1433 return super().values_sql(expression, values_as_table=values_as_table)
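The Parser and Generator above together define ClickHouse's transpilation behavior. A minimal sketch of a few round trips through sqlglot's public API; the first expected output follows directly from the Nullable(String) comment in _parse_types, while the rest are assumptions worth verifying against your sqlglot version:

import sqlglot

# Types are non-nullable by default in ClickHouse, so a nullable TEXT coming
# from another dialect gets wrapped: CAST(x AS TEXT) -> CAST(x AS Nullable(String)).
print(sqlglot.transpile("SELECT CAST(x AS TEXT)", read="postgres", write="clickhouse")[0])

# FINAL and SETTINGS are ClickHouse-specific modifiers handled by _parse_table
# and QUERY_MODIFIER_PARSERS/after_limit_modifiers; they should round-trip as-is.
print(sqlglot.transpile("SELECT * FROM t FINAL SETTINGS max_threads = 4", read="clickhouse", write="clickhouse")[0])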
NORMALIZE_FUNCTIONS
Determines how function names are going to be normalized. Possible values:
- "upper" or True: convert names to uppercase
- "lower": convert names to lowercase
- False: disables function name normalization
NULL_ORDERING
Default NULL ordering method to use if not explicitly set. Possible values: "nulls_are_small", "nulls_are_large", "nulls_are_last".
LOG_BASE_FIRST
Whether the base comes first in the LOG function. Possible values: True, False, None (two arguments are not supported by LOG).
FORCE_EARLY_ALIAS_REF_EXPANSION
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()). For example:

WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1

In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- ClickHouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
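A sketch of observing this with sqlglot's qualifier; the resolution in the comment is the one stated above, while the exact output formatting is an assumption:

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1
"""

# ClickHouse forwards the alias across the query, so my_id in WHERE/GROUP BY/
# HAVING resolves to data.id (i.e. WHERE id = 1 GROUP BY id HAVING id = 1).
expression = qualify(sqlglot.parse_one(sql, read="clickhouse"), dialect="clickhouse")
print(expression.sql(dialect="clickhouse"))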
PRESERVE_ORIGINAL_NAMES
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
NUMBERS_CAN_BE_UNDERSCORE_SEPARATED
Whether number literals can include underscores for better readability.
HEX_STRING_IS_INTEGER_TYPE
Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
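A sketch; the comment below only restates the flag, and the rendered target literal is an assumption to verify:

import sqlglot

# With HEX_STRING_IS_INTEGER_TYPE = True, 0xCC read as ClickHouse is an
# integer literal (204) rather than a binary/blob string.
print(sqlglot.transpile("SELECT 0xCC", read="clickhouse", write="postgres")[0])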
NORMALIZATION_STRATEGY
Specifies the strategy according to which identifiers should be normalized.
UNESCAPED_SEQUENCES
Mapping of an escaped sequence (e.g. "\\0") to its unescaped version (here "\0", the NUL character).
CREATABLE_KIND_MAPPING
Helper for dialects that use a different name for the same creatable kind. For example, the Clickhouse equivalent of CREATE SCHEMA is CREATE DATABASE.
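For example (a sketch; DuckDB is an arbitrary target and the exact output is assumed from the mapping above):

import sqlglot

# ClickHouse's CREATE DATABASE maps to the canonical CREATE SCHEMA, so the
# creatable kind should be swapped when transpiling to another dialect.
print(sqlglot.transpile("CREATE DATABASE db1", read="clickhouse", write="duckdb")[0])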
SET_OP_DISTINCT_BY_DEFAULT
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
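Since ClickHouse's bare EXCEPT and INTERSECT keep duplicates (distinct is False in the table above), transpiling to a dialect where bare EXCEPT implies DISTINCT should surface an explicit ALL. A sketch with an assumed output:

import sqlglot

# EXCEPT parses with distinct=False under ClickHouse, so the Postgres output
# is expected to read EXCEPT ALL.
print(sqlglot.transpile("SELECT a FROM t1 EXCEPT SELECT a FROM t2", read="clickhouse", write="postgres")[0])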
219 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 220 # Clickhouse allows VALUES to have an embedded structure e.g: 221 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 222 # In this case, we don't want to qualify the columns 223 values = expression.expressions[0].expressions 224 225 structure = ( 226 values[0] 227 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 228 else None 229 ) 230 if structure: 231 # Split each column definition into the column name e.g: 232 # 'person String, place String' -> ['person', 'place'] 233 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 234 column_aliases = [ 235 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 236 ] 237 else: 238 # Default column aliases in CH are "c1", "c2", etc. 239 column_aliases = [ 240 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 241 ] 242 243 return column_aliases
245 class Tokenizer(tokens.Tokenizer): 246 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 247 IDENTIFIERS = ['"', "`"] 248 IDENTIFIER_ESCAPES = ["\\"] 249 STRING_ESCAPES = ["'", "\\"] 250 BIT_STRINGS = [("0b", "")] 251 HEX_STRINGS = [("0x", ""), ("0X", "")] 252 HEREDOC_STRINGS = ["$"] 253 254 KEYWORDS = { 255 **tokens.Tokenizer.KEYWORDS, 256 ".:": TokenType.DOTCOLON, 257 "ATTACH": TokenType.COMMAND, 258 "DATE32": TokenType.DATE32, 259 "DATETIME64": TokenType.DATETIME64, 260 "DICTIONARY": TokenType.DICTIONARY, 261 "DYNAMIC": TokenType.DYNAMIC, 262 "ENUM8": TokenType.ENUM8, 263 "ENUM16": TokenType.ENUM16, 264 "EXCHANGE": TokenType.COMMAND, 265 "FINAL": TokenType.FINAL, 266 "FIXEDSTRING": TokenType.FIXEDSTRING, 267 "FLOAT32": TokenType.FLOAT, 268 "FLOAT64": TokenType.DOUBLE, 269 "GLOBAL": TokenType.GLOBAL, 270 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 271 "MAP": TokenType.MAP, 272 "NESTED": TokenType.NESTED, 273 "NOTHING": TokenType.NOTHING, 274 "SAMPLE": TokenType.TABLE_SAMPLE, 275 "TUPLE": TokenType.STRUCT, 276 "UINT16": TokenType.USMALLINT, 277 "UINT32": TokenType.UINT, 278 "UINT64": TokenType.UBIGINT, 279 "UINT8": TokenType.UTINYINT, 280 "IPV4": TokenType.IPV4, 281 "IPV6": TokenType.IPV6, 282 "POINT": TokenType.POINT, 283 "RING": TokenType.RING, 284 "LINESTRING": TokenType.LINESTRING, 285 "MULTILINESTRING": TokenType.MULTILINESTRING, 286 "POLYGON": TokenType.POLYGON, 287 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 288 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 289 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 290 "SYSTEM": TokenType.COMMAND, 291 "PREWHERE": TokenType.PREWHERE, 292 } 293 KEYWORDS.pop("/*+") 294 295 SINGLE_TOKENS = { 296 **tokens.Tokenizer.SINGLE_TOKENS, 297 "$": TokenType.HEREDOC_STRING, 298 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BYTE_STRINGS
- RAW_STRINGS
- UNICODE_STRINGS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
300 class Parser(parser.Parser): 301 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 302 # * select x from t1 union all select x from t2 limit 1; 303 # * select x from t1 union all (select x from t2 limit 1); 304 MODIFIERS_ATTACHED_TO_SET_OP = False 305 INTERVAL_SPANS = False 306 OPTIONAL_ALIAS_TOKEN_CTE = False 307 JOINS_HAVE_EQUAL_PRECEDENCE = True 308 309 FUNCTIONS = { 310 **parser.Parser.FUNCTIONS, 311 "ANY": exp.AnyValue.from_arg_list, 312 "ARRAYSUM": exp.ArraySum.from_arg_list, 313 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 314 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 315 "COUNTIF": _build_count_if, 316 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 317 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 318 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 319 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 320 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 321 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 322 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 323 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 324 "JSONEXTRACTSTRING": build_json_extract_path( 325 exp.JSONExtractScalar, zero_based_indexing=False 326 ), 327 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 328 "MAP": parser.build_var_map, 329 "MATCH": exp.RegexpLike.from_arg_list, 330 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 331 "RANDCANONICAL": exp.Rand.from_arg_list, 332 "STR_TO_DATE": _build_str_to_date, 333 "TUPLE": exp.Struct.from_arg_list, 334 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 335 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 336 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 337 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 338 "UNIQ": exp.ApproxDistinct.from_arg_list, 339 "XOR": lambda args: exp.Xor(expressions=args), 340 "MD5": exp.MD5Digest.from_arg_list, 341 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 342 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 343 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 344 "TOTYPENAME": exp.Typeof.from_arg_list, 345 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 346 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 347 } 348 FUNCTIONS.pop("TRANSFORM") 349 FUNCTIONS.pop("APPROX_TOP_SUM") 350 351 AGG_FUNCTIONS = { 352 "count", 353 "min", 354 "max", 355 "sum", 356 "avg", 357 "any", 358 "stddevPop", 359 "stddevSamp", 360 "varPop", 361 "varSamp", 362 "corr", 363 "covarPop", 364 "covarSamp", 365 "entropy", 366 "exponentialMovingAverage", 367 "intervalLengthSum", 368 "kolmogorovSmirnovTest", 369 "mannWhitneyUTest", 370 "median", 371 "rankCorr", 372 "sumKahan", 373 "studentTTest", 374 "welchTTest", 375 "anyHeavy", 376 "anyLast", 377 "boundingRatio", 378 "first_value", 379 "last_value", 380 "argMin", 381 "argMax", 382 "avgWeighted", 383 "topK", 384 "approx_top_sum", 385 "topKWeighted", 386 "deltaSum", 387 "deltaSumTimestamp", 388 "groupArray", 389 "groupArrayLast", 390 "groupUniqArray", 391 "groupArrayInsertAt", 392 "groupArrayMovingAvg", 393 "groupArrayMovingSum", 394 "groupArraySample", 395 "groupBitAnd", 396 "groupBitOr", 397 "groupBitXor", 398 "groupBitmap", 399 "groupBitmapAnd", 400 "groupBitmapOr", 401 "groupBitmapXor", 402 
"sumWithOverflow", 403 "sumMap", 404 "minMap", 405 "maxMap", 406 "skewSamp", 407 "skewPop", 408 "kurtSamp", 409 "kurtPop", 410 "uniq", 411 "uniqExact", 412 "uniqCombined", 413 "uniqCombined64", 414 "uniqHLL12", 415 "uniqTheta", 416 "quantile", 417 "quantiles", 418 "quantileExact", 419 "quantilesExact", 420 "quantileExactLow", 421 "quantilesExactLow", 422 "quantileExactHigh", 423 "quantilesExactHigh", 424 "quantileExactWeighted", 425 "quantilesExactWeighted", 426 "quantileTiming", 427 "quantilesTiming", 428 "quantileTimingWeighted", 429 "quantilesTimingWeighted", 430 "quantileDeterministic", 431 "quantilesDeterministic", 432 "quantileTDigest", 433 "quantilesTDigest", 434 "quantileTDigestWeighted", 435 "quantilesTDigestWeighted", 436 "quantileBFloat16", 437 "quantilesBFloat16", 438 "quantileBFloat16Weighted", 439 "quantilesBFloat16Weighted", 440 "simpleLinearRegression", 441 "stochasticLinearRegression", 442 "stochasticLogisticRegression", 443 "categoricalInformationValue", 444 "contingency", 445 "cramersV", 446 "cramersVBiasCorrected", 447 "theilsU", 448 "maxIntersections", 449 "maxIntersectionsPosition", 450 "meanZTest", 451 "quantileInterpolatedWeighted", 452 "quantilesInterpolatedWeighted", 453 "quantileGK", 454 "quantilesGK", 455 "sparkBar", 456 "sumCount", 457 "largestTriangleThreeBuckets", 458 "histogram", 459 "sequenceMatch", 460 "sequenceCount", 461 "windowFunnel", 462 "retention", 463 "uniqUpTo", 464 "sequenceNextNode", 465 "exponentialTimeDecayedAvg", 466 } 467 468 AGG_FUNCTIONS_SUFFIXES = [ 469 "If", 470 "Array", 471 "ArrayIf", 472 "Map", 473 "SimpleState", 474 "State", 475 "Merge", 476 "MergeState", 477 "ForEach", 478 "Distinct", 479 "OrDefault", 480 "OrNull", 481 "Resample", 482 "ArgMin", 483 "ArgMax", 484 ] 485 486 FUNC_TOKENS = { 487 *parser.Parser.FUNC_TOKENS, 488 TokenType.AND, 489 TokenType.OR, 490 TokenType.SET, 491 } 492 493 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 494 495 ID_VAR_TOKENS = { 496 *parser.Parser.ID_VAR_TOKENS, 497 TokenType.LIKE, 498 } 499 500 AGG_FUNC_MAPPING = ( 501 lambda functions, suffixes: { 502 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 503 } 504 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 505 506 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"} 507 508 FUNCTION_PARSERS = { 509 **parser.Parser.FUNCTION_PARSERS, 510 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 511 "QUANTILE": lambda self: self._parse_quantile(), 512 "MEDIAN": lambda self: self._parse_quantile(), 513 "COLUMNS": lambda self: self._parse_columns(), 514 } 515 516 FUNCTION_PARSERS.pop("MATCH") 517 518 PROPERTY_PARSERS = { 519 **parser.Parser.PROPERTY_PARSERS, 520 "ENGINE": lambda self: self._parse_engine_property(), 521 } 522 PROPERTY_PARSERS.pop("DYNAMIC") 523 524 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 525 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 526 527 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 528 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 529 530 RANGE_PARSERS = { 531 **parser.Parser.RANGE_PARSERS, 532 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 533 } 534 535 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 536 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 
537 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 538 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 539 540 JOIN_KINDS = { 541 *parser.Parser.JOIN_KINDS, 542 TokenType.ANY, 543 TokenType.ASOF, 544 TokenType.ARRAY, 545 } 546 547 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 548 TokenType.ANY, 549 TokenType.ARRAY, 550 TokenType.FINAL, 551 TokenType.FORMAT, 552 TokenType.SETTINGS, 553 } 554 555 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 556 TokenType.FORMAT, 557 } 558 559 LOG_DEFAULTS_TO_LN = True 560 561 QUERY_MODIFIER_PARSERS = { 562 **parser.Parser.QUERY_MODIFIER_PARSERS, 563 TokenType.SETTINGS: lambda self: ( 564 "settings", 565 self._advance() or self._parse_csv(self._parse_assignment), 566 ), 567 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 568 } 569 570 CONSTRAINT_PARSERS = { 571 **parser.Parser.CONSTRAINT_PARSERS, 572 "INDEX": lambda self: self._parse_index_constraint(), 573 "CODEC": lambda self: self._parse_compress(), 574 } 575 576 ALTER_PARSERS = { 577 **parser.Parser.ALTER_PARSERS, 578 "REPLACE": lambda self: self._parse_alter_table_replace(), 579 } 580 581 SCHEMA_UNNAMED_CONSTRAINTS = { 582 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 583 "INDEX", 584 } 585 586 PLACEHOLDER_PARSERS = { 587 **parser.Parser.PLACEHOLDER_PARSERS, 588 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 589 } 590 591 def _parse_engine_property(self) -> exp.EngineProperty: 592 self._match(TokenType.EQ) 593 return self.expression( 594 exp.EngineProperty, 595 this=self._parse_field(any_token=True, anonymous_func=True), 596 ) 597 598 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 599 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 600 return self._parse_lambda() 601 602 def _parse_types( 603 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 604 ) -> t.Optional[exp.Expression]: 605 dtype = super()._parse_types( 606 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 607 ) 608 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 609 # Mark every type as non-nullable which is ClickHouse's default, unless it's 610 # already marked as nullable. This marker helps us transpile types from other 611 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 612 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 613 # fail in ClickHouse without the `Nullable` type constructor. 614 dtype.set("nullable", False) 615 616 return dtype 617 618 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 619 index = self._index 620 this = self._parse_bitwise() 621 if self._match(TokenType.FROM): 622 self._retreat(index) 623 return super()._parse_extract() 624 625 # We return Anonymous here because extract and regexpExtract have different semantics, 626 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 627 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 628 # 629 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 
630 self._match(TokenType.COMMA) 631 return self.expression( 632 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 633 ) 634 635 def _parse_assignment(self) -> t.Optional[exp.Expression]: 636 this = super()._parse_assignment() 637 638 if self._match(TokenType.PLACEHOLDER): 639 return self.expression( 640 exp.If, 641 this=this, 642 true=self._parse_assignment(), 643 false=self._match(TokenType.COLON) and self._parse_assignment(), 644 ) 645 646 return this 647 648 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 649 """ 650 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 651 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 652 """ 653 index = self._index 654 655 this = self._parse_id_var() 656 self._match(TokenType.COLON) 657 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 658 self._match_text_seq("IDENTIFIER") and "Identifier" 659 ) 660 661 if not kind: 662 self._retreat(index) 663 return None 664 elif not self._match(TokenType.R_BRACE): 665 self.raise_error("Expecting }") 666 667 if isinstance(this, exp.Identifier) and not this.quoted: 668 this = exp.var(this.name) 669 670 return self.expression(exp.Placeholder, this=this, kind=kind) 671 672 def _parse_bracket( 673 self, this: t.Optional[exp.Expression] = None 674 ) -> t.Optional[exp.Expression]: 675 l_brace = self._match(TokenType.L_BRACE, advance=False) 676 bracket = super()._parse_bracket(this) 677 678 if l_brace and isinstance(bracket, exp.Struct): 679 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 680 for expression in bracket.expressions: 681 if not isinstance(expression, exp.PropertyEQ): 682 break 683 684 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 685 varmap.args["values"].append("expressions", expression.expression) 686 687 return varmap 688 689 return bracket 690 691 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 692 this = super()._parse_in(this) 693 this.set("is_global", is_global) 694 return this 695 696 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 697 is_negated = self._match(TokenType.NOT) 698 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 699 return self.expression(exp.Not, this=this) if is_negated else this 700 701 def _parse_table( 702 self, 703 schema: bool = False, 704 joins: bool = False, 705 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 706 parse_bracket: bool = False, 707 is_db_reference: bool = False, 708 parse_partition: bool = False, 709 consume_pipe: bool = False, 710 ) -> t.Optional[exp.Expression]: 711 this = super()._parse_table( 712 schema=schema, 713 joins=joins, 714 alias_tokens=alias_tokens, 715 parse_bracket=parse_bracket, 716 is_db_reference=is_db_reference, 717 ) 718 719 if isinstance(this, exp.Table): 720 inner = this.this 721 alias = this.args.get("alias") 722 723 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 724 alias.set("columns", [exp.to_identifier("generate_series")]) 725 726 if self._match(TokenType.FINAL): 727 this = self.expression(exp.Final, this=this) 728 729 return this 730 731 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 732 return super()._parse_position(haystack_first=True) 733 734 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 735 def _parse_cte(self) -> t.Optional[exp.CTE]: 736 # WITH <identifier> AS 
    def _parse_cte(self) -> t.Optional[exp.CTE]:
        # WITH <identifier> AS <subquery expression>
        cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte)

        if not cte:
            # WITH <expression> AS <identifier>
            cte = self.expression(
                exp.CTE,
                this=self._parse_assignment(),
                alias=self._parse_table_alias(),
                scalar=True,
            )

        return cte

    def _parse_join_parts(
        self,
    ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]:
        is_global = self._match(TokenType.GLOBAL) and self._prev
        kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev

        if kind_pre:
            kind = self._match_set(self.JOIN_KINDS) and self._prev
            side = self._match_set(self.JOIN_SIDES) and self._prev
            return is_global, side, kind

        return (
            is_global,
            self._match_set(self.JOIN_SIDES) and self._prev,
            self._match_set(self.JOIN_KINDS) and self._prev,
        )

    def _parse_join(
        self, skip_join_token: bool = False, parse_bracket: bool = False
    ) -> t.Optional[exp.Join]:
        join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True)
        if join:
            join.set("global", join.args.pop("method", None))

            # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table`
            # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join
            if join.kind == "ARRAY":
                for table in join.find_all(exp.Table):
                    table.replace(table.to_column())

        return join

    def _parse_function(
        self,
        functions: t.Optional[t.Dict[str, t.Callable]] = None,
        anonymous: bool = False,
        optional_parens: bool = True,
        any_token: bool = False,
    ) -> t.Optional[exp.Expression]:
        expr = super()._parse_function(
            functions=functions,
            anonymous=anonymous,
            optional_parens=optional_parens,
            any_token=any_token,
        )

        func = expr.this if isinstance(expr, exp.Window) else expr

        # Aggregate functions can be split in 2 parts: <func_name><suffix>
        parts = (
            self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None
        )

        if parts:
            anon_func: exp.Anonymous = t.cast(exp.Anonymous, func)
            params = self._parse_func_params(anon_func)

            kwargs = {
                "this": anon_func.this,
                "expressions": anon_func.expressions,
            }
            if parts[1]:
                exp_class: t.Type[exp.Expression] = (
                    exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc
                )
            else:
                exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc

            kwargs["exp_class"] = exp_class
            if params:
                kwargs["params"] = params

            func = self.expression(**kwargs)

            if isinstance(expr, exp.Window):
                # The window's func was parsed as Anonymous in base parser, fix its
                # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc
                expr.set("this", func)
            elif params:
                # Params have blocked super()._parse_function() from parsing the following window
                # (if that exists) as they're standing between the function call and the window spec
                expr = self._parse_window(func)
            else:
                expr = func

        return expr

    def _parse_func_params(
        self, this: t.Optional[exp.Func] = None
    ) -> t.Optional[t.List[exp.Expression]]:
        if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN):
            return self._parse_csv(self._parse_lambda)

        if self._match(TokenType.L_PAREN):
            params = self._parse_csv(self._parse_lambda)
            self._match_r_paren(this)
            return params

        return None

    def _parse_quantile(self) -> exp.Quantile:
        this = self._parse_lambda()
        params = self._parse_func_params()
        if params:
            return self.expression(exp.Quantile, this=params[0], quantile=this)
        return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5))

    def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]:
        return super()._parse_wrapped_id_vars(optional=True)

    def _parse_primary_key(
        self, wrapped_optional: bool = False, in_props: bool = False
    ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey:
        return super()._parse_primary_key(
            wrapped_optional=wrapped_optional or in_props, in_props=in_props
        )

    def _parse_on_property(self) -> t.Optional[exp.Expression]:
        index = self._index
        if self._match_text_seq("CLUSTER"):
            this = self._parse_string() or self._parse_id_var()
            if this:
                return self.expression(exp.OnCluster, this=this)
            else:
                self._retreat(index)
        return None

    def _parse_index_constraint(
        self, kind: t.Optional[str] = None
    ) -> exp.IndexColumnConstraint:
        # INDEX name1 expr TYPE type1(args) GRANULARITY value
        this = self._parse_id_var()
        expression = self._parse_assignment()

        index_type = self._match_text_seq("TYPE") and (
            self._parse_function() or self._parse_var()
        )

        granularity = self._match_text_seq("GRANULARITY") and self._parse_term()

        return self.expression(
            exp.IndexColumnConstraint,
            this=this,
            expression=expression,
            index_type=index_type,
            granularity=granularity,
        )

    def _parse_partition(self) -> t.Optional[exp.Partition]:
        # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression
        if not self._match(TokenType.PARTITION):
            return None

        if self._match_text_seq("ID"):
            # Corresponds to the PARTITION ID <string_value> syntax
            expressions: t.List[exp.Expression] = [
                self.expression(exp.PartitionId, this=self._parse_string())
            ]
        else:
            expressions = self._parse_expressions()

        return self.expression(exp.Partition, expressions=expressions)

    def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]:
        partition = self._parse_partition()

        if not partition or not self._match(TokenType.FROM):
            return None

        return self.expression(
            exp.ReplacePartition, expression=partition, source=self._parse_table_parts()
        )

    def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]:
        if not self._match_text_seq("PROJECTION"):
            return None

        return self.expression(
            exp.ProjectionDef,
            this=self._parse_id_var(),
            expression=self._parse_wrapped(self._parse_statement),
        )

    def _parse_constraint(self) -> t.Optional[exp.Expression]:
        return super()._parse_constraint() or self._parse_projection_def()

    def _parse_alias(
        self, this: t.Optional[exp.Expression], explicit: bool = False
    ) -> t.Optional[exp.Expression]:
        # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier,
        # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias
        if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False):
            return this

        return super()._parse_alias(this=this, explicit=explicit)

    def _parse_expression(self) -> t.Optional[exp.Expression]:
        this = super()._parse_expression()

        # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier
        while self._match_pair(TokenType.APPLY, TokenType.L_PAREN):
            this = exp.Apply(this=this, expression=self._parse_var(any_token=True))
            self._match(TokenType.R_PAREN)

        return this

    def _parse_columns(self) -> exp.Expression:
        this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda())

        while self._next and self._match_text_seq(")", "APPLY", "("):
            self._match(TokenType.R_PAREN)
            this = exp.Apply(this=this, expression=self._parse_var(any_token=True))
        return this

    def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]:
        value = super()._parse_value(values=values)
        if not value:
            return None

        # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast
        # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one.
        # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics),
        # but the final result is not altered by the extra parentheses.
        # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression
        expressions = value.expressions
        if values and not isinstance(expressions[-1], exp.Tuple):
            value.set(
                "expressions",
                [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions],
            )

        return value

    def _parse_partitioned_by(self) -> exp.PartitionedByProperty:
        # ClickHouse allows custom expressions as partition key
        # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key
        return self.expression(
            exp.PartitionedByProperty,
            this=self._parse_assignment(),
        )
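The ClickHouse-specific parsing behavior above can be exercised through sqlglot's public helpers. A minimal sketch; the queries are illustrative examples chosen for this page, not taken from the library:

import sqlglot
from sqlglot import exp

# _parse_cte falls back to the WITH <expression> AS <identifier> form,
# marking the CTE as scalar
ast = sqlglot.parse_one("WITH 1 AS x SELECT x", read="clickhouse")
print(ast.find(exp.CTE).args.get("scalar"))  # expected: True

# _parse_join_parts captures the GLOBAL modifier, which _parse_join stores
# in the join's "global" arg, so it survives a round trip
join = sqlglot.parse_one(
    "SELECT * FROM t1 GLOBAL LEFT JOIN t2 ON t1.id = t2.id", read="clickhouse"
)
print(join.sql(dialect="clickhouse"))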
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
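These parser options are normally supplied through the top-level helpers, which forward keyword arguments to the dialect's Parser. A sketch, assuming the standard sqlglot entry points:

import sqlglot
from sqlglot.errors import ErrorLevel

# With ErrorLevel.RAISE, up to max_errors messages are collected into a
# single ParseError instead of failing on the first problem
expressions = sqlglot.parse(
    "SELECT 1; SELECT 2",
    read="clickhouse",
    error_level=ErrorLevel.RAISE,
    max_errors=3,
)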
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- NESTED_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_HINTS
- LAMBDAS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- STATEMENT_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_ALTER_PARSERS
- INVALID_FUNC_NAME_TOKENS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- NULL_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- PREFIXED_PIVOT_COLUMNS
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- SUPPORTS_IMPLICIT_UNNEST
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
class Generator(generator.Generator):
    QUERY_HINTS = False
    STRUCT_DELIMITER = ("(", ")")
    NVL2_SUPPORTED = False
    TABLESAMPLE_REQUIRES_PARENS = False
    TABLESAMPLE_SIZE_IS_ROWS = False
    TABLESAMPLE_KEYWORDS = "SAMPLE"
    LAST_DAY_SUPPORTS_DATE_PART = False
    CAN_IMPLEMENT_ARRAY_ANY = True
    SUPPORTS_TO_NUMBER = False
    JOIN_HINTS = False
    TABLE_HINTS = False
    GROUPINGS_SEP = ""
    SET_OP_MODIFIERS = False
    ARRAY_SIZE_NAME = "LENGTH"
    WRAP_DERIVED_VALUES = False

    STRING_TYPE_MAPPING = {
        exp.DataType.Type.BLOB: "String",
        exp.DataType.Type.CHAR: "String",
        exp.DataType.Type.LONGBLOB: "String",
        exp.DataType.Type.LONGTEXT: "String",
        exp.DataType.Type.MEDIUMBLOB: "String",
        exp.DataType.Type.MEDIUMTEXT: "String",
        exp.DataType.Type.TINYBLOB: "String",
        exp.DataType.Type.TINYTEXT: "String",
        exp.DataType.Type.TEXT: "String",
        exp.DataType.Type.VARBINARY: "String",
        exp.DataType.Type.VARCHAR: "String",
    }

    SUPPORTED_JSON_PATH_PARTS = {
        exp.JSONPathKey,
        exp.JSONPathRoot,
        exp.JSONPathSubscript,
    }

    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        **STRING_TYPE_MAPPING,
        exp.DataType.Type.ARRAY: "Array",
        exp.DataType.Type.BOOLEAN: "Bool",
        exp.DataType.Type.BIGINT: "Int64",
        exp.DataType.Type.DATE32: "Date32",
        exp.DataType.Type.DATETIME: "DateTime",
        exp.DataType.Type.DATETIME2: "DateTime",
        exp.DataType.Type.SMALLDATETIME: "DateTime",
        exp.DataType.Type.DATETIME64: "DateTime64",
        exp.DataType.Type.DECIMAL: "Decimal",
        exp.DataType.Type.DECIMAL32: "Decimal32",
        exp.DataType.Type.DECIMAL64: "Decimal64",
        exp.DataType.Type.DECIMAL128: "Decimal128",
        exp.DataType.Type.DECIMAL256: "Decimal256",
        exp.DataType.Type.TIMESTAMP: "DateTime",
        exp.DataType.Type.TIMESTAMPNTZ: "DateTime",
        exp.DataType.Type.TIMESTAMPTZ: "DateTime",
        exp.DataType.Type.DOUBLE: "Float64",
        exp.DataType.Type.ENUM: "Enum",
        exp.DataType.Type.ENUM8: "Enum8",
        exp.DataType.Type.ENUM16: "Enum16",
        exp.DataType.Type.FIXEDSTRING: "FixedString",
        exp.DataType.Type.FLOAT: "Float32",
        exp.DataType.Type.INT: "Int32",
        exp.DataType.Type.MEDIUMINT: "Int32",
        exp.DataType.Type.INT128: "Int128",
        exp.DataType.Type.INT256: "Int256",
        exp.DataType.Type.LOWCARDINALITY: "LowCardinality",
        exp.DataType.Type.MAP: "Map",
        exp.DataType.Type.NESTED: "Nested",
        exp.DataType.Type.NOTHING: "Nothing",
        exp.DataType.Type.SMALLINT: "Int16",
        exp.DataType.Type.STRUCT: "Tuple",
        exp.DataType.Type.TINYINT: "Int8",
        exp.DataType.Type.UBIGINT: "UInt64",
        exp.DataType.Type.UINT: "UInt32",
        exp.DataType.Type.UINT128: "UInt128",
        exp.DataType.Type.UINT256: "UInt256",
        exp.DataType.Type.USMALLINT: "UInt16",
        exp.DataType.Type.UTINYINT: "UInt8",
        exp.DataType.Type.IPV4: "IPv4",
        exp.DataType.Type.IPV6: "IPv6",
        exp.DataType.Type.POINT: "Point",
        exp.DataType.Type.RING: "Ring",
        exp.DataType.Type.LINESTRING: "LineString",
        exp.DataType.Type.MULTILINESTRING: "MultiLineString",
        exp.DataType.Type.POLYGON: "Polygon",
        exp.DataType.Type.MULTIPOLYGON: "MultiPolygon",
        exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction",
        exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction",
        exp.DataType.Type.DYNAMIC: "Dynamic",
    }

    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.AnyValue: rename_func("any"),
        exp.ApproxDistinct: rename_func("uniq"),
        exp.ArrayConcat: rename_func("arrayConcat"),
        exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this),
        exp.ArrayRemove: remove_from_array_using_filter,
        exp.ArrayReverse: rename_func("arrayReverse"),
        exp.ArraySlice: rename_func("arraySlice"),
        exp.ArraySum: rename_func("arraySum"),
        exp.ArgMax: arg_max_or_min_no_count("argMax"),
        exp.ArgMin: arg_max_or_min_no_count("argMin"),
        exp.Array: inline_array_sql,
        exp.CastToStrType: rename_func("CAST"),
        exp.CountIf: rename_func("countIf"),
        exp.CompressColumnConstraint: lambda self,
        e: f"CODEC({self.expressions(e, key='this', flat=True)})",
        exp.ComputedColumnConstraint: lambda self,
        e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}",
        exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"),
        exp.DateAdd: _datetime_delta_sql("DATE_ADD"),
        exp.DateDiff: _datetime_delta_sql("DATE_DIFF"),
        exp.DateStrToDate: rename_func("toDate"),
        exp.DateSub: _datetime_delta_sql("DATE_SUB"),
        exp.Explode: rename_func("arrayJoin"),
        exp.FarmFingerprint: rename_func("farmFingerprint64"),
        exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL",
        exp.IsNan: rename_func("isNaN"),
        exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}",
        exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False),
        exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False),
        exp.JSONPathKey: json_path_key_only_name,
        exp.JSONPathRoot: lambda *_: "",
        exp.Length: length_or_char_length_sql,
        exp.Map: _map_sql,
        exp.Median: rename_func("median"),
        exp.Nullif: rename_func("nullIf"),
        exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
        exp.Pivot: no_pivot_sql,
        exp.Quantile: _quantile_sql,
        exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression),
        exp.Rand: rename_func("randCanonical"),
        exp.StartsWith: rename_func("startsWith"),
        exp.EndsWith: rename_func("endsWith"),
        exp.StrPosition: lambda self, e: strposition_sql(
            self,
            e,
            func_name="POSITION",
            supports_position=True,
            use_ansi_position=False,
        ),
        exp.TimeToStr: lambda self, e: self.func(
            "formatDateTime", e.this, self.format_time(e), e.args.get("zone")
        ),
        exp.TimeStrToTime: _timestrtotime_sql,
        exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"),
        exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"),
        exp.Typeof: rename_func("toTypeName"),
        exp.VarMap: _map_sql,
        exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions),
        exp.MD5Digest: rename_func("MD5"),
        exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))),
        exp.SHA: rename_func("SHA1"),
        exp.SHA2: sha256_sql,
        exp.UnixToTime: _unix_to_time_sql,
        exp.TimestampTrunc: timestamptrunc_sql(zone=True),
        exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"),
        exp.Variance: rename_func("varSamp"),
        exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
        exp.Stddev: rename_func("stddevSamp"),
        exp.Chr: rename_func("CHAR"),
        exp.Lag: lambda self, e: self.func(
            "lagInFrame", e.this, e.args.get("offset"), e.args.get("default")
        ),
        exp.Lead: lambda self, e: self.func(
            "leadInFrame", e.this, e.args.get("offset"), e.args.get("default")
        ),
        exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")(
            rename_func("editDistance")
        ),
        exp.ParseDatetime: rename_func("parseDateTime"),
    }

    PROPERTIES_LOCATION = {
        **generator.Generator.PROPERTIES_LOCATION,
        exp.OnCluster: exp.Properties.Location.POST_NAME,
        exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
        exp.ToTableProperty: exp.Properties.Location.POST_NAME,
        exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
    }

    # There's no list in docs, but it can be found in Clickhouse code
    # see `ClickHouse/src/Parsers/ParserCreate*.cpp`
    ON_CLUSTER_TARGETS = {
        "SCHEMA",  # Transpiled CREATE SCHEMA may have OnCluster property set
        "DATABASE",
        "TABLE",
        "VIEW",
        "DICTIONARY",
        "INDEX",
        "FUNCTION",
        "NAMED COLLECTION",
    }

    # https://clickhouse.com/docs/en/sql-reference/data-types/nullable
    NON_NULLABLE_TYPES = {
        exp.DataType.Type.ARRAY,
        exp.DataType.Type.MAP,
        exp.DataType.Type.STRUCT,
        exp.DataType.Type.POINT,
        exp.DataType.Type.RING,
        exp.DataType.Type.LINESTRING,
        exp.DataType.Type.MULTILINESTRING,
        exp.DataType.Type.POLYGON,
        exp.DataType.Type.MULTIPOLYGON,
    }

    def offset_sql(self, expression: exp.Offset) -> str:
        offset = super().offset_sql(expression)

        # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword
        # https://clickhouse.com/docs/sql-reference/statements/select/offset
        parent = expression.parent
        if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch):
            offset = f"{offset} ROWS"

        return offset

    def strtodate_sql(self, expression: exp.StrToDate) -> str:
        strtodate_sql = self.function_fallback_sql(expression)

        if not isinstance(expression.parent, exp.Cast):
            # StrToDate returns DATEs in other dialects (eg. postgres), so
            # this branch aims to improve the transpilation to clickhouse
            return self.cast_sql(exp.cast(expression, "DATE"))

        return strtodate_sql

    def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
        this = expression.this

        if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"):
            return self.sql(this)

        return super().cast_sql(expression, safe_prefix=safe_prefix)

    def trycast_sql(self, expression: exp.TryCast) -> str:
        dtype = expression.to
        if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True):
            # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T)
            dtype.set("nullable", True)

        return super().cast_sql(expression)

    def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str:
        this = self.json_path_part(expression.this)
        return str(int(this) + 1) if is_int(this) else this

    def likeproperty_sql(self, expression: exp.LikeProperty) -> str:
        return f"AS {self.sql(expression, 'this')}"

    def _any_to_has(
        self,
        expression: exp.EQ | exp.NEQ,
        default: t.Callable[[t.Any], str],
        prefix: str = "",
    ) -> str:
        if isinstance(expression.left, exp.Any):
            arr = expression.left
            this = expression.right
        elif isinstance(expression.right, exp.Any):
            arr = expression.right
            this = expression.left
        else:
            return default(expression)

        return prefix + self.func("has", arr.this.unnest(), this)

    def eq_sql(self, expression: exp.EQ) -> str:
        return self._any_to_has(expression, super().eq_sql)

    def neq_sql(self, expression: exp.NEQ) -> str:
        return self._any_to_has(expression, super().neq_sql, "NOT ")

    def regexpilike_sql(self, expression: exp.RegexpILike) -> str:
        # Manually add a flag to make the search case-insensitive
        regex = self.func("CONCAT", "'(?i)'", expression.expression)
        return self.func("match", expression.this, regex)

    def datatype_sql(self, expression: exp.DataType) -> str:
        # String is the standard ClickHouse type, every other variant is just an alias.
        # Additionally, any supplied length parameter will be ignored.
        #
        # https://clickhouse.com/docs/en/sql-reference/data-types/string
        if expression.this in self.STRING_TYPE_MAPPING:
            dtype = "String"
        else:
            dtype = super().datatype_sql(expression)

        # This section changes the type to `Nullable(...)` if the following conditions hold:
        # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable`
        #   and change their semantics
        # - It's not the key type of a `Map`. This is because ClickHouse enforces the following
        #   constraint: "Type of Map key must be a type, that can be represented by integer or
        #   String or FixedString (possibly LowCardinality) or UUID or IPv6"
        # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type
        parent = expression.parent
        nullable = expression.args.get("nullable")
        if nullable is True or (
            nullable is None
            and not (
                isinstance(parent, exp.DataType)
                and parent.is_type(exp.DataType.Type.MAP, check_nullable=True)
                and expression.index in (None, 0)
            )
            and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True)
        ):
            dtype = f"Nullable({dtype})"

        return dtype

    def cte_sql(self, expression: exp.CTE) -> str:
        if expression.args.get("scalar"):
            this = self.sql(expression, "this")
            alias = self.sql(expression, "alias")
            return f"{this} AS {alias}"

        return super().cte_sql(expression)

    def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]:
        return super().after_limit_modifiers(expression) + [
            (
                self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True)
                if expression.args.get("settings")
                else ""
            ),
            (
                self.seg("FORMAT ") + self.sql(expression, "format")
                if expression.args.get("format")
                else ""
            ),
        ]

    def placeholder_sql(self, expression: exp.Placeholder) -> str:
        return f"{{{expression.name}: {self.sql(expression, 'kind')}}}"

    def oncluster_sql(self, expression: exp.OnCluster) -> str:
        return f"ON CLUSTER {self.sql(expression, 'this')}"

    def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str:
        if expression.kind in self.ON_CLUSTER_TARGETS and locations.get(
            exp.Properties.Location.POST_NAME
        ):
            this_name = self.sql(
                expression.this if isinstance(expression.this, exp.Schema) else expression,
                "this",
            )
            this_properties = " ".join(
                [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]]
            )
            this_schema = self.schema_columns_sql(expression.this)
            this_schema = f"{self.sep()}{this_schema}" if this_schema else ""

            return f"{this_name}{self.sep()}{this_properties}{this_schema}"

        return super().createable_sql(expression, locations)

    def create_sql(self, expression: exp.Create) -> str:
        # The comment property comes last in CTAS statements, i.e. after the query
        query = expression.expression
        if isinstance(query, exp.Query):
            comment_prop = expression.find(exp.SchemaCommentProperty)
            if comment_prop:
                comment_prop.pop()
                query.replace(exp.paren(query))
        else:
            comment_prop = None

        create_sql = super().create_sql(expression)

        comment_sql = self.sql(comment_prop)
        comment_sql = f" {comment_sql}" if comment_sql else ""

        return f"{create_sql}{comment_sql}"

    def prewhere_sql(self, expression: exp.PreWhere) -> str:
        this = self.indent(self.sql(expression, "this"))
        return f"{self.seg('PREWHERE')}{self.sep()}{this}"

    def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str:
        this = self.sql(expression, "this")
        this = f" {this}" if this else ""
        expr = self.sql(expression, "expression")
        expr = f" {expr}" if expr else ""
        index_type = self.sql(expression, "index_type")
        index_type = f" TYPE {index_type}" if index_type else ""
        granularity = self.sql(expression, "granularity")
        granularity = f" GRANULARITY {granularity}" if granularity else ""

        return f"INDEX{this}{expr}{index_type}{granularity}"

    def partition_sql(self, expression: exp.Partition) -> str:
        return f"PARTITION {self.expressions(expression, flat=True)}"

    def partitionid_sql(self, expression: exp.PartitionId) -> str:
        return f"ID {self.sql(expression.this)}"

    def replacepartition_sql(self, expression: exp.ReplacePartition) -> str:
        return (
            f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}"
        )

    def projectiondef_sql(self, expression: exp.ProjectionDef) -> str:
        return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}"

    def is_sql(self, expression: exp.Is) -> str:
        is_sql = super().is_sql(expression)

        if isinstance(expression.parent, exp.Not):
            # value IS NOT NULL -> NOT (value IS NULL)
            is_sql = self.wrap(is_sql)

        return is_sql

    def in_sql(self, expression: exp.In) -> str:
        in_sql = super().in_sql(expression)

        if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"):
            in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1)

        return in_sql

    def not_sql(self, expression: exp.Not) -> str:
        if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"):
            # let `GLOBAL IN` child interpose `NOT`
            return self.sql(expression, "this")

        return super().not_sql(expression)

    def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str:
        # If the VALUES clause contains tuples of expressions, we need to treat it
        # as a table since Clickhouse will automatically alias it as such.
        alias = expression.args.get("alias")

        if alias and alias.args.get("columns") and expression.expressions:
            values = expression.expressions[0].expressions
            values_as_table = any(isinstance(value, exp.Tuple) for value in values)
        else:
            values_as_table = True

        return super().values_sql(expression, values_as_table=values_as_table)
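Taken together, these transforms and type mappings determine what transpiled SQL looks like. A small sketch using the top-level transpile helper; the outputs in comments are the results expected under the mappings above:

import sqlglot

# TEXT is collapsed to String (STRING_TYPE_MAPPING) and, since the type is
# not marked non-nullable, datatype_sql wraps it in Nullable(...)
print(sqlglot.transpile("SELECT CAST(a AS TEXT) FROM t", write="clickhouse")[0])
# expected: SELECT CAST(a AS Nullable(String)) FROM t

# TRANSFORMS renames functions, e.g. exp.ApproxDistinct -> uniq
print(sqlglot.transpile("SELECT APPROX_COUNT_DISTINCT(a) FROM t", write="clickhouse")[0])
# expected: SELECT uniq(a) FROM t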
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
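In practice these options are passed through Expression.sql or sqlglot.transpile rather than by constructing a Generator directly; a brief sketch:

import sqlglot

ast = sqlglot.parse_one("SELECT a, b FROM t WHERE a > 1", read="clickhouse")

# pretty, identify, etc. are forwarded to the dialect's Generator
print(ast.sql(dialect="clickhouse", pretty=True, identify=True))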
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- IGNORE_NULLS_IN_FUNC
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_FETCH
- LIMIT_ONLY_LITERALS
- RENAME_TABLE_WITH_DB
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- JSON_KEY_VALUE_PAIR_SEP
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SUPPORTS_WINDOW_EXCLUDE
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- PARAMETER_TOKEN
- NAMED_PLACEHOLDER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- RESERVED_KEYWORDS
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- subquery_sql
- qualify_sql
- unnest_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- bracket_sql
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql