sqlglot.dialects.clickhouse
from __future__ import annotations

from sqlglot import exp, tokens
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
)
from sqlglot.generators.clickhouse import ClickHouseGenerator
from sqlglot.parsers.clickhouse import ClickHouseParser
from sqlglot.tokens import TokenType
from sqlglot.typing.clickhouse import EXPRESSION_METADATA


class ClickHouse(Dialect):
    """ClickHouse SQL dialect configuration."""

    INDEX_OFFSET = 1
    NORMALIZE_FUNCTIONS: bool | str = False
    NULL_ORDERING = "nulls_are_last"
    SUPPORTS_USER_DEFINED_TYPES = False
    SAFE_DIVISION = True
    LOG_BASE_FIRST: bool | None = None
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE

    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    UNESCAPED_SEQUENCES = {
        "\\0": "\0",
    }

    CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"}

    SET_OP_DISTINCT_BY_DEFAULT: dict[type[exp.Expr], bool | None] = {
        exp.Except: False,
        exp.Intersect: False,
        exp.Union: None,
    }

    def generate_values_aliases(self, expression: exp.Values) -> list[exp.Identifier]:
        """Return the column aliases to use for a VALUES clause.

        ClickHouse allows VALUES to carry an embedded structure string, e.g.
        VALUES('person String, place String', ('Noah', 'Paris'), ...). When one
        is present, the aliases come from that structure; otherwise ClickHouse's
        default "c1", "c2", ... aliases are produced.
        """
        rows = expression.expressions[0].expressions

        has_structure = (
            len(rows) > 1 and rows[0].is_string and isinstance(rows[1], exp.Tuple)
        )
        if has_structure:
            # Keep only the column name of each definition, e.g.
            # 'person String, place String' -> ['person', 'place']
            names = (coldef.strip().split(" ")[0] for coldef in rows[0].name.split(","))
            return [exp.to_identifier(name) for name in names]

        # No embedded structure: fall back to ClickHouse's default aliases.
        return [exp.to_identifier(f"c{i}") for i in range(1, len(rows[0].expressions) + 1)]

    class Tokenizer(tokens.Tokenizer):
        COMMENTS = ["--", "#", "#!", ("/*", "*/")]
        IDENTIFIERS = ['"', "`"]
        IDENTIFIER_ESCAPES = ["\\"]
        STRING_ESCAPES = ["'", "\\"]
        BIT_STRINGS = [("0b", "")]
        HEX_STRINGS = [("0x", ""), ("0X", "")]
        HEREDOC_STRINGS = ["$"]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            ".:": TokenType.DOTCOLON,
            ".^": TokenType.DOTCARET,
            "ATTACH": TokenType.COMMAND,
            "DATE32": TokenType.DATE32,
            "DETACH": TokenType.DETACH,
            "DATETIME64": TokenType.DATETIME64,
            "DICTIONARY": TokenType.DICTIONARY,
            "DYNAMIC": TokenType.DYNAMIC,
            "ENUM8": TokenType.ENUM8,
            "ENUM16": TokenType.ENUM16,
            "EXCHANGE": TokenType.COMMAND,
            "FINAL": TokenType.FINAL,
            "FIXEDSTRING": TokenType.FIXEDSTRING,
            "FLOAT32": TokenType.FLOAT,
            "FLOAT64": TokenType.DOUBLE,
            "GLOBAL": TokenType.GLOBAL,
            "LOWCARDINALITY": TokenType.LOWCARDINALITY,
            "MAP": TokenType.MAP,
            "NESTED": TokenType.NESTED,
            "NOTHING": TokenType.NOTHING,
            "SAMPLE": TokenType.TABLE_SAMPLE,
            "TUPLE": TokenType.STRUCT,
            "UINT16": TokenType.USMALLINT,
            "UINT32": TokenType.UINT,
            "UINT64": TokenType.UBIGINT,
            "UINT8": TokenType.UTINYINT,
            "IPV4": TokenType.IPV4,
            "IPV6": TokenType.IPV6,
            "POINT": TokenType.POINT,
            "RING": TokenType.RING,
            "LINESTRING": TokenType.LINESTRING,
            "MULTILINESTRING": TokenType.MULTILINESTRING,
            "POLYGON": TokenType.POLYGON,
            "MULTIPOLYGON": TokenType.MULTIPOLYGON,
            "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION,
            "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION,
            "SYSTEM": TokenType.COMMAND,
            "PREWHERE": TokenType.PREWHERE,
        }

        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.HEREDOC_STRING,
        }

    Parser = ClickHouseParser

    Generator = ClickHouseGenerator
class ClickHouse(Dialect):
    """ClickHouse SQL dialect."""

    INDEX_OFFSET = 1
    # False disables function name normalization for this dialect.
    NORMALIZE_FUNCTIONS: bool | str = False
    # Default NULL ordering method to use if not explicitly set.
    NULL_ORDERING = "nulls_are_last"
    SUPPORTS_USER_DEFINED_TYPES = False
    SAFE_DIVISION = True
    # None: two-argument LOG is not supported by this dialect.
    LOG_BASE_FIRST: bool | None = None
    # Alias reference expansion runs before column qualification; ClickHouse
    # forwards aliases across the whole query (WHERE/GROUP BY/HAVING).
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    # Keep the original function name in the node's metadata (useful for
    # roundtripping deprecated vs new functions sharing an AST node).
    PRESERVE_ORIGINAL_NAMES = True
    # Number literals may include underscores for readability.
    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True
    # Hex strings such as x'CC' evaluate to an integer rather than binary/blob.
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE

    # Copy so dialect-local mutations don't leak into the shared mapping.
    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    # Mapping of escaped sequences to their unescaped versions.
    UNESCAPED_SEQUENCES = {
        "\\0": "\0",
    }

    # ClickHouse's equivalent of CREATE SCHEMA is CREATE DATABASE.
    CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"}

    # Whether a set operation uses DISTINCT by default; None means either
    # DISTINCT or ALL must be explicitly specified.
    SET_OP_DISTINCT_BY_DEFAULT: dict[type[exp.Expr], bool | None] = {
        exp.Except: False,
        exp.Intersect: False,
        exp.Union: None,
    }

    def generate_values_aliases(self, expression: exp.Values) -> list[exp.Identifier]:
        """Return column aliases for a VALUES clause, honoring an embedded structure."""
        # Clickhouse allows VALUES to have an embedded structure e.g:
        # VALUES('person String, place String', ('Noah', 'Paris'), ...)
        # In this case, we don't want to qualify the columns
        values = expression.expressions[0].expressions

        structure = (
            values[0]
            if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple))
            else None
        )
        if structure:
            # Split each column definition into the column name e.g:
            # 'person String, place String' -> ['person', 'place']
            structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")]
            column_aliases = [
                exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs
            ]
        else:
            # Default column aliases in CH are "c1", "c2", etc.
            column_aliases = [
                exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions))
            ]

        return column_aliases

    class Tokenizer(tokens.Tokenizer):
        # ClickHouse additionally supports '#' and '#!' line comments.
        COMMENTS = ["--", "#", "#!", ("/*", "*/")]
        IDENTIFIERS = ['"', "`"]
        IDENTIFIER_ESCAPES = ["\\"]
        STRING_ESCAPES = ["'", "\\"]
        BIT_STRINGS = [("0b", "")]
        HEX_STRINGS = [("0x", ""), ("0X", "")]
        HEREDOC_STRINGS = ["$"]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            ".:": TokenType.DOTCOLON,
            ".^": TokenType.DOTCARET,
            "ATTACH": TokenType.COMMAND,
            "DATE32": TokenType.DATE32,
            "DETACH": TokenType.DETACH,
            "DATETIME64": TokenType.DATETIME64,
            "DICTIONARY": TokenType.DICTIONARY,
            "DYNAMIC": TokenType.DYNAMIC,
            "ENUM8": TokenType.ENUM8,
            "ENUM16": TokenType.ENUM16,
            "EXCHANGE": TokenType.COMMAND,
            "FINAL": TokenType.FINAL,
            "FIXEDSTRING": TokenType.FIXEDSTRING,
            "FLOAT32": TokenType.FLOAT,
            "FLOAT64": TokenType.DOUBLE,
            "GLOBAL": TokenType.GLOBAL,
            "LOWCARDINALITY": TokenType.LOWCARDINALITY,
            "MAP": TokenType.MAP,
            "NESTED": TokenType.NESTED,
            "NOTHING": TokenType.NOTHING,
            "SAMPLE": TokenType.TABLE_SAMPLE,
            "TUPLE": TokenType.STRUCT,
            "UINT16": TokenType.USMALLINT,
            "UINT32": TokenType.UINT,
            "UINT64": TokenType.UBIGINT,
            "UINT8": TokenType.UTINYINT,
            "IPV4": TokenType.IPV4,
            "IPV6": TokenType.IPV6,
            "POINT": TokenType.POINT,
            "RING": TokenType.RING,
            "LINESTRING": TokenType.LINESTRING,
            "MULTILINESTRING": TokenType.MULTILINESTRING,
            "POLYGON": TokenType.POLYGON,
            "MULTIPOLYGON": TokenType.MULTIPOLYGON,
            "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION,
            "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION,
            "SYSTEM": TokenType.COMMAND,
            "PREWHERE": TokenType.PREWHERE,
        }

        # NOTE(review): '/*+' (hint start) is removed from the keyword map —
        # presumably ClickHouse has no optimizer-hint comments; confirm.
        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.HEREDOC_STRING,
        }

    Parser = ClickHouseParser

    Generator = ClickHouseGenerator
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Default NULL ordering method to use if not explicitly set.
Possible values: "nulls_are_small", "nulls_are_large", "nulls_are_last"
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG)
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS ( SELECT 1 AS id, 2 AS my_id ) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except: - BigQuery, which will forward the alias to GROUP BY + HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1" - ClickHouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether the name of the function should be preserved inside the node's metadata, can be useful for roundtripping deprecated vs new functions that share an AST node e.g JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery
Whether number literals can include underscores for better readability
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Specifies the strategy according to which identifiers should be normalized.
Mapping of an escaped sequence (e.g. `\n`) to its unescaped version (e.g. a literal newline character).
Helper for dialects that use a different name for the same creatable kind. For example, the Clickhouse equivalent of CREATE SCHEMA is CREATE DATABASE.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL
must be explicitly specified.
46 def generate_values_aliases(self, expression: exp.Values) -> list[exp.Identifier]: 47 # Clickhouse allows VALUES to have an embedded structure e.g: 48 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 49 # In this case, we don't want to qualify the columns 50 values = expression.expressions[0].expressions 51 52 structure = ( 53 values[0] 54 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 55 else None 56 ) 57 if structure: 58 # Split each column definition into the column name e.g: 59 # 'person String, place String' -> ['person', 'place'] 60 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 61 column_aliases = [ 62 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 63 ] 64 else: 65 # Default column aliases in CH are "c1", "c2", etc. 66 column_aliases = [ 67 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 68 ] 69 70 return column_aliases
Whether string literals support escape sequences (e.g. \n). Set by the metaclass based on the tokenizer's STRING_ESCAPES.
Whether byte string literals support escape sequences. Set by the metaclass based on the tokenizer's BYTE_STRING_ESCAPES.
    class Tokenizer(tokens.Tokenizer):
        """ClickHouse tokenizer settings."""

        # ClickHouse additionally supports '#' and '#!' line comments.
        COMMENTS = ["--", "#", "#!", ("/*", "*/")]
        IDENTIFIERS = ['"', "`"]
        IDENTIFIER_ESCAPES = ["\\"]
        STRING_ESCAPES = ["'", "\\"]
        # 0b / 0x prefixed literals with no closing delimiter.
        BIT_STRINGS = [("0b", "")]
        HEX_STRINGS = [("0x", ""), ("0X", "")]
        HEREDOC_STRINGS = ["$"]

        # ClickHouse-specific keywords and type names layered over the defaults.
        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            ".:": TokenType.DOTCOLON,
            ".^": TokenType.DOTCARET,
            "ATTACH": TokenType.COMMAND,
            "DATE32": TokenType.DATE32,
            "DETACH": TokenType.DETACH,
            "DATETIME64": TokenType.DATETIME64,
            "DICTIONARY": TokenType.DICTIONARY,
            "DYNAMIC": TokenType.DYNAMIC,
            "ENUM8": TokenType.ENUM8,
            "ENUM16": TokenType.ENUM16,
            "EXCHANGE": TokenType.COMMAND,
            "FINAL": TokenType.FINAL,
            "FIXEDSTRING": TokenType.FIXEDSTRING,
            "FLOAT32": TokenType.FLOAT,
            "FLOAT64": TokenType.DOUBLE,
            "GLOBAL": TokenType.GLOBAL,
            "LOWCARDINALITY": TokenType.LOWCARDINALITY,
            "MAP": TokenType.MAP,
            "NESTED": TokenType.NESTED,
            "NOTHING": TokenType.NOTHING,
            "SAMPLE": TokenType.TABLE_SAMPLE,
            "TUPLE": TokenType.STRUCT,
            "UINT16": TokenType.USMALLINT,
            "UINT32": TokenType.UINT,
            "UINT64": TokenType.UBIGINT,
            "UINT8": TokenType.UTINYINT,
            "IPV4": TokenType.IPV4,
            "IPV6": TokenType.IPV6,
            "POINT": TokenType.POINT,
            "RING": TokenType.RING,
            "LINESTRING": TokenType.LINESTRING,
            "MULTILINESTRING": TokenType.MULTILINESTRING,
            "POLYGON": TokenType.POLYGON,
            "MULTIPOLYGON": TokenType.MULTIPOLYGON,
            "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION,
            "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION,
            "SYSTEM": TokenType.COMMAND,
            "PREWHERE": TokenType.PREWHERE,
        }

        # NOTE(review): the '/*+' hint-start keyword is dropped — presumably
        # ClickHouse does not use optimizer-hint comments; confirm.
        KEYWORDS.pop("/*+")

        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.HEREDOC_STRING,
        }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BYTE_STRINGS
- RAW_STRINGS
- UNICODE_STRINGS
- QUOTES
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- NUMBERS_CAN_HAVE_DECIMALS
- dialect
- tokenize
- sql
- size
- tokens