sqlglot.dialects.hive
from __future__ import annotations

from copy import deepcopy
from collections import defaultdict

from sqlglot import exp, jsonpath, tokens
from sqlglot.dialects.dialect import Dialect, NormalizationStrategy
from sqlglot.generators.hive import HiveGenerator
from sqlglot.parsers.hive import HiveParser
from sqlglot.tokens import TokenType
from sqlglot.optimizer.annotate_types import TypeAnnotator
from sqlglot.typing.hive import EXPRESSION_METADATA


class Hive(Dialect):
    """Hive dialect: feature flags, time-format mapping, tokenizers, parser and generator."""

    # ---- Feature switches ------------------------------------------------
    ALIAS_POST_TABLESAMPLE = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SAFE_DIVISION = True
    # A NULL argument to CONCAT_WS is skipped instead of making the result NULL.
    CONCAT_WS_COALESCE = True
    # Hive does not update the schema of existing partitions when a column is
    # changed unless CASCADE is given, so the clause is supported here.
    ALTER_TABLE_SUPPORTS_CASCADE = True
    ARRAY_AGG_INCLUDES_NULLS = None
    REGEXP_EXTRACT_DEFAULT_GROUP = 1

    # Strategy used to normalize identifiers; see
    # https://spark.apache.org/docs/latest/sql-ref-identifier.html#description
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # Copy of the module-level EXPRESSION_METADATA imported from sqlglot.typing.hive.
    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    # Default INITCAP delimiter characters; see
    # https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=27362046#LanguageManualUDF-StringFunctions
    # https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java#L266-L269
    INITCAP_DEFAULT_DELIMITER_CHARS = " \t\n\r\f\u000b\u001c\u001d\u001e\u001f"

    # ---- Default format strings (the single quotes are part of the value) --
    DATE_FORMAT = "'yyyy-MM-dd'"
    DATEINT_FORMAT = "'yyyyMMdd'"
    TIME_FORMAT = "'yyyy-MM-dd HH:mm:ss'"

    # ---- Type coercion ----------------------------------------------------
    # Support only the non-ANSI mode (default for Hive, Spark2, Spark).
    # Starts from a deep copy of the annotator's table, then every numeric,
    # temporal and interval entry additionally gains the text types.
    # NOTE: the loop variable `target_type` stays behind as a class attribute.
    COERCES_TO = defaultdict(set, deepcopy(TypeAnnotator.COERCES_TO))
    for target_type in {
        *exp.DataType.NUMERIC_TYPES,
        *exp.DataType.TEMPORAL_TYPES,
        exp.DType.INTERVAL,
    }:
        COERCES_TO[target_type] |= exp.DataType.TEXT_TYPES

    # ---- Time format tokens -> Python strftime directives -----------------
    TIME_MAPPING = {
        # year
        "y": "%Y",
        "Y": "%Y",
        "YYYY": "%Y",
        "yyyy": "%Y",
        "YY": "%y",
        "yy": "%y",
        # month
        "MMMM": "%B",
        "MMM": "%b",
        "MM": "%m",
        "M": "%-m",
        # day of month
        "dd": "%d",
        "d": "%-d",
        # hour (24h, then 12h)
        "HH": "%H",
        "H": "%-H",
        "hh": "%I",
        "h": "%-I",
        # minute
        "mm": "%M",
        "m": "%-M",
        # second and fraction
        "ss": "%S",
        "s": "%-S",
        "SSSSSS": "%f",
        # AM/PM marker
        "a": "%p",
        # day of year
        "DD": "%j",
        "D": "%-j",
        # weekday name
        "E": "%a",
        "EE": "%a",
        "EEE": "%a",
        "EEEE": "%A",
        # time zone
        "z": "%Z",
        "Z": "%z",
    }

    class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
        """JSON path tokenizer whose VAR_TOKENS additionally contain DASH."""

        # Presumably lets keys containing "-" be read as one variable; confirm
        # against the base tokenizer.
        VAR_TOKENS = {
            *jsonpath.JSONPathTokenizer.VAR_TOKENS,
            TokenType.DASH,
        }

    class Tokenizer(tokens.Tokenizer):
        """Tokenizer overrides for Hive: quoting, extra keywords and literal markers."""

        QUOTES = ["'", '"']
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        # "$" is tokenized as a parameter marker on top of the base single tokens.
        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.PARAMETER,
        }

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            # Resource and maintenance statements are tokenized as COMMAND.
            "ADD ARCHIVE": TokenType.COMMAND,
            "ADD ARCHIVES": TokenType.COMMAND,
            "ADD FILE": TokenType.COMMAND,
            "ADD FILES": TokenType.COMMAND,
            "ADD JAR": TokenType.COMMAND,
            "ADD JARS": TokenType.COMMAND,
            # MINUS shares the EXCEPT token.
            "MINUS": TokenType.EXCEPT,
            "MSCK REPAIR": TokenType.COMMAND,
            "REFRESH": TokenType.REFRESH,
            "TIMESTAMP AS OF": TokenType.TIMESTAMP_SNAPSHOT,
            "VERSION AS OF": TokenType.VERSION_SNAPSHOT,
            "SERDEPROPERTIES": TokenType.SERDE_PROPERTIES,
        }

        # Literal marker -> type name; presumably the postfix of a numeric
        # literal such as Hive's `100L` (confirm against the base tokenizer).
        NUMERIC_LITERALS = {
            "L": "BIGINT",
            "S": "SMALLINT",
            "Y": "TINYINT",
            "D": "DOUBLE",
            "F": "FLOAT",
            "BD": "DECIMAL",
        }

    Parser = HiveParser

    Generator = HiveGenerator
class Hive(Dialect):
    """Hive dialect: feature flags, time-format mapping, tokenizers, parser and generator."""

    # ---- Feature switches ------------------------------------------------
    ALIAS_POST_TABLESAMPLE = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SAFE_DIVISION = True
    # A NULL argument to CONCAT_WS is skipped instead of making the result NULL.
    CONCAT_WS_COALESCE = True
    # Hive does not update the schema of existing partitions when a column is
    # changed unless CASCADE is given, so the clause is supported here.
    ALTER_TABLE_SUPPORTS_CASCADE = True
    ARRAY_AGG_INCLUDES_NULLS = None
    REGEXP_EXTRACT_DEFAULT_GROUP = 1

    # Strategy used to normalize identifiers; see
    # https://spark.apache.org/docs/latest/sql-ref-identifier.html#description
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # Copy of the module-level EXPRESSION_METADATA imported from sqlglot.typing.hive.
    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    # Default INITCAP delimiter characters; see
    # https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=27362046#LanguageManualUDF-StringFunctions
    # https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java#L266-L269
    INITCAP_DEFAULT_DELIMITER_CHARS = " \t\n\r\f\u000b\u001c\u001d\u001e\u001f"

    # ---- Default format strings (the single quotes are part of the value) --
    DATE_FORMAT = "'yyyy-MM-dd'"
    DATEINT_FORMAT = "'yyyyMMdd'"
    TIME_FORMAT = "'yyyy-MM-dd HH:mm:ss'"

    # ---- Type coercion ----------------------------------------------------
    # Support only the non-ANSI mode (default for Hive, Spark2, Spark).
    # Starts from a deep copy of the annotator's table, then every numeric,
    # temporal and interval entry additionally gains the text types.
    # NOTE: the loop variable `target_type` stays behind as a class attribute.
    COERCES_TO = defaultdict(set, deepcopy(TypeAnnotator.COERCES_TO))
    for target_type in {
        *exp.DataType.NUMERIC_TYPES,
        *exp.DataType.TEMPORAL_TYPES,
        exp.DType.INTERVAL,
    }:
        COERCES_TO[target_type] |= exp.DataType.TEXT_TYPES

    # ---- Time format tokens -> Python strftime directives -----------------
    TIME_MAPPING = {
        # year
        "y": "%Y",
        "Y": "%Y",
        "YYYY": "%Y",
        "yyyy": "%Y",
        "YY": "%y",
        "yy": "%y",
        # month
        "MMMM": "%B",
        "MMM": "%b",
        "MM": "%m",
        "M": "%-m",
        # day of month
        "dd": "%d",
        "d": "%-d",
        # hour (24h, then 12h)
        "HH": "%H",
        "H": "%-H",
        "hh": "%I",
        "h": "%-I",
        # minute
        "mm": "%M",
        "m": "%-M",
        # second and fraction
        "ss": "%S",
        "s": "%-S",
        "SSSSSS": "%f",
        # AM/PM marker
        "a": "%p",
        # day of year
        "DD": "%j",
        "D": "%-j",
        # weekday name
        "E": "%a",
        "EE": "%a",
        "EEE": "%a",
        "EEEE": "%A",
        # time zone
        "z": "%Z",
        "Z": "%z",
    }

    class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
        """JSON path tokenizer whose VAR_TOKENS additionally contain DASH."""

        # Presumably lets keys containing "-" be read as one variable; confirm
        # against the base tokenizer.
        VAR_TOKENS = {
            *jsonpath.JSONPathTokenizer.VAR_TOKENS,
            TokenType.DASH,
        }

    class Tokenizer(tokens.Tokenizer):
        """Tokenizer overrides for Hive: quoting, extra keywords and literal markers."""

        QUOTES = ["'", '"']
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        # "$" is tokenized as a parameter marker on top of the base single tokens.
        SINGLE_TOKENS = {
            **tokens.Tokenizer.SINGLE_TOKENS,
            "$": TokenType.PARAMETER,
        }

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            # Resource and maintenance statements are tokenized as COMMAND.
            "ADD ARCHIVE": TokenType.COMMAND,
            "ADD ARCHIVES": TokenType.COMMAND,
            "ADD FILE": TokenType.COMMAND,
            "ADD FILES": TokenType.COMMAND,
            "ADD JAR": TokenType.COMMAND,
            "ADD JARS": TokenType.COMMAND,
            # MINUS shares the EXCEPT token.
            "MINUS": TokenType.EXCEPT,
            "MSCK REPAIR": TokenType.COMMAND,
            "REFRESH": TokenType.REFRESH,
            "TIMESTAMP AS OF": TokenType.TIMESTAMP_SNAPSHOT,
            "VERSION AS OF": TokenType.VERSION_SNAPSHOT,
            "SERDEPROPERTIES": TokenType.SERDE_PROPERTIES,
        }

        # Literal marker -> type name; presumably the postfix of a numeric
        # literal such as Hive's `100L` (confirm against the base tokenizer).
        NUMERIC_LITERALS = {
            "L": "BIGINT",
            "S": "SMALLINT",
            "Y": "TINYINT",
            "D": "DOUBLE",
            "F": "FLOAT",
            "BD": "DECIMAL",
        }

    Parser = HiveParser

    Generator = HiveGenerator
A NULL arg in CONCAT_WS yields NULL by default, but in some dialects it is skipped.
By default, Hive does not update the schema of existing partitions when a column is changed. The CASCADE clause is used to indicate that the change should be propagated to all existing partitions. The Spark dialect, while derived from Hive, does not support the CASCADE clause.
Specifies the strategy according to which identifiers should be normalized.
Associates this dialect's time formats with their equivalent Python strftime formats.
Mapping of an escaped sequence (`\n`) to its unescaped version (a literal newline character).
Whether string literals support escape sequences (e.g. \n). Set by the metaclass based on the tokenizer's STRING_ESCAPES.
Whether byte string literals support escape sequences. Set by the metaclass based on the tokenizer's BYTE_STRING_ESCAPES.
class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
    """JSON path tokenizer whose VAR_TOKENS additionally contain DASH."""

    # Presumably lets keys containing "-" be read as one variable; confirm
    # against the base tokenizer.
    VAR_TOKENS = {
        *jsonpath.JSONPathTokenizer.VAR_TOKENS,
        TokenType.DASH,
    }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- tokenize
- sql
- size
- tokens
class Tokenizer(tokens.Tokenizer):
    """Tokenizer overrides for Hive: quoting, extra keywords and literal markers."""

    QUOTES = ["'", '"']
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    # "$" is tokenized as a parameter marker on top of the base single tokens.
    SINGLE_TOKENS = {
        **tokens.Tokenizer.SINGLE_TOKENS,
        "$": TokenType.PARAMETER,
    }

    KEYWORDS = {
        **tokens.Tokenizer.KEYWORDS,
        # Resource and maintenance statements are tokenized as COMMAND.
        "ADD ARCHIVE": TokenType.COMMAND,
        "ADD ARCHIVES": TokenType.COMMAND,
        "ADD FILE": TokenType.COMMAND,
        "ADD FILES": TokenType.COMMAND,
        "ADD JAR": TokenType.COMMAND,
        "ADD JARS": TokenType.COMMAND,
        # MINUS shares the EXCEPT token.
        "MINUS": TokenType.EXCEPT,
        "MSCK REPAIR": TokenType.COMMAND,
        "REFRESH": TokenType.REFRESH,
        "TIMESTAMP AS OF": TokenType.TIMESTAMP_SNAPSHOT,
        "VERSION AS OF": TokenType.VERSION_SNAPSHOT,
        "SERDEPROPERTIES": TokenType.SERDE_PROPERTIES,
    }

    # Literal marker -> type name; presumably the postfix of a numeric
    # literal such as Hive's `100L` (confirm against the base tokenizer).
    NUMERIC_LITERALS = {
        "L": "BIGINT",
        "S": "SMALLINT",
        "Y": "TINYINT",
        "D": "DOUBLE",
        "F": "FLOAT",
        "BD": "DECIMAL",
    }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMBERS_CAN_HAVE_DECIMALS
- COMMENTS
- dialect
- tokenize
- sql
- size
- tokens