sqlglot.dialects.bigquery
1from __future__ import annotations 2 3import typing as t 4 5 6from sqlglot.optimizer.annotate_types import TypeAnnotator 7 8from sqlglot import exp, jsonpath, tokens 9from sqlglot._typing import E 10from sqlglot.parsers.bigquery import BigQueryParser 11from sqlglot.generators.bigquery import BigQueryGenerator 12from sqlglot.dialects.dialect import ( 13 Dialect, 14 NormalizationStrategy, 15) 16from sqlglot.tokens import TokenType 17from sqlglot.typing.bigquery import EXPRESSION_METADATA 18 19if t.TYPE_CHECKING: 20 from sqlglot.optimizer.annotate_types import TypeAnnotator 21 22 23class BigQuery(Dialect): 24 WEEK_OFFSET = -1 25 UNNEST_COLUMN_ONLY = True 26 SUPPORTS_USER_DEFINED_TYPES = False 27 LOG_BASE_FIRST = False 28 HEX_LOWERCASE = True 29 FORCE_EARLY_ALIAS_REF_EXPANSION = True 30 EXPAND_ONLY_GROUP_ALIAS_REF = True 31 PRESERVE_ORIGINAL_NAMES = True 32 HEX_STRING_IS_INTEGER_TYPE = True 33 BYTE_STRING_IS_BYTES_TYPE = True 34 UUID_IS_STRING_TYPE = True 35 ANNOTATE_ALL_SCOPES = True 36 PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = True 37 TABLES_REFERENCEABLE_AS_COLUMNS = True 38 SUPPORTS_STRUCT_STAR_EXPANSION = True 39 EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = True 40 QUERY_RESULTS_ARE_STRUCTS = True 41 JSON_EXTRACT_SCALAR_SCALAR_ONLY = True 42 JSON_PATH_SINGLE_DOT_IS_WILDCARD = True 43 LEAST_GREATEST_IGNORES_NULLS = False 44 DEFAULT_NULL_TYPE = exp.DType.BIGINT 45 PRIORITIZE_NON_LITERAL_TYPES = True 46 ALIAS_POST_VERSION = False 47 48 # https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap 49 INITCAP_DEFAULT_DELIMITER_CHARS = ' \t\n\r\f\v\\[\\](){}/|<>!?@"^#$&~_,.:;*%+\\-' 50 51 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 52 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 53 54 # bigquery udfs are case sensitive 55 NORMALIZE_FUNCTIONS = False 56 57 # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time 58 TIME_MAPPING = { 59 
"%x": "%m/%d/%y", 60 "%D": "%m/%d/%y", 61 "%E6S": "%S.%f", 62 "%e": "%-d", 63 "%F": "%Y-%m-%d", 64 "%T": "%H:%M:%S", 65 "%c": "%a %b %e %H:%M:%S %Y", 66 } 67 68 INVERSE_TIME_MAPPING = { 69 # Preserve %E6S instead of expanding to %T.%f - since both %E6S & %T.%f are semantically different in BigQuery 70 # %E6S is semantically different from %T.%f: %E6S works as a single atomic specifier for seconds with microseconds, while %T.%f expands incorrectly and fails to parse. 71 "%H:%M:%S.%f": "%H:%M:%E6S", 72 } 73 74 FORMAT_MAPPING = { 75 "dd": "%d", 76 "DD": "%d", 77 "mm": "%m", 78 "MM": "%m", 79 "mon": "%b", 80 "MON": "%b", 81 "month": "%B", 82 "MONTH": "%B", 83 "yyyy": "%Y", 84 "YYYY": "%Y", 85 "yy": "%y", 86 "YY": "%y", 87 "HH": "%I", 88 "HH12": "%I", 89 "hh24": "%H", 90 "HH24": "%H", 91 "mi": "%M", 92 "MI": "%M", 93 "ss": "%S", 94 "SS": "%S", 95 "SSSSS": "%f", 96 "tzh": "%z", 97 "TZH": "%z", 98 } 99 100 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 101 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 102 # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix 103 # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column 104 PSEUDOCOLUMNS = { 105 "_PARTITIONTIME", 106 "_PARTITIONDATE", 107 "_TABLE_SUFFIX", 108 "_FILE_NAME", 109 "_DBT_MAX_PARTITION", 110 } 111 112 # All set operations require either a DISTINCT or ALL specifier 113 SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None) 114 115 # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont 116 COERCES_TO = { 117 **TypeAnnotator.COERCES_TO, 118 exp.DType.BIGDECIMAL: {exp.DType.DOUBLE}, 119 } 120 COERCES_TO[exp.DType.DECIMAL] |= {exp.DType.BIGDECIMAL} 121 COERCES_TO[exp.DType.BIGINT] |= {exp.DType.BIGDECIMAL} 122 
COERCES_TO[exp.DType.VARCHAR] |= { 123 exp.DType.DATE, 124 exp.DType.DATETIME, 125 exp.DType.TIME, 126 exp.DType.TIMESTAMP, 127 exp.DType.TIMESTAMPTZ, 128 } 129 130 EXPRESSION_METADATA = EXPRESSION_METADATA.copy() 131 132 def normalize_identifier(self, expression: E) -> E: 133 if ( 134 isinstance(expression, exp.Identifier) 135 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 136 ): 137 parent = expression.parent 138 while isinstance(parent, exp.Dot): 139 parent = parent.parent 140 141 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 142 # by default. The following check uses a heuristic to detect tables based on whether 143 # they are qualified. This should generally be correct, because tables in BigQuery 144 # must be qualified with at least a dataset, unless @@dataset_id is set. 145 case_sensitive = ( 146 isinstance(parent, exp.UserDefinedFunction) 147 or ( 148 isinstance(parent, exp.Table) 149 and parent.db 150 and (parent.meta_get("quoted_table") or not parent.meta_get("maybe_column")) 151 ) 152 or expression.meta_get("is_table") 153 ) 154 if not case_sensitive: 155 expression.set("this", expression.this.lower()) 156 157 return t.cast(E, expression) 158 159 return super().normalize_identifier(expression) 160 161 class JSONPathTokenizer(jsonpath.JSONPathTokenizer): 162 VAR_TOKENS = { 163 *jsonpath.JSONPathTokenizer.VAR_TOKENS, 164 TokenType.DASH, 165 TokenType.NUMBER, 166 } 167 168 class Tokenizer(tokens.Tokenizer): 169 QUOTES = ["'", '"', '"""', "'''"] 170 COMMENTS = ["--", "#", ("/*", "*/")] 171 IDENTIFIERS = ["`"] 172 STRING_ESCAPES = ["\\"] 173 174 HEX_STRINGS = [("0x", ""), ("0X", "")] 175 176 BYTE_STRINGS = [(prefix + q, q) for q in t.cast(list[str], QUOTES) for prefix in ("b", "B")] 177 178 RAW_STRINGS = [(prefix + q, q) for q in t.cast(list[str], QUOTES) for prefix in ("r", "R")] 179 180 NESTED_COMMENTS = False 181 182 KEYWORDS = { 183 **tokens.Tokenizer.KEYWORDS, 184 "ANY TYPE": 
TokenType.VARIANT, 185 "BEGIN": TokenType.COMMAND, 186 "BEGIN TRANSACTION": TokenType.BEGIN, 187 "BYTEINT": TokenType.INT, 188 "BYTES": TokenType.BINARY, 189 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 190 "DATETIME": TokenType.TIMESTAMP, 191 "DECLARE": TokenType.DECLARE, 192 "ELSEIF": TokenType.COMMAND, 193 "EXCEPTION": TokenType.COMMAND, 194 "EXPORT": TokenType.EXPORT, 195 "FLOAT64": TokenType.DOUBLE, 196 "FOR SYSTEM TIME": TokenType.TIMESTAMP_SNAPSHOT, 197 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 198 "LOOP": TokenType.COMMAND, 199 "MODEL": TokenType.MODEL, 200 "NOT DETERMINISTIC": TokenType.VOLATILE, 201 "RECORD": TokenType.STRUCT, 202 "REPEAT": TokenType.COMMAND, 203 "TIMESTAMP": TokenType.TIMESTAMPTZ, 204 "WHILE": TokenType.COMMAND, 205 } 206 KEYWORDS.pop("DIV") 207 KEYWORDS.pop("VALUES") 208 KEYWORDS.pop("/*+") 209 210 Parser = BigQueryParser 211 212 Generator = BigQueryGenerator
class BigQuery(Dialect):
    """Dialect definition for BigQuery.

    Sets the dialect-level flags and mappings below and wires in the BigQuery
    tokenizer, JSON path tokenizer, parser and generator.
    """

    # First day of the week in DATE_TRUNC(week): -1 is Sunday (0 would be Monday).
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    # The base does not come first in the LOG function.
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    # Alias reference expansion runs before column qualification ...
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    # ... but only for the GROUP BY clause.
    EXPAND_ONLY_GROUP_ALIAS_REF = True
    # Keep the original function name in the node's metadata, e.g. to roundtrip
    # JSON_VALUE vs JSON_EXTRACT_SCALAR, which share an AST node.
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True
    # Byte string literals (b'...') are typed as BYTES/BINARY.
    BYTE_STRING_IS_BYTES_TYPE = True
    UUID_IS_STRING_TYPE = True
    # Annotate all scopes during optimization; used for UNNEST support.
    ANNOTATE_ALL_SCOPES = True
    # A projection alias takes precedence over a same-named source in GROUP BY / HAVING.
    PROJECTION_ALIASES_SHADOW_SOURCE_NAMES = True
    # Tables can be referenced as columns and are then treated as structs.
    TABLES_REFERENCEABLE_AS_COLUMNS = True
    # Struct fields can be expanded with star notation, e.g. t.struct_col.*
    SUPPORTS_STRUCT_STAR_EXPANSION = True
    # SELECT * does not return the pseudocolumns listed in PSEUDOCOLUMNS.
    EXCLUDES_PSEUDOCOLUMNS_FROM_STAR = True
    # Query results are typed as structs in metadata (type inference only).
    QUERY_RESULTS_ARE_STRUCTS = True
    JSON_EXTRACT_SCALAR_SCALAR_ONLY = True
    JSON_PATH_SINGLE_DOT_IS_WILDCARD = True
    # LEAST(1, NULL, 2) -> NULL
    LEAST_GREATEST_IGNORES_NULLS = False
    # The default type of the NULL value in BigQuery is INT64.
    DEFAULT_NULL_TYPE = exp.DType.BIGINT
    PRIORITIZE_NON_LITERAL_TYPES = True
    ALIAS_POST_VERSION = False

    # https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#initcap
    INITCAP_DEFAULT_DELIMITER_CHARS = ' \t\n\r\f\v\\[\\](){}/|<>!?@"^#$&~_,.:;*%+\\-'

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%x": "%m/%d/%y",
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
        "%F": "%Y-%m-%d",
        "%T": "%H:%M:%S",
        "%c": "%a %b %e %H:%M:%S %Y",
    }

    INVERSE_TIME_MAPPING = {
        # Preserve %E6S instead of expanding to %T.%f - since both %E6S & %T.%f are semantically different in BigQuery
        # %E6S is semantically different from %T.%f: %E6S works as a single atomic specifier for seconds with microseconds, while %T.%f expands incorrectly and fails to parse.
        "%H:%M:%S.%f": "%H:%M:%E6S",
    }

    # Used when parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
    FORMAT_MAPPING = {
        "dd": "%d",
        "DD": "%d",
        "mm": "%m",
        "MM": "%m",
        "mon": "%b",
        "MON": "%b",
        "month": "%B",
        "MONTH": "%B",
        "yyyy": "%Y",
        "YYYY": "%Y",
        "yy": "%y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "hh24": "%H",
        "HH24": "%H",
        "mi": "%M",
        "MI": "%M",
        "ss": "%S",
        "SS": "%S",
        "SSSSS": "%f",
        "tzh": "%z",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    # https://cloud.google.com/bigquery/docs/querying-wildcard-tables#scanning_a_range_of_tables_using_table_suffix
    # https://cloud.google.com/bigquery/docs/query-cloud-storage-data#query_the_file_name_pseudo-column
    PSEUDOCOLUMNS = {
        "_PARTITIONTIME",
        "_PARTITIONDATE",
        "_TABLE_SUFFIX",
        "_FILE_NAME",
        "_DBT_MAX_PARTITION",
    }

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#percentile_cont
    # Every inherited target set is copied first: the `|=` statements below update
    # sets in place, and without the copy they would also modify the sets owned by
    # TypeAnnotator.COERCES_TO, which this table is derived from.
    COERCES_TO = {
        **{source: set(targets) for source, targets in TypeAnnotator.COERCES_TO.items()},
        exp.DType.BIGDECIMAL: {exp.DType.DOUBLE},
    }
    COERCES_TO[exp.DType.DECIMAL] |= {exp.DType.BIGDECIMAL}
    COERCES_TO[exp.DType.BIGINT] |= {exp.DType.BIGDECIMAL}
    COERCES_TO[exp.DType.VARCHAR] |= {
        exp.DType.DATE,
        exp.DType.DATETIME,
        exp.DType.TIME,
        exp.DType.TIMESTAMP,
        exp.DType.TIMESTAMPTZ,
    }

    # Class-local copy of the imported mapping.
    EXPRESSION_METADATA = EXPRESSION_METADATA.copy()

    def normalize_identifier(self, expression: E) -> E:
        """Lowercase `expression` in place unless it is treated as case-sensitive.

        Only applies to `exp.Identifier` nodes while the normalization strategy is
        CASE_INSENSITIVE; every other case is delegated to the parent implementation.
        """
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            # Skip over any chain of Dot nodes to find the node that owns the identifier.
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta_get("quoted_table") or not parent.meta_get("maybe_column"))
                )
                or expression.meta_get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
        # Adds DASH and NUMBER to the inherited VAR_TOKENS.
        VAR_TOKENS = {
            *jsonpath.JSONPathTokenizer.VAR_TOKENS,
            TokenType.DASH,
            TokenType.NUMBER,
        }

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        # Every quote style prefixed with b/B (bytes) or r/R (raw).
        BYTE_STRINGS = [(prefix + q, q) for q in t.cast(list[str], QUOTES) for prefix in ("b", "B")]

        RAW_STRINGS = [(prefix + q, q) for q in t.cast(list[str], QUOTES) for prefix in ("r", "R")]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.DECLARE,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "LOOP": TokenType.COMMAND,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "REPEAT": TokenType.COMMAND,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
            "WHILE": TokenType.COMMAND,
        }
        # Drop inherited keyword entries that this tokenizer does not use.
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    Parser = BigQueryParser

    Generator = BigQueryGenerator
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG)
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS ( SELECT 1 AS id, 2 AS my_id ) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except: - BigQuery, which will forward the alias to the GROUP BY + HAVING clauses, i.e., it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1" - ClickHouse, which will forward the alias across the query, i.e., it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether alias reference expansion before qualification should only happen for the GROUP BY clause.
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Whether byte string literals (ex: BigQuery's b'...') are typed as BYTES/BINARY
Whether to annotate all scopes during optimization. Used by BigQuery for UNNEST support.
Whether projection alias names can shadow table/source names in GROUP BY and HAVING clauses.
In BigQuery, when a projection alias has the same name as a source table, the alias takes precedence in GROUP BY and HAVING clauses, and the table becomes inaccessible by that name.
For example, in BigQuery: SELECT id, ARRAY_AGG(col) AS custom_fields FROM custom_fields GROUP BY id HAVING id >= 1
The "custom_fields" source is shadowed by the projection alias, so we cannot qualify "id" with "custom_fields" in GROUP BY/HAVING.
Whether table names can be referenced as columns (treated as structs).
BigQuery allows tables to be referenced as columns in queries, automatically treating them as struct values containing all the table's columns.
For example, in BigQuery: SELECT t FROM my_table AS t -- Returns entire row as a struct
Whether the dialect supports expanding struct fields using star notation (e.g., struct_col.*).
BigQuery allows struct fields to be expanded with the star operator:
SELECT t.struct_col.* FROM table t
RisingWave also allows struct field expansion with the star operator using parentheses:
SELECT (t.struct_col).* FROM table t
This expands to all fields within the struct.
Whether pseudocolumns should be excluded from star expansion (SELECT *).
Pseudocolumns are special dialect-specific columns (e.g., Oracle's ROWNUM, ROWID, LEVEL, or BigQuery's _PARTITIONTIME, _PARTITIONDATE) that are implicitly available but not part of the table schema. When this is True, SELECT * will not include these pseudocolumns; they must be explicitly selected.
Whether query results are typed as structs in metadata for type inference.
In BigQuery, subqueries store their column types as a STRUCT in metadata,
enabling special type inference for ARRAY(SELECT ...) expressions:
ARRAY(SELECT x, y FROM t) → ARRAY<STRUCT<...>>
For single column subqueries, BigQuery unwraps the struct:
ARRAY(SELECT x FROM t) → ARRAY<type_of_x>
This is metadata-only for type inference.
Whether JSON_EXTRACT_SCALAR returns null if a non-scalar value is selected.
Whether a single DOT in a JSON path (e.g. $.) is treated as a valid wildcard key.
Whether LEAST/GREATEST functions ignore NULL values, e.g:
- BigQuery, Snowflake, MySQL, Presto/Trino: LEAST(1, NULL, 2) -> NULL
- Spark, Postgres, DuckDB, TSQL: LEAST(1, NULL, 2) -> 1
The default type of NULL for producing the correct projection type.
For example, in BigQuery the default type of the NULL value is INT64.
Whether to prioritize non-literal types over literals during type annotation.
Whether the table alias comes after version (timestamp or iceberg snapshot).
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy').
If empty, the corresponding trie will be constructed off of TIME_MAPPING.
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL
must be explicitly specified.
def normalize_identifier(self, expression: E) -> E:
    """Normalize `expression` the way BigQuery would resolve it.

    Identifiers are lowercased in place while the dialect's normalization
    strategy is CASE_INSENSITIVE, except for the ones treated as case-sensitive:
    UDF names, names under a qualified table, and identifiers flagged with the
    "is_table" meta entry. Anything else is handed to the parent implementation.

    Args:
        expression: The node to normalize.

    Returns:
        The same node, possibly with its name lowercased.
    """
    if not isinstance(expression, exp.Identifier) or (
        self.normalization_strategy is not NormalizationStrategy.CASE_INSENSITIVE
    ):
        return super().normalize_identifier(expression)

    # Walk up through any chain of Dot nodes to reach the node that owns the identifier.
    owner = expression.parent
    while isinstance(owner, exp.Dot):
        owner = owner.parent

    # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
    # by default. Tables are detected heuristically by checking whether they are
    # qualified, which should generally be correct because BigQuery tables must be
    # qualified with at least a dataset, unless @@dataset_id is set.
    if isinstance(owner, exp.UserDefinedFunction):
        return t.cast(E, expression)

    if (
        isinstance(owner, exp.Table)
        and owner.db
        and (owner.meta_get("quoted_table") or not owner.meta_get("maybe_column"))
    ):
        return t.cast(E, expression)

    if expression.meta_get("is_table"):
        return t.cast(E, expression)

    expression.set("this", expression.this.lower())
    return t.cast(E, expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it
lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so
it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive,
and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system, for example they may always be case-sensitive in Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
Mapping of an escaped sequence (e.g. the two characters `\n`) to its unescaped version (e.g. a literal newline character).
Whether string literals support escape sequences (e.g. \n). Set by the metaclass based on the tokenizer's STRING_ESCAPES.
Whether byte string literals support escape sequences. Set by the metaclass based on the tokenizer's BYTE_STRING_ESCAPES.
class JSONPathTokenizer(jsonpath.JSONPathTokenizer):
    # The inherited VAR_TOKENS plus the DASH and NUMBER token types.
    VAR_TOKENS = {TokenType.DASH, TokenType.NUMBER}.union(
        jsonpath.JSONPathTokenizer.VAR_TOKENS
    )
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BIT_STRINGS
- BYTE_STRINGS
- HEX_STRINGS
- RAW_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- IDENTIFIERS
- QUOTES
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- COMMENTS
- dialect
- tokenize
- sql
- size
- tokens
class Tokenizer(tokens.Tokenizer):
    # String literals: single, double and both triple-quoted forms.
    QUOTES = ["'", '"', '"""', "'''"]
    # Line comments start with "--" or "#"; block comments use /* ... */.
    COMMENTS = ["--", "#", ("/*", "*/")]
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    HEX_STRINGS = [("0x", ""), ("0X", "")]

    # Every quote style, prefixed with b/B.
    BYTE_STRINGS = [
        (marker + quote, quote)
        for quote in t.cast(list[str], QUOTES)
        for marker in ("b", "B")
    ]

    # Every quote style, prefixed with r/R.
    RAW_STRINGS = [
        (marker + quote, quote)
        for quote in t.cast(list[str], QUOTES)
        for marker in ("r", "R")
    ]

    NESTED_COMMENTS = False

    # Start from a copy of the inherited keywords, then layer the overrides on top.
    KEYWORDS = dict(tokens.Tokenizer.KEYWORDS)
    KEYWORDS.update(
        {
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.DECLARE,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "LOOP": TokenType.COMMAND,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "REPEAT": TokenType.COMMAND,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
            "WHILE": TokenType.COMMAND,
        }
    )
    # Remove the inherited entries that this tokenizer drops (KeyError if absent).
    del KEYWORDS["DIV"]
    del KEYWORDS["VALUES"]
    del KEYWORDS["/*+"]
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- ESCAPE_FOLLOW_CHARS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- NUMBERS_CAN_HAVE_DECIMALS
- dialect
- tokenize
- sql
- size
- tokens