sqlglot.dialects.clickhouse
1from __future__ import annotations 2import typing as t 3import datetime 4from sqlglot import exp, generator, parser, tokens 5from sqlglot._typing import E 6from sqlglot.dialects.dialect import ( 7 Dialect, 8 NormalizationStrategy, 9 arg_max_or_min_no_count, 10 build_date_delta, 11 build_formatted_time, 12 inline_array_sql, 13 json_extract_segments, 14 json_path_key_only_name, 15 length_or_char_length_sql, 16 no_pivot_sql, 17 build_json_extract_path, 18 rename_func, 19 remove_from_array_using_filter, 20 sha256_sql, 21 strposition_sql, 22 var_map_sql, 23 timestamptrunc_sql, 24 unit_to_var, 25 trim_sql, 26) 27from sqlglot.generator import Generator 28from sqlglot.helper import is_int, seq_get 29from sqlglot.tokens import Token, TokenType 30from sqlglot.generator import unsupported_args 31 32DATEΤΙΜΕ_DELTA = t.Union[exp.DateAdd, exp.DateDiff, exp.DateSub, exp.TimestampSub, exp.TimestampAdd] 33 34 35def _build_datetime_format( 36 expr_type: t.Type[E], 37) -> t.Callable[[t.List], E]: 38 def _builder(args: t.List) -> E: 39 expr = build_formatted_time(expr_type, "clickhouse")(args) 40 41 timezone = seq_get(args, 2) 42 if timezone: 43 expr.set("zone", timezone) 44 45 return expr 46 47 return _builder 48 49 50def _unix_to_time_sql(self: ClickHouse.Generator, expression: exp.UnixToTime) -> str: 51 scale = expression.args.get("scale") 52 timestamp = expression.this 53 54 if scale in (None, exp.UnixToTime.SECONDS): 55 return self.func("fromUnixTimestamp", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 56 if scale == exp.UnixToTime.MILLIS: 57 return self.func("fromUnixTimestamp64Milli", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 58 if scale == exp.UnixToTime.MICROS: 59 return self.func("fromUnixTimestamp64Micro", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 60 if scale == exp.UnixToTime.NANOS: 61 return self.func("fromUnixTimestamp64Nano", exp.cast(timestamp, exp.DataType.Type.BIGINT)) 62 63 return self.func( 64 "fromUnixTimestamp", 65 exp.cast( 66 exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT 67 ), 68 ) 69 70 71def _lower_func(sql: str) -> str: 72 index = sql.index("(") 73 return sql[:index].lower() + sql[index:] 74 75 76def _quantile_sql(self: ClickHouse.Generator, expression: exp.Quantile) -> str: 77 quantile = expression.args["quantile"] 78 args = f"({self.sql(expression, 'this')})" 79 80 if isinstance(quantile, exp.Array): 81 func = self.func("quantiles", *quantile) 82 else: 83 func = self.func("quantile", quantile) 84 85 return func + args 86 87 88def _build_count_if(args: t.List) -> exp.CountIf | exp.CombinedAggFunc: 89 if len(args) == 1: 90 return exp.CountIf(this=seq_get(args, 0)) 91 92 return exp.CombinedAggFunc(this="countIf", expressions=args) 93 94 95def _build_str_to_date(args: t.List) -> exp.Cast | exp.Anonymous: 96 if len(args) == 3: 97 return exp.Anonymous(this="STR_TO_DATE", expressions=args) 98 99 strtodate = exp.StrToDate.from_arg_list(args) 100 return exp.cast(strtodate, exp.DataType.build(exp.DataType.Type.DATETIME)) 101 102 103def _datetime_delta_sql(name: str) -> t.Callable[[Generator, DATEΤΙΜΕ_DELTA], str]: 104 def _delta_sql(self: Generator, expression: DATEΤΙΜΕ_DELTA) -> str: 105 if not expression.unit: 106 return rename_func(name)(self, expression) 107 108 return self.func( 109 name, 110 unit_to_var(expression), 111 expression.expression, 112 expression.this, 113 expression.args.get("zone"), 114 ) 115 116 return _delta_sql 117 118 119def _timestrtotime_sql(self: ClickHouse.Generator, expression: exp.TimeStrToTime): 120 ts = expression.this 121 122 tz = expression.args.get("zone") 123 if tz and isinstance(ts, exp.Literal): 124 # Clickhouse will not accept timestamps that include a UTC offset, so we must remove them. 125 # The first step to removing is parsing the string with `datetime.datetime.fromisoformat`. 126 # 127 # In python <3.11, `fromisoformat()` can only parse timestamps of millisecond (3 digit) 128 # or microsecond (6 digit) precision. It will error if passed any other number of fractional 129 # digits, so we extract the fractional seconds and pad to 6 digits before parsing. 130 ts_string = ts.name.strip() 131 132 # separate [date and time] from [fractional seconds and UTC offset] 133 ts_parts = ts_string.split(".") 134 if len(ts_parts) == 2: 135 # separate fractional seconds and UTC offset 136 offset_sep = "+" if "+" in ts_parts[1] else "-" 137 ts_frac_parts = ts_parts[1].split(offset_sep) 138 num_frac_parts = len(ts_frac_parts) 139 140 # pad to 6 digits if fractional seconds present 141 ts_frac_parts[0] = ts_frac_parts[0].ljust(6, "0") 142 ts_string = "".join( 143 [ 144 ts_parts[0], # date and time 145 ".", 146 ts_frac_parts[0], # fractional seconds 147 offset_sep if num_frac_parts > 1 else "", 148 ts_frac_parts[1] if num_frac_parts > 1 else "", # utc offset (if present) 149 ] 150 ) 151 152 # return literal with no timezone, eg turn '2020-01-01 12:13:14-08:00' into '2020-01-01 12:13:14' 153 # this is because Clickhouse encodes the timezone as a data type parameter and throws an error if 154 # it's part of the timestamp string 155 ts_without_tz = ( 156 datetime.datetime.fromisoformat(ts_string).replace(tzinfo=None).isoformat(sep=" ") 157 ) 158 ts = exp.Literal.string(ts_without_tz) 159 160 # Non-nullable DateTime64 with microsecond precision 161 expressions = [exp.DataTypeParam(this=tz)] if tz else [] 162 datatype = exp.DataType.build( 163 exp.DataType.Type.DATETIME64, 164 expressions=[exp.DataTypeParam(this=exp.Literal.number(6)), *expressions], 165 nullable=False, 166 ) 167 168 return self.sql(exp.cast(ts, datatype, dialect=self.dialect)) 169 170 171def _map_sql(self: ClickHouse.Generator, expression: exp.Map | exp.VarMap) -> str: 172 if not (expression.parent and expression.parent.arg_key == "settings"): 173 return _lower_func(var_map_sql(self, expression)) 174 175 keys = expression.args.get("keys") 176 values = expression.args.get("values") 177 178 if not isinstance(keys, exp.Array) or not isinstance(values, exp.Array): 179 self.unsupported("Cannot convert array columns into map.") 180 return "" 181 182 args = [] 183 for key, value in zip(keys.expressions, values.expressions): 184 args.append(f"{self.sql(key)}: {self.sql(value)}") 185 186 csv_args = ", ".join(args) 187 188 return f"{{{csv_args}}}" 189 190 191class ClickHouse(Dialect): 192 NORMALIZE_FUNCTIONS: bool | str = False 193 NULL_ORDERING = "nulls_are_last" 194 SUPPORTS_USER_DEFINED_TYPES = False 195 SAFE_DIVISION = True 196 LOG_BASE_FIRST: t.Optional[bool] = None 197 FORCE_EARLY_ALIAS_REF_EXPANSION = True 198 PRESERVE_ORIGINAL_NAMES = True 199 NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True 200 IDENTIFIERS_CAN_START_WITH_DIGIT = True 201 HEX_STRING_IS_INTEGER_TYPE = True 202 203 # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779 204 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE 205 206 UNESCAPED_SEQUENCES = { 207 "\\0": "\0", 208 } 209 210 CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"} 211 212 SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = { 213 exp.Except: False, 214 exp.Intersect: False, 215 exp.Union: None, 216 } 217 218 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 219 # Clickhouse allows VALUES to have an embedded structure e.g: 220 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 221 # In this case, we don't want to qualify the columns 222 values = expression.expressions[0].expressions 223 224 structure = ( 225 values[0] 226 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 227 else None 228 ) 229 if structure: 230 # Split each column definition into the column name e.g: 231 # 'person String, place String' -> ['person', 'place'] 232 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 233 column_aliases = [ 234 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 235 ] 236 else: 237 # Default column aliases in CH are "c1", "c2", etc. 238 column_aliases = [ 239 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 240 ] 241 242 return column_aliases 243 244 class Tokenizer(tokens.Tokenizer): 245 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 246 IDENTIFIERS = ['"', "`"] 247 IDENTIFIER_ESCAPES = ["\\"] 248 STRING_ESCAPES = ["'", "\\"] 249 BIT_STRINGS = [("0b", "")] 250 HEX_STRINGS = [("0x", ""), ("0X", "")] 251 HEREDOC_STRINGS = ["$"] 252 253 KEYWORDS = { 254 **tokens.Tokenizer.KEYWORDS, 255 ".:": TokenType.DOTCOLON, 256 "ATTACH": TokenType.COMMAND, 257 "DATE32": TokenType.DATE32, 258 "DATETIME64": TokenType.DATETIME64, 259 "DICTIONARY": TokenType.DICTIONARY, 260 "DYNAMIC": TokenType.DYNAMIC, 261 "ENUM8": TokenType.ENUM8, 262 "ENUM16": TokenType.ENUM16, 263 "EXCHANGE": TokenType.COMMAND, 264 "FINAL": TokenType.FINAL, 265 "FIXEDSTRING": TokenType.FIXEDSTRING, 266 "FLOAT32": TokenType.FLOAT, 267 "FLOAT64": TokenType.DOUBLE, 268 "GLOBAL": TokenType.GLOBAL, 269 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 270 "MAP": TokenType.MAP, 271 "NESTED": TokenType.NESTED, 272 "NOTHING": TokenType.NOTHING, 273 "SAMPLE": TokenType.TABLE_SAMPLE, 274 "TUPLE": TokenType.STRUCT, 275 "UINT16": TokenType.USMALLINT, 276 "UINT32": TokenType.UINT, 277 "UINT64": TokenType.UBIGINT, 278 "UINT8": TokenType.UTINYINT, 279 "IPV4": TokenType.IPV4, 280 "IPV6": TokenType.IPV6, 281 "POINT": TokenType.POINT, 282 "RING": TokenType.RING, 283 "LINESTRING": TokenType.LINESTRING, 284 "MULTILINESTRING": TokenType.MULTILINESTRING, 285 "POLYGON": TokenType.POLYGON, 286 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 287 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 288 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 289 "SYSTEM": TokenType.COMMAND, 290 "PREWHERE": TokenType.PREWHERE, 291 } 292 KEYWORDS.pop("/*+") 293 294 SINGLE_TOKENS = { 295 **tokens.Tokenizer.SINGLE_TOKENS, 296 "$": TokenType.HEREDOC_STRING, 297 } 298 299 class Parser(parser.Parser): 300 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 301 # * select x from t1 union all select x from t2 limit 1; 302 # * select x from t1 union all (select x from t2 limit 1); 303 MODIFIERS_ATTACHED_TO_SET_OP = False 304 INTERVAL_SPANS = False 305 OPTIONAL_ALIAS_TOKEN_CTE = False 306 JOINS_HAVE_EQUAL_PRECEDENCE = True 307 308 FUNCTIONS = { 309 **parser.Parser.FUNCTIONS, 310 "ANY": exp.AnyValue.from_arg_list, 311 "ARRAYSUM": exp.ArraySum.from_arg_list, 312 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 313 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 314 "COUNTIF": _build_count_if, 315 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 316 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 317 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 318 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 319 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 320 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 321 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 322 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 323 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 324 "JSONEXTRACTSTRING": build_json_extract_path( 325 exp.JSONExtractScalar, zero_based_indexing=False 326 ), 327 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 328 "L2Distance": exp.EuclideanDistance.from_arg_list, 329 "MAP": parser.build_var_map, 330 "MATCH": exp.RegexpLike.from_arg_list, 331 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 332 "RANDCANONICAL": exp.Rand.from_arg_list, 333 "STR_TO_DATE": _build_str_to_date, 334 "TUPLE": exp.Struct.from_arg_list, 335 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 336 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 337 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 338 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 339 "UNIQ": exp.ApproxDistinct.from_arg_list, 340 "XOR": lambda args: exp.Xor(expressions=args), 341 "MD5": exp.MD5Digest.from_arg_list, 342 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 343 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 344 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 345 "TOTYPENAME": exp.Typeof.from_arg_list, 346 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 347 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 348 } 349 FUNCTIONS.pop("TRANSFORM") 350 FUNCTIONS.pop("APPROX_TOP_SUM") 351 352 AGG_FUNCTIONS = { 353 "count", 354 "min", 355 "max", 356 "sum", 357 "avg", 358 "any", 359 "stddevPop", 360 "stddevSamp", 361 "varPop", 362 "varSamp", 363 "corr", 364 "covarPop", 365 "covarSamp", 366 "entropy", 367 "exponentialMovingAverage", 368 "intervalLengthSum", 369 "kolmogorovSmirnovTest", 370 "mannWhitneyUTest", 371 "median", 372 "rankCorr", 373 "sumKahan", 374 "studentTTest", 375 "welchTTest", 376 "anyHeavy", 377 "anyLast", 378 "boundingRatio", 379 "first_value", 380 "last_value", 381 "argMin", 382 "argMax", 383 "avgWeighted", 384 "topK", 385 "approx_top_sum", 386 "topKWeighted", 387 "deltaSum", 388 "deltaSumTimestamp", 389 "groupArray", 390 "groupArrayLast", 391 "groupUniqArray", 392 "groupArrayInsertAt", 393 "groupArrayMovingAvg", 394 "groupArrayMovingSum", 395 "groupArraySample", 396 "groupBitAnd", 397 "groupBitOr", 398 "groupBitXor", 399 "groupBitmap", 400 "groupBitmapAnd", 401 "groupBitmapOr", 402 "groupBitmapXor", 403 "sumWithOverflow", 404 "sumMap", 405 "minMap", 406 "maxMap", 407 "skewSamp", 408 "skewPop", 409 "kurtSamp", 410 "kurtPop", 411 "uniq", 412 "uniqExact", 413 "uniqCombined", 414 "uniqCombined64", 415 "uniqHLL12", 416 "uniqTheta", 417 "quantile", 418 "quantiles", 419 "quantileExact", 420 "quantilesExact", 421 "quantileExactLow", 422 "quantilesExactLow", 423 "quantileExactHigh", 424 "quantilesExactHigh", 425 "quantileExactWeighted", 426 "quantilesExactWeighted", 427 "quantileTiming", 428 "quantilesTiming", 429 "quantileTimingWeighted", 430 "quantilesTimingWeighted", 431 "quantileDeterministic", 432 "quantilesDeterministic", 433 "quantileTDigest", 434 "quantilesTDigest", 435 "quantileTDigestWeighted", 436 "quantilesTDigestWeighted", 437 "quantileBFloat16", 438 "quantilesBFloat16", 439 "quantileBFloat16Weighted", 440 "quantilesBFloat16Weighted", 441 "simpleLinearRegression", 442 "stochasticLinearRegression", 443 "stochasticLogisticRegression", 444 "categoricalInformationValue", 445 "contingency", 446 "cramersV", 447 "cramersVBiasCorrected", 448 "theilsU", 449 "maxIntersections", 450 "maxIntersectionsPosition", 451 "meanZTest", 452 "quantileInterpolatedWeighted", 453 "quantilesInterpolatedWeighted", 454 "quantileGK", 455 "quantilesGK", 456 "sparkBar", 457 "sumCount", 458 "largestTriangleThreeBuckets", 459 "histogram", 460 "sequenceMatch", 461 "sequenceCount", 462 "windowFunnel", 463 "retention", 464 "uniqUpTo", 465 "sequenceNextNode", 466 "exponentialTimeDecayedAvg", 467 } 468 469 AGG_FUNCTIONS_SUFFIXES = [ 470 "If", 471 "Array", 472 "ArrayIf", 473 "Map", 474 "SimpleState", 475 "State", 476 "Merge", 477 "MergeState", 478 "ForEach", 479 "Distinct", 480 "OrDefault", 481 "OrNull", 482 "Resample", 483 "ArgMin", 484 "ArgMax", 485 ] 486 487 FUNC_TOKENS = { 488 *parser.Parser.FUNC_TOKENS, 489 TokenType.AND, 490 TokenType.OR, 491 TokenType.SET, 492 } 493 494 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 495 496 ID_VAR_TOKENS = { 497 *parser.Parser.ID_VAR_TOKENS, 498 TokenType.LIKE, 499 } 500 501 AGG_FUNC_MAPPING = ( 502 lambda functions, suffixes: { 503 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 504 } 505 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 506 507 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"} 508 509 FUNCTION_PARSERS = { 510 **parser.Parser.FUNCTION_PARSERS, 511 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 512 "QUANTILE": lambda self: self._parse_quantile(), 513 "MEDIAN": lambda self: self._parse_quantile(), 514 "COLUMNS": lambda self: self._parse_columns(), 515 } 516 517 FUNCTION_PARSERS.pop("MATCH") 518 519 PROPERTY_PARSERS = { 520 **parser.Parser.PROPERTY_PARSERS, 521 "ENGINE": lambda self: self._parse_engine_property(), 522 } 523 PROPERTY_PARSERS.pop("DYNAMIC") 524 525 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 526 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 527 528 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 529 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 530 531 RANGE_PARSERS = { 532 **parser.Parser.RANGE_PARSERS, 533 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 534 } 535 536 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 537 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 538 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 539 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 540 541 JOIN_KINDS = { 542 *parser.Parser.JOIN_KINDS, 543 TokenType.ANY, 544 TokenType.ASOF, 545 TokenType.ARRAY, 546 } 547 548 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 549 TokenType.ANY, 550 TokenType.ARRAY, 551 TokenType.FINAL, 552 TokenType.FORMAT, 553 TokenType.SETTINGS, 554 } 555 556 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 557 TokenType.FORMAT, 558 } 559 560 LOG_DEFAULTS_TO_LN = True 561 562 QUERY_MODIFIER_PARSERS = { 563 **parser.Parser.QUERY_MODIFIER_PARSERS, 564 TokenType.SETTINGS: lambda self: ( 565 "settings", 566 self._advance() or self._parse_csv(self._parse_assignment), 567 ), 568 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 569 } 570 571 CONSTRAINT_PARSERS = { 572 **parser.Parser.CONSTRAINT_PARSERS, 573 "INDEX": lambda self: self._parse_index_constraint(), 574 "CODEC": lambda self: self._parse_compress(), 575 } 576 577 ALTER_PARSERS = { 578 **parser.Parser.ALTER_PARSERS, 579 "REPLACE": lambda self: self._parse_alter_table_replace(), 580 } 581 582 SCHEMA_UNNAMED_CONSTRAINTS = { 583 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 584 "INDEX", 585 } 586 587 PLACEHOLDER_PARSERS = { 588 **parser.Parser.PLACEHOLDER_PARSERS, 589 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 590 } 591 592 def _parse_engine_property(self) -> exp.EngineProperty: 593 self._match(TokenType.EQ) 594 return self.expression( 595 exp.EngineProperty, 596 this=self._parse_field(any_token=True, anonymous_func=True), 597 ) 598 599 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 600 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 601 return self._parse_lambda() 602 603 def _parse_types( 604 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 605 ) -> t.Optional[exp.Expression]: 606 dtype = super()._parse_types( 607 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 608 ) 609 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 610 # Mark every type as non-nullable which is ClickHouse's default, unless it's 611 # already marked as nullable. This marker helps us transpile types from other 612 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 613 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 614 # fail in ClickHouse without the `Nullable` type constructor. 615 dtype.set("nullable", False) 616 617 return dtype 618 619 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 620 index = self._index 621 this = self._parse_bitwise() 622 if self._match(TokenType.FROM): 623 self._retreat(index) 624 return super()._parse_extract() 625 626 # We return Anonymous here because extract and regexpExtract have different semantics, 627 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 628 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 629 # 630 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 631 self._match(TokenType.COMMA) 632 return self.expression( 633 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 634 ) 635 636 def _parse_assignment(self) -> t.Optional[exp.Expression]: 637 this = super()._parse_assignment() 638 639 if self._match(TokenType.PLACEHOLDER): 640 return self.expression( 641 exp.If, 642 this=this, 643 true=self._parse_assignment(), 644 false=self._match(TokenType.COLON) and self._parse_assignment(), 645 ) 646 647 return this 648 649 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 650 """ 651 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 652 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 653 """ 654 index = self._index 655 656 this = self._parse_id_var() 657 self._match(TokenType.COLON) 658 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 659 self._match_text_seq("IDENTIFIER") and "Identifier" 660 ) 661 662 if not kind: 663 self._retreat(index) 664 return None 665 elif not self._match(TokenType.R_BRACE): 666 self.raise_error("Expecting }") 667 668 if isinstance(this, exp.Identifier) and not this.quoted: 669 this = exp.var(this.name) 670 671 return self.expression(exp.Placeholder, this=this, kind=kind) 672 673 def _parse_bracket( 674 self, this: t.Optional[exp.Expression] = None 675 ) -> t.Optional[exp.Expression]: 676 l_brace = self._match(TokenType.L_BRACE, advance=False) 677 bracket = super()._parse_bracket(this) 678 679 if l_brace and isinstance(bracket, exp.Struct): 680 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 681 for expression in bracket.expressions: 682 if not isinstance(expression, exp.PropertyEQ): 683 break 684 685 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 686 varmap.args["values"].append("expressions", expression.expression) 687 688 return varmap 689 690 return bracket 691 692 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 693 this = super()._parse_in(this) 694 this.set("is_global", is_global) 695 return this 696 697 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 698 is_negated = self._match(TokenType.NOT) 699 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 700 return self.expression(exp.Not, this=this) if is_negated else this 701 702 def _parse_table( 703 self, 704 schema: bool = False, 705 joins: bool = False, 706 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 707 parse_bracket: bool = False, 708 is_db_reference: bool = False, 709 parse_partition: bool = False, 710 consume_pipe: bool = False, 711 ) -> t.Optional[exp.Expression]: 712 this = super()._parse_table( 713 schema=schema, 714 joins=joins, 715 alias_tokens=alias_tokens, 716 parse_bracket=parse_bracket, 717 is_db_reference=is_db_reference, 718 ) 719 720 if isinstance(this, exp.Table): 721 inner = this.this 722 alias = this.args.get("alias") 723 724 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 725 alias.set("columns", [exp.to_identifier("generate_series")]) 726 727 if self._match(TokenType.FINAL): 728 this = self.expression(exp.Final, this=this) 729 730 return this 731 732 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 733 return super()._parse_position(haystack_first=True) 734 735 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 736 def _parse_cte(self) -> t.Optional[exp.CTE]: 737 # WITH <identifier> AS <subquery expression> 738 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 739 740 if not cte: 741 # WITH <expression> AS <identifier> 742 cte = self.expression( 743 exp.CTE, 744 this=self._parse_assignment(), 745 alias=self._parse_table_alias(), 746 scalar=True, 747 ) 748 749 return cte 750 751 def _parse_join_parts( 752 self, 753 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 754 is_global = self._match(TokenType.GLOBAL) and self._prev 755 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 756 757 if kind_pre: 758 kind = self._match_set(self.JOIN_KINDS) and self._prev 759 side = self._match_set(self.JOIN_SIDES) and self._prev 760 return is_global, side, kind 761 762 return ( 763 is_global, 764 self._match_set(self.JOIN_SIDES) and self._prev, 765 self._match_set(self.JOIN_KINDS) and self._prev, 766 ) 767 768 def _parse_join( 769 self, skip_join_token: bool = False, parse_bracket: bool = False 770 ) -> t.Optional[exp.Join]: 771 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 772 if join: 773 join.set("global", join.args.pop("method", None)) 774 775 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 776 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 777 if join.kind == "ARRAY": 778 for table in join.find_all(exp.Table): 779 table.replace(table.to_column()) 780 781 return join 782 783 def _parse_function( 784 self, 785 functions: t.Optional[t.Dict[str, t.Callable]] = None, 786 anonymous: bool = False, 787 optional_parens: bool = True, 788 any_token: bool = False, 789 ) -> t.Optional[exp.Expression]: 790 expr = super()._parse_function( 791 functions=functions, 792 anonymous=anonymous, 793 optional_parens=optional_parens, 794 any_token=any_token, 795 ) 796 797 func = expr.this if isinstance(expr, exp.Window) else expr 798 799 # Aggregate functions can be split in 2 parts: <func_name><suffix> 800 parts = ( 801 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 802 ) 803 804 if parts: 805 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 806 params = self._parse_func_params(anon_func) 807 808 kwargs = { 809 "this": anon_func.this, 810 "expressions": anon_func.expressions, 811 } 812 if parts[1]: 813 exp_class: t.Type[exp.Expression] = ( 814 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 815 ) 816 else: 817 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 818 819 kwargs["exp_class"] = exp_class 820 if params: 821 kwargs["params"] = params 822 823 func = self.expression(**kwargs) 824 825 if isinstance(expr, exp.Window): 826 # The window's func was parsed as Anonymous in base parser, fix its 827 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 828 expr.set("this", func) 829 elif params: 830 # Params have blocked super()._parse_function() from parsing the following window 831 # (if that exists) as they're standing between the function call and the window spec 832 expr = self._parse_window(func) 833 else: 834 expr = func 835 836 return expr 837 838 def _parse_func_params( 839 self, this: t.Optional[exp.Func] = None 840 ) -> t.Optional[t.List[exp.Expression]]: 841 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 842 return self._parse_csv(self._parse_lambda) 843 844 if self._match(TokenType.L_PAREN): 845 params = self._parse_csv(self._parse_lambda) 846 self._match_r_paren(this) 847 return params 848 849 return None 850 851 def _parse_quantile(self) -> exp.Quantile: 852 this = self._parse_lambda() 853 params = self._parse_func_params() 854 if params: 855 return self.expression(exp.Quantile, this=params[0], quantile=this) 856 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 857 858 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 859 return super()._parse_wrapped_id_vars(optional=True) 860 861 def _parse_primary_key( 862 self, wrapped_optional: bool = False, in_props: bool = False 863 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 864 return super()._parse_primary_key( 865 wrapped_optional=wrapped_optional or in_props, in_props=in_props 866 ) 867 868 def _parse_on_property(self) -> t.Optional[exp.Expression]: 869 index = self._index 870 if self._match_text_seq("CLUSTER"): 871 this = self._parse_string() or self._parse_id_var() 872 if this: 873 return self.expression(exp.OnCluster, this=this) 874 else: 875 self._retreat(index) 876 return None 877 878 def _parse_index_constraint( 879 self, kind: t.Optional[str] = None 880 ) -> exp.IndexColumnConstraint: 881 # INDEX name1 expr TYPE type1(args) GRANULARITY value 882 this = self._parse_id_var() 883 expression = self._parse_assignment() 884 885 index_type = self._match_text_seq("TYPE") and ( 886 self._parse_function() or self._parse_var() 887 ) 888 889 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 890 891 return self.expression( 892 exp.IndexColumnConstraint, 893 this=this, 894 expression=expression, 895 index_type=index_type, 896 granularity=granularity, 897 ) 898 899 def _parse_partition(self) -> t.Optional[exp.Partition]: 900 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 901 if not self._match(TokenType.PARTITION): 902 return None 903 904 if self._match_text_seq("ID"): 905 # Corresponds to the PARTITION ID <string_value> syntax 906 expressions: t.List[exp.Expression] = [ 907 self.expression(exp.PartitionId, this=self._parse_string()) 908 ] 909 else: 910 expressions = self._parse_expressions() 911 912 return self.expression(exp.Partition, expressions=expressions) 913 914 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 915 partition = self._parse_partition() 916 917 if not partition or not self._match(TokenType.FROM): 918 return None 919 920 return self.expression( 921 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 922 ) 923 924 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 925 if not self._match_text_seq("PROJECTION"): 926 return None 927 928 return self.expression( 929 exp.ProjectionDef, 930 this=self._parse_id_var(), 931 expression=self._parse_wrapped(self._parse_statement), 932 ) 933 934 def _parse_constraint(self) -> t.Optional[exp.Expression]: 935 return super()._parse_constraint() or self._parse_projection_def() 936 937 def _parse_alias( 938 self, this: t.Optional[exp.Expression], explicit: bool = False 939 ) -> t.Optional[exp.Expression]: 940 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 941 # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias 942 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 943 return this 944 945 return super()._parse_alias(this=this, explicit=explicit) 946 947 def _parse_expression(self) -> t.Optional[exp.Expression]: 948 this = super()._parse_expression() 949 950 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 951 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 952 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 953 self._match(TokenType.R_PAREN) 954 955 return this 956 957 def _parse_columns(self) -> exp.Expression: 958 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 959 960 while self._next and self._match_text_seq(")", "APPLY", "("): 961 self._match(TokenType.R_PAREN) 962 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 963 return this 964 965 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 966 value = super()._parse_value(values=values) 967 if not value: 968 return None 969 970 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 971 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 972 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 973 # but the final result is not altered by the extra parentheses. 974 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 975 expressions = value.expressions 976 if values and not isinstance(expressions[-1], exp.Tuple): 977 value.set( 978 "expressions", 979 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 980 ) 981 982 return value 983 984 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 985 # ClickHouse allows custom expressions as partition key 986 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 987 return self.expression( 988 exp.PartitionedByProperty, 989 this=self._parse_assignment(), 990 ) 991 992 class Generator(generator.Generator): 993 QUERY_HINTS = False 994 STRUCT_DELIMITER = ("(", ")") 995 NVL2_SUPPORTED = False 996 TABLESAMPLE_REQUIRES_PARENS = False 997 TABLESAMPLE_SIZE_IS_ROWS = False 998 TABLESAMPLE_KEYWORDS = "SAMPLE" 999 LAST_DAY_SUPPORTS_DATE_PART = False 1000 CAN_IMPLEMENT_ARRAY_ANY = True 1001 SUPPORTS_TO_NUMBER = False 1002 JOIN_HINTS = False 1003 TABLE_HINTS = False 1004 GROUPINGS_SEP = "" 1005 SET_OP_MODIFIERS = False 1006 ARRAY_SIZE_NAME = "LENGTH" 1007 WRAP_DERIVED_VALUES = False 1008 1009 STRING_TYPE_MAPPING = { 1010 exp.DataType.Type.BLOB: "String", 1011 exp.DataType.Type.CHAR: "String", 1012 exp.DataType.Type.LONGBLOB: "String", 1013 exp.DataType.Type.LONGTEXT: "String", 1014 exp.DataType.Type.MEDIUMBLOB: "String", 1015 exp.DataType.Type.MEDIUMTEXT: "String", 1016 exp.DataType.Type.TINYBLOB: "String", 1017 exp.DataType.Type.TINYTEXT: "String", 1018 exp.DataType.Type.TEXT: "String", 1019 exp.DataType.Type.VARBINARY: "String", 1020 exp.DataType.Type.VARCHAR: "String", 1021 } 1022 1023 SUPPORTED_JSON_PATH_PARTS = { 1024 exp.JSONPathKey, 1025 exp.JSONPathRoot, 1026 exp.JSONPathSubscript, 1027 } 1028 1029 TYPE_MAPPING = { 1030 **generator.Generator.TYPE_MAPPING, 1031 **STRING_TYPE_MAPPING, 1032 exp.DataType.Type.ARRAY: "Array", 1033 exp.DataType.Type.BOOLEAN: "Bool", 1034 exp.DataType.Type.BIGINT: "Int64", 1035 exp.DataType.Type.DATE32: "Date32", 1036 exp.DataType.Type.DATETIME: "DateTime", 1037 exp.DataType.Type.DATETIME2: "DateTime", 1038 exp.DataType.Type.SMALLDATETIME: "DateTime", 1039 exp.DataType.Type.DATETIME64: "DateTime64", 1040 exp.DataType.Type.DECIMAL: "Decimal", 1041 exp.DataType.Type.DECIMAL32: "Decimal32", 1042 exp.DataType.Type.DECIMAL64: "Decimal64", 1043 exp.DataType.Type.DECIMAL128: "Decimal128", 1044 exp.DataType.Type.DECIMAL256: "Decimal256", 1045 exp.DataType.Type.TIMESTAMP: "DateTime", 1046 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1047 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1048 exp.DataType.Type.DOUBLE: "Float64", 1049 exp.DataType.Type.ENUM: "Enum", 1050 exp.DataType.Type.ENUM8: "Enum8", 1051 exp.DataType.Type.ENUM16: "Enum16", 1052 exp.DataType.Type.FIXEDSTRING: "FixedString", 1053 exp.DataType.Type.FLOAT: "Float32", 1054 exp.DataType.Type.INT: "Int32", 1055 exp.DataType.Type.MEDIUMINT: "Int32", 1056 exp.DataType.Type.INT128: "Int128", 1057 exp.DataType.Type.INT256: "Int256", 1058 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1059 exp.DataType.Type.MAP: "Map", 1060 exp.DataType.Type.NESTED: "Nested", 1061 exp.DataType.Type.NOTHING: "Nothing", 1062 exp.DataType.Type.SMALLINT: "Int16", 1063 exp.DataType.Type.STRUCT: "Tuple", 1064 exp.DataType.Type.TINYINT: "Int8", 1065 exp.DataType.Type.UBIGINT: "UInt64", 1066 exp.DataType.Type.UINT: "UInt32", 1067 exp.DataType.Type.UINT128: "UInt128", 1068 exp.DataType.Type.UINT256: "UInt256", 1069 exp.DataType.Type.USMALLINT: "UInt16", 1070 exp.DataType.Type.UTINYINT: "UInt8", 1071 exp.DataType.Type.IPV4: "IPv4", 1072 exp.DataType.Type.IPV6: "IPv6", 1073 exp.DataType.Type.POINT: "Point", 1074 exp.DataType.Type.RING: "Ring", 1075 exp.DataType.Type.LINESTRING: "LineString", 1076 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1077 exp.DataType.Type.POLYGON: "Polygon", 1078 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1079 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1080 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1081 exp.DataType.Type.DYNAMIC: "Dynamic", 1082 } 1083 1084 TRANSFORMS = { 1085 **generator.Generator.TRANSFORMS, 1086 exp.AnyValue: rename_func("any"), 1087 exp.ApproxDistinct: rename_func("uniq"), 1088 exp.ArrayConcat: rename_func("arrayConcat"), 1089 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1090 exp.ArrayRemove: remove_from_array_using_filter, 1091 exp.ArrayReverse: rename_func("arrayReverse"), 1092 exp.ArraySlice: rename_func("arraySlice"), 1093 exp.ArraySum: rename_func("arraySum"), 1094 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1095 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1096 exp.Array: inline_array_sql, 1097 exp.CastToStrType: rename_func("CAST"), 1098 exp.CountIf: rename_func("countIf"), 1099 exp.CosineDistance: rename_func("cosineDistance"), 1100 exp.CompressColumnConstraint: lambda self, 1101 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1102 exp.ComputedColumnConstraint: lambda self, 1103 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1104 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1105 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1106 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1107 exp.DateStrToDate: rename_func("toDate"), 1108 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1109 exp.Explode: rename_func("arrayJoin"), 1110 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1111 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1112 exp.IsNan: rename_func("isNaN"), 1113 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1114 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1115 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1116 exp.JSONPathKey: json_path_key_only_name, 1117 exp.JSONPathRoot: lambda *_: "", 1118 exp.Length: length_or_char_length_sql, 1119 exp.Map: _map_sql, 1120 exp.Median: rename_func("median"), 1121 exp.Nullif: rename_func("nullIf"), 1122 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1123 exp.Pivot: no_pivot_sql, 1124 exp.Quantile: _quantile_sql, 1125 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1126 exp.Rand: rename_func("randCanonical"), 1127 exp.StartsWith: rename_func("startsWith"), 1128 exp.EndsWith: rename_func("endsWith"), 1129 exp.EuclideanDistance: rename_func("L2Distance"), 1130 exp.StrPosition: lambda self, e: strposition_sql( 1131 self, 1132 e, 1133 func_name="POSITION", 1134 supports_position=True, 1135 use_ansi_position=False, 1136 ), 1137 exp.TimeToStr: lambda self, e: self.func( 1138 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1139 ), 1140 exp.TimeStrToTime: _timestrtotime_sql, 1141 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1142 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1143 exp.Typeof: rename_func("toTypeName"), 1144 exp.VarMap: _map_sql, 1145 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1146 exp.MD5Digest: rename_func("MD5"), 1147 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1148 exp.SHA: rename_func("SHA1"), 1149 exp.SHA2: sha256_sql, 1150 exp.UnixToTime: _unix_to_time_sql, 1151 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1152 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1153 exp.Variance: rename_func("varSamp"), 1154 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1155 exp.Stddev: rename_func("stddevSamp"), 1156 exp.Chr: rename_func("CHAR"), 1157 exp.Lag: lambda self, e: self.func( 1158 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1159 ), 1160 exp.Lead: lambda self, e: self.func( 1161 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1162 ), 1163 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1164 rename_func("editDistance") 1165 ), 1166 exp.ParseDatetime: rename_func("parseDateTime"), 1167 } 1168 1169 PROPERTIES_LOCATION = { 1170 **generator.Generator.PROPERTIES_LOCATION, 1171 exp.OnCluster: exp.Properties.Location.POST_NAME, 1172 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1173 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1174 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1175 } 1176 1177 # There's no list in docs, but it can be found in Clickhouse code 1178 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1179 ON_CLUSTER_TARGETS = { 1180 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1181 "DATABASE", 1182 "TABLE", 1183 "VIEW", 1184 "DICTIONARY", 1185 "INDEX", 1186 "FUNCTION", 1187 "NAMED COLLECTION", 1188 } 1189 1190 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1191 NON_NULLABLE_TYPES = { 1192 exp.DataType.Type.ARRAY, 1193 exp.DataType.Type.MAP, 1194 exp.DataType.Type.STRUCT, 1195 exp.DataType.Type.POINT, 1196 exp.DataType.Type.RING, 1197 exp.DataType.Type.LINESTRING, 1198 exp.DataType.Type.MULTILINESTRING, 1199 exp.DataType.Type.POLYGON, 1200 exp.DataType.Type.MULTIPOLYGON, 1201 } 1202 1203 def offset_sql(self, expression: exp.Offset) -> str: 1204 offset = super().offset_sql(expression) 1205 1206 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1207 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1208 parent = expression.parent 1209 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1210 offset = f"{offset} ROWS" 1211 1212 return offset 1213 1214 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1215 strtodate_sql = self.function_fallback_sql(expression) 1216 1217 if not isinstance(expression.parent, exp.Cast): 1218 # StrToDate returns DATEs in other dialects (eg. postgres), so 1219 # this branch aims to improve the transpilation to clickhouse 1220 return self.cast_sql(exp.cast(expression, "DATE")) 1221 1222 return strtodate_sql 1223 1224 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1225 this = expression.this 1226 1227 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1228 return self.sql(this) 1229 1230 return super().cast_sql(expression, safe_prefix=safe_prefix) 1231 1232 def trycast_sql(self, expression: exp.TryCast) -> str: 1233 dtype = expression.to 1234 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1235 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1236 dtype.set("nullable", True) 1237 1238 return super().cast_sql(expression) 1239 1240 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1241 this = self.json_path_part(expression.this) 1242 return str(int(this) + 1) if is_int(this) else this 1243 1244 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1245 return f"AS {self.sql(expression, 'this')}" 1246 1247 def _any_to_has( 1248 self, 1249 expression: exp.EQ | exp.NEQ, 1250 default: t.Callable[[t.Any], str], 1251 prefix: str = "", 1252 ) -> str: 1253 if isinstance(expression.left, exp.Any): 1254 arr = expression.left 1255 this = expression.right 1256 elif isinstance(expression.right, exp.Any): 1257 arr = expression.right 1258 this = expression.left 1259 else: 1260 return default(expression) 1261 1262 return prefix + self.func("has", arr.this.unnest(), this) 1263 1264 def eq_sql(self, expression: exp.EQ) -> str: 1265 return self._any_to_has(expression, super().eq_sql) 1266 1267 def neq_sql(self, expression: exp.NEQ) -> str: 1268 return self._any_to_has(expression, super().neq_sql, "NOT ") 1269 1270 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1271 # Manually add a flag to make the search case-insensitive 1272 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1273 return self.func("match", expression.this, regex) 1274 1275 def datatype_sql(self, expression: exp.DataType) -> str: 1276 # String is the standard ClickHouse type, every other variant is just an alias. 1277 # Additionally, any supplied length parameter will be ignored. 1278 # 1279 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1280 if expression.this in self.STRING_TYPE_MAPPING: 1281 dtype = "String" 1282 else: 1283 dtype = super().datatype_sql(expression) 1284 1285 # This section changes the type to `Nullable(...)` if the following conditions hold: 1286 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1287 # and change their semantics 1288 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1289 # constraint: "Type of Map key must be a type, that can be represented by integer or 1290 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1291 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1292 parent = expression.parent 1293 nullable = expression.args.get("nullable") 1294 if nullable is True or ( 1295 nullable is None 1296 and not ( 1297 isinstance(parent, exp.DataType) 1298 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1299 and expression.index in (None, 0) 1300 ) 1301 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1302 ): 1303 dtype = f"Nullable({dtype})" 1304 1305 return dtype 1306 1307 def cte_sql(self, expression: exp.CTE) -> str: 1308 if expression.args.get("scalar"): 1309 this = self.sql(expression, "this") 1310 alias = self.sql(expression, "alias") 1311 return f"{this} AS {alias}" 1312 1313 return super().cte_sql(expression) 1314 1315 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1316 return super().after_limit_modifiers(expression) + [ 1317 ( 1318 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1319 if expression.args.get("settings") 1320 else "" 1321 ), 1322 ( 1323 self.seg("FORMAT ") + self.sql(expression, "format") 1324 if expression.args.get("format") 1325 else "" 1326 ), 1327 ] 1328 1329 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1330 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1331 1332 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1333 return f"ON CLUSTER {self.sql(expression, 'this')}" 1334 1335 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1336 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1337 exp.Properties.Location.POST_NAME 1338 ): 1339 this_name = self.sql( 1340 expression.this if isinstance(expression.this, exp.Schema) else expression, 1341 "this", 1342 ) 1343 this_properties = " ".join( 1344 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1345 ) 1346 this_schema = self.schema_columns_sql(expression.this) 1347 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1348 1349 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1350 1351 return super().createable_sql(expression, locations) 1352 1353 def create_sql(self, expression: exp.Create) -> str: 1354 # The comment property comes last in CTAS statements, i.e. after the query 1355 query = expression.expression 1356 if isinstance(query, exp.Query): 1357 comment_prop = expression.find(exp.SchemaCommentProperty) 1358 if comment_prop: 1359 comment_prop.pop() 1360 query.replace(exp.paren(query)) 1361 else: 1362 comment_prop = None 1363 1364 create_sql = super().create_sql(expression) 1365 1366 comment_sql = self.sql(comment_prop) 1367 comment_sql = f" {comment_sql}" if comment_sql else "" 1368 1369 return f"{create_sql}{comment_sql}" 1370 1371 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1372 this = self.indent(self.sql(expression, "this")) 1373 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1374 1375 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1376 this = self.sql(expression, "this") 1377 this = f" {this}" if this else "" 1378 expr = self.sql(expression, "expression") 1379 expr = f" {expr}" if expr else "" 1380 index_type = self.sql(expression, "index_type") 1381 index_type = f" TYPE {index_type}" if index_type else "" 1382 granularity = self.sql(expression, "granularity") 1383 granularity = f" GRANULARITY {granularity}" if granularity else "" 1384 1385 return f"INDEX{this}{expr}{index_type}{granularity}" 1386 1387 def partition_sql(self, expression: exp.Partition) -> str: 1388 return f"PARTITION {self.expressions(expression, flat=True)}" 1389 1390 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1391 return f"ID {self.sql(expression.this)}" 1392 1393 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1394 return ( 1395 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1396 ) 1397 1398 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1399 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1400 1401 def is_sql(self, expression: exp.Is) -> str: 1402 is_sql = super().is_sql(expression) 1403 1404 if isinstance(expression.parent, exp.Not): 1405 # value IS NOT NULL -> NOT (value IS NULL) 1406 is_sql = self.wrap(is_sql) 1407 1408 return is_sql 1409 1410 def in_sql(self, expression: exp.In) -> str: 1411 in_sql = super().in_sql(expression) 1412 1413 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1414 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1415 1416 return in_sql 1417 1418 def not_sql(self, expression: exp.Not) -> str: 1419 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1420 # let `GLOBAL IN` child interpose `NOT` 1421 return self.sql(expression, "this") 1422 1423 return super().not_sql(expression) 1424 1425 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1426 # If the VALUES clause contains tuples of expressions, we need to treat it 1427 # as a table since Clickhouse will automatically alias it as such. 1428 alias = expression.args.get("alias") 1429 1430 if alias and alias.args.get("columns") and expression.expressions: 1431 values = expression.expressions[0].expressions 1432 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1433 else: 1434 values_as_table = True 1435 1436 return super().values_sql(expression, values_as_table=values_as_table)
192class ClickHouse(Dialect): 193 NORMALIZE_FUNCTIONS: bool | str = False 194 NULL_ORDERING = "nulls_are_last" 195 SUPPORTS_USER_DEFINED_TYPES = False 196 SAFE_DIVISION = True 197 LOG_BASE_FIRST: t.Optional[bool] = None 198 FORCE_EARLY_ALIAS_REF_EXPANSION = True 199 PRESERVE_ORIGINAL_NAMES = True 200 NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True 201 IDENTIFIERS_CAN_START_WITH_DIGIT = True 202 HEX_STRING_IS_INTEGER_TYPE = True 203 204 # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779 205 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE 206 207 UNESCAPED_SEQUENCES = { 208 "\\0": "\0", 209 } 210 211 CREATABLE_KIND_MAPPING = {"DATABASE": "SCHEMA"} 212 213 SET_OP_DISTINCT_BY_DEFAULT: t.Dict[t.Type[exp.Expression], t.Optional[bool]] = { 214 exp.Except: False, 215 exp.Intersect: False, 216 exp.Union: None, 217 } 218 219 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 220 # Clickhouse allows VALUES to have an embedded structure e.g: 221 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 222 # In this case, we don't want to qualify the columns 223 values = expression.expressions[0].expressions 224 225 structure = ( 226 values[0] 227 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 228 else None 229 ) 230 if structure: 231 # Split each column definition into the column name e.g: 232 # 'person String, place String' -> ['person', 'place'] 233 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 234 column_aliases = [ 235 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 236 ] 237 else: 238 # Default column aliases in CH are "c1", "c2", etc. 239 column_aliases = [ 240 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 241 ] 242 243 return column_aliases 244 245 class Tokenizer(tokens.Tokenizer): 246 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 247 IDENTIFIERS = ['"', "`"] 248 IDENTIFIER_ESCAPES = ["\\"] 249 STRING_ESCAPES = ["'", "\\"] 250 BIT_STRINGS = [("0b", "")] 251 HEX_STRINGS = [("0x", ""), ("0X", "")] 252 HEREDOC_STRINGS = ["$"] 253 254 KEYWORDS = { 255 **tokens.Tokenizer.KEYWORDS, 256 ".:": TokenType.DOTCOLON, 257 "ATTACH": TokenType.COMMAND, 258 "DATE32": TokenType.DATE32, 259 "DATETIME64": TokenType.DATETIME64, 260 "DICTIONARY": TokenType.DICTIONARY, 261 "DYNAMIC": TokenType.DYNAMIC, 262 "ENUM8": TokenType.ENUM8, 263 "ENUM16": TokenType.ENUM16, 264 "EXCHANGE": TokenType.COMMAND, 265 "FINAL": TokenType.FINAL, 266 "FIXEDSTRING": TokenType.FIXEDSTRING, 267 "FLOAT32": TokenType.FLOAT, 268 "FLOAT64": TokenType.DOUBLE, 269 "GLOBAL": TokenType.GLOBAL, 270 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 271 "MAP": TokenType.MAP, 272 "NESTED": TokenType.NESTED, 273 "NOTHING": TokenType.NOTHING, 274 "SAMPLE": TokenType.TABLE_SAMPLE, 275 "TUPLE": TokenType.STRUCT, 276 "UINT16": TokenType.USMALLINT, 277 "UINT32": TokenType.UINT, 278 "UINT64": TokenType.UBIGINT, 279 "UINT8": TokenType.UTINYINT, 280 "IPV4": TokenType.IPV4, 281 "IPV6": TokenType.IPV6, 282 "POINT": TokenType.POINT, 283 "RING": TokenType.RING, 284 "LINESTRING": TokenType.LINESTRING, 285 "MULTILINESTRING": TokenType.MULTILINESTRING, 286 "POLYGON": TokenType.POLYGON, 287 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 288 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 289 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 290 "SYSTEM": TokenType.COMMAND, 291 "PREWHERE": TokenType.PREWHERE, 292 } 293 KEYWORDS.pop("/*+") 294 295 SINGLE_TOKENS = { 296 **tokens.Tokenizer.SINGLE_TOKENS, 297 "$": TokenType.HEREDOC_STRING, 298 } 299 300 class Parser(parser.Parser): 301 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 302 # * select x from t1 union all select x from t2 limit 1; 303 # * select x from t1 union all (select x from t2 limit 1); 304 MODIFIERS_ATTACHED_TO_SET_OP = False 305 INTERVAL_SPANS = False 306 OPTIONAL_ALIAS_TOKEN_CTE = False 307 JOINS_HAVE_EQUAL_PRECEDENCE = True 308 309 FUNCTIONS = { 310 **parser.Parser.FUNCTIONS, 311 "ANY": exp.AnyValue.from_arg_list, 312 "ARRAYSUM": exp.ArraySum.from_arg_list, 313 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 314 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 315 "COUNTIF": _build_count_if, 316 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 317 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 318 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 319 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 320 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 321 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 322 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 323 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 324 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 325 "JSONEXTRACTSTRING": build_json_extract_path( 326 exp.JSONExtractScalar, zero_based_indexing=False 327 ), 328 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 329 "L2Distance": exp.EuclideanDistance.from_arg_list, 330 "MAP": parser.build_var_map, 331 "MATCH": exp.RegexpLike.from_arg_list, 332 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 333 "RANDCANONICAL": exp.Rand.from_arg_list, 334 "STR_TO_DATE": _build_str_to_date, 335 "TUPLE": exp.Struct.from_arg_list, 336 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 337 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 338 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 339 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 340 "UNIQ": exp.ApproxDistinct.from_arg_list, 341 "XOR": lambda args: exp.Xor(expressions=args), 342 "MD5": exp.MD5Digest.from_arg_list, 343 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 344 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 345 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 346 "TOTYPENAME": exp.Typeof.from_arg_list, 347 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 348 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 349 } 350 FUNCTIONS.pop("TRANSFORM") 351 FUNCTIONS.pop("APPROX_TOP_SUM") 352 353 AGG_FUNCTIONS = { 354 "count", 355 "min", 356 "max", 357 "sum", 358 "avg", 359 "any", 360 "stddevPop", 361 "stddevSamp", 362 "varPop", 363 "varSamp", 364 "corr", 365 "covarPop", 366 "covarSamp", 367 "entropy", 368 "exponentialMovingAverage", 369 "intervalLengthSum", 370 "kolmogorovSmirnovTest", 371 "mannWhitneyUTest", 372 "median", 373 "rankCorr", 374 "sumKahan", 375 "studentTTest", 376 "welchTTest", 377 "anyHeavy", 378 "anyLast", 379 "boundingRatio", 380 "first_value", 381 "last_value", 382 "argMin", 383 "argMax", 384 "avgWeighted", 385 "topK", 386 "approx_top_sum", 387 "topKWeighted", 388 "deltaSum", 389 "deltaSumTimestamp", 390 "groupArray", 391 "groupArrayLast", 392 "groupUniqArray", 393 "groupArrayInsertAt", 394 "groupArrayMovingAvg", 395 "groupArrayMovingSum", 396 "groupArraySample", 397 "groupBitAnd", 398 "groupBitOr", 399 "groupBitXor", 400 "groupBitmap", 401 "groupBitmapAnd", 402 "groupBitmapOr", 403 "groupBitmapXor", 404 "sumWithOverflow", 405 "sumMap", 406 "minMap", 407 "maxMap", 408 "skewSamp", 409 "skewPop", 410 "kurtSamp", 411 "kurtPop", 412 "uniq", 413 "uniqExact", 414 "uniqCombined", 415 "uniqCombined64", 416 "uniqHLL12", 417 "uniqTheta", 418 "quantile", 419 "quantiles", 420 "quantileExact", 421 "quantilesExact", 422 "quantileExactLow", 423 "quantilesExactLow", 424 "quantileExactHigh", 425 "quantilesExactHigh", 426 "quantileExactWeighted", 427 "quantilesExactWeighted", 428 "quantileTiming", 429 "quantilesTiming", 430 "quantileTimingWeighted", 431 "quantilesTimingWeighted", 432 "quantileDeterministic", 433 "quantilesDeterministic", 434 "quantileTDigest", 435 "quantilesTDigest", 436 "quantileTDigestWeighted", 437 "quantilesTDigestWeighted", 438 "quantileBFloat16", 439 "quantilesBFloat16", 440 "quantileBFloat16Weighted", 441 "quantilesBFloat16Weighted", 442 "simpleLinearRegression", 443 "stochasticLinearRegression", 444 "stochasticLogisticRegression", 445 "categoricalInformationValue", 446 "contingency", 447 "cramersV", 448 "cramersVBiasCorrected", 449 "theilsU", 450 "maxIntersections", 451 "maxIntersectionsPosition", 452 "meanZTest", 453 "quantileInterpolatedWeighted", 454 "quantilesInterpolatedWeighted", 455 "quantileGK", 456 "quantilesGK", 457 "sparkBar", 458 "sumCount", 459 "largestTriangleThreeBuckets", 460 "histogram", 461 "sequenceMatch", 462 "sequenceCount", 463 "windowFunnel", 464 "retention", 465 "uniqUpTo", 466 "sequenceNextNode", 467 "exponentialTimeDecayedAvg", 468 } 469 470 AGG_FUNCTIONS_SUFFIXES = [ 471 "If", 472 "Array", 473 "ArrayIf", 474 "Map", 475 "SimpleState", 476 "State", 477 "Merge", 478 "MergeState", 479 "ForEach", 480 "Distinct", 481 "OrDefault", 482 "OrNull", 483 "Resample", 484 "ArgMin", 485 "ArgMax", 486 ] 487 488 FUNC_TOKENS = { 489 *parser.Parser.FUNC_TOKENS, 490 TokenType.AND, 491 TokenType.OR, 492 TokenType.SET, 493 } 494 495 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 496 497 ID_VAR_TOKENS = { 498 *parser.Parser.ID_VAR_TOKENS, 499 TokenType.LIKE, 500 } 501 502 AGG_FUNC_MAPPING = ( 503 lambda functions, suffixes: { 504 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 505 } 506 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 507 508 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"} 509 510 FUNCTION_PARSERS = { 511 **parser.Parser.FUNCTION_PARSERS, 512 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 513 "QUANTILE": lambda self: self._parse_quantile(), 514 "MEDIAN": lambda self: self._parse_quantile(), 515 "COLUMNS": lambda self: self._parse_columns(), 516 } 517 518 FUNCTION_PARSERS.pop("MATCH") 519 520 PROPERTY_PARSERS = { 521 **parser.Parser.PROPERTY_PARSERS, 522 "ENGINE": lambda self: self._parse_engine_property(), 523 } 524 PROPERTY_PARSERS.pop("DYNAMIC") 525 526 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 527 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 528 529 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 530 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 531 532 RANGE_PARSERS = { 533 **parser.Parser.RANGE_PARSERS, 534 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 535 } 536 537 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 538 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 539 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 540 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 541 542 JOIN_KINDS = { 543 *parser.Parser.JOIN_KINDS, 544 TokenType.ANY, 545 TokenType.ASOF, 546 TokenType.ARRAY, 547 } 548 549 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 550 TokenType.ANY, 551 TokenType.ARRAY, 552 TokenType.FINAL, 553 TokenType.FORMAT, 554 TokenType.SETTINGS, 555 } 556 557 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 558 TokenType.FORMAT, 559 } 560 561 LOG_DEFAULTS_TO_LN = True 562 563 QUERY_MODIFIER_PARSERS = { 564 **parser.Parser.QUERY_MODIFIER_PARSERS, 565 TokenType.SETTINGS: lambda self: ( 566 "settings", 567 self._advance() or self._parse_csv(self._parse_assignment), 568 ), 569 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 570 } 571 572 CONSTRAINT_PARSERS = { 573 **parser.Parser.CONSTRAINT_PARSERS, 574 "INDEX": lambda self: self._parse_index_constraint(), 575 "CODEC": lambda self: self._parse_compress(), 576 } 577 578 ALTER_PARSERS = { 579 **parser.Parser.ALTER_PARSERS, 580 "REPLACE": lambda self: self._parse_alter_table_replace(), 581 } 582 583 SCHEMA_UNNAMED_CONSTRAINTS = { 584 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 585 "INDEX", 586 } 587 588 PLACEHOLDER_PARSERS = { 589 **parser.Parser.PLACEHOLDER_PARSERS, 590 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 591 } 592 593 def _parse_engine_property(self) -> exp.EngineProperty: 594 self._match(TokenType.EQ) 595 return self.expression( 596 exp.EngineProperty, 597 this=self._parse_field(any_token=True, anonymous_func=True), 598 ) 599 600 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 601 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 602 return self._parse_lambda() 603 604 def _parse_types( 605 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 606 ) -> t.Optional[exp.Expression]: 607 dtype = super()._parse_types( 608 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 609 ) 610 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 611 # Mark every type as non-nullable which is ClickHouse's default, unless it's 612 # already marked as nullable. This marker helps us transpile types from other 613 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 614 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 615 # fail in ClickHouse without the `Nullable` type constructor. 616 dtype.set("nullable", False) 617 618 return dtype 619 620 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 621 index = self._index 622 this = self._parse_bitwise() 623 if self._match(TokenType.FROM): 624 self._retreat(index) 625 return super()._parse_extract() 626 627 # We return Anonymous here because extract and regexpExtract have different semantics, 628 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 629 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 630 # 631 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 632 self._match(TokenType.COMMA) 633 return self.expression( 634 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 635 ) 636 637 def _parse_assignment(self) -> t.Optional[exp.Expression]: 638 this = super()._parse_assignment() 639 640 if self._match(TokenType.PLACEHOLDER): 641 return self.expression( 642 exp.If, 643 this=this, 644 true=self._parse_assignment(), 645 false=self._match(TokenType.COLON) and self._parse_assignment(), 646 ) 647 648 return this 649 650 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 651 """ 652 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 653 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 654 """ 655 index = self._index 656 657 this = self._parse_id_var() 658 self._match(TokenType.COLON) 659 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 660 self._match_text_seq("IDENTIFIER") and "Identifier" 661 ) 662 663 if not kind: 664 self._retreat(index) 665 return None 666 elif not self._match(TokenType.R_BRACE): 667 self.raise_error("Expecting }") 668 669 if isinstance(this, exp.Identifier) and not this.quoted: 670 this = exp.var(this.name) 671 672 return self.expression(exp.Placeholder, this=this, kind=kind) 673 674 def _parse_bracket( 675 self, this: t.Optional[exp.Expression] = None 676 ) -> t.Optional[exp.Expression]: 677 l_brace = self._match(TokenType.L_BRACE, advance=False) 678 bracket = super()._parse_bracket(this) 679 680 if l_brace and isinstance(bracket, exp.Struct): 681 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 682 for expression in bracket.expressions: 683 if not isinstance(expression, exp.PropertyEQ): 684 break 685 686 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 687 varmap.args["values"].append("expressions", expression.expression) 688 689 return varmap 690 691 return bracket 692 693 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 694 this = super()._parse_in(this) 695 this.set("is_global", is_global) 696 return this 697 698 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 699 is_negated = self._match(TokenType.NOT) 700 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 701 return self.expression(exp.Not, this=this) if is_negated else this 702 703 def _parse_table( 704 self, 705 schema: bool = False, 706 joins: bool = False, 707 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 708 parse_bracket: bool = False, 709 is_db_reference: bool = False, 710 parse_partition: bool = False, 711 consume_pipe: bool = False, 712 ) -> t.Optional[exp.Expression]: 713 this = super()._parse_table( 714 schema=schema, 715 joins=joins, 716 alias_tokens=alias_tokens, 717 parse_bracket=parse_bracket, 718 is_db_reference=is_db_reference, 719 ) 720 721 if isinstance(this, exp.Table): 722 inner = this.this 723 alias = this.args.get("alias") 724 725 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 726 alias.set("columns", [exp.to_identifier("generate_series")]) 727 728 if self._match(TokenType.FINAL): 729 this = self.expression(exp.Final, this=this) 730 731 return this 732 733 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 734 return super()._parse_position(haystack_first=True) 735 736 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 737 def _parse_cte(self) -> t.Optional[exp.CTE]: 738 # WITH <identifier> AS <subquery expression> 739 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 740 741 if not cte: 742 # WITH <expression> AS <identifier> 743 cte = self.expression( 744 exp.CTE, 745 this=self._parse_assignment(), 746 alias=self._parse_table_alias(), 747 scalar=True, 748 ) 749 750 return cte 751 752 def _parse_join_parts( 753 self, 754 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 755 is_global = self._match(TokenType.GLOBAL) and self._prev 756 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 757 758 if kind_pre: 759 kind = self._match_set(self.JOIN_KINDS) and self._prev 760 side = self._match_set(self.JOIN_SIDES) and self._prev 761 return is_global, side, kind 762 763 return ( 764 is_global, 765 self._match_set(self.JOIN_SIDES) and self._prev, 766 self._match_set(self.JOIN_KINDS) and self._prev, 767 ) 768 769 def _parse_join( 770 self, skip_join_token: bool = False, parse_bracket: bool = False 771 ) -> t.Optional[exp.Join]: 772 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 773 if join: 774 join.set("global", join.args.pop("method", None)) 775 776 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 777 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 778 if join.kind == "ARRAY": 779 for table in join.find_all(exp.Table): 780 table.replace(table.to_column()) 781 782 return join 783 784 def _parse_function( 785 self, 786 functions: t.Optional[t.Dict[str, t.Callable]] = None, 787 anonymous: bool = False, 788 optional_parens: bool = True, 789 any_token: bool = False, 790 ) -> t.Optional[exp.Expression]: 791 expr = super()._parse_function( 792 functions=functions, 793 anonymous=anonymous, 794 optional_parens=optional_parens, 795 any_token=any_token, 796 ) 797 798 func = expr.this if isinstance(expr, exp.Window) else expr 799 800 # Aggregate functions can be split in 2 parts: <func_name><suffix> 801 parts = ( 802 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 803 ) 804 805 if parts: 806 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 807 params = self._parse_func_params(anon_func) 808 809 kwargs = { 810 "this": anon_func.this, 811 "expressions": anon_func.expressions, 812 } 813 if parts[1]: 814 exp_class: t.Type[exp.Expression] = ( 815 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 816 ) 817 else: 818 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 819 820 kwargs["exp_class"] = exp_class 821 if params: 822 kwargs["params"] = params 823 824 func = self.expression(**kwargs) 825 826 if isinstance(expr, exp.Window): 827 # The window's func was parsed as Anonymous in base parser, fix its 828 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 829 expr.set("this", func) 830 elif params: 831 # Params have blocked super()._parse_function() from parsing the following window 832 # (if that exists) as they're standing between the function call and the window spec 833 expr = self._parse_window(func) 834 else: 835 expr = func 836 837 return expr 838 839 def _parse_func_params( 840 self, this: t.Optional[exp.Func] = None 841 ) -> t.Optional[t.List[exp.Expression]]: 842 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 843 return self._parse_csv(self._parse_lambda) 844 845 if self._match(TokenType.L_PAREN): 846 params = self._parse_csv(self._parse_lambda) 847 self._match_r_paren(this) 848 return params 849 850 return None 851 852 def _parse_quantile(self) -> exp.Quantile: 853 this = self._parse_lambda() 854 params = self._parse_func_params() 855 if params: 856 return self.expression(exp.Quantile, this=params[0], quantile=this) 857 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 858 859 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 860 return super()._parse_wrapped_id_vars(optional=True) 861 862 def _parse_primary_key( 863 self, wrapped_optional: bool = False, in_props: bool = False 864 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 865 return super()._parse_primary_key( 866 wrapped_optional=wrapped_optional or in_props, in_props=in_props 867 ) 868 869 def _parse_on_property(self) -> t.Optional[exp.Expression]: 870 index = self._index 871 if self._match_text_seq("CLUSTER"): 872 this = self._parse_string() or self._parse_id_var() 873 if this: 874 return self.expression(exp.OnCluster, this=this) 875 else: 876 self._retreat(index) 877 return None 878 879 def _parse_index_constraint( 880 self, kind: t.Optional[str] = None 881 ) -> exp.IndexColumnConstraint: 882 # INDEX name1 expr TYPE type1(args) GRANULARITY value 883 this = self._parse_id_var() 884 expression = self._parse_assignment() 885 886 index_type = self._match_text_seq("TYPE") and ( 887 self._parse_function() or self._parse_var() 888 ) 889 890 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 891 892 return self.expression( 893 exp.IndexColumnConstraint, 894 this=this, 895 expression=expression, 896 index_type=index_type, 897 granularity=granularity, 898 ) 899 900 def _parse_partition(self) -> t.Optional[exp.Partition]: 901 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 902 if not self._match(TokenType.PARTITION): 903 return None 904 905 if self._match_text_seq("ID"): 906 # Corresponds to the PARTITION ID <string_value> syntax 907 expressions: t.List[exp.Expression] = [ 908 self.expression(exp.PartitionId, this=self._parse_string()) 909 ] 910 else: 911 expressions = self._parse_expressions() 912 913 return self.expression(exp.Partition, expressions=expressions) 914 915 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 916 partition = self._parse_partition() 917 918 if not partition or not self._match(TokenType.FROM): 919 return None 920 921 return self.expression( 922 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 923 ) 924 925 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 926 if not self._match_text_seq("PROJECTION"): 927 return None 928 929 return self.expression( 930 exp.ProjectionDef, 931 this=self._parse_id_var(), 932 expression=self._parse_wrapped(self._parse_statement), 933 ) 934 935 def _parse_constraint(self) -> t.Optional[exp.Expression]: 936 return super()._parse_constraint() or self._parse_projection_def() 937 938 def _parse_alias( 939 self, this: t.Optional[exp.Expression], explicit: bool = False 940 ) -> t.Optional[exp.Expression]: 941 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 942 # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias 943 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 944 return this 945 946 return super()._parse_alias(this=this, explicit=explicit) 947 948 def _parse_expression(self) -> t.Optional[exp.Expression]: 949 this = super()._parse_expression() 950 951 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 952 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 953 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 954 self._match(TokenType.R_PAREN) 955 956 return this 957 958 def _parse_columns(self) -> exp.Expression: 959 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 960 961 while self._next and self._match_text_seq(")", "APPLY", "("): 962 self._match(TokenType.R_PAREN) 963 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 964 return this 965 966 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 967 value = super()._parse_value(values=values) 968 if not value: 969 return None 970 971 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 972 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 973 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 974 # but the final result is not altered by the extra parentheses. 975 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 976 expressions = value.expressions 977 if values and not isinstance(expressions[-1], exp.Tuple): 978 value.set( 979 "expressions", 980 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 981 ) 982 983 return value 984 985 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 986 # ClickHouse allows custom expressions as partition key 987 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 988 return self.expression( 989 exp.PartitionedByProperty, 990 this=self._parse_assignment(), 991 ) 992 993 class Generator(generator.Generator): 994 QUERY_HINTS = False 995 STRUCT_DELIMITER = ("(", ")") 996 NVL2_SUPPORTED = False 997 TABLESAMPLE_REQUIRES_PARENS = False 998 TABLESAMPLE_SIZE_IS_ROWS = False 999 TABLESAMPLE_KEYWORDS = "SAMPLE" 1000 LAST_DAY_SUPPORTS_DATE_PART = False 1001 CAN_IMPLEMENT_ARRAY_ANY = True 1002 SUPPORTS_TO_NUMBER = False 1003 JOIN_HINTS = False 1004 TABLE_HINTS = False 1005 GROUPINGS_SEP = "" 1006 SET_OP_MODIFIERS = False 1007 ARRAY_SIZE_NAME = "LENGTH" 1008 WRAP_DERIVED_VALUES = False 1009 1010 STRING_TYPE_MAPPING = { 1011 exp.DataType.Type.BLOB: "String", 1012 exp.DataType.Type.CHAR: "String", 1013 exp.DataType.Type.LONGBLOB: "String", 1014 exp.DataType.Type.LONGTEXT: "String", 1015 exp.DataType.Type.MEDIUMBLOB: "String", 1016 exp.DataType.Type.MEDIUMTEXT: "String", 1017 exp.DataType.Type.TINYBLOB: "String", 1018 exp.DataType.Type.TINYTEXT: "String", 1019 exp.DataType.Type.TEXT: "String", 1020 exp.DataType.Type.VARBINARY: "String", 1021 exp.DataType.Type.VARCHAR: "String", 1022 } 1023 1024 SUPPORTED_JSON_PATH_PARTS = { 1025 exp.JSONPathKey, 1026 exp.JSONPathRoot, 1027 exp.JSONPathSubscript, 1028 } 1029 1030 TYPE_MAPPING = { 1031 **generator.Generator.TYPE_MAPPING, 1032 **STRING_TYPE_MAPPING, 1033 exp.DataType.Type.ARRAY: "Array", 1034 exp.DataType.Type.BOOLEAN: "Bool", 1035 exp.DataType.Type.BIGINT: "Int64", 1036 exp.DataType.Type.DATE32: "Date32", 1037 exp.DataType.Type.DATETIME: "DateTime", 1038 exp.DataType.Type.DATETIME2: "DateTime", 1039 exp.DataType.Type.SMALLDATETIME: "DateTime", 1040 exp.DataType.Type.DATETIME64: "DateTime64", 1041 exp.DataType.Type.DECIMAL: "Decimal", 1042 exp.DataType.Type.DECIMAL32: "Decimal32", 1043 exp.DataType.Type.DECIMAL64: "Decimal64", 1044 exp.DataType.Type.DECIMAL128: "Decimal128", 1045 exp.DataType.Type.DECIMAL256: "Decimal256", 1046 exp.DataType.Type.TIMESTAMP: "DateTime", 1047 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1048 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1049 exp.DataType.Type.DOUBLE: "Float64", 1050 exp.DataType.Type.ENUM: "Enum", 1051 exp.DataType.Type.ENUM8: "Enum8", 1052 exp.DataType.Type.ENUM16: "Enum16", 1053 exp.DataType.Type.FIXEDSTRING: "FixedString", 1054 exp.DataType.Type.FLOAT: "Float32", 1055 exp.DataType.Type.INT: "Int32", 1056 exp.DataType.Type.MEDIUMINT: "Int32", 1057 exp.DataType.Type.INT128: "Int128", 1058 exp.DataType.Type.INT256: "Int256", 1059 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1060 exp.DataType.Type.MAP: "Map", 1061 exp.DataType.Type.NESTED: "Nested", 1062 exp.DataType.Type.NOTHING: "Nothing", 1063 exp.DataType.Type.SMALLINT: "Int16", 1064 exp.DataType.Type.STRUCT: "Tuple", 1065 exp.DataType.Type.TINYINT: "Int8", 1066 exp.DataType.Type.UBIGINT: "UInt64", 1067 exp.DataType.Type.UINT: "UInt32", 1068 exp.DataType.Type.UINT128: "UInt128", 1069 exp.DataType.Type.UINT256: "UInt256", 1070 exp.DataType.Type.USMALLINT: "UInt16", 1071 exp.DataType.Type.UTINYINT: "UInt8", 1072 exp.DataType.Type.IPV4: "IPv4", 1073 exp.DataType.Type.IPV6: "IPv6", 1074 exp.DataType.Type.POINT: "Point", 1075 exp.DataType.Type.RING: "Ring", 1076 exp.DataType.Type.LINESTRING: "LineString", 1077 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1078 exp.DataType.Type.POLYGON: "Polygon", 1079 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1080 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1081 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1082 exp.DataType.Type.DYNAMIC: "Dynamic", 1083 } 1084 1085 TRANSFORMS = { 1086 **generator.Generator.TRANSFORMS, 1087 exp.AnyValue: rename_func("any"), 1088 exp.ApproxDistinct: rename_func("uniq"), 1089 exp.ArrayConcat: rename_func("arrayConcat"), 1090 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1091 exp.ArrayRemove: remove_from_array_using_filter, 1092 exp.ArrayReverse: rename_func("arrayReverse"), 1093 exp.ArraySlice: rename_func("arraySlice"), 1094 exp.ArraySum: rename_func("arraySum"), 1095 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1096 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1097 exp.Array: inline_array_sql, 1098 exp.CastToStrType: rename_func("CAST"), 1099 exp.CountIf: rename_func("countIf"), 1100 exp.CosineDistance: rename_func("cosineDistance"), 1101 exp.CompressColumnConstraint: lambda self, 1102 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1103 exp.ComputedColumnConstraint: lambda self, 1104 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1105 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1106 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1107 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1108 exp.DateStrToDate: rename_func("toDate"), 1109 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1110 exp.Explode: rename_func("arrayJoin"), 1111 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1112 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1113 exp.IsNan: rename_func("isNaN"), 1114 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1115 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1116 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1117 exp.JSONPathKey: json_path_key_only_name, 1118 exp.JSONPathRoot: lambda *_: "", 1119 exp.Length: length_or_char_length_sql, 1120 exp.Map: _map_sql, 1121 exp.Median: rename_func("median"), 1122 exp.Nullif: rename_func("nullIf"), 1123 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1124 exp.Pivot: no_pivot_sql, 1125 exp.Quantile: _quantile_sql, 1126 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1127 exp.Rand: rename_func("randCanonical"), 1128 exp.StartsWith: rename_func("startsWith"), 1129 exp.EndsWith: rename_func("endsWith"), 1130 exp.EuclideanDistance: rename_func("L2Distance"), 1131 exp.StrPosition: lambda self, e: strposition_sql( 1132 self, 1133 e, 1134 func_name="POSITION", 1135 supports_position=True, 1136 use_ansi_position=False, 1137 ), 1138 exp.TimeToStr: lambda self, e: self.func( 1139 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1140 ), 1141 exp.TimeStrToTime: _timestrtotime_sql, 1142 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1143 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1144 exp.Typeof: rename_func("toTypeName"), 1145 exp.VarMap: _map_sql, 1146 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1147 exp.MD5Digest: rename_func("MD5"), 1148 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1149 exp.SHA: rename_func("SHA1"), 1150 exp.SHA2: sha256_sql, 1151 exp.UnixToTime: _unix_to_time_sql, 1152 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1153 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1154 exp.Variance: rename_func("varSamp"), 1155 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1156 exp.Stddev: rename_func("stddevSamp"), 1157 exp.Chr: rename_func("CHAR"), 1158 exp.Lag: lambda self, e: self.func( 1159 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1160 ), 1161 exp.Lead: lambda self, e: self.func( 1162 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1163 ), 1164 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1165 rename_func("editDistance") 1166 ), 1167 exp.ParseDatetime: rename_func("parseDateTime"), 1168 } 1169 1170 PROPERTIES_LOCATION = { 1171 **generator.Generator.PROPERTIES_LOCATION, 1172 exp.OnCluster: exp.Properties.Location.POST_NAME, 1173 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1174 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1175 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1176 } 1177 1178 # There's no list in docs, but it can be found in Clickhouse code 1179 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1180 ON_CLUSTER_TARGETS = { 1181 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1182 "DATABASE", 1183 "TABLE", 1184 "VIEW", 1185 "DICTIONARY", 1186 "INDEX", 1187 "FUNCTION", 1188 "NAMED COLLECTION", 1189 } 1190 1191 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1192 NON_NULLABLE_TYPES = { 1193 exp.DataType.Type.ARRAY, 1194 exp.DataType.Type.MAP, 1195 exp.DataType.Type.STRUCT, 1196 exp.DataType.Type.POINT, 1197 exp.DataType.Type.RING, 1198 exp.DataType.Type.LINESTRING, 1199 exp.DataType.Type.MULTILINESTRING, 1200 exp.DataType.Type.POLYGON, 1201 exp.DataType.Type.MULTIPOLYGON, 1202 } 1203 1204 def offset_sql(self, expression: exp.Offset) -> str: 1205 offset = super().offset_sql(expression) 1206 1207 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1208 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1209 parent = expression.parent 1210 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1211 offset = f"{offset} ROWS" 1212 1213 return offset 1214 1215 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1216 strtodate_sql = self.function_fallback_sql(expression) 1217 1218 if not isinstance(expression.parent, exp.Cast): 1219 # StrToDate returns DATEs in other dialects (eg. postgres), so 1220 # this branch aims to improve the transpilation to clickhouse 1221 return self.cast_sql(exp.cast(expression, "DATE")) 1222 1223 return strtodate_sql 1224 1225 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1226 this = expression.this 1227 1228 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1229 return self.sql(this) 1230 1231 return super().cast_sql(expression, safe_prefix=safe_prefix) 1232 1233 def trycast_sql(self, expression: exp.TryCast) -> str: 1234 dtype = expression.to 1235 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1236 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1237 dtype.set("nullable", True) 1238 1239 return super().cast_sql(expression) 1240 1241 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1242 this = self.json_path_part(expression.this) 1243 return str(int(this) + 1) if is_int(this) else this 1244 1245 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1246 return f"AS {self.sql(expression, 'this')}" 1247 1248 def _any_to_has( 1249 self, 1250 expression: exp.EQ | exp.NEQ, 1251 default: t.Callable[[t.Any], str], 1252 prefix: str = "", 1253 ) -> str: 1254 if isinstance(expression.left, exp.Any): 1255 arr = expression.left 1256 this = expression.right 1257 elif isinstance(expression.right, exp.Any): 1258 arr = expression.right 1259 this = expression.left 1260 else: 1261 return default(expression) 1262 1263 return prefix + self.func("has", arr.this.unnest(), this) 1264 1265 def eq_sql(self, expression: exp.EQ) -> str: 1266 return self._any_to_has(expression, super().eq_sql) 1267 1268 def neq_sql(self, expression: exp.NEQ) -> str: 1269 return self._any_to_has(expression, super().neq_sql, "NOT ") 1270 1271 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1272 # Manually add a flag to make the search case-insensitive 1273 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1274 return self.func("match", expression.this, regex) 1275 1276 def datatype_sql(self, expression: exp.DataType) -> str: 1277 # String is the standard ClickHouse type, every other variant is just an alias. 1278 # Additionally, any supplied length parameter will be ignored. 1279 # 1280 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1281 if expression.this in self.STRING_TYPE_MAPPING: 1282 dtype = "String" 1283 else: 1284 dtype = super().datatype_sql(expression) 1285 1286 # This section changes the type to `Nullable(...)` if the following conditions hold: 1287 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1288 # and change their semantics 1289 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1290 # constraint: "Type of Map key must be a type, that can be represented by integer or 1291 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1292 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1293 parent = expression.parent 1294 nullable = expression.args.get("nullable") 1295 if nullable is True or ( 1296 nullable is None 1297 and not ( 1298 isinstance(parent, exp.DataType) 1299 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1300 and expression.index in (None, 0) 1301 ) 1302 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1303 ): 1304 dtype = f"Nullable({dtype})" 1305 1306 return dtype 1307 1308 def cte_sql(self, expression: exp.CTE) -> str: 1309 if expression.args.get("scalar"): 1310 this = self.sql(expression, "this") 1311 alias = self.sql(expression, "alias") 1312 return f"{this} AS {alias}" 1313 1314 return super().cte_sql(expression) 1315 1316 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1317 return super().after_limit_modifiers(expression) + [ 1318 ( 1319 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1320 if expression.args.get("settings") 1321 else "" 1322 ), 1323 ( 1324 self.seg("FORMAT ") + self.sql(expression, "format") 1325 if expression.args.get("format") 1326 else "" 1327 ), 1328 ] 1329 1330 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1331 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1332 1333 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1334 return f"ON CLUSTER {self.sql(expression, 'this')}" 1335 1336 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1337 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1338 exp.Properties.Location.POST_NAME 1339 ): 1340 this_name = self.sql( 1341 expression.this if isinstance(expression.this, exp.Schema) else expression, 1342 "this", 1343 ) 1344 this_properties = " ".join( 1345 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1346 ) 1347 this_schema = self.schema_columns_sql(expression.this) 1348 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1349 1350 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1351 1352 return super().createable_sql(expression, locations) 1353 1354 def create_sql(self, expression: exp.Create) -> str: 1355 # The comment property comes last in CTAS statements, i.e. after the query 1356 query = expression.expression 1357 if isinstance(query, exp.Query): 1358 comment_prop = expression.find(exp.SchemaCommentProperty) 1359 if comment_prop: 1360 comment_prop.pop() 1361 query.replace(exp.paren(query)) 1362 else: 1363 comment_prop = None 1364 1365 create_sql = super().create_sql(expression) 1366 1367 comment_sql = self.sql(comment_prop) 1368 comment_sql = f" {comment_sql}" if comment_sql else "" 1369 1370 return f"{create_sql}{comment_sql}" 1371 1372 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1373 this = self.indent(self.sql(expression, "this")) 1374 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1375 1376 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1377 this = self.sql(expression, "this") 1378 this = f" {this}" if this else "" 1379 expr = self.sql(expression, "expression") 1380 expr = f" {expr}" if expr else "" 1381 index_type = self.sql(expression, "index_type") 1382 index_type = f" TYPE {index_type}" if index_type else "" 1383 granularity = self.sql(expression, "granularity") 1384 granularity = f" GRANULARITY {granularity}" if granularity else "" 1385 1386 return f"INDEX{this}{expr}{index_type}{granularity}" 1387 1388 def partition_sql(self, expression: exp.Partition) -> str: 1389 return f"PARTITION {self.expressions(expression, flat=True)}" 1390 1391 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1392 return f"ID {self.sql(expression.this)}" 1393 1394 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1395 return ( 1396 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1397 ) 1398 1399 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1400 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1401 1402 def is_sql(self, expression: exp.Is) -> str: 1403 is_sql = super().is_sql(expression) 1404 1405 if isinstance(expression.parent, exp.Not): 1406 # value IS NOT NULL -> NOT (value IS NULL) 1407 is_sql = self.wrap(is_sql) 1408 1409 return is_sql 1410 1411 def in_sql(self, expression: exp.In) -> str: 1412 in_sql = super().in_sql(expression) 1413 1414 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1415 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1416 1417 return in_sql 1418 1419 def not_sql(self, expression: exp.Not) -> str: 1420 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1421 # let `GLOBAL IN` child interpose `NOT` 1422 return self.sql(expression, "this") 1423 1424 return super().not_sql(expression) 1425 1426 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1427 # If the VALUES clause contains tuples of expressions, we need to treat it 1428 # as a table since Clickhouse will automatically alias it as such. 1429 alias = expression.args.get("alias") 1430 1431 if alias and alias.args.get("columns") and expression.expressions: 1432 values = expression.expressions[0].expressions 1433 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1434 else: 1435 values_as_table = True 1436 1437 return super().values_sql(expression, values_as_table=values_as_table)
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Default NULL ordering method to use if not explicitly set.
Possible values: "nulls_are_small", "nulls_are_large", "nulls_are_last"
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG)
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS ( SELECT 1 AS id, 2 AS my_id ) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id, HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except: - BigQuery, which will forward the alias to GROUP BY + HAVING clauses i.e it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1" - Clickhouse, which will forward the alias across the query i.e it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether the name of the function should be preserved inside the node's metadata, can be useful for roundtripping deprecated vs new functions that share an AST node e.g JSON_VALUE vs JSON_EXTRACT_SCALAR in BigQuery
Whether number literals can include underscores for better readability
Whether hex strings such as x'CC' evaluate to integer or binary/blob type
Specifies the strategy according to which identifiers should be normalized.
Mapping of an escaped sequence (\n) to its unescaped version (
).
Helper for dialects that use a different name for the same creatable kind. For example, the Clickhouse equivalent of CREATE SCHEMA is CREATE DATABASE.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL
must be explicitly specified.
219 def generate_values_aliases(self, expression: exp.Values) -> t.List[exp.Identifier]: 220 # Clickhouse allows VALUES to have an embedded structure e.g: 221 # VALUES('person String, place String', ('Noah', 'Paris'), ...) 222 # In this case, we don't want to qualify the columns 223 values = expression.expressions[0].expressions 224 225 structure = ( 226 values[0] 227 if (len(values) > 1 and values[0].is_string and isinstance(values[1], exp.Tuple)) 228 else None 229 ) 230 if structure: 231 # Split each column definition into the column name e.g: 232 # 'person String, place String' -> ['person', 'place'] 233 structure_coldefs = [coldef.strip() for coldef in structure.name.split(",")] 234 column_aliases = [ 235 exp.to_identifier(coldef.split(" ")[0]) for coldef in structure_coldefs 236 ] 237 else: 238 # Default column aliases in CH are "c1", "c2", etc. 239 column_aliases = [ 240 exp.to_identifier(f"c{i + 1}") for i in range(len(values[0].expressions)) 241 ] 242 243 return column_aliases
245 class Tokenizer(tokens.Tokenizer): 246 COMMENTS = ["--", "#", "#!", ("/*", "*/")] 247 IDENTIFIERS = ['"', "`"] 248 IDENTIFIER_ESCAPES = ["\\"] 249 STRING_ESCAPES = ["'", "\\"] 250 BIT_STRINGS = [("0b", "")] 251 HEX_STRINGS = [("0x", ""), ("0X", "")] 252 HEREDOC_STRINGS = ["$"] 253 254 KEYWORDS = { 255 **tokens.Tokenizer.KEYWORDS, 256 ".:": TokenType.DOTCOLON, 257 "ATTACH": TokenType.COMMAND, 258 "DATE32": TokenType.DATE32, 259 "DATETIME64": TokenType.DATETIME64, 260 "DICTIONARY": TokenType.DICTIONARY, 261 "DYNAMIC": TokenType.DYNAMIC, 262 "ENUM8": TokenType.ENUM8, 263 "ENUM16": TokenType.ENUM16, 264 "EXCHANGE": TokenType.COMMAND, 265 "FINAL": TokenType.FINAL, 266 "FIXEDSTRING": TokenType.FIXEDSTRING, 267 "FLOAT32": TokenType.FLOAT, 268 "FLOAT64": TokenType.DOUBLE, 269 "GLOBAL": TokenType.GLOBAL, 270 "LOWCARDINALITY": TokenType.LOWCARDINALITY, 271 "MAP": TokenType.MAP, 272 "NESTED": TokenType.NESTED, 273 "NOTHING": TokenType.NOTHING, 274 "SAMPLE": TokenType.TABLE_SAMPLE, 275 "TUPLE": TokenType.STRUCT, 276 "UINT16": TokenType.USMALLINT, 277 "UINT32": TokenType.UINT, 278 "UINT64": TokenType.UBIGINT, 279 "UINT8": TokenType.UTINYINT, 280 "IPV4": TokenType.IPV4, 281 "IPV6": TokenType.IPV6, 282 "POINT": TokenType.POINT, 283 "RING": TokenType.RING, 284 "LINESTRING": TokenType.LINESTRING, 285 "MULTILINESTRING": TokenType.MULTILINESTRING, 286 "POLYGON": TokenType.POLYGON, 287 "MULTIPOLYGON": TokenType.MULTIPOLYGON, 288 "AGGREGATEFUNCTION": TokenType.AGGREGATEFUNCTION, 289 "SIMPLEAGGREGATEFUNCTION": TokenType.SIMPLEAGGREGATEFUNCTION, 290 "SYSTEM": TokenType.COMMAND, 291 "PREWHERE": TokenType.PREWHERE, 292 } 293 KEYWORDS.pop("/*+") 294 295 SINGLE_TOKENS = { 296 **tokens.Tokenizer.SINGLE_TOKENS, 297 "$": TokenType.HEREDOC_STRING, 298 }
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- BYTE_STRINGS
- RAW_STRINGS
- UNICODE_STRINGS
- QUOTES
- VAR_SINGLE_TOKENS
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
300 class Parser(parser.Parser): 301 # Tested in ClickHouse's playground, it seems that the following two queries do the same thing 302 # * select x from t1 union all select x from t2 limit 1; 303 # * select x from t1 union all (select x from t2 limit 1); 304 MODIFIERS_ATTACHED_TO_SET_OP = False 305 INTERVAL_SPANS = False 306 OPTIONAL_ALIAS_TOKEN_CTE = False 307 JOINS_HAVE_EQUAL_PRECEDENCE = True 308 309 FUNCTIONS = { 310 **parser.Parser.FUNCTIONS, 311 "ANY": exp.AnyValue.from_arg_list, 312 "ARRAYSUM": exp.ArraySum.from_arg_list, 313 "ARRAYREVERSE": exp.ArrayReverse.from_arg_list, 314 "ARRAYSLICE": exp.ArraySlice.from_arg_list, 315 "COUNTIF": _build_count_if, 316 "COSINEDISTANCE": exp.CosineDistance.from_arg_list, 317 "DATE_ADD": build_date_delta(exp.DateAdd, default_unit=None), 318 "DATEADD": build_date_delta(exp.DateAdd, default_unit=None), 319 "DATE_DIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 320 "DATEDIFF": build_date_delta(exp.DateDiff, default_unit=None, supports_timezone=True), 321 "DATE_FORMAT": _build_datetime_format(exp.TimeToStr), 322 "DATE_SUB": build_date_delta(exp.DateSub, default_unit=None), 323 "DATESUB": build_date_delta(exp.DateSub, default_unit=None), 324 "FORMATDATETIME": _build_datetime_format(exp.TimeToStr), 325 "JSONEXTRACTSTRING": build_json_extract_path( 326 exp.JSONExtractScalar, zero_based_indexing=False 327 ), 328 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 329 "L2Distance": exp.EuclideanDistance.from_arg_list, 330 "MAP": parser.build_var_map, 331 "MATCH": exp.RegexpLike.from_arg_list, 332 "PARSEDATETIME": _build_datetime_format(exp.ParseDatetime), 333 "RANDCANONICAL": exp.Rand.from_arg_list, 334 "STR_TO_DATE": _build_str_to_date, 335 "TUPLE": exp.Struct.from_arg_list, 336 "TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None), 337 "TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None), 338 "TIMESTAMP_ADD": build_date_delta(exp.TimestampAdd, default_unit=None), 339 "TIMESTAMPADD": build_date_delta(exp.TimestampAdd, default_unit=None), 340 "UNIQ": exp.ApproxDistinct.from_arg_list, 341 "XOR": lambda args: exp.Xor(expressions=args), 342 "MD5": exp.MD5Digest.from_arg_list, 343 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 344 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 345 "SUBSTRINGINDEX": exp.SubstringIndex.from_arg_list, 346 "TOTYPENAME": exp.Typeof.from_arg_list, 347 "EDITDISTANCE": exp.Levenshtein.from_arg_list, 348 "LEVENSHTEINDISTANCE": exp.Levenshtein.from_arg_list, 349 } 350 FUNCTIONS.pop("TRANSFORM") 351 FUNCTIONS.pop("APPROX_TOP_SUM") 352 353 AGG_FUNCTIONS = { 354 "count", 355 "min", 356 "max", 357 "sum", 358 "avg", 359 "any", 360 "stddevPop", 361 "stddevSamp", 362 "varPop", 363 "varSamp", 364 "corr", 365 "covarPop", 366 "covarSamp", 367 "entropy", 368 "exponentialMovingAverage", 369 "intervalLengthSum", 370 "kolmogorovSmirnovTest", 371 "mannWhitneyUTest", 372 "median", 373 "rankCorr", 374 "sumKahan", 375 "studentTTest", 376 "welchTTest", 377 "anyHeavy", 378 "anyLast", 379 "boundingRatio", 380 "first_value", 381 "last_value", 382 "argMin", 383 "argMax", 384 "avgWeighted", 385 "topK", 386 "approx_top_sum", 387 "topKWeighted", 388 "deltaSum", 389 "deltaSumTimestamp", 390 "groupArray", 391 "groupArrayLast", 392 "groupUniqArray", 393 "groupArrayInsertAt", 394 "groupArrayMovingAvg", 395 "groupArrayMovingSum", 396 "groupArraySample", 397 "groupBitAnd", 398 "groupBitOr", 399 "groupBitXor", 400 "groupBitmap", 401 "groupBitmapAnd", 402 "groupBitmapOr", 403 "groupBitmapXor", 404 "sumWithOverflow", 405 "sumMap", 406 "minMap", 407 "maxMap", 408 "skewSamp", 409 "skewPop", 410 "kurtSamp", 411 "kurtPop", 412 "uniq", 413 "uniqExact", 414 "uniqCombined", 415 "uniqCombined64", 416 "uniqHLL12", 417 "uniqTheta", 418 "quantile", 419 "quantiles", 420 "quantileExact", 421 "quantilesExact", 422 "quantileExactLow", 423 "quantilesExactLow", 424 "quantileExactHigh", 425 "quantilesExactHigh", 426 "quantileExactWeighted", 427 "quantilesExactWeighted", 428 "quantileTiming", 429 "quantilesTiming", 430 "quantileTimingWeighted", 431 "quantilesTimingWeighted", 432 "quantileDeterministic", 433 "quantilesDeterministic", 434 "quantileTDigest", 435 "quantilesTDigest", 436 "quantileTDigestWeighted", 437 "quantilesTDigestWeighted", 438 "quantileBFloat16", 439 "quantilesBFloat16", 440 "quantileBFloat16Weighted", 441 "quantilesBFloat16Weighted", 442 "simpleLinearRegression", 443 "stochasticLinearRegression", 444 "stochasticLogisticRegression", 445 "categoricalInformationValue", 446 "contingency", 447 "cramersV", 448 "cramersVBiasCorrected", 449 "theilsU", 450 "maxIntersections", 451 "maxIntersectionsPosition", 452 "meanZTest", 453 "quantileInterpolatedWeighted", 454 "quantilesInterpolatedWeighted", 455 "quantileGK", 456 "quantilesGK", 457 "sparkBar", 458 "sumCount", 459 "largestTriangleThreeBuckets", 460 "histogram", 461 "sequenceMatch", 462 "sequenceCount", 463 "windowFunnel", 464 "retention", 465 "uniqUpTo", 466 "sequenceNextNode", 467 "exponentialTimeDecayedAvg", 468 } 469 470 AGG_FUNCTIONS_SUFFIXES = [ 471 "If", 472 "Array", 473 "ArrayIf", 474 "Map", 475 "SimpleState", 476 "State", 477 "Merge", 478 "MergeState", 479 "ForEach", 480 "Distinct", 481 "OrDefault", 482 "OrNull", 483 "Resample", 484 "ArgMin", 485 "ArgMax", 486 ] 487 488 FUNC_TOKENS = { 489 *parser.Parser.FUNC_TOKENS, 490 TokenType.AND, 491 TokenType.OR, 492 TokenType.SET, 493 } 494 495 RESERVED_TOKENS = parser.Parser.RESERVED_TOKENS - {TokenType.SELECT} 496 497 ID_VAR_TOKENS = { 498 *parser.Parser.ID_VAR_TOKENS, 499 TokenType.LIKE, 500 } 501 502 AGG_FUNC_MAPPING = ( 503 lambda functions, suffixes: { 504 f"{f}{sfx}": (f, sfx) for sfx in (suffixes + [""]) for f in functions 505 } 506 )(AGG_FUNCTIONS, AGG_FUNCTIONS_SUFFIXES) 507 508 FUNCTIONS_WITH_ALIASED_ARGS = {*parser.Parser.FUNCTIONS_WITH_ALIASED_ARGS, "TUPLE"} 509 510 FUNCTION_PARSERS = { 511 **parser.Parser.FUNCTION_PARSERS, 512 "ARRAYJOIN": lambda self: self.expression(exp.Explode, this=self._parse_expression()), 513 "QUANTILE": lambda self: self._parse_quantile(), 514 "MEDIAN": lambda self: self._parse_quantile(), 515 "COLUMNS": lambda self: self._parse_columns(), 516 } 517 518 FUNCTION_PARSERS.pop("MATCH") 519 520 PROPERTY_PARSERS = { 521 **parser.Parser.PROPERTY_PARSERS, 522 "ENGINE": lambda self: self._parse_engine_property(), 523 } 524 PROPERTY_PARSERS.pop("DYNAMIC") 525 526 NO_PAREN_FUNCTION_PARSERS = parser.Parser.NO_PAREN_FUNCTION_PARSERS.copy() 527 NO_PAREN_FUNCTION_PARSERS.pop("ANY") 528 529 NO_PAREN_FUNCTIONS = parser.Parser.NO_PAREN_FUNCTIONS.copy() 530 NO_PAREN_FUNCTIONS.pop(TokenType.CURRENT_TIMESTAMP) 531 532 RANGE_PARSERS = { 533 **parser.Parser.RANGE_PARSERS, 534 TokenType.GLOBAL: lambda self, this: self._parse_global_in(this), 535 } 536 537 # The PLACEHOLDER entry is popped because 1) it doesn't affect Clickhouse (it corresponds to 538 # the postgres-specific JSONBContains parser) and 2) it makes parsing the ternary op simpler. 539 COLUMN_OPERATORS = parser.Parser.COLUMN_OPERATORS.copy() 540 COLUMN_OPERATORS.pop(TokenType.PLACEHOLDER) 541 542 JOIN_KINDS = { 543 *parser.Parser.JOIN_KINDS, 544 TokenType.ANY, 545 TokenType.ASOF, 546 TokenType.ARRAY, 547 } 548 549 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - { 550 TokenType.ANY, 551 TokenType.ARRAY, 552 TokenType.FINAL, 553 TokenType.FORMAT, 554 TokenType.SETTINGS, 555 } 556 557 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - { 558 TokenType.FORMAT, 559 } 560 561 LOG_DEFAULTS_TO_LN = True 562 563 QUERY_MODIFIER_PARSERS = { 564 **parser.Parser.QUERY_MODIFIER_PARSERS, 565 TokenType.SETTINGS: lambda self: ( 566 "settings", 567 self._advance() or self._parse_csv(self._parse_assignment), 568 ), 569 TokenType.FORMAT: lambda self: ("format", self._advance() or self._parse_id_var()), 570 } 571 572 CONSTRAINT_PARSERS = { 573 **parser.Parser.CONSTRAINT_PARSERS, 574 "INDEX": lambda self: self._parse_index_constraint(), 575 "CODEC": lambda self: self._parse_compress(), 576 } 577 578 ALTER_PARSERS = { 579 **parser.Parser.ALTER_PARSERS, 580 "REPLACE": lambda self: self._parse_alter_table_replace(), 581 } 582 583 SCHEMA_UNNAMED_CONSTRAINTS = { 584 *parser.Parser.SCHEMA_UNNAMED_CONSTRAINTS, 585 "INDEX", 586 } 587 588 PLACEHOLDER_PARSERS = { 589 **parser.Parser.PLACEHOLDER_PARSERS, 590 TokenType.L_BRACE: lambda self: self._parse_query_parameter(), 591 } 592 593 def _parse_engine_property(self) -> exp.EngineProperty: 594 self._match(TokenType.EQ) 595 return self.expression( 596 exp.EngineProperty, 597 this=self._parse_field(any_token=True, anonymous_func=True), 598 ) 599 600 # https://clickhouse.com/docs/en/sql-reference/statements/create/function 601 def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]: 602 return self._parse_lambda() 603 604 def _parse_types( 605 self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True 606 ) -> t.Optional[exp.Expression]: 607 dtype = super()._parse_types( 608 check_func=check_func, schema=schema, allow_identifiers=allow_identifiers 609 ) 610 if isinstance(dtype, exp.DataType) and dtype.args.get("nullable") is not True: 611 # Mark every type as non-nullable which is ClickHouse's default, unless it's 612 # already marked as nullable. This marker helps us transpile types from other 613 # dialects to ClickHouse, so that we can e.g. produce `CAST(x AS Nullable(String))` 614 # from `CAST(x AS TEXT)`. If there is a `NULL` value in `x`, the former would 615 # fail in ClickHouse without the `Nullable` type constructor. 616 dtype.set("nullable", False) 617 618 return dtype 619 620 def _parse_extract(self) -> exp.Extract | exp.Anonymous: 621 index = self._index 622 this = self._parse_bitwise() 623 if self._match(TokenType.FROM): 624 self._retreat(index) 625 return super()._parse_extract() 626 627 # We return Anonymous here because extract and regexpExtract have different semantics, 628 # so parsing extract(foo, bar) into RegexpExtract can potentially break queries. E.g., 629 # `extract('foobar', 'b')` works, but ClickHouse crashes for `regexpExtract('foobar', 'b')`. 630 # 631 # TODO: can we somehow convert the former into an equivalent `regexpExtract` call? 632 self._match(TokenType.COMMA) 633 return self.expression( 634 exp.Anonymous, this="extract", expressions=[this, self._parse_bitwise()] 635 ) 636 637 def _parse_assignment(self) -> t.Optional[exp.Expression]: 638 this = super()._parse_assignment() 639 640 if self._match(TokenType.PLACEHOLDER): 641 return self.expression( 642 exp.If, 643 this=this, 644 true=self._parse_assignment(), 645 false=self._match(TokenType.COLON) and self._parse_assignment(), 646 ) 647 648 return this 649 650 def _parse_query_parameter(self) -> t.Optional[exp.Expression]: 651 """ 652 Parse a placeholder expression like SELECT {abc: UInt32} or FROM {table: Identifier} 653 https://clickhouse.com/docs/en/sql-reference/syntax#defining-and-using-query-parameters 654 """ 655 index = self._index 656 657 this = self._parse_id_var() 658 self._match(TokenType.COLON) 659 kind = self._parse_types(check_func=False, allow_identifiers=False) or ( 660 self._match_text_seq("IDENTIFIER") and "Identifier" 661 ) 662 663 if not kind: 664 self._retreat(index) 665 return None 666 elif not self._match(TokenType.R_BRACE): 667 self.raise_error("Expecting }") 668 669 if isinstance(this, exp.Identifier) and not this.quoted: 670 this = exp.var(this.name) 671 672 return self.expression(exp.Placeholder, this=this, kind=kind) 673 674 def _parse_bracket( 675 self, this: t.Optional[exp.Expression] = None 676 ) -> t.Optional[exp.Expression]: 677 l_brace = self._match(TokenType.L_BRACE, advance=False) 678 bracket = super()._parse_bracket(this) 679 680 if l_brace and isinstance(bracket, exp.Struct): 681 varmap = exp.VarMap(keys=exp.Array(), values=exp.Array()) 682 for expression in bracket.expressions: 683 if not isinstance(expression, exp.PropertyEQ): 684 break 685 686 varmap.args["keys"].append("expressions", exp.Literal.string(expression.name)) 687 varmap.args["values"].append("expressions", expression.expression) 688 689 return varmap 690 691 return bracket 692 693 def _parse_in(self, this: t.Optional[exp.Expression], is_global: bool = False) -> exp.In: 694 this = super()._parse_in(this) 695 this.set("is_global", is_global) 696 return this 697 698 def _parse_global_in(self, this: t.Optional[exp.Expression]) -> exp.Not | exp.In: 699 is_negated = self._match(TokenType.NOT) 700 this = self._match(TokenType.IN) and self._parse_in(this, is_global=True) 701 return self.expression(exp.Not, this=this) if is_negated else this 702 703 def _parse_table( 704 self, 705 schema: bool = False, 706 joins: bool = False, 707 alias_tokens: t.Optional[t.Collection[TokenType]] = None, 708 parse_bracket: bool = False, 709 is_db_reference: bool = False, 710 parse_partition: bool = False, 711 consume_pipe: bool = False, 712 ) -> t.Optional[exp.Expression]: 713 this = super()._parse_table( 714 schema=schema, 715 joins=joins, 716 alias_tokens=alias_tokens, 717 parse_bracket=parse_bracket, 718 is_db_reference=is_db_reference, 719 ) 720 721 if isinstance(this, exp.Table): 722 inner = this.this 723 alias = this.args.get("alias") 724 725 if isinstance(inner, exp.GenerateSeries) and alias and not alias.columns: 726 alias.set("columns", [exp.to_identifier("generate_series")]) 727 728 if self._match(TokenType.FINAL): 729 this = self.expression(exp.Final, this=this) 730 731 return this 732 733 def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition: 734 return super()._parse_position(haystack_first=True) 735 736 # https://clickhouse.com/docs/en/sql-reference/statements/select/with/ 737 def _parse_cte(self) -> t.Optional[exp.CTE]: 738 # WITH <identifier> AS <subquery expression> 739 cte: t.Optional[exp.CTE] = self._try_parse(super()._parse_cte) 740 741 if not cte: 742 # WITH <expression> AS <identifier> 743 cte = self.expression( 744 exp.CTE, 745 this=self._parse_assignment(), 746 alias=self._parse_table_alias(), 747 scalar=True, 748 ) 749 750 return cte 751 752 def _parse_join_parts( 753 self, 754 ) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]: 755 is_global = self._match(TokenType.GLOBAL) and self._prev 756 kind_pre = self._match_set(self.JOIN_KINDS, advance=False) and self._prev 757 758 if kind_pre: 759 kind = self._match_set(self.JOIN_KINDS) and self._prev 760 side = self._match_set(self.JOIN_SIDES) and self._prev 761 return is_global, side, kind 762 763 return ( 764 is_global, 765 self._match_set(self.JOIN_SIDES) and self._prev, 766 self._match_set(self.JOIN_KINDS) and self._prev, 767 ) 768 769 def _parse_join( 770 self, skip_join_token: bool = False, parse_bracket: bool = False 771 ) -> t.Optional[exp.Join]: 772 join = super()._parse_join(skip_join_token=skip_join_token, parse_bracket=True) 773 if join: 774 join.set("global", join.args.pop("method", None)) 775 776 # tbl ARRAY JOIN arr <-- this should be a `Column` reference, not a `Table` 777 # https://clickhouse.com/docs/en/sql-reference/statements/select/array-join 778 if join.kind == "ARRAY": 779 for table in join.find_all(exp.Table): 780 table.replace(table.to_column()) 781 782 return join 783 784 def _parse_function( 785 self, 786 functions: t.Optional[t.Dict[str, t.Callable]] = None, 787 anonymous: bool = False, 788 optional_parens: bool = True, 789 any_token: bool = False, 790 ) -> t.Optional[exp.Expression]: 791 expr = super()._parse_function( 792 functions=functions, 793 anonymous=anonymous, 794 optional_parens=optional_parens, 795 any_token=any_token, 796 ) 797 798 func = expr.this if isinstance(expr, exp.Window) else expr 799 800 # Aggregate functions can be split in 2 parts: <func_name><suffix> 801 parts = ( 802 self.AGG_FUNC_MAPPING.get(func.this) if isinstance(func, exp.Anonymous) else None 803 ) 804 805 if parts: 806 anon_func: exp.Anonymous = t.cast(exp.Anonymous, func) 807 params = self._parse_func_params(anon_func) 808 809 kwargs = { 810 "this": anon_func.this, 811 "expressions": anon_func.expressions, 812 } 813 if parts[1]: 814 exp_class: t.Type[exp.Expression] = ( 815 exp.CombinedParameterizedAgg if params else exp.CombinedAggFunc 816 ) 817 else: 818 exp_class = exp.ParameterizedAgg if params else exp.AnonymousAggFunc 819 820 kwargs["exp_class"] = exp_class 821 if params: 822 kwargs["params"] = params 823 824 func = self.expression(**kwargs) 825 826 if isinstance(expr, exp.Window): 827 # The window's func was parsed as Anonymous in base parser, fix its 828 # type to be ClickHouse style CombinedAnonymousAggFunc / AnonymousAggFunc 829 expr.set("this", func) 830 elif params: 831 # Params have blocked super()._parse_function() from parsing the following window 832 # (if that exists) as they're standing between the function call and the window spec 833 expr = self._parse_window(func) 834 else: 835 expr = func 836 837 return expr 838 839 def _parse_func_params( 840 self, this: t.Optional[exp.Func] = None 841 ) -> t.Optional[t.List[exp.Expression]]: 842 if self._match_pair(TokenType.R_PAREN, TokenType.L_PAREN): 843 return self._parse_csv(self._parse_lambda) 844 845 if self._match(TokenType.L_PAREN): 846 params = self._parse_csv(self._parse_lambda) 847 self._match_r_paren(this) 848 return params 849 850 return None 851 852 def _parse_quantile(self) -> exp.Quantile: 853 this = self._parse_lambda() 854 params = self._parse_func_params() 855 if params: 856 return self.expression(exp.Quantile, this=params[0], quantile=this) 857 return self.expression(exp.Quantile, this=this, quantile=exp.Literal.number(0.5)) 858 859 def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]: 860 return super()._parse_wrapped_id_vars(optional=True) 861 862 def _parse_primary_key( 863 self, wrapped_optional: bool = False, in_props: bool = False 864 ) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey: 865 return super()._parse_primary_key( 866 wrapped_optional=wrapped_optional or in_props, in_props=in_props 867 ) 868 869 def _parse_on_property(self) -> t.Optional[exp.Expression]: 870 index = self._index 871 if self._match_text_seq("CLUSTER"): 872 this = self._parse_string() or self._parse_id_var() 873 if this: 874 return self.expression(exp.OnCluster, this=this) 875 else: 876 self._retreat(index) 877 return None 878 879 def _parse_index_constraint( 880 self, kind: t.Optional[str] = None 881 ) -> exp.IndexColumnConstraint: 882 # INDEX name1 expr TYPE type1(args) GRANULARITY value 883 this = self._parse_id_var() 884 expression = self._parse_assignment() 885 886 index_type = self._match_text_seq("TYPE") and ( 887 self._parse_function() or self._parse_var() 888 ) 889 890 granularity = self._match_text_seq("GRANULARITY") and self._parse_term() 891 892 return self.expression( 893 exp.IndexColumnConstraint, 894 this=this, 895 expression=expression, 896 index_type=index_type, 897 granularity=granularity, 898 ) 899 900 def _parse_partition(self) -> t.Optional[exp.Partition]: 901 # https://clickhouse.com/docs/en/sql-reference/statements/alter/partition#how-to-set-partition-expression 902 if not self._match(TokenType.PARTITION): 903 return None 904 905 if self._match_text_seq("ID"): 906 # Corresponds to the PARTITION ID <string_value> syntax 907 expressions: t.List[exp.Expression] = [ 908 self.expression(exp.PartitionId, this=self._parse_string()) 909 ] 910 else: 911 expressions = self._parse_expressions() 912 913 return self.expression(exp.Partition, expressions=expressions) 914 915 def _parse_alter_table_replace(self) -> t.Optional[exp.Expression]: 916 partition = self._parse_partition() 917 918 if not partition or not self._match(TokenType.FROM): 919 return None 920 921 return self.expression( 922 exp.ReplacePartition, expression=partition, source=self._parse_table_parts() 923 ) 924 925 def _parse_projection_def(self) -> t.Optional[exp.ProjectionDef]: 926 if not self._match_text_seq("PROJECTION"): 927 return None 928 929 return self.expression( 930 exp.ProjectionDef, 931 this=self._parse_id_var(), 932 expression=self._parse_wrapped(self._parse_statement), 933 ) 934 935 def _parse_constraint(self) -> t.Optional[exp.Expression]: 936 return super()._parse_constraint() or self._parse_projection_def() 937 938 def _parse_alias( 939 self, this: t.Optional[exp.Expression], explicit: bool = False 940 ) -> t.Optional[exp.Expression]: 941 # In clickhouse "SELECT <expr> APPLY(...)" is a query modifier, 942 # so "APPLY" shouldn't be parsed as <expr>'s alias. However, "SELECT <expr> apply" is a valid alias 943 if self._match_pair(TokenType.APPLY, TokenType.L_PAREN, advance=False): 944 return this 945 946 return super()._parse_alias(this=this, explicit=explicit) 947 948 def _parse_expression(self) -> t.Optional[exp.Expression]: 949 this = super()._parse_expression() 950 951 # Clickhouse allows "SELECT <expr> [APPLY(func)] [...]]" modifier 952 while self._match_pair(TokenType.APPLY, TokenType.L_PAREN): 953 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 954 self._match(TokenType.R_PAREN) 955 956 return this 957 958 def _parse_columns(self) -> exp.Expression: 959 this: exp.Expression = self.expression(exp.Columns, this=self._parse_lambda()) 960 961 while self._next and self._match_text_seq(")", "APPLY", "("): 962 self._match(TokenType.R_PAREN) 963 this = exp.Apply(this=this, expression=self._parse_var(any_token=True)) 964 return this 965 966 def _parse_value(self, values: bool = True) -> t.Optional[exp.Tuple]: 967 value = super()._parse_value(values=values) 968 if not value: 969 return None 970 971 # In Clickhouse "SELECT * FROM VALUES (1, 2, 3)" generates a table with a single column, in contrast 972 # to other dialects. For this case, we canonicalize the values into a tuple-of-tuples AST if it's not already one. 973 # In INSERT INTO statements the same clause actually references multiple columns (opposite semantics), 974 # but the final result is not altered by the extra parentheses. 975 # Note: Clickhouse allows VALUES([structure], value, ...) so the branch checks for the last expression 976 expressions = value.expressions 977 if values and not isinstance(expressions[-1], exp.Tuple): 978 value.set( 979 "expressions", 980 [self.expression(exp.Tuple, expressions=[expr]) for expr in expressions], 981 ) 982 983 return value 984 985 def _parse_partitioned_by(self) -> exp.PartitionedByProperty: 986 # ClickHouse allows custom expressions as partition key 987 # https://clickhouse.com/docs/engines/table-engines/mergetree-family/custom-partitioning-key 988 return self.expression( 989 exp.PartitionedByProperty, 990 this=self._parse_assignment(), 991 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- NESTED_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_HINTS
- LAMBDAS
- CAST_COLUMN_OPERATORS
- EXPRESSION_PARSERS
- STATEMENT_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_ALTER_PARSERS
- INVALID_FUNC_NAME_TOKENS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_TOKENS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- NULL_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- PREFIXED_PIVOT_COLUMNS
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- SUPPORTS_IMPLICIT_UNNEST
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- MAP_KEYS_ARE_ARBITRARY_EXPRESSIONS
- JSON_EXTRACT_REQUIRES_JSON_EXPRESSION
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- build_cast
- errors
- sql
993 class Generator(generator.Generator): 994 QUERY_HINTS = False 995 STRUCT_DELIMITER = ("(", ")") 996 NVL2_SUPPORTED = False 997 TABLESAMPLE_REQUIRES_PARENS = False 998 TABLESAMPLE_SIZE_IS_ROWS = False 999 TABLESAMPLE_KEYWORDS = "SAMPLE" 1000 LAST_DAY_SUPPORTS_DATE_PART = False 1001 CAN_IMPLEMENT_ARRAY_ANY = True 1002 SUPPORTS_TO_NUMBER = False 1003 JOIN_HINTS = False 1004 TABLE_HINTS = False 1005 GROUPINGS_SEP = "" 1006 SET_OP_MODIFIERS = False 1007 ARRAY_SIZE_NAME = "LENGTH" 1008 WRAP_DERIVED_VALUES = False 1009 1010 STRING_TYPE_MAPPING = { 1011 exp.DataType.Type.BLOB: "String", 1012 exp.DataType.Type.CHAR: "String", 1013 exp.DataType.Type.LONGBLOB: "String", 1014 exp.DataType.Type.LONGTEXT: "String", 1015 exp.DataType.Type.MEDIUMBLOB: "String", 1016 exp.DataType.Type.MEDIUMTEXT: "String", 1017 exp.DataType.Type.TINYBLOB: "String", 1018 exp.DataType.Type.TINYTEXT: "String", 1019 exp.DataType.Type.TEXT: "String", 1020 exp.DataType.Type.VARBINARY: "String", 1021 exp.DataType.Type.VARCHAR: "String", 1022 } 1023 1024 SUPPORTED_JSON_PATH_PARTS = { 1025 exp.JSONPathKey, 1026 exp.JSONPathRoot, 1027 exp.JSONPathSubscript, 1028 } 1029 1030 TYPE_MAPPING = { 1031 **generator.Generator.TYPE_MAPPING, 1032 **STRING_TYPE_MAPPING, 1033 exp.DataType.Type.ARRAY: "Array", 1034 exp.DataType.Type.BOOLEAN: "Bool", 1035 exp.DataType.Type.BIGINT: "Int64", 1036 exp.DataType.Type.DATE32: "Date32", 1037 exp.DataType.Type.DATETIME: "DateTime", 1038 exp.DataType.Type.DATETIME2: "DateTime", 1039 exp.DataType.Type.SMALLDATETIME: "DateTime", 1040 exp.DataType.Type.DATETIME64: "DateTime64", 1041 exp.DataType.Type.DECIMAL: "Decimal", 1042 exp.DataType.Type.DECIMAL32: "Decimal32", 1043 exp.DataType.Type.DECIMAL64: "Decimal64", 1044 exp.DataType.Type.DECIMAL128: "Decimal128", 1045 exp.DataType.Type.DECIMAL256: "Decimal256", 1046 exp.DataType.Type.TIMESTAMP: "DateTime", 1047 exp.DataType.Type.TIMESTAMPNTZ: "DateTime", 1048 exp.DataType.Type.TIMESTAMPTZ: "DateTime", 1049 exp.DataType.Type.DOUBLE: "Float64", 1050 exp.DataType.Type.ENUM: "Enum", 1051 exp.DataType.Type.ENUM8: "Enum8", 1052 exp.DataType.Type.ENUM16: "Enum16", 1053 exp.DataType.Type.FIXEDSTRING: "FixedString", 1054 exp.DataType.Type.FLOAT: "Float32", 1055 exp.DataType.Type.INT: "Int32", 1056 exp.DataType.Type.MEDIUMINT: "Int32", 1057 exp.DataType.Type.INT128: "Int128", 1058 exp.DataType.Type.INT256: "Int256", 1059 exp.DataType.Type.LOWCARDINALITY: "LowCardinality", 1060 exp.DataType.Type.MAP: "Map", 1061 exp.DataType.Type.NESTED: "Nested", 1062 exp.DataType.Type.NOTHING: "Nothing", 1063 exp.DataType.Type.SMALLINT: "Int16", 1064 exp.DataType.Type.STRUCT: "Tuple", 1065 exp.DataType.Type.TINYINT: "Int8", 1066 exp.DataType.Type.UBIGINT: "UInt64", 1067 exp.DataType.Type.UINT: "UInt32", 1068 exp.DataType.Type.UINT128: "UInt128", 1069 exp.DataType.Type.UINT256: "UInt256", 1070 exp.DataType.Type.USMALLINT: "UInt16", 1071 exp.DataType.Type.UTINYINT: "UInt8", 1072 exp.DataType.Type.IPV4: "IPv4", 1073 exp.DataType.Type.IPV6: "IPv6", 1074 exp.DataType.Type.POINT: "Point", 1075 exp.DataType.Type.RING: "Ring", 1076 exp.DataType.Type.LINESTRING: "LineString", 1077 exp.DataType.Type.MULTILINESTRING: "MultiLineString", 1078 exp.DataType.Type.POLYGON: "Polygon", 1079 exp.DataType.Type.MULTIPOLYGON: "MultiPolygon", 1080 exp.DataType.Type.AGGREGATEFUNCTION: "AggregateFunction", 1081 exp.DataType.Type.SIMPLEAGGREGATEFUNCTION: "SimpleAggregateFunction", 1082 exp.DataType.Type.DYNAMIC: "Dynamic", 1083 } 1084 1085 TRANSFORMS = { 1086 **generator.Generator.TRANSFORMS, 1087 exp.AnyValue: rename_func("any"), 1088 exp.ApproxDistinct: rename_func("uniq"), 1089 exp.ArrayConcat: rename_func("arrayConcat"), 1090 exp.ArrayFilter: lambda self, e: self.func("arrayFilter", e.expression, e.this), 1091 exp.ArrayRemove: remove_from_array_using_filter, 1092 exp.ArrayReverse: rename_func("arrayReverse"), 1093 exp.ArraySlice: rename_func("arraySlice"), 1094 exp.ArraySum: rename_func("arraySum"), 1095 exp.ArgMax: arg_max_or_min_no_count("argMax"), 1096 exp.ArgMin: arg_max_or_min_no_count("argMin"), 1097 exp.Array: inline_array_sql, 1098 exp.CastToStrType: rename_func("CAST"), 1099 exp.CountIf: rename_func("countIf"), 1100 exp.CosineDistance: rename_func("cosineDistance"), 1101 exp.CompressColumnConstraint: lambda self, 1102 e: f"CODEC({self.expressions(e, key='this', flat=True)})", 1103 exp.ComputedColumnConstraint: lambda self, 1104 e: f"{'MATERIALIZED' if e.args.get('persisted') else 'ALIAS'} {self.sql(e, 'this')}", 1105 exp.CurrentDate: lambda self, e: self.func("CURRENT_DATE"), 1106 exp.DateAdd: _datetime_delta_sql("DATE_ADD"), 1107 exp.DateDiff: _datetime_delta_sql("DATE_DIFF"), 1108 exp.DateStrToDate: rename_func("toDate"), 1109 exp.DateSub: _datetime_delta_sql("DATE_SUB"), 1110 exp.Explode: rename_func("arrayJoin"), 1111 exp.FarmFingerprint: rename_func("farmFingerprint64"), 1112 exp.Final: lambda self, e: f"{self.sql(e, 'this')} FINAL", 1113 exp.IsNan: rename_func("isNaN"), 1114 exp.JSONCast: lambda self, e: f"{self.sql(e, 'this')}.:{self.sql(e, 'to')}", 1115 exp.JSONExtract: json_extract_segments("JSONExtractString", quoted_index=False), 1116 exp.JSONExtractScalar: json_extract_segments("JSONExtractString", quoted_index=False), 1117 exp.JSONPathKey: json_path_key_only_name, 1118 exp.JSONPathRoot: lambda *_: "", 1119 exp.Length: length_or_char_length_sql, 1120 exp.Map: _map_sql, 1121 exp.Median: rename_func("median"), 1122 exp.Nullif: rename_func("nullIf"), 1123 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 1124 exp.Pivot: no_pivot_sql, 1125 exp.Quantile: _quantile_sql, 1126 exp.RegexpLike: lambda self, e: self.func("match", e.this, e.expression), 1127 exp.Rand: rename_func("randCanonical"), 1128 exp.StartsWith: rename_func("startsWith"), 1129 exp.EndsWith: rename_func("endsWith"), 1130 exp.EuclideanDistance: rename_func("L2Distance"), 1131 exp.StrPosition: lambda self, e: strposition_sql( 1132 self, 1133 e, 1134 func_name="POSITION", 1135 supports_position=True, 1136 use_ansi_position=False, 1137 ), 1138 exp.TimeToStr: lambda self, e: self.func( 1139 "formatDateTime", e.this, self.format_time(e), e.args.get("zone") 1140 ), 1141 exp.TimeStrToTime: _timestrtotime_sql, 1142 exp.TimestampAdd: _datetime_delta_sql("TIMESTAMP_ADD"), 1143 exp.TimestampSub: _datetime_delta_sql("TIMESTAMP_SUB"), 1144 exp.Typeof: rename_func("toTypeName"), 1145 exp.VarMap: _map_sql, 1146 exp.Xor: lambda self, e: self.func("xor", e.this, e.expression, *e.expressions), 1147 exp.MD5Digest: rename_func("MD5"), 1148 exp.MD5: lambda self, e: self.func("LOWER", self.func("HEX", self.func("MD5", e.this))), 1149 exp.SHA: rename_func("SHA1"), 1150 exp.SHA2: sha256_sql, 1151 exp.UnixToTime: _unix_to_time_sql, 1152 exp.TimestampTrunc: timestamptrunc_sql(zone=True), 1153 exp.Trim: lambda self, e: trim_sql(self, e, default_trim_type="BOTH"), 1154 exp.Variance: rename_func("varSamp"), 1155 exp.SchemaCommentProperty: lambda self, e: self.naked_property(e), 1156 exp.Stddev: rename_func("stddevSamp"), 1157 exp.Chr: rename_func("CHAR"), 1158 exp.Lag: lambda self, e: self.func( 1159 "lagInFrame", e.this, e.args.get("offset"), e.args.get("default") 1160 ), 1161 exp.Lead: lambda self, e: self.func( 1162 "leadInFrame", e.this, e.args.get("offset"), e.args.get("default") 1163 ), 1164 exp.Levenshtein: unsupported_args("ins_cost", "del_cost", "sub_cost", "max_dist")( 1165 rename_func("editDistance") 1166 ), 1167 exp.ParseDatetime: rename_func("parseDateTime"), 1168 } 1169 1170 PROPERTIES_LOCATION = { 1171 **generator.Generator.PROPERTIES_LOCATION, 1172 exp.OnCluster: exp.Properties.Location.POST_NAME, 1173 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1174 exp.ToTableProperty: exp.Properties.Location.POST_NAME, 1175 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1176 } 1177 1178 # There's no list in docs, but it can be found in Clickhouse code 1179 # see `ClickHouse/src/Parsers/ParserCreate*.cpp` 1180 ON_CLUSTER_TARGETS = { 1181 "SCHEMA", # Transpiled CREATE SCHEMA may have OnCluster property set 1182 "DATABASE", 1183 "TABLE", 1184 "VIEW", 1185 "DICTIONARY", 1186 "INDEX", 1187 "FUNCTION", 1188 "NAMED COLLECTION", 1189 } 1190 1191 # https://clickhouse.com/docs/en/sql-reference/data-types/nullable 1192 NON_NULLABLE_TYPES = { 1193 exp.DataType.Type.ARRAY, 1194 exp.DataType.Type.MAP, 1195 exp.DataType.Type.STRUCT, 1196 exp.DataType.Type.POINT, 1197 exp.DataType.Type.RING, 1198 exp.DataType.Type.LINESTRING, 1199 exp.DataType.Type.MULTILINESTRING, 1200 exp.DataType.Type.POLYGON, 1201 exp.DataType.Type.MULTIPOLYGON, 1202 } 1203 1204 def offset_sql(self, expression: exp.Offset) -> str: 1205 offset = super().offset_sql(expression) 1206 1207 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1208 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1209 parent = expression.parent 1210 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1211 offset = f"{offset} ROWS" 1212 1213 return offset 1214 1215 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1216 strtodate_sql = self.function_fallback_sql(expression) 1217 1218 if not isinstance(expression.parent, exp.Cast): 1219 # StrToDate returns DATEs in other dialects (eg. postgres), so 1220 # this branch aims to improve the transpilation to clickhouse 1221 return self.cast_sql(exp.cast(expression, "DATE")) 1222 1223 return strtodate_sql 1224 1225 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1226 this = expression.this 1227 1228 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1229 return self.sql(this) 1230 1231 return super().cast_sql(expression, safe_prefix=safe_prefix) 1232 1233 def trycast_sql(self, expression: exp.TryCast) -> str: 1234 dtype = expression.to 1235 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1236 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1237 dtype.set("nullable", True) 1238 1239 return super().cast_sql(expression) 1240 1241 def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str: 1242 this = self.json_path_part(expression.this) 1243 return str(int(this) + 1) if is_int(this) else this 1244 1245 def likeproperty_sql(self, expression: exp.LikeProperty) -> str: 1246 return f"AS {self.sql(expression, 'this')}" 1247 1248 def _any_to_has( 1249 self, 1250 expression: exp.EQ | exp.NEQ, 1251 default: t.Callable[[t.Any], str], 1252 prefix: str = "", 1253 ) -> str: 1254 if isinstance(expression.left, exp.Any): 1255 arr = expression.left 1256 this = expression.right 1257 elif isinstance(expression.right, exp.Any): 1258 arr = expression.right 1259 this = expression.left 1260 else: 1261 return default(expression) 1262 1263 return prefix + self.func("has", arr.this.unnest(), this) 1264 1265 def eq_sql(self, expression: exp.EQ) -> str: 1266 return self._any_to_has(expression, super().eq_sql) 1267 1268 def neq_sql(self, expression: exp.NEQ) -> str: 1269 return self._any_to_has(expression, super().neq_sql, "NOT ") 1270 1271 def regexpilike_sql(self, expression: exp.RegexpILike) -> str: 1272 # Manually add a flag to make the search case-insensitive 1273 regex = self.func("CONCAT", "'(?i)'", expression.expression) 1274 return self.func("match", expression.this, regex) 1275 1276 def datatype_sql(self, expression: exp.DataType) -> str: 1277 # String is the standard ClickHouse type, every other variant is just an alias. 1278 # Additionally, any supplied length parameter will be ignored. 1279 # 1280 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1281 if expression.this in self.STRING_TYPE_MAPPING: 1282 dtype = "String" 1283 else: 1284 dtype = super().datatype_sql(expression) 1285 1286 # This section changes the type to `Nullable(...)` if the following conditions hold: 1287 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1288 # and change their semantics 1289 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1290 # constraint: "Type of Map key must be a type, that can be represented by integer or 1291 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1292 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1293 parent = expression.parent 1294 nullable = expression.args.get("nullable") 1295 if nullable is True or ( 1296 nullable is None 1297 and not ( 1298 isinstance(parent, exp.DataType) 1299 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1300 and expression.index in (None, 0) 1301 ) 1302 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1303 ): 1304 dtype = f"Nullable({dtype})" 1305 1306 return dtype 1307 1308 def cte_sql(self, expression: exp.CTE) -> str: 1309 if expression.args.get("scalar"): 1310 this = self.sql(expression, "this") 1311 alias = self.sql(expression, "alias") 1312 return f"{this} AS {alias}" 1313 1314 return super().cte_sql(expression) 1315 1316 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1317 return super().after_limit_modifiers(expression) + [ 1318 ( 1319 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1320 if expression.args.get("settings") 1321 else "" 1322 ), 1323 ( 1324 self.seg("FORMAT ") + self.sql(expression, "format") 1325 if expression.args.get("format") 1326 else "" 1327 ), 1328 ] 1329 1330 def placeholder_sql(self, expression: exp.Placeholder) -> str: 1331 return f"{{{expression.name}: {self.sql(expression, 'kind')}}}" 1332 1333 def oncluster_sql(self, expression: exp.OnCluster) -> str: 1334 return f"ON CLUSTER {self.sql(expression, 'this')}" 1335 1336 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1337 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1338 exp.Properties.Location.POST_NAME 1339 ): 1340 this_name = self.sql( 1341 expression.this if isinstance(expression.this, exp.Schema) else expression, 1342 "this", 1343 ) 1344 this_properties = " ".join( 1345 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1346 ) 1347 this_schema = self.schema_columns_sql(expression.this) 1348 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1349 1350 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1351 1352 return super().createable_sql(expression, locations) 1353 1354 def create_sql(self, expression: exp.Create) -> str: 1355 # The comment property comes last in CTAS statements, i.e. after the query 1356 query = expression.expression 1357 if isinstance(query, exp.Query): 1358 comment_prop = expression.find(exp.SchemaCommentProperty) 1359 if comment_prop: 1360 comment_prop.pop() 1361 query.replace(exp.paren(query)) 1362 else: 1363 comment_prop = None 1364 1365 create_sql = super().create_sql(expression) 1366 1367 comment_sql = self.sql(comment_prop) 1368 comment_sql = f" {comment_sql}" if comment_sql else "" 1369 1370 return f"{create_sql}{comment_sql}" 1371 1372 def prewhere_sql(self, expression: exp.PreWhere) -> str: 1373 this = self.indent(self.sql(expression, "this")) 1374 return f"{self.seg('PREWHERE')}{self.sep()}{this}" 1375 1376 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1377 this = self.sql(expression, "this") 1378 this = f" {this}" if this else "" 1379 expr = self.sql(expression, "expression") 1380 expr = f" {expr}" if expr else "" 1381 index_type = self.sql(expression, "index_type") 1382 index_type = f" TYPE {index_type}" if index_type else "" 1383 granularity = self.sql(expression, "granularity") 1384 granularity = f" GRANULARITY {granularity}" if granularity else "" 1385 1386 return f"INDEX{this}{expr}{index_type}{granularity}" 1387 1388 def partition_sql(self, expression: exp.Partition) -> str: 1389 return f"PARTITION {self.expressions(expression, flat=True)}" 1390 1391 def partitionid_sql(self, expression: exp.PartitionId) -> str: 1392 return f"ID {self.sql(expression.this)}" 1393 1394 def replacepartition_sql(self, expression: exp.ReplacePartition) -> str: 1395 return ( 1396 f"REPLACE {self.sql(expression.expression)} FROM {self.sql(expression, 'source')}" 1397 ) 1398 1399 def projectiondef_sql(self, expression: exp.ProjectionDef) -> str: 1400 return f"PROJECTION {self.sql(expression.this)} {self.wrap(expression.expression)}" 1401 1402 def is_sql(self, expression: exp.Is) -> str: 1403 is_sql = super().is_sql(expression) 1404 1405 if isinstance(expression.parent, exp.Not): 1406 # value IS NOT NULL -> NOT (value IS NULL) 1407 is_sql = self.wrap(is_sql) 1408 1409 return is_sql 1410 1411 def in_sql(self, expression: exp.In) -> str: 1412 in_sql = super().in_sql(expression) 1413 1414 if isinstance(expression.parent, exp.Not) and expression.args.get("is_global"): 1415 in_sql = in_sql.replace("GLOBAL IN", "GLOBAL NOT IN", 1) 1416 1417 return in_sql 1418 1419 def not_sql(self, expression: exp.Not) -> str: 1420 if isinstance(expression.this, exp.In) and expression.this.args.get("is_global"): 1421 # let `GLOBAL IN` child interpose `NOT` 1422 return self.sql(expression, "this") 1423 1424 return super().not_sql(expression) 1425 1426 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1427 # If the VALUES clause contains tuples of expressions, we need to treat it 1428 # as a table since Clickhouse will automatically alias it as such. 1429 alias = expression.args.get("alias") 1430 1431 if alias and alias.args.get("columns") and expression.expressions: 1432 values = expression.expressions[0].expressions 1433 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1434 else: 1435 values_as_table = True 1436 1437 return super().values_sql(expression, values_as_table=values_as_table)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the
indentation of subqueries and filters under a
WHEREclause. Default: 2. - normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
1204 def offset_sql(self, expression: exp.Offset) -> str: 1205 offset = super().offset_sql(expression) 1206 1207 # OFFSET ... FETCH syntax requires a "ROW" or "ROWS" keyword 1208 # https://clickhouse.com/docs/sql-reference/statements/select/offset 1209 parent = expression.parent 1210 if isinstance(parent, exp.Select) and isinstance(parent.args.get("limit"), exp.Fetch): 1211 offset = f"{offset} ROWS" 1212 1213 return offset
1215 def strtodate_sql(self, expression: exp.StrToDate) -> str: 1216 strtodate_sql = self.function_fallback_sql(expression) 1217 1218 if not isinstance(expression.parent, exp.Cast): 1219 # StrToDate returns DATEs in other dialects (eg. postgres), so 1220 # this branch aims to improve the transpilation to clickhouse 1221 return self.cast_sql(exp.cast(expression, "DATE")) 1222 1223 return strtodate_sql
1225 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1226 this = expression.this 1227 1228 if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"): 1229 return self.sql(this) 1230 1231 return super().cast_sql(expression, safe_prefix=safe_prefix)
1233 def trycast_sql(self, expression: exp.TryCast) -> str: 1234 dtype = expression.to 1235 if not dtype.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True): 1236 # Casting x into Nullable(T) appears to behave similarly to TRY_CAST(x AS T) 1237 dtype.set("nullable", True) 1238 1239 return super().cast_sql(expression)
1276 def datatype_sql(self, expression: exp.DataType) -> str: 1277 # String is the standard ClickHouse type, every other variant is just an alias. 1278 # Additionally, any supplied length parameter will be ignored. 1279 # 1280 # https://clickhouse.com/docs/en/sql-reference/data-types/string 1281 if expression.this in self.STRING_TYPE_MAPPING: 1282 dtype = "String" 1283 else: 1284 dtype = super().datatype_sql(expression) 1285 1286 # This section changes the type to `Nullable(...)` if the following conditions hold: 1287 # - It's marked as nullable - this ensures we won't wrap ClickHouse types with `Nullable` 1288 # and change their semantics 1289 # - It's not the key type of a `Map`. This is because ClickHouse enforces the following 1290 # constraint: "Type of Map key must be a type, that can be represented by integer or 1291 # String or FixedString (possibly LowCardinality) or UUID or IPv6" 1292 # - It's not a composite type, e.g. `Nullable(Array(...))` is not a valid type 1293 parent = expression.parent 1294 nullable = expression.args.get("nullable") 1295 if nullable is True or ( 1296 nullable is None 1297 and not ( 1298 isinstance(parent, exp.DataType) 1299 and parent.is_type(exp.DataType.Type.MAP, check_nullable=True) 1300 and expression.index in (None, 0) 1301 ) 1302 and not expression.is_type(*self.NON_NULLABLE_TYPES, check_nullable=True) 1303 ): 1304 dtype = f"Nullable({dtype})" 1305 1306 return dtype
1316 def after_limit_modifiers(self, expression: exp.Expression) -> t.List[str]: 1317 return super().after_limit_modifiers(expression) + [ 1318 ( 1319 self.seg("SETTINGS ") + self.expressions(expression, key="settings", flat=True) 1320 if expression.args.get("settings") 1321 else "" 1322 ), 1323 ( 1324 self.seg("FORMAT ") + self.sql(expression, "format") 1325 if expression.args.get("format") 1326 else "" 1327 ), 1328 ]
1336 def createable_sql(self, expression: exp.Create, locations: t.DefaultDict) -> str: 1337 if expression.kind in self.ON_CLUSTER_TARGETS and locations.get( 1338 exp.Properties.Location.POST_NAME 1339 ): 1340 this_name = self.sql( 1341 expression.this if isinstance(expression.this, exp.Schema) else expression, 1342 "this", 1343 ) 1344 this_properties = " ".join( 1345 [self.sql(prop) for prop in locations[exp.Properties.Location.POST_NAME]] 1346 ) 1347 this_schema = self.schema_columns_sql(expression.this) 1348 this_schema = f"{self.sep()}{this_schema}" if this_schema else "" 1349 1350 return f"{this_name}{self.sep()}{this_properties}{this_schema}" 1351 1352 return super().createable_sql(expression, locations)
1354 def create_sql(self, expression: exp.Create) -> str: 1355 # The comment property comes last in CTAS statements, i.e. after the query 1356 query = expression.expression 1357 if isinstance(query, exp.Query): 1358 comment_prop = expression.find(exp.SchemaCommentProperty) 1359 if comment_prop: 1360 comment_prop.pop() 1361 query.replace(exp.paren(query)) 1362 else: 1363 comment_prop = None 1364 1365 create_sql = super().create_sql(expression) 1366 1367 comment_sql = self.sql(comment_prop) 1368 comment_sql = f" {comment_sql}" if comment_sql else "" 1369 1370 return f"{create_sql}{comment_sql}"
1376 def indexcolumnconstraint_sql(self, expression: exp.IndexColumnConstraint) -> str: 1377 this = self.sql(expression, "this") 1378 this = f" {this}" if this else "" 1379 expr = self.sql(expression, "expression") 1380 expr = f" {expr}" if expr else "" 1381 index_type = self.sql(expression, "index_type") 1382 index_type = f" TYPE {index_type}" if index_type else "" 1383 granularity = self.sql(expression, "granularity") 1384 granularity = f" GRANULARITY {granularity}" if granularity else "" 1385 1386 return f"INDEX{this}{expr}{index_type}{granularity}"
1426 def values_sql(self, expression: exp.Values, values_as_table: bool = True) -> str: 1427 # If the VALUES clause contains tuples of expressions, we need to treat it 1428 # as a table since Clickhouse will automatically alias it as such. 1429 alias = expression.args.get("alias") 1430 1431 if alias and alias.args.get("columns") and expression.expressions: 1432 values = expression.expressions[0].expressions 1433 values_as_table = any(isinstance(value, exp.Tuple) for value in values) 1434 else: 1435 values_as_table = True 1436 1437 return super().values_sql(expression, values_as_table=values_as_table)
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- IGNORE_NULLS_IN_FUNC
- LOCKING_READS_SUPPORTED
- EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- INTERVAL_ALLOWS_PLURAL_FORM
- LIMIT_FETCH
- LIMIT_ONLY_LITERALS
- RENAME_TABLE_WITH_DB
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- UNNEST_WITH_ORDINALITY
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- COLLATE_IS_FUNC
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- SUPPORTS_TABLE_ALIAS_COLUMNS
- UNPIVOT_ALIASES_ARE_IDENTIFIERS
- JSON_KEY_VALUE_PAIR_SEP
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- JSON_PATH_SINGLE_QUOTE_ESCAPE
- SUPPORTS_WINDOW_EXCLUDE
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- UNICODE_SUBSTITUTE
- STAR_EXCEPT
- HEX_FUNC
- WITH_PROPERTIES_PREFIX
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- SUPPORTS_EXPLODING_PROJECTIONS
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- SUPPORTS_UNIX_SECONDS
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- SUPPORTS_BETWEEN_FLAGS
- SUPPORTS_LIKE_QUANTIFIERS
- MATCH_AGAINST_TABLE_PREFIX
- UNSUPPORTED_TYPES
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- PARAMETER_TOKEN
- NAMED_PLACEHOLDER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- RESERVED_KEYWORDS
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SAFE_JSON_PATH_KEY_RE
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_parts
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_parts
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- version_sql
- tuple_sql
- update_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- setitem_sql
- set_sql
- queryband_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- subquery_sql
- qualify_sql
- unnest_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- bracket_sql
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- formatphrase_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_unnest_op
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- attimezone_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- altersession_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- like_sql
- ilike_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- generateembedding_sql
- featuresattime_sql
- vectorsearch_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- revoke_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql
- decodecase_sql
- semanticview_sql
- getextract_sql
- datefromunixdate_sql
- space_sql
- buildproperty_sql
- refreshtriggerproperty_sql