sqlglot.tokens
1from __future__ import annotations 2 3import os 4import typing as t 5from enum import auto 6 7from sqlglot.errors import SqlglotError, TokenError 8from sqlglot.helper import AutoName 9from sqlglot.trie import TrieResult, in_trie, new_trie 10 11if t.TYPE_CHECKING: 12 from sqlglot.dialects.dialect import DialectType 13 14 15try: 16 from sqlglotrs import ( # type: ignore 17 Tokenizer as RsTokenizer, 18 TokenizerDialectSettings as RsTokenizerDialectSettings, 19 TokenizerSettings as RsTokenizerSettings, 20 TokenTypeSettings as RsTokenTypeSettings, 21 ) 22 23 USE_RS_TOKENIZER = os.environ.get("SQLGLOTRS_TOKENIZER", "1") == "1" 24except ImportError: 25 USE_RS_TOKENIZER = False 26 27 28class TokenType(AutoName): 29 L_PAREN = auto() 30 R_PAREN = auto() 31 L_BRACKET = auto() 32 R_BRACKET = auto() 33 L_BRACE = auto() 34 R_BRACE = auto() 35 COMMA = auto() 36 DOT = auto() 37 DASH = auto() 38 PLUS = auto() 39 COLON = auto() 40 DOTCOLON = auto() 41 DCOLON = auto() 42 DCOLONDOLLAR = auto() 43 DCOLONPERCENT = auto() 44 DQMARK = auto() 45 SEMICOLON = auto() 46 STAR = auto() 47 BACKSLASH = auto() 48 SLASH = auto() 49 LT = auto() 50 LTE = auto() 51 GT = auto() 52 GTE = auto() 53 NOT = auto() 54 EQ = auto() 55 NEQ = auto() 56 NULLSAFE_EQ = auto() 57 COLON_EQ = auto() 58 COLON_GT = auto() 59 NCOLON_GT = auto() 60 AND = auto() 61 OR = auto() 62 AMP = auto() 63 DPIPE = auto() 64 PIPE_GT = auto() 65 PIPE = auto() 66 PIPE_SLASH = auto() 67 DPIPE_SLASH = auto() 68 CARET = auto() 69 CARET_AT = auto() 70 TILDA = auto() 71 ARROW = auto() 72 DARROW = auto() 73 FARROW = auto() 74 HASH = auto() 75 HASH_ARROW = auto() 76 DHASH_ARROW = auto() 77 LR_ARROW = auto() 78 DAT = auto() 79 LT_AT = auto() 80 AT_GT = auto() 81 DOLLAR = auto() 82 PARAMETER = auto() 83 SESSION = auto() 84 SESSION_PARAMETER = auto() 85 DAMP = auto() 86 XOR = auto() 87 DSTAR = auto() 88 89 URI_START = auto() 90 91 BLOCK_START = auto() 92 BLOCK_END = auto() 93 94 SPACE = auto() 95 BREAK = auto() 96 97 STRING = auto() 98 NUMBER = auto() 99 IDENTIFIER = auto() 100 DATABASE = auto() 101 COLUMN = auto() 102 COLUMN_DEF = auto() 103 SCHEMA = auto() 104 TABLE = auto() 105 WAREHOUSE = auto() 106 STAGE = auto() 107 STREAMLIT = auto() 108 VAR = auto() 109 BIT_STRING = auto() 110 HEX_STRING = auto() 111 BYTE_STRING = auto() 112 NATIONAL_STRING = auto() 113 RAW_STRING = auto() 114 HEREDOC_STRING = auto() 115 UNICODE_STRING = auto() 116 117 # types 118 BIT = auto() 119 BOOLEAN = auto() 120 TINYINT = auto() 121 UTINYINT = auto() 122 SMALLINT = auto() 123 USMALLINT = auto() 124 MEDIUMINT = auto() 125 UMEDIUMINT = auto() 126 INT = auto() 127 UINT = auto() 128 BIGINT = auto() 129 UBIGINT = auto() 130 INT128 = auto() 131 UINT128 = auto() 132 INT256 = auto() 133 UINT256 = auto() 134 FLOAT = auto() 135 DOUBLE = auto() 136 UDOUBLE = auto() 137 DECIMAL = auto() 138 DECIMAL32 = auto() 139 DECIMAL64 = auto() 140 DECIMAL128 = auto() 141 DECIMAL256 = auto() 142 UDECIMAL = auto() 143 BIGDECIMAL = auto() 144 CHAR = auto() 145 NCHAR = auto() 146 VARCHAR = auto() 147 NVARCHAR = auto() 148 BPCHAR = auto() 149 TEXT = auto() 150 MEDIUMTEXT = auto() 151 LONGTEXT = auto() 152 BLOB = auto() 153 MEDIUMBLOB = auto() 154 LONGBLOB = auto() 155 TINYBLOB = auto() 156 TINYTEXT = auto() 157 NAME = auto() 158 BINARY = auto() 159 VARBINARY = auto() 160 JSON = auto() 161 JSONB = auto() 162 TIME = auto() 163 TIMETZ = auto() 164 TIMESTAMP = auto() 165 TIMESTAMPTZ = auto() 166 TIMESTAMPLTZ = auto() 167 TIMESTAMPNTZ = auto() 168 TIMESTAMP_S = auto() 169 TIMESTAMP_MS = auto() 170 TIMESTAMP_NS = auto() 171 
DATETIME = auto() 172 DATETIME2 = auto() 173 DATETIME64 = auto() 174 SMALLDATETIME = auto() 175 DATE = auto() 176 DATE32 = auto() 177 INT4RANGE = auto() 178 INT4MULTIRANGE = auto() 179 INT8RANGE = auto() 180 INT8MULTIRANGE = auto() 181 NUMRANGE = auto() 182 NUMMULTIRANGE = auto() 183 TSRANGE = auto() 184 TSMULTIRANGE = auto() 185 TSTZRANGE = auto() 186 TSTZMULTIRANGE = auto() 187 DATERANGE = auto() 188 DATEMULTIRANGE = auto() 189 UUID = auto() 190 GEOGRAPHY = auto() 191 GEOGRAPHYPOINT = auto() 192 NULLABLE = auto() 193 GEOMETRY = auto() 194 POINT = auto() 195 RING = auto() 196 LINESTRING = auto() 197 MULTILINESTRING = auto() 198 POLYGON = auto() 199 MULTIPOLYGON = auto() 200 HLLSKETCH = auto() 201 HSTORE = auto() 202 SUPER = auto() 203 SERIAL = auto() 204 SMALLSERIAL = auto() 205 BIGSERIAL = auto() 206 XML = auto() 207 YEAR = auto() 208 USERDEFINED = auto() 209 MONEY = auto() 210 SMALLMONEY = auto() 211 ROWVERSION = auto() 212 IMAGE = auto() 213 VARIANT = auto() 214 OBJECT = auto() 215 INET = auto() 216 IPADDRESS = auto() 217 IPPREFIX = auto() 218 IPV4 = auto() 219 IPV6 = auto() 220 ENUM = auto() 221 ENUM8 = auto() 222 ENUM16 = auto() 223 FIXEDSTRING = auto() 224 LOWCARDINALITY = auto() 225 NESTED = auto() 226 AGGREGATEFUNCTION = auto() 227 SIMPLEAGGREGATEFUNCTION = auto() 228 TDIGEST = auto() 229 UNKNOWN = auto() 230 VECTOR = auto() 231 DYNAMIC = auto() 232 VOID = auto() 233 234 # keywords 235 ALIAS = auto() 236 ALTER = auto() 237 ALL = auto() 238 ANTI = auto() 239 ANY = auto() 240 APPLY = auto() 241 ARRAY = auto() 242 ASC = auto() 243 ASOF = auto() 244 ATTACH = auto() 245 AUTO_INCREMENT = auto() 246 BEGIN = auto() 247 BETWEEN = auto() 248 BULK_COLLECT_INTO = auto() 249 CACHE = auto() 250 CASE = auto() 251 CHARACTER_SET = auto() 252 CLUSTER_BY = auto() 253 COLLATE = auto() 254 COMMAND = auto() 255 COMMENT = auto() 256 COMMIT = auto() 257 CONNECT_BY = auto() 258 CONSTRAINT = auto() 259 COPY = auto() 260 CREATE = auto() 261 CROSS = auto() 262 CUBE = auto() 263 CURRENT_DATE = auto() 264 CURRENT_DATETIME = auto() 265 CURRENT_SCHEMA = auto() 266 CURRENT_TIME = auto() 267 CURRENT_TIMESTAMP = auto() 268 CURRENT_USER = auto() 269 DECLARE = auto() 270 DEFAULT = auto() 271 DELETE = auto() 272 DESC = auto() 273 DESCRIBE = auto() 274 DETACH = auto() 275 DICTIONARY = auto() 276 DISTINCT = auto() 277 DISTRIBUTE_BY = auto() 278 DIV = auto() 279 DROP = auto() 280 ELSE = auto() 281 END = auto() 282 ESCAPE = auto() 283 EXCEPT = auto() 284 EXECUTE = auto() 285 EXISTS = auto() 286 FALSE = auto() 287 FETCH = auto() 288 FILE_FORMAT = auto() 289 FILTER = auto() 290 FINAL = auto() 291 FIRST = auto() 292 FOR = auto() 293 FORCE = auto() 294 FOREIGN_KEY = auto() 295 FORMAT = auto() 296 FROM = auto() 297 FULL = auto() 298 FUNCTION = auto() 299 GET = auto() 300 GLOB = auto() 301 GLOBAL = auto() 302 GRANT = auto() 303 GROUP_BY = auto() 304 GROUPING_SETS = auto() 305 HAVING = auto() 306 HINT = auto() 307 IGNORE = auto() 308 ILIKE = auto() 309 IN = auto() 310 INDEX = auto() 311 INNER = auto() 312 INSERT = auto() 313 INTERSECT = auto() 314 INTERVAL = auto() 315 INTO = auto() 316 INTRODUCER = auto() 317 IRLIKE = auto() 318 IS = auto() 319 ISNULL = auto() 320 JOIN = auto() 321 JOIN_MARKER = auto() 322 KEEP = auto() 323 KEY = auto() 324 KILL = auto() 325 LANGUAGE = auto() 326 LATERAL = auto() 327 LEFT = auto() 328 LIKE = auto() 329 LIMIT = auto() 330 LIST = auto() 331 LOAD = auto() 332 LOCK = auto() 333 MAP = auto() 334 MATCH_CONDITION = auto() 335 MATCH_RECOGNIZE = auto() 336 MEMBER_OF = auto() 337 MERGE = auto() 338 MOD = 
auto() 339 MODEL = auto() 340 NATURAL = auto() 341 NEXT = auto() 342 NOTHING = auto() 343 NOTNULL = auto() 344 NULL = auto() 345 OBJECT_IDENTIFIER = auto() 346 OFFSET = auto() 347 ON = auto() 348 ONLY = auto() 349 OPERATOR = auto() 350 ORDER_BY = auto() 351 ORDER_SIBLINGS_BY = auto() 352 ORDERED = auto() 353 ORDINALITY = auto() 354 OUTER = auto() 355 OVER = auto() 356 OVERLAPS = auto() 357 OVERWRITE = auto() 358 PARTITION = auto() 359 PARTITION_BY = auto() 360 PERCENT = auto() 361 PIVOT = auto() 362 PLACEHOLDER = auto() 363 POSITIONAL = auto() 364 PRAGMA = auto() 365 PREWHERE = auto() 366 PRIMARY_KEY = auto() 367 PROCEDURE = auto() 368 PROPERTIES = auto() 369 PSEUDO_TYPE = auto() 370 PUT = auto() 371 QUALIFY = auto() 372 QUOTE = auto() 373 RANGE = auto() 374 RECURSIVE = auto() 375 REFRESH = auto() 376 RENAME = auto() 377 REPLACE = auto() 378 RETURNING = auto() 379 REVOKE = auto() 380 REFERENCES = auto() 381 RIGHT = auto() 382 RLIKE = auto() 383 ROLLBACK = auto() 384 ROLLUP = auto() 385 ROW = auto() 386 ROWS = auto() 387 SELECT = auto() 388 SEMI = auto() 389 SEPARATOR = auto() 390 SEQUENCE = auto() 391 SERDE_PROPERTIES = auto() 392 SET = auto() 393 SETTINGS = auto() 394 SHOW = auto() 395 SIMILAR_TO = auto() 396 SOME = auto() 397 SORT_BY = auto() 398 START_WITH = auto() 399 STORAGE_INTEGRATION = auto() 400 STRAIGHT_JOIN = auto() 401 STRUCT = auto() 402 SUMMARIZE = auto() 403 TABLE_SAMPLE = auto() 404 TAG = auto() 405 TEMPORARY = auto() 406 TOP = auto() 407 THEN = auto() 408 TRUE = auto() 409 TRUNCATE = auto() 410 UNCACHE = auto() 411 UNION = auto() 412 UNNEST = auto() 413 UNPIVOT = auto() 414 UPDATE = auto() 415 USE = auto() 416 USING = auto() 417 VALUES = auto() 418 VIEW = auto() 419 SEMANTIC_VIEW = auto() 420 VOLATILE = auto() 421 WHEN = auto() 422 WHERE = auto() 423 WINDOW = auto() 424 WITH = auto() 425 UNIQUE = auto() 426 UTC_DATE = auto() 427 UTC_TIME = auto() 428 UTC_TIMESTAMP = auto() 429 VERSION_SNAPSHOT = auto() 430 TIMESTAMP_SNAPSHOT = auto() 431 OPTION = auto() 432 SINK = auto() 433 SOURCE = auto() 434 ANALYZE = auto() 435 NAMESPACE = auto() 436 EXPORT = auto() 437 438 # sentinel 439 HIVE_TOKEN_STREAM = auto() 440 441 442_ALL_TOKEN_TYPES = list(TokenType) 443_TOKEN_TYPE_TO_INDEX = {token_type: i for i, token_type in enumerate(_ALL_TOKEN_TYPES)} 444 445 446class Token: 447 __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments") 448 449 @classmethod 450 def number(cls, number: int) -> Token: 451 """Returns a NUMBER token with `number` as its text.""" 452 return cls(TokenType.NUMBER, str(number)) 453 454 @classmethod 455 def string(cls, string: str) -> Token: 456 """Returns a STRING token with `string` as its text.""" 457 return cls(TokenType.STRING, string) 458 459 @classmethod 460 def identifier(cls, identifier: str) -> Token: 461 """Returns an IDENTIFIER token with `identifier` as its text.""" 462 return cls(TokenType.IDENTIFIER, identifier) 463 464 @classmethod 465 def var(cls, var: str) -> Token: 466 """Returns an VAR token with `var` as its text.""" 467 return cls(TokenType.VAR, var) 468 469 def __init__( 470 self, 471 token_type: TokenType, 472 text: str, 473 line: int = 1, 474 col: int = 1, 475 start: int = 0, 476 end: int = 0, 477 comments: t.Optional[t.List[str]] = None, 478 ) -> None: 479 """Token initializer. 480 481 Args: 482 token_type: The TokenType Enum. 483 text: The text of the token. 484 line: The line that the token ends on. 485 col: The column that the token ends on. 486 start: The start index of the token. 
487 end: The ending index of the token. 488 comments: The comments to attach to the token. 489 """ 490 self.token_type = token_type 491 self.text = text 492 self.line = line 493 self.col = col 494 self.start = start 495 self.end = end 496 self.comments = [] if comments is None else comments 497 498 def __repr__(self) -> str: 499 attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__) 500 return f"<Token {attributes}>" 501 502 503class _Tokenizer(type): 504 def __new__(cls, clsname, bases, attrs): 505 klass = super().__new__(cls, clsname, bases, attrs) 506 507 def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]: 508 return dict( 509 (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr 510 ) 511 512 def _quotes_to_format( 513 token_type: TokenType, arr: t.List[str | t.Tuple[str, str]] 514 ) -> t.Dict[str, t.Tuple[str, TokenType]]: 515 return {k: (v, token_type) for k, v in _convert_quotes(arr).items()} 516 517 klass._QUOTES = _convert_quotes(klass.QUOTES) 518 klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS) 519 520 klass._FORMAT_STRINGS = { 521 **{ 522 p + s: (e, TokenType.NATIONAL_STRING) 523 for s, e in klass._QUOTES.items() 524 for p in ("n", "N") 525 }, 526 **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS), 527 **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS), 528 **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS), 529 **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS), 530 **_quotes_to_format(TokenType.HEREDOC_STRING, klass.HEREDOC_STRINGS), 531 **_quotes_to_format(TokenType.UNICODE_STRING, klass.UNICODE_STRINGS), 532 } 533 534 klass._STRING_ESCAPES = set(klass.STRING_ESCAPES) 535 klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES) 536 klass._COMMENTS = { 537 **dict( 538 (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) 539 for comment in klass.COMMENTS 540 ), 541 "{#": "#}", # Ensure Jinja comments are tokenized correctly in all dialects 542 } 543 if klass.HINT_START in klass.KEYWORDS: 544 klass._COMMENTS[klass.HINT_START] = "*/" 545 546 klass._KEYWORD_TRIE = new_trie( 547 key.upper() 548 for key in ( 549 *klass.KEYWORDS, 550 *klass._COMMENTS, 551 *klass._QUOTES, 552 *klass._FORMAT_STRINGS, 553 ) 554 if " " in key or any(single in key for single in klass.SINGLE_TOKENS) 555 ) 556 557 if USE_RS_TOKENIZER: 558 settings = RsTokenizerSettings( 559 white_space={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.WHITE_SPACE.items()}, 560 single_tokens={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.SINGLE_TOKENS.items()}, 561 keywords={k: _TOKEN_TYPE_TO_INDEX[v] for k, v in klass.KEYWORDS.items()}, 562 numeric_literals=klass.NUMERIC_LITERALS, 563 identifiers=klass._IDENTIFIERS, 564 identifier_escapes=klass._IDENTIFIER_ESCAPES, 565 string_escapes=klass._STRING_ESCAPES, 566 quotes=klass._QUOTES, 567 format_strings={ 568 k: (v1, _TOKEN_TYPE_TO_INDEX[v2]) 569 for k, (v1, v2) in klass._FORMAT_STRINGS.items() 570 }, 571 has_bit_strings=bool(klass.BIT_STRINGS), 572 has_hex_strings=bool(klass.HEX_STRINGS), 573 comments=klass._COMMENTS, 574 var_single_tokens=klass.VAR_SINGLE_TOKENS, 575 commands={_TOKEN_TYPE_TO_INDEX[v] for v in klass.COMMANDS}, 576 command_prefix_tokens={ 577 _TOKEN_TYPE_TO_INDEX[v] for v in klass.COMMAND_PREFIX_TOKENS 578 }, 579 heredoc_tag_is_identifier=klass.HEREDOC_TAG_IS_IDENTIFIER, 580 string_escapes_allowed_in_raw_strings=klass.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS, 581 nested_comments=klass.NESTED_COMMENTS, 582 
hint_start=klass.HINT_START, 583 tokens_preceding_hint={ 584 _TOKEN_TYPE_TO_INDEX[v] for v in klass.TOKENS_PRECEDING_HINT 585 }, 586 ) 587 token_types = RsTokenTypeSettings( 588 bit_string=_TOKEN_TYPE_TO_INDEX[TokenType.BIT_STRING], 589 break_=_TOKEN_TYPE_TO_INDEX[TokenType.BREAK], 590 dcolon=_TOKEN_TYPE_TO_INDEX[TokenType.DCOLON], 591 heredoc_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEREDOC_STRING], 592 raw_string=_TOKEN_TYPE_TO_INDEX[TokenType.RAW_STRING], 593 hex_string=_TOKEN_TYPE_TO_INDEX[TokenType.HEX_STRING], 594 identifier=_TOKEN_TYPE_TO_INDEX[TokenType.IDENTIFIER], 595 number=_TOKEN_TYPE_TO_INDEX[TokenType.NUMBER], 596 parameter=_TOKEN_TYPE_TO_INDEX[TokenType.PARAMETER], 597 semicolon=_TOKEN_TYPE_TO_INDEX[TokenType.SEMICOLON], 598 string=_TOKEN_TYPE_TO_INDEX[TokenType.STRING], 599 var=_TOKEN_TYPE_TO_INDEX[TokenType.VAR], 600 heredoc_string_alternative=_TOKEN_TYPE_TO_INDEX[klass.HEREDOC_STRING_ALTERNATIVE], 601 hint=_TOKEN_TYPE_TO_INDEX[TokenType.HINT], 602 ) 603 klass._RS_TOKENIZER = RsTokenizer(settings, token_types) 604 else: 605 klass._RS_TOKENIZER = None 606 607 return klass 608 609 610class Tokenizer(metaclass=_Tokenizer): 611 SINGLE_TOKENS = { 612 "(": TokenType.L_PAREN, 613 ")": TokenType.R_PAREN, 614 "[": TokenType.L_BRACKET, 615 "]": TokenType.R_BRACKET, 616 "{": TokenType.L_BRACE, 617 "}": TokenType.R_BRACE, 618 "&": TokenType.AMP, 619 "^": TokenType.CARET, 620 ":": TokenType.COLON, 621 ",": TokenType.COMMA, 622 ".": TokenType.DOT, 623 "-": TokenType.DASH, 624 "=": TokenType.EQ, 625 ">": TokenType.GT, 626 "<": TokenType.LT, 627 "%": TokenType.MOD, 628 "!": TokenType.NOT, 629 "|": TokenType.PIPE, 630 "+": TokenType.PLUS, 631 ";": TokenType.SEMICOLON, 632 "/": TokenType.SLASH, 633 "\\": TokenType.BACKSLASH, 634 "*": TokenType.STAR, 635 "~": TokenType.TILDA, 636 "?": TokenType.PLACEHOLDER, 637 "@": TokenType.PARAMETER, 638 "#": TokenType.HASH, 639 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 640 "'": TokenType.UNKNOWN, 641 "`": TokenType.UNKNOWN, 642 '"': TokenType.UNKNOWN, 643 } 644 645 BIT_STRINGS: t.List[str | t.Tuple[str, str]] = [] 646 BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = [] 647 HEX_STRINGS: t.List[str | t.Tuple[str, str]] = [] 648 RAW_STRINGS: t.List[str | t.Tuple[str, str]] = [] 649 HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = [] 650 UNICODE_STRINGS: t.List[str | t.Tuple[str, str]] = [] 651 IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"'] 652 QUOTES: t.List[t.Tuple[str, str] | str] = ["'"] 653 STRING_ESCAPES = ["'"] 654 VAR_SINGLE_TOKENS: t.Set[str] = set() 655 656 # The strings in this list can always be used as escapes, regardless of the surrounding 657 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 658 # identifier escape, e.g. 
if we use double-quotes, then they also act as escapes: "x""" 659 IDENTIFIER_ESCAPES: t.List[str] = [] 660 661 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 662 HEREDOC_TAG_IS_IDENTIFIER = False 663 664 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 665 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 666 667 # Whether string escape characters function as such when placed within raw strings 668 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 669 670 NESTED_COMMENTS = True 671 672 HINT_START = "/*+" 673 674 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 675 676 # Autofilled 677 _COMMENTS: t.Dict[str, str] = {} 678 _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {} 679 _IDENTIFIERS: t.Dict[str, str] = {} 680 _IDENTIFIER_ESCAPES: t.Set[str] = set() 681 _QUOTES: t.Dict[str, str] = {} 682 _STRING_ESCAPES: t.Set[str] = set() 683 _KEYWORD_TRIE: t.Dict = {} 684 _RS_TOKENIZER: t.Optional[t.Any] = None 685 686 KEYWORDS: t.Dict[str, TokenType] = { 687 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 688 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 689 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 690 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 691 HINT_START: TokenType.HINT, 692 "==": TokenType.EQ, 693 "::": TokenType.DCOLON, 694 "||": TokenType.DPIPE, 695 "|>": TokenType.PIPE_GT, 696 ">=": TokenType.GTE, 697 "<=": TokenType.LTE, 698 "<>": TokenType.NEQ, 699 "!=": TokenType.NEQ, 700 ":=": TokenType.COLON_EQ, 701 "<=>": TokenType.NULLSAFE_EQ, 702 "->": TokenType.ARROW, 703 "->>": TokenType.DARROW, 704 "=>": TokenType.FARROW, 705 "#>": TokenType.HASH_ARROW, 706 "#>>": TokenType.DHASH_ARROW, 707 "<->": TokenType.LR_ARROW, 708 "&&": TokenType.DAMP, 709 "??": TokenType.DQMARK, 710 "~~~": TokenType.GLOB, 711 "~~": TokenType.LIKE, 712 "~~*": TokenType.ILIKE, 713 "~*": TokenType.IRLIKE, 714 "ALL": TokenType.ALL, 715 "AND": TokenType.AND, 716 "ANTI": TokenType.ANTI, 717 "ANY": TokenType.ANY, 718 "ASC": TokenType.ASC, 719 "AS": TokenType.ALIAS, 720 "ASOF": TokenType.ASOF, 721 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 722 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 723 "BEGIN": TokenType.BEGIN, 724 "BETWEEN": TokenType.BETWEEN, 725 "CACHE": TokenType.CACHE, 726 "UNCACHE": TokenType.UNCACHE, 727 "CASE": TokenType.CASE, 728 "CHARACTER SET": TokenType.CHARACTER_SET, 729 "CLUSTER BY": TokenType.CLUSTER_BY, 730 "COLLATE": TokenType.COLLATE, 731 "COLUMN": TokenType.COLUMN, 732 "COMMIT": TokenType.COMMIT, 733 "CONNECT BY": TokenType.CONNECT_BY, 734 "CONSTRAINT": TokenType.CONSTRAINT, 735 "COPY": TokenType.COPY, 736 "CREATE": TokenType.CREATE, 737 "CROSS": TokenType.CROSS, 738 "CUBE": TokenType.CUBE, 739 "CURRENT_DATE": TokenType.CURRENT_DATE, 740 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 741 "CURRENT_TIME": TokenType.CURRENT_TIME, 742 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 743 "CURRENT_USER": TokenType.CURRENT_USER, 744 "DATABASE": TokenType.DATABASE, 745 "DEFAULT": TokenType.DEFAULT, 746 "DELETE": TokenType.DELETE, 747 "DESC": TokenType.DESC, 748 "DESCRIBE": TokenType.DESCRIBE, 749 "DISTINCT": TokenType.DISTINCT, 750 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 751 "DIV": TokenType.DIV, 752 "DROP": TokenType.DROP, 753 "ELSE": TokenType.ELSE, 754 "END": TokenType.END, 755 "ENUM": TokenType.ENUM, 756 "ESCAPE": TokenType.ESCAPE, 757 "EXCEPT": TokenType.EXCEPT, 758 "EXECUTE": 
TokenType.EXECUTE, 759 "EXISTS": TokenType.EXISTS, 760 "FALSE": TokenType.FALSE, 761 "FETCH": TokenType.FETCH, 762 "FILTER": TokenType.FILTER, 763 "FIRST": TokenType.FIRST, 764 "FULL": TokenType.FULL, 765 "FUNCTION": TokenType.FUNCTION, 766 "FOR": TokenType.FOR, 767 "FOREIGN KEY": TokenType.FOREIGN_KEY, 768 "FORMAT": TokenType.FORMAT, 769 "FROM": TokenType.FROM, 770 "GEOGRAPHY": TokenType.GEOGRAPHY, 771 "GEOMETRY": TokenType.GEOMETRY, 772 "GLOB": TokenType.GLOB, 773 "GROUP BY": TokenType.GROUP_BY, 774 "GROUPING SETS": TokenType.GROUPING_SETS, 775 "HAVING": TokenType.HAVING, 776 "ILIKE": TokenType.ILIKE, 777 "IN": TokenType.IN, 778 "INDEX": TokenType.INDEX, 779 "INET": TokenType.INET, 780 "INNER": TokenType.INNER, 781 "INSERT": TokenType.INSERT, 782 "INTERVAL": TokenType.INTERVAL, 783 "INTERSECT": TokenType.INTERSECT, 784 "INTO": TokenType.INTO, 785 "IS": TokenType.IS, 786 "ISNULL": TokenType.ISNULL, 787 "JOIN": TokenType.JOIN, 788 "KEEP": TokenType.KEEP, 789 "KILL": TokenType.KILL, 790 "LATERAL": TokenType.LATERAL, 791 "LEFT": TokenType.LEFT, 792 "LIKE": TokenType.LIKE, 793 "LIMIT": TokenType.LIMIT, 794 "LOAD": TokenType.LOAD, 795 "LOCK": TokenType.LOCK, 796 "MERGE": TokenType.MERGE, 797 "NAMESPACE": TokenType.NAMESPACE, 798 "NATURAL": TokenType.NATURAL, 799 "NEXT": TokenType.NEXT, 800 "NOT": TokenType.NOT, 801 "NOTNULL": TokenType.NOTNULL, 802 "NULL": TokenType.NULL, 803 "OBJECT": TokenType.OBJECT, 804 "OFFSET": TokenType.OFFSET, 805 "ON": TokenType.ON, 806 "OR": TokenType.OR, 807 "XOR": TokenType.XOR, 808 "ORDER BY": TokenType.ORDER_BY, 809 "ORDINALITY": TokenType.ORDINALITY, 810 "OUTER": TokenType.OUTER, 811 "OVER": TokenType.OVER, 812 "OVERLAPS": TokenType.OVERLAPS, 813 "OVERWRITE": TokenType.OVERWRITE, 814 "PARTITION": TokenType.PARTITION, 815 "PARTITION BY": TokenType.PARTITION_BY, 816 "PARTITIONED BY": TokenType.PARTITION_BY, 817 "PARTITIONED_BY": TokenType.PARTITION_BY, 818 "PERCENT": TokenType.PERCENT, 819 "PIVOT": TokenType.PIVOT, 820 "PRAGMA": TokenType.PRAGMA, 821 "PRIMARY KEY": TokenType.PRIMARY_KEY, 822 "PROCEDURE": TokenType.PROCEDURE, 823 "QUALIFY": TokenType.QUALIFY, 824 "RANGE": TokenType.RANGE, 825 "RECURSIVE": TokenType.RECURSIVE, 826 "REGEXP": TokenType.RLIKE, 827 "RENAME": TokenType.RENAME, 828 "REPLACE": TokenType.REPLACE, 829 "RETURNING": TokenType.RETURNING, 830 "REFERENCES": TokenType.REFERENCES, 831 "RIGHT": TokenType.RIGHT, 832 "RLIKE": TokenType.RLIKE, 833 "ROLLBACK": TokenType.ROLLBACK, 834 "ROLLUP": TokenType.ROLLUP, 835 "ROW": TokenType.ROW, 836 "ROWS": TokenType.ROWS, 837 "SCHEMA": TokenType.SCHEMA, 838 "SELECT": TokenType.SELECT, 839 "SEMI": TokenType.SEMI, 840 "SESSION": TokenType.SESSION, 841 "SET": TokenType.SET, 842 "SETTINGS": TokenType.SETTINGS, 843 "SHOW": TokenType.SHOW, 844 "SIMILAR TO": TokenType.SIMILAR_TO, 845 "SOME": TokenType.SOME, 846 "SORT BY": TokenType.SORT_BY, 847 "START WITH": TokenType.START_WITH, 848 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 849 "TABLE": TokenType.TABLE, 850 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 851 "TEMP": TokenType.TEMPORARY, 852 "TEMPORARY": TokenType.TEMPORARY, 853 "THEN": TokenType.THEN, 854 "TRUE": TokenType.TRUE, 855 "TRUNCATE": TokenType.TRUNCATE, 856 "UNION": TokenType.UNION, 857 "UNKNOWN": TokenType.UNKNOWN, 858 "UNNEST": TokenType.UNNEST, 859 "UNPIVOT": TokenType.UNPIVOT, 860 "UPDATE": TokenType.UPDATE, 861 "USE": TokenType.USE, 862 "USING": TokenType.USING, 863 "UUID": TokenType.UUID, 864 "VALUES": TokenType.VALUES, 865 "VIEW": TokenType.VIEW, 866 "VOLATILE": TokenType.VOLATILE, 867 "WHEN": 
TokenType.WHEN, 868 "WHERE": TokenType.WHERE, 869 "WINDOW": TokenType.WINDOW, 870 "WITH": TokenType.WITH, 871 "APPLY": TokenType.APPLY, 872 "ARRAY": TokenType.ARRAY, 873 "BIT": TokenType.BIT, 874 "BOOL": TokenType.BOOLEAN, 875 "BOOLEAN": TokenType.BOOLEAN, 876 "BYTE": TokenType.TINYINT, 877 "MEDIUMINT": TokenType.MEDIUMINT, 878 "INT1": TokenType.TINYINT, 879 "TINYINT": TokenType.TINYINT, 880 "INT16": TokenType.SMALLINT, 881 "SHORT": TokenType.SMALLINT, 882 "SMALLINT": TokenType.SMALLINT, 883 "HUGEINT": TokenType.INT128, 884 "UHUGEINT": TokenType.UINT128, 885 "INT2": TokenType.SMALLINT, 886 "INTEGER": TokenType.INT, 887 "INT": TokenType.INT, 888 "INT4": TokenType.INT, 889 "INT32": TokenType.INT, 890 "INT64": TokenType.BIGINT, 891 "INT128": TokenType.INT128, 892 "INT256": TokenType.INT256, 893 "LONG": TokenType.BIGINT, 894 "BIGINT": TokenType.BIGINT, 895 "INT8": TokenType.TINYINT, 896 "UINT": TokenType.UINT, 897 "UINT128": TokenType.UINT128, 898 "UINT256": TokenType.UINT256, 899 "DEC": TokenType.DECIMAL, 900 "DECIMAL": TokenType.DECIMAL, 901 "DECIMAL32": TokenType.DECIMAL32, 902 "DECIMAL64": TokenType.DECIMAL64, 903 "DECIMAL128": TokenType.DECIMAL128, 904 "DECIMAL256": TokenType.DECIMAL256, 905 "BIGDECIMAL": TokenType.BIGDECIMAL, 906 "BIGNUMERIC": TokenType.BIGDECIMAL, 907 "LIST": TokenType.LIST, 908 "MAP": TokenType.MAP, 909 "NULLABLE": TokenType.NULLABLE, 910 "NUMBER": TokenType.DECIMAL, 911 "NUMERIC": TokenType.DECIMAL, 912 "FIXED": TokenType.DECIMAL, 913 "REAL": TokenType.FLOAT, 914 "FLOAT": TokenType.FLOAT, 915 "FLOAT4": TokenType.FLOAT, 916 "FLOAT8": TokenType.DOUBLE, 917 "DOUBLE": TokenType.DOUBLE, 918 "DOUBLE PRECISION": TokenType.DOUBLE, 919 "JSON": TokenType.JSON, 920 "JSONB": TokenType.JSONB, 921 "CHAR": TokenType.CHAR, 922 "CHARACTER": TokenType.CHAR, 923 "CHAR VARYING": TokenType.VARCHAR, 924 "CHARACTER VARYING": TokenType.VARCHAR, 925 "NCHAR": TokenType.NCHAR, 926 "VARCHAR": TokenType.VARCHAR, 927 "VARCHAR2": TokenType.VARCHAR, 928 "NVARCHAR": TokenType.NVARCHAR, 929 "NVARCHAR2": TokenType.NVARCHAR, 930 "BPCHAR": TokenType.BPCHAR, 931 "STR": TokenType.TEXT, 932 "STRING": TokenType.TEXT, 933 "TEXT": TokenType.TEXT, 934 "LONGTEXT": TokenType.LONGTEXT, 935 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 936 "TINYTEXT": TokenType.TINYTEXT, 937 "CLOB": TokenType.TEXT, 938 "LONGVARCHAR": TokenType.TEXT, 939 "BINARY": TokenType.BINARY, 940 "BLOB": TokenType.VARBINARY, 941 "LONGBLOB": TokenType.LONGBLOB, 942 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 943 "TINYBLOB": TokenType.TINYBLOB, 944 "BYTEA": TokenType.VARBINARY, 945 "VARBINARY": TokenType.VARBINARY, 946 "TIME": TokenType.TIME, 947 "TIMETZ": TokenType.TIMETZ, 948 "TIMESTAMP": TokenType.TIMESTAMP, 949 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 950 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 951 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 952 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 953 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 954 "DATE": TokenType.DATE, 955 "DATETIME": TokenType.DATETIME, 956 "INT4RANGE": TokenType.INT4RANGE, 957 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 958 "INT8RANGE": TokenType.INT8RANGE, 959 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 960 "NUMRANGE": TokenType.NUMRANGE, 961 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 962 "TSRANGE": TokenType.TSRANGE, 963 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 964 "TSTZRANGE": TokenType.TSTZRANGE, 965 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 966 "DATERANGE": TokenType.DATERANGE, 967 "DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 968 "UNIQUE": TokenType.UNIQUE, 969 "VECTOR": TokenType.VECTOR, 970 "STRUCT": 
TokenType.STRUCT, 971 "SEQUENCE": TokenType.SEQUENCE, 972 "VARIANT": TokenType.VARIANT, 973 "ALTER": TokenType.ALTER, 974 "ANALYZE": TokenType.ANALYZE, 975 "CALL": TokenType.COMMAND, 976 "COMMENT": TokenType.COMMENT, 977 "EXPLAIN": TokenType.COMMAND, 978 "GRANT": TokenType.GRANT, 979 "REVOKE": TokenType.REVOKE, 980 "OPTIMIZE": TokenType.COMMAND, 981 "PREPARE": TokenType.COMMAND, 982 "VACUUM": TokenType.COMMAND, 983 "USER-DEFINED": TokenType.USERDEFINED, 984 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 985 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 986 } 987 988 WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = { 989 " ": TokenType.SPACE, 990 "\t": TokenType.SPACE, 991 "\n": TokenType.BREAK, 992 "\r": TokenType.BREAK, 993 } 994 995 COMMANDS = { 996 TokenType.COMMAND, 997 TokenType.EXECUTE, 998 TokenType.FETCH, 999 TokenType.SHOW, 1000 TokenType.RENAME, 1001 } 1002 1003 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 1004 1005 # Handle numeric literals like in hive (3L = BIGINT) 1006 NUMERIC_LITERALS: t.Dict[str, str] = {} 1007 1008 COMMENTS = ["--", ("/*", "*/")] 1009 1010 __slots__ = ( 1011 "sql", 1012 "size", 1013 "tokens", 1014 "dialect", 1015 "use_rs_tokenizer", 1016 "_start", 1017 "_current", 1018 "_line", 1019 "_col", 1020 "_comments", 1021 "_char", 1022 "_end", 1023 "_peek", 1024 "_prev_token_line", 1025 "_rs_dialect_settings", 1026 ) 1027 1028 def __init__( 1029 self, 1030 dialect: DialectType = None, 1031 use_rs_tokenizer: t.Optional[bool] = None, 1032 **opts: t.Any, 1033 ) -> None: 1034 from sqlglot.dialects import Dialect 1035 1036 self.dialect = Dialect.get_or_raise(dialect) 1037 1038 # initialize `use_rs_tokenizer`, and allow it to be overwritten per Tokenizer instance 1039 self.use_rs_tokenizer = ( 1040 use_rs_tokenizer if use_rs_tokenizer is not None else USE_RS_TOKENIZER 1041 ) 1042 1043 if self.use_rs_tokenizer: 1044 self._rs_dialect_settings = RsTokenizerDialectSettings( 1045 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 1046 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 1047 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 1048 ) 1049 1050 self.reset() 1051 1052 def reset(self) -> None: 1053 self.sql = "" 1054 self.size = 0 1055 self.tokens: t.List[Token] = [] 1056 self._start = 0 1057 self._current = 0 1058 self._line = 1 1059 self._col = 0 1060 self._comments: t.List[str] = [] 1061 1062 self._char = "" 1063 self._end = False 1064 self._peek = "" 1065 self._prev_token_line = -1 1066 1067 def tokenize(self, sql: str) -> t.List[Token]: 1068 """Returns a list of tokens corresponding to the SQL string `sql`.""" 1069 if self.use_rs_tokenizer: 1070 return self.tokenize_rs(sql) 1071 1072 self.reset() 1073 self.sql = sql 1074 self.size = len(sql) 1075 1076 try: 1077 self._scan() 1078 except Exception as e: 1079 start = max(self._current - 50, 0) 1080 end = min(self._current + 50, self.size - 1) 1081 context = self.sql[start:end] 1082 raise TokenError(f"Error tokenizing '{context}'") from e 1083 1084 return self.tokens 1085 1086 def _scan(self, until: t.Optional[t.Callable] = None) -> None: 1087 while self.size and not self._end: 1088 current = self._current 1089 1090 # Skip spaces here rather than iteratively calling advance() for performance reasons 1091 while current < self.size: 1092 char = self.sql[current] 1093 1094 if char.isspace() and (char == " " or char == "\t"): 1095 current += 1 1096 else: 1097 break 1098 1099 offset = current - self._current if current > self._current 
else 1 1100 1101 self._start = current 1102 self._advance(offset) 1103 1104 if not self._char.isspace(): 1105 if self._char.isdigit(): 1106 self._scan_number() 1107 elif self._char in self._IDENTIFIERS: 1108 self._scan_identifier(self._IDENTIFIERS[self._char]) 1109 else: 1110 self._scan_keywords() 1111 1112 if until and until(): 1113 break 1114 1115 if self.tokens and self._comments: 1116 self.tokens[-1].comments.extend(self._comments) 1117 1118 def _chars(self, size: int) -> str: 1119 if size == 1: 1120 return self._char 1121 1122 start = self._current - 1 1123 end = start + size 1124 1125 return self.sql[start:end] if end <= self.size else "" 1126 1127 def _advance(self, i: int = 1, alnum: bool = False) -> None: 1128 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: 1129 # Ensures we don't count an extra line if we get a \r\n line break sequence 1130 if not (self._char == "\r" and self._peek == "\n"): 1131 self._col = i 1132 self._line += 1 1133 else: 1134 self._col += i 1135 1136 self._current += i 1137 self._end = self._current >= self.size 1138 self._char = self.sql[self._current - 1] 1139 self._peek = "" if self._end else self.sql[self._current] 1140 1141 if alnum and self._char.isalnum(): 1142 # Here we use local variables instead of attributes for better performance 1143 _col = self._col 1144 _current = self._current 1145 _end = self._end 1146 _peek = self._peek 1147 1148 while _peek.isalnum(): 1149 _col += 1 1150 _current += 1 1151 _end = _current >= self.size 1152 _peek = "" if _end else self.sql[_current] 1153 1154 self._col = _col 1155 self._current = _current 1156 self._end = _end 1157 self._peek = _peek 1158 self._char = self.sql[_current - 1] 1159 1160 @property 1161 def _text(self) -> str: 1162 return self.sql[self._start : self._current] 1163 1164 def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None: 1165 self._prev_token_line = self._line 1166 1167 if self._comments and token_type == TokenType.SEMICOLON and self.tokens: 1168 self.tokens[-1].comments.extend(self._comments) 1169 self._comments = [] 1170 1171 self.tokens.append( 1172 Token( 1173 token_type, 1174 text=self._text if text is None else text, 1175 line=self._line, 1176 col=self._col, 1177 start=self._start, 1178 end=self._current - 1, 1179 comments=self._comments, 1180 ) 1181 ) 1182 self._comments = [] 1183 1184 # If we have either a semicolon or a begin token before the command's token, we'll parse 1185 # whatever follows the command's token as a string 1186 if ( 1187 token_type in self.COMMANDS 1188 and self._peek != ";" 1189 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS) 1190 ): 1191 start = self._current 1192 tokens = len(self.tokens) 1193 self._scan(lambda: self._peek == ";") 1194 self.tokens = self.tokens[:tokens] 1195 text = self.sql[start : self._current].strip() 1196 if text: 1197 self._add(TokenType.STRING, text) 1198 1199 def _scan_keywords(self) -> None: 1200 size = 0 1201 word = None 1202 chars = self._text 1203 char = chars 1204 prev_space = False 1205 skip = False 1206 trie = self._KEYWORD_TRIE 1207 single_token = char in self.SINGLE_TOKENS 1208 1209 while chars: 1210 if skip: 1211 result = TrieResult.PREFIX 1212 else: 1213 result, trie = in_trie(trie, char.upper()) 1214 1215 if result == TrieResult.FAILED: 1216 break 1217 if result == TrieResult.EXISTS: 1218 word = chars 1219 1220 end = self._current + size 1221 size += 1 1222 1223 if end < self.size: 1224 char = self.sql[end] 1225 single_token = single_token or char in 
self.SINGLE_TOKENS 1226 is_space = char.isspace() 1227 1228 if not is_space or not prev_space: 1229 if is_space: 1230 char = " " 1231 chars += char 1232 prev_space = is_space 1233 skip = False 1234 else: 1235 skip = True 1236 else: 1237 char = "" 1238 break 1239 1240 if word: 1241 if self._scan_string(word): 1242 return 1243 if self._scan_comment(word): 1244 return 1245 if prev_space or single_token or not char: 1246 self._advance(size - 1) 1247 word = word.upper() 1248 self._add(self.KEYWORDS[word], text=word) 1249 return 1250 1251 if self._char in self.SINGLE_TOKENS: 1252 self._add(self.SINGLE_TOKENS[self._char], text=self._char) 1253 return 1254 1255 self._scan_var() 1256 1257 def _scan_comment(self, comment_start: str) -> bool: 1258 if comment_start not in self._COMMENTS: 1259 return False 1260 1261 comment_start_line = self._line 1262 comment_start_size = len(comment_start) 1263 comment_end = self._COMMENTS[comment_start] 1264 1265 if comment_end: 1266 # Skip the comment's start delimiter 1267 self._advance(comment_start_size) 1268 1269 comment_count = 1 1270 comment_end_size = len(comment_end) 1271 1272 while not self._end: 1273 if self._chars(comment_end_size) == comment_end: 1274 comment_count -= 1 1275 if not comment_count: 1276 break 1277 1278 self._advance(alnum=True) 1279 1280 # Nested comments are allowed by some dialects, e.g. databricks, duckdb, postgres 1281 if ( 1282 self.NESTED_COMMENTS 1283 and not self._end 1284 and self._chars(comment_end_size) == comment_start 1285 ): 1286 self._advance(comment_start_size) 1287 comment_count += 1 1288 1289 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 1290 self._advance(comment_end_size - 1) 1291 else: 1292 while not self._end and self.WHITE_SPACE.get(self._peek) is not TokenType.BREAK: 1293 self._advance(alnum=True) 1294 self._comments.append(self._text[comment_start_size:]) 1295 1296 if ( 1297 comment_start == self.HINT_START 1298 and self.tokens 1299 and self.tokens[-1].token_type in self.TOKENS_PRECEDING_HINT 1300 ): 1301 self._add(TokenType.HINT) 1302 1303 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 1304 # Multiple consecutive comments are preserved by appending them to the current comments list. 1305 if comment_start_line == self._prev_token_line: 1306 self.tokens[-1].comments.extend(self._comments) 1307 self._comments = [] 1308 self._prev_token_line = self._line 1309 1310 return True 1311 1312 def _scan_number(self) -> None: 1313 if self._char == "0": 1314 peek = self._peek.upper() 1315 if peek == "B": 1316 return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER) 1317 elif peek == "X": 1318 return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER) 1319 1320 decimal = False 1321 scientific = 0 1322 1323 while True: 1324 if self._peek.isdigit(): 1325 self._advance() 1326 elif self._peek == "." 
and not decimal: 1327 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER: 1328 return self._add(TokenType.NUMBER) 1329 decimal = True 1330 self._advance() 1331 elif self._peek in ("-", "+") and scientific == 1: 1332 scientific += 1 1333 self._advance() 1334 elif self._peek.upper() == "E" and not scientific: 1335 scientific += 1 1336 self._advance() 1337 elif self._peek.isidentifier(): 1338 number_text = self._text 1339 literal = "" 1340 1341 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: 1342 literal += self._peek 1343 self._advance() 1344 1345 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), "")) 1346 1347 if token_type: 1348 self._add(TokenType.NUMBER, number_text) 1349 self._add(TokenType.DCOLON, "::") 1350 return self._add(token_type, literal) 1351 else: 1352 replaced = literal.replace("_", "") 1353 if self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED and replaced.isdigit(): 1354 return self._add(TokenType.NUMBER, number_text + replaced) 1355 if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT: 1356 return self._add(TokenType.VAR) 1357 1358 self._advance(-len(literal)) 1359 return self._add(TokenType.NUMBER, number_text) 1360 else: 1361 return self._add(TokenType.NUMBER) 1362 1363 def _scan_bits(self) -> None: 1364 self._advance() 1365 value = self._extract_value() 1366 try: 1367 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 1368 int(value, 2) 1369 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1370 except ValueError: 1371 self._add(TokenType.IDENTIFIER) 1372 1373 def _scan_hex(self) -> None: 1374 self._advance() 1375 value = self._extract_value() 1376 try: 1377 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1378 int(value, 16) 1379 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1380 except ValueError: 1381 self._add(TokenType.IDENTIFIER) 1382 1383 def _extract_value(self) -> str: 1384 while True: 1385 char = self._peek.strip() 1386 if char and char not in self.SINGLE_TOKENS: 1387 self._advance(alnum=True) 1388 else: 1389 break 1390 1391 return self._text 1392 1393 def _scan_string(self, start: str) -> bool: 1394 base = None 1395 token_type = TokenType.STRING 1396 1397 if start in self._QUOTES: 1398 end = self._QUOTES[start] 1399 elif start in self._FORMAT_STRINGS: 1400 end, token_type = self._FORMAT_STRINGS[start] 1401 1402 if token_type == TokenType.HEX_STRING: 1403 base = 16 1404 elif token_type == TokenType.BIT_STRING: 1405 base = 2 1406 elif token_type == TokenType.HEREDOC_STRING: 1407 self._advance() 1408 1409 if self._char == end: 1410 tag = "" 1411 else: 1412 tag = self._extract_string( 1413 end, 1414 raw_string=True, 1415 raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER, 1416 ) 1417 1418 if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()): 1419 if not self._end: 1420 self._advance(-1) 1421 1422 self._advance(-len(tag)) 1423 self._add(self.HEREDOC_STRING_ALTERNATIVE) 1424 return True 1425 1426 end = f"{start}{tag}{end}" 1427 else: 1428 return False 1429 1430 self._advance(len(start)) 1431 text = self._extract_string(end, raw_string=token_type == TokenType.RAW_STRING) 1432 1433 if base and text: 1434 try: 1435 int(text, base) 1436 except Exception: 1437 raise TokenError( 1438 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1439 ) 1440 1441 self._add(token_type, text) 1442 return True 1443 1444 def _scan_identifier(self, identifier_end: str) -> None: 1445 
self._advance() 1446 text = self._extract_string( 1447 identifier_end, escapes=self._IDENTIFIER_ESCAPES | {identifier_end} 1448 ) 1449 self._add(TokenType.IDENTIFIER, text) 1450 1451 def _scan_var(self) -> None: 1452 while True: 1453 char = self._peek.strip() 1454 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): 1455 self._advance(alnum=True) 1456 else: 1457 break 1458 1459 self._add( 1460 TokenType.VAR 1461 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1462 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) 1463 ) 1464 1465 def _extract_string( 1466 self, 1467 delimiter: str, 1468 escapes: t.Optional[t.Set[str]] = None, 1469 raw_string: bool = False, 1470 raise_unmatched: bool = True, 1471 ) -> str: 1472 text = "" 1473 delim_size = len(delimiter) 1474 escapes = self._STRING_ESCAPES if escapes is None else escapes 1475 1476 while True: 1477 if ( 1478 not raw_string 1479 and self.dialect.UNESCAPED_SEQUENCES 1480 and self._peek 1481 and self._char in self.STRING_ESCAPES 1482 ): 1483 unescaped_sequence = self.dialect.UNESCAPED_SEQUENCES.get(self._char + self._peek) 1484 if unescaped_sequence: 1485 self._advance(2) 1486 text += unescaped_sequence 1487 continue 1488 if ( 1489 (self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS or not raw_string) 1490 and self._char in escapes 1491 and (self._peek == delimiter or self._peek in escapes) 1492 and (self._char not in self._QUOTES or self._char == self._peek) 1493 ): 1494 if self._peek == delimiter: 1495 text += self._peek 1496 else: 1497 text += self._char + self._peek 1498 1499 if self._current + 1 < self.size: 1500 self._advance(2) 1501 else: 1502 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1503 else: 1504 if self._chars(delim_size) == delimiter: 1505 if delim_size > 1: 1506 self._advance(delim_size - 1) 1507 break 1508 1509 if self._end: 1510 if not raise_unmatched: 1511 return text + self._char 1512 1513 raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}") 1514 1515 current = self._current - 1 1516 self._advance(alnum=True) 1517 text += self.sql[current : self._current - 1] 1518 1519 return text 1520 1521 def tokenize_rs(self, sql: str) -> t.List[Token]: 1522 if not self._RS_TOKENIZER: 1523 raise SqlglotError("Rust tokenizer is not available") 1524 1525 tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings) 1526 for token in tokens: 1527 token.token_type = _ALL_TOKEN_TYPES[token.token_type_index] 1528 1529 # Setting this here so partial token lists can be inspected even if there is a failure 1530 self.tokens = tokens 1531 1532 if error_msg is not None: 1533 raise TokenError(error_msg) 1534 1535 return tokens
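As a quick orientation to the API defined in the source above, here is a minimal usage sketch (the SQL statement is illustrative): it tokenizes a string with the default dialect settings and inspects the resulting Token objects. By default the Rust tokenizer from sqlglotrs is used when that package is installed and the SQLGLOTRS_TOKENIZER environment variable is unset or set to "1"; pass use_rs_tokenizer=False to force the pure-Python path.

from sqlglot.tokens import Tokenizer, TokenType

# Tokenize with the default (base) dialect settings.
tokenizer = Tokenizer()
tokens = tokenizer.tokenize("SELECT a + 1 AS b FROM t -- trailing comment")

for token in tokens:
    # Every Token records its type, raw text, and position within the input.
    print(token.token_type, repr(token.text), token.line, token.col, token.start, token.end)

# The statement starts with the SELECT keyword.
assert tokens[0].token_type == TokenType.SELECT

# Comments are not emitted as separate tokens; a trailing comment is attached
# to the preceding token's `comments` list.
print(tokens[-1].comments)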
class TokenType(AutoName):
An enumeration of the token types that the tokenizer can emit.
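Because TokenType builds on AutoName, each member's value is simply its own name; a small illustrative check:

from sqlglot.tokens import TokenType

# With AutoName, auto() assigns each member's name as its value.
assert TokenType.SELECT.value == "SELECT"
assert TokenType.L_PAREN.value == "L_PAREN"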
L_PAREN = <TokenType.L_PAREN: 'L_PAREN'>
R_PAREN = <TokenType.R_PAREN: 'R_PAREN'>
L_BRACKET = <TokenType.L_BRACKET: 'L_BRACKET'>
R_BRACKET = <TokenType.R_BRACKET: 'R_BRACKET'>
L_BRACE = <TokenType.L_BRACE: 'L_BRACE'>
R_BRACE = <TokenType.R_BRACE: 'R_BRACE'>
COMMA = <TokenType.COMMA: 'COMMA'>
DOT = <TokenType.DOT: 'DOT'>
DASH = <TokenType.DASH: 'DASH'>
PLUS = <TokenType.PLUS: 'PLUS'>
COLON = <TokenType.COLON: 'COLON'>
DOTCOLON = <TokenType.DOTCOLON: 'DOTCOLON'>
DCOLON = <TokenType.DCOLON: 'DCOLON'>
DCOLONDOLLAR = <TokenType.DCOLONDOLLAR: 'DCOLONDOLLAR'>
DCOLONPERCENT = <TokenType.DCOLONPERCENT: 'DCOLONPERCENT'>
DQMARK = <TokenType.DQMARK: 'DQMARK'>
SEMICOLON = <TokenType.SEMICOLON: 'SEMICOLON'>
STAR = <TokenType.STAR: 'STAR'>
BACKSLASH = <TokenType.BACKSLASH: 'BACKSLASH'>
SLASH = <TokenType.SLASH: 'SLASH'>
LT = <TokenType.LT: 'LT'>
LTE = <TokenType.LTE: 'LTE'>
GT = <TokenType.GT: 'GT'>
GTE = <TokenType.GTE: 'GTE'>
NOT = <TokenType.NOT: 'NOT'>
EQ = <TokenType.EQ: 'EQ'>
NEQ = <TokenType.NEQ: 'NEQ'>
NULLSAFE_EQ = <TokenType.NULLSAFE_EQ: 'NULLSAFE_EQ'>
COLON_EQ = <TokenType.COLON_EQ: 'COLON_EQ'>
COLON_GT = <TokenType.COLON_GT: 'COLON_GT'>
NCOLON_GT = <TokenType.NCOLON_GT: 'NCOLON_GT'>
AND = <TokenType.AND: 'AND'>
OR = <TokenType.OR: 'OR'>
AMP = <TokenType.AMP: 'AMP'>
DPIPE = <TokenType.DPIPE: 'DPIPE'>
PIPE_GT = <TokenType.PIPE_GT: 'PIPE_GT'>
PIPE = <TokenType.PIPE: 'PIPE'>
PIPE_SLASH = <TokenType.PIPE_SLASH: 'PIPE_SLASH'>
DPIPE_SLASH = <TokenType.DPIPE_SLASH: 'DPIPE_SLASH'>
CARET = <TokenType.CARET: 'CARET'>
CARET_AT = <TokenType.CARET_AT: 'CARET_AT'>
TILDA = <TokenType.TILDA: 'TILDA'>
ARROW = <TokenType.ARROW: 'ARROW'>
DARROW = <TokenType.DARROW: 'DARROW'>
FARROW = <TokenType.FARROW: 'FARROW'>
HASH = <TokenType.HASH: 'HASH'>
HASH_ARROW = <TokenType.HASH_ARROW: 'HASH_ARROW'>
DHASH_ARROW = <TokenType.DHASH_ARROW: 'DHASH_ARROW'>
LR_ARROW = <TokenType.LR_ARROW: 'LR_ARROW'>
DAT = <TokenType.DAT: 'DAT'>
LT_AT = <TokenType.LT_AT: 'LT_AT'>
AT_GT = <TokenType.AT_GT: 'AT_GT'>
DOLLAR = <TokenType.DOLLAR: 'DOLLAR'>
PARAMETER = <TokenType.PARAMETER: 'PARAMETER'>
SESSION = <TokenType.SESSION: 'SESSION'>
SESSION_PARAMETER = <TokenType.SESSION_PARAMETER: 'SESSION_PARAMETER'>
DAMP = <TokenType.DAMP: 'DAMP'>
XOR = <TokenType.XOR: 'XOR'>
DSTAR = <TokenType.DSTAR: 'DSTAR'>
URI_START = <TokenType.URI_START: 'URI_START'>
BLOCK_START = <TokenType.BLOCK_START: 'BLOCK_START'>
BLOCK_END = <TokenType.BLOCK_END: 'BLOCK_END'>
SPACE = <TokenType.SPACE: 'SPACE'>
BREAK = <TokenType.BREAK: 'BREAK'>
STRING = <TokenType.STRING: 'STRING'>
NUMBER = <TokenType.NUMBER: 'NUMBER'>
IDENTIFIER = <TokenType.IDENTIFIER: 'IDENTIFIER'>
DATABASE = <TokenType.DATABASE: 'DATABASE'>
COLUMN = <TokenType.COLUMN: 'COLUMN'>
COLUMN_DEF = <TokenType.COLUMN_DEF: 'COLUMN_DEF'>
SCHEMA = <TokenType.SCHEMA: 'SCHEMA'>
TABLE = <TokenType.TABLE: 'TABLE'>
WAREHOUSE = <TokenType.WAREHOUSE: 'WAREHOUSE'>
STAGE = <TokenType.STAGE: 'STAGE'>
STREAMLIT = <TokenType.STREAMLIT: 'STREAMLIT'>
VAR = <TokenType.VAR: 'VAR'>
BIT_STRING = <TokenType.BIT_STRING: 'BIT_STRING'>
HEX_STRING = <TokenType.HEX_STRING: 'HEX_STRING'>
BYTE_STRING = <TokenType.BYTE_STRING: 'BYTE_STRING'>
NATIONAL_STRING = <TokenType.NATIONAL_STRING: 'NATIONAL_STRING'>
RAW_STRING = <TokenType.RAW_STRING: 'RAW_STRING'>
HEREDOC_STRING = <TokenType.HEREDOC_STRING: 'HEREDOC_STRING'>
UNICODE_STRING = <TokenType.UNICODE_STRING: 'UNICODE_STRING'>
BIT = <TokenType.BIT: 'BIT'>
BOOLEAN = <TokenType.BOOLEAN: 'BOOLEAN'>
TINYINT = <TokenType.TINYINT: 'TINYINT'>
UTINYINT = <TokenType.UTINYINT: 'UTINYINT'>
SMALLINT = <TokenType.SMALLINT: 'SMALLINT'>
USMALLINT = <TokenType.USMALLINT: 'USMALLINT'>
MEDIUMINT = <TokenType.MEDIUMINT: 'MEDIUMINT'>
UMEDIUMINT = <TokenType.UMEDIUMINT: 'UMEDIUMINT'>
INT = <TokenType.INT: 'INT'>
UINT = <TokenType.UINT: 'UINT'>
BIGINT = <TokenType.BIGINT: 'BIGINT'>
UBIGINT = <TokenType.UBIGINT: 'UBIGINT'>
INT128 = <TokenType.INT128: 'INT128'>
UINT128 = <TokenType.UINT128: 'UINT128'>
INT256 = <TokenType.INT256: 'INT256'>
UINT256 = <TokenType.UINT256: 'UINT256'>
FLOAT = <TokenType.FLOAT: 'FLOAT'>
DOUBLE = <TokenType.DOUBLE: 'DOUBLE'>
UDOUBLE = <TokenType.UDOUBLE: 'UDOUBLE'>
DECIMAL = <TokenType.DECIMAL: 'DECIMAL'>
DECIMAL32 = <TokenType.DECIMAL32: 'DECIMAL32'>
DECIMAL64 = <TokenType.DECIMAL64: 'DECIMAL64'>
DECIMAL128 = <TokenType.DECIMAL128: 'DECIMAL128'>
DECIMAL256 = <TokenType.DECIMAL256: 'DECIMAL256'>
UDECIMAL = <TokenType.UDECIMAL: 'UDECIMAL'>
BIGDECIMAL = <TokenType.BIGDECIMAL: 'BIGDECIMAL'>
CHAR = <TokenType.CHAR: 'CHAR'>
NCHAR = <TokenType.NCHAR: 'NCHAR'>
VARCHAR = <TokenType.VARCHAR: 'VARCHAR'>
NVARCHAR = <TokenType.NVARCHAR: 'NVARCHAR'>
BPCHAR = <TokenType.BPCHAR: 'BPCHAR'>
TEXT = <TokenType.TEXT: 'TEXT'>
MEDIUMTEXT = <TokenType.MEDIUMTEXT: 'MEDIUMTEXT'>
LONGTEXT = <TokenType.LONGTEXT: 'LONGTEXT'>
BLOB = <TokenType.BLOB: 'BLOB'>
MEDIUMBLOB = <TokenType.MEDIUMBLOB: 'MEDIUMBLOB'>
LONGBLOB = <TokenType.LONGBLOB: 'LONGBLOB'>
TINYBLOB = <TokenType.TINYBLOB: 'TINYBLOB'>
TINYTEXT = <TokenType.TINYTEXT: 'TINYTEXT'>
NAME = <TokenType.NAME: 'NAME'>
BINARY = <TokenType.BINARY: 'BINARY'>
VARBINARY = <TokenType.VARBINARY: 'VARBINARY'>
JSON = <TokenType.JSON: 'JSON'>
JSONB = <TokenType.JSONB: 'JSONB'>
TIME = <TokenType.TIME: 'TIME'>
TIMETZ = <TokenType.TIMETZ: 'TIMETZ'>
TIMESTAMP = <TokenType.TIMESTAMP: 'TIMESTAMP'>
TIMESTAMPTZ = <TokenType.TIMESTAMPTZ: 'TIMESTAMPTZ'>
TIMESTAMPLTZ = <TokenType.TIMESTAMPLTZ: 'TIMESTAMPLTZ'>
TIMESTAMPNTZ = <TokenType.TIMESTAMPNTZ: 'TIMESTAMPNTZ'>
TIMESTAMP_S = <TokenType.TIMESTAMP_S: 'TIMESTAMP_S'>
TIMESTAMP_MS = <TokenType.TIMESTAMP_MS: 'TIMESTAMP_MS'>
TIMESTAMP_NS = <TokenType.TIMESTAMP_NS: 'TIMESTAMP_NS'>
DATETIME = <TokenType.DATETIME: 'DATETIME'>
DATETIME2 = <TokenType.DATETIME2: 'DATETIME2'>
DATETIME64 = <TokenType.DATETIME64: 'DATETIME64'>
SMALLDATETIME = <TokenType.SMALLDATETIME: 'SMALLDATETIME'>
DATE = <TokenType.DATE: 'DATE'>
DATE32 = <TokenType.DATE32: 'DATE32'>
INT4RANGE = <TokenType.INT4RANGE: 'INT4RANGE'>
INT4MULTIRANGE = <TokenType.INT4MULTIRANGE: 'INT4MULTIRANGE'>
INT8RANGE = <TokenType.INT8RANGE: 'INT8RANGE'>
INT8MULTIRANGE = <TokenType.INT8MULTIRANGE: 'INT8MULTIRANGE'>
NUMRANGE = <TokenType.NUMRANGE: 'NUMRANGE'>
NUMMULTIRANGE = <TokenType.NUMMULTIRANGE: 'NUMMULTIRANGE'>
TSRANGE = <TokenType.TSRANGE: 'TSRANGE'>
TSMULTIRANGE = <TokenType.TSMULTIRANGE: 'TSMULTIRANGE'>
TSTZRANGE = <TokenType.TSTZRANGE: 'TSTZRANGE'>
TSTZMULTIRANGE = <TokenType.TSTZMULTIRANGE: 'TSTZMULTIRANGE'>
DATERANGE = <TokenType.DATERANGE: 'DATERANGE'>
DATEMULTIRANGE = <TokenType.DATEMULTIRANGE: 'DATEMULTIRANGE'>
UUID = <TokenType.UUID: 'UUID'>
GEOGRAPHY = <TokenType.GEOGRAPHY: 'GEOGRAPHY'>
GEOGRAPHYPOINT = <TokenType.GEOGRAPHYPOINT: 'GEOGRAPHYPOINT'>
NULLABLE = <TokenType.NULLABLE: 'NULLABLE'>
GEOMETRY = <TokenType.GEOMETRY: 'GEOMETRY'>
POINT = <TokenType.POINT: 'POINT'>
RING = <TokenType.RING: 'RING'>
LINESTRING = <TokenType.LINESTRING: 'LINESTRING'>
MULTILINESTRING = <TokenType.MULTILINESTRING: 'MULTILINESTRING'>
POLYGON = <TokenType.POLYGON: 'POLYGON'>
MULTIPOLYGON = <TokenType.MULTIPOLYGON: 'MULTIPOLYGON'>
HLLSKETCH = <TokenType.HLLSKETCH: 'HLLSKETCH'>
HSTORE = <TokenType.HSTORE: 'HSTORE'>
SUPER = <TokenType.SUPER: 'SUPER'>
SERIAL = <TokenType.SERIAL: 'SERIAL'>
SMALLSERIAL = <TokenType.SMALLSERIAL: 'SMALLSERIAL'>
BIGSERIAL = <TokenType.BIGSERIAL: 'BIGSERIAL'>
XML = <TokenType.XML: 'XML'>
YEAR = <TokenType.YEAR: 'YEAR'>
USERDEFINED = <TokenType.USERDEFINED: 'USERDEFINED'>
MONEY = <TokenType.MONEY: 'MONEY'>
SMALLMONEY = <TokenType.SMALLMONEY: 'SMALLMONEY'>
ROWVERSION = <TokenType.ROWVERSION: 'ROWVERSION'>
IMAGE = <TokenType.IMAGE: 'IMAGE'>
VARIANT = <TokenType.VARIANT: 'VARIANT'>
OBJECT = <TokenType.OBJECT: 'OBJECT'>
INET = <TokenType.INET: 'INET'>
IPADDRESS = <TokenType.IPADDRESS: 'IPADDRESS'>
IPPREFIX = <TokenType.IPPREFIX: 'IPPREFIX'>
IPV4 = <TokenType.IPV4: 'IPV4'>
IPV6 = <TokenType.IPV6: 'IPV6'>
ENUM = <TokenType.ENUM: 'ENUM'>
ENUM8 = <TokenType.ENUM8: 'ENUM8'>
ENUM16 = <TokenType.ENUM16: 'ENUM16'>
FIXEDSTRING =
<TokenType.FIXEDSTRING: 'FIXEDSTRING'>
LOWCARDINALITY =
<TokenType.LOWCARDINALITY: 'LOWCARDINALITY'>
NESTED =
<TokenType.NESTED: 'NESTED'>
AGGREGATEFUNCTION =
<TokenType.AGGREGATEFUNCTION: 'AGGREGATEFUNCTION'>
SIMPLEAGGREGATEFUNCTION =
<TokenType.SIMPLEAGGREGATEFUNCTION: 'SIMPLEAGGREGATEFUNCTION'>
TDIGEST =
<TokenType.TDIGEST: 'TDIGEST'>
UNKNOWN =
<TokenType.UNKNOWN: 'UNKNOWN'>
VECTOR =
<TokenType.VECTOR: 'VECTOR'>
DYNAMIC =
<TokenType.DYNAMIC: 'DYNAMIC'>
VOID =
<TokenType.VOID: 'VOID'>
ALIAS =
<TokenType.ALIAS: 'ALIAS'>
ALTER =
<TokenType.ALTER: 'ALTER'>
ALL =
<TokenType.ALL: 'ALL'>
ANTI =
<TokenType.ANTI: 'ANTI'>
ANY =
<TokenType.ANY: 'ANY'>
APPLY =
<TokenType.APPLY: 'APPLY'>
ARRAY =
<TokenType.ARRAY: 'ARRAY'>
ASC =
<TokenType.ASC: 'ASC'>
ASOF =
<TokenType.ASOF: 'ASOF'>
ATTACH =
<TokenType.ATTACH: 'ATTACH'>
AUTO_INCREMENT =
<TokenType.AUTO_INCREMENT: 'AUTO_INCREMENT'>
BEGIN =
<TokenType.BEGIN: 'BEGIN'>
BETWEEN =
<TokenType.BETWEEN: 'BETWEEN'>
BULK_COLLECT_INTO =
<TokenType.BULK_COLLECT_INTO: 'BULK_COLLECT_INTO'>
CACHE =
<TokenType.CACHE: 'CACHE'>
CASE =
<TokenType.CASE: 'CASE'>
CHARACTER_SET =
<TokenType.CHARACTER_SET: 'CHARACTER_SET'>
CLUSTER_BY =
<TokenType.CLUSTER_BY: 'CLUSTER_BY'>
COLLATE =
<TokenType.COLLATE: 'COLLATE'>
COMMAND =
<TokenType.COMMAND: 'COMMAND'>
COMMENT =
<TokenType.COMMENT: 'COMMENT'>
COMMIT =
<TokenType.COMMIT: 'COMMIT'>
CONNECT_BY =
<TokenType.CONNECT_BY: 'CONNECT_BY'>
CONSTRAINT =
<TokenType.CONSTRAINT: 'CONSTRAINT'>
COPY =
<TokenType.COPY: 'COPY'>
CREATE =
<TokenType.CREATE: 'CREATE'>
CROSS =
<TokenType.CROSS: 'CROSS'>
CUBE =
<TokenType.CUBE: 'CUBE'>
CURRENT_DATE =
<TokenType.CURRENT_DATE: 'CURRENT_DATE'>
CURRENT_DATETIME =
<TokenType.CURRENT_DATETIME: 'CURRENT_DATETIME'>
CURRENT_SCHEMA =
<TokenType.CURRENT_SCHEMA: 'CURRENT_SCHEMA'>
CURRENT_TIME =
<TokenType.CURRENT_TIME: 'CURRENT_TIME'>
CURRENT_TIMESTAMP =
<TokenType.CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'>
CURRENT_USER =
<TokenType.CURRENT_USER: 'CURRENT_USER'>
DECLARE =
<TokenType.DECLARE: 'DECLARE'>
DEFAULT =
<TokenType.DEFAULT: 'DEFAULT'>
DELETE =
<TokenType.DELETE: 'DELETE'>
DESC =
<TokenType.DESC: 'DESC'>
DESCRIBE =
<TokenType.DESCRIBE: 'DESCRIBE'>
DETACH =
<TokenType.DETACH: 'DETACH'>
DICTIONARY =
<TokenType.DICTIONARY: 'DICTIONARY'>
DISTINCT =
<TokenType.DISTINCT: 'DISTINCT'>
DISTRIBUTE_BY =
<TokenType.DISTRIBUTE_BY: 'DISTRIBUTE_BY'>
DIV =
<TokenType.DIV: 'DIV'>
DROP =
<TokenType.DROP: 'DROP'>
ELSE =
<TokenType.ELSE: 'ELSE'>
END =
<TokenType.END: 'END'>
ESCAPE =
<TokenType.ESCAPE: 'ESCAPE'>
EXCEPT =
<TokenType.EXCEPT: 'EXCEPT'>
EXECUTE =
<TokenType.EXECUTE: 'EXECUTE'>
EXISTS =
<TokenType.EXISTS: 'EXISTS'>
FALSE =
<TokenType.FALSE: 'FALSE'>
FETCH =
<TokenType.FETCH: 'FETCH'>
FILE_FORMAT =
<TokenType.FILE_FORMAT: 'FILE_FORMAT'>
FILTER =
<TokenType.FILTER: 'FILTER'>
FINAL =
<TokenType.FINAL: 'FINAL'>
FIRST =
<TokenType.FIRST: 'FIRST'>
FOR =
<TokenType.FOR: 'FOR'>
FORCE =
<TokenType.FORCE: 'FORCE'>
FOREIGN_KEY =
<TokenType.FOREIGN_KEY: 'FOREIGN_KEY'>
FORMAT =
<TokenType.FORMAT: 'FORMAT'>
FROM =
<TokenType.FROM: 'FROM'>
FULL =
<TokenType.FULL: 'FULL'>
FUNCTION =
<TokenType.FUNCTION: 'FUNCTION'>
GET =
<TokenType.GET: 'GET'>
GLOB =
<TokenType.GLOB: 'GLOB'>
GLOBAL =
<TokenType.GLOBAL: 'GLOBAL'>
GRANT =
<TokenType.GRANT: 'GRANT'>
GROUP_BY =
<TokenType.GROUP_BY: 'GROUP_BY'>
GROUPING_SETS =
<TokenType.GROUPING_SETS: 'GROUPING_SETS'>
HAVING =
<TokenType.HAVING: 'HAVING'>
HINT =
<TokenType.HINT: 'HINT'>
IGNORE =
<TokenType.IGNORE: 'IGNORE'>
ILIKE =
<TokenType.ILIKE: 'ILIKE'>
IN =
<TokenType.IN: 'IN'>
INDEX =
<TokenType.INDEX: 'INDEX'>
INNER =
<TokenType.INNER: 'INNER'>
INSERT =
<TokenType.INSERT: 'INSERT'>
INTERSECT =
<TokenType.INTERSECT: 'INTERSECT'>
INTERVAL =
<TokenType.INTERVAL: 'INTERVAL'>
INTO =
<TokenType.INTO: 'INTO'>
INTRODUCER =
<TokenType.INTRODUCER: 'INTRODUCER'>
IRLIKE =
<TokenType.IRLIKE: 'IRLIKE'>
IS =
<TokenType.IS: 'IS'>
ISNULL =
<TokenType.ISNULL: 'ISNULL'>
JOIN =
<TokenType.JOIN: 'JOIN'>
JOIN_MARKER =
<TokenType.JOIN_MARKER: 'JOIN_MARKER'>
KEEP =
<TokenType.KEEP: 'KEEP'>
KEY =
<TokenType.KEY: 'KEY'>
KILL =
<TokenType.KILL: 'KILL'>
LANGUAGE =
<TokenType.LANGUAGE: 'LANGUAGE'>
LATERAL =
<TokenType.LATERAL: 'LATERAL'>
LEFT =
<TokenType.LEFT: 'LEFT'>
LIKE =
<TokenType.LIKE: 'LIKE'>
LIMIT =
<TokenType.LIMIT: 'LIMIT'>
LIST =
<TokenType.LIST: 'LIST'>
LOAD =
<TokenType.LOAD: 'LOAD'>
LOCK =
<TokenType.LOCK: 'LOCK'>
MAP =
<TokenType.MAP: 'MAP'>
MATCH_CONDITION =
<TokenType.MATCH_CONDITION: 'MATCH_CONDITION'>
MATCH_RECOGNIZE =
<TokenType.MATCH_RECOGNIZE: 'MATCH_RECOGNIZE'>
MEMBER_OF =
<TokenType.MEMBER_OF: 'MEMBER_OF'>
MERGE =
<TokenType.MERGE: 'MERGE'>
MOD =
<TokenType.MOD: 'MOD'>
MODEL =
<TokenType.MODEL: 'MODEL'>
NATURAL =
<TokenType.NATURAL: 'NATURAL'>
NEXT =
<TokenType.NEXT: 'NEXT'>
NOTHING =
<TokenType.NOTHING: 'NOTHING'>
NOTNULL =
<TokenType.NOTNULL: 'NOTNULL'>
NULL =
<TokenType.NULL: 'NULL'>
OBJECT_IDENTIFIER =
<TokenType.OBJECT_IDENTIFIER: 'OBJECT_IDENTIFIER'>
OFFSET =
<TokenType.OFFSET: 'OFFSET'>
ON =
<TokenType.ON: 'ON'>
ONLY =
<TokenType.ONLY: 'ONLY'>
OPERATOR =
<TokenType.OPERATOR: 'OPERATOR'>
ORDER_BY =
<TokenType.ORDER_BY: 'ORDER_BY'>
ORDER_SIBLINGS_BY =
<TokenType.ORDER_SIBLINGS_BY: 'ORDER_SIBLINGS_BY'>
ORDERED =
<TokenType.ORDERED: 'ORDERED'>
ORDINALITY =
<TokenType.ORDINALITY: 'ORDINALITY'>
OUTER =
<TokenType.OUTER: 'OUTER'>
OVER =
<TokenType.OVER: 'OVER'>
OVERLAPS =
<TokenType.OVERLAPS: 'OVERLAPS'>
OVERWRITE =
<TokenType.OVERWRITE: 'OVERWRITE'>
PARTITION =
<TokenType.PARTITION: 'PARTITION'>
PARTITION_BY =
<TokenType.PARTITION_BY: 'PARTITION_BY'>
PERCENT =
<TokenType.PERCENT: 'PERCENT'>
PIVOT =
<TokenType.PIVOT: 'PIVOT'>
PLACEHOLDER =
<TokenType.PLACEHOLDER: 'PLACEHOLDER'>
POSITIONAL =
<TokenType.POSITIONAL: 'POSITIONAL'>
PRAGMA =
<TokenType.PRAGMA: 'PRAGMA'>
PREWHERE =
<TokenType.PREWHERE: 'PREWHERE'>
PRIMARY_KEY =
<TokenType.PRIMARY_KEY: 'PRIMARY_KEY'>
PROCEDURE =
<TokenType.PROCEDURE: 'PROCEDURE'>
PROPERTIES =
<TokenType.PROPERTIES: 'PROPERTIES'>
PSEUDO_TYPE =
<TokenType.PSEUDO_TYPE: 'PSEUDO_TYPE'>
PUT =
<TokenType.PUT: 'PUT'>
QUALIFY =
<TokenType.QUALIFY: 'QUALIFY'>
QUOTE =
<TokenType.QUOTE: 'QUOTE'>
RANGE =
<TokenType.RANGE: 'RANGE'>
RECURSIVE =
<TokenType.RECURSIVE: 'RECURSIVE'>
REFRESH =
<TokenType.REFRESH: 'REFRESH'>
RENAME =
<TokenType.RENAME: 'RENAME'>
REPLACE =
<TokenType.REPLACE: 'REPLACE'>
RETURNING =
<TokenType.RETURNING: 'RETURNING'>
REVOKE =
<TokenType.REVOKE: 'REVOKE'>
REFERENCES =
<TokenType.REFERENCES: 'REFERENCES'>
RIGHT =
<TokenType.RIGHT: 'RIGHT'>
RLIKE =
<TokenType.RLIKE: 'RLIKE'>
ROLLBACK =
<TokenType.ROLLBACK: 'ROLLBACK'>
ROLLUP =
<TokenType.ROLLUP: 'ROLLUP'>
ROW =
<TokenType.ROW: 'ROW'>
ROWS =
<TokenType.ROWS: 'ROWS'>
SELECT =
<TokenType.SELECT: 'SELECT'>
SEMI =
<TokenType.SEMI: 'SEMI'>
SEPARATOR =
<TokenType.SEPARATOR: 'SEPARATOR'>
SEQUENCE =
<TokenType.SEQUENCE: 'SEQUENCE'>
SERDE_PROPERTIES =
<TokenType.SERDE_PROPERTIES: 'SERDE_PROPERTIES'>
SET =
<TokenType.SET: 'SET'>
SETTINGS =
<TokenType.SETTINGS: 'SETTINGS'>
SHOW =
<TokenType.SHOW: 'SHOW'>
SIMILAR_TO =
<TokenType.SIMILAR_TO: 'SIMILAR_TO'>
SOME =
<TokenType.SOME: 'SOME'>
SORT_BY =
<TokenType.SORT_BY: 'SORT_BY'>
START_WITH =
<TokenType.START_WITH: 'START_WITH'>
STORAGE_INTEGRATION =
<TokenType.STORAGE_INTEGRATION: 'STORAGE_INTEGRATION'>
STRAIGHT_JOIN =
<TokenType.STRAIGHT_JOIN: 'STRAIGHT_JOIN'>
STRUCT =
<TokenType.STRUCT: 'STRUCT'>
SUMMARIZE =
<TokenType.SUMMARIZE: 'SUMMARIZE'>
TABLE_SAMPLE =
<TokenType.TABLE_SAMPLE: 'TABLE_SAMPLE'>
TAG =
<TokenType.TAG: 'TAG'>
TEMPORARY =
<TokenType.TEMPORARY: 'TEMPORARY'>
TOP =
<TokenType.TOP: 'TOP'>
THEN =
<TokenType.THEN: 'THEN'>
TRUE =
<TokenType.TRUE: 'TRUE'>
TRUNCATE =
<TokenType.TRUNCATE: 'TRUNCATE'>
UNCACHE =
<TokenType.UNCACHE: 'UNCACHE'>
UNION =
<TokenType.UNION: 'UNION'>
UNNEST =
<TokenType.UNNEST: 'UNNEST'>
UNPIVOT =
<TokenType.UNPIVOT: 'UNPIVOT'>
UPDATE =
<TokenType.UPDATE: 'UPDATE'>
USE =
<TokenType.USE: 'USE'>
USING =
<TokenType.USING: 'USING'>
VALUES =
<TokenType.VALUES: 'VALUES'>
VIEW =
<TokenType.VIEW: 'VIEW'>
SEMANTIC_VIEW =
<TokenType.SEMANTIC_VIEW: 'SEMANTIC_VIEW'>
VOLATILE =
<TokenType.VOLATILE: 'VOLATILE'>
WHEN =
<TokenType.WHEN: 'WHEN'>
WHERE =
<TokenType.WHERE: 'WHERE'>
WINDOW =
<TokenType.WINDOW: 'WINDOW'>
WITH =
<TokenType.WITH: 'WITH'>
UNIQUE =
<TokenType.UNIQUE: 'UNIQUE'>
UTC_DATE =
<TokenType.UTC_DATE: 'UTC_DATE'>
UTC_TIME =
<TokenType.UTC_TIME: 'UTC_TIME'>
UTC_TIMESTAMP =
<TokenType.UTC_TIMESTAMP: 'UTC_TIMESTAMP'>
VERSION_SNAPSHOT =
<TokenType.VERSION_SNAPSHOT: 'VERSION_SNAPSHOT'>
TIMESTAMP_SNAPSHOT =
<TokenType.TIMESTAMP_SNAPSHOT: 'TIMESTAMP_SNAPSHOT'>
OPTION =
<TokenType.OPTION: 'OPTION'>
SINK =
<TokenType.SINK: 'SINK'>
SOURCE =
<TokenType.SOURCE: 'SOURCE'>
ANALYZE =
<TokenType.ANALYZE: 'ANALYZE'>
NAMESPACE =
<TokenType.NAMESPACE: 'NAMESPACE'>
EXPORT =
<TokenType.EXPORT: 'EXPORT'>
HIVE_TOKEN_STREAM =
<TokenType.HIVE_TOKEN_STREAM: 'HIVE_TOKEN_STREAM'>
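Since TokenType is an AutoName enum, each member's value is simply its own name, as the reprs above show. A small illustrative check:

    from sqlglot.tokens import TokenType

    # AutoName generates each member's value from its name.
    assert TokenType.SELECT.value == "SELECT"
    assert TokenType.L_PAREN.name == "L_PAREN"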
class Token:
class Token:
    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: t.Optional[t.List[str]] = None,
    ) -> None:
        """Token initializer.

        Args:
            token_type: The TokenType Enum.
            text: The text of the token.
            line: The line that the token ends on.
            col: The column that the token ends on.
            start: The start index of the token.
            end: The ending index of the token.
            comments: The comments to attach to the token.
        """
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = [] if comments is None else comments

    def __repr__(self) -> str:
        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
        return f"<Token {attributes}>"
Token( token_type: TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: Optional[List[str]] = None)
def __init__(
    self,
    token_type: TokenType,
    text: str,
    line: int = 1,
    col: int = 1,
    start: int = 0,
    end: int = 0,
    comments: t.Optional[t.List[str]] = None,
) -> None:
    """Token initializer.

    Args:
        token_type: The TokenType Enum.
        text: The text of the token.
        line: The line that the token ends on.
        col: The column that the token ends on.
        start: The start index of the token.
        end: The ending index of the token.
        comments: The comments to attach to the token.
    """
    self.token_type = token_type
    self.text = text
    self.line = line
    self.col = col
    self.start = start
    self.end = end
    self.comments = [] if comments is None else comments
Token initializer.
Arguments:
- token_type: The TokenType Enum.
- text: The text of the token.
- line: The line that the token ends on.
- col: The column that the token ends on.
- start: The start index of the token.
- end: The ending index of the token.
- comments: The comments to attach to the token.
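A minimal sketch of constructing a Token directly with the initializer above; the literal values are only illustrative.

    from sqlglot.tokens import Token, TokenType

    # line/col default to 1, start/end to 0, and comments to an empty list.
    tok = Token(TokenType.NUMBER, "42", line=3, col=7, start=10, end=11)
    print(tok.token_type)    # TokenType.NUMBER
    print(repr(tok.text))    # '42'
    print(tok.comments)      # []
    print(tok)               # repr lists every attribute in __slots__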
@classmethod
def number(cls, number: int) -> Token:
    """Returns a NUMBER token with `number` as its text."""
    return cls(TokenType.NUMBER, str(number))
Returns a NUMBER token with number as its text.
@classmethod
def string(cls, string: str) -> Token:
    """Returns a STRING token with `string` as its text."""
    return cls(TokenType.STRING, string)
Returns a STRING token with string as its text.
@classmethod
def identifier(cls, identifier: str) -> Token:
    """Returns an IDENTIFIER token with `identifier` as its text."""
    return cls(TokenType.IDENTIFIER, identifier)
Returns an IDENTIFIER token with identifier as its text.
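The classmethod constructors above are small conveniences over the initializer; an illustrative sketch:

    from sqlglot.tokens import Token, TokenType

    num = Token.number(7)              # NUMBER token with text "7"
    s = Token.string("hello")          # STRING token
    ident = Token.identifier("col_a")  # IDENTIFIER token
    var = Token.var("x")               # VAR token

    assert num.token_type is TokenType.NUMBER and num.text == "7"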
class Tokenizer:
611class Tokenizer(metaclass=_Tokenizer): 612 SINGLE_TOKENS = { 613 "(": TokenType.L_PAREN, 614 ")": TokenType.R_PAREN, 615 "[": TokenType.L_BRACKET, 616 "]": TokenType.R_BRACKET, 617 "{": TokenType.L_BRACE, 618 "}": TokenType.R_BRACE, 619 "&": TokenType.AMP, 620 "^": TokenType.CARET, 621 ":": TokenType.COLON, 622 ",": TokenType.COMMA, 623 ".": TokenType.DOT, 624 "-": TokenType.DASH, 625 "=": TokenType.EQ, 626 ">": TokenType.GT, 627 "<": TokenType.LT, 628 "%": TokenType.MOD, 629 "!": TokenType.NOT, 630 "|": TokenType.PIPE, 631 "+": TokenType.PLUS, 632 ";": TokenType.SEMICOLON, 633 "/": TokenType.SLASH, 634 "\\": TokenType.BACKSLASH, 635 "*": TokenType.STAR, 636 "~": TokenType.TILDA, 637 "?": TokenType.PLACEHOLDER, 638 "@": TokenType.PARAMETER, 639 "#": TokenType.HASH, 640 # Used for breaking a var like x'y' but nothing else the token type doesn't matter 641 "'": TokenType.UNKNOWN, 642 "`": TokenType.UNKNOWN, 643 '"': TokenType.UNKNOWN, 644 } 645 646 BIT_STRINGS: t.List[str | t.Tuple[str, str]] = [] 647 BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = [] 648 HEX_STRINGS: t.List[str | t.Tuple[str, str]] = [] 649 RAW_STRINGS: t.List[str | t.Tuple[str, str]] = [] 650 HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = [] 651 UNICODE_STRINGS: t.List[str | t.Tuple[str, str]] = [] 652 IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"'] 653 QUOTES: t.List[t.Tuple[str, str] | str] = ["'"] 654 STRING_ESCAPES = ["'"] 655 VAR_SINGLE_TOKENS: t.Set[str] = set() 656 657 # The strings in this list can always be used as escapes, regardless of the surrounding 658 # identifier delimiters. By default, the closing delimiter is assumed to also act as an 659 # identifier escape, e.g. if we use double-quotes, then they also act as escapes: "x""" 660 IDENTIFIER_ESCAPES: t.List[str] = [] 661 662 # Whether the heredoc tags follow the same lexical rules as unquoted identifiers 663 HEREDOC_TAG_IS_IDENTIFIER = False 664 665 # Token that we'll generate as a fallback if the heredoc prefix doesn't correspond to a heredoc 666 HEREDOC_STRING_ALTERNATIVE = TokenType.VAR 667 668 # Whether string escape characters function as such when placed within raw strings 669 STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS = True 670 671 NESTED_COMMENTS = True 672 673 HINT_START = "/*+" 674 675 TOKENS_PRECEDING_HINT = {TokenType.SELECT, TokenType.INSERT, TokenType.UPDATE, TokenType.DELETE} 676 677 # Autofilled 678 _COMMENTS: t.Dict[str, str] = {} 679 _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {} 680 _IDENTIFIERS: t.Dict[str, str] = {} 681 _IDENTIFIER_ESCAPES: t.Set[str] = set() 682 _QUOTES: t.Dict[str, str] = {} 683 _STRING_ESCAPES: t.Set[str] = set() 684 _KEYWORD_TRIE: t.Dict = {} 685 _RS_TOKENIZER: t.Optional[t.Any] = None 686 687 KEYWORDS: t.Dict[str, TokenType] = { 688 **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")}, 689 **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")}, 690 **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")}, 691 **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")}, 692 HINT_START: TokenType.HINT, 693 "==": TokenType.EQ, 694 "::": TokenType.DCOLON, 695 "||": TokenType.DPIPE, 696 "|>": TokenType.PIPE_GT, 697 ">=": TokenType.GTE, 698 "<=": TokenType.LTE, 699 "<>": TokenType.NEQ, 700 "!=": TokenType.NEQ, 701 ":=": TokenType.COLON_EQ, 702 "<=>": TokenType.NULLSAFE_EQ, 703 "->": TokenType.ARROW, 704 "->>": TokenType.DARROW, 705 "=>": TokenType.FARROW, 706 "#>": TokenType.HASH_ARROW, 707 "#>>": TokenType.DHASH_ARROW, 708 "<->": 
TokenType.LR_ARROW, 709 "&&": TokenType.DAMP, 710 "??": TokenType.DQMARK, 711 "~~~": TokenType.GLOB, 712 "~~": TokenType.LIKE, 713 "~~*": TokenType.ILIKE, 714 "~*": TokenType.IRLIKE, 715 "ALL": TokenType.ALL, 716 "AND": TokenType.AND, 717 "ANTI": TokenType.ANTI, 718 "ANY": TokenType.ANY, 719 "ASC": TokenType.ASC, 720 "AS": TokenType.ALIAS, 721 "ASOF": TokenType.ASOF, 722 "AUTOINCREMENT": TokenType.AUTO_INCREMENT, 723 "AUTO_INCREMENT": TokenType.AUTO_INCREMENT, 724 "BEGIN": TokenType.BEGIN, 725 "BETWEEN": TokenType.BETWEEN, 726 "CACHE": TokenType.CACHE, 727 "UNCACHE": TokenType.UNCACHE, 728 "CASE": TokenType.CASE, 729 "CHARACTER SET": TokenType.CHARACTER_SET, 730 "CLUSTER BY": TokenType.CLUSTER_BY, 731 "COLLATE": TokenType.COLLATE, 732 "COLUMN": TokenType.COLUMN, 733 "COMMIT": TokenType.COMMIT, 734 "CONNECT BY": TokenType.CONNECT_BY, 735 "CONSTRAINT": TokenType.CONSTRAINT, 736 "COPY": TokenType.COPY, 737 "CREATE": TokenType.CREATE, 738 "CROSS": TokenType.CROSS, 739 "CUBE": TokenType.CUBE, 740 "CURRENT_DATE": TokenType.CURRENT_DATE, 741 "CURRENT_SCHEMA": TokenType.CURRENT_SCHEMA, 742 "CURRENT_TIME": TokenType.CURRENT_TIME, 743 "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP, 744 "CURRENT_USER": TokenType.CURRENT_USER, 745 "DATABASE": TokenType.DATABASE, 746 "DEFAULT": TokenType.DEFAULT, 747 "DELETE": TokenType.DELETE, 748 "DESC": TokenType.DESC, 749 "DESCRIBE": TokenType.DESCRIBE, 750 "DISTINCT": TokenType.DISTINCT, 751 "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY, 752 "DIV": TokenType.DIV, 753 "DROP": TokenType.DROP, 754 "ELSE": TokenType.ELSE, 755 "END": TokenType.END, 756 "ENUM": TokenType.ENUM, 757 "ESCAPE": TokenType.ESCAPE, 758 "EXCEPT": TokenType.EXCEPT, 759 "EXECUTE": TokenType.EXECUTE, 760 "EXISTS": TokenType.EXISTS, 761 "FALSE": TokenType.FALSE, 762 "FETCH": TokenType.FETCH, 763 "FILTER": TokenType.FILTER, 764 "FIRST": TokenType.FIRST, 765 "FULL": TokenType.FULL, 766 "FUNCTION": TokenType.FUNCTION, 767 "FOR": TokenType.FOR, 768 "FOREIGN KEY": TokenType.FOREIGN_KEY, 769 "FORMAT": TokenType.FORMAT, 770 "FROM": TokenType.FROM, 771 "GEOGRAPHY": TokenType.GEOGRAPHY, 772 "GEOMETRY": TokenType.GEOMETRY, 773 "GLOB": TokenType.GLOB, 774 "GROUP BY": TokenType.GROUP_BY, 775 "GROUPING SETS": TokenType.GROUPING_SETS, 776 "HAVING": TokenType.HAVING, 777 "ILIKE": TokenType.ILIKE, 778 "IN": TokenType.IN, 779 "INDEX": TokenType.INDEX, 780 "INET": TokenType.INET, 781 "INNER": TokenType.INNER, 782 "INSERT": TokenType.INSERT, 783 "INTERVAL": TokenType.INTERVAL, 784 "INTERSECT": TokenType.INTERSECT, 785 "INTO": TokenType.INTO, 786 "IS": TokenType.IS, 787 "ISNULL": TokenType.ISNULL, 788 "JOIN": TokenType.JOIN, 789 "KEEP": TokenType.KEEP, 790 "KILL": TokenType.KILL, 791 "LATERAL": TokenType.LATERAL, 792 "LEFT": TokenType.LEFT, 793 "LIKE": TokenType.LIKE, 794 "LIMIT": TokenType.LIMIT, 795 "LOAD": TokenType.LOAD, 796 "LOCK": TokenType.LOCK, 797 "MERGE": TokenType.MERGE, 798 "NAMESPACE": TokenType.NAMESPACE, 799 "NATURAL": TokenType.NATURAL, 800 "NEXT": TokenType.NEXT, 801 "NOT": TokenType.NOT, 802 "NOTNULL": TokenType.NOTNULL, 803 "NULL": TokenType.NULL, 804 "OBJECT": TokenType.OBJECT, 805 "OFFSET": TokenType.OFFSET, 806 "ON": TokenType.ON, 807 "OR": TokenType.OR, 808 "XOR": TokenType.XOR, 809 "ORDER BY": TokenType.ORDER_BY, 810 "ORDINALITY": TokenType.ORDINALITY, 811 "OUTER": TokenType.OUTER, 812 "OVER": TokenType.OVER, 813 "OVERLAPS": TokenType.OVERLAPS, 814 "OVERWRITE": TokenType.OVERWRITE, 815 "PARTITION": TokenType.PARTITION, 816 "PARTITION BY": TokenType.PARTITION_BY, 817 "PARTITIONED BY": 
TokenType.PARTITION_BY, 818 "PARTITIONED_BY": TokenType.PARTITION_BY, 819 "PERCENT": TokenType.PERCENT, 820 "PIVOT": TokenType.PIVOT, 821 "PRAGMA": TokenType.PRAGMA, 822 "PRIMARY KEY": TokenType.PRIMARY_KEY, 823 "PROCEDURE": TokenType.PROCEDURE, 824 "QUALIFY": TokenType.QUALIFY, 825 "RANGE": TokenType.RANGE, 826 "RECURSIVE": TokenType.RECURSIVE, 827 "REGEXP": TokenType.RLIKE, 828 "RENAME": TokenType.RENAME, 829 "REPLACE": TokenType.REPLACE, 830 "RETURNING": TokenType.RETURNING, 831 "REFERENCES": TokenType.REFERENCES, 832 "RIGHT": TokenType.RIGHT, 833 "RLIKE": TokenType.RLIKE, 834 "ROLLBACK": TokenType.ROLLBACK, 835 "ROLLUP": TokenType.ROLLUP, 836 "ROW": TokenType.ROW, 837 "ROWS": TokenType.ROWS, 838 "SCHEMA": TokenType.SCHEMA, 839 "SELECT": TokenType.SELECT, 840 "SEMI": TokenType.SEMI, 841 "SESSION": TokenType.SESSION, 842 "SET": TokenType.SET, 843 "SETTINGS": TokenType.SETTINGS, 844 "SHOW": TokenType.SHOW, 845 "SIMILAR TO": TokenType.SIMILAR_TO, 846 "SOME": TokenType.SOME, 847 "SORT BY": TokenType.SORT_BY, 848 "START WITH": TokenType.START_WITH, 849 "STRAIGHT_JOIN": TokenType.STRAIGHT_JOIN, 850 "TABLE": TokenType.TABLE, 851 "TABLESAMPLE": TokenType.TABLE_SAMPLE, 852 "TEMP": TokenType.TEMPORARY, 853 "TEMPORARY": TokenType.TEMPORARY, 854 "THEN": TokenType.THEN, 855 "TRUE": TokenType.TRUE, 856 "TRUNCATE": TokenType.TRUNCATE, 857 "UNION": TokenType.UNION, 858 "UNKNOWN": TokenType.UNKNOWN, 859 "UNNEST": TokenType.UNNEST, 860 "UNPIVOT": TokenType.UNPIVOT, 861 "UPDATE": TokenType.UPDATE, 862 "USE": TokenType.USE, 863 "USING": TokenType.USING, 864 "UUID": TokenType.UUID, 865 "VALUES": TokenType.VALUES, 866 "VIEW": TokenType.VIEW, 867 "VOLATILE": TokenType.VOLATILE, 868 "WHEN": TokenType.WHEN, 869 "WHERE": TokenType.WHERE, 870 "WINDOW": TokenType.WINDOW, 871 "WITH": TokenType.WITH, 872 "APPLY": TokenType.APPLY, 873 "ARRAY": TokenType.ARRAY, 874 "BIT": TokenType.BIT, 875 "BOOL": TokenType.BOOLEAN, 876 "BOOLEAN": TokenType.BOOLEAN, 877 "BYTE": TokenType.TINYINT, 878 "MEDIUMINT": TokenType.MEDIUMINT, 879 "INT1": TokenType.TINYINT, 880 "TINYINT": TokenType.TINYINT, 881 "INT16": TokenType.SMALLINT, 882 "SHORT": TokenType.SMALLINT, 883 "SMALLINT": TokenType.SMALLINT, 884 "HUGEINT": TokenType.INT128, 885 "UHUGEINT": TokenType.UINT128, 886 "INT2": TokenType.SMALLINT, 887 "INTEGER": TokenType.INT, 888 "INT": TokenType.INT, 889 "INT4": TokenType.INT, 890 "INT32": TokenType.INT, 891 "INT64": TokenType.BIGINT, 892 "INT128": TokenType.INT128, 893 "INT256": TokenType.INT256, 894 "LONG": TokenType.BIGINT, 895 "BIGINT": TokenType.BIGINT, 896 "INT8": TokenType.TINYINT, 897 "UINT": TokenType.UINT, 898 "UINT128": TokenType.UINT128, 899 "UINT256": TokenType.UINT256, 900 "DEC": TokenType.DECIMAL, 901 "DECIMAL": TokenType.DECIMAL, 902 "DECIMAL32": TokenType.DECIMAL32, 903 "DECIMAL64": TokenType.DECIMAL64, 904 "DECIMAL128": TokenType.DECIMAL128, 905 "DECIMAL256": TokenType.DECIMAL256, 906 "BIGDECIMAL": TokenType.BIGDECIMAL, 907 "BIGNUMERIC": TokenType.BIGDECIMAL, 908 "LIST": TokenType.LIST, 909 "MAP": TokenType.MAP, 910 "NULLABLE": TokenType.NULLABLE, 911 "NUMBER": TokenType.DECIMAL, 912 "NUMERIC": TokenType.DECIMAL, 913 "FIXED": TokenType.DECIMAL, 914 "REAL": TokenType.FLOAT, 915 "FLOAT": TokenType.FLOAT, 916 "FLOAT4": TokenType.FLOAT, 917 "FLOAT8": TokenType.DOUBLE, 918 "DOUBLE": TokenType.DOUBLE, 919 "DOUBLE PRECISION": TokenType.DOUBLE, 920 "JSON": TokenType.JSON, 921 "JSONB": TokenType.JSONB, 922 "CHAR": TokenType.CHAR, 923 "CHARACTER": TokenType.CHAR, 924 "CHAR VARYING": TokenType.VARCHAR, 925 "CHARACTER VARYING": 
TokenType.VARCHAR, 926 "NCHAR": TokenType.NCHAR, 927 "VARCHAR": TokenType.VARCHAR, 928 "VARCHAR2": TokenType.VARCHAR, 929 "NVARCHAR": TokenType.NVARCHAR, 930 "NVARCHAR2": TokenType.NVARCHAR, 931 "BPCHAR": TokenType.BPCHAR, 932 "STR": TokenType.TEXT, 933 "STRING": TokenType.TEXT, 934 "TEXT": TokenType.TEXT, 935 "LONGTEXT": TokenType.LONGTEXT, 936 "MEDIUMTEXT": TokenType.MEDIUMTEXT, 937 "TINYTEXT": TokenType.TINYTEXT, 938 "CLOB": TokenType.TEXT, 939 "LONGVARCHAR": TokenType.TEXT, 940 "BINARY": TokenType.BINARY, 941 "BLOB": TokenType.VARBINARY, 942 "LONGBLOB": TokenType.LONGBLOB, 943 "MEDIUMBLOB": TokenType.MEDIUMBLOB, 944 "TINYBLOB": TokenType.TINYBLOB, 945 "BYTEA": TokenType.VARBINARY, 946 "VARBINARY": TokenType.VARBINARY, 947 "TIME": TokenType.TIME, 948 "TIMETZ": TokenType.TIMETZ, 949 "TIMESTAMP": TokenType.TIMESTAMP, 950 "TIMESTAMPTZ": TokenType.TIMESTAMPTZ, 951 "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ, 952 "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ, 953 "TIMESTAMPNTZ": TokenType.TIMESTAMPNTZ, 954 "TIMESTAMP_NTZ": TokenType.TIMESTAMPNTZ, 955 "DATE": TokenType.DATE, 956 "DATETIME": TokenType.DATETIME, 957 "INT4RANGE": TokenType.INT4RANGE, 958 "INT4MULTIRANGE": TokenType.INT4MULTIRANGE, 959 "INT8RANGE": TokenType.INT8RANGE, 960 "INT8MULTIRANGE": TokenType.INT8MULTIRANGE, 961 "NUMRANGE": TokenType.NUMRANGE, 962 "NUMMULTIRANGE": TokenType.NUMMULTIRANGE, 963 "TSRANGE": TokenType.TSRANGE, 964 "TSMULTIRANGE": TokenType.TSMULTIRANGE, 965 "TSTZRANGE": TokenType.TSTZRANGE, 966 "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE, 967 "DATERANGE": TokenType.DATERANGE, 968 "DATEMULTIRANGE": TokenType.DATEMULTIRANGE, 969 "UNIQUE": TokenType.UNIQUE, 970 "VECTOR": TokenType.VECTOR, 971 "STRUCT": TokenType.STRUCT, 972 "SEQUENCE": TokenType.SEQUENCE, 973 "VARIANT": TokenType.VARIANT, 974 "ALTER": TokenType.ALTER, 975 "ANALYZE": TokenType.ANALYZE, 976 "CALL": TokenType.COMMAND, 977 "COMMENT": TokenType.COMMENT, 978 "EXPLAIN": TokenType.COMMAND, 979 "GRANT": TokenType.GRANT, 980 "REVOKE": TokenType.REVOKE, 981 "OPTIMIZE": TokenType.COMMAND, 982 "PREPARE": TokenType.COMMAND, 983 "VACUUM": TokenType.COMMAND, 984 "USER-DEFINED": TokenType.USERDEFINED, 985 "FOR VERSION": TokenType.VERSION_SNAPSHOT, 986 "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT, 987 } 988 989 WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = { 990 " ": TokenType.SPACE, 991 "\t": TokenType.SPACE, 992 "\n": TokenType.BREAK, 993 "\r": TokenType.BREAK, 994 } 995 996 COMMANDS = { 997 TokenType.COMMAND, 998 TokenType.EXECUTE, 999 TokenType.FETCH, 1000 TokenType.SHOW, 1001 TokenType.RENAME, 1002 } 1003 1004 COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN} 1005 1006 # Handle numeric literals like in hive (3L = BIGINT) 1007 NUMERIC_LITERALS: t.Dict[str, str] = {} 1008 1009 COMMENTS = ["--", ("/*", "*/")] 1010 1011 __slots__ = ( 1012 "sql", 1013 "size", 1014 "tokens", 1015 "dialect", 1016 "use_rs_tokenizer", 1017 "_start", 1018 "_current", 1019 "_line", 1020 "_col", 1021 "_comments", 1022 "_char", 1023 "_end", 1024 "_peek", 1025 "_prev_token_line", 1026 "_rs_dialect_settings", 1027 ) 1028 1029 def __init__( 1030 self, 1031 dialect: DialectType = None, 1032 use_rs_tokenizer: t.Optional[bool] = None, 1033 **opts: t.Any, 1034 ) -> None: 1035 from sqlglot.dialects import Dialect 1036 1037 self.dialect = Dialect.get_or_raise(dialect) 1038 1039 # initialize `use_rs_tokenizer`, and allow it to be overwritten per Tokenizer instance 1040 self.use_rs_tokenizer = ( 1041 use_rs_tokenizer if use_rs_tokenizer is not None else USE_RS_TOKENIZER 1042 ) 1043 1044 if 
self.use_rs_tokenizer: 1045 self._rs_dialect_settings = RsTokenizerDialectSettings( 1046 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES, 1047 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT, 1048 numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED, 1049 ) 1050 1051 self.reset() 1052 1053 def reset(self) -> None: 1054 self.sql = "" 1055 self.size = 0 1056 self.tokens: t.List[Token] = [] 1057 self._start = 0 1058 self._current = 0 1059 self._line = 1 1060 self._col = 0 1061 self._comments: t.List[str] = [] 1062 1063 self._char = "" 1064 self._end = False 1065 self._peek = "" 1066 self._prev_token_line = -1 1067 1068 def tokenize(self, sql: str) -> t.List[Token]: 1069 """Returns a list of tokens corresponding to the SQL string `sql`.""" 1070 if self.use_rs_tokenizer: 1071 return self.tokenize_rs(sql) 1072 1073 self.reset() 1074 self.sql = sql 1075 self.size = len(sql) 1076 1077 try: 1078 self._scan() 1079 except Exception as e: 1080 start = max(self._current - 50, 0) 1081 end = min(self._current + 50, self.size - 1) 1082 context = self.sql[start:end] 1083 raise TokenError(f"Error tokenizing '{context}'") from e 1084 1085 return self.tokens 1086 1087 def _scan(self, until: t.Optional[t.Callable] = None) -> None: 1088 while self.size and not self._end: 1089 current = self._current 1090 1091 # Skip spaces here rather than iteratively calling advance() for performance reasons 1092 while current < self.size: 1093 char = self.sql[current] 1094 1095 if char.isspace() and (char == " " or char == "\t"): 1096 current += 1 1097 else: 1098 break 1099 1100 offset = current - self._current if current > self._current else 1 1101 1102 self._start = current 1103 self._advance(offset) 1104 1105 if not self._char.isspace(): 1106 if self._char.isdigit(): 1107 self._scan_number() 1108 elif self._char in self._IDENTIFIERS: 1109 self._scan_identifier(self._IDENTIFIERS[self._char]) 1110 else: 1111 self._scan_keywords() 1112 1113 if until and until(): 1114 break 1115 1116 if self.tokens and self._comments: 1117 self.tokens[-1].comments.extend(self._comments) 1118 1119 def _chars(self, size: int) -> str: 1120 if size == 1: 1121 return self._char 1122 1123 start = self._current - 1 1124 end = start + size 1125 1126 return self.sql[start:end] if end <= self.size else "" 1127 1128 def _advance(self, i: int = 1, alnum: bool = False) -> None: 1129 if self.WHITE_SPACE.get(self._char) is TokenType.BREAK: 1130 # Ensures we don't count an extra line if we get a \r\n line break sequence 1131 if not (self._char == "\r" and self._peek == "\n"): 1132 self._col = i 1133 self._line += 1 1134 else: 1135 self._col += i 1136 1137 self._current += i 1138 self._end = self._current >= self.size 1139 self._char = self.sql[self._current - 1] 1140 self._peek = "" if self._end else self.sql[self._current] 1141 1142 if alnum and self._char.isalnum(): 1143 # Here we use local variables instead of attributes for better performance 1144 _col = self._col 1145 _current = self._current 1146 _end = self._end 1147 _peek = self._peek 1148 1149 while _peek.isalnum(): 1150 _col += 1 1151 _current += 1 1152 _end = _current >= self.size 1153 _peek = "" if _end else self.sql[_current] 1154 1155 self._col = _col 1156 self._current = _current 1157 self._end = _end 1158 self._peek = _peek 1159 self._char = self.sql[_current - 1] 1160 1161 @property 1162 def _text(self) -> str: 1163 return self.sql[self._start : self._current] 1164 1165 def _add(self, token_type: TokenType, text: 
t.Optional[str] = None) -> None: 1166 self._prev_token_line = self._line 1167 1168 if self._comments and token_type == TokenType.SEMICOLON and self.tokens: 1169 self.tokens[-1].comments.extend(self._comments) 1170 self._comments = [] 1171 1172 self.tokens.append( 1173 Token( 1174 token_type, 1175 text=self._text if text is None else text, 1176 line=self._line, 1177 col=self._col, 1178 start=self._start, 1179 end=self._current - 1, 1180 comments=self._comments, 1181 ) 1182 ) 1183 self._comments = [] 1184 1185 # If we have either a semicolon or a begin token before the command's token, we'll parse 1186 # whatever follows the command's token as a string 1187 if ( 1188 token_type in self.COMMANDS 1189 and self._peek != ";" 1190 and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS) 1191 ): 1192 start = self._current 1193 tokens = len(self.tokens) 1194 self._scan(lambda: self._peek == ";") 1195 self.tokens = self.tokens[:tokens] 1196 text = self.sql[start : self._current].strip() 1197 if text: 1198 self._add(TokenType.STRING, text) 1199 1200 def _scan_keywords(self) -> None: 1201 size = 0 1202 word = None 1203 chars = self._text 1204 char = chars 1205 prev_space = False 1206 skip = False 1207 trie = self._KEYWORD_TRIE 1208 single_token = char in self.SINGLE_TOKENS 1209 1210 while chars: 1211 if skip: 1212 result = TrieResult.PREFIX 1213 else: 1214 result, trie = in_trie(trie, char.upper()) 1215 1216 if result == TrieResult.FAILED: 1217 break 1218 if result == TrieResult.EXISTS: 1219 word = chars 1220 1221 end = self._current + size 1222 size += 1 1223 1224 if end < self.size: 1225 char = self.sql[end] 1226 single_token = single_token or char in self.SINGLE_TOKENS 1227 is_space = char.isspace() 1228 1229 if not is_space or not prev_space: 1230 if is_space: 1231 char = " " 1232 chars += char 1233 prev_space = is_space 1234 skip = False 1235 else: 1236 skip = True 1237 else: 1238 char = "" 1239 break 1240 1241 if word: 1242 if self._scan_string(word): 1243 return 1244 if self._scan_comment(word): 1245 return 1246 if prev_space or single_token or not char: 1247 self._advance(size - 1) 1248 word = word.upper() 1249 self._add(self.KEYWORDS[word], text=word) 1250 return 1251 1252 if self._char in self.SINGLE_TOKENS: 1253 self._add(self.SINGLE_TOKENS[self._char], text=self._char) 1254 return 1255 1256 self._scan_var() 1257 1258 def _scan_comment(self, comment_start: str) -> bool: 1259 if comment_start not in self._COMMENTS: 1260 return False 1261 1262 comment_start_line = self._line 1263 comment_start_size = len(comment_start) 1264 comment_end = self._COMMENTS[comment_start] 1265 1266 if comment_end: 1267 # Skip the comment's start delimiter 1268 self._advance(comment_start_size) 1269 1270 comment_count = 1 1271 comment_end_size = len(comment_end) 1272 1273 while not self._end: 1274 if self._chars(comment_end_size) == comment_end: 1275 comment_count -= 1 1276 if not comment_count: 1277 break 1278 1279 self._advance(alnum=True) 1280 1281 # Nested comments are allowed by some dialects, e.g. 
databricks, duckdb, postgres 1282 if ( 1283 self.NESTED_COMMENTS 1284 and not self._end 1285 and self._chars(comment_end_size) == comment_start 1286 ): 1287 self._advance(comment_start_size) 1288 comment_count += 1 1289 1290 self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) 1291 self._advance(comment_end_size - 1) 1292 else: 1293 while not self._end and self.WHITE_SPACE.get(self._peek) is not TokenType.BREAK: 1294 self._advance(alnum=True) 1295 self._comments.append(self._text[comment_start_size:]) 1296 1297 if ( 1298 comment_start == self.HINT_START 1299 and self.tokens 1300 and self.tokens[-1].token_type in self.TOKENS_PRECEDING_HINT 1301 ): 1302 self._add(TokenType.HINT) 1303 1304 # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding. 1305 # Multiple consecutive comments are preserved by appending them to the current comments list. 1306 if comment_start_line == self._prev_token_line: 1307 self.tokens[-1].comments.extend(self._comments) 1308 self._comments = [] 1309 self._prev_token_line = self._line 1310 1311 return True 1312 1313 def _scan_number(self) -> None: 1314 if self._char == "0": 1315 peek = self._peek.upper() 1316 if peek == "B": 1317 return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER) 1318 elif peek == "X": 1319 return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER) 1320 1321 decimal = False 1322 scientific = 0 1323 1324 while True: 1325 if self._peek.isdigit(): 1326 self._advance() 1327 elif self._peek == "." and not decimal: 1328 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER: 1329 return self._add(TokenType.NUMBER) 1330 decimal = True 1331 self._advance() 1332 elif self._peek in ("-", "+") and scientific == 1: 1333 scientific += 1 1334 self._advance() 1335 elif self._peek.upper() == "E" and not scientific: 1336 scientific += 1 1337 self._advance() 1338 elif self._peek.isidentifier(): 1339 number_text = self._text 1340 literal = "" 1341 1342 while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: 1343 literal += self._peek 1344 self._advance() 1345 1346 token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), "")) 1347 1348 if token_type: 1349 self._add(TokenType.NUMBER, number_text) 1350 self._add(TokenType.DCOLON, "::") 1351 return self._add(token_type, literal) 1352 else: 1353 replaced = literal.replace("_", "") 1354 if self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED and replaced.isdigit(): 1355 return self._add(TokenType.NUMBER, number_text + replaced) 1356 if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT: 1357 return self._add(TokenType.VAR) 1358 1359 self._advance(-len(literal)) 1360 return self._add(TokenType.NUMBER, number_text) 1361 else: 1362 return self._add(TokenType.NUMBER) 1363 1364 def _scan_bits(self) -> None: 1365 self._advance() 1366 value = self._extract_value() 1367 try: 1368 # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier 1369 int(value, 2) 1370 self._add(TokenType.BIT_STRING, value[2:]) # Drop the 0b 1371 except ValueError: 1372 self._add(TokenType.IDENTIFIER) 1373 1374 def _scan_hex(self) -> None: 1375 self._advance() 1376 value = self._extract_value() 1377 try: 1378 # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier 1379 int(value, 16) 1380 self._add(TokenType.HEX_STRING, value[2:]) # Drop the 0x 1381 except ValueError: 1382 self._add(TokenType.IDENTIFIER) 1383 1384 def _extract_value(self) -> str: 1385 while 
True: 1386 char = self._peek.strip() 1387 if char and char not in self.SINGLE_TOKENS: 1388 self._advance(alnum=True) 1389 else: 1390 break 1391 1392 return self._text 1393 1394 def _scan_string(self, start: str) -> bool: 1395 base = None 1396 token_type = TokenType.STRING 1397 1398 if start in self._QUOTES: 1399 end = self._QUOTES[start] 1400 elif start in self._FORMAT_STRINGS: 1401 end, token_type = self._FORMAT_STRINGS[start] 1402 1403 if token_type == TokenType.HEX_STRING: 1404 base = 16 1405 elif token_type == TokenType.BIT_STRING: 1406 base = 2 1407 elif token_type == TokenType.HEREDOC_STRING: 1408 self._advance() 1409 1410 if self._char == end: 1411 tag = "" 1412 else: 1413 tag = self._extract_string( 1414 end, 1415 raw_string=True, 1416 raise_unmatched=not self.HEREDOC_TAG_IS_IDENTIFIER, 1417 ) 1418 1419 if tag and self.HEREDOC_TAG_IS_IDENTIFIER and (self._end or not tag.isidentifier()): 1420 if not self._end: 1421 self._advance(-1) 1422 1423 self._advance(-len(tag)) 1424 self._add(self.HEREDOC_STRING_ALTERNATIVE) 1425 return True 1426 1427 end = f"{start}{tag}{end}" 1428 else: 1429 return False 1430 1431 self._advance(len(start)) 1432 text = self._extract_string(end, raw_string=token_type == TokenType.RAW_STRING) 1433 1434 if base and text: 1435 try: 1436 int(text, base) 1437 except Exception: 1438 raise TokenError( 1439 f"Numeric string contains invalid characters from {self._line}:{self._start}" 1440 ) 1441 1442 self._add(token_type, text) 1443 return True 1444 1445 def _scan_identifier(self, identifier_end: str) -> None: 1446 self._advance() 1447 text = self._extract_string( 1448 identifier_end, escapes=self._IDENTIFIER_ESCAPES | {identifier_end} 1449 ) 1450 self._add(TokenType.IDENTIFIER, text) 1451 1452 def _scan_var(self) -> None: 1453 while True: 1454 char = self._peek.strip() 1455 if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS): 1456 self._advance(alnum=True) 1457 else: 1458 break 1459 1460 self._add( 1461 TokenType.VAR 1462 if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER 1463 else self.KEYWORDS.get(self._text.upper(), TokenType.VAR) 1464 ) 1465 1466 def _extract_string( 1467 self, 1468 delimiter: str, 1469 escapes: t.Optional[t.Set[str]] = None, 1470 raw_string: bool = False, 1471 raise_unmatched: bool = True, 1472 ) -> str: 1473 text = "" 1474 delim_size = len(delimiter) 1475 escapes = self._STRING_ESCAPES if escapes is None else escapes 1476 1477 while True: 1478 if ( 1479 not raw_string 1480 and self.dialect.UNESCAPED_SEQUENCES 1481 and self._peek 1482 and self._char in self.STRING_ESCAPES 1483 ): 1484 unescaped_sequence = self.dialect.UNESCAPED_SEQUENCES.get(self._char + self._peek) 1485 if unescaped_sequence: 1486 self._advance(2) 1487 text += unescaped_sequence 1488 continue 1489 if ( 1490 (self.STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS or not raw_string) 1491 and self._char in escapes 1492 and (self._peek == delimiter or self._peek in escapes) 1493 and (self._char not in self._QUOTES or self._char == self._peek) 1494 ): 1495 if self._peek == delimiter: 1496 text += self._peek 1497 else: 1498 text += self._char + self._peek 1499 1500 if self._current + 1 < self.size: 1501 self._advance(2) 1502 else: 1503 raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}") 1504 else: 1505 if self._chars(delim_size) == delimiter: 1506 if delim_size > 1: 1507 self._advance(delim_size - 1) 1508 break 1509 1510 if self._end: 1511 if not raise_unmatched: 1512 return text + self._char 1513 1514 raise TokenError(f"Missing 
{delimiter} from {self._line}:{self._start}") 1515 1516 current = self._current - 1 1517 self._advance(alnum=True) 1518 text += self.sql[current : self._current - 1] 1519 1520 return text 1521 1522 def tokenize_rs(self, sql: str) -> t.List[Token]: 1523 if not self._RS_TOKENIZER: 1524 raise SqlglotError("Rust tokenizer is not available") 1525 1526 tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings) 1527 for token in tokens: 1528 token.token_type = _ALL_TOKEN_TYPES[token.token_type_index] 1529 1530 # Setting this here so partial token lists can be inspected even if there is a failure 1531 self.tokens = tokens 1532 1533 if error_msg is not None: 1534 raise TokenError(error_msg) 1535 1536 return tokens
Tokenizer( dialect: Union[str, sqlglot.dialects.Dialect, Type[sqlglot.dialects.Dialect], NoneType] = None, use_rs_tokenizer: Optional[bool] = None, **opts: Any)
def __init__(
    self,
    dialect: DialectType = None,
    use_rs_tokenizer: t.Optional[bool] = None,
    **opts: t.Any,
) -> None:
    from sqlglot.dialects import Dialect

    self.dialect = Dialect.get_or_raise(dialect)

    # initialize `use_rs_tokenizer`, and allow it to be overwritten per Tokenizer instance
    self.use_rs_tokenizer = (
        use_rs_tokenizer if use_rs_tokenizer is not None else USE_RS_TOKENIZER
    )

    if self.use_rs_tokenizer:
        self._rs_dialect_settings = RsTokenizerDialectSettings(
            unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
            identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
        )

    self.reset()
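A short usage sketch of the constructor. The dialect argument accepts whatever Dialect.get_or_raise understands (a dialect name, class, instance, or None), and use_rs_tokenizer=False forces the pure-Python path even when sqlglotrs is installed.

    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()                           # default dialect; Rust tokenizer if available
    py_tokenizer = Tokenizer(use_rs_tokenizer=False)  # force the pure-Python implementation

    print(py_tokenizer.tokenize("SELECT 1"))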
SINGLE_TOKENS =
{'(': <TokenType.L_PAREN: 'L_PAREN'>, ')': <TokenType.R_PAREN: 'R_PAREN'>, '[': <TokenType.L_BRACKET: 'L_BRACKET'>, ']': <TokenType.R_BRACKET: 'R_BRACKET'>, '{': <TokenType.L_BRACE: 'L_BRACE'>, '}': <TokenType.R_BRACE: 'R_BRACE'>, '&': <TokenType.AMP: 'AMP'>, '^': <TokenType.CARET: 'CARET'>, ':': <TokenType.COLON: 'COLON'>, ',': <TokenType.COMMA: 'COMMA'>, '.': <TokenType.DOT: 'DOT'>, '-': <TokenType.DASH: 'DASH'>, '=': <TokenType.EQ: 'EQ'>, '>': <TokenType.GT: 'GT'>, '<': <TokenType.LT: 'LT'>, '%': <TokenType.MOD: 'MOD'>, '!': <TokenType.NOT: 'NOT'>, '|': <TokenType.PIPE: 'PIPE'>, '+': <TokenType.PLUS: 'PLUS'>, ';': <TokenType.SEMICOLON: 'SEMICOLON'>, '/': <TokenType.SLASH: 'SLASH'>, '\\': <TokenType.BACKSLASH: 'BACKSLASH'>, '*': <TokenType.STAR: 'STAR'>, '~': <TokenType.TILDA: 'TILDA'>, '?': <TokenType.PLACEHOLDER: 'PLACEHOLDER'>, '@': <TokenType.PARAMETER: 'PARAMETER'>, '#': <TokenType.HASH: 'HASH'>, "'": <TokenType.UNKNOWN: 'UNKNOWN'>, '`': <TokenType.UNKNOWN: 'UNKNOWN'>, '"': <TokenType.UNKNOWN: 'UNKNOWN'>}
HEREDOC_STRING_ALTERNATIVE =
<TokenType.VAR: 'VAR'>
TOKENS_PRECEDING_HINT =
{<TokenType.INSERT: 'INSERT'>, <TokenType.SELECT: 'SELECT'>, <TokenType.DELETE: 'DELETE'>, <TokenType.UPDATE: 'UPDATE'>}
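Based on the hint handling in _scan_comment shown in the class source above, a "/*+ ... */" comment that directly follows one of these token types is expected to be surfaced as a HINT token instead of being attached as an ordinary comment. A hedged sketch:

    from sqlglot.tokens import Tokenizer, TokenType

    tokens = Tokenizer().tokenize("SELECT /*+ BROADCAST(t) */ a FROM t")
    # The first two token types are expected to be SELECT and HINT.
    print([t.token_type for t in tokens[:2]])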
KEYWORDS: Dict[str, TokenType] =
{'{%': <TokenType.BLOCK_START: 'BLOCK_START'>, '{%+': <TokenType.BLOCK_START: 'BLOCK_START'>, '{%-': <TokenType.BLOCK_START: 'BLOCK_START'>, '%}': <TokenType.BLOCK_END: 'BLOCK_END'>, '+%}': <TokenType.BLOCK_END: 'BLOCK_END'>, '-%}': <TokenType.BLOCK_END: 'BLOCK_END'>, '{{+': <TokenType.BLOCK_START: 'BLOCK_START'>, '{{-': <TokenType.BLOCK_START: 'BLOCK_START'>, '+}}': <TokenType.BLOCK_END: 'BLOCK_END'>, '-}}': <TokenType.BLOCK_END: 'BLOCK_END'>, '/*+': <TokenType.HINT: 'HINT'>, '==': <TokenType.EQ: 'EQ'>, '::': <TokenType.DCOLON: 'DCOLON'>, '||': <TokenType.DPIPE: 'DPIPE'>, '|>': <TokenType.PIPE_GT: 'PIPE_GT'>, '>=': <TokenType.GTE: 'GTE'>, '<=': <TokenType.LTE: 'LTE'>, '<>': <TokenType.NEQ: 'NEQ'>, '!=': <TokenType.NEQ: 'NEQ'>, ':=': <TokenType.COLON_EQ: 'COLON_EQ'>, '<=>': <TokenType.NULLSAFE_EQ: 'NULLSAFE_EQ'>, '->': <TokenType.ARROW: 'ARROW'>, '->>': <TokenType.DARROW: 'DARROW'>, '=>': <TokenType.FARROW: 'FARROW'>, '#>': <TokenType.HASH_ARROW: 'HASH_ARROW'>, '#>>': <TokenType.DHASH_ARROW: 'DHASH_ARROW'>, '<->': <TokenType.LR_ARROW: 'LR_ARROW'>, '&&': <TokenType.DAMP: 'DAMP'>, '??': <TokenType.DQMARK: 'DQMARK'>, '~~~': <TokenType.GLOB: 'GLOB'>, '~~': <TokenType.LIKE: 'LIKE'>, '~~*': <TokenType.ILIKE: 'ILIKE'>, '~*': <TokenType.IRLIKE: 'IRLIKE'>, 'ALL': <TokenType.ALL: 'ALL'>, 'AND': <TokenType.AND: 'AND'>, 'ANTI': <TokenType.ANTI: 'ANTI'>, 'ANY': <TokenType.ANY: 'ANY'>, 'ASC': <TokenType.ASC: 'ASC'>, 'AS': <TokenType.ALIAS: 'ALIAS'>, 'ASOF': <TokenType.ASOF: 'ASOF'>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 'AUTO_INCREMENT'>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 'AUTO_INCREMENT'>, 'BEGIN': <TokenType.BEGIN: 'BEGIN'>, 'BETWEEN': <TokenType.BETWEEN: 'BETWEEN'>, 'CACHE': <TokenType.CACHE: 'CACHE'>, 'UNCACHE': <TokenType.UNCACHE: 'UNCACHE'>, 'CASE': <TokenType.CASE: 'CASE'>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 'CHARACTER_SET'>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 'CLUSTER_BY'>, 'COLLATE': <TokenType.COLLATE: 'COLLATE'>, 'COLUMN': <TokenType.COLUMN: 'COLUMN'>, 'COMMIT': <TokenType.COMMIT: 'COMMIT'>, 'CONNECT BY': <TokenType.CONNECT_BY: 'CONNECT_BY'>, 'CONSTRAINT': <TokenType.CONSTRAINT: 'CONSTRAINT'>, 'COPY': <TokenType.COPY: 'COPY'>, 'CREATE': <TokenType.CREATE: 'CREATE'>, 'CROSS': <TokenType.CROSS: 'CROSS'>, 'CUBE': <TokenType.CUBE: 'CUBE'>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 'CURRENT_DATE'>, 'CURRENT_SCHEMA': <TokenType.CURRENT_SCHEMA: 'CURRENT_SCHEMA'>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 'CURRENT_TIME'>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'>, 'CURRENT_USER': <TokenType.CURRENT_USER: 'CURRENT_USER'>, 'DATABASE': <TokenType.DATABASE: 'DATABASE'>, 'DEFAULT': <TokenType.DEFAULT: 'DEFAULT'>, 'DELETE': <TokenType.DELETE: 'DELETE'>, 'DESC': <TokenType.DESC: 'DESC'>, 'DESCRIBE': <TokenType.DESCRIBE: 'DESCRIBE'>, 'DISTINCT': <TokenType.DISTINCT: 'DISTINCT'>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 'DISTRIBUTE_BY'>, 'DIV': <TokenType.DIV: 'DIV'>, 'DROP': <TokenType.DROP: 'DROP'>, 'ELSE': <TokenType.ELSE: 'ELSE'>, 'END': <TokenType.END: 'END'>, 'ENUM': <TokenType.ENUM: 'ENUM'>, 'ESCAPE': <TokenType.ESCAPE: 'ESCAPE'>, 'EXCEPT': <TokenType.EXCEPT: 'EXCEPT'>, 'EXECUTE': <TokenType.EXECUTE: 'EXECUTE'>, 'EXISTS': <TokenType.EXISTS: 'EXISTS'>, 'FALSE': <TokenType.FALSE: 'FALSE'>, 'FETCH': <TokenType.FETCH: 'FETCH'>, 'FILTER': <TokenType.FILTER: 'FILTER'>, 'FIRST': <TokenType.FIRST: 'FIRST'>, 'FULL': <TokenType.FULL: 'FULL'>, 'FUNCTION': <TokenType.FUNCTION: 'FUNCTION'>, 'FOR': <TokenType.FOR: 'FOR'>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 
'FOREIGN_KEY'>, 'FORMAT': <TokenType.FORMAT: 'FORMAT'>, 'FROM': <TokenType.FROM: 'FROM'>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 'GEOGRAPHY'>, 'GEOMETRY': <TokenType.GEOMETRY: 'GEOMETRY'>, 'GLOB': <TokenType.GLOB: 'GLOB'>, 'GROUP BY': <TokenType.GROUP_BY: 'GROUP_BY'>, 'GROUPING SETS': <TokenType.GROUPING_SETS: 'GROUPING_SETS'>, 'HAVING': <TokenType.HAVING: 'HAVING'>, 'ILIKE': <TokenType.ILIKE: 'ILIKE'>, 'IN': <TokenType.IN: 'IN'>, 'INDEX': <TokenType.INDEX: 'INDEX'>, 'INET': <TokenType.INET: 'INET'>, 'INNER': <TokenType.INNER: 'INNER'>, 'INSERT': <TokenType.INSERT: 'INSERT'>, 'INTERVAL': <TokenType.INTERVAL: 'INTERVAL'>, 'INTERSECT': <TokenType.INTERSECT: 'INTERSECT'>, 'INTO': <TokenType.INTO: 'INTO'>, 'IS': <TokenType.IS: 'IS'>, 'ISNULL': <TokenType.ISNULL: 'ISNULL'>, 'JOIN': <TokenType.JOIN: 'JOIN'>, 'KEEP': <TokenType.KEEP: 'KEEP'>, 'KILL': <TokenType.KILL: 'KILL'>, 'LATERAL': <TokenType.LATERAL: 'LATERAL'>, 'LEFT': <TokenType.LEFT: 'LEFT'>, 'LIKE': <TokenType.LIKE: 'LIKE'>, 'LIMIT': <TokenType.LIMIT: 'LIMIT'>, 'LOAD': <TokenType.LOAD: 'LOAD'>, 'LOCK': <TokenType.LOCK: 'LOCK'>, 'MERGE': <TokenType.MERGE: 'MERGE'>, 'NAMESPACE': <TokenType.NAMESPACE: 'NAMESPACE'>, 'NATURAL': <TokenType.NATURAL: 'NATURAL'>, 'NEXT': <TokenType.NEXT: 'NEXT'>, 'NOT': <TokenType.NOT: 'NOT'>, 'NOTNULL': <TokenType.NOTNULL: 'NOTNULL'>, 'NULL': <TokenType.NULL: 'NULL'>, 'OBJECT': <TokenType.OBJECT: 'OBJECT'>, 'OFFSET': <TokenType.OFFSET: 'OFFSET'>, 'ON': <TokenType.ON: 'ON'>, 'OR': <TokenType.OR: 'OR'>, 'XOR': <TokenType.XOR: 'XOR'>, 'ORDER BY': <TokenType.ORDER_BY: 'ORDER_BY'>, 'ORDINALITY': <TokenType.ORDINALITY: 'ORDINALITY'>, 'OUTER': <TokenType.OUTER: 'OUTER'>, 'OVER': <TokenType.OVER: 'OVER'>, 'OVERLAPS': <TokenType.OVERLAPS: 'OVERLAPS'>, 'OVERWRITE': <TokenType.OVERWRITE: 'OVERWRITE'>, 'PARTITION': <TokenType.PARTITION: 'PARTITION'>, 'PARTITION BY': <TokenType.PARTITION_BY: 'PARTITION_BY'>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 'PARTITION_BY'>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 'PARTITION_BY'>, 'PERCENT': <TokenType.PERCENT: 'PERCENT'>, 'PIVOT': <TokenType.PIVOT: 'PIVOT'>, 'PRAGMA': <TokenType.PRAGMA: 'PRAGMA'>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 'PRIMARY_KEY'>, 'PROCEDURE': <TokenType.PROCEDURE: 'PROCEDURE'>, 'QUALIFY': <TokenType.QUALIFY: 'QUALIFY'>, 'RANGE': <TokenType.RANGE: 'RANGE'>, 'RECURSIVE': <TokenType.RECURSIVE: 'RECURSIVE'>, 'REGEXP': <TokenType.RLIKE: 'RLIKE'>, 'RENAME': <TokenType.RENAME: 'RENAME'>, 'REPLACE': <TokenType.REPLACE: 'REPLACE'>, 'RETURNING': <TokenType.RETURNING: 'RETURNING'>, 'REFERENCES': <TokenType.REFERENCES: 'REFERENCES'>, 'RIGHT': <TokenType.RIGHT: 'RIGHT'>, 'RLIKE': <TokenType.RLIKE: 'RLIKE'>, 'ROLLBACK': <TokenType.ROLLBACK: 'ROLLBACK'>, 'ROLLUP': <TokenType.ROLLUP: 'ROLLUP'>, 'ROW': <TokenType.ROW: 'ROW'>, 'ROWS': <TokenType.ROWS: 'ROWS'>, 'SCHEMA': <TokenType.SCHEMA: 'SCHEMA'>, 'SELECT': <TokenType.SELECT: 'SELECT'>, 'SEMI': <TokenType.SEMI: 'SEMI'>, 'SESSION': <TokenType.SESSION: 'SESSION'>, 'SET': <TokenType.SET: 'SET'>, 'SETTINGS': <TokenType.SETTINGS: 'SETTINGS'>, 'SHOW': <TokenType.SHOW: 'SHOW'>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 'SIMILAR_TO'>, 'SOME': <TokenType.SOME: 'SOME'>, 'SORT BY': <TokenType.SORT_BY: 'SORT_BY'>, 'START WITH': <TokenType.START_WITH: 'START_WITH'>, 'STRAIGHT_JOIN': <TokenType.STRAIGHT_JOIN: 'STRAIGHT_JOIN'>, 'TABLE': <TokenType.TABLE: 'TABLE'>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 'TABLE_SAMPLE'>, 'TEMP': <TokenType.TEMPORARY: 'TEMPORARY'>, 'TEMPORARY': <TokenType.TEMPORARY: 'TEMPORARY'>, 'THEN': <TokenType.THEN: 'THEN'>, 
'TRUE': <TokenType.TRUE: 'TRUE'>, 'TRUNCATE': <TokenType.TRUNCATE: 'TRUNCATE'>, 'UNION': <TokenType.UNION: 'UNION'>, 'UNKNOWN': <TokenType.UNKNOWN: 'UNKNOWN'>, 'UNNEST': <TokenType.UNNEST: 'UNNEST'>, 'UNPIVOT': <TokenType.UNPIVOT: 'UNPIVOT'>, 'UPDATE': <TokenType.UPDATE: 'UPDATE'>, 'USE': <TokenType.USE: 'USE'>, 'USING': <TokenType.USING: 'USING'>, 'UUID': <TokenType.UUID: 'UUID'>, 'VALUES': <TokenType.VALUES: 'VALUES'>, 'VIEW': <TokenType.VIEW: 'VIEW'>, 'VOLATILE': <TokenType.VOLATILE: 'VOLATILE'>, 'WHEN': <TokenType.WHEN: 'WHEN'>, 'WHERE': <TokenType.WHERE: 'WHERE'>, 'WINDOW': <TokenType.WINDOW: 'WINDOW'>, 'WITH': <TokenType.WITH: 'WITH'>, 'APPLY': <TokenType.APPLY: 'APPLY'>, 'ARRAY': <TokenType.ARRAY: 'ARRAY'>, 'BIT': <TokenType.BIT: 'BIT'>, 'BOOL': <TokenType.BOOLEAN: 'BOOLEAN'>, 'BOOLEAN': <TokenType.BOOLEAN: 'BOOLEAN'>, 'BYTE': <TokenType.TINYINT: 'TINYINT'>, 'MEDIUMINT': <TokenType.MEDIUMINT: 'MEDIUMINT'>, 'INT1': <TokenType.TINYINT: 'TINYINT'>, 'TINYINT': <TokenType.TINYINT: 'TINYINT'>, 'INT16': <TokenType.SMALLINT: 'SMALLINT'>, 'SHORT': <TokenType.SMALLINT: 'SMALLINT'>, 'SMALLINT': <TokenType.SMALLINT: 'SMALLINT'>, 'HUGEINT': <TokenType.INT128: 'INT128'>, 'UHUGEINT': <TokenType.UINT128: 'UINT128'>, 'INT2': <TokenType.SMALLINT: 'SMALLINT'>, 'INTEGER': <TokenType.INT: 'INT'>, 'INT': <TokenType.INT: 'INT'>, 'INT4': <TokenType.INT: 'INT'>, 'INT32': <TokenType.INT: 'INT'>, 'INT64': <TokenType.BIGINT: 'BIGINT'>, 'INT128': <TokenType.INT128: 'INT128'>, 'INT256': <TokenType.INT256: 'INT256'>, 'LONG': <TokenType.BIGINT: 'BIGINT'>, 'BIGINT': <TokenType.BIGINT: 'BIGINT'>, 'INT8': <TokenType.TINYINT: 'TINYINT'>, 'UINT': <TokenType.UINT: 'UINT'>, 'UINT128': <TokenType.UINT128: 'UINT128'>, 'UINT256': <TokenType.UINT256: 'UINT256'>, 'DEC': <TokenType.DECIMAL: 'DECIMAL'>, 'DECIMAL': <TokenType.DECIMAL: 'DECIMAL'>, 'DECIMAL32': <TokenType.DECIMAL32: 'DECIMAL32'>, 'DECIMAL64': <TokenType.DECIMAL64: 'DECIMAL64'>, 'DECIMAL128': <TokenType.DECIMAL128: 'DECIMAL128'>, 'DECIMAL256': <TokenType.DECIMAL256: 'DECIMAL256'>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 'BIGDECIMAL'>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 'BIGDECIMAL'>, 'LIST': <TokenType.LIST: 'LIST'>, 'MAP': <TokenType.MAP: 'MAP'>, 'NULLABLE': <TokenType.NULLABLE: 'NULLABLE'>, 'NUMBER': <TokenType.DECIMAL: 'DECIMAL'>, 'NUMERIC': <TokenType.DECIMAL: 'DECIMAL'>, 'FIXED': <TokenType.DECIMAL: 'DECIMAL'>, 'REAL': <TokenType.FLOAT: 'FLOAT'>, 'FLOAT': <TokenType.FLOAT: 'FLOAT'>, 'FLOAT4': <TokenType.FLOAT: 'FLOAT'>, 'FLOAT8': <TokenType.DOUBLE: 'DOUBLE'>, 'DOUBLE': <TokenType.DOUBLE: 'DOUBLE'>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 'DOUBLE'>, 'JSON': <TokenType.JSON: 'JSON'>, 'JSONB': <TokenType.JSONB: 'JSONB'>, 'CHAR': <TokenType.CHAR: 'CHAR'>, 'CHARACTER': <TokenType.CHAR: 'CHAR'>, 'CHAR VARYING': <TokenType.VARCHAR: 'VARCHAR'>, 'CHARACTER VARYING': <TokenType.VARCHAR: 'VARCHAR'>, 'NCHAR': <TokenType.NCHAR: 'NCHAR'>, 'VARCHAR': <TokenType.VARCHAR: 'VARCHAR'>, 'VARCHAR2': <TokenType.VARCHAR: 'VARCHAR'>, 'NVARCHAR': <TokenType.NVARCHAR: 'NVARCHAR'>, 'NVARCHAR2': <TokenType.NVARCHAR: 'NVARCHAR'>, 'BPCHAR': <TokenType.BPCHAR: 'BPCHAR'>, 'STR': <TokenType.TEXT: 'TEXT'>, 'STRING': <TokenType.TEXT: 'TEXT'>, 'TEXT': <TokenType.TEXT: 'TEXT'>, 'LONGTEXT': <TokenType.LONGTEXT: 'LONGTEXT'>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 'MEDIUMTEXT'>, 'TINYTEXT': <TokenType.TINYTEXT: 'TINYTEXT'>, 'CLOB': <TokenType.TEXT: 'TEXT'>, 'LONGVARCHAR': <TokenType.TEXT: 'TEXT'>, 'BINARY': <TokenType.BINARY: 'BINARY'>, 'BLOB': <TokenType.VARBINARY: 'VARBINARY'>, 'LONGBLOB': 
<TokenType.LONGBLOB: 'LONGBLOB'>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 'MEDIUMBLOB'>, 'TINYBLOB': <TokenType.TINYBLOB: 'TINYBLOB'>, 'BYTEA': <TokenType.VARBINARY: 'VARBINARY'>, 'VARBINARY': <TokenType.VARBINARY: 'VARBINARY'>, 'TIME': <TokenType.TIME: 'TIME'>, 'TIMETZ': <TokenType.TIMETZ: 'TIMETZ'>, 'TIMESTAMP': <TokenType.TIMESTAMP: 'TIMESTAMP'>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 'TIMESTAMPTZ'>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 'TIMESTAMPLTZ'>, 'TIMESTAMP_LTZ': <TokenType.TIMESTAMPLTZ: 'TIMESTAMPLTZ'>, 'TIMESTAMPNTZ': <TokenType.TIMESTAMPNTZ: 'TIMESTAMPNTZ'>, 'TIMESTAMP_NTZ': <TokenType.TIMESTAMPNTZ: 'TIMESTAMPNTZ'>, 'DATE': <TokenType.DATE: 'DATE'>, 'DATETIME': <TokenType.DATETIME: 'DATETIME'>, 'INT4RANGE': <TokenType.INT4RANGE: 'INT4RANGE'>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 'INT4MULTIRANGE'>, 'INT8RANGE': <TokenType.INT8RANGE: 'INT8RANGE'>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 'INT8MULTIRANGE'>, 'NUMRANGE': <TokenType.NUMRANGE: 'NUMRANGE'>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 'NUMMULTIRANGE'>, 'TSRANGE': <TokenType.TSRANGE: 'TSRANGE'>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 'TSMULTIRANGE'>, 'TSTZRANGE': <TokenType.TSTZRANGE: 'TSTZRANGE'>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 'TSTZMULTIRANGE'>, 'DATERANGE': <TokenType.DATERANGE: 'DATERANGE'>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 'DATEMULTIRANGE'>, 'UNIQUE': <TokenType.UNIQUE: 'UNIQUE'>, 'VECTOR': <TokenType.VECTOR: 'VECTOR'>, 'STRUCT': <TokenType.STRUCT: 'STRUCT'>, 'SEQUENCE': <TokenType.SEQUENCE: 'SEQUENCE'>, 'VARIANT': <TokenType.VARIANT: 'VARIANT'>, 'ALTER': <TokenType.ALTER: 'ALTER'>, 'ANALYZE': <TokenType.ANALYZE: 'ANALYZE'>, 'CALL': <TokenType.COMMAND: 'COMMAND'>, 'COMMENT': <TokenType.COMMENT: 'COMMENT'>, 'EXPLAIN': <TokenType.COMMAND: 'COMMAND'>, 'GRANT': <TokenType.GRANT: 'GRANT'>, 'REVOKE': <TokenType.REVOKE: 'REVOKE'>, 'OPTIMIZE': <TokenType.COMMAND: 'COMMAND'>, 'PREPARE': <TokenType.COMMAND: 'COMMAND'>, 'VACUUM': <TokenType.COMMAND: 'COMMAND'>, 'USER-DEFINED': <TokenType.USERDEFINED: 'USERDEFINED'>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 'VERSION_SNAPSHOT'>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 'TIMESTAMP_SNAPSHOT'>}
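The KEYWORDS mapping above associates keyword strings, including multi-word keywords such as 'GROUP BY' and 'PRIMARY KEY', with their token types. A minimal sketch of how this surfaces through the public tokenizer API (assuming the default dialect):

    from sqlglot.tokens import Tokenizer, TokenType

    # Multi-word keywords are recognized as a single token.
    tokens = Tokenizer().tokenize("SELECT a FROM t GROUP BY a")
    assert TokenType.GROUP_BY in {tok.token_type for tok in tokens}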
WHITE_SPACE: Dict[Optional[str], TokenType] =
{' ': <TokenType.SPACE: 'SPACE'>, '\t': <TokenType.SPACE: 'SPACE'>, '\n': <TokenType.BREAK: 'BREAK'>, '\r': <TokenType.BREAK: 'BREAK'>}
COMMANDS =
{<TokenType.COMMAND: 'COMMAND'>, <TokenType.FETCH: 'FETCH'>, <TokenType.EXECUTE: 'EXECUTE'>, <TokenType.RENAME: 'RENAME'>, <TokenType.SHOW: 'SHOW'>}
COMMAND_PREFIX_TOKENS =
{<TokenType.BEGIN: 'BEGIN'>, <TokenType.SEMICOLON: 'SEMICOLON'>}
def reset(self) -> None:
    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: t.List[str] = []

        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1
    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        if self.use_rs_tokenizer:
            return self.tokenize_rs(sql)

        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens
Returns a list of tokens corresponding to the SQL string sql.
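A hedged usage sketch: tokenize() resets the tokenizer's state on every call, so a single Tokenizer instance can be reused across statements.

    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()
    for stmt in ("SELECT 1", "SELECT 2"):
        for token in tokenizer.tokenize(stmt):
            print(token.token_type, token.text)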
    def tokenize_rs(self, sql: str) -> t.List[Token]:
        if not self._RS_TOKENIZER:
            raise SqlglotError("Rust tokenizer is not available")

        tokens, error_msg = self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
        for token in tokens:
            token.token_type = _ALL_TOKEN_TYPES[token.token_type_index]

        # Setting this here so partial token lists can be inspected even if there is a failure
        self.tokens = tokens

        if error_msg is not None:
            raise TokenError(error_msg)

        return tokens
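Both code paths leave the tokens produced so far on self.tokens before a TokenError propagates, so a partial token list remains inspectable after a failure. A minimal sketch; the unterminated string literal is only an illustrative way to provoke a TokenError:

    from sqlglot.errors import TokenError
    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()
    try:
        tokenizer.tokenize("SELECT 'unterminated")
    except TokenError:
        # Tokens produced before the error are still available for inspection.
        print([tok.text for tok in tokenizer.tokens])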