lib/prompt_toolkit/contrib/regular_languages/regex_parser.py

"""
Parser for parsing a regular expression.
Take a string representing a regular expression and return the root node of its
parse tree.

usage::

    root_node = parse_regex('(hello|world)')

Remarks:
- The regex parser processes multiline, it ignores all whitespace and supports
  multiple named groups with the same name and #-style comments.

Limitations:
- Lookahead is not supported.
"""

from __future__ import annotations

import re

__all__ = [
    "Repeat",
    "Variable",
    "Regex",
    "Lookahead",
    "tokenize_regex",
    "parse_regex",
]


class Node:
    """
    Base class for all the grammar nodes.
    (You don't initialize this one.)
    """

    def __add__(self, other_node: Node) -> NodeSequence:
        return NodeSequence([self, other_node])

    def __or__(self, other_node: Node) -> AnyNode:
        return AnyNode([self, other_node])


class AnyNode(Node):
    """
    Union operation (OR operation) between several grammars. You don't
    initialize this yourself, but it's a result of a "Grammar1 | Grammar2"
    operation.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __or__(self, other_node: Node) -> AnyNode:
        return AnyNode(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class NodeSequence(Node):
    """
    Concatenation operation of several grammars. You don't initialize this
    yourself, but it's a result of a "Grammar1 + Grammar2" operation.
    """

    def __init__(self, children: list[Node]) -> None:
        self.children = children

    def __add__(self, other_node: Node) -> NodeSequence:
        return NodeSequence(self.children + [other_node])

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.children!r})"


class Regex(Node):
    """
    Regular expression.
    """

    def __init__(self, regex: str) -> None:
        re.compile(regex)  # Validate

        self.regex = regex

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(/{self.regex}/)"


class Lookahead(Node):
    """
    Lookahead expression.
    """

    def __init__(self, childnode: Node, negative: bool = False) -> None:
        self.childnode = childnode
        self.negative = negative

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({self.childnode!r})"


class Variable(Node):
    """
    Mark a variable in the regular grammar. This will be translated into a
    named group. Each variable can have his own completer, validator, etc..

    :param childnode: The grammar which is wrapped inside this variable.
    :param varname: String.
    """

    def __init__(self, childnode: Node, varname: str = "") -> None:
        self.childnode = childnode
        self.varname = varname

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(childnode={self.childnode!r}, varname={self.varname!r})"


class Repeat(Node):
    def __init__(
        self,
        childnode: Node,
        min_repeat: int = 0,
        max_repeat: int | None = None,
        greedy: bool = True,
    ) -> None:
        self.childnode = childnode
        self.min_repeat = min_repeat
        self.max_repeat = max_repeat
        self.greedy = greedy

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(childnode={self.childnode!r})"


def tokenize_regex(input: str) -> list[str]:
    """
    Takes a string, representing a regular expression as input, and tokenizes
    it.

    :param input: string, representing a regular expression.
    :returns: List of tokens.
    """
    # Regular expression for tokenizing other regular expressions.
    p = re.compile(
        r"""^(
        \(\?P\<[a-zA-Z0-9_-]+\>  | # Start of named group.
        \(\?#[^)]*\)             | # Comment
        \(\?=                    | # Start of lookahead assertion
        \(\?!                    | # Start of negative lookahead assertion
        \(\?<=                   | # If preceded by.
        \(\?<                    | # If not preceded by.
        \(?:                     | # Start of group. (non capturing.)
        \(                       | # Start of group.
        \(?[iLmsux]              | # Flags.
        \(?P=[a-zA-Z]+\)         | # Back reference to named group
        \)                       | # End of group.
        \{[^{}]*\}               | # Repetition
        \*\? | \+\? | \?\?\      | # Non greedy repetition.
        \* | \+ | \?             | # Repetition
        \#.*\n                   | # Comment
        \\. |

        # Character group.
        \[
            ( [^\]\\]  |  \\.)*
        \]                  |

        [^(){}]             |
        .
    )""",
        re.VERBOSE,
    )

    tokens = []

    while input:
        m = p.match(input)
        if m:
            token, input = input[: m.end()], input[m.end() :]
            if not token.isspace():
                tokens.append(token)
        else:
            raise Exception("Could not tokenize input regex.")

    return tokens


def parse_regex(regex_tokens: list[str]) -> Node:
    """
    Takes a list of tokens from the tokenizer, and returns a parse tree.
    """
    # We add a closing brace because that represents the final pop of the stack.
    tokens: list[str] = [")"] + regex_tokens[::-1]

    def wrap(lst: list[Node]) -> Node:
        """Turn list into sequence when it contains several items."""
        if len(lst) == 1:
            return lst[0]
        else:
            return NodeSequence(lst)

    def _parse() -> Node:
        or_list: list[list[Node]] = []
        result: list[Node] = []

        def wrapped_result() -> Node:
            if or_list == []:
                return wrap(result)
            else:
                or_list.append(result)
                return AnyNode([wrap(i) for i in or_list])

        while tokens:
            t = tokens.pop()

            if t.startswith("(?P<"):
                variable = Variable(_parse(), varname=t[4:-1])
                result.append(variable)

            elif t in ("*", "*?"):
                greedy = t == "*"
                result[-1] = Repeat(result[-1], greedy=greedy)

            elif t in ("+", "+?"):
                greedy = t == "+"
                result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)

            elif t in ("?", "??"):
                if result == []:
                    raise Exception("Nothing to repeat." + repr(tokens))
                else:
                    greedy = t == "?"
                    result[-1] = Repeat(
                        result[-1], min_repeat=0, max_repeat=1, greedy=greedy
                    )

            elif t == "|":
                or_list.append(result)
                result = []

            elif t in ("(", "(?:"):
                result.append(_parse())

            elif t == "(?!":
                result.append(Lookahead(_parse(), negative=True))

            elif t == "(?=":
                result.append(Lookahead(_parse(), negative=False))

            elif t == ")":
                return wrapped_result()

            elif t.startswith("#"):
                pass

            elif t.startswith("{"):
                # TODO: implement!
                raise Exception(f"{t}-style repetition not yet supported")

            elif t.startswith("(?"):
                raise Exception(f"{t!r} not supported")

            elif t.isspace():
                pass
            else:
                result.append(Regex(t))

        raise Exception("Expecting ')' token")

    result = _parse()

    if len(tokens) != 0:
        raise Exception("Unmatched parentheses.")
    else:
        return result
Init im going to bed -=- 2026-02-12 02:28:23 +02:00			`"""`
			`Parser for parsing a regular expression.`
			`Take a string representing a regular expression and return the root node of its`
			`parse tree.`

			`usage::`

			`root_node = parse_regex('(hello\|world)')`

			`Remarks:`
			`- The regex parser processes multiline, it ignores all whitespace and supports`
			`multiple named groups with the same name and #-style comments.`

			`Limitations:`
			`- Lookahead is not supported.`
			`"""`

			`from __future__ import annotations`

			`import re`

			`__all__ = [`
			`"Repeat",`
			`"Variable",`
			`"Regex",`
			`"Lookahead",`
			`"tokenize_regex",`
			`"parse_regex",`
			`]`


			`class Node:`
			`"""`
			`Base class for all the grammar nodes.`
			`(You don't initialize this one.)`
			`"""`

			`def __add__(self, other_node: Node) -> NodeSequence:`
			`return NodeSequence([self, other_node])`

			`def __or__(self, other_node: Node) -> AnyNode:`
			`return AnyNode([self, other_node])`


			`class AnyNode(Node):`
			`"""`
			`Union operation (OR operation) between several grammars. You don't`
			`initialize this yourself, but it's a result of a "Grammar1 \| Grammar2"`
			`operation.`
			`"""`

			`def __init__(self, children: list[Node]) -> None:`
			`self.children = children`

			`def __or__(self, other_node: Node) -> AnyNode:`
			`return AnyNode(self.children + [other_node])`

			`def __repr__(self) -> str:`
			`return f"{self.__class__.__name__}({self.children!r})"`


			`class NodeSequence(Node):`
			`"""`
			`Concatenation operation of several grammars. You don't initialize this`
			`yourself, but it's a result of a "Grammar1 + Grammar2" operation.`
			`"""`

			`def __init__(self, children: list[Node]) -> None:`
			`self.children = children`

			`def __add__(self, other_node: Node) -> NodeSequence:`
			`return NodeSequence(self.children + [other_node])`

			`def __repr__(self) -> str:`
			`return f"{self.__class__.__name__}({self.children!r})"`


			`class Regex(Node):`
			`"""`
			`Regular expression.`
			`"""`

			`def __init__(self, regex: str) -> None:`
			`re.compile(regex) # Validate`

			`self.regex = regex`

			`def __repr__(self) -> str:`
			`return f"{self.__class__.__name__}(/{self.regex}/)"`


			`class Lookahead(Node):`
			`"""`
			`Lookahead expression.`
			`"""`

			`def __init__(self, childnode: Node, negative: bool = False) -> None:`
			`self.childnode = childnode`
			`self.negative = negative`

			`def __repr__(self) -> str:`
			`return f"{self.__class__.__name__}({self.childnode!r})"`


			`class Variable(Node):`
			`"""`
			`Mark a variable in the regular grammar. This will be translated into a`
			`named group. Each variable can have his own completer, validator, etc..`

			`:param childnode: The grammar which is wrapped inside this variable.`
			`:param varname: String.`
			`"""`

			`def __init__(self, childnode: Node, varname: str = "") -> None:`
			`self.childnode = childnode`
			`self.varname = varname`

			`def __repr__(self) -> str:`
			`return f"{self.__class__.__name__}(childnode={self.childnode!r}, varname={self.varname!r})"`


			`class Repeat(Node):`
			`def __init__(`
			`self,`
			`childnode: Node,`
			`min_repeat: int = 0,`
			`max_repeat: int \| None = None,`
			`greedy: bool = True,`
			`) -> None:`
			`self.childnode = childnode`
			`self.min_repeat = min_repeat`
			`self.max_repeat = max_repeat`
			`self.greedy = greedy`

			`def __repr__(self) -> str:`
			`return f"{self.__class__.__name__}(childnode={self.childnode!r})"`


			`def tokenize_regex(input: str) -> list[str]:`
			`"""`
			`Takes a string, representing a regular expression as input, and tokenizes`
			`it.`

			`:param input: string, representing a regular expression.`
			`:returns: List of tokens.`
			`"""`
			`# Regular expression for tokenizing other regular expressions.`
			`p = re.compile(`
			`r"""^(`
			`\(\?P\<[a-zA-Z0-9_-]+\> \| # Start of named group.`
			`\(\?#[^)]*\) \| # Comment`
			`\(\?= \| # Start of lookahead assertion`
			`\(\?! \| # Start of negative lookahead assertion`
			`\(\?<= \| # If preceded by.`
			`\(\?< \| # If not preceded by.`
			`\(?: \| # Start of group. (non capturing.)`
			`\( \| # Start of group.`
			`\(?[iLmsux] \| # Flags.`
			`\(?P=[a-zA-Z]+\) \| # Back reference to named group`
			`\) \| # End of group.`
			`\{[^{}]*\} \| # Repetition`
			`\*\? \| \+\? \| \?\?\ \| # Non greedy repetition.`
			`\* \| \+ \| \? \| # Repetition`
			`\#.*\n \| # Comment`
			`\\. \|`

			`# Character group.`
			`\[`
			`( [^\]\\] \| \\.)*`
			`\] \|`

			`[^(){}] \|`
			`.`
			`)""",`
			`re.VERBOSE,`
			`)`

			`tokens = []`

			`while input:`
			`m = p.match(input)`
			`if m:`
			`token, input = input[: m.end()], input[m.end() :]`
			`if not token.isspace():`
			`tokens.append(token)`
			`else:`
			`raise Exception("Could not tokenize input regex.")`

			`return tokens`


			`def parse_regex(regex_tokens: list[str]) -> Node:`
			`"""`
			`Takes a list of tokens from the tokenizer, and returns a parse tree.`
			`"""`
			`# We add a closing brace because that represents the final pop of the stack.`
			`tokens: list[str] = [")"] + regex_tokens[::-1]`

			`def wrap(lst: list[Node]) -> Node:`
			`"""Turn list into sequence when it contains several items."""`
			`if len(lst) == 1:`
			`return lst[0]`
			`else:`
			`return NodeSequence(lst)`

			`def _parse() -> Node:`
			`or_list: list[list[Node]] = []`
			`result: list[Node] = []`

			`def wrapped_result() -> Node:`
			`if or_list == []:`
			`return wrap(result)`
			`else:`
			`or_list.append(result)`
			`return AnyNode([wrap(i) for i in or_list])`

			`while tokens:`
			`t = tokens.pop()`

			`if t.startswith("(?P<"):`
			`variable = Variable(_parse(), varname=t[4:-1])`
			`result.append(variable)`

			`elif t in ("", "?"):`
			`greedy = t == "*"`
			`result[-1] = Repeat(result[-1], greedy=greedy)`

			`elif t in ("+", "+?"):`
			`greedy = t == "+"`
			`result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy)`

			`elif t in ("?", "??"):`
			`if result == []:`
			`raise Exception("Nothing to repeat." + repr(tokens))`
			`else:`
			`greedy = t == "?"`
			`result[-1] = Repeat(`
			`result[-1], min_repeat=0, max_repeat=1, greedy=greedy`
			`)`

			`elif t == "\|":`
			`or_list.append(result)`
			`result = []`

			`elif t in ("(", "(?:"):`
			`result.append(_parse())`

			`elif t == "(?!":`
			`result.append(Lookahead(_parse(), negative=True))`

			`elif t == "(?=":`
			`result.append(Lookahead(_parse(), negative=False))`

			`elif t == ")":`
			`return wrapped_result()`

			`elif t.startswith("#"):`
			`pass`

			`elif t.startswith("{"):`
			`# TODO: implement!`
			`raise Exception(f"{t}-style repetition not yet supported")`

			`elif t.startswith("(?"):`
			`raise Exception(f"{t!r} not supported")`

			`elif t.isspace():`
			`pass`
			`else:`
			`result.append(Regex(t))`

			`raise Exception("Expecting ')' token")`

			`result = _parse()`

			`if len(tokens) != 0:`
			`raise Exception("Unmatched parentheses.")`
			`else:`
			`return result`