complete regex parser

This commit is contained in:
Michael Schären 2026-03-05 23:07:00 +01:00
parent d0205282de
commit 374195c4d6

View File

@ -34,210 +34,123 @@ class ZeroOrMore(RegExp):
# See sample tests or test output for examples of usage. # See sample tests or test output for examples of usage.
def parse_regexp(pattern: str):
    """Parse *pattern* into a RegExp tree.

    Args:
        pattern: The regular expression source text.

    Returns:
        The root node produced by ``RegexParser.parse_regex``, or ``None``
        when the pattern is empty or syntactically invalid.
    """
    parser = RegexParser(pattern)
    try:
        result = parser.parse_regex()
    except ValueError:
        # The parser signals every syntax error with ValueError; callers of
        # this function only ever see None for a bad pattern.
        return None
    # The grammar stops at the first character it cannot use (e.g. a stray
    # ')'), so unconsumed input means the pattern as a whole is invalid.
    return result if parser.end() else None
class RegexParser:
    """Recursive-descent parser that turns a pattern string into a RegExp tree.

    Grammar, from lowest to highest precedence::

        regex         := alternation
        alternation   := concatenation [ "|" concatenation ]
        concatenation := repetition*
        repetition    := atom [ "*" ]
        atom          := "(" regex ")" | "." | <any other character>

    Every syntax error is reported by raising ``ValueError``.
    """

    def __init__(self, pattern: str) -> None:
        self.pattern = pattern
        # Index of the next character of ``pattern`` that has not been read.
        self.pos = 0

    # ----------------------------
    # helpers
    # ----------------------------

    def peek(self) -> "str | None":
        """Return the next unread character, or ``None`` at end of input."""
        if self.pos >= len(self.pattern):
            return None
        return self.pattern[self.pos]

    def consume(self) -> "str | None":
        """Return the next character and advance past it (``None`` at end)."""
        c = self.peek()
        if c is not None:
            self.pos += 1
        return c

    def end(self) -> bool:
        """Return True once the whole pattern has been consumed."""
        return self.pos >= len(self.pattern)

    # ----------------------------
    # regex grammar
    # ----------------------------

    def parse_regex(self) -> "RegExp | None":
        """Parse a full expression; returns ``None`` if nothing was parsed."""
        return self.parse_alternation()

    def parse_alternation(self) -> "RegExp | None":
        """Parse ``left`` or ``left|right``.

        Raises:
            ValueError: if either side of ``|`` is empty, or if a second
                ``|`` appears at the same nesting level.
        """
        left = self.parse_concatenation()
        if self.peek() != "|":
            return left

        self.consume()
        right = self.parse_concatenation()
        # An empty operand ("a|", "|a", "(|)") must be rejected instead of
        # silently building an Or node that holds None.
        if left is None or right is None:
            raise ValueError("'|' requires an expression on both sides")
        if self.peek() == "|":
            raise ValueError("Only one '|' allowed per group")
        return Or(left, right)

    def parse_concatenation(self) -> "RegExp | None":
        """Parse a run of repetitions up to ``|``, ``)`` or end of input.

        Returns ``None`` for an empty run, the node itself for a single
        element, and ``Str(nodes)`` for two or more.
        """
        nodes = []
        while True:
            c = self.peek()
            if c is None or c in "|)":
                break
            nodes.append(self.parse_repetition())

        if not nodes:
            return None
        if len(nodes) == 1:
            return nodes[0]
        return Str(nodes)

    def parse_repetition(self) -> "RegExp | None":
        """Parse an atom optionally followed by a single ``*``.

        Raises:
            ValueError: if two ``*`` appear directly after one another.
        """
        node = self.parse_atom()
        if self.peek() == "*":
            self.consume()
            # Look at the input rather than the node type: "(a*)*" has no
            # consecutive stars even though the inner node is a ZeroOrMore.
            if self.peek() == "*":
                raise ValueError("Consecutive '*' not allowed")
            node = ZeroOrMore(node)
        return node

    def parse_atom(self) -> "RegExp | None":
        """Parse a parenthesised group, ``.`` or a literal character.

        Returns ``None`` only when called at end of input.

        Raises:
            ValueError: on an unmatched ``(``, an empty group ``()``, or a
                ``*`` with nothing in front of it.
        """
        c = self.peek()
        if c is None:
            return None

        if c == "(":
            self.consume()
            node = self.parse_regex()
            if self.peek() != ")":
                raise ValueError("Unmatched '('")
            self.consume()
            # "()" yields no node; letting None through would place it
            # inside the surrounding Str / ZeroOrMore.
            if node is None:
                raise ValueError("Empty group")
            return node

        if c == "*":
            raise ValueError("'*' cannot start an expression")

        self.consume()
        if c == ".":
            return Any()
        return Normal(c)
if __name__ == "__main__": if __name__ == "__main__":