RegExParser with working test cases

This commit is contained in:
Michael Schären 2026-03-05 11:13:28 +01:00
parent 574d5af49b
commit 027c78ed62

View File

@ -1,20 +1,39 @@
class RegExp:
    """Base class for the nodes of a regular-expression AST.

    A node keeps the positional arguments it was constructed with, unchanged,
    in ``args``.
    """

    def __init__(self, *args):
        # Stored as the tuple Python passes in; subclasses add no state.
        self.args = args

    def __repr__(self):
        """Return ``ClassName(arg1, arg2, ...)`` using the repr of each argument."""
        args = ", ".join(map(repr, self.args))
        return f"{self.__class__.__name__}({args})"

    def __eq__(self, other):
        """Return True only for the exact same class with equal ``args``."""
        # `type(...) is type(...)` (not isinstance) keeps different node
        # classes with identical arguments from comparing equal.
        return type(self) is type(other) and self.args == other.args
# Concrete AST node types. They add no behaviour of their own: construction,
# repr and equality all come from RegExp, and the class itself is the tag.
# How Any and Normal are instantiated is not visible in this part of the file.


class Any(RegExp):
    pass


class Normal(RegExp):
    pass


# parse_regex builds Or nodes from two child nodes.
class Or(RegExp):
    pass


# parse_regex builds Str nodes from a single list of child nodes.
class Str(RegExp):
    pass


# parse_regex builds ZeroOrMore nodes from a single child node.
class ZeroOrMore(RegExp):
    pass
# Your task is to build an AST using those nodes.
# See sample tests or test output for examples of usage.
def parse_regexp(indata):
    """Parse ``indata`` by delegating to ``RegExParser``.

    Returns whatever ``RegExParser(indata).compile()`` returns.
    """
    return RegExParser(indata).compile()
@ -46,6 +65,8 @@ class RegExParser:
if any([not string or string.count("|") > 1 for string in sequences]): if any([not string or string.count("|") > 1 for string in sequences]):
return False return False
elif isinstance(sequences, str) and sequences.count("|") > 1:
return False
if self.pattern.find("*") == 0: if self.pattern.find("*") == 0:
return False return False
@ -57,33 +78,47 @@ class RegExParser:
@staticmethod
def parse_sequences(pattern):
    """Split ``pattern`` into literal runs and nested parenthesised groups.

    Text outside parentheses is kept as plain strings, and every top-level
    ``(...)`` group is split again by the same rules. A level that yields
    exactly one part collapses to that part, so ``"abc"`` and ``"(abc)"`` both
    give ``"abc"``. A level without any content (``""`` or ``"()"``) gives
    ``""`` instead of raising ``IndexError``.

    Args:
        pattern: The raw pattern text.

    Returns:
        A string, a (possibly nested) list of strings and lists, or ``None``
        when the parentheses are unbalanced.
    """

    # Recursion goes through a local helper so the splitting logic does not
    # depend on the name of the enclosing class.
    def split(text):
        parts = []
        buffer = []
        depth = 0

        for char in text:
            if char == "(":
                if depth == 0:
                    # A top-level group starts: flush the literal before it.
                    if buffer:
                        parts.append("".join(buffer))
                        buffer = []
                else:
                    # Nested parenthesis: part of the group being collected.
                    buffer.append(char)
                depth += 1
            elif char == ")":
                depth -= 1
                if depth < 0:
                    # Closing parenthesis without a matching opener.
                    return None
                if depth == 0:
                    # Top-level group closed: split its content recursively.
                    parts.append(split("".join(buffer)))
                    buffer = []
                else:
                    buffer.append(char)
            else:
                buffer.append(char)

        if depth != 0:
            # At least one "(" was never closed.
            return None
        if buffer:
            parts.append("".join(buffer))
        if not parts:
            return ""
        return parts if len(parts) > 1 else parts[0]

    return split(pattern)
@staticmethod @staticmethod
def parse_regex(sequences) -> RegExp | None: def parse_regex(sequences) -> RegExp | None:
@ -106,15 +141,50 @@ class RegExParser:
else: else:
result: list[RegExp] = [] result: list[RegExp] = []
for sequence_idx in range(len(sequences)): sequence_idx = 0
while sequence_idx < len(sequences):
current_sequence = sequences[sequence_idx] current_sequence = sequences[sequence_idx]
if sequence_idx < len(sequences) - 1: if sequence_idx < len(sequences) - 1:
next_sequence = sequences[sequence_idx + 1] next_sequence = sequences[sequence_idx + 1]
if isinstance(next_sequence, str) and next_sequence.startswith("*"): if isinstance(next_sequence, str) and next_sequence.startswith(
"*|"
):
next_sequence = next_sequence[2:]
if next_sequence:
sequence_idx += 1
else:
sequence_idx += 2
next_sequence = sequences[sequence_idx]
regex = Or(
ZeroOrMore(RegExParser.parse_regex(current_sequence)),
RegExParser.parse_regex(next_sequence),
)
if regex:
result.append(regex)
elif isinstance(next_sequence, str) and next_sequence.startswith(
"*"
):
next_sequence = next_sequence[1:]
regex = ZeroOrMore(RegExParser.parse_regex(current_sequence)) regex = ZeroOrMore(RegExParser.parse_regex(current_sequence))
if regex: if regex:
result.append(regex) result.append(regex)
elif isinstance(next_sequence, str) and next_sequence.startswith(
"|"
):
next_sequence = next_sequence[1:]
if next_sequence:
sequence_idx += 1
else:
sequence_idx += 2
next_sequence = sequences[sequence_idx]
regex = Or(
RegExParser.parse_regex(current_sequence),
RegExParser.parse_regex(next_sequence),
)
if regex:
result.append(regex)
else: else:
regex = RegExParser.parse_regex(current_sequence) regex = RegExParser.parse_regex(current_sequence)
if regex: if regex:
@ -124,9 +194,10 @@ class RegExParser:
if regex: if regex:
result.append(regex) result.append(regex)
sequence_idx += 1
return Str(result) if len(result) > 1 else result[0] return Str(result) if len(result) > 1 else result[0]
@staticmethod @staticmethod
def get_type(sequence) -> RegExp: def get_type(sequence) -> RegExp:
regex: RegExp regex: RegExp
@ -154,17 +225,9 @@ class RegExParser:
return regex return regex
if __name__ == "__main__": if __name__ == "__main__":
test_cases = [ test_cases = ["((aa)|ab)*|a"]
"",
"(",
"(hi!",
")(",
"a|t|y",
"a**",
"]K\nBYg<y)eoqj9+~Nim,$#-LkOM\\0E6zxC1^:WfD4A/v`@w.%J8_s(x|[\rI",
"((aa)|ab)*|a"
]
for test_case in test_cases: for test_case in test_cases:
print("---------------------") print("---------------------")