//! This file provides a default implementation of Regex. use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; use std::fmt::{Debug, Display}; /// The type of a node in a regular expression. /// /// # Example /// /// If a node has type "Kleene", this means it represents a star /// construct in a regular expression, and its children are the /// contents of the star. /// /// # Note /// /// There is no "concatenation" node type. A concatenation of two /// nodes is represented as the two nodes being successive children in /// their common parent node. /// /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] pub enum RegexType { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals /// aa* Plus, /// An optional node Optional, /// An or node means an alternation of its children. Or, /// A paren node represents a parenthesis. Paren, /// An empty node #[default] Empty, /// A literal node Lit(T), } /// A default implementation of regular expressions. #[derive(Debug)] pub struct DefaultRegex { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec>, } impl Default for DefaultRegex { fn default() -> Self { Self { graph: Default::default(), types: Default::default(), } } } impl DefaultRegex { /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { self.types.len() } /// Return true if and only if this regular expression has no /// nodes. pub fn is_empty(&self) -> bool { self.types.is_empty() } /// Add a node as the child of an existing node or as a root. pub fn add_node( &mut self, label: RegexType, parent: Option, ) -> Result<(), ParseError> { self.graph.extend(parent.iter().copied())?; self.types.push(label); Ok(()) } } // REVIEW: This may not be needed. impl Cata, A> for &DefaultRegex where A: Algebra>, { fn cata(self, mut alg: A) -> T { let mut results: Vec> = std::iter::repeat_with(Default::default) .take(self.len()) .collect(); for index in 0..=self.len() { let algebra_result = { let results_of_children: Vec = self .graph .children_of(index) .unwrap() .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap()) .collect(); alg(results_of_children) }; // Artificially use this value to satisfy the compiler. let _ = std::mem::replace(&mut results[index], Some(algebra_result)); } std::mem::replace(&mut results[0], None).unwrap() } } impl Display for DefaultRegex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { #[derive(Copy, Clone)] enum StackElement { Seen(usize), Unseen(usize), } impl StackElement { fn index(&self) -> usize { match self { Seen(index) => *index, Unseen(index) => *index, } } fn is_seen(&self) -> bool { match self { Seen(_) => true, Unseen(_) => false, } } } use StackElement::{Seen, Unseen}; let mut stack: Vec = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); stack.push(Unseen(0)); while let Some(top) = stack.pop() { let node_type = types[top.index()]; // TODO: Do not use unwrap here. match node_type { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(f, "*")?; } } RegexType::Plus => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(f, "+")?; } } RegexType::Optional => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(f, "?")?; } } RegexType::Or => { if !top.is_seen() { write!(f, "(")?; let len = self.len(); stack.push(Unseen(types.len() - 1)); for (child_index, child) in self .graph .children_of(top.index()) .unwrap() .enumerate() .rev() { if child_index != len - 1 && child_index != 0 { stack.push(Unseen(child)); stack.push(Seen(top.index())); } else { stack.push(Unseen(child)); } } } else { write!(f, "|")?; } } RegexType::Paren => { write!(f, ")")?; } RegexType::Empty => { stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } RegexType::Lit(label) => write!(f, "{label}")?, } } Ok(()) } } impl Graph for DefaultRegex { type Iter<'a> = ::Iter<'a> where Self: 'a; #[inline] fn is_empty(&self) -> bool { self.graph.is_empty() } #[inline] fn nodes_len(&self) -> usize { self.graph.nodes_len() } #[inline] fn children_of(&self, node_id: usize) -> Result, GError> { self.graph.children_of(node_id) } #[inline] fn degree(&self, node_id: usize) -> Result { self.graph.degree(node_id) } #[inline] fn is_empty_node(&self, node_id: usize) -> Result { self.graph.is_empty_node(node_id) } #[inline] fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } } impl Regex> for DefaultRegex { #[inline] fn vertex_label(&self, node_id: usize) -> Result, Error> { self.types .get(node_id) .copied() .ok_or(Error::UnknownNode(node_id)) } } /// An error type for holding parsing errors. #[derive(Debug)] pub enum ParseError { /// Encounter an invalid state. Invalid, /// An error from graph operations. Graph(GError), /// Encounter an empty stack. EmptyStack, /// Encounter a non-single stack at the end. NonSingleStack, /// Encounter a stack whose element is out of bounds. /// /// The first component is the stack element, while the second the /// bound. InvalidStack(usize, usize), /// Encounter a repetition construct without a preceding element. InvalidRepetition(usize), /// Invalid character InvalidCharacter(char), } impl Display for ParseError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{self:?}") } } impl std::error::Error for ParseError {} impl From for ParseError { fn from(ge: GError) -> Self { Self::Graph(ge) } } /// The direction of parsing. /// /// This means whether we want to stay at the same level of /// parent-child hierarchy, or to become the child, or to climb back /// to the last parent. #[derive(Debug, Copy, Clone, Default)] pub enum ParseDirection { /// Climb back to the last parent. Up, /// Stay at the same level in the hierarchy. #[default] Right, /// Become the child. Down, } impl Display for ParseDirection { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let direction = match self { ParseDirection::Up => "↑", ParseDirection::Right => "→", ParseDirection::Down => "↓", }; write!(f, "{direction}") } } impl DesRec for DefaultRegex { type Label = RegexType; type Regex = Self; type Error = ParseError; type Aux = ParseDirection; type Scanner<'a> = Box Result, Self::Error>>; fn parse<'a>( mut input: &'a str, mut scanner: Self::Scanner<'a>, post_p: bool, ) -> Result, &'a str)>, Self::Error> { use ParseDirection::*; use RegexType::*; let mut intermediate_stack: Vec<(RegexType, ParseDirection)> = Vec::with_capacity(input.len()); // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { if let Some((len, label, direction)) = scanner(input)? { if len == 0 { break; } input = &input[len..]; intermediate_stack.push((label, direction)); // If we encounter an opening parenthesis, we add // another auxiliary instruction. if matches!((label, direction), (Or, Down)) { intermediate_stack.push((Empty, Down)); } } else { break; } } let inter_len = intermediate_stack.len(); let mut parents_stack: Vec = Vec::with_capacity(inter_len + 2); parents_stack.push(0); parents_stack.push(1); let mut list_of_children: Vec> = std::iter::repeat_with(Vec::new) .take(inter_len + 2) .collect(); list_of_children[0].push(1); let mut types: Vec> = vec![Or, Empty]; types.extend(intermediate_stack.iter().map(|(label, _direction)| *label)); // Converts the sequence of tokens and directions into a // regular expression. for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() { let mut parent: usize; let mut parent_children: &mut Vec; if let Some(parent_stack_parent) = parents_stack.last().copied() { parent = parent_stack_parent; match list_of_children.get_mut(parent) { Some(stack_parent_children) => { parent_children = stack_parent_children; match (label, direction) { (Paren, Up) => { // a closing parenthesis does not need // to be counted as a child parents_stack.pop(); // a closing parenthesis jumps out of // two levels at once parents_stack.pop(); } (Empty, Up) => { // an upwards pipe // first add the current node to the parent of the parent parents_stack.pop(); if let Some(parent_stack_parent) = parents_stack.last().copied() { parent = parent_stack_parent; if let Some(stack_parent_children) = list_of_children.get_mut(parent) { parent_children = stack_parent_children; parent_children.push(index + 2); } else { return Err(ParseError::InvalidStack( parent, inter_len + 2, )); } } else { return Err(ParseError::EmptyStack); } // then make the current node the new parent parents_stack.push(index + 2); } (_, Up) => { parents_stack.pop(); parent_children.push(index + 2); } (_, Right) => { parent_children.push(index + 2); } (_, Down) => { parents_stack.push(index + 2); parent_children.push(index + 2); } } } None => return Err(ParseError::InvalidStack(parent, inter_len)), } } else { // There are unbalanced closing parentheses. return Err(ParseError::EmptyStack); } // A special handling of repetition constructs as postfix // operators: it swaps with the preceding element. if post_p { match label { Kleene | Plus | Optional => { // remove the current node from the parent parent_children.pop(); if let Some(preceding) = parent_children.last().copied() { list_of_children.swap(preceding, index + 2); types.swap(preceding, index + 2); match list_of_children.get_mut(preceding) { Some(preceding_children) => { preceding_children.push(index + 2); } None => { return Err(ParseError::InvalidStack(preceding, inter_len + 2)) } } } else { return Err(ParseError::InvalidRepetition(index)); } } _ => {} } } } // There are unbalanced opening parentheses. if parents_stack.len() != 2 { return Err(ParseError::NonSingleStack); } let graph = list_of_children.into(); let result = DefaultRegex { graph, types }; Ok(Some((result, input))) } } #[cfg(test)] mod test_des_rec { use super::*; fn test_scanner( input: &str, ) -> Result, ParseDirection)>, ParseError> { use ParseDirection::*; use RegexType::*; if let Some(first) = input.chars().next() { match first { '*' => Ok(Some((1, Kleene, Right))), '+' => Ok(Some((1, Plus, Right))), '?' => Ok(Some((1, Optional, Right))), '|' => Ok(Some((1, Empty, Up))), '(' => Ok(Some((1, Or, Down))), ')' => Ok(Some((1, Paren, Up))), ' '..='~' => Ok(Some((1, Lit(first), Right))), _ => Err(ParseError::InvalidCharacter(first)), } } else { Ok(None) } } #[test] fn test_des_rec() -> Result<(), Box> { let input_string = "a*b?c+|(d*| +)?".to_owned(); if let Some((regex, remain)) = DefaultRegex::::parse(&input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); Ok(()) } else { unreachable!() } } }