//! This file provides a default implementation of Regex. use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; use std::{ collections::HashMap, fmt::{Debug, Display, Write}, marker::PhantomData, }; /// The type of a node in a regular expression. /// /// # Example /// /// If a node has type "Kleene", this means it represents a star /// construct in a regular expression, and its children are the /// contents of the star. /// /// # Note /// /// There is no "concatenation" node type. A concatenation of two /// nodes is represented as the two nodes being successive children in /// their common parent node. /// /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] pub enum RegexType { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals /// aa* Plus, /// An optional node Optional, /// An or node means an alternation of its children. Or, /// A paren node represents a parenthesis. Paren, /// An empty node #[default] Empty, /// A literal node Lit(T), } impl Display for RegexType { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { RegexType::Kleene => write!(f, "*"), RegexType::Plus => write!(f, "+"), RegexType::Optional => write!(f, "?"), RegexType::Or => write!(f, "|"), RegexType::Paren => write!(f, "()"), RegexType::Empty => write!(f, "empty"), RegexType::Lit(value) => write!(f, "{value}"), } } } /// A default implementation of regular expressions. #[derive(Debug, Clone)] pub struct DefaultRegex { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec>, /// The root of the graph. /// /// If it is None, it indicates the regular expression is empty. root: Option, } impl Default for DefaultRegex { fn default() -> Self { Self { graph: Default::default(), types: Default::default(), root: Default::default(), } } } impl DefaultRegex { /// Set the root of the regular expression. #[inline] pub fn set_root(&mut self, root: Option) { self.root = root; } /// Construct a regular expression from a raw adjacency list graph /// and an array of labels. /// /// # Error /// /// If the graph contains cycles, this function will return an /// error. This check is added to make sure that the regular /// expression contains no cycles, otherwise operations with /// regular expressions might enter infinite loops later. pub fn new(graph: ALGraph, types: Vec>) -> Result { if graph.contains_cycles()? { Err(Error::Cycle) } else { Ok(Self { graph, types, root: Some(0), }) } } /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { self.types.len() } /// Return true if and only if this regular expression has no /// nodes. pub fn is_empty(&self) -> bool { self.types.is_empty() } /// Add a node as the child of an existing node or as a root. pub fn add_node( &mut self, label: RegexType, parent: Option, ) -> Result<(), ParseError> { self.graph.extend(parent.iter().copied())?; self.types.push(label); Ok(()) } /// Return the internal graph. /// /// Normally this should not be used. pub fn internal_graph(&self) -> ALGraph { self.graph.clone() } /// Return the internal types array. /// /// Normally this should not be used. pub fn internal_types(&self) -> Vec> { self.types.clone() } /// Return the array of parents. /// /// The element at index N of the returned array is the parent of /// that N-th element. If an element has no parents, a `None` is /// placed at its place. /// /// # Error /// /// If some edge points to an invalid node, an erro will be /// returned. /// /// Also, if the graph contains cycles, an error is also returned. pub fn parents_array(&self) -> Result>, Error> { if self.graph.contains_cycles().map_err(|e| match e { GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n), _ => unreachable!(), })? { return Err(Error::Cycle); } let len = self.len(); let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect(); for source in self.graph.nodes() { for (index, target) in self .graph .children_of(source) .map_err(|_| Error::UnknownNode(source))? .enumerate() { result .get_mut(target) .ok_or(Error::UnknownNode(target))? .replace((source, index)); } } Ok(result) } } // REVIEW: This may not be needed. impl Cata, A> for &DefaultRegex where A: Algebra>, { fn cata(self, mut alg: A) -> T { let mut results: Vec> = std::iter::repeat_with(Default::default) .take(self.len()) .collect(); for index in 0..=self.len() { let algebra_result = { let results_of_children: Vec = self .graph .children_of(index) .unwrap() .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap()) .collect(); alg(results_of_children) }; // Artificially use this value to satisfy the compiler. let _ = std::mem::replace(&mut results[index], Some(algebra_result)); } std::mem::replace(&mut results[0], None).unwrap() } } impl DefaultRegex { /// Use a function to format the labels of the regular expressions /// and format the entire regular expression with this aid. pub fn to_string_with(&self, mut f: F) -> Result where F: FnMut(T) -> String, { #[derive(Copy, Clone)] enum StackElement { Seen(usize), Unseen(usize), } impl StackElement { fn index(&self) -> usize { match self { Seen(index) => *index, Unseen(index) => *index, } } fn is_seen(self) -> bool { matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; let mut result = String::new(); let mut stack: Vec = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); stack.push(Unseen(0)); while let Some(top) = stack.pop() { let node_type = types.get(top.index()).unwrap(); match node_type { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); if self.degree(top.index()).unwrap() > 1 { write!(result, "(")?; stack.push(Unseen(types.len() - 1)); } stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(result, "*")?; } } RegexType::Plus => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(result, "+")?; } } RegexType::Optional => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { write!(result, "(")?; let len = self.len(); stack.push(Unseen(types.len() - 1)); for (child_index, child) in self .graph .children_of(top.index()) .unwrap() .enumerate() .rev() { if child_index != len - 1 && child_index != 0 { stack.push(Unseen(child)); stack.push(Seen(top.index())); } else { stack.push(Unseen(child)); } } } else { write!(result, "|")?; } } RegexType::Paren => { write!(result, ")")?; } RegexType::Empty => { stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); if self.graph.is_empty_node(top.index()).unwrap() { write!(result, "ε")?; } } RegexType::Lit(label) => write!(result, "{}", f(*label))?, } } Ok(result) } /// Use a function to format the labels of the regular expressions /// and format the entire regular expression with this aid, with a /// dot at a specified position. pub fn to_string_with_dot(&self, mut f: F, dot_pos: usize) -> Result where F: FnMut(T) -> String, { #[derive(Copy, Clone)] enum StackElement { Seen(usize), Unseen(usize), } impl StackElement { fn index(&self) -> usize { match self { Seen(index) => *index, Unseen(index) => *index, } } fn is_seen(self) -> bool { matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; let mut result = String::new(); let mut stack: Vec = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); stack.push(Unseen(0)); while let Some(top) = stack.pop() { let node_type = types.get(top.index()).unwrap(); match node_type { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); if self.degree(top.index()).unwrap() > 1 { write!(result, "(")?; stack.push(Unseen(types.len() - 1)); } stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(result, "*")?; } } RegexType::Plus => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(result, "+")?; } } RegexType::Optional => { if !top.is_seen() { stack.push(Seen(top.index())); stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); } else { write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { write!(result, "(")?; let len = self.len(); stack.push(Unseen(types.len() - 1)); for (child_index, child) in self .graph .children_of(top.index()) .unwrap() .enumerate() .rev() { if child_index != len - 1 && child_index != 0 { stack.push(Unseen(child)); stack.push(Seen(top.index())); } else { stack.push(Unseen(child)); } } } else { write!(result, "|")?; } } RegexType::Paren => { write!(result, ")")?; } RegexType::Empty => { stack.extend( self.graph .children_of(top.index()) .unwrap() .map(Unseen) .rev(), ); if self.graph.is_empty_node(top.index()).unwrap() { write!(result, "ε")?; } } RegexType::Lit(label) => write!(result, "{}", f(*label))?, } if top.index() == dot_pos { write!(result, " · ")?; } } Ok(result) } } impl Display for DefaultRegex { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.to_string_with(|t| format!("{t}"))?) } } impl Graph for DefaultRegex { type Iter<'a> = ::Iter<'a> where Self: 'a; #[inline] fn is_empty(&self) -> bool { self.graph.is_empty() } #[inline] fn nodes_len(&self) -> usize { self.graph.nodes_len() } #[inline] fn children_of(&self, node_id: usize) -> Result, GError> { self.graph.children_of(node_id) } #[inline] fn degree(&self, node_id: usize) -> Result { self.graph.degree(node_id) } #[inline] fn is_empty_node(&self, node_id: usize) -> Result { self.graph.is_empty_node(node_id) } #[inline] fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } #[inline] fn replace_by_builder(&mut self, _builder: impl graph::Builder) { unimplemented!() } } impl Regex> for DefaultRegex { /// Return the root of the regular expression. #[inline] fn root(&self) -> Option { self.root } #[inline] fn vertex_label(&self, node_id: usize) -> Result, Error> { self.types .get(node_id) .copied() .ok_or(Error::UnknownNode(node_id)) } } /// An error type for holding parsing errors. #[derive(Debug, Copy, Clone)] pub enum ParseError { /// A cycle is encountered. Cycle, /// Encounter an invalid state. Invalid, /// An error from graph operations. Graph(GError), /// Encounter an empty stack. EmptyStack, /// Encounter a non-single stack at the end. NonSingleStack, /// Encounter a stack whose element is out of bounds. /// /// The first component is the stack element, while the second the /// bound. InvalidStack(usize, usize), /// Encounter a repetition construct without a preceding element. InvalidRepetition(usize), /// Invalid character InvalidCharacter(char), } impl Display for ParseError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{self:?}") } } impl std::error::Error for ParseError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { if let ParseError::Graph(gerr) = self { Some(gerr) } else { None } } } impl From for ParseError { fn from(ge: GError) -> Self { Self::Graph(ge) } } /// The direction of parsing. /// /// This means whether we want to stay at the same level of /// parent-child hierarchy, or to become the child, or to climb back /// to the last parent. #[derive(Debug, Copy, Clone, Default)] pub enum ParseDirection { /// Climb back to the last parent. Up, /// Stay at the same level in the hierarchy. #[default] Right, /// Become the child. Down, } impl Display for ParseDirection { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let direction = match self { ParseDirection::Up => "↑", ParseDirection::Right => "→", ParseDirection::Down => "↓", }; write!(f, "{direction}") } } /// A default recursive descent parser for regular expressions of /// terminals or non-terminals. #[derive(Debug, Clone)] pub struct DefaultRegParser { ter_map: HashMap, non_map: HashMap, _phantom: PhantomData, } impl DefaultRegParser { /// Query if a terminal or a non-terminal is already found. /// /// If found, return the associated index of the terminal or /// non-terminal. pub fn query(&self, tnt: &str, terminal_p: bool) -> Option { if terminal_p { self.ter_map.get(tnt).copied() } else { self.non_map.get(tnt).copied() } } /// Add a terminal or a non-terminal. pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) { if terminal_p { let ter_len = self.ter_map.len(); self.ter_map.entry(tnt.to_owned()).or_insert(ter_len); } else { let non_len = self.non_map.len(); self.non_map.entry(tnt.to_owned()).or_insert(non_len); } } } impl Default for DefaultRegParser { fn default() -> Self { Self { ter_map: Default::default(), non_map: Default::default(), _phantom: PhantomData, } } } impl DesRec for DefaultRegParser { type Label = RegexType; type Regex = DefaultRegex; type Error = ParseError; type Inter = ParseDirection; type Scanner<'a, 'b> = Box< dyn FnMut( &'b Self, &'a str, ) -> Result, Self::Error>, > where RegexType:'b; fn parse<'a, 'b>( &'b self, mut input: &'a str, mut scanner: Self::Scanner<'a, 'b>, post_p: bool, ) -> Result, &'a str)>, Self::Error> where Self::Label: 'b, { use ParseDirection::*; use RegexType::*; let mut intermediate_stack: Vec<(RegexType, ParseDirection)> = Vec::with_capacity(input.len()); // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { if let Some((len, label, direction)) = scanner(self, input)? { if len == 0 { break; } input = &input[len..]; intermediate_stack.push((label, direction)); // If we encounter an opening parenthesis, we add // another auxiliary instruction. if matches!((label, direction), (Or, Down)) { intermediate_stack.push((Empty, Down)); } } else { break; } } let inter_len = intermediate_stack.len(); let mut parents_stack: Vec = Vec::with_capacity(inter_len + 2); parents_stack.push(0); parents_stack.push(1); let mut list_of_children: Vec> = std::iter::repeat_with(Vec::new) .take(inter_len + 2) .collect(); list_of_children[0].push(1); let mut types: Vec> = vec![Or, Empty]; types.extend(intermediate_stack.iter().map(|(label, _direction)| *label)); // Converts the sequence of tokens and directions into a // regular expression. for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() { let mut parent: usize; let mut parent_children: &mut Vec; if let Some(parent_stack_parent) = parents_stack.last().copied() { parent = parent_stack_parent; match list_of_children.get_mut(parent) { Some(stack_parent_children) => { parent_children = stack_parent_children; match (label, direction) { (Paren, Up) => { // a closing parenthesis does not need // to be counted as a child parents_stack.pop(); // a closing parenthesis jumps out of // two levels at once parents_stack.pop(); } (Empty, Up) => { // an upwards pipe // first add the current node to the parent of the parent parents_stack.pop(); if let Some(parent_stack_parent) = parents_stack.last().copied() { parent = parent_stack_parent; if let Some(stack_parent_children) = list_of_children.get_mut(parent) { parent_children = stack_parent_children; parent_children.push(index + 2); } else { return Err(ParseError::InvalidStack( parent, inter_len + 2, )); } } else { return Err(ParseError::EmptyStack); } // then make the current node the new parent parents_stack.push(index + 2); } (_, Up) => { parents_stack.pop(); parent_children.push(index + 2); } (_, Right) => { parent_children.push(index + 2); } (_, Down) => { parents_stack.push(index + 2); parent_children.push(index + 2); } } } None => return Err(ParseError::InvalidStack(parent, inter_len)), } } else { // There are unbalanced closing parentheses. return Err(ParseError::EmptyStack); } // A special handling of repetition constructs as postfix // operators: it swaps with the preceding element. if post_p { match label { Kleene | Plus | Optional => { // remove the current node from the parent parent_children.pop(); if let Some(preceding) = parent_children.last().copied() { list_of_children.swap(preceding, index + 2); types.swap(preceding, index + 2); match list_of_children.get_mut(preceding) { Some(preceding_children) => { preceding_children.push(index + 2); } None => { return Err(ParseError::InvalidStack(preceding, inter_len + 2)) } } } else { return Err(ParseError::InvalidRepetition(index)); } } _ => {} } } } // There are unbalanced opening parentheses. if parents_stack.len() != 2 { return Err(ParseError::NonSingleStack); } let graph = list_of_children.into(); // Check there are no cycles let result = DefaultRegex::new(graph, types).map_err(|e| match e { Error::Graph(ge) => ParseError::Graph(ge), Error::Cycle => ParseError::Cycle, _ => unreachable!(), })?; Ok(Some((result, input))) } } #[cfg(test)] mod test_des_rec { use super::*; use crate::desrec::DesRec; fn test_scanner<'a, 'b, T>( _parser: &'b DefaultRegParser, input: &'a str, ) -> Result, ParseDirection)>, ParseError> where T: GraphLabel + Display + Debug, T: 'b, { use ParseDirection::*; use RegexType::*; if let Some(first) = input.chars().next() { match first { '*' => Ok(Some((1, Kleene, Right))), '+' => Ok(Some((1, Plus, Right))), '?' => Ok(Some((1, Optional, Right))), '|' => Ok(Some((1, Empty, Up))), '(' => Ok(Some((1, Or, Down))), ')' => Ok(Some((1, Paren, Up))), ' '..='~' => Ok(Some((1, Lit(first), Right))), _ => Err(ParseError::InvalidCharacter(first)), } } else { Ok(None) } } #[test] fn test_des_rec() -> Result<(), Box> { let input_string = "(ade)*b?c+|(d*| +)?".to_owned(); let parser: DefaultRegParser = Default::default(); if let Some((regex, remain)) = DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); let parents = regex.parents_array()?; println!("parents = {parents:?}"); Ok(()) } else { unreachable!() } } #[test] fn test_display() -> Result<(), Box> { use graph::builder::Builder; use RegexType::*; let mut builder = graph::adlist::ALGBuilder::default(); let mut types: Vec> = Vec::with_capacity(4); types.push(Kleene); builder.add_vertex(); types.push(Lit(0)); builder.add_vertex(); builder.add_edge(0, 1, ())?; types.push(Lit(1)); builder.add_vertex(); builder.add_edge(0, 2, ())?; types.push(Lit(2)); builder.add_vertex(); builder.add_edge(0, 3, ())?; let graph = builder.build(); let regex = DefaultRegex::new(graph, types)?; println!("regex = {regex}"); Ok(()) } }