diff options
author | JSDurand <mmemmew@gmail.com> | 2022-12-14 23:48:22 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2022-12-14 23:48:22 +0800 |
commit | 9f1c88b863e247da3cd60d2792a7a13b18e25e53 (patch) | |
tree | d29c0e19793a88a1de6898fdfd2a376fca21378f /nfa/src/default/regex.rs | |
parent | cb7bcfad4ab0041aaf3fde3185e27ee46bb37788 (diff) |
a temporary check point
just to save things in a commit
Diffstat (limited to 'nfa/src/default/regex.rs')
-rw-r--r-- | nfa/src/default/regex.rs | 575 |
1 files changed, 575 insertions, 0 deletions
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs new file mode 100644 index 0000000..a60f801 --- /dev/null +++ b/nfa/src/default/regex.rs @@ -0,0 +1,575 @@ +//! This file provides a default implementation of Regex. + +use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; + +use crate::{desrec::DesRec, error::Error, Regex}; + +use receme::{algebra::Algebra, catana::Cata}; + +use std::fmt::{Debug, Display}; + +/// The type of a node in a regular expression. +/// +/// # Example +/// +/// If a node has type "Kleene", this means it represents a star +/// construct in a regular expression, and its children are the +/// contents of the star. +/// +/// # Note +/// +/// There is no "concatenation" node type. A concatenation of two +/// nodes is represented as the two nodes being successive children in +/// their common parent node. +/// +/// This is possible because a regular expression has a root node. +/// For the sake of convenience, the root node has type "Or". +#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum RegexType<T: GraphLabel + Display> { + /// A star node is a node with an arbitrary number of repetitions. + Kleene, + /// A plus node is a node with at least one repetition: a+ equals + /// aa* + Plus, + /// An optional node + Optional, + /// An or node means an alternation of its children. + Or, + /// A paren node represents a parenthesis. + Paren, + /// An empty node + #[default] + Empty, + /// A literal node + Lit(T), +} + +/// A default implementation of regular expressions. +#[derive(Debug)] +pub struct DefaultRegex<T: GraphLabel + Display> { + /// The underlying graph is stored using adjacency lists. + graph: ALGraph, + /// The types of the underlying nodes. + types: Vec<RegexType<T>>, +} + +impl<T: GraphLabel + Display> Default for DefaultRegex<T> { + fn default() -> Self { + Self { + graph: Default::default(), + types: Default::default(), + } + } +} + +impl<T: GraphLabel + Display> DefaultRegex<T> { + /// Return the number of elements in this regular expression, + /// counting nodes. + pub fn len(&self) -> usize { + self.types.len() + } + + /// Return true if and only if this regular expression has no + /// nodes. + pub fn is_empty(&self) -> bool { + self.types.is_empty() + } + + /// Add a node as the child of an existing node or as a root. + pub fn add_node( + &mut self, + label: RegexType<T>, + parent: Option<usize>, + ) -> Result<(), ParseError> { + self.graph.extend(parent.iter().copied())?; + self.types.push(label); + + Ok(()) + } +} + +// REVIEW: This may not be needed. +impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S> +where + A: Algebra<T, Vec<T>>, +{ + fn cata(self, mut alg: A) -> T { + let mut results: Vec<Option<T>> = std::iter::repeat_with(Default::default) + .take(self.len()) + .collect(); + + for index in 0..=self.len() { + let algebra_result = { + let results_of_children: Vec<T> = self + .graph + .children_of(index) + .unwrap() + .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap()) + .collect(); + + alg(results_of_children) + }; + + // Artificially use this value to satisfy the compiler. + let _ = std::mem::replace(&mut results[index], Some(algebra_result)); + } + + std::mem::replace(&mut results[0], None).unwrap() + } +} + +impl<T: GraphLabel + Display> Display for DefaultRegex<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Copy, Clone)] + enum StackElement { + Seen(usize), + Unseen(usize), + } + + impl StackElement { + fn index(&self) -> usize { + match self { + Seen(index) => *index, + Unseen(index) => *index, + } + } + + fn is_seen(&self) -> bool { + match self { + Seen(_) => true, + Unseen(_) => false, + } + } + } + + use StackElement::{Seen, Unseen}; + + let mut stack: Vec<StackElement> = Vec::new(); + let mut types = self.types.clone(); + types.push(RegexType::Paren); + + stack.push(Unseen(0)); + + while let Some(top) = stack.pop() { + let node_type = types[top.index()]; + + // TODO: Do not use unwrap here. + + match node_type { + RegexType::Kleene => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "*")?; + } + } + RegexType::Plus => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "+")?; + } + } + RegexType::Optional => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "?")?; + } + } + RegexType::Or => { + if !top.is_seen() { + write!(f, "(")?; + + let len = self.len(); + + stack.push(Unseen(types.len() - 1)); + + for (child_index, child) in self + .graph + .children_of(top.index()) + .unwrap() + .enumerate() + .rev() + { + if child_index != len - 1 && child_index != 0 { + stack.push(Unseen(child)); + stack.push(Seen(top.index())); + } else { + stack.push(Unseen(child)); + } + } + } else { + write!(f, "|")?; + } + } + RegexType::Paren => { + write!(f, ")")?; + } + RegexType::Empty => { + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } + RegexType::Lit(label) => write!(f, "{label}")?, + } + } + + Ok(()) + } +} + +impl<T: GraphLabel + Display> Graph for DefaultRegex<T> { + type Iter<'a> = <ALGraph as Graph>::Iter<'a> + where + Self: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result<usize, GError> { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result<bool, GError> { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { + self.graph.has_edge(source, target) + } +} + +impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> { + #[inline] + fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> { + self.types + .get(node_id) + .copied() + .ok_or(Error::UnknownNode(node_id)) + } +} + +/// An error type for holding parsing errors. +#[derive(Debug)] +pub enum ParseError { + /// Encounter an invalid state. + Invalid, + /// An error from graph operations. + Graph(GError), + /// Encounter an empty stack. + EmptyStack, + /// Encounter a non-single stack at the end. + NonSingleStack, + /// Encounter a stack whose element is out of bounds. + /// + /// The first component is the stack element, while the second the + /// bound. + InvalidStack(usize, usize), + /// Encounter a repetition construct without a preceding element. + InvalidRepetition(usize), + /// Invalid character + InvalidCharacter(char), +} + +impl Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + +impl std::error::Error for ParseError {} + +impl From<GError> for ParseError { + fn from(ge: GError) -> Self { + Self::Graph(ge) + } +} + +/// The direction of parsing. +/// +/// This means whether we want to stay at the same level of +/// parent-child hierarchy, or to become the child, or to climb back +/// to the last parent. +#[derive(Debug, Copy, Clone, Default)] +pub enum ParseDirection { + /// Climb back to the last parent. + Up, + /// Stay at the same level in the hierarchy. + #[default] + Right, + /// Become the child. + Down, +} + +impl Display for ParseDirection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let direction = match self { + ParseDirection::Up => "↑", + ParseDirection::Right => "→", + ParseDirection::Down => "↓", + }; + + write!(f, "{direction}") + } +} + +impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { + type Label = RegexType<T>; + + type Regex = Self; + + type Error = ParseError; + + type Aux = ParseDirection; + + type Scanner<'a> = + Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>; + + fn parse<'a>( + mut input: &'a str, + mut scanner: Self::Scanner<'a>, + post_p: bool, + ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> { + use ParseDirection::*; + use RegexType::*; + + let mut intermediate_stack: Vec<(RegexType<T>, ParseDirection)> = + Vec::with_capacity(input.len()); + + // Classifies the input into a sequence of tokens with + // directions. + while !input.is_empty() { + if let Some((len, label, direction)) = scanner(input)? { + if len == 0 { + break; + } + + input = &input[len..]; + + intermediate_stack.push((label, direction)); + + // If we encounter an opening parenthesis, we add + // another auxiliary instruction. + if matches!((label, direction), (Or, Down)) { + intermediate_stack.push((Empty, Down)); + } + } else { + break; + } + } + + let inter_len = intermediate_stack.len(); + + let mut parents_stack: Vec<usize> = Vec::with_capacity(inter_len + 2); + + parents_stack.push(0); + parents_stack.push(1); + + let mut list_of_children: Vec<Vec<usize>> = std::iter::repeat_with(Vec::new) + .take(inter_len + 2) + .collect(); + + list_of_children[0].push(1); + + let mut types: Vec<RegexType<T>> = vec![Or, Empty]; + + types.extend(intermediate_stack.iter().map(|(label, _direction)| *label)); + + // Converts the sequence of tokens and directions into a + // regular expression. + for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() { + let mut parent: usize; + let mut parent_children: &mut Vec<usize>; + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + match list_of_children.get_mut(parent) { + Some(stack_parent_children) => { + parent_children = stack_parent_children; + + match (label, direction) { + (Paren, Up) => { + // a closing parenthesis does not need + // to be counted as a child + parents_stack.pop(); + // a closing parenthesis jumps out of + // two levels at once + parents_stack.pop(); + } + (Empty, Up) => { + // an upwards pipe + + // first add the current node to the parent of the parent + parents_stack.pop(); + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + if let Some(stack_parent_children) = + list_of_children.get_mut(parent) + { + parent_children = stack_parent_children; + + parent_children.push(index + 2); + } else { + return Err(ParseError::InvalidStack( + parent, + inter_len + 2, + )); + } + } else { + return Err(ParseError::EmptyStack); + } + + // then make the current node the new parent + parents_stack.push(index + 2); + } + (_, Up) => { + parents_stack.pop(); + parent_children.push(index + 2); + } + (_, Right) => { + parent_children.push(index + 2); + } + (_, Down) => { + parents_stack.push(index + 2); + parent_children.push(index + 2); + } + } + } + None => return Err(ParseError::InvalidStack(parent, inter_len)), + } + } else { + // There are unbalanced closing parentheses. + return Err(ParseError::EmptyStack); + } + + // A special handling of repetition constructs as postfix + // operators: it swaps with the preceding element. + if post_p { + match label { + Kleene | Plus | Optional => { + // remove the current node from the parent + parent_children.pop(); + + if let Some(preceding) = parent_children.last().copied() { + list_of_children.swap(preceding, index + 2); + + types.swap(preceding, index + 2); + + match list_of_children.get_mut(preceding) { + Some(preceding_children) => { + preceding_children.push(index + 2); + } + None => { + return Err(ParseError::InvalidStack(preceding, inter_len + 2)) + } + } + } else { + return Err(ParseError::InvalidRepetition(index)); + } + } + _ => {} + } + } + } + + // There are unbalanced opening parentheses. + if parents_stack.len() != 2 { + return Err(ParseError::NonSingleStack); + } + + let graph = list_of_children.into(); + + let result = DefaultRegex { graph, types }; + + Ok(Some((result, input))) + } +} + +#[cfg(test)] +mod test_des_rec { + use super::*; + + fn test_scanner( + input: &str, + ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + #[test] + fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> { + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + if let Some((regex, remain)) = + DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)? + { + println!("regex = {regex}"); + println!("remain = {remain}"); + + println!("regex length = {}", regex.len()); + + Ok(()) + } else { + unreachable!() + } + } +} |