diff options
Diffstat (limited to 'nfa/src/default')
-rw-r--r-- | nfa/src/default/mod.rs | 121 | ||||
-rw-r--r-- | nfa/src/default/nfa.rs | 120 | ||||
-rw-r--r-- | nfa/src/default/regex.rs | 575 |
3 files changed, 699 insertions, 117 deletions
diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index 805540b..b9ee398 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,123 +1,10 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa]. -//! -//! It is used as the default implementation. +//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`Regex`][super::Regex]. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +pub mod nfa; -use super::{error::Error, Nfa, Regex}; - -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. -// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. - -#[non_exhaustive] -#[derive(Debug)] -/// Default NFA implementation. -pub struct DefaultNFA<T: GraphLabel> { - graph: DLGraph<T>, -} - -impl<T: GraphLabel> Default for DefaultNFA<T> { - fn default() -> Self { - let graph = Default::default(); - Self { graph } - } -} - -impl<T: GraphLabel> Graph for DefaultNFA<T> { - type Iter<'a> = <DLGraph<T> as Graph>::Iter<'a> where T: 'a; - - #[inline] - fn is_empty(&self) -> bool { - self.graph.is_empty() - } - - #[inline] - fn nodes_len(&self) -> usize { - self.graph.nodes_len() - } - - #[inline] - fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GError> { - self.graph.children_of(node_id) - } - - #[inline] - fn degree(&self, node_id: usize) -> Result<usize, GError> { - self.graph.degree(node_id) - } - - #[inline] - fn is_empty_node(&self, node_id: usize) -> Result<bool, GError> { - self.graph.is_empty_node(node_id) - } - - #[inline] - fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { - self.graph.has_edge(source, target) - } -} - -impl<T: GraphLabel> LabelGraph<T> for DefaultNFA<T> { - type Iter<'a> = <DLGraph<T> as LabelGraph<T>>::Iter<'a> where T: 'a; - - type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a; - - // TODO: Return the label from the contained regular language. - #[inline] - fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> { - if self.has_node(node_id) { - todo!() - } else { - Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) - } - } - - #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, GError> { - self.graph.edge_label(source, target) - } - - #[inline] - fn find_children_with_label( - &self, - node_id: usize, - label: &T, - ) -> Result<<Self as LabelGraph<T>>::Iter<'_>, GError> { - self.graph.find_children_with_label(node_id, label) - } - - #[inline] - fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> { - self.graph.labels_of(node_id) - } -} - -impl<T: GraphLabel> Nfa<T> for DefaultNFA<T> { - #[allow(unused)] - fn to_nfa(regex: impl Regex<T>) -> Self { - todo!() - } - - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() - } - - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() - } - - fn nulling(&mut self) -> Result<(), Error> { - todo!() - } -} +pub mod regex; #[cfg(test)] mod default_nfa_test {} diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs new file mode 100644 index 0000000..3c2bd83 --- /dev/null +++ b/nfa/src/default/nfa.rs @@ -0,0 +1,120 @@ +//! This file provides a default implementation of NFA. + +// TODO: Store the regular expression in the NFA as well. +// +// The current focus of the project is to understand the growth rate +// of the algorithm, to know whether I made a mistake in the previous +// iteration of the implementation, or the algorithm is not as fast as +// the author estimated, which is not quite likely, of course. +// +// Thus I shall establish a friendly interface that allows me to view +// and debug the atomic languages and the languages, transparently. + +use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; + +use crate::{error::Error, Nfa, Regex}; + +use std::fmt::Display; + +#[non_exhaustive] +#[derive(Debug)] +/// Default NFA implementation. +pub struct DefaultNFA<T: GraphLabel + Display> { + graph: DLGraph<T>, +} + +impl<T: GraphLabel + Display> Default for DefaultNFA<T> { + fn default() -> Self { + Self { + graph: Default::default(), + } + } +} + +impl<T: GraphLabel + Display> Graph for DefaultNFA<T> { + type Iter<'a> = <DLGraph<T> as Graph>::Iter<'a> where T: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result<usize, GError> { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result<bool, GError> { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { + self.graph.has_edge(source, target) + } +} + +impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { + type Iter<'a> = <DLGraph<T> as LabelGraph<T>>::Iter<'a> where T: 'a; + + type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a; + + // TODO: Return the label from the contained regular language. + #[inline] + fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> { + if self.has_node(node_id) { + todo!() + } else { + Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) + } + } + + #[inline] + fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, GError> { + self.graph.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &T, + ) -> Result<<Self as LabelGraph<T>>::Iter<'_>, GError> { + self.graph.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> { + self.graph.labels_of(node_id) + } +} + +impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { + #[allow(unused)] + fn to_nfa(regex: impl Regex<T>) -> Self { + todo!() + } + + fn remove_epsilon(&mut self) -> Result<(), Error> { + todo!() + } + + fn remove_dead(&mut self) -> Result<(), Error> { + todo!() + } + + fn nulling(&mut self) -> Result<(), Error> { + todo!() + } +} diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs new file mode 100644 index 0000000..a60f801 --- /dev/null +++ b/nfa/src/default/regex.rs @@ -0,0 +1,575 @@ +//! This file provides a default implementation of Regex. + +use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; + +use crate::{desrec::DesRec, error::Error, Regex}; + +use receme::{algebra::Algebra, catana::Cata}; + +use std::fmt::{Debug, Display}; + +/// The type of a node in a regular expression. +/// +/// # Example +/// +/// If a node has type "Kleene", this means it represents a star +/// construct in a regular expression, and its children are the +/// contents of the star. +/// +/// # Note +/// +/// There is no "concatenation" node type. A concatenation of two +/// nodes is represented as the two nodes being successive children in +/// their common parent node. +/// +/// This is possible because a regular expression has a root node. +/// For the sake of convenience, the root node has type "Or". +#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum RegexType<T: GraphLabel + Display> { + /// A star node is a node with an arbitrary number of repetitions. + Kleene, + /// A plus node is a node with at least one repetition: a+ equals + /// aa* + Plus, + /// An optional node + Optional, + /// An or node means an alternation of its children. + Or, + /// A paren node represents a parenthesis. + Paren, + /// An empty node + #[default] + Empty, + /// A literal node + Lit(T), +} + +/// A default implementation of regular expressions. +#[derive(Debug)] +pub struct DefaultRegex<T: GraphLabel + Display> { + /// The underlying graph is stored using adjacency lists. + graph: ALGraph, + /// The types of the underlying nodes. + types: Vec<RegexType<T>>, +} + +impl<T: GraphLabel + Display> Default for DefaultRegex<T> { + fn default() -> Self { + Self { + graph: Default::default(), + types: Default::default(), + } + } +} + +impl<T: GraphLabel + Display> DefaultRegex<T> { + /// Return the number of elements in this regular expression, + /// counting nodes. + pub fn len(&self) -> usize { + self.types.len() + } + + /// Return true if and only if this regular expression has no + /// nodes. + pub fn is_empty(&self) -> bool { + self.types.is_empty() + } + + /// Add a node as the child of an existing node or as a root. + pub fn add_node( + &mut self, + label: RegexType<T>, + parent: Option<usize>, + ) -> Result<(), ParseError> { + self.graph.extend(parent.iter().copied())?; + self.types.push(label); + + Ok(()) + } +} + +// REVIEW: This may not be needed. +impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S> +where + A: Algebra<T, Vec<T>>, +{ + fn cata(self, mut alg: A) -> T { + let mut results: Vec<Option<T>> = std::iter::repeat_with(Default::default) + .take(self.len()) + .collect(); + + for index in 0..=self.len() { + let algebra_result = { + let results_of_children: Vec<T> = self + .graph + .children_of(index) + .unwrap() + .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap()) + .collect(); + + alg(results_of_children) + }; + + // Artificially use this value to satisfy the compiler. + let _ = std::mem::replace(&mut results[index], Some(algebra_result)); + } + + std::mem::replace(&mut results[0], None).unwrap() + } +} + +impl<T: GraphLabel + Display> Display for DefaultRegex<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Copy, Clone)] + enum StackElement { + Seen(usize), + Unseen(usize), + } + + impl StackElement { + fn index(&self) -> usize { + match self { + Seen(index) => *index, + Unseen(index) => *index, + } + } + + fn is_seen(&self) -> bool { + match self { + Seen(_) => true, + Unseen(_) => false, + } + } + } + + use StackElement::{Seen, Unseen}; + + let mut stack: Vec<StackElement> = Vec::new(); + let mut types = self.types.clone(); + types.push(RegexType::Paren); + + stack.push(Unseen(0)); + + while let Some(top) = stack.pop() { + let node_type = types[top.index()]; + + // TODO: Do not use unwrap here. + + match node_type { + RegexType::Kleene => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "*")?; + } + } + RegexType::Plus => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "+")?; + } + } + RegexType::Optional => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "?")?; + } + } + RegexType::Or => { + if !top.is_seen() { + write!(f, "(")?; + + let len = self.len(); + + stack.push(Unseen(types.len() - 1)); + + for (child_index, child) in self + .graph + .children_of(top.index()) + .unwrap() + .enumerate() + .rev() + { + if child_index != len - 1 && child_index != 0 { + stack.push(Unseen(child)); + stack.push(Seen(top.index())); + } else { + stack.push(Unseen(child)); + } + } + } else { + write!(f, "|")?; + } + } + RegexType::Paren => { + write!(f, ")")?; + } + RegexType::Empty => { + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } + RegexType::Lit(label) => write!(f, "{label}")?, + } + } + + Ok(()) + } +} + +impl<T: GraphLabel + Display> Graph for DefaultRegex<T> { + type Iter<'a> = <ALGraph as Graph>::Iter<'a> + where + Self: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result<usize, GError> { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result<bool, GError> { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { + self.graph.has_edge(source, target) + } +} + +impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> { + #[inline] + fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> { + self.types + .get(node_id) + .copied() + .ok_or(Error::UnknownNode(node_id)) + } +} + +/// An error type for holding parsing errors. +#[derive(Debug)] +pub enum ParseError { + /// Encounter an invalid state. + Invalid, + /// An error from graph operations. + Graph(GError), + /// Encounter an empty stack. + EmptyStack, + /// Encounter a non-single stack at the end. + NonSingleStack, + /// Encounter a stack whose element is out of bounds. + /// + /// The first component is the stack element, while the second the + /// bound. + InvalidStack(usize, usize), + /// Encounter a repetition construct without a preceding element. + InvalidRepetition(usize), + /// Invalid character + InvalidCharacter(char), +} + +impl Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + +impl std::error::Error for ParseError {} + +impl From<GError> for ParseError { + fn from(ge: GError) -> Self { + Self::Graph(ge) + } +} + +/// The direction of parsing. +/// +/// This means whether we want to stay at the same level of +/// parent-child hierarchy, or to become the child, or to climb back +/// to the last parent. +#[derive(Debug, Copy, Clone, Default)] +pub enum ParseDirection { + /// Climb back to the last parent. + Up, + /// Stay at the same level in the hierarchy. + #[default] + Right, + /// Become the child. + Down, +} + +impl Display for ParseDirection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let direction = match self { + ParseDirection::Up => "↑", + ParseDirection::Right => "→", + ParseDirection::Down => "↓", + }; + + write!(f, "{direction}") + } +} + +impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { + type Label = RegexType<T>; + + type Regex = Self; + + type Error = ParseError; + + type Aux = ParseDirection; + + type Scanner<'a> = + Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>; + + fn parse<'a>( + mut input: &'a str, + mut scanner: Self::Scanner<'a>, + post_p: bool, + ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> { + use ParseDirection::*; + use RegexType::*; + + let mut intermediate_stack: Vec<(RegexType<T>, ParseDirection)> = + Vec::with_capacity(input.len()); + + // Classifies the input into a sequence of tokens with + // directions. + while !input.is_empty() { + if let Some((len, label, direction)) = scanner(input)? { + if len == 0 { + break; + } + + input = &input[len..]; + + intermediate_stack.push((label, direction)); + + // If we encounter an opening parenthesis, we add + // another auxiliary instruction. + if matches!((label, direction), (Or, Down)) { + intermediate_stack.push((Empty, Down)); + } + } else { + break; + } + } + + let inter_len = intermediate_stack.len(); + + let mut parents_stack: Vec<usize> = Vec::with_capacity(inter_len + 2); + + parents_stack.push(0); + parents_stack.push(1); + + let mut list_of_children: Vec<Vec<usize>> = std::iter::repeat_with(Vec::new) + .take(inter_len + 2) + .collect(); + + list_of_children[0].push(1); + + let mut types: Vec<RegexType<T>> = vec![Or, Empty]; + + types.extend(intermediate_stack.iter().map(|(label, _direction)| *label)); + + // Converts the sequence of tokens and directions into a + // regular expression. + for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() { + let mut parent: usize; + let mut parent_children: &mut Vec<usize>; + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + match list_of_children.get_mut(parent) { + Some(stack_parent_children) => { + parent_children = stack_parent_children; + + match (label, direction) { + (Paren, Up) => { + // a closing parenthesis does not need + // to be counted as a child + parents_stack.pop(); + // a closing parenthesis jumps out of + // two levels at once + parents_stack.pop(); + } + (Empty, Up) => { + // an upwards pipe + + // first add the current node to the parent of the parent + parents_stack.pop(); + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + if let Some(stack_parent_children) = + list_of_children.get_mut(parent) + { + parent_children = stack_parent_children; + + parent_children.push(index + 2); + } else { + return Err(ParseError::InvalidStack( + parent, + inter_len + 2, + )); + } + } else { + return Err(ParseError::EmptyStack); + } + + // then make the current node the new parent + parents_stack.push(index + 2); + } + (_, Up) => { + parents_stack.pop(); + parent_children.push(index + 2); + } + (_, Right) => { + parent_children.push(index + 2); + } + (_, Down) => { + parents_stack.push(index + 2); + parent_children.push(index + 2); + } + } + } + None => return Err(ParseError::InvalidStack(parent, inter_len)), + } + } else { + // There are unbalanced closing parentheses. + return Err(ParseError::EmptyStack); + } + + // A special handling of repetition constructs as postfix + // operators: it swaps with the preceding element. + if post_p { + match label { + Kleene | Plus | Optional => { + // remove the current node from the parent + parent_children.pop(); + + if let Some(preceding) = parent_children.last().copied() { + list_of_children.swap(preceding, index + 2); + + types.swap(preceding, index + 2); + + match list_of_children.get_mut(preceding) { + Some(preceding_children) => { + preceding_children.push(index + 2); + } + None => { + return Err(ParseError::InvalidStack(preceding, inter_len + 2)) + } + } + } else { + return Err(ParseError::InvalidRepetition(index)); + } + } + _ => {} + } + } + } + + // There are unbalanced opening parentheses. + if parents_stack.len() != 2 { + return Err(ParseError::NonSingleStack); + } + + let graph = list_of_children.into(); + + let result = DefaultRegex { graph, types }; + + Ok(Some((result, input))) + } +} + +#[cfg(test)] +mod test_des_rec { + use super::*; + + fn test_scanner( + input: &str, + ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + #[test] + fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> { + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + if let Some((regex, remain)) = + DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)? + { + println!("regex = {regex}"); + println!("remain = {remain}"); + + println!("regex length = {}", regex.len()); + + Ok(()) + } else { + unreachable!() + } + } +} |