From 9f1c88b863e247da3cd60d2792a7a13b18e25e53 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 14 Dec 2022 23:48:22 +0800 Subject: a temporary check point just to save things in a commit --- nfa/src/default/mod.rs | 121 +--------- nfa/src/default/nfa.rs | 120 ++++++++++ nfa/src/default/regex.rs | 575 +++++++++++++++++++++++++++++++++++++++++++++++ nfa/src/desrec.rs | 50 +++++ nfa/src/error.rs | 6 + nfa/src/lib.rs | 8 +- 6 files changed, 759 insertions(+), 121 deletions(-) create mode 100644 nfa/src/default/nfa.rs create mode 100644 nfa/src/default/regex.rs create mode 100644 nfa/src/desrec.rs (limited to 'nfa/src') diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index 805540b..b9ee398 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,123 +1,10 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa]. -//! -//! It is used as the default implementation. +//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`Regex`][super::Regex]. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +pub mod nfa; -use super::{error::Error, Nfa, Regex}; - -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. -// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. - -#[non_exhaustive] -#[derive(Debug)] -/// Default NFA implementation. -pub struct DefaultNFA { - graph: DLGraph, -} - -impl Default for DefaultNFA { - fn default() -> Self { - let graph = Default::default(); - Self { graph } - } -} - -impl Graph for DefaultNFA { - type Iter<'a> = as Graph>::Iter<'a> where T: 'a; - - #[inline] - fn is_empty(&self) -> bool { - self.graph.is_empty() - } - - #[inline] - fn nodes_len(&self) -> usize { - self.graph.nodes_len() - } - - #[inline] - fn children_of(&self, node_id: usize) -> Result, GError> { - self.graph.children_of(node_id) - } - - #[inline] - fn degree(&self, node_id: usize) -> Result { - self.graph.degree(node_id) - } - - #[inline] - fn is_empty_node(&self, node_id: usize) -> Result { - self.graph.is_empty_node(node_id) - } - - #[inline] - fn has_edge(&self, source: usize, target: usize) -> Result { - self.graph.has_edge(source, target) - } -} - -impl LabelGraph for DefaultNFA { - type Iter<'a> = as LabelGraph>::Iter<'a> where T: 'a; - - type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; - - // TODO: Return the label from the contained regular language. - #[inline] - fn vertex_label(&self, node_id: usize) -> Result, GError> { - if self.has_node(node_id) { - todo!() - } else { - Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) - } - } - - #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result, GError> { - self.graph.edge_label(source, target) - } - - #[inline] - fn find_children_with_label( - &self, - node_id: usize, - label: &T, - ) -> Result<>::Iter<'_>, GError> { - self.graph.find_children_with_label(node_id, label) - } - - #[inline] - fn labels_of(&self, node_id: usize) -> Result, GError> { - self.graph.labels_of(node_id) - } -} - -impl Nfa for DefaultNFA { - #[allow(unused)] - fn to_nfa(regex: impl Regex) -> Self { - todo!() - } - - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() - } - - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() - } - - fn nulling(&mut self) -> Result<(), Error> { - todo!() - } -} +pub mod regex; #[cfg(test)] mod default_nfa_test {} diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs new file mode 100644 index 0000000..3c2bd83 --- /dev/null +++ b/nfa/src/default/nfa.rs @@ -0,0 +1,120 @@ +//! This file provides a default implementation of NFA. + +// TODO: Store the regular expression in the NFA as well. +// +// The current focus of the project is to understand the growth rate +// of the algorithm, to know whether I made a mistake in the previous +// iteration of the implementation, or the algorithm is not as fast as +// the author estimated, which is not quite likely, of course. +// +// Thus I shall establish a friendly interface that allows me to view +// and debug the atomic languages and the languages, transparently. + +use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; + +use crate::{error::Error, Nfa, Regex}; + +use std::fmt::Display; + +#[non_exhaustive] +#[derive(Debug)] +/// Default NFA implementation. +pub struct DefaultNFA { + graph: DLGraph, +} + +impl Default for DefaultNFA { + fn default() -> Self { + Self { + graph: Default::default(), + } + } +} + +impl Graph for DefaultNFA { + type Iter<'a> = as Graph>::Iter<'a> where T: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result { + self.graph.has_edge(source, target) + } +} + +impl LabelGraph for DefaultNFA { + type Iter<'a> = as LabelGraph>::Iter<'a> where T: 'a; + + type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; + + // TODO: Return the label from the contained regular language. + #[inline] + fn vertex_label(&self, node_id: usize) -> Result, GError> { + if self.has_node(node_id) { + todo!() + } else { + Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) + } + } + + #[inline] + fn edge_label(&self, source: usize, target: usize) -> Result, GError> { + self.graph.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &T, + ) -> Result<>::Iter<'_>, GError> { + self.graph.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result, GError> { + self.graph.labels_of(node_id) + } +} + +impl Nfa for DefaultNFA { + #[allow(unused)] + fn to_nfa(regex: impl Regex) -> Self { + todo!() + } + + fn remove_epsilon(&mut self) -> Result<(), Error> { + todo!() + } + + fn remove_dead(&mut self) -> Result<(), Error> { + todo!() + } + + fn nulling(&mut self) -> Result<(), Error> { + todo!() + } +} diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs new file mode 100644 index 0000000..a60f801 --- /dev/null +++ b/nfa/src/default/regex.rs @@ -0,0 +1,575 @@ +//! This file provides a default implementation of Regex. + +use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; + +use crate::{desrec::DesRec, error::Error, Regex}; + +use receme::{algebra::Algebra, catana::Cata}; + +use std::fmt::{Debug, Display}; + +/// The type of a node in a regular expression. +/// +/// # Example +/// +/// If a node has type "Kleene", this means it represents a star +/// construct in a regular expression, and its children are the +/// contents of the star. +/// +/// # Note +/// +/// There is no "concatenation" node type. A concatenation of two +/// nodes is represented as the two nodes being successive children in +/// their common parent node. +/// +/// This is possible because a regular expression has a root node. +/// For the sake of convenience, the root node has type "Or". +#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum RegexType { + /// A star node is a node with an arbitrary number of repetitions. + Kleene, + /// A plus node is a node with at least one repetition: a+ equals + /// aa* + Plus, + /// An optional node + Optional, + /// An or node means an alternation of its children. + Or, + /// A paren node represents a parenthesis. + Paren, + /// An empty node + #[default] + Empty, + /// A literal node + Lit(T), +} + +/// A default implementation of regular expressions. +#[derive(Debug)] +pub struct DefaultRegex { + /// The underlying graph is stored using adjacency lists. + graph: ALGraph, + /// The types of the underlying nodes. + types: Vec>, +} + +impl Default for DefaultRegex { + fn default() -> Self { + Self { + graph: Default::default(), + types: Default::default(), + } + } +} + +impl DefaultRegex { + /// Return the number of elements in this regular expression, + /// counting nodes. + pub fn len(&self) -> usize { + self.types.len() + } + + /// Return true if and only if this regular expression has no + /// nodes. + pub fn is_empty(&self) -> bool { + self.types.is_empty() + } + + /// Add a node as the child of an existing node or as a root. + pub fn add_node( + &mut self, + label: RegexType, + parent: Option, + ) -> Result<(), ParseError> { + self.graph.extend(parent.iter().copied())?; + self.types.push(label); + + Ok(()) + } +} + +// REVIEW: This may not be needed. +impl Cata, A> for &DefaultRegex +where + A: Algebra>, +{ + fn cata(self, mut alg: A) -> T { + let mut results: Vec> = std::iter::repeat_with(Default::default) + .take(self.len()) + .collect(); + + for index in 0..=self.len() { + let algebra_result = { + let results_of_children: Vec = self + .graph + .children_of(index) + .unwrap() + .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap()) + .collect(); + + alg(results_of_children) + }; + + // Artificially use this value to satisfy the compiler. + let _ = std::mem::replace(&mut results[index], Some(algebra_result)); + } + + std::mem::replace(&mut results[0], None).unwrap() + } +} + +impl Display for DefaultRegex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Copy, Clone)] + enum StackElement { + Seen(usize), + Unseen(usize), + } + + impl StackElement { + fn index(&self) -> usize { + match self { + Seen(index) => *index, + Unseen(index) => *index, + } + } + + fn is_seen(&self) -> bool { + match self { + Seen(_) => true, + Unseen(_) => false, + } + } + } + + use StackElement::{Seen, Unseen}; + + let mut stack: Vec = Vec::new(); + let mut types = self.types.clone(); + types.push(RegexType::Paren); + + stack.push(Unseen(0)); + + while let Some(top) = stack.pop() { + let node_type = types[top.index()]; + + // TODO: Do not use unwrap here. + + match node_type { + RegexType::Kleene => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "*")?; + } + } + RegexType::Plus => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "+")?; + } + } + RegexType::Optional => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "?")?; + } + } + RegexType::Or => { + if !top.is_seen() { + write!(f, "(")?; + + let len = self.len(); + + stack.push(Unseen(types.len() - 1)); + + for (child_index, child) in self + .graph + .children_of(top.index()) + .unwrap() + .enumerate() + .rev() + { + if child_index != len - 1 && child_index != 0 { + stack.push(Unseen(child)); + stack.push(Seen(top.index())); + } else { + stack.push(Unseen(child)); + } + } + } else { + write!(f, "|")?; + } + } + RegexType::Paren => { + write!(f, ")")?; + } + RegexType::Empty => { + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } + RegexType::Lit(label) => write!(f, "{label}")?, + } + } + + Ok(()) + } +} + +impl Graph for DefaultRegex { + type Iter<'a> = ::Iter<'a> + where + Self: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result { + self.graph.has_edge(source, target) + } +} + +impl Regex> for DefaultRegex { + #[inline] + fn vertex_label(&self, node_id: usize) -> Result, Error> { + self.types + .get(node_id) + .copied() + .ok_or(Error::UnknownNode(node_id)) + } +} + +/// An error type for holding parsing errors. +#[derive(Debug)] +pub enum ParseError { + /// Encounter an invalid state. + Invalid, + /// An error from graph operations. + Graph(GError), + /// Encounter an empty stack. + EmptyStack, + /// Encounter a non-single stack at the end. + NonSingleStack, + /// Encounter a stack whose element is out of bounds. + /// + /// The first component is the stack element, while the second the + /// bound. + InvalidStack(usize, usize), + /// Encounter a repetition construct without a preceding element. + InvalidRepetition(usize), + /// Invalid character + InvalidCharacter(char), +} + +impl Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + +impl std::error::Error for ParseError {} + +impl From for ParseError { + fn from(ge: GError) -> Self { + Self::Graph(ge) + } +} + +/// The direction of parsing. +/// +/// This means whether we want to stay at the same level of +/// parent-child hierarchy, or to become the child, or to climb back +/// to the last parent. +#[derive(Debug, Copy, Clone, Default)] +pub enum ParseDirection { + /// Climb back to the last parent. + Up, + /// Stay at the same level in the hierarchy. + #[default] + Right, + /// Become the child. + Down, +} + +impl Display for ParseDirection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let direction = match self { + ParseDirection::Up => "↑", + ParseDirection::Right => "→", + ParseDirection::Down => "↓", + }; + + write!(f, "{direction}") + } +} + +impl DesRec for DefaultRegex { + type Label = RegexType; + + type Regex = Self; + + type Error = ParseError; + + type Aux = ParseDirection; + + type Scanner<'a> = + Box Result, Self::Error>>; + + fn parse<'a>( + mut input: &'a str, + mut scanner: Self::Scanner<'a>, + post_p: bool, + ) -> Result, &'a str)>, Self::Error> { + use ParseDirection::*; + use RegexType::*; + + let mut intermediate_stack: Vec<(RegexType, ParseDirection)> = + Vec::with_capacity(input.len()); + + // Classifies the input into a sequence of tokens with + // directions. + while !input.is_empty() { + if let Some((len, label, direction)) = scanner(input)? { + if len == 0 { + break; + } + + input = &input[len..]; + + intermediate_stack.push((label, direction)); + + // If we encounter an opening parenthesis, we add + // another auxiliary instruction. + if matches!((label, direction), (Or, Down)) { + intermediate_stack.push((Empty, Down)); + } + } else { + break; + } + } + + let inter_len = intermediate_stack.len(); + + let mut parents_stack: Vec = Vec::with_capacity(inter_len + 2); + + parents_stack.push(0); + parents_stack.push(1); + + let mut list_of_children: Vec> = std::iter::repeat_with(Vec::new) + .take(inter_len + 2) + .collect(); + + list_of_children[0].push(1); + + let mut types: Vec> = vec![Or, Empty]; + + types.extend(intermediate_stack.iter().map(|(label, _direction)| *label)); + + // Converts the sequence of tokens and directions into a + // regular expression. + for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() { + let mut parent: usize; + let mut parent_children: &mut Vec; + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + match list_of_children.get_mut(parent) { + Some(stack_parent_children) => { + parent_children = stack_parent_children; + + match (label, direction) { + (Paren, Up) => { + // a closing parenthesis does not need + // to be counted as a child + parents_stack.pop(); + // a closing parenthesis jumps out of + // two levels at once + parents_stack.pop(); + } + (Empty, Up) => { + // an upwards pipe + + // first add the current node to the parent of the parent + parents_stack.pop(); + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + if let Some(stack_parent_children) = + list_of_children.get_mut(parent) + { + parent_children = stack_parent_children; + + parent_children.push(index + 2); + } else { + return Err(ParseError::InvalidStack( + parent, + inter_len + 2, + )); + } + } else { + return Err(ParseError::EmptyStack); + } + + // then make the current node the new parent + parents_stack.push(index + 2); + } + (_, Up) => { + parents_stack.pop(); + parent_children.push(index + 2); + } + (_, Right) => { + parent_children.push(index + 2); + } + (_, Down) => { + parents_stack.push(index + 2); + parent_children.push(index + 2); + } + } + } + None => return Err(ParseError::InvalidStack(parent, inter_len)), + } + } else { + // There are unbalanced closing parentheses. + return Err(ParseError::EmptyStack); + } + + // A special handling of repetition constructs as postfix + // operators: it swaps with the preceding element. + if post_p { + match label { + Kleene | Plus | Optional => { + // remove the current node from the parent + parent_children.pop(); + + if let Some(preceding) = parent_children.last().copied() { + list_of_children.swap(preceding, index + 2); + + types.swap(preceding, index + 2); + + match list_of_children.get_mut(preceding) { + Some(preceding_children) => { + preceding_children.push(index + 2); + } + None => { + return Err(ParseError::InvalidStack(preceding, inter_len + 2)) + } + } + } else { + return Err(ParseError::InvalidRepetition(index)); + } + } + _ => {} + } + } + } + + // There are unbalanced opening parentheses. + if parents_stack.len() != 2 { + return Err(ParseError::NonSingleStack); + } + + let graph = list_of_children.into(); + + let result = DefaultRegex { graph, types }; + + Ok(Some((result, input))) + } +} + +#[cfg(test)] +mod test_des_rec { + use super::*; + + fn test_scanner( + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + #[test] + fn test_des_rec() -> Result<(), Box> { + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + if let Some((regex, remain)) = + DefaultRegex::::parse(&input_string, Box::new(test_scanner), true)? + { + println!("regex = {regex}"); + println!("remain = {remain}"); + + println!("regex length = {}", regex.len()); + + Ok(()) + } else { + unreachable!() + } + } +} diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs new file mode 100644 index 0000000..c57d313 --- /dev/null +++ b/nfa/src/desrec.rs @@ -0,0 +1,50 @@ +//! This file defines the expected behaviours of a recursive descent +//! parser. + +use super::Regex; +use graph::GraphLabel; + +/// A thin wrapper of the parse output, to simplify the function +/// signature a little. +pub type ParseOutput<'a, T> = Option<(T, &'a str)>; + +/// Types implementing this trait provide a method to be recursively +/// parsed. +pub trait DesRec { + /// The type of labels of the resulting regular expression. + type Label: GraphLabel; + + /// The type of the resulting regular expression. + type Regex: Regex; + + /// The type of errors encountered when parsing. + type Error: std::error::Error; + + /// Auxiliary data when parsing + type Aux; + + /// The type of a scanner that classifies inputs into tokens. + /// + /// The return type indicates the result of classification: + /// + /// - `Err(_)`: the classification fails + /// + /// - `Ok(None)`: the classification succeeds, and the parsing + /// should stop here + /// + /// - `Ok(Some(_))`: the classification succeeds and the parsing + /// should continue. + type Scanner<'a>: FnMut(&'a str) -> Result, Self::Error>; + + /// Parse a string into a regular expression with the aid of this + /// type. + /// + /// Accept a slice of string and return either a parsing error, or + /// a pair of correctly parsed regular expression and the + /// remaining slice. + fn parse<'a>( + input: &'a str, + scanner: Self::Scanner<'a>, + post_p: bool, + ) -> Result, Self::Error>; +} diff --git a/nfa/src/error.rs b/nfa/src/error.rs index 6112878..0c6bb3c 100644 --- a/nfa/src/error.rs +++ b/nfa/src/error.rs @@ -5,9 +5,15 @@ use graph::error::Error as GError; use std::fmt::{Display, Formatter}; #[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] +/// The error type for NFA operations. pub enum Error { + /// An unknown node id is encountered. UnknownNode(usize), + /// Some operations are not supported by the implementations. + /// + /// Everything is a trade-off, wink wink. UnsupportedOperation, + /// This error comes from some underlying graph operation. Graph(GError), } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index ef207cf..1e2a30c 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -6,7 +6,7 @@ //! implements the Graph trait from the [`graph`] crate, and then use //! that external graph type as [`Graph`][graph::Graph] here. -mod error; +pub mod error; extern crate graph; @@ -49,13 +49,12 @@ pub trait Regex: Graph + Display { // TODO: add functions that determine if certain "positions" in a // regular language satisfy some special properties, like at the // end of a Kleene star, or at the end of a regular language, et - // cetera. These will be needed later. + // cetera. These might be needed later. } /// The expected behvaiour of a nondeterministic finite automaton. /// -/// Every NFA is a special labelled graph, so this trait extends the -/// [`LabelGraph`][graph::LabelGraph] trait. +/// Every NFA is a special labelled graph. pub trait Nfa: LabelGraph { /// Remove all empty transitions from the nondeterministic finite /// automaton. @@ -89,6 +88,7 @@ pub trait Nfa: LabelGraph { } pub mod default; +pub mod desrec; #[cfg(test)] mod nfa_tests {} -- cgit v1.2.3-18-g5258