diff options
author | JSDurand <mmemmew@gmail.com> | 2022-12-23 00:36:31 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2022-12-23 00:36:31 +0800 |
commit | 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (patch) | |
tree | 343eea3c634efbbf72c64ed5dd778ecce60c3eea /nfa/src | |
parent | 9f1c88b863e247da3cd60d2792a7a13b18e25e53 (diff) |
renaming core to chain and some other changes
Some changes:
- The core crate is renamed to "chain".
- The crate "viz" is added, which will provide layered graph drawing
algorithms.
- A function is added to convert from a grammar to the regular
language of its left-linear closures.
- A function is added to convert from a nondeterministic finite
automaton to its "null" closure. A null closure is the same
automaton with edges added, as if some edges are "null". Whether an
edge is null is determined by a function.
Combined with the previous change, we can convert a grammar to the
regular language of the null closure of its left-linear closures.
---
Now it remains to test more grammars and add an Atom trait, before
finishing the part about compilations.
Diffstat (limited to 'nfa/src')
-rw-r--r-- | nfa/src/default/nfa.rs | 374 | ||||
-rw-r--r-- | nfa/src/default/regex.rs | 313 | ||||
-rw-r--r-- | nfa/src/desrec.rs | 22 | ||||
-rw-r--r-- | nfa/src/error.rs | 11 | ||||
-rw-r--r-- | nfa/src/lib.rs | 94 |
5 files changed, 743 insertions, 71 deletions
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 3c2bd83..229ba4d 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,25 +1,26 @@ //! This file provides a default implementation of NFA. -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. +// TODO: The current focus of the project is to understand the growth +// rate of the algorithm, to know whether I made a mistake in the +// previous iteration of the implementation, or the algorithm is not +// as fast as the author estimated, which is not quite likely, of +// course. // // Thus I shall establish a friendly interface that allows me to view // and debug the atomic languages and the languages, transparently. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +use graph::{ + builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, + LabelGraph, +}; -use crate::{error::Error, Nfa, Regex}; +use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; -use std::fmt::Display; +use core::fmt::{Debug, Display}; -#[non_exhaustive] -#[derive(Debug)] /// Default NFA implementation. -pub struct DefaultNFA<T: GraphLabel + Display> { +#[derive(Debug, Clone)] +pub struct DefaultNFA<T: GraphLabel> { graph: DLGraph<T>, } @@ -63,6 +64,16 @@ impl<T: GraphLabel + Display> Graph for DefaultNFA<T> { fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { self.graph.has_edge(source, target) } + + #[inline] + fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + self.graph.print_viz(filename) + } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) { + unimplemented!() + } } impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { @@ -70,11 +81,10 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a; - // TODO: Return the label from the contained regular language. #[inline] fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> { if self.has_node(node_id) { - todo!() + unimplemented!() } else { Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) } @@ -98,12 +108,232 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> { self.graph.labels_of(node_id) } + + #[inline] + fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result<bool, GError> { + self.graph.has_edge_label(node_id, label, target) + } } impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { - #[allow(unused)] - fn to_nfa(regex: impl Regex<T>) -> Self { - todo!() + type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>; + + fn to_nfa( + regexps: &[impl Regex<RegexType<T>>], + sub_pred: impl Fn(T) -> Result<SoC<T>, Error>, + ) -> Result<Self::FromRegex<DOption<T>>, Error> { + let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); + + if total_regexps_len == 0 { + return Ok(Default::default()); + } + + // We reserve two more rooms for later uses. + let nfa_len = total_regexps_len * 2 + 2; + + let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len); + + // Since DOption<T> implements Copy when T does, we can use a + // variable to hold the empty label to avoid repetitions. + let empty_label: DOption<T> = Default::default(); + + for _ in 0..nfa_len { + builder.add_vertex(); + } + + let accumulators: Vec<usize> = { + let mut result = Vec::with_capacity(regexps.len() + 1); + + result.push(0); + + let mut accumulator = 0; + + for regexp in regexps.iter() { + accumulator += regexp.nodes_len() * 2; + result.push(accumulator); + } + + result.pop(); + + result + }; + + assert_eq!(accumulators.len(), regexps.len()); + + /// Get offset from `accumulators` safely. + macro_rules! get_offset_safe ( + ($num:expr) => { + *accumulators.get($num).ok_or(Error::UnknownNode($num))? + } + ); + + /// Get offset from `accumulators` without checking bounds. + macro_rules! get_offset_unsafe ( + ($num:expr) => { + { unsafe { *accumulators.get_unchecked($num) } } + } + ); + + for (index, regex) in regexps.iter().enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + // A regular expression without roots is empty, so we + // skip it. + continue; + }; + + let regex_len = regex.nodes_len(); + + // It is safe here to call `get_offset_unsafe`, as `index` + // is guaranteed to be strictly less than the length of + // `accumulators` by an assertion above. + let offset = get_offset_unsafe!(index); + + let mut stack: Vec<usize> = Vec::with_capacity(regex_len); + + stack.push(root); + + while let Some(top_index) = stack.pop() { + let top_label = regex.vertex_label(top_index)?; + + let nfa_start = offset + 2 * top_index; + let nfa_end = offset + 2 * top_index + 1; + + match top_label { + RegexType::Kleene => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } + } + RegexType::Plus => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Optional => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } + } + RegexType::Or => { + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(nfa_start, child_start, empty_label)?; + builder.add_edge(child_end, nfa_end, empty_label)?; + } + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Paren => { + // Ignore Paren nodes since currently these + // are used only for printing purposes. + } + RegexType::Empty => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Lit(value) => { + // The children would be ignored even if for + // some reason this literal node had some + // children. + + match sub_pred(value)? { + SoC::Sub(sub_non) => { + // a non-terminal + + let sub_offset = get_offset_safe!(sub_non); + let sub_nfa_start = sub_offset + 2 * sub_non; + let sub_nfa_end = sub_offset + 2 * sub_non + 1; + + builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; + builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + } + SoC::Carry(new_value) => { + // a terminal + + builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + } + } + } + } + } + } + + let graph = builder.build(); + + Ok(DefaultNFA { graph }) } fn remove_epsilon(&mut self) -> Result<(), Error> { @@ -114,7 +344,113 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { todo!() } - fn nulling(&mut self) -> Result<(), Error> { - todo!() + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { + let mut updated = true; + let mut builder = self.graph.builder_ref(); + + while updated { + updated = false; + + let mut nullable = false; + + let mut to_add = Vec::new(); + + for (source, target) in builder.edges() { + for label in builder.edge_label(source, target)? { + if f(label) { + nullable = true; + + break; + } + } + + if nullable { + for (label, child_iter) in builder.labels_of(target)? { + for child in child_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + + for (source, child, label) in to_add { + builder.add_edge(source, child, label)?; + } + } + + self.graph.replace_by_builder(builder); + + Ok(()) + } +} + +impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + Debug::fmt(self, f) + } +} + +#[cfg(test)] +mod test_to_nfa { + #![allow(unused_imports)] + use super::*; + use crate::SoC; + + use crate::{ + default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + desrec::DesRec, + }; + + fn new_regex() -> Result<DefaultRegex<char>, ParseError> { + let parser = DefaultRegParser::<char>::default(); + + fn test_scanner<T: GraphLabel + Display + Debug>( + _parser: &DefaultRegParser<T>, + input: &str, + ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + Ok( + DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)? + .unwrap_or(Default::default()) + .0, + ) + } + + #[test] + fn test_to_nfa() -> Result<(), Box<dyn std::error::Error>> { + let regex = new_regex()?; + + println!("regex = {regex}"); + + let nfa: DefaultNFA<DOption<char>> = + DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) } } diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index a60f801..2670e32 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; -use std::fmt::{Debug, Display}; +use std::{ + collections::HashMap, + fmt::{Debug, Display, Write}, + marker::PhantomData, +}; /// The type of a node in a regular expression. /// @@ -25,7 +29,7 @@ use std::fmt::{Debug, Display}; /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum RegexType<T: GraphLabel + Display> { +pub enum RegexType<T: GraphLabel> { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals @@ -44,13 +48,31 @@ pub enum RegexType<T: GraphLabel + Display> { Lit(T), } +impl<T: GraphLabel> Display for RegexType<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RegexType::Kleene => write!(f, "*"), + RegexType::Plus => write!(f, "+"), + RegexType::Optional => write!(f, "?"), + RegexType::Or => write!(f, "|"), + RegexType::Paren => write!(f, "()"), + RegexType::Empty => write!(f, "empty"), + RegexType::Lit(value) => write!(f, "{value}"), + } + } +} + /// A default implementation of regular expressions. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DefaultRegex<T: GraphLabel + Display> { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec<RegexType<T>>, + /// The root of the graph. + /// + /// If it is None, it indicates the regular expression is empty. + root: Option<usize>, } impl<T: GraphLabel + Display> Default for DefaultRegex<T> { @@ -58,11 +80,39 @@ impl<T: GraphLabel + Display> Default for DefaultRegex<T> { Self { graph: Default::default(), types: Default::default(), + root: Default::default(), } } } -impl<T: GraphLabel + Display> DefaultRegex<T> { +impl<T: GraphLabel> DefaultRegex<T> { + /// Set the root of the regular expression. + #[inline] + pub fn set_root(&mut self, root: Option<usize>) { + self.root = root; + } + + /// Construct a regular expression from a raw adjacency list graph + /// and an array of labels. + /// + /// # Error + /// + /// If the graph contains cycles, this function will return an + /// error. This check is added to make sure that the regular + /// expression contains no cycles, otherwise operations with + /// regular expressions might enter infinite loops later. + pub fn new(graph: ALGraph, types: Vec<RegexType<T>>) -> Result<Self, Error> { + if graph.contains_cycles()? { + Err(Error::Cycle) + } else { + Ok(Self { + graph, + types, + root: Some(0), + }) + } + } + /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { @@ -86,10 +136,65 @@ impl<T: GraphLabel + Display> DefaultRegex<T> { Ok(()) } + + /// Return the internal graph. + /// + /// Normally this should not be used. + pub fn internal_graph(&self) -> ALGraph { + self.graph.clone() + } + + /// Return the internal types array. + /// + /// Normally this should not be used. + pub fn internal_types(&self) -> Vec<RegexType<T>> { + self.types.clone() + } + + /// Return the array of parents. + /// + /// The element at index N of the returned array is the parent of + /// that N-th element. If an element has no parents, a `None` is + /// placed at its place. + /// + /// # Error + /// + /// If some edge points to an invalid node, an erro will be + /// returned. + /// + /// Also, if the graph contains cycles, an error is also returned. + pub fn parents_array(&self) -> Result<Vec<Option<(usize, usize)>>, Error> { + if self.graph.contains_cycles().map_err(|e| match e { + GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n), + _ => unreachable!(), + })? { + return Err(Error::Cycle); + } + + let len = self.len(); + + let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect(); + + for source in self.graph.nodes() { + for (index, target) in self + .graph + .children_of(source) + .map_err(|_| Error::UnknownNode(source))? + .enumerate() + { + result + .get_mut(target) + .ok_or(Error::UnknownNode(target))? + .replace((source, index)); + } + } + + Ok(result) + } } // REVIEW: This may not be needed. -impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S> +impl<S: GraphLabel, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S> where A: Algebra<T, Vec<T>>, { @@ -118,8 +223,13 @@ where } } -impl<T: GraphLabel + Display> Display for DefaultRegex<T> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl<T: GraphLabel> DefaultRegex<T> { + /// Use a function to format the labels of the regular expressions + /// and format the entire regular expression with this aid. + pub fn to_string_with<F>(&self, mut f: F) -> Result<String, std::fmt::Error> + where + F: FnMut(T) -> String, + { #[derive(Copy, Clone)] enum StackElement { Seen(usize), @@ -134,16 +244,15 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { } } - fn is_seen(&self) -> bool { - match self { - Seen(_) => true, - Unseen(_) => false, - } + fn is_seen(self) -> bool { + matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; + let mut result = String::new(); + let mut stack: Vec<StackElement> = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); @@ -159,6 +268,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); + + if self.degree(top.index()).unwrap() > 1 { + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); + } + stack.extend( self.graph .children_of(top.index()) @@ -167,7 +282,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } else { - write!(f, "*")?; + write!(result, "*")?; } } RegexType::Plus => { @@ -181,7 +296,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } else { - write!(f, "+")?; + write!(result, "+")?; } } RegexType::Optional => { @@ -195,12 +310,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } else { - write!(f, "?")?; + write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { - write!(f, "(")?; + write!(result, "(")?; let len = self.len(); @@ -221,11 +336,11 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { } } } else { - write!(f, "|")?; + write!(result, "|")?; } } RegexType::Paren => { - write!(f, ")")?; + write!(result, ")")?; } RegexType::Empty => { stack.extend( @@ -236,11 +351,17 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } - RegexType::Lit(label) => write!(f, "{label}")?, + RegexType::Lit(label) => write!(result, "{}", f(label))?, } } - Ok(()) + Ok(result) + } +} + +impl<T: GraphLabel + Display + Debug> Display for DefaultRegex<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string_with(|t| format!("{t}"))?) } } @@ -278,9 +399,20 @@ impl<T: GraphLabel + Display> Graph for DefaultRegex<T> { fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { self.graph.has_edge(source, target) } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { + unimplemented!() + } } -impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> { +impl<T: GraphLabel + Display + Debug> Regex<RegexType<T>> for DefaultRegex<T> { + /// Return the root of the regular expression. + #[inline] + fn root(&self) -> Option<usize> { + self.root + } + #[inline] fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> { self.types @@ -291,8 +423,10 @@ impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> { } /// An error type for holding parsing errors. -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum ParseError { + /// A cycle is encountered. + Cycle, /// Encounter an invalid state. Invalid, /// An error from graph operations. @@ -354,23 +488,77 @@ impl Display for ParseDirection { } } -impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { +/// A default recursive descent parser for regular expressions of +/// terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct DefaultRegParser<T: GraphLabel + Display> { + ter_map: HashMap<String, usize>, + non_map: HashMap<String, usize>, + _phantom: PhantomData<T>, +} + +impl<T: GraphLabel + Display> DefaultRegParser<T> { + /// Query if a terminal or a non-terminal is already found. + /// + /// If found, return the associated index of the terminal or + /// non-terminal. + pub fn query(&self, tnt: &str, terminal_p: bool) -> Option<usize> { + if terminal_p { + self.ter_map.get(tnt).copied() + } else { + self.non_map.get(tnt).copied() + } + } + + /// Add a terminal or a non-terminal. + pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) { + if terminal_p { + let ter_len = self.ter_map.len(); + + self.ter_map.entry(tnt.to_owned()).or_insert(ter_len); + } else { + let non_len = self.non_map.len(); + + self.non_map.entry(tnt.to_owned()).or_insert(non_len); + } + } +} + +impl<T: GraphLabel + Display> Default for DefaultRegParser<T> { + fn default() -> Self { + Self { + ter_map: Default::default(), + non_map: Default::default(), + _phantom: PhantomData, + } + } +} + +impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegParser<T> { type Label = RegexType<T>; - type Regex = Self; + type Regex = DefaultRegex<T>; type Error = ParseError; - type Aux = ParseDirection; + type Inter = ParseDirection; - type Scanner<'a> = - Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>; + type Scanner<'a, 'b> = Box< + dyn FnMut( + &'b Self, + &'a str, + ) -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>, + > where RegexType<T>:'b; - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, mut input: &'a str, - mut scanner: Self::Scanner<'a>, + mut scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> { + ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> + where + Self::Label: 'b, + { use ParseDirection::*; use RegexType::*; @@ -380,7 +568,7 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { - if let Some((len, label, direction)) = scanner(input)? { + if let Some((len, label, direction)) = scanner(self, input)? { if len == 0 { break; } @@ -523,7 +711,12 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { let graph = list_of_children.into(); - let result = DefaultRegex { graph, types }; + // Check there are no cycles + let result = DefaultRegex::new(graph, types).map_err(|e| match e { + Error::Graph(ge) => ParseError::Graph(ge), + Error::Cycle => ParseError::Cycle, + _ => unreachable!(), + })?; Ok(Some((result, input))) } @@ -533,9 +726,17 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { mod test_des_rec { use super::*; - fn test_scanner( - input: &str, - ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> { + use crate::desrec::DesRec; + + #[allow(dead_code, unused)] + fn test_scanner<'a, 'b, T>( + parser: &'b DefaultRegParser<T>, + input: &'a str, + ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> + where + T: GraphLabel + Display + Debug, + T: 'b, + { use ParseDirection::*; use RegexType::*; @@ -557,19 +758,57 @@ mod test_des_rec { #[test] fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> { - let input_string = "a*b?c+|(d*| +)?".to_owned(); + let input_string = "(ade)*b?c+|(d*| +)?".to_owned(); + + let parser: DefaultRegParser<char> = Default::default(); if let Some((regex, remain)) = - DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)? + DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); + let parents = regex.parents_array()?; + + println!("parents = {parents:?}"); + Ok(()) } else { unreachable!() } } + + #[test] + fn test_display() -> Result<(), Box<dyn std::error::Error>> { + use graph::builder::Builder; + use RegexType::*; + + let mut builder = graph::adlist::ALGBuilder::default(); + let mut types: Vec<RegexType<usize>> = Vec::with_capacity(4); + + types.push(Kleene); + builder.add_vertex(); + + types.push(Lit(0)); + builder.add_vertex(); + builder.add_edge(0, 1, ())?; + + types.push(Lit(1)); + builder.add_vertex(); + builder.add_edge(0, 2, ())?; + + types.push(Lit(2)); + builder.add_vertex(); + builder.add_edge(0, 3, ())?; + + let graph = builder.build(); + + let regex = DefaultRegex::new(graph, types)?; + + println!("regex = {regex}"); + + Ok(()) + } } diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs index c57d313..ac45c2d 100644 --- a/nfa/src/desrec.rs +++ b/nfa/src/desrec.rs @@ -20,8 +20,8 @@ pub trait DesRec { /// The type of errors encountered when parsing. type Error: std::error::Error; - /// Auxiliary data when parsing - type Aux; + /// Intermediate data when parsing + type Inter; /// The type of a scanner that classifies inputs into tokens. /// @@ -34,7 +34,14 @@ pub trait DesRec { /// /// - `Ok(Some(_))`: the classification succeeds and the parsing /// should continue. - type Scanner<'a>: FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>; + type Scanner<'a, 'b>: FnMut( + &'b Self, + &'a str, + ) + -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error> + where + Self: 'b, + Self::Label: 'b; /// Parse a string into a regular expression with the aid of this /// type. @@ -42,9 +49,12 @@ pub trait DesRec { /// Accept a slice of string and return either a parsing error, or /// a pair of correctly parsed regular expression and the /// remaining slice. - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, input: &'a str, - scanner: Self::Scanner<'a>, + scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error>; + ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error> + where + Self::Label: 'b; } diff --git a/nfa/src/error.rs b/nfa/src/error.rs index 0c6bb3c..ad75077 100644 --- a/nfa/src/error.rs +++ b/nfa/src/error.rs @@ -2,10 +2,11 @@ use graph::error::Error as GError; -use std::fmt::{Display, Formatter}; +use core::fmt::{Display, Formatter}; -#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] /// The error type for NFA operations. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[non_exhaustive] pub enum Error { /// An unknown node id is encountered. UnknownNode(usize), @@ -13,8 +14,12 @@ pub enum Error { /// /// Everything is a trade-off, wink wink. UnsupportedOperation, + /// There is no root in the underlying regular expression. + NoRoot, /// This error comes from some underlying graph operation. Graph(GError), + /// A cycle is found when constructing regular expressions. + Cycle, } impl Display for Error { @@ -23,6 +28,8 @@ impl Display for Error { Error::UnknownNode(id) => write!(f, "unknown node: {id}"), Error::UnsupportedOperation => write!(f, "unsupported operation"), Error::Graph(e) => write!(f, "graph error: {e}"), + Error::NoRoot => write!(f, "no root found"), + Error::Cycle => write!(f, "a cycle is found when constructing regular expressions"), } } } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index 1e2a30c..b1d62b3 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -12,17 +12,23 @@ extern crate graph; use core::fmt::Display; +use std::ops::{Deref, DerefMut}; + use graph::{Graph, GraphLabel, LabelGraph}; use error::Error; +pub use desrec::DesRec; + +use default::regex::RegexType; + /// The expected behaviour of a regular language. /// /// Nondeterministic finite automata are equivalent to regular /// languages. Since regular languages are easier to understand for a /// human being, nondeterministic finite automata include the data for /// the equivalent regular languages. -pub trait Regex<T: GraphLabel>: Graph + Display { +pub trait Regex<T: GraphLabel>: Graph + Display + Clone { /// Return the label of a vertex, or an error if the node is /// invalid. fn vertex_label(&self, node_id: usize) -> Result<T, Error>; @@ -52,6 +58,65 @@ pub trait Regex<T: GraphLabel>: Graph + Display { // cetera. These might be needed later. } +/// Since Option<T> does not inherit the Display from T, we wrap it to +/// provide an automatic implementation of Display. +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct DOption<T>(Option<T>); + +impl<T> Default for DOption<T> { + fn default() -> Self { + Self(None) + } +} + +impl<T> Deref for DOption<T> { + type Target = Option<T>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<T> DerefMut for DOption<T> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl<T: Display> Display for DOption<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.deref() { + Some(value) => Display::fmt(value, f), + None => write!(f, "ε"), + } + } +} + +/// Substitute or Carry +/// +/// This enumeration indicates whether a label from a regular +/// expression should be substituted by another regular expression, or +/// to be carried around in the resulting nondeterministic finite +/// automaton, in the process of the [`to_nfa`][Nfa::to_nfa] function. +/// +/// # Transform labels +/// +/// The label that is returned to be carried can be different from the +/// original label, as a way to transform the labels. +/// +/// # Remark on the abbreviation +/// +/// It happens "by chance" that this abbreviation coincides with the +/// abbreviation of "system on chip". Since I know nothing about this +/// topic, this is just a meaningless pun. +#[derive(Debug, Copy, Clone)] +pub enum SoC<T> { + /// To be substituted by another regular expression. + Sub(usize), + /// To carry around this label. + Carry(T), +} + /// The expected behvaiour of a nondeterministic finite automaton. /// /// Every NFA is a special labelled graph. @@ -68,9 +133,23 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> { Err(Error::UnsupportedOperation) } - /// Build a nondeterministic finite automaton out of a regular - /// language. - fn to_nfa(regex: impl Regex<T>) -> Self; + /// When we convert a regular expression to a nondeterministic + /// finite automaton, the label should be optional, so we use a + /// different type for the result type. + type FromRegex<S: GraphLabel + Display + Default>; + + /// Build a nondeterministic finite automaton out of a set of + /// regular expressions. + /// + /// The SUB_PRED is a predicate function that returns an optional + /// index for a label. If it returns some index, then this means + /// we should substitute the index-th regular expression in the + /// set, whereever that label occurs in the set of regular + /// expressions. + fn to_nfa( + regexps: &[impl Regex<RegexType<T>>], + sub_pred: impl Fn(T) -> Result<SoC<T>, Error>, + ) -> Result<Self::FromRegex<DOption<T>>, Error>; /// Remove all dead states from the nondeterministic finite /// automaton. @@ -78,13 +157,14 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> { /// A state is dead if there are no edges going to the state. fn remove_dead(&mut self) -> Result<(), Error>; - /// For each empty transition from A to B, and for every edge from - /// B to C, say, add an edge from A to C. + /// For each edge from A to B whose edge is considered nullable by + /// a function `f`, and for every edge from B to C, add an edge + /// from A to C. /// /// This is needed specifically by the algorithm to produce a set /// of atomic languages that represent "null-closed" derived /// languages. - fn nulling(&mut self) -> Result<(), Error>; + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>; } pub mod default; |