From 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 23 Dec 2022 00:36:31 +0800 Subject: renaming core to chain and some other changes Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations. --- nfa/src/default/nfa.rs | 374 ++++++++++++++++++++++++++++++++++++++++++++--- nfa/src/default/regex.rs | 313 ++++++++++++++++++++++++++++++++++----- nfa/src/desrec.rs | 22 ++- nfa/src/error.rs | 11 +- nfa/src/lib.rs | 94 +++++++++++- 5 files changed, 743 insertions(+), 71 deletions(-) (limited to 'nfa/src') diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 3c2bd83..229ba4d 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,25 +1,26 @@ //! This file provides a default implementation of NFA. -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. +// TODO: The current focus of the project is to understand the growth +// rate of the algorithm, to know whether I made a mistake in the +// previous iteration of the implementation, or the algorithm is not +// as fast as the author estimated, which is not quite likely, of +// course. // // Thus I shall establish a friendly interface that allows me to view // and debug the atomic languages and the languages, transparently. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +use graph::{ + builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, + LabelGraph, +}; -use crate::{error::Error, Nfa, Regex}; +use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; -use std::fmt::Display; +use core::fmt::{Debug, Display}; -#[non_exhaustive] -#[derive(Debug)] /// Default NFA implementation. -pub struct DefaultNFA { +#[derive(Debug, Clone)] +pub struct DefaultNFA { graph: DLGraph, } @@ -63,6 +64,16 @@ impl Graph for DefaultNFA { fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } + + #[inline] + fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + self.graph.print_viz(filename) + } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl Builder) { + unimplemented!() + } } impl LabelGraph for DefaultNFA { @@ -70,11 +81,10 @@ impl LabelGraph for DefaultNFA { type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; - // TODO: Return the label from the contained regular language. #[inline] fn vertex_label(&self, node_id: usize) -> Result, GError> { if self.has_node(node_id) { - todo!() + unimplemented!() } else { Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) } @@ -98,12 +108,232 @@ impl LabelGraph for DefaultNFA { fn labels_of(&self, node_id: usize) -> Result, GError> { self.graph.labels_of(node_id) } + + #[inline] + fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result { + self.graph.has_edge_label(node_id, label, target) + } } impl Nfa for DefaultNFA { - #[allow(unused)] - fn to_nfa(regex: impl Regex) -> Self { - todo!() + type FromRegex = DefaultNFA; + + fn to_nfa( + regexps: &[impl Regex>], + sub_pred: impl Fn(T) -> Result, Error>, + ) -> Result>, Error> { + let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); + + if total_regexps_len == 0 { + return Ok(Default::default()); + } + + // We reserve two more rooms for later uses. + let nfa_len = total_regexps_len * 2 + 2; + + let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); + + // Since DOption implements Copy when T does, we can use a + // variable to hold the empty label to avoid repetitions. + let empty_label: DOption = Default::default(); + + for _ in 0..nfa_len { + builder.add_vertex(); + } + + let accumulators: Vec = { + let mut result = Vec::with_capacity(regexps.len() + 1); + + result.push(0); + + let mut accumulator = 0; + + for regexp in regexps.iter() { + accumulator += regexp.nodes_len() * 2; + result.push(accumulator); + } + + result.pop(); + + result + }; + + assert_eq!(accumulators.len(), regexps.len()); + + /// Get offset from `accumulators` safely. + macro_rules! get_offset_safe ( + ($num:expr) => { + *accumulators.get($num).ok_or(Error::UnknownNode($num))? + } + ); + + /// Get offset from `accumulators` without checking bounds. + macro_rules! get_offset_unsafe ( + ($num:expr) => { + { unsafe { *accumulators.get_unchecked($num) } } + } + ); + + for (index, regex) in regexps.iter().enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + // A regular expression without roots is empty, so we + // skip it. + continue; + }; + + let regex_len = regex.nodes_len(); + + // It is safe here to call `get_offset_unsafe`, as `index` + // is guaranteed to be strictly less than the length of + // `accumulators` by an assertion above. + let offset = get_offset_unsafe!(index); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(root); + + while let Some(top_index) = stack.pop() { + let top_label = regex.vertex_label(top_index)?; + + let nfa_start = offset + 2 * top_index; + let nfa_end = offset + 2 * top_index + 1; + + match top_label { + RegexType::Kleene => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } + } + RegexType::Plus => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Optional => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } + } + RegexType::Or => { + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(nfa_start, child_start, empty_label)?; + builder.add_edge(child_end, nfa_end, empty_label)?; + } + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Paren => { + // Ignore Paren nodes since currently these + // are used only for printing purposes. + } + RegexType::Empty => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Lit(value) => { + // The children would be ignored even if for + // some reason this literal node had some + // children. + + match sub_pred(value)? { + SoC::Sub(sub_non) => { + // a non-terminal + + let sub_offset = get_offset_safe!(sub_non); + let sub_nfa_start = sub_offset + 2 * sub_non; + let sub_nfa_end = sub_offset + 2 * sub_non + 1; + + builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; + builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + } + SoC::Carry(new_value) => { + // a terminal + + builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + } + } + } + } + } + } + + let graph = builder.build(); + + Ok(DefaultNFA { graph }) } fn remove_epsilon(&mut self) -> Result<(), Error> { @@ -114,7 +344,113 @@ impl Nfa for DefaultNFA { todo!() } - fn nulling(&mut self) -> Result<(), Error> { - todo!() + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { + let mut updated = true; + let mut builder = self.graph.builder_ref(); + + while updated { + updated = false; + + let mut nullable = false; + + let mut to_add = Vec::new(); + + for (source, target) in builder.edges() { + for label in builder.edge_label(source, target)? { + if f(label) { + nullable = true; + + break; + } + } + + if nullable { + for (label, child_iter) in builder.labels_of(target)? { + for child in child_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + + for (source, child, label) in to_add { + builder.add_edge(source, child, label)?; + } + } + + self.graph.replace_by_builder(builder); + + Ok(()) + } +} + +impl Display for DefaultNFA { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + Debug::fmt(self, f) + } +} + +#[cfg(test)] +mod test_to_nfa { + #![allow(unused_imports)] + use super::*; + use crate::SoC; + + use crate::{ + default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + desrec::DesRec, + }; + + fn new_regex() -> Result, ParseError> { + let parser = DefaultRegParser::::default(); + + fn test_scanner( + _parser: &DefaultRegParser, + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + Ok( + DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? + .unwrap_or(Default::default()) + .0, + ) + } + + #[test] + fn test_to_nfa() -> Result<(), Box> { + let regex = new_regex()?; + + println!("regex = {regex}"); + + let nfa: DefaultNFA> = + DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) } } diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index a60f801..2670e32 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; -use std::fmt::{Debug, Display}; +use std::{ + collections::HashMap, + fmt::{Debug, Display, Write}, + marker::PhantomData, +}; /// The type of a node in a regular expression. /// @@ -25,7 +29,7 @@ use std::fmt::{Debug, Display}; /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum RegexType { +pub enum RegexType { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals @@ -44,13 +48,31 @@ pub enum RegexType { Lit(T), } +impl Display for RegexType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RegexType::Kleene => write!(f, "*"), + RegexType::Plus => write!(f, "+"), + RegexType::Optional => write!(f, "?"), + RegexType::Or => write!(f, "|"), + RegexType::Paren => write!(f, "()"), + RegexType::Empty => write!(f, "empty"), + RegexType::Lit(value) => write!(f, "{value}"), + } + } +} + /// A default implementation of regular expressions. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DefaultRegex { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec>, + /// The root of the graph. + /// + /// If it is None, it indicates the regular expression is empty. + root: Option, } impl Default for DefaultRegex { @@ -58,11 +80,39 @@ impl Default for DefaultRegex { Self { graph: Default::default(), types: Default::default(), + root: Default::default(), } } } -impl DefaultRegex { +impl DefaultRegex { + /// Set the root of the regular expression. + #[inline] + pub fn set_root(&mut self, root: Option) { + self.root = root; + } + + /// Construct a regular expression from a raw adjacency list graph + /// and an array of labels. + /// + /// # Error + /// + /// If the graph contains cycles, this function will return an + /// error. This check is added to make sure that the regular + /// expression contains no cycles, otherwise operations with + /// regular expressions might enter infinite loops later. + pub fn new(graph: ALGraph, types: Vec>) -> Result { + if graph.contains_cycles()? { + Err(Error::Cycle) + } else { + Ok(Self { + graph, + types, + root: Some(0), + }) + } + } + /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { @@ -86,10 +136,65 @@ impl DefaultRegex { Ok(()) } + + /// Return the internal graph. + /// + /// Normally this should not be used. + pub fn internal_graph(&self) -> ALGraph { + self.graph.clone() + } + + /// Return the internal types array. + /// + /// Normally this should not be used. + pub fn internal_types(&self) -> Vec> { + self.types.clone() + } + + /// Return the array of parents. + /// + /// The element at index N of the returned array is the parent of + /// that N-th element. If an element has no parents, a `None` is + /// placed at its place. + /// + /// # Error + /// + /// If some edge points to an invalid node, an erro will be + /// returned. + /// + /// Also, if the graph contains cycles, an error is also returned. + pub fn parents_array(&self) -> Result>, Error> { + if self.graph.contains_cycles().map_err(|e| match e { + GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n), + _ => unreachable!(), + })? { + return Err(Error::Cycle); + } + + let len = self.len(); + + let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect(); + + for source in self.graph.nodes() { + for (index, target) in self + .graph + .children_of(source) + .map_err(|_| Error::UnknownNode(source))? + .enumerate() + { + result + .get_mut(target) + .ok_or(Error::UnknownNode(target))? + .replace((source, index)); + } + } + + Ok(result) + } } // REVIEW: This may not be needed. -impl Cata, A> for &DefaultRegex +impl Cata, A> for &DefaultRegex where A: Algebra>, { @@ -118,8 +223,13 @@ where } } -impl Display for DefaultRegex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl DefaultRegex { + /// Use a function to format the labels of the regular expressions + /// and format the entire regular expression with this aid. + pub fn to_string_with(&self, mut f: F) -> Result + where + F: FnMut(T) -> String, + { #[derive(Copy, Clone)] enum StackElement { Seen(usize), @@ -134,16 +244,15 @@ impl Display for DefaultRegex { } } - fn is_seen(&self) -> bool { - match self { - Seen(_) => true, - Unseen(_) => false, - } + fn is_seen(self) -> bool { + matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; + let mut result = String::new(); + let mut stack: Vec = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); @@ -159,6 +268,12 @@ impl Display for DefaultRegex { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); + + if self.degree(top.index()).unwrap() > 1 { + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); + } + stack.extend( self.graph .children_of(top.index()) @@ -167,7 +282,7 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "*")?; + write!(result, "*")?; } } RegexType::Plus => { @@ -181,7 +296,7 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "+")?; + write!(result, "+")?; } } RegexType::Optional => { @@ -195,12 +310,12 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "?")?; + write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { - write!(f, "(")?; + write!(result, "(")?; let len = self.len(); @@ -221,11 +336,11 @@ impl Display for DefaultRegex { } } } else { - write!(f, "|")?; + write!(result, "|")?; } } RegexType::Paren => { - write!(f, ")")?; + write!(result, ")")?; } RegexType::Empty => { stack.extend( @@ -236,11 +351,17 @@ impl Display for DefaultRegex { .rev(), ); } - RegexType::Lit(label) => write!(f, "{label}")?, + RegexType::Lit(label) => write!(result, "{}", f(label))?, } } - Ok(()) + Ok(result) + } +} + +impl Display for DefaultRegex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string_with(|t| format!("{t}"))?) } } @@ -278,9 +399,20 @@ impl Graph for DefaultRegex { fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl graph::Builder) { + unimplemented!() + } } -impl Regex> for DefaultRegex { +impl Regex> for DefaultRegex { + /// Return the root of the regular expression. + #[inline] + fn root(&self) -> Option { + self.root + } + #[inline] fn vertex_label(&self, node_id: usize) -> Result, Error> { self.types @@ -291,8 +423,10 @@ impl Regex> for DefaultRegex { } /// An error type for holding parsing errors. -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum ParseError { + /// A cycle is encountered. + Cycle, /// Encounter an invalid state. Invalid, /// An error from graph operations. @@ -354,23 +488,77 @@ impl Display for ParseDirection { } } -impl DesRec for DefaultRegex { +/// A default recursive descent parser for regular expressions of +/// terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct DefaultRegParser { + ter_map: HashMap, + non_map: HashMap, + _phantom: PhantomData, +} + +impl DefaultRegParser { + /// Query if a terminal or a non-terminal is already found. + /// + /// If found, return the associated index of the terminal or + /// non-terminal. + pub fn query(&self, tnt: &str, terminal_p: bool) -> Option { + if terminal_p { + self.ter_map.get(tnt).copied() + } else { + self.non_map.get(tnt).copied() + } + } + + /// Add a terminal or a non-terminal. + pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) { + if terminal_p { + let ter_len = self.ter_map.len(); + + self.ter_map.entry(tnt.to_owned()).or_insert(ter_len); + } else { + let non_len = self.non_map.len(); + + self.non_map.entry(tnt.to_owned()).or_insert(non_len); + } + } +} + +impl Default for DefaultRegParser { + fn default() -> Self { + Self { + ter_map: Default::default(), + non_map: Default::default(), + _phantom: PhantomData, + } + } +} + +impl DesRec for DefaultRegParser { type Label = RegexType; - type Regex = Self; + type Regex = DefaultRegex; type Error = ParseError; - type Aux = ParseDirection; + type Inter = ParseDirection; - type Scanner<'a> = - Box Result, Self::Error>>; + type Scanner<'a, 'b> = Box< + dyn FnMut( + &'b Self, + &'a str, + ) -> Result, Self::Error>, + > where RegexType:'b; - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, mut input: &'a str, - mut scanner: Self::Scanner<'a>, + mut scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result, &'a str)>, Self::Error> { + ) -> Result, &'a str)>, Self::Error> + where + Self::Label: 'b, + { use ParseDirection::*; use RegexType::*; @@ -380,7 +568,7 @@ impl DesRec for DefaultRegex { // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { - if let Some((len, label, direction)) = scanner(input)? { + if let Some((len, label, direction)) = scanner(self, input)? { if len == 0 { break; } @@ -523,7 +711,12 @@ impl DesRec for DefaultRegex { let graph = list_of_children.into(); - let result = DefaultRegex { graph, types }; + // Check there are no cycles + let result = DefaultRegex::new(graph, types).map_err(|e| match e { + Error::Graph(ge) => ParseError::Graph(ge), + Error::Cycle => ParseError::Cycle, + _ => unreachable!(), + })?; Ok(Some((result, input))) } @@ -533,9 +726,17 @@ impl DesRec for DefaultRegex { mod test_des_rec { use super::*; - fn test_scanner( - input: &str, - ) -> Result, ParseDirection)>, ParseError> { + use crate::desrec::DesRec; + + #[allow(dead_code, unused)] + fn test_scanner<'a, 'b, T>( + parser: &'b DefaultRegParser, + input: &'a str, + ) -> Result, ParseDirection)>, ParseError> + where + T: GraphLabel + Display + Debug, + T: 'b, + { use ParseDirection::*; use RegexType::*; @@ -557,19 +758,57 @@ mod test_des_rec { #[test] fn test_des_rec() -> Result<(), Box> { - let input_string = "a*b?c+|(d*| +)?".to_owned(); + let input_string = "(ade)*b?c+|(d*| +)?".to_owned(); + + let parser: DefaultRegParser = Default::default(); if let Some((regex, remain)) = - DefaultRegex::::parse(&input_string, Box::new(test_scanner), true)? + DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); + let parents = regex.parents_array()?; + + println!("parents = {parents:?}"); + Ok(()) } else { unreachable!() } } + + #[test] + fn test_display() -> Result<(), Box> { + use graph::builder::Builder; + use RegexType::*; + + let mut builder = graph::adlist::ALGBuilder::default(); + let mut types: Vec> = Vec::with_capacity(4); + + types.push(Kleene); + builder.add_vertex(); + + types.push(Lit(0)); + builder.add_vertex(); + builder.add_edge(0, 1, ())?; + + types.push(Lit(1)); + builder.add_vertex(); + builder.add_edge(0, 2, ())?; + + types.push(Lit(2)); + builder.add_vertex(); + builder.add_edge(0, 3, ())?; + + let graph = builder.build(); + + let regex = DefaultRegex::new(graph, types)?; + + println!("regex = {regex}"); + + Ok(()) + } } diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs index c57d313..ac45c2d 100644 --- a/nfa/src/desrec.rs +++ b/nfa/src/desrec.rs @@ -20,8 +20,8 @@ pub trait DesRec { /// The type of errors encountered when parsing. type Error: std::error::Error; - /// Auxiliary data when parsing - type Aux; + /// Intermediate data when parsing + type Inter; /// The type of a scanner that classifies inputs into tokens. /// @@ -34,7 +34,14 @@ pub trait DesRec { /// /// - `Ok(Some(_))`: the classification succeeds and the parsing /// should continue. - type Scanner<'a>: FnMut(&'a str) -> Result, Self::Error>; + type Scanner<'a, 'b>: FnMut( + &'b Self, + &'a str, + ) + -> Result, Self::Error> + where + Self: 'b, + Self::Label: 'b; /// Parse a string into a regular expression with the aid of this /// type. @@ -42,9 +49,12 @@ pub trait DesRec { /// Accept a slice of string and return either a parsing error, or /// a pair of correctly parsed regular expression and the /// remaining slice. - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, input: &'a str, - scanner: Self::Scanner<'a>, + scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result, Self::Error>; + ) -> Result, Self::Error> + where + Self::Label: 'b; } diff --git a/nfa/src/error.rs b/nfa/src/error.rs index 0c6bb3c..ad75077 100644 --- a/nfa/src/error.rs +++ b/nfa/src/error.rs @@ -2,10 +2,11 @@ use graph::error::Error as GError; -use std::fmt::{Display, Formatter}; +use core::fmt::{Display, Formatter}; -#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] /// The error type for NFA operations. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[non_exhaustive] pub enum Error { /// An unknown node id is encountered. UnknownNode(usize), @@ -13,8 +14,12 @@ pub enum Error { /// /// Everything is a trade-off, wink wink. UnsupportedOperation, + /// There is no root in the underlying regular expression. + NoRoot, /// This error comes from some underlying graph operation. Graph(GError), + /// A cycle is found when constructing regular expressions. + Cycle, } impl Display for Error { @@ -23,6 +28,8 @@ impl Display for Error { Error::UnknownNode(id) => write!(f, "unknown node: {id}"), Error::UnsupportedOperation => write!(f, "unsupported operation"), Error::Graph(e) => write!(f, "graph error: {e}"), + Error::NoRoot => write!(f, "no root found"), + Error::Cycle => write!(f, "a cycle is found when constructing regular expressions"), } } } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index 1e2a30c..b1d62b3 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -12,17 +12,23 @@ extern crate graph; use core::fmt::Display; +use std::ops::{Deref, DerefMut}; + use graph::{Graph, GraphLabel, LabelGraph}; use error::Error; +pub use desrec::DesRec; + +use default::regex::RegexType; + /// The expected behaviour of a regular language. /// /// Nondeterministic finite automata are equivalent to regular /// languages. Since regular languages are easier to understand for a /// human being, nondeterministic finite automata include the data for /// the equivalent regular languages. -pub trait Regex: Graph + Display { +pub trait Regex: Graph + Display + Clone { /// Return the label of a vertex, or an error if the node is /// invalid. fn vertex_label(&self, node_id: usize) -> Result; @@ -52,6 +58,65 @@ pub trait Regex: Graph + Display { // cetera. These might be needed later. } +/// Since Option does not inherit the Display from T, we wrap it to +/// provide an automatic implementation of Display. +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct DOption(Option); + +impl Default for DOption { + fn default() -> Self { + Self(None) + } +} + +impl Deref for DOption { + type Target = Option; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DOption { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Display for DOption { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.deref() { + Some(value) => Display::fmt(value, f), + None => write!(f, "ε"), + } + } +} + +/// Substitute or Carry +/// +/// This enumeration indicates whether a label from a regular +/// expression should be substituted by another regular expression, or +/// to be carried around in the resulting nondeterministic finite +/// automaton, in the process of the [`to_nfa`][Nfa::to_nfa] function. +/// +/// # Transform labels +/// +/// The label that is returned to be carried can be different from the +/// original label, as a way to transform the labels. +/// +/// # Remark on the abbreviation +/// +/// It happens "by chance" that this abbreviation coincides with the +/// abbreviation of "system on chip". Since I know nothing about this +/// topic, this is just a meaningless pun. +#[derive(Debug, Copy, Clone)] +pub enum SoC { + /// To be substituted by another regular expression. + Sub(usize), + /// To carry around this label. + Carry(T), +} + /// The expected behvaiour of a nondeterministic finite automaton. /// /// Every NFA is a special labelled graph. @@ -68,9 +133,23 @@ pub trait Nfa: LabelGraph { Err(Error::UnsupportedOperation) } - /// Build a nondeterministic finite automaton out of a regular - /// language. - fn to_nfa(regex: impl Regex) -> Self; + /// When we convert a regular expression to a nondeterministic + /// finite automaton, the label should be optional, so we use a + /// different type for the result type. + type FromRegex; + + /// Build a nondeterministic finite automaton out of a set of + /// regular expressions. + /// + /// The SUB_PRED is a predicate function that returns an optional + /// index for a label. If it returns some index, then this means + /// we should substitute the index-th regular expression in the + /// set, whereever that label occurs in the set of regular + /// expressions. + fn to_nfa( + regexps: &[impl Regex>], + sub_pred: impl Fn(T) -> Result, Error>, + ) -> Result>, Error>; /// Remove all dead states from the nondeterministic finite /// automaton. @@ -78,13 +157,14 @@ pub trait Nfa: LabelGraph { /// A state is dead if there are no edges going to the state. fn remove_dead(&mut self) -> Result<(), Error>; - /// For each empty transition from A to B, and for every edge from - /// B to C, say, add an edge from A to C. + /// For each edge from A to B whose edge is considered nullable by + /// a function `f`, and for every edge from B to C, add an edge + /// from A to C. /// /// This is needed specifically by the algorithm to produce a set /// of atomic languages that represent "null-closed" derived /// languages. - fn nulling(&mut self) -> Result<(), Error>; + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>; } pub mod default; -- cgit v1.2.3-18-g5258