From 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 23 Dec 2022 00:36:31 +0800 Subject: renaming core to chain and some other changes Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations. --- nfa/src/default/regex.rs | 313 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 276 insertions(+), 37 deletions(-) (limited to 'nfa/src/default/regex.rs') diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index a60f801..2670e32 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; -use std::fmt::{Debug, Display}; +use std::{ + collections::HashMap, + fmt::{Debug, Display, Write}, + marker::PhantomData, +}; /// The type of a node in a regular expression. /// @@ -25,7 +29,7 @@ use std::fmt::{Debug, Display}; /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum RegexType { +pub enum RegexType { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals @@ -44,13 +48,31 @@ pub enum RegexType { Lit(T), } +impl Display for RegexType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RegexType::Kleene => write!(f, "*"), + RegexType::Plus => write!(f, "+"), + RegexType::Optional => write!(f, "?"), + RegexType::Or => write!(f, "|"), + RegexType::Paren => write!(f, "()"), + RegexType::Empty => write!(f, "empty"), + RegexType::Lit(value) => write!(f, "{value}"), + } + } +} + /// A default implementation of regular expressions. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DefaultRegex { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec>, + /// The root of the graph. + /// + /// If it is None, it indicates the regular expression is empty. + root: Option, } impl Default for DefaultRegex { @@ -58,11 +80,39 @@ impl Default for DefaultRegex { Self { graph: Default::default(), types: Default::default(), + root: Default::default(), } } } -impl DefaultRegex { +impl DefaultRegex { + /// Set the root of the regular expression. + #[inline] + pub fn set_root(&mut self, root: Option) { + self.root = root; + } + + /// Construct a regular expression from a raw adjacency list graph + /// and an array of labels. + /// + /// # Error + /// + /// If the graph contains cycles, this function will return an + /// error. This check is added to make sure that the regular + /// expression contains no cycles, otherwise operations with + /// regular expressions might enter infinite loops later. + pub fn new(graph: ALGraph, types: Vec>) -> Result { + if graph.contains_cycles()? { + Err(Error::Cycle) + } else { + Ok(Self { + graph, + types, + root: Some(0), + }) + } + } + /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { @@ -86,10 +136,65 @@ impl DefaultRegex { Ok(()) } + + /// Return the internal graph. + /// + /// Normally this should not be used. + pub fn internal_graph(&self) -> ALGraph { + self.graph.clone() + } + + /// Return the internal types array. + /// + /// Normally this should not be used. + pub fn internal_types(&self) -> Vec> { + self.types.clone() + } + + /// Return the array of parents. + /// + /// The element at index N of the returned array is the parent of + /// that N-th element. If an element has no parents, a `None` is + /// placed at its place. + /// + /// # Error + /// + /// If some edge points to an invalid node, an erro will be + /// returned. + /// + /// Also, if the graph contains cycles, an error is also returned. + pub fn parents_array(&self) -> Result>, Error> { + if self.graph.contains_cycles().map_err(|e| match e { + GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n), + _ => unreachable!(), + })? { + return Err(Error::Cycle); + } + + let len = self.len(); + + let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect(); + + for source in self.graph.nodes() { + for (index, target) in self + .graph + .children_of(source) + .map_err(|_| Error::UnknownNode(source))? + .enumerate() + { + result + .get_mut(target) + .ok_or(Error::UnknownNode(target))? + .replace((source, index)); + } + } + + Ok(result) + } } // REVIEW: This may not be needed. -impl Cata, A> for &DefaultRegex +impl Cata, A> for &DefaultRegex where A: Algebra>, { @@ -118,8 +223,13 @@ where } } -impl Display for DefaultRegex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl DefaultRegex { + /// Use a function to format the labels of the regular expressions + /// and format the entire regular expression with this aid. + pub fn to_string_with(&self, mut f: F) -> Result + where + F: FnMut(T) -> String, + { #[derive(Copy, Clone)] enum StackElement { Seen(usize), @@ -134,16 +244,15 @@ impl Display for DefaultRegex { } } - fn is_seen(&self) -> bool { - match self { - Seen(_) => true, - Unseen(_) => false, - } + fn is_seen(self) -> bool { + matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; + let mut result = String::new(); + let mut stack: Vec = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); @@ -159,6 +268,12 @@ impl Display for DefaultRegex { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); + + if self.degree(top.index()).unwrap() > 1 { + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); + } + stack.extend( self.graph .children_of(top.index()) @@ -167,7 +282,7 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "*")?; + write!(result, "*")?; } } RegexType::Plus => { @@ -181,7 +296,7 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "+")?; + write!(result, "+")?; } } RegexType::Optional => { @@ -195,12 +310,12 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "?")?; + write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { - write!(f, "(")?; + write!(result, "(")?; let len = self.len(); @@ -221,11 +336,11 @@ impl Display for DefaultRegex { } } } else { - write!(f, "|")?; + write!(result, "|")?; } } RegexType::Paren => { - write!(f, ")")?; + write!(result, ")")?; } RegexType::Empty => { stack.extend( @@ -236,11 +351,17 @@ impl Display for DefaultRegex { .rev(), ); } - RegexType::Lit(label) => write!(f, "{label}")?, + RegexType::Lit(label) => write!(result, "{}", f(label))?, } } - Ok(()) + Ok(result) + } +} + +impl Display for DefaultRegex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string_with(|t| format!("{t}"))?) } } @@ -278,9 +399,20 @@ impl Graph for DefaultRegex { fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl graph::Builder) { + unimplemented!() + } } -impl Regex> for DefaultRegex { +impl Regex> for DefaultRegex { + /// Return the root of the regular expression. + #[inline] + fn root(&self) -> Option { + self.root + } + #[inline] fn vertex_label(&self, node_id: usize) -> Result, Error> { self.types @@ -291,8 +423,10 @@ impl Regex> for DefaultRegex { } /// An error type for holding parsing errors. -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum ParseError { + /// A cycle is encountered. + Cycle, /// Encounter an invalid state. Invalid, /// An error from graph operations. @@ -354,23 +488,77 @@ impl Display for ParseDirection { } } -impl DesRec for DefaultRegex { +/// A default recursive descent parser for regular expressions of +/// terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct DefaultRegParser { + ter_map: HashMap, + non_map: HashMap, + _phantom: PhantomData, +} + +impl DefaultRegParser { + /// Query if a terminal or a non-terminal is already found. + /// + /// If found, return the associated index of the terminal or + /// non-terminal. + pub fn query(&self, tnt: &str, terminal_p: bool) -> Option { + if terminal_p { + self.ter_map.get(tnt).copied() + } else { + self.non_map.get(tnt).copied() + } + } + + /// Add a terminal or a non-terminal. + pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) { + if terminal_p { + let ter_len = self.ter_map.len(); + + self.ter_map.entry(tnt.to_owned()).or_insert(ter_len); + } else { + let non_len = self.non_map.len(); + + self.non_map.entry(tnt.to_owned()).or_insert(non_len); + } + } +} + +impl Default for DefaultRegParser { + fn default() -> Self { + Self { + ter_map: Default::default(), + non_map: Default::default(), + _phantom: PhantomData, + } + } +} + +impl DesRec for DefaultRegParser { type Label = RegexType; - type Regex = Self; + type Regex = DefaultRegex; type Error = ParseError; - type Aux = ParseDirection; + type Inter = ParseDirection; - type Scanner<'a> = - Box Result, Self::Error>>; + type Scanner<'a, 'b> = Box< + dyn FnMut( + &'b Self, + &'a str, + ) -> Result, Self::Error>, + > where RegexType:'b; - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, mut input: &'a str, - mut scanner: Self::Scanner<'a>, + mut scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result, &'a str)>, Self::Error> { + ) -> Result, &'a str)>, Self::Error> + where + Self::Label: 'b, + { use ParseDirection::*; use RegexType::*; @@ -380,7 +568,7 @@ impl DesRec for DefaultRegex { // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { - if let Some((len, label, direction)) = scanner(input)? { + if let Some((len, label, direction)) = scanner(self, input)? { if len == 0 { break; } @@ -523,7 +711,12 @@ impl DesRec for DefaultRegex { let graph = list_of_children.into(); - let result = DefaultRegex { graph, types }; + // Check there are no cycles + let result = DefaultRegex::new(graph, types).map_err(|e| match e { + Error::Graph(ge) => ParseError::Graph(ge), + Error::Cycle => ParseError::Cycle, + _ => unreachable!(), + })?; Ok(Some((result, input))) } @@ -533,9 +726,17 @@ impl DesRec for DefaultRegex { mod test_des_rec { use super::*; - fn test_scanner( - input: &str, - ) -> Result, ParseDirection)>, ParseError> { + use crate::desrec::DesRec; + + #[allow(dead_code, unused)] + fn test_scanner<'a, 'b, T>( + parser: &'b DefaultRegParser, + input: &'a str, + ) -> Result, ParseDirection)>, ParseError> + where + T: GraphLabel + Display + Debug, + T: 'b, + { use ParseDirection::*; use RegexType::*; @@ -557,19 +758,57 @@ mod test_des_rec { #[test] fn test_des_rec() -> Result<(), Box> { - let input_string = "a*b?c+|(d*| +)?".to_owned(); + let input_string = "(ade)*b?c+|(d*| +)?".to_owned(); + + let parser: DefaultRegParser = Default::default(); if let Some((regex, remain)) = - DefaultRegex::::parse(&input_string, Box::new(test_scanner), true)? + DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); + let parents = regex.parents_array()?; + + println!("parents = {parents:?}"); + Ok(()) } else { unreachable!() } } + + #[test] + fn test_display() -> Result<(), Box> { + use graph::builder::Builder; + use RegexType::*; + + let mut builder = graph::adlist::ALGBuilder::default(); + let mut types: Vec> = Vec::with_capacity(4); + + types.push(Kleene); + builder.add_vertex(); + + types.push(Lit(0)); + builder.add_vertex(); + builder.add_edge(0, 1, ())?; + + types.push(Lit(1)); + builder.add_vertex(); + builder.add_edge(0, 2, ())?; + + types.push(Lit(2)); + builder.add_vertex(); + builder.add_edge(0, 3, ())?; + + let graph = builder.build(); + + let regex = DefaultRegex::new(graph, types)?; + + println!("regex = {regex}"); + + Ok(()) + } } -- cgit v1.2.3-18-g5258