From 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 23 Dec 2022 00:36:31 +0800 Subject: renaming core to chain and some other changes Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations. --- nfa/src/default/nfa.rs | 374 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 355 insertions(+), 19 deletions(-) (limited to 'nfa/src/default/nfa.rs') diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 3c2bd83..229ba4d 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,25 +1,26 @@ //! This file provides a default implementation of NFA. -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. +// TODO: The current focus of the project is to understand the growth +// rate of the algorithm, to know whether I made a mistake in the +// previous iteration of the implementation, or the algorithm is not +// as fast as the author estimated, which is not quite likely, of +// course. // // Thus I shall establish a friendly interface that allows me to view // and debug the atomic languages and the languages, transparently. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +use graph::{ + builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, + LabelGraph, +}; -use crate::{error::Error, Nfa, Regex}; +use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; -use std::fmt::Display; +use core::fmt::{Debug, Display}; -#[non_exhaustive] -#[derive(Debug)] /// Default NFA implementation. -pub struct DefaultNFA { +#[derive(Debug, Clone)] +pub struct DefaultNFA { graph: DLGraph, } @@ -63,6 +64,16 @@ impl Graph for DefaultNFA { fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } + + #[inline] + fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + self.graph.print_viz(filename) + } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl Builder) { + unimplemented!() + } } impl LabelGraph for DefaultNFA { @@ -70,11 +81,10 @@ impl LabelGraph for DefaultNFA { type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; - // TODO: Return the label from the contained regular language. #[inline] fn vertex_label(&self, node_id: usize) -> Result, GError> { if self.has_node(node_id) { - todo!() + unimplemented!() } else { Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) } @@ -98,12 +108,232 @@ impl LabelGraph for DefaultNFA { fn labels_of(&self, node_id: usize) -> Result, GError> { self.graph.labels_of(node_id) } + + #[inline] + fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result { + self.graph.has_edge_label(node_id, label, target) + } } impl Nfa for DefaultNFA { - #[allow(unused)] - fn to_nfa(regex: impl Regex) -> Self { - todo!() + type FromRegex = DefaultNFA; + + fn to_nfa( + regexps: &[impl Regex>], + sub_pred: impl Fn(T) -> Result, Error>, + ) -> Result>, Error> { + let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); + + if total_regexps_len == 0 { + return Ok(Default::default()); + } + + // We reserve two more rooms for later uses. + let nfa_len = total_regexps_len * 2 + 2; + + let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); + + // Since DOption implements Copy when T does, we can use a + // variable to hold the empty label to avoid repetitions. + let empty_label: DOption = Default::default(); + + for _ in 0..nfa_len { + builder.add_vertex(); + } + + let accumulators: Vec = { + let mut result = Vec::with_capacity(regexps.len() + 1); + + result.push(0); + + let mut accumulator = 0; + + for regexp in regexps.iter() { + accumulator += regexp.nodes_len() * 2; + result.push(accumulator); + } + + result.pop(); + + result + }; + + assert_eq!(accumulators.len(), regexps.len()); + + /// Get offset from `accumulators` safely. + macro_rules! get_offset_safe ( + ($num:expr) => { + *accumulators.get($num).ok_or(Error::UnknownNode($num))? + } + ); + + /// Get offset from `accumulators` without checking bounds. + macro_rules! get_offset_unsafe ( + ($num:expr) => { + { unsafe { *accumulators.get_unchecked($num) } } + } + ); + + for (index, regex) in regexps.iter().enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + // A regular expression without roots is empty, so we + // skip it. + continue; + }; + + let regex_len = regex.nodes_len(); + + // It is safe here to call `get_offset_unsafe`, as `index` + // is guaranteed to be strictly less than the length of + // `accumulators` by an assertion above. + let offset = get_offset_unsafe!(index); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(root); + + while let Some(top_index) = stack.pop() { + let top_label = regex.vertex_label(top_index)?; + + let nfa_start = offset + 2 * top_index; + let nfa_end = offset + 2 * top_index + 1; + + match top_label { + RegexType::Kleene => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } + } + RegexType::Plus => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Optional => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } + } + RegexType::Or => { + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(nfa_start, child_start, empty_label)?; + builder.add_edge(child_end, nfa_end, empty_label)?; + } + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Paren => { + // Ignore Paren nodes since currently these + // are used only for printing purposes. + } + RegexType::Empty => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Lit(value) => { + // The children would be ignored even if for + // some reason this literal node had some + // children. + + match sub_pred(value)? { + SoC::Sub(sub_non) => { + // a non-terminal + + let sub_offset = get_offset_safe!(sub_non); + let sub_nfa_start = sub_offset + 2 * sub_non; + let sub_nfa_end = sub_offset + 2 * sub_non + 1; + + builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; + builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + } + SoC::Carry(new_value) => { + // a terminal + + builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + } + } + } + } + } + } + + let graph = builder.build(); + + Ok(DefaultNFA { graph }) } fn remove_epsilon(&mut self) -> Result<(), Error> { @@ -114,7 +344,113 @@ impl Nfa for DefaultNFA { todo!() } - fn nulling(&mut self) -> Result<(), Error> { - todo!() + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { + let mut updated = true; + let mut builder = self.graph.builder_ref(); + + while updated { + updated = false; + + let mut nullable = false; + + let mut to_add = Vec::new(); + + for (source, target) in builder.edges() { + for label in builder.edge_label(source, target)? { + if f(label) { + nullable = true; + + break; + } + } + + if nullable { + for (label, child_iter) in builder.labels_of(target)? { + for child in child_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + + for (source, child, label) in to_add { + builder.add_edge(source, child, label)?; + } + } + + self.graph.replace_by_builder(builder); + + Ok(()) + } +} + +impl Display for DefaultNFA { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + Debug::fmt(self, f) + } +} + +#[cfg(test)] +mod test_to_nfa { + #![allow(unused_imports)] + use super::*; + use crate::SoC; + + use crate::{ + default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + desrec::DesRec, + }; + + fn new_regex() -> Result, ParseError> { + let parser = DefaultRegParser::::default(); + + fn test_scanner( + _parser: &DefaultRegParser, + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + Ok( + DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? + .unwrap_or(Default::default()) + .0, + ) + } + + #[test] + fn test_to_nfa() -> Result<(), Box> { + let regex = new_regex()?; + + println!("regex = {regex}"); + + let nfa: DefaultNFA> = + DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) } } -- cgit v1.2.3-18-g5258