From bdbd4b4dc21af09711c97d3f903877443199af06 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Tue, 3 Jan 2023 23:44:02 +0800 Subject: structural change: separate crates out I put functionalities that are not strictly core to separate crates, so that the whole package becomes more modular, and makes it easier to try other parsing algorithms in the future. Also I have to figure the forests out before finishing the core chain-rule algorithm, as the part about forests affects the labels of the grammars directly. From my experiences in writing the previous version, it is asking for trouble to change the labels type dramatically at a later point: too many places need to be changed. Thus I decide to figure the rough part of forests out. Actually I only have to figure out how to attach forests fragments to edges of the underlying atomic languages, and the more complex parts of putting forests together can be left to the recorders, which is my vision of assembling semi-ring values during the chain-rule machine. It should be relatively easy to produce forests fragments from grammars since we are just trying to extract some information from the grammar, not to manipulate those information in some complicated way. We have to do some manipulations in the process, though, in order to make sure that the nulling and epsilon-removal processes do not invalidate these fragments. --- nfa/src/default/mod.rs | 2 +- nfa/src/default/nfa.rs | 160 ++++++++++++++++++++++++++++++++++++++--------- nfa/src/default/regex.rs | 20 ++++-- 3 files changed, 146 insertions(+), 36 deletions(-) (limited to 'nfa/src/default') diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index b9ee398..115bea5 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,5 +1,5 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`NFA`][super::Nfa] and another that implements the trait //! [`Regex`][super::Regex]. pub mod nfa; diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 229ba4d..e642218 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,17 +1,8 @@ //! This file provides a default implementation of NFA. -// TODO: The current focus of the project is to understand the growth -// rate of the algorithm, to know whether I made a mistake in the -// previous iteration of the implementation, or the algorithm is not -// as fast as the author estimated, which is not quite likely, of -// course. -// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. - use graph::{ builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, - LabelGraph, + LabelExtGraph, LabelGraph, }; use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; @@ -32,6 +23,11 @@ impl Default for DefaultNFA { } } +// Some boiler-plate delegation implementations. +// +// I deliberately avoid using [`Deref`][std::ops::Deref] here, as I do +// not want to dereference an NFA to a Graph. + impl Graph for DefaultNFA { type Iter<'a> = as Graph>::Iter<'a> where T: 'a; @@ -81,6 +77,11 @@ impl LabelGraph for DefaultNFA { type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; + type EdgeLabelIter<'a> = as LabelGraph>::EdgeLabelIter<'a> + where + Self: 'a, + T: 'a; + #[inline] fn vertex_label(&self, node_id: usize) -> Result, GError> { if self.has_node(node_id) { @@ -91,7 +92,7 @@ impl LabelGraph for DefaultNFA { } #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result, GError> { + fn edge_label(&self, source: usize, target: usize) -> Result, GError> { self.graph.edge_label(source, target) } @@ -115,12 +116,24 @@ impl LabelGraph for DefaultNFA { } } +impl LabelExtGraph for DefaultNFA { + #[inline] + fn extend(&mut self, edges: impl IntoIterator) -> Result { + self.graph.extend(edges) + } +} + +// TODO: Define a type for the labels of DefaultNFA. + impl Nfa for DefaultNFA { type FromRegex = DefaultNFA; + // Reminder: This is an adopted version of Thompson's + // construction. fn to_nfa( regexps: &[impl Regex>], sub_pred: impl Fn(T) -> Result, Error>, + default: Option, ) -> Result>, Error> { let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); @@ -128,7 +141,7 @@ impl Nfa for DefaultNFA { return Ok(Default::default()); } - // We reserve two more rooms for later uses. + // We reserve two more rooms for the default edge. let nfa_len = total_regexps_len * 2 + 2; let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); @@ -141,20 +154,18 @@ impl Nfa for DefaultNFA { builder.add_vertex(); } + // add a default edge + builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + let accumulators: Vec = { - let mut result = Vec::with_capacity(regexps.len() + 1); + let mut result = Vec::with_capacity(regexps.len()); result.push(0); - let mut accumulator = 0; - - for regexp in regexps.iter() { - accumulator += regexp.nodes_len() * 2; - result.push(accumulator); + for regexp in regexps.iter().take(regexps.len() - 1) { + result.push(result.last().unwrap() + regexp.nodes_len() * 2); } - result.pop(); - result }; @@ -313,9 +324,8 @@ impl Nfa for DefaultNFA { SoC::Sub(sub_non) => { // a non-terminal - let sub_offset = get_offset_safe!(sub_non); - let sub_nfa_start = sub_offset + 2 * sub_non; - let sub_nfa_end = sub_offset + 2 * sub_non + 1; + let sub_nfa_start = get_offset_safe!(sub_non); + let sub_nfa_end = sub_nfa_start + 1; builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; @@ -336,14 +346,102 @@ impl Nfa for DefaultNFA { Ok(DefaultNFA { graph }) } - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() + fn remove_epsilon bool>(&mut self, f: F) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut updated = true; + + let nodes_len = self.nodes_len(); + + while updated { + updated = false; + + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); + + for source in builder.nodes() { + for (label, target_iter) in builder.labels_of(source)? { + if f(*label) { + // empty label found + for target in target_iter { + for (label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + } + } + + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all epsilon edges + + let sources_and_targets: Vec<_> = builder.edges().collect(); + + for (source, target) in sources_and_targets.into_iter() { + builder.remove_edge(source, target, &f)?; + } + + self.graph = builder.build(); + + Ok(()) } - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut changed = true; + + // Use a counting vector to avoid calculating which nodes are + // dead at each iteration. + let mut target_counts: Vec = + std::iter::repeat(0).take(self.graph.nodes_len()).collect(); + + for (_, target) in self.graph.edges() { + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? += 1; + } + + while changed { + changed = false; + + for node in self.nodes() { + let deadp = !matches!(target_counts.get(node).copied(), Some(n) if n > 0); + + if deadp && !reserve(node) { + let to_remove: Vec = builder.children_of(node)?.collect(); + + if !to_remove.is_empty() { + changed = true; + + for target in to_remove { + builder.remove_edge(node, target, |_| true)?; + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? -= 1; + } + } + } + } + } + + self.graph = builder.build(); + + Ok(()) } + // REVIEW: Combine nulling and remove_epsilon into the same + // method, or not. + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; let mut builder = self.graph.builder_ref(); @@ -396,7 +494,6 @@ impl Display for DefaultNFA { #[cfg(test)] mod test_to_nfa { - #![allow(unused_imports)] use super::*; use crate::SoC; @@ -446,8 +543,11 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA> = - DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + let nfa: DefaultNFA> = DefaultNFA::to_nfa( + &[regex], + |label| Ok(SoC::Carry(label)), + Some(char::default()), + )?; nfa.print_viz("nfa.gv")?; diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index 2670e32..c8ad370 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -260,9 +260,7 @@ impl DefaultRegex { stack.push(Unseen(0)); while let Some(top) = stack.pop() { - let node_type = types[top.index()]; - - // TODO: Do not use unwrap here. + let node_type = types.get(top.index()).unwrap(); match node_type { RegexType::Kleene => { @@ -350,8 +348,12 @@ impl DefaultRegex { .map(Unseen) .rev(), ); + + if self.graph.is_empty_node(top.index()).unwrap() { + write!(result, "ε")?; + } } - RegexType::Lit(label) => write!(result, "{}", f(label))?, + RegexType::Lit(label) => write!(result, "{}", f(*label))?, } } @@ -452,7 +454,15 @@ impl Display for ParseError { } } -impl std::error::Error for ParseError {} +impl std::error::Error for ParseError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let ParseError::Graph(gerr) = self { + Some(gerr) + } else { + None + } + } +} impl From for ParseError { fn from(ge: GError) -> Self { -- cgit v1.2.3-18-g5258