diff options
author | JSDurand <mmemmew@gmail.com> | 2023-01-03 23:44:02 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-01-03 23:44:02 +0800 |
commit | bdbd4b4dc21af09711c97d3f903877443199af06 (patch) | |
tree | c6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /nfa/src/default | |
parent | 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff) |
structural change: separate crates out
I put functionalities that are not strictly core to separate crates,
so that the whole package becomes more modular, and makes it easier to
try other parsing algorithms in the future.
Also I have to figure the forests out before finishing the core
chain-rule algorithm, as the part about forests affects the labels of
the grammars directly. From my experiences in writing the previous
version, it is asking for trouble to change the labels type
dramatically at a later point: too many places need to be changed.
Thus I decide to figure the rough part of forests out.
Actually I only have to figure out how to attach forests fragments to
edges of the underlying atomic languages, and the more complex parts
of putting forests together can be left to the recorders, which is my
vision of assembling semi-ring values during the chain-rule machine.
It should be relatively easy to produce forests fragments from
grammars since we are just trying to extract some information from the
grammar, not to manipulate those information in some complicated way.
We have to do some manipulations in the process, though, in order to
make sure that the nulling and epsilon-removal processes do not
invalidate these fragments.
Diffstat (limited to 'nfa/src/default')
-rw-r--r-- | nfa/src/default/mod.rs | 2 | ||||
-rw-r--r-- | nfa/src/default/nfa.rs | 160 | ||||
-rw-r--r-- | nfa/src/default/regex.rs | 20 |
3 files changed, 146 insertions, 36 deletions
diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index b9ee398..115bea5 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,5 +1,5 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`NFA`][super::Nfa] and another that implements the trait //! [`Regex`][super::Regex]. pub mod nfa; diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 229ba4d..e642218 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,17 +1,8 @@ //! This file provides a default implementation of NFA. -// TODO: The current focus of the project is to understand the growth -// rate of the algorithm, to know whether I made a mistake in the -// previous iteration of the implementation, or the algorithm is not -// as fast as the author estimated, which is not quite likely, of -// course. -// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. - use graph::{ builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, - LabelGraph, + LabelExtGraph, LabelGraph, }; use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; @@ -32,6 +23,11 @@ impl<T: GraphLabel + Display> Default for DefaultNFA<T> { } } +// Some boiler-plate delegation implementations. +// +// I deliberately avoid using [`Deref`][std::ops::Deref] here, as I do +// not want to dereference an NFA to a Graph. + impl<T: GraphLabel + Display> Graph for DefaultNFA<T> { type Iter<'a> = <DLGraph<T> as Graph>::Iter<'a> where T: 'a; @@ -81,6 +77,11 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a; + type EdgeLabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::EdgeLabelIter<'a> + where + Self: 'a, + T: 'a; + #[inline] fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> { if self.has_node(node_id) { @@ -91,7 +92,7 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { } #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, GError> { + fn edge_label(&self, source: usize, target: usize) -> Result<Self::EdgeLabelIter<'_>, GError> { self.graph.edge_label(source, target) } @@ -115,12 +116,24 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> { } } +impl<T: GraphLabel + Display> LabelExtGraph<T> for DefaultNFA<T> { + #[inline] + fn extend(&mut self, edges: impl IntoIterator<Item = (T, usize)>) -> Result<usize, GError> { + self.graph.extend(edges) + } +} + +// TODO: Define a type for the labels of DefaultNFA. + impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>; + // Reminder: This is an adopted version of Thompson's + // construction. fn to_nfa( regexps: &[impl Regex<RegexType<T>>], sub_pred: impl Fn(T) -> Result<SoC<T>, Error>, + default: Option<T>, ) -> Result<Self::FromRegex<DOption<T>>, Error> { let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); @@ -128,7 +141,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { return Ok(Default::default()); } - // We reserve two more rooms for later uses. + // We reserve two more rooms for the default edge. let nfa_len = total_regexps_len * 2 + 2; let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len); @@ -141,20 +154,18 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { builder.add_vertex(); } + // add a default edge + builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + let accumulators: Vec<usize> = { - let mut result = Vec::with_capacity(regexps.len() + 1); + let mut result = Vec::with_capacity(regexps.len()); result.push(0); - let mut accumulator = 0; - - for regexp in regexps.iter() { - accumulator += regexp.nodes_len() * 2; - result.push(accumulator); + for regexp in regexps.iter().take(regexps.len() - 1) { + result.push(result.last().unwrap() + regexp.nodes_len() * 2); } - result.pop(); - result }; @@ -313,9 +324,8 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { SoC::Sub(sub_non) => { // a non-terminal - let sub_offset = get_offset_safe!(sub_non); - let sub_nfa_start = sub_offset + 2 * sub_non; - let sub_nfa_end = sub_offset + 2 * sub_non + 1; + let sub_nfa_start = get_offset_safe!(sub_non); + let sub_nfa_end = sub_nfa_start + 1; builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; @@ -336,14 +346,102 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { Ok(DefaultNFA { graph }) } - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() + fn remove_epsilon<F: Fn(T) -> bool>(&mut self, f: F) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut updated = true; + + let nodes_len = self.nodes_len(); + + while updated { + updated = false; + + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); + + for source in builder.nodes() { + for (label, target_iter) in builder.labels_of(source)? { + if f(*label) { + // empty label found + for target in target_iter { + for (label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + } + } + + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all epsilon edges + + let sources_and_targets: Vec<_> = builder.edges().collect(); + + for (source, target) in sources_and_targets.into_iter() { + builder.remove_edge(source, target, &f)?; + } + + self.graph = builder.build(); + + Ok(()) } - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut changed = true; + + // Use a counting vector to avoid calculating which nodes are + // dead at each iteration. + let mut target_counts: Vec<usize> = + std::iter::repeat(0).take(self.graph.nodes_len()).collect(); + + for (_, target) in self.graph.edges() { + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? += 1; + } + + while changed { + changed = false; + + for node in self.nodes() { + let deadp = !matches!(target_counts.get(node).copied(), Some(n) if n > 0); + + if deadp && !reserve(node) { + let to_remove: Vec<usize> = builder.children_of(node)?.collect(); + + if !to_remove.is_empty() { + changed = true; + + for target in to_remove { + builder.remove_edge(node, target, |_| true)?; + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? -= 1; + } + } + } + } + } + + self.graph = builder.build(); + + Ok(()) } + // REVIEW: Combine nulling and remove_epsilon into the same + // method, or not. + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; let mut builder = self.graph.builder_ref(); @@ -396,7 +494,6 @@ impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> { #[cfg(test)] mod test_to_nfa { - #![allow(unused_imports)] use super::*; use crate::SoC; @@ -446,8 +543,11 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA<DOption<char>> = - DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + let nfa: DefaultNFA<DOption<char>> = DefaultNFA::to_nfa( + &[regex], + |label| Ok(SoC::Carry(label)), + Some(char::default()), + )?; nfa.print_viz("nfa.gv")?; diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index 2670e32..c8ad370 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -260,9 +260,7 @@ impl<T: GraphLabel> DefaultRegex<T> { stack.push(Unseen(0)); while let Some(top) = stack.pop() { - let node_type = types[top.index()]; - - // TODO: Do not use unwrap here. + let node_type = types.get(top.index()).unwrap(); match node_type { RegexType::Kleene => { @@ -350,8 +348,12 @@ impl<T: GraphLabel> DefaultRegex<T> { .map(Unseen) .rev(), ); + + if self.graph.is_empty_node(top.index()).unwrap() { + write!(result, "ε")?; + } } - RegexType::Lit(label) => write!(result, "{}", f(label))?, + RegexType::Lit(label) => write!(result, "{}", f(*label))?, } } @@ -452,7 +454,15 @@ impl Display for ParseError { } } -impl std::error::Error for ParseError {} +impl std::error::Error for ParseError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let ParseError::Graph(gerr) = self { + Some(gerr) + } else { + None + } + } +} impl From<GError> for ParseError { fn from(ge: GError) -> Self { |