summaryrefslogtreecommitdiff
path: root/nfa/src
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-03 23:44:02 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-03 23:44:02 +0800
commitbdbd4b4dc21af09711c97d3f903877443199af06 (patch)
treec6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /nfa/src
parent8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff)
structural change: separate crates out
I put functionalities that are not strictly core to separate crates, so that the whole package becomes more modular, and makes it easier to try other parsing algorithms in the future. Also I have to figure the forests out before finishing the core chain-rule algorithm, as the part about forests affects the labels of the grammars directly. From my experiences in writing the previous version, it is asking for trouble to change the labels type dramatically at a later point: too many places need to be changed. Thus I decide to figure the rough part of forests out. Actually I only have to figure out how to attach forests fragments to edges of the underlying atomic languages, and the more complex parts of putting forests together can be left to the recorders, which is my vision of assembling semi-ring values during the chain-rule machine. It should be relatively easy to produce forests fragments from grammars since we are just trying to extract some information from the grammar, not to manipulate those information in some complicated way. We have to do some manipulations in the process, though, in order to make sure that the nulling and epsilon-removal processes do not invalidate these fragments.
Diffstat (limited to 'nfa/src')
-rw-r--r--nfa/src/default/mod.rs2
-rw-r--r--nfa/src/default/nfa.rs160
-rw-r--r--nfa/src/default/regex.rs20
-rw-r--r--nfa/src/error.rs18
-rw-r--r--nfa/src/lib.rs91
5 files changed, 231 insertions, 60 deletions
diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs
index b9ee398..115bea5 100644
--- a/nfa/src/default/mod.rs
+++ b/nfa/src/default/mod.rs
@@ -1,5 +1,5 @@
//! This file provides a structure that implements the trait
-//! [`NFA`][super::Nfa] and another that umplements the trait
+//! [`NFA`][super::Nfa] and another that implements the trait
//! [`Regex`][super::Regex].
pub mod nfa;
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index 229ba4d..e642218 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -1,17 +1,8 @@
//! This file provides a default implementation of NFA.
-// TODO: The current focus of the project is to understand the growth
-// rate of the algorithm, to know whether I made a mistake in the
-// previous iteration of the implementation, or the algorithm is not
-// as fast as the author estimated, which is not quite likely, of
-// course.
-//
-// Thus I shall establish a friendly interface that allows me to view
-// and debug the atomic languages and the languages, transparently.
-
use graph::{
builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel,
- LabelGraph,
+ LabelExtGraph, LabelGraph,
};
use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
@@ -32,6 +23,11 @@ impl<T: GraphLabel + Display> Default for DefaultNFA<T> {
}
}
+// Some boiler-plate delegation implementations.
+//
+// I deliberately avoid using [`Deref`][std::ops::Deref] here, as I do
+// not want to dereference an NFA to a Graph.
+
impl<T: GraphLabel + Display> Graph for DefaultNFA<T> {
type Iter<'a> = <DLGraph<T> as Graph>::Iter<'a> where T: 'a;
@@ -81,6 +77,11 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a;
+ type EdgeLabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::EdgeLabelIter<'a>
+ where
+ Self: 'a,
+ T: 'a;
+
#[inline]
fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> {
if self.has_node(node_id) {
@@ -91,7 +92,7 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
}
#[inline]
- fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, GError> {
+ fn edge_label(&self, source: usize, target: usize) -> Result<Self::EdgeLabelIter<'_>, GError> {
self.graph.edge_label(source, target)
}
@@ -115,12 +116,24 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
}
}
+impl<T: GraphLabel + Display> LabelExtGraph<T> for DefaultNFA<T> {
+ #[inline]
+ fn extend(&mut self, edges: impl IntoIterator<Item = (T, usize)>) -> Result<usize, GError> {
+ self.graph.extend(edges)
+ }
+}
+
+// TODO: Define a type for the labels of DefaultNFA.
+
impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
+ // Reminder: This is an adopted version of Thompson's
+ // construction.
fn to_nfa(
regexps: &[impl Regex<RegexType<T>>],
sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+ default: Option<T>,
) -> Result<Self::FromRegex<DOption<T>>, Error> {
let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
@@ -128,7 +141,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
return Ok(Default::default());
}
- // We reserve two more rooms for later uses.
+ // We reserve two more rooms for the default edge.
let nfa_len = total_regexps_len * 2 + 2;
let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
@@ -141,20 +154,18 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
builder.add_vertex();
}
+ // add a default edge
+ builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?;
+
let accumulators: Vec<usize> = {
- let mut result = Vec::with_capacity(regexps.len() + 1);
+ let mut result = Vec::with_capacity(regexps.len());
result.push(0);
- let mut accumulator = 0;
-
- for regexp in regexps.iter() {
- accumulator += regexp.nodes_len() * 2;
- result.push(accumulator);
+ for regexp in regexps.iter().take(regexps.len() - 1) {
+ result.push(result.last().unwrap() + regexp.nodes_len() * 2);
}
- result.pop();
-
result
};
@@ -313,9 +324,8 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
SoC::Sub(sub_non) => {
// a non-terminal
- let sub_offset = get_offset_safe!(sub_non);
- let sub_nfa_start = sub_offset + 2 * sub_non;
- let sub_nfa_end = sub_offset + 2 * sub_non + 1;
+ let sub_nfa_start = get_offset_safe!(sub_non);
+ let sub_nfa_end = sub_nfa_start + 1;
builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
@@ -336,14 +346,102 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
Ok(DefaultNFA { graph })
}
- fn remove_epsilon(&mut self) -> Result<(), Error> {
- todo!()
+ fn remove_epsilon<F: Fn(T) -> bool>(&mut self, f: F) -> Result<(), Error> {
+ let mut builder = self.graph.builder_ref();
+
+ let mut updated = true;
+
+ let nodes_len = self.nodes_len();
+
+ while updated {
+ updated = false;
+
+ let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len);
+
+ for source in builder.nodes() {
+ for (label, target_iter) in builder.labels_of(source)? {
+ if f(*label) {
+ // empty label found
+ for target in target_iter {
+ for (label, children_iter) in builder.labels_of(target)? {
+ for child in children_iter {
+ if !builder.has_edge_label(source, label, child)? {
+ updated = true;
+
+ to_add.push((source, child, *label));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ for (source, target, label) in to_add.into_iter() {
+ builder.add_edge(source, target, label)?;
+ }
+ }
+
+ // Then remove all epsilon edges
+
+ let sources_and_targets: Vec<_> = builder.edges().collect();
+
+ for (source, target) in sources_and_targets.into_iter() {
+ builder.remove_edge(source, target, &f)?;
+ }
+
+ self.graph = builder.build();
+
+ Ok(())
}
- fn remove_dead(&mut self) -> Result<(), Error> {
- todo!()
+ fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> {
+ let mut builder = self.graph.builder_ref();
+
+ let mut changed = true;
+
+ // Use a counting vector to avoid calculating which nodes are
+ // dead at each iteration.
+ let mut target_counts: Vec<usize> =
+ std::iter::repeat(0).take(self.graph.nodes_len()).collect();
+
+ for (_, target) in self.graph.edges() {
+ *target_counts
+ .get_mut(target)
+ .ok_or(Error::UnknownNode(target))? += 1;
+ }
+
+ while changed {
+ changed = false;
+
+ for node in self.nodes() {
+ let deadp = !matches!(target_counts.get(node).copied(), Some(n) if n > 0);
+
+ if deadp && !reserve(node) {
+ let to_remove: Vec<usize> = builder.children_of(node)?.collect();
+
+ if !to_remove.is_empty() {
+ changed = true;
+
+ for target in to_remove {
+ builder.remove_edge(node, target, |_| true)?;
+ *target_counts
+ .get_mut(target)
+ .ok_or(Error::UnknownNode(target))? -= 1;
+ }
+ }
+ }
+ }
+ }
+
+ self.graph = builder.build();
+
+ Ok(())
}
+ // REVIEW: Combine nulling and remove_epsilon into the same
+ // method, or not.
+
fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
let mut updated = true;
let mut builder = self.graph.builder_ref();
@@ -396,7 +494,6 @@ impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> {
#[cfg(test)]
mod test_to_nfa {
- #![allow(unused_imports)]
use super::*;
use crate::SoC;
@@ -446,8 +543,11 @@ mod test_to_nfa {
println!("regex = {regex}");
- let nfa: DefaultNFA<DOption<char>> =
- DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?;
+ let nfa: DefaultNFA<DOption<char>> = DefaultNFA::to_nfa(
+ &[regex],
+ |label| Ok(SoC::Carry(label)),
+ Some(char::default()),
+ )?;
nfa.print_viz("nfa.gv")?;
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs
index 2670e32..c8ad370 100644
--- a/nfa/src/default/regex.rs
+++ b/nfa/src/default/regex.rs
@@ -260,9 +260,7 @@ impl<T: GraphLabel> DefaultRegex<T> {
stack.push(Unseen(0));
while let Some(top) = stack.pop() {
- let node_type = types[top.index()];
-
- // TODO: Do not use unwrap here.
+ let node_type = types.get(top.index()).unwrap();
match node_type {
RegexType::Kleene => {
@@ -350,8 +348,12 @@ impl<T: GraphLabel> DefaultRegex<T> {
.map(Unseen)
.rev(),
);
+
+ if self.graph.is_empty_node(top.index()).unwrap() {
+ write!(result, "ε")?;
+ }
}
- RegexType::Lit(label) => write!(result, "{}", f(label))?,
+ RegexType::Lit(label) => write!(result, "{}", f(*label))?,
}
}
@@ -452,7 +454,15 @@ impl Display for ParseError {
}
}
-impl std::error::Error for ParseError {}
+impl std::error::Error for ParseError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ if let ParseError::Graph(gerr) = self {
+ Some(gerr)
+ } else {
+ None
+ }
+ }
+}
impl From<GError> for ParseError {
fn from(ge: GError) -> Self {
diff --git a/nfa/src/error.rs b/nfa/src/error.rs
index ad75077..7160555 100644
--- a/nfa/src/error.rs
+++ b/nfa/src/error.rs
@@ -1,6 +1,6 @@
//! This file implements the error type for the crate.
-use graph::error::Error as GError;
+use graph::error::Error as GraphError;
use core::fmt::{Display, Formatter};
@@ -17,7 +17,7 @@ pub enum Error {
/// There is no root in the underlying regular expression.
NoRoot,
/// This error comes from some underlying graph operation.
- Graph(GError),
+ Graph(GraphError),
/// A cycle is found when constructing regular expressions.
Cycle,
}
@@ -34,10 +34,18 @@ impl Display for Error {
}
}
-impl std::error::Error for Error {}
+impl std::error::Error for Error {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ if let Self::Graph(gerr) = self {
+ Some(gerr)
+ } else {
+ None
+ }
+ }
+}
-impl From<GError> for Error {
- fn from(e: GError) -> Self {
+impl From<GraphError> for Error {
+ fn from(e: GraphError) -> Self {
Self::Graph(e)
}
}
diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs
index b1d62b3..31bd69a 100644
--- a/nfa/src/lib.rs
+++ b/nfa/src/lib.rs
@@ -14,7 +14,7 @@ use core::fmt::Display;
use std::ops::{Deref, DerefMut};
-use graph::{Graph, GraphLabel, LabelGraph};
+use graph::{Graph, GraphLabel, LabelExtGraph};
use error::Error;
@@ -52,14 +52,14 @@ pub trait Regex<T: GraphLabel>: Graph + Display + Clone {
}
}
- // TODO: add functions that determine if certain "positions" in a
+ // TODO: Add functions that determine if certain "positions" in a
// regular language satisfy some special properties, like at the
// end of a Kleene star, or at the end of a regular language, et
// cetera. These might be needed later.
}
-/// Since Option<T> does not inherit the Display from T, we wrap it to
-/// provide an automatic implementation of Display.
+/// Since `Option<T>` does not inherit the `Display` from `T`, we wrap
+/// it to provide an automatic implementation of `Display`.
#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)]
pub struct DOption<T>(Option<T>);
@@ -120,42 +120,98 @@ pub enum SoC<T> {
/// The expected behvaiour of a nondeterministic finite automaton.
///
/// Every NFA is a special labelled graph.
-pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
+pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
+ // TODO: This trait should have a type for the labels.
+
/// Remove all empty transitions from the nondeterministic finite
/// automaton.
- fn remove_epsilon(&mut self) -> Result<(), Error>;
+ fn remove_epsilon<F>(&mut self, f: F) -> Result<(), Error>
+ where
+ F: Fn(T) -> bool;
/// Return a state-minimal NFA equivalent with the original one.
///
/// This is not required. It is just to allow me to experiment
/// with NFA optimization algorithms.
- fn minimize(&self) -> Result<Self, Error> {
+ fn minimize(&self) -> Result<Box<Self>, Error> {
Err(Error::UnsupportedOperation)
}
+ /// Check every node or edge by a given predicate.
+ ///
+ /// This should also verify that every node and edge has correct
+ /// indices, so that we can safely use `unwrap` later. A
+ /// consequence is that, if one only wants to check the validity
+ /// of nodes and edges, one can pass a function that always
+ /// returns `true`.
+ #[inline]
+ fn labels_satisfy(&self, f: impl Fn(T) -> bool) -> Result<bool, Error> {
+ let nodes_len = self.nodes_len();
+ let mut result = true;
+
+ for node in self.nodes() {
+ for (label, children_iter) in self.labels_of(node)? {
+ for child in children_iter {
+ if child >= nodes_len {
+ dbg!(node, label);
+ return Err(graph::error::Error::IndexOutOfBounds(child, nodes_len).into());
+ }
+ }
+
+ // NOTE: Avoid executing `f` if `result` is already
+ // false. But still proceed in verifying that nodes
+ // and edges are correct: the correctness of nodes and
+ // edges is more important than the function execution
+ // results, as the former directly affects the safety
+ // of many algorithms.
+ if result && !f(*label) {
+ dbg!(node, label);
+ result = false;
+ }
+ }
+ }
+
+ Ok(result)
+ }
+
/// When we convert a regular expression to a nondeterministic
/// finite automaton, the label should be optional, so we use a
/// different type for the result type.
type FromRegex<S: GraphLabel + Display + Default>;
- /// Build a nondeterministic finite automaton out of a set of
- /// regular expressions.
+ /// Build a nondeterministic finite automaton out of a set
+ /// `regexps` of regular expressions.
+ ///
+ /// The `sub_pred` is a predicate function that returns an
+ /// indication whether to carry the value around or to substitute
+ /// by another regular language in the given set.
+ ///
+ /// The `default` parameter specifies the label of a default edge,
+ /// that goes from a node to another, both of which are not
+ /// associated with the given regular languages.
///
- /// The SUB_PRED is a predicate function that returns an optional
- /// index for a label. If it returns some index, then this means
- /// we should substitute the index-th regular expression in the
- /// set, whereever that label occurs in the set of regular
- /// expressions.
+ /// This function should perform Thompson's construction,
+ /// basically.
fn to_nfa(
regexps: &[impl Regex<RegexType<T>>],
+ // TODO: The transformation should produce more information.
sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+ default: Option<T>,
) -> Result<Self::FromRegex<DOption<T>>, Error>;
/// Remove all dead states from the nondeterministic finite
/// automaton.
///
- /// A state is dead if there are no edges going to the state.
- fn remove_dead(&mut self) -> Result<(), Error>;
+ /// A state is dead if there are no edges going to the state, and
+ /// if it is not reserved.
+ ///
+ /// # Note
+ ///
+ /// Actually an implementation is allowed to just remove all edges
+ /// out of the dead nodes. Then these nodes are considered gone
+ /// from the graph, and we don't need to re-index the existing
+ /// edges. We can call this "a poor man's removal of nodes".
+ fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>;
/// For each edge from A to B whose edge is considered nullable by
/// a function `f`, and for every edge from B to C, add an edge
@@ -169,6 +225,3 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
pub mod default;
pub mod desrec;
-
-#[cfg(test)]
-mod nfa_tests {}