summaryrefslogtreecommitdiff
path: root/nfa
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
commit1a3d346f413325ed37848a6b2526e8e729269833 (patch)
treeab8812f8094d096c68aee53cf516e986cc9a273a /nfa
parentf27d604d93ce583d4404e1874664e08382ea2f00 (diff)
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
Diffstat (limited to 'nfa')
-rw-r--r--nfa/src/default/nfa.rs210
-rw-r--r--nfa/src/lib.rs155
2 files changed, 245 insertions, 120 deletions
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index e642218..a23c056 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -5,7 +5,10 @@ use graph::{
LabelExtGraph, LabelGraph,
};
-use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
+use crate::{
+ default::regex::RegexType, error::Error, DOption, LabelType, Nfa, NfaLabel, Regex, SoC,
+ TwoEdges,
+};
use core::fmt::{Debug, Display};
@@ -123,8 +126,6 @@ impl<T: GraphLabel + Display> LabelExtGraph<T> for DefaultNFA<T> {
}
}
-// TODO: Define a type for the labels of DefaultNFA.
-
impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
@@ -134,7 +135,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
regexps: &[impl Regex<RegexType<T>>],
sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
default: Option<T>,
- ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+ ) -> Result<Self::FromRegex<LabelType<T>>, Error> {
let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
if total_regexps_len == 0 {
@@ -144,18 +145,16 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
// We reserve two more rooms for the default edge.
let nfa_len = total_regexps_len * 2 + 2;
- let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
-
- // Since DOption<T> implements Copy when T does, we can use a
- // variable to hold the empty label to avoid repetitions.
- let empty_label: DOption<T> = Default::default();
+ let mut builder: DLGBuilder<LabelType<T>> = Builder::with_capacity(nfa_len);
for _ in 0..nfa_len {
builder.add_vertex();
}
+ let default = LabelType::new(DOption(default), total_regexps_len, false);
+
// add a default edge
- builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?;
+ builder.add_edge(nfa_len - 2, nfa_len - 1, default)?;
let accumulators: Vec<usize> = {
let mut result = Vec::with_capacity(regexps.len());
@@ -206,6 +205,10 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
stack.push(root);
while let Some(top_index) = stack.pop() {
+ let false_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, false);
+
+ let true_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, true);
+
let top_label = regex.vertex_label(top_index)?;
let nfa_start = offset + 2 * top_index;
@@ -213,7 +216,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
match top_label {
RegexType::Kleene => {
- builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ builder.add_edge(nfa_start, nfa_end, false_label)?;
let mut source = nfa_start;
@@ -224,14 +227,14 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
let child_start = offset + 2 * child;
let child_end = offset + 2 * child + 1;
- builder.add_edge(source, child_start, empty_label)?;
+ builder.add_edge(source, child_start, false_label)?;
source = child_end;
}
- builder.add_edge(source, nfa_end, empty_label)?;
+ builder.add_edge(source, nfa_end, false_label)?;
- builder.add_edge(nfa_end, nfa_start, empty_label)?;
+ builder.add_edge(nfa_end, nfa_start, false_label)?;
}
}
RegexType::Plus => {
@@ -244,20 +247,20 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
let child_start = offset + 2 * child;
let child_end = offset + 2 * child + 1;
- builder.add_edge(source, child_start, empty_label)?;
+ builder.add_edge(source, child_start, false_label)?;
source = child_end;
}
- builder.add_edge(source, nfa_end, empty_label)?;
+ builder.add_edge(source, nfa_end, false_label)?;
- builder.add_edge(nfa_end, nfa_start, empty_label)?;
+ builder.add_edge(nfa_end, nfa_start, false_label)?;
} else {
- builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ builder.add_edge(nfa_start, nfa_end, false_label)?;
}
}
RegexType::Optional => {
- builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ builder.add_edge(nfa_start, nfa_end, false_label)?;
let mut source = nfa_start;
@@ -268,12 +271,12 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
let child_start = offset + 2 * child;
let child_end = offset + 2 * child + 1;
- builder.add_edge(source, child_start, empty_label)?;
+ builder.add_edge(source, child_start, false_label)?;
source = child_end;
}
- builder.add_edge(source, nfa_end, empty_label)?;
+ builder.add_edge(source, nfa_end, false_label)?;
}
}
RegexType::Or => {
@@ -284,11 +287,11 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
let child_start = offset + 2 * child;
let child_end = offset + 2 * child + 1;
- builder.add_edge(nfa_start, child_start, empty_label)?;
- builder.add_edge(child_end, nfa_end, empty_label)?;
+ builder.add_edge(nfa_start, child_start, false_label)?;
+ builder.add_edge(child_end, nfa_end, false_label)?;
}
} else {
- builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ builder.add_edge(nfa_start, nfa_end, false_label)?;
}
}
RegexType::Paren => {
@@ -305,14 +308,14 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
let child_start = offset + 2 * child;
let child_end = offset + 2 * child + 1;
- builder.add_edge(source, child_start, empty_label)?;
+ builder.add_edge(source, child_start, false_label)?;
source = child_end;
}
- builder.add_edge(source, nfa_end, empty_label)?;
+ builder.add_edge(source, nfa_end, false_label)?;
} else {
- builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ builder.add_edge(nfa_start, nfa_end, false_label)?;
}
}
RegexType::Lit(value) => {
@@ -327,13 +330,34 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
let sub_nfa_start = get_offset_safe!(sub_non);
let sub_nfa_end = sub_nfa_start + 1;
- builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
- builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
+ // We also need a label for the
+ // original label and non-left-linear
+ // expansion.
+ builder.add_edge(
+ nfa_start,
+ nfa_end,
+ NfaLabel::new(
+ DOption(Some(value)),
+ (offset >> 1) + top_index,
+ false,
+ ),
+ )?;
+
+ builder.add_edge(nfa_start, sub_nfa_start, true_label)?;
+ builder.add_edge(sub_nfa_end, nfa_end, false_label)?;
}
SoC::Carry(new_value) => {
// a terminal
- builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?;
+ builder.add_edge(
+ nfa_start,
+ nfa_end,
+ NfaLabel::new(
+ DOption(Some(new_value)),
+ (offset >> 1) + top_index,
+ false,
+ ),
+ )?;
}
}
}
@@ -346,56 +370,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
Ok(DefaultNFA { graph })
}
- fn remove_epsilon<F: Fn(T) -> bool>(&mut self, f: F) -> Result<(), Error> {
- let mut builder = self.graph.builder_ref();
-
- let mut updated = true;
-
- let nodes_len = self.nodes_len();
-
- while updated {
- updated = false;
-
- let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len);
-
- for source in builder.nodes() {
- for (label, target_iter) in builder.labels_of(source)? {
- if f(*label) {
- // empty label found
- for target in target_iter {
- for (label, children_iter) in builder.labels_of(target)? {
- for child in children_iter {
- if !builder.has_edge_label(source, label, child)? {
- updated = true;
-
- to_add.push((source, child, *label));
- }
- }
- }
- }
- }
- }
- }
-
- for (source, target, label) in to_add.into_iter() {
- builder.add_edge(source, target, label)?;
- }
- }
-
- // Then remove all epsilon edges
-
- let sources_and_targets: Vec<_> = builder.edges().collect();
-
- for (source, target) in sources_and_targets.into_iter() {
- builder.remove_edge(source, target, &f)?;
- }
-
- self.graph = builder.build();
-
- Ok(())
- }
-
- fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> {
+ fn remove_dead(&mut self, mut reserve: impl FnMut(usize) -> bool) -> Result<(), Error> {
let mut builder = self.graph.builder_ref();
let mut changed = true;
@@ -439,48 +414,69 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
Ok(())
}
- // REVIEW: Combine nulling and remove_epsilon into the same
- // method, or not.
+ fn closure(
+ &mut self,
+ mut predicate: impl FnMut(T) -> bool,
+ remove_after_p: bool,
+ mut transform: impl FnMut(TwoEdges<T>) -> T,
+ mut remove_predicate: impl FnMut(T) -> bool,
+ ) -> Result<(), Error> {
+ let mut builder = self.graph.builder_ref();
- fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
let mut updated = true;
- let mut builder = self.graph.builder_ref();
+
+ let nodes_len = self.nodes_len();
while updated {
updated = false;
- let mut nullable = false;
-
- let mut to_add = Vec::new();
-
- for (source, target) in builder.edges() {
- for label in builder.edge_label(source, target)? {
- if f(label) {
- nullable = true;
+ // Just a rough estimate
+ let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len);
- break;
- }
- }
+ // REVIEW: I do not like nested if statements, but I do
+ // not know how to do this otherwise.
- if nullable {
- for (label, child_iter) in builder.labels_of(target)? {
- for child in child_iter {
- if !builder.has_edge_label(source, label, child)? {
- updated = true;
+ for source in builder.nodes() {
+ for (first_label, target_iter) in builder.labels_of(source)? {
+ if predicate(*first_label) {
+ // a null label found
+ for target in target_iter {
+ for (second_label, children_iter) in builder.labels_of(target)? {
+ for child in children_iter {
+ let new_label = transform(TwoEdges(
+ source,
+ target,
+ child,
+ *first_label,
+ *second_label,
+ ));
+
+ if !builder.has_edge_label(source, &new_label, child)? {
+ updated = true;
- to_add.push((source, child, *label));
+ to_add.push((source, child, new_label));
+ }
+ }
}
}
}
}
}
- for (source, child, label) in to_add {
- builder.add_edge(source, child, label)?;
+ for (source, target, label) in to_add.into_iter() {
+ builder.add_edge(source, target, label)?;
+ }
+ }
+
+ // Then remove all nullable edges if demanded
+
+ if remove_after_p {
+ for (source, target) in builder.edges().collect::<Vec<_>>().into_iter() {
+ builder.remove_edge(source, target, &mut remove_predicate)?;
}
}
- self.graph.replace_by_builder(builder);
+ self.graph = builder.build();
Ok(())
}
@@ -532,7 +528,7 @@ mod test_to_nfa {
Ok(
DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
- .unwrap_or(Default::default())
+ .unwrap_or_default()
.0,
)
}
@@ -543,13 +539,13 @@ mod test_to_nfa {
println!("regex = {regex}");
- let nfa: DefaultNFA<DOption<char>> = DefaultNFA::to_nfa(
+ let _nfa: DefaultNFA<_> = DefaultNFA::to_nfa(
&[regex],
|label| Ok(SoC::Carry(label)),
Some(char::default()),
)?;
- nfa.print_viz("nfa.gv")?;
+ // nfa.print_viz("nfa.gv")?;
Ok(())
}
diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs
index 31bd69a..c1906e1 100644
--- a/nfa/src/lib.rs
+++ b/nfa/src/lib.rs
@@ -60,8 +60,12 @@ pub trait Regex<T: GraphLabel>: Graph + Display + Clone {
/// Since `Option<T>` does not inherit the `Display` from `T`, we wrap
/// it to provide an automatic implementation of `Display`.
+///
+/// # Convert to `Option`
+///
+/// One can dereference a `DOption` to obtain an `Option`.
#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)]
-pub struct DOption<T>(Option<T>);
+pub struct DOption<T>(pub Option<T>);
impl<T> Default for DOption<T> {
fn default() -> Self {
@@ -117,17 +121,104 @@ pub enum SoC<T> {
Carry(T),
}
+/// This type records some information that is obtained from the
+/// process of converting a regular expression to a nondeterministic
+/// finite automaton.
+#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash, Default)]
+pub struct NfaLabel<T: GraphLabel> {
+ /// A terminal or a non-terminal.
+ value: T,
+ /// The corresponding position in the rules.
+ moved: usize,
+ /// Whether this comes from left-linear expansion.
+ left_p: bool,
+}
+
+impl<T: GraphLabel> NfaLabel<T> {
+ /// Construct a new label.
+ #[inline]
+ pub fn new(value: T, moved: usize, left_p: bool) -> Self {
+ Self {
+ value,
+ moved,
+ left_p,
+ }
+ }
+
+ /// Retrieve the value from the label.
+ #[inline]
+ pub fn get_value(&self) -> T {
+ self.value
+ }
+ /// Retrieve the moved position from the label.
+ #[inline]
+ pub fn get_moved(&self) -> usize {
+ self.moved
+ }
+ /// Retrieve whether or not the label comes from left-linear
+ /// expansions.
+ #[inline]
+ pub fn get_left_p(&self) -> bool {
+ self.left_p
+ }
+}
+
+impl<T: GraphLabel> Display for NfaLabel<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "edge {} at {}{}", self.value, self.moved, {
+ if self.left_p {
+ ", by left"
+ } else {
+ ""
+ }
+ })
+ }
+}
+
+/// A convenient alias of the information of two edges.
+///
+/// If the tuple is (a, b, c, la, lb), then the first edge goes from a
+/// to b, is labelled la, and the second edge goes from b to c, and is
+/// labelled by lb.
+#[derive(Debug, Clone, Copy)]
+pub struct TwoEdges<T: Copy>(usize, usize, usize, T, T);
+
+impl<T: Copy> TwoEdges<T> {
+ /// Extract the first edge.
+ pub fn first_edge(&self) -> (usize, usize, T) {
+ (self.0, self.1, self.3)
+ }
+
+ /// Extract the second edge.
+ pub fn second_edge(&self) -> (usize, usize, T) {
+ (self.1, self.2, self.4)
+ }
+}
+
+/// The type of nondeterministic finite automata that is obtained from
+/// a regular expression, via the method [`to_nfa`][Nfa::to_nfa].
+pub type LabelType<T> = NfaLabel<DOption<T>>;
+
/// The expected behvaiour of a nondeterministic finite automaton.
///
/// Every NFA is a special labelled graph.
pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
- // TODO: This trait should have a type for the labels.
+ /// When we convert a regular expression to a nondeterministic
+ /// finite automaton, the label should be optional, so we use a
+ /// different type for the result type.
+ type FromRegex<S: GraphLabel + Display + Default>;
+ // FIXME: This should not be needed.
/// Remove all empty transitions from the nondeterministic finite
/// automaton.
- fn remove_epsilon<F>(&mut self, f: F) -> Result<(), Error>
+ #[inline]
+ fn remove_epsilon<F>(&mut self, _f: F) -> Result<(), Error>
where
- F: Fn(T) -> bool;
+ F: FnMut(T) -> bool,
+ {
+ // self.closure(f, true, |two_edges| two_edges.second_edge().2)
+ unimplemented!("deprecated")
+ }
/// Return a state-minimal NFA equivalent with the original one.
///
@@ -174,11 +265,6 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
Ok(result)
}
- /// When we convert a regular expression to a nondeterministic
- /// finite automaton, the label should be optional, so we use a
- /// different type for the result type.
- type FromRegex<S: GraphLabel + Display + Default>;
-
/// Build a nondeterministic finite automaton out of a set
/// `regexps` of regular expressions.
///
@@ -194,10 +280,9 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
/// basically.
fn to_nfa(
regexps: &[impl Regex<RegexType<T>>],
- // TODO: The transformation should produce more information.
sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
default: Option<T>,
- ) -> Result<Self::FromRegex<DOption<T>>, Error>;
+ ) -> Result<Self::FromRegex<LabelType<T>>, Error>;
/// Remove all dead states from the nondeterministic finite
/// automaton.
@@ -211,8 +296,9 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
/// out of the dead nodes. Then these nodes are considered gone
/// from the graph, and we don't need to re-index the existing
/// edges. We can call this "a poor man's removal of nodes".
- fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>;
+ fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), Error>;
+ // FIXME: This should not be needed.
/// For each edge from A to B whose edge is considered nullable by
/// a function `f`, and for every edge from B to C, add an edge
/// from A to C.
@@ -220,7 +306,50 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
/// This is needed specifically by the algorithm to produce a set
/// of atomic languages that represent "null-closed" derived
/// languages.
- fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>;
+ #[inline]
+ fn nulling(&mut self, _f: impl FnMut(T) -> bool) -> Result<(), Error> {
+ // self.closure(f, false, |two_edges| two_edges.second_edge().2)
+ unimplemented!("deprecated")
+ }
+
+ /// Return the *closure* of the nondeterministic finite automaton.
+ ///
+ /// # Definition
+ ///
+ /// The closure of a nondeterministic finite automaton N is
+ /// defined as the unique *minimal* nondeterministic finite
+ /// automaton N+ that can be obtained by adjoining some edges to N
+ /// such that if there are edges a -> b and b -> c, and if the
+ /// edge a -> b is deemed as *nullable* by some function, then
+ /// there is an edge a -> c, where the minimality is the
+ /// minimality of the set of edges: if there is another such
+ /// nondeterministic finite automaton M satisfying the above
+ /// property, then the set of edges of N+ is a subset of the set
+ /// of edges of M.
+ ///
+ /// # Remove edges afterwards
+ ///
+ /// If `remove_after_p` is true, remove all those nullable edges.
+ ///
+ /// # Transformation of labels
+ ///
+ /// We can apply a transformer to labels, to be able to combine
+ /// labels in non-trivial ways. If one just wants the *new* label
+ /// that is the label of the edge from b to c in the above
+ /// definition, one can use the function that sends `two_edges` to
+ /// `two_edges.second_edge().2`.
+ ///
+ /// # Error
+ ///
+ /// The function should emit errors if the edges of the
+ /// nondeterministic finite automaton point to invalid nodes.
+ fn closure(
+ &mut self,
+ predicate: impl FnMut(T) -> bool,
+ remove_after_p: bool,
+ transform: impl FnMut(TwoEdges<T>) -> T,
+ remove_predicate: impl FnMut(T) -> bool,
+ ) -> Result<(), Error>;
}
pub mod default;