Record left-linear expansion and forest format

Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
author: JSDurand <mmemmew@gmail.com> 2023-01-11 23:47:26 +0800
committer: JSDurand <mmemmew@gmail.com> 2023-01-11 23:47:26 +0800
commit: 1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree: ab8812f8094d096c68aee53cf516e986cc9a273a /nfa
parent: f27d604d93ce583d4404e1874664e08382ea2f00 (diff)
2 files changed, 245 insertions, 120 deletions
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index e642218..a23c056 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -5,7 +5,10 @@ use graph::{
     LabelExtGraph, LabelGraph,
 };
 
-use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
+use crate::{
+    default::regex::RegexType, error::Error, DOption, LabelType, Nfa, NfaLabel, Regex, SoC,
+    TwoEdges,
+};
 
 use core::fmt::{Debug, Display};
 
@@ -123,8 +126,6 @@ impl<T: GraphLabel + Display> LabelExtGraph<T> for DefaultNFA<T> {
     }
 }
 
-// TODO: Define a type for the labels of DefaultNFA.
-
 impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
     type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
 
@@ -134,7 +135,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         regexps: &[impl Regex<RegexType<T>>],
         sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
         default: Option<T>,
-    ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+    ) -> Result<Self::FromRegex<LabelType<T>>, Error> {
         let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
 
         if total_regexps_len == 0 {
@@ -144,18 +145,16 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         // We reserve two more rooms for the default edge.
         let nfa_len = total_regexps_len * 2 + 2;
 
-        let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
-
-        // Since DOption<T> implements Copy when T does, we can use a
-        // variable to hold the empty label to avoid repetitions.
-        let empty_label: DOption<T> = Default::default();
+        let mut builder: DLGBuilder<LabelType<T>> = Builder::with_capacity(nfa_len);
 
         for _ in 0..nfa_len {
             builder.add_vertex();
         }
 
+        let default = LabelType::new(DOption(default), total_regexps_len, false);
+
         // add a default edge
-        builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?;
+        builder.add_edge(nfa_len - 2, nfa_len - 1, default)?;
 
         let accumulators: Vec<usize> = {
             let mut result = Vec::with_capacity(regexps.len());
@@ -206,6 +205,10 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
             stack.push(root);
 
             while let Some(top_index) = stack.pop() {
+                let false_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, false);
+
+                let true_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, true);
+
                 let top_label = regex.vertex_label(top_index)?;
 
                 let nfa_start = offset + 2 * top_index;
@@ -213,7 +216,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
 
                 match top_label {
                     RegexType::Kleene => {
-                        builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        builder.add_edge(nfa_start, nfa_end, false_label)?;
 
                         let mut source = nfa_start;
 
@@ -224,14 +227,14 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
                                 let child_start = offset + 2 * child;
                                 let child_end = offset + 2 * child + 1;
 
-                                builder.add_edge(source, child_start, empty_label)?;
+                                builder.add_edge(source, child_start, false_label)?;
 
                                 source = child_end;
                             }
 
-                            builder.add_edge(source, nfa_end, empty_label)?;
+                            builder.add_edge(source, nfa_end, false_label)?;
 
-                            builder.add_edge(nfa_end, nfa_start, empty_label)?;
+                            builder.add_edge(nfa_end, nfa_start, false_label)?;
                         }
                     }
                     RegexType::Plus => {
@@ -244,20 +247,20 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
                                 let child_start = offset + 2 * child;
                                 let child_end = offset + 2 * child + 1;
 
-                                builder.add_edge(source, child_start, empty_label)?;
+                                builder.add_edge(source, child_start, false_label)?;
 
                                 source = child_end;
                             }
 
-                            builder.add_edge(source, nfa_end, empty_label)?;
+                            builder.add_edge(source, nfa_end, false_label)?;
 
-                            builder.add_edge(nfa_end, nfa_start, empty_label)?;
+                            builder.add_edge(nfa_end, nfa_start, false_label)?;
                         } else {
-                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                            builder.add_edge(nfa_start, nfa_end, false_label)?;
                         }
                     }
                     RegexType::Optional => {
-                        builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        builder.add_edge(nfa_start, nfa_end, false_label)?;
 
                         let mut source = nfa_start;
 
@@ -268,12 +271,12 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
                                 let child_start = offset + 2 * child;
                                 let child_end = offset + 2 * child + 1;
 
-                                builder.add_edge(source, child_start, empty_label)?;
+                                builder.add_edge(source, child_start, false_label)?;
 
                                 source = child_end;
                             }
 
-                            builder.add_edge(source, nfa_end, empty_label)?;
+                            builder.add_edge(source, nfa_end, false_label)?;
                         }
                     }
                     RegexType::Or => {
@@ -284,11 +287,11 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
                                 let child_start = offset + 2 * child;
                                 let child_end = offset + 2 * child + 1;
 
-                                builder.add_edge(nfa_start, child_start, empty_label)?;
-                                builder.add_edge(child_end, nfa_end, empty_label)?;
+                                builder.add_edge(nfa_start, child_start, false_label)?;
+                                builder.add_edge(child_end, nfa_end, false_label)?;
                             }
                         } else {
-                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                            builder.add_edge(nfa_start, nfa_end, false_label)?;
                         }
                     }
                     RegexType::Paren => {
@@ -305,14 +308,14 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
                                 let child_start = offset + 2 * child;
                                 let child_end = offset + 2 * child + 1;
 
-                                builder.add_edge(source, child_start, empty_label)?;
+                                builder.add_edge(source, child_start, false_label)?;
 
                                 source = child_end;
                             }
 
-                            builder.add_edge(source, nfa_end, empty_label)?;
+                            builder.add_edge(source, nfa_end, false_label)?;
                         } else {
-                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                            builder.add_edge(nfa_start, nfa_end, false_label)?;
                         }
                     }
                     RegexType::Lit(value) => {
@@ -327,13 +330,34 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
                                 let sub_nfa_start = get_offset_safe!(sub_non);
                                 let sub_nfa_end = sub_nfa_start + 1;
 
-                                builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
-                                builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
+                                // We also need a label for the
+                                // original label and non-left-linear
+                                // expansion.
+                                builder.add_edge(
+                                    nfa_start,
+                                    nfa_end,
+                                    NfaLabel::new(
+                                        DOption(Some(value)),
+                                        (offset >> 1) + top_index,
+                                        false,
+                                    ),
+                                )?;
+
+                                builder.add_edge(nfa_start, sub_nfa_start, true_label)?;
+                                builder.add_edge(sub_nfa_end, nfa_end, false_label)?;
                             }
                             SoC::Carry(new_value) => {
                                 // a terminal
 
-                                builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?;
+                                builder.add_edge(
+                                    nfa_start,
+                                    nfa_end,
+                                    NfaLabel::new(
+                                        DOption(Some(new_value)),
+                                        (offset >> 1) + top_index,
+                                        false,
+                                    ),
+                                )?;
                             }
                         }
                     }
@@ -346,56 +370,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         Ok(DefaultNFA { graph })
     }
 
-    fn remove_epsilon<F: Fn(T) -> bool>(&mut self, f: F) -> Result<(), Error> {
-        let mut builder = self.graph.builder_ref();
-
-        let mut updated = true;
-
-        let nodes_len = self.nodes_len();
-
-        while updated {
-            updated = false;
-
-            let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len);
-
-            for source in builder.nodes() {
-                for (label, target_iter) in builder.labels_of(source)? {
-                    if f(*label) {
-                        // empty label found
-                        for target in target_iter {
-                            for (label, children_iter) in builder.labels_of(target)? {
-                                for child in children_iter {
-                                    if !builder.has_edge_label(source, label, child)? {
-                                        updated = true;
-
-                                        to_add.push((source, child, *label));
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            for (source, target, label) in to_add.into_iter() {
-                builder.add_edge(source, target, label)?;
-            }
-        }
-
-        // Then remove all epsilon edges
-
-        let sources_and_targets: Vec<_> = builder.edges().collect();
-
-        for (source, target) in sources_and_targets.into_iter() {
-            builder.remove_edge(source, target, &f)?;
-        }
-
-        self.graph = builder.build();
-
-        Ok(())
-    }
-
-    fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> {
+    fn remove_dead(&mut self, mut reserve: impl FnMut(usize) -> bool) -> Result<(), Error> {
         let mut builder = self.graph.builder_ref();
 
         let mut changed = true;
@@ -439,48 +414,69 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         Ok(())
     }
 
-    // REVIEW: Combine nulling and remove_epsilon into the same
-    // method, or not.
+    fn closure(
+        &mut self,
+        mut predicate: impl FnMut(T) -> bool,
+        remove_after_p: bool,
+        mut transform: impl FnMut(TwoEdges<T>) -> T,
+        mut remove_predicate: impl FnMut(T) -> bool,
+    ) -> Result<(), Error> {
+        let mut builder = self.graph.builder_ref();
 
-    fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
         let mut updated = true;
-        let mut builder = self.graph.builder_ref();
+
+        let nodes_len = self.nodes_len();
 
         while updated {
             updated = false;
 
-            let mut nullable = false;
-
-            let mut to_add = Vec::new();
-
-            for (source, target) in builder.edges() {
-                for label in builder.edge_label(source, target)? {
-                    if f(label) {
-                        nullable = true;
+            // Just a rough estimate
+            let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len);
 
-                        break;
-                    }
-                }
+            // REVIEW: I do not like nested if statements, but I do
+            // not know how to do this otherwise.
 
-                if nullable {
-                    for (label, child_iter) in builder.labels_of(target)? {
-                        for child in child_iter {
-                            if !builder.has_edge_label(source, label, child)? {
-                                updated = true;
+            for source in builder.nodes() {
+                for (first_label, target_iter) in builder.labels_of(source)? {
+                    if predicate(*first_label) {
+                        // a null label found
+                        for target in target_iter {
+                            for (second_label, children_iter) in builder.labels_of(target)? {
+                                for child in children_iter {
+                                    let new_label = transform(TwoEdges(
+                                        source,
+                                        target,
+                                        child,
+                                        *first_label,
+                                        *second_label,
+                                    ));
+
+                                    if !builder.has_edge_label(source, &new_label, child)? {
+                                        updated = true;
 
-                                to_add.push((source, child, *label));
+                                        to_add.push((source, child, new_label));
+                                    }
+                                }
                             }
                         }
                     }
                 }
             }
 
-            for (source, child, label) in to_add {
-                builder.add_edge(source, child, label)?;
+            for (source, target, label) in to_add.into_iter() {
+                builder.add_edge(source, target, label)?;
+            }
+        }
+
+        // Then remove all nullable edges if demanded
+
+        if remove_after_p {
+            for (source, target) in builder.edges().collect::<Vec<_>>().into_iter() {
+                builder.remove_edge(source, target, &mut remove_predicate)?;
             }
         }
 
-        self.graph.replace_by_builder(builder);
+        self.graph = builder.build();
 
         Ok(())
     }
@@ -532,7 +528,7 @@ mod test_to_nfa {
 
         Ok(
             DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
-                .unwrap_or(Default::default())
+                .unwrap_or_default()
                 .0,
         )
     }
@@ -543,13 +539,13 @@ mod test_to_nfa {
 
         println!("regex = {regex}");
 
-        let nfa: DefaultNFA<DOption<char>> = DefaultNFA::to_nfa(
+        let _nfa: DefaultNFA<_> = DefaultNFA::to_nfa(
             &[regex],
             |label| Ok(SoC::Carry(label)),
             Some(char::default()),
         )?;
 
-        nfa.print_viz("nfa.gv")?;
+        // nfa.print_viz("nfa.gv")?;
 
         Ok(())
     }
diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs
index 31bd69a..c1906e1 100644
--- a/nfa/src/lib.rs
+++ b/nfa/src/lib.rs
@@ -60,8 +60,12 @@ pub trait Regex<T: GraphLabel>: Graph + Display + Clone {
 
 /// Since `Option<T>` does not inherit the `Display` from `T`, we wrap
 /// it to provide an automatic implementation of `Display`.
+///
+/// # Convert to `Option`
+///
+/// One can dereference a `DOption` to obtain an `Option`.
 #[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)]
-pub struct DOption<T>(Option<T>);
+pub struct DOption<T>(pub Option<T>);
 
 impl<T> Default for DOption<T> {
     fn default() -> Self {
@@ -117,17 +121,104 @@ pub enum SoC<T> {
     Carry(T),
 }
 
+/// This type records some information that is obtained from the
+/// process of converting a regular expression to a nondeterministic
+/// finite automaton.
+#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash, Default)]
+pub struct NfaLabel<T: GraphLabel> {
+    /// A terminal or a non-terminal.
+    value: T,
+    /// The corresponding position in the rules.
+    moved: usize,
+    /// Whether this comes from left-linear expansion.
+    left_p: bool,
+}
+
+impl<T: GraphLabel> NfaLabel<T> {
+    /// Construct a new label.
+    #[inline]
+    pub fn new(value: T, moved: usize, left_p: bool) -> Self {
+        Self {
+            value,
+            moved,
+            left_p,
+        }
+    }
+
+    /// Retrieve the value from the label.
+    #[inline]
+    pub fn get_value(&self) -> T {
+        self.value
+    }
+    /// Retrieve the moved position from the label.
+    #[inline]
+    pub fn get_moved(&self) -> usize {
+        self.moved
+    }
+    /// Retrieve whether or not the label comes from left-linear
+    /// expansions.
+    #[inline]
+    pub fn get_left_p(&self) -> bool {
+        self.left_p
+    }
+}
+
+impl<T: GraphLabel> Display for NfaLabel<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "edge {} at {}{}", self.value, self.moved, {
+            if self.left_p {
+                ", by left"
+            } else {
+                ""
+            }
+        })
+    }
+}
+
+/// A convenient alias of the information of two edges.
+///
+/// If the tuple is (a, b, c, la, lb), then the first edge goes from a
+/// to b, is labelled la, and the second edge goes from b to c, and is
+/// labelled by lb.
+#[derive(Debug, Clone, Copy)]
+pub struct TwoEdges<T: Copy>(usize, usize, usize, T, T);
+
+impl<T: Copy> TwoEdges<T> {
+    /// Extract the first edge.
+    pub fn first_edge(&self) -> (usize, usize, T) {
+        (self.0, self.1, self.3)
+    }
+
+    /// Extract the second edge.
+    pub fn second_edge(&self) -> (usize, usize, T) {
+        (self.1, self.2, self.4)
+    }
+}
+
+/// The type of nondeterministic finite automata that is obtained from
+/// a regular expression, via the method [`to_nfa`][Nfa::to_nfa].
+pub type LabelType<T> = NfaLabel<DOption<T>>;
+
 /// The expected behvaiour of a nondeterministic finite automaton.
 ///
 /// Every NFA is a special labelled graph.
 pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
-    // TODO: This trait should have a type for the labels.
+    /// When we convert a regular expression to a nondeterministic
+    /// finite automaton, the label should be optional, so we use a
+    /// different type for the result type.
+    type FromRegex<S: GraphLabel + Display + Default>;
 
+    // FIXME: This should not be needed.
     /// Remove all empty transitions from the nondeterministic finite
     /// automaton.
-    fn remove_epsilon<F>(&mut self, f: F) -> Result<(), Error>
+    #[inline]
+    fn remove_epsilon<F>(&mut self, _f: F) -> Result<(), Error>
     where
-        F: Fn(T) -> bool;
+        F: FnMut(T) -> bool,
+    {
+        // self.closure(f, true, |two_edges| two_edges.second_edge().2)
+        unimplemented!("deprecated")
+    }
 
     /// Return a state-minimal NFA equivalent with the original one.
     ///
@@ -174,11 +265,6 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
         Ok(result)
     }
 
-    /// When we convert a regular expression to a nondeterministic
-    /// finite automaton, the label should be optional, so we use a
-    /// different type for the result type.
-    type FromRegex<S: GraphLabel + Display + Default>;
-
     /// Build a nondeterministic finite automaton out of a set
     /// `regexps` of regular expressions.
     ///
@@ -194,10 +280,9 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
     /// basically.
     fn to_nfa(
         regexps: &[impl Regex<RegexType<T>>],
-        // TODO: The transformation should produce more information.
         sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
         default: Option<T>,
-    ) -> Result<Self::FromRegex<DOption<T>>, Error>;
+    ) -> Result<Self::FromRegex<LabelType<T>>, Error>;
 
     /// Remove all dead states from the nondeterministic finite
     /// automaton.
@@ -211,8 +296,9 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
     /// out of the dead nodes.  Then these nodes are considered gone
     /// from the graph, and we don't need to re-index the existing
     /// edges.  We can call this "a poor man's removal of nodes".
-    fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>;
+    fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), Error>;
 
+    // FIXME: This should not be needed.
     /// For each edge from A to B whose edge is considered nullable by
     /// a function `f`, and for every edge from B to C, add an edge
     /// from A to C.
@@ -220,7 +306,50 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
     /// This is needed specifically by the algorithm to produce a set
     /// of atomic languages that represent "null-closed" derived
     /// languages.
-    fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>;
+    #[inline]
+    fn nulling(&mut self, _f: impl FnMut(T) -> bool) -> Result<(), Error> {
+        // self.closure(f, false, |two_edges| two_edges.second_edge().2)
+        unimplemented!("deprecated")
+    }
+
+    /// Return the *closure* of the nondeterministic finite automaton.
+    ///
+    /// # Definition
+    ///
+    /// The closure of a nondeterministic finite automaton N is
+    /// defined as the unique *minimal* nondeterministic finite
+    /// automaton N+ that can be obtained by adjoining some edges to N
+    /// such that if there are edges a -> b and b -> c, and if the
+    /// edge a -> b is deemed as *nullable* by some function, then
+    /// there is an edge a -> c, where the minimality is the
+    /// minimality of the set of edges: if there is another such
+    /// nondeterministic finite automaton M satisfying the above
+    /// property, then the set of edges of N+ is a subset of the set
+    /// of edges of M.
+    ///
+    /// # Remove edges afterwards
+    ///
+    /// If `remove_after_p` is true, remove all those nullable edges.
+    ///
+    /// # Transformation of labels
+    ///
+    /// We can apply a transformer to labels, to be able to combine
+    /// labels in non-trivial ways.  If one just wants the *new* label
+    /// that is the label of the edge from b to c in the above
+    /// definition, one can use the function that sends `two_edges` to
+    /// `two_edges.second_edge().2`.
+    ///
+    /// # Error
+    ///
+    /// The function should emit errors if the edges of the
+    /// nondeterministic finite automaton point to invalid nodes.
+    fn closure(
+        &mut self,
+        predicate: impl FnMut(T) -> bool,
+        remove_after_p: bool,
+        transform: impl FnMut(TwoEdges<T>) -> T,
+        remove_predicate: impl FnMut(T) -> bool,
+    ) -> Result<(), Error>;
 }
 
 pub mod default;
author	JSDurand <mmemmew@gmail.com>	2023-01-11 23:47:26 +0800
committer	JSDurand <mmemmew@gmail.com>	2023-01-11 23:47:26 +0800
commit	1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree	ab8812f8094d096c68aee53cf516e986cc9a273a /nfa
parent	f27d604d93ce583d4404e1874664e08382ea2f00 (diff)