Record left-linear expansion and forest format

Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
author: JSDurand <mmemmew@gmail.com> 2023-01-11 23:47:26 +0800
committer: JSDurand <mmemmew@gmail.com> 2023-01-11 23:47:26 +0800
commit: 1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree: ab8812f8094d096c68aee53cf516e986cc9a273a /chain/src
parent: f27d604d93ce583d4404e1874664e08382ea2f00 (diff)
3 files changed, 107 insertions, 89 deletions
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs
index 72989b3..90133f4 100644
--- a/chain/src/atom/default.rs
+++ b/chain/src/atom/default.rs
@@ -4,7 +4,10 @@
 use super::*;
 use grammar::Grammar;
 use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph};
-use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex};
+use nfa::{
+    default::{nfa::DefaultNFA, regex::DefaultRegex},
+    LabelType,
+};
 
 use core::fmt::Display;
 use std::collections::BTreeMap as Map;
@@ -35,7 +38,7 @@ type VirtualMap = Map<VirtualNode, usize>;
 #[derive(Debug, Clone, Default)]
 pub struct DefaultAtom {
     grammar: Grammar,
-    nfa: DefaultNFA<DOption<TNT>>,
+    nfa: DefaultNFA<LabelType<TNT>>,
     // NOTE: This is mostly for printing and debugging
     regexp: Vec<DefaultRegex<TNT>>,
     virtual_nodes: VirtualMap,
@@ -95,7 +98,7 @@ impl Display for DefaultAtom {
 // LabelGraph, in order to implement Nfa.
 
 impl Graph for DefaultAtom {
-    type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b>
+    type Iter<'b> = <DefaultNFA<LabelType<TNT>> as Graph>::Iter<'b>
     where
         Self: 'b;
 
@@ -130,23 +133,23 @@ impl Graph for DefaultAtom {
     }
 }
 
-impl LabelGraph<DOption<TNT>> for DefaultAtom {
-    type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b>
+impl LabelGraph<LabelType<TNT>> for DefaultAtom {
+    type Iter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::Iter<'b>
     where
         Self: 'b;
 
-    type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b>
+    type LabelIter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::LabelIter<'b>
     where
         Self: 'b,
         DOption<TNT>: 'b;
 
-    type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a>
+    type EdgeLabelIter<'a> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::EdgeLabelIter<'a>
     where
         Self: 'a,
         DOption<TNT>: 'a;
 
     #[inline]
-    fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> {
+    fn vertex_label(&self, node_id: usize) -> Result<Option<LabelType<TNT>>, GraphError> {
         self.nfa.vertex_label(node_id)
     }
 
@@ -163,8 +166,8 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom {
     fn find_children_with_label(
         &self,
         node_id: usize,
-        label: &DOption<TNT>,
-    ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> {
+        label: &LabelType<TNT>,
+    ) -> Result<<Self as LabelGraph<LabelType<TNT>>>::Iter<'_>, GraphError> {
         self.nfa.find_children_with_label(node_id, label)
     }
 
@@ -177,39 +180,31 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom {
     fn has_edge_label(
         &self,
         node_id: usize,
-        label: &DOption<TNT>,
+        label: &LabelType<TNT>,
         target: usize,
     ) -> Result<bool, GraphError> {
         self.nfa.has_edge_label(node_id, label, target)
     }
 }
 
-impl LabelExtGraph<DOption<TNT>> for DefaultAtom {
+impl LabelExtGraph<LabelType<TNT>> for DefaultAtom {
     #[inline]
     fn extend(
         &mut self,
-        edges: impl IntoIterator<Item = (DOption<TNT>, usize)>,
+        edges: impl IntoIterator<Item = (LabelType<TNT>, usize)>,
     ) -> Result<usize, GraphError> {
         self.nfa.extend(edges)
     }
 }
 
-impl Nfa<DOption<TNT>> for DefaultAtom {
-    #[inline]
-    fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error>
-    where
-        F: Fn(DOption<TNT>) -> bool,
-    {
-        self.nfa.remove_epsilon(f)
-    }
-
+impl Nfa<LabelType<TNT>> for DefaultAtom {
     type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = ();
 
     #[inline]
     fn to_nfa(
-        _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>],
-        _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>,
-        _default: Option<DOption<TNT>>,
+        _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<LabelType<TNT>>>],
+        _sub_pred: impl Fn(LabelType<TNT>) -> Result<nfa::SoC<LabelType<TNT>>, nfa::error::Error>,
+        _default: Option<LabelType<TNT>>,
     ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> {
         // NOTE: We cannot construct an atom from a set of regular
         // languages alone.  So it is appropriate to panic here, if
@@ -218,13 +213,20 @@ impl Nfa<DOption<TNT>> for DefaultAtom {
     }
 
     #[inline]
-    fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> {
+    fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> {
         self.nfa.remove_dead(reserve)
     }
 
     #[inline]
-    fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> {
-        self.nfa.nulling(f)
+    fn closure(
+        &mut self,
+        predicate: impl FnMut(LabelType<TNT>) -> bool,
+        remove_after_p: bool,
+        transform: impl FnMut(nfa::TwoEdges<LabelType<TNT>>) -> LabelType<TNT>,
+        remove_predicate: impl FnMut(LabelType<TNT>) -> bool,
+    ) -> Result<(), nfa::error::Error> {
+        self.nfa
+            .closure(predicate, remove_after_p, transform, remove_predicate)
     }
 }
 
@@ -237,46 +239,56 @@ impl DefaultAtom {
 
         let mut nfa = grammar.left_closure_to_nfa(&regexp)?;
 
-        use std::collections::HashSet;
+        use std::collections::{HashMap, HashSet};
 
         let accumulators: Vec<usize> = {
             let mut result = Vec::with_capacity(regexp.len() + 1);
             result.push(0);
 
             for regex in regexp.iter() {
+                // Calling `unwrap` here is safe as `result` is always
+                // non-empty.
                 result.push(regex.nodes_len() * 2 + result.last().unwrap());
             }
 
-            result.into_iter().collect()
+            result
         };
 
         let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect();
 
-        nfa.nulling(|label| {
-            if let Some(label) = *label {
-                match label {
-                    TNT::Ter(_) => false,
-                    // Panics if a non-terminal references an invalid node
-                    // here.
-                    TNT::Non(n) => grammar.is_nullable(n).unwrap(),
+        let nullables: HashSet<usize> = (0..grammar.non_num())
+            .filter(|n| matches!(grammar.is_nullable(*n), Ok(true)))
+            .collect();
+
+        // Perform nulling and remove_epsilon at the same time.
+        nfa.closure(
+            |label| {
+                if let Some(label) = *label.get_value() {
+                    matches!(label, TNT::Non(n) if nullables.contains(&n))
+                } else {
+                    true
                 }
-            } else {
-                true
-            }
-        })?;
-        nfa.remove_epsilon(|label| label.is_none())?;
+            },
+            true,
+            |two_edges| grammar.transform_label_null_epsilon(two_edges),
+            |label| label.get_value().is_none(),
+        )?;
+
         nfa.remove_dead(|node| accumulators_set.contains(&node))?;
 
-        // now add the virtual nodes
+        // Now add the virtual nodes.
         let mut virtual_nodes: VirtualMap = Default::default();
 
         let nt_num = grammar.non_num();
 
         assert!(nt_num <= accumulators.len());
 
-        // Convert an error telling us that an index is out of bounds.
-        //
-        // Panics if the error is not of the expected kind.
+        /// Convert an error telling us that an index is out of bounds.
+        ///
+        /// # Panics
+        ///
+        /// The function panics if the error is not of the expected
+        /// kind.
         fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError {
             match ge {
                 GraphError::IndexOutOfBounds(index, bound) => {
@@ -287,24 +299,34 @@ impl DefaultAtom {
         }
 
         for nt in 0..nt_num {
-            let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa
-                // this is safe because of the above assertion.
+            let children: std::collections::HashMap<_, _> = nfa
+                // This is safe because of the above assertion.
                 .labels_of(*accumulators.get(nt).unwrap())
                 .map_err(index_out_of_bounds_conversion)?
-                .map(|(label, target_iter)| (*label, target_iter.collect()))
+                .map(|(label, target_iter)| (*label, target_iter))
                 .collect();
 
-            for (label, children_vec) in children.into_iter() {
-                if let Some(TNT::Ter(t)) = *label {
-                    let new_index = nfa
-                        .extend(children_vec.into_iter().map(|target| (label, target)))
-                        .map_err(index_out_of_bounds_conversion)?;
+            let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> =
+                HashMap::new();
 
-                    let virtual_node = VirtualNode::new(nt, t);
-
-                    virtual_nodes.insert(virtual_node, new_index);
+            for (label, children_iter) in children.into_iter() {
+                if let Some(TNT::Ter(t)) = *label.get_value() {
+                    terminals_map
+                        .entry(t)
+                        .or_insert_with(|| HashSet::with_capacity(children_iter.len()))
+                        .extend(children_iter.map(|target| (label, target)));
                 }
             }
+
+            for (t, set) in terminals_map.into_iter() {
+                let new_index = nfa
+                    .extend(set.into_iter())
+                    .map_err(index_out_of_bounds_conversion)?;
+
+                let virtual_node = VirtualNode::new(nt, t);
+
+                virtual_nodes.insert(virtual_node, new_index);
+            }
         }
 
         Ok(Self {
@@ -335,8 +357,6 @@ impl Atom for DefaultAtom {
     }
 
     fn empty(&self) -> usize {
-        assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2);
-
-        self.nfa.nodes_len() - 2
+        self.grammar.total() << 1
     }
 }
diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs
index 084acca..065640b 100644
--- a/chain/src/atom/mod.rs
+++ b/chain/src/atom/mod.rs
@@ -7,10 +7,10 @@
 //! I have better ideas in the future.
 
 use grammar::{Error as GrammarError, TNT};
-use nfa::{DOption, Nfa};
+use nfa::{DOption, LabelType, Nfa};
 
 /// The expected behaviours of an atomic language.
-pub trait Atom: Nfa<DOption<TNT>> {
+pub trait Atom: Nfa<LabelType<TNT>> {
     /// Return the index of a node representing the derivative of the
     /// left-linear null closure of `nt` with respect to `t`.
     fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>;
diff --git a/chain/src/plan.org b/chain/src/plan.org
index bbd6683..b708413 100644
--- a/chain/src/plan.org
+++ b/chain/src/plan.org
@@ -2,7 +2,7 @@
 #+AUTHOR: Durand
 #+DATE: <2022-11-18 Ven 19:57>
 
-* Things to do [5/10]
+* Things to do [6/10]
 
 - [X] Implement builders for graphs
 - [X] Find sets of the first terminals for each non-terminal, in the
@@ -38,18 +38,30 @@
       finite automata.
     * [X] Test the removal of dead states, where a state is dead if
       and only if no other states have an edge into that state.
-- [-] Refactor [2/8]
+- [X] Refactor [7/7]
   + [X] Implement a data type for graphs with labels on vertices and
     edges, but do not need to index edges by labels, which can index
     vertices by labels instead.
     * [X] Test it.
   + [X] Implement a builder that borrows a graph mutably.
     * [X] Test it.
-  + [ ] Implement a data type for graphs in which every node knows its
+  + [X] Implement a data type for graphs in which every node knows its
     parents, and every node has a unique label but edges have no
     labels.
-    * [ ] Test it.
-  + [ ] We need to record the expansions of those "virtual nodes".
+    * [X] Test it.
+  + [X] When we pull in some regular language because of the
+    left-linear expansion, we need to mark that branch as coming from
+    left-linear expansions.  This is necessary because we cannot
+    follow a left-linearly expanded branch when we take the derivative
+    of a language.  We only expand left-linearly when we try to access
+    the atomic languages [s]^{(t)}.
+  + [X] An edge of the NFA should carry a label that can be more
+    informative than solely a terminal or a non-terminal.
+  + [X] Add a mechanism for a grammar to attach labels to edges of NFA
+    which are not necessarily Copy-able, and which should be stored in a
+    separate array, such that the labels on edges of NFA point to the
+    elements of the array.
+  + [X] We need to record the expansions of those "virtual nodes".
     That is, the virtual nodes represent atomic languages such as
     [s]^{(t)} where s is a non-terminal and t is a terminal.  To be more
     specific, it represents the derivative of the left-linear closure
@@ -59,32 +71,21 @@
     alone, without consuming any inputs, by expanding according to the
     grammar rules.  This means that we need to know which
     non-terminals were expanded in order to get to a state in [s]^{(t)}.
-  + [ ] Each edge in the chain-rule machine needs to be labelled also
-    with a position in the forest.  This perfectly solves the problem
-    of those "plugs"!
-  + [ ] When we pull in some regular language because of the
-    left-linear expansion, we need to mark that branch as coming from
-    left-linear expansions.  This is necessary because we cannot
-    follow a left-linearly expanded branch when we take the derivative
-    of a language.  We only expand left-linearly when we try to access
-    the atomic languages [s]^{(t)}.  We can mark by returning a set of
-    nodes which are the beginnings of left-linearly expanded branches.
-  + [ ] An edge of the NFA should carry a label that can be more
-    informative than solely a terminal or a non-terminal.
-  + [ ] Add a mechanism for a grammar to attach labels to edges of NFA
-    which are not necessarily Copy-able, and which should be stored in a
-    separate array, such that the labels on edges of NFA point to the
-    elements of the array.
+    * [X] Test
+    * [X] Test more
 - [X] Atom [3/3]
   + [X] Define the Atom trait.
   + [X] Implement virtual nodes for each derivative of each atom: The
     lack of this step might be the root cause of the failure of the
     previous version of this project.
   + [X] Test atoms
-- [-] Implement languages. [1/3]
+- [-] Implement languages. [1/4]
   + [X] First define a trait with the expected behaviours.
   + [ ] Then implement them as doubly labelled graphs.
-  + [ ] Thenimplement finding derivatives by use of the chain rule.
+  + [ ] Then implement finding derivatives by use of the chain rule.
+  + [ ] Each edge in the chain-rule machine needs to be labelled also
+    with a position in the forest.  This perfectly solves the problem
+    of those "plugs"!
 - [-] Implement forests [2/3]
   + [X] Design a format of forests.  This should be the most crucial
     thing to do, in order to have a better understanding of the whole
@@ -92,10 +93,7 @@
     the binarised shared packed parsed forests, that reflects the
     regular expressions in the grammar equations.
   + [X] Define a trait with the expected behaviours.
-  + [-] Implement them using adjacency map: we only need one label per
-    edge, and we do not wish to exclude duplicate edges, and we do not
-    need to index edges by the labels.  All in all, we implement them
-    using a vector of hashmaps.
+  + [-] Implement them as parents-knowing graphs.
 - [ ] Implement semiring. [0/5]
   + [ ] Define the trait.
   + [ ] Implement the boolean semiring.
author	JSDurand <mmemmew@gmail.com>	2023-01-11 23:47:26 +0800
committer	JSDurand <mmemmew@gmail.com>	2023-01-11 23:47:26 +0800
commit	1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree	ab8812f8094d096c68aee53cf516e986cc9a273a /chain/src
parent	f27d604d93ce583d4404e1874664e08382ea2f00 (diff)