From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001
From: JSDurand <mmemmew@gmail.com>
Date: Wed, 11 Jan 2023 23:47:26 +0800
Subject: Record left-linear expansion and forest format

Now the grammar will record the left-linear expansions when generating
the nondeterministic finite automaton from its rules, and will record
whether an edge in the nondeterministic finite automaton comes from a
left-linear expansion.  The latter is needed because while performing
a chain-rule derivation, we do not need the left-linear expanded
derivations in the "first layer".  This might well have been the root
cause of the bad performance of the previous version of this package.

Also I have figured out how to properly generate and handle parse
forests while manipulating the "chain-rule machine".
---
 chain/src/atom/default.rs | 140 ++++++++++++++++++++++++++--------------------
 1 file changed, 80 insertions(+), 60 deletions(-)

(limited to 'chain/src/atom/default.rs')
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs
index 72989b3..90133f4 100644
--- a/chain/src/atom/default.rs
+++ b/chain/src/atom/default.rs
@@ -4,7 +4,10 @@
 use super::*;
 use grammar::Grammar;
 use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph};
-use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex};
+use nfa::{
+    default::{nfa::DefaultNFA, regex::DefaultRegex},
+    LabelType,
+};
 
 use core::fmt::Display;
 use std::collections::BTreeMap as Map;
@@ -35,7 +38,7 @@ type VirtualMap = Map<VirtualNode, usize>;
 #[derive(Debug, Clone, Default)]
 pub struct DefaultAtom {
     grammar: Grammar,
-    nfa: DefaultNFA<DOption<TNT>>,
+    nfa: DefaultNFA<LabelType<TNT>>,
     // NOTE: This is mostly for printing and debugging
     regexp: Vec<DefaultRegex<TNT>>,
     virtual_nodes: VirtualMap,
@@ -95,7 +98,7 @@ impl Display for DefaultAtom {
 // LabelGraph, in order to implement Nfa.
 
 impl Graph for DefaultAtom {
-    type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b>
+    type Iter<'b> = <DefaultNFA<LabelType<TNT>> as Graph>::Iter<'b>
     where
         Self: 'b;
 
@@ -130,23 +133,23 @@ impl Graph for DefaultAtom {
     }
 }
 
-impl LabelGraph<DOption<TNT>> for DefaultAtom {
-    type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b>
+impl LabelGraph<LabelType<TNT>> for DefaultAtom {
+    type Iter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::Iter<'b>
     where
         Self: 'b;
 
-    type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b>
+    type LabelIter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::LabelIter<'b>
     where
         Self: 'b,
         DOption<TNT>: 'b;
 
-    type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a>
+    type EdgeLabelIter<'a> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::EdgeLabelIter<'a>
     where
         Self: 'a,
         DOption<TNT>: 'a;
 
     #[inline]
-    fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> {
+    fn vertex_label(&self, node_id: usize) -> Result<Option<LabelType<TNT>>, GraphError> {
         self.nfa.vertex_label(node_id)
     }
 
@@ -163,8 +166,8 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom {
     fn find_children_with_label(
         &self,
         node_id: usize,
-        label: &DOption<TNT>,
-    ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> {
+        label: &LabelType<TNT>,
+    ) -> Result<<Self as LabelGraph<LabelType<TNT>>>::Iter<'_>, GraphError> {
         self.nfa.find_children_with_label(node_id, label)
     }
 
@@ -177,39 +180,31 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom {
     fn has_edge_label(
         &self,
         node_id: usize,
-        label: &DOption<TNT>,
+        label: &LabelType<TNT>,
         target: usize,
     ) -> Result<bool, GraphError> {
         self.nfa.has_edge_label(node_id, label, target)
     }
 }
 
-impl LabelExtGraph<DOption<TNT>> for DefaultAtom {
+impl LabelExtGraph<LabelType<TNT>> for DefaultAtom {
     #[inline]
     fn extend(
         &mut self,
-        edges: impl IntoIterator<Item = (DOption<TNT>, usize)>,
+        edges: impl IntoIterator<Item = (LabelType<TNT>, usize)>,
     ) -> Result<usize, GraphError> {
         self.nfa.extend(edges)
     }
 }
 
-impl Nfa<DOption<TNT>> for DefaultAtom {
-    #[inline]
-    fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error>
-    where
-        F: Fn(DOption<TNT>) -> bool,
-    {
-        self.nfa.remove_epsilon(f)
-    }
-
+impl Nfa<LabelType<TNT>> for DefaultAtom {
     type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = ();
 
     #[inline]
     fn to_nfa(
-        _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>],
-        _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>,
-        _default: Option<DOption<TNT>>,
+        _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<LabelType<TNT>>>],
+        _sub_pred: impl Fn(LabelType<TNT>) -> Result<nfa::SoC<LabelType<TNT>>, nfa::error::Error>,
+        _default: Option<LabelType<TNT>>,
     ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> {
         // NOTE: We cannot construct an atom from a set of regular
         // languages alone.  So it is appropriate to panic here, if
@@ -218,13 +213,20 @@ impl Nfa<DOption<TNT>> for DefaultAtom {
     }
 
     #[inline]
-    fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> {
+    fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> {
         self.nfa.remove_dead(reserve)
     }
 
     #[inline]
-    fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> {
-        self.nfa.nulling(f)
+    fn closure(
+        &mut self,
+        predicate: impl FnMut(LabelType<TNT>) -> bool,
+        remove_after_p: bool,
+        transform: impl FnMut(nfa::TwoEdges<LabelType<TNT>>) -> LabelType<TNT>,
+        remove_predicate: impl FnMut(LabelType<TNT>) -> bool,
+    ) -> Result<(), nfa::error::Error> {
+        self.nfa
+            .closure(predicate, remove_after_p, transform, remove_predicate)
     }
 }
 
@@ -237,46 +239,56 @@ impl DefaultAtom {
 
         let mut nfa = grammar.left_closure_to_nfa(&regexp)?;
 
-        use std::collections::HashSet;
+        use std::collections::{HashMap, HashSet};
 
         let accumulators: Vec<usize> = {
             let mut result = Vec::with_capacity(regexp.len() + 1);
             result.push(0);
 
             for regex in regexp.iter() {
+                // Calling `unwrap` here is safe as `result` is always
+                // non-empty.
                 result.push(regex.nodes_len() * 2 + result.last().unwrap());
             }
 
-            result.into_iter().collect()
+            result
         };
 
         let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect();
 
-        nfa.nulling(|label| {
-            if let Some(label) = *label {
-                match label {
-                    TNT::Ter(_) => false,
-                    // Panics if a non-terminal references an invalid node
-                    // here.
-                    TNT::Non(n) => grammar.is_nullable(n).unwrap(),
+        let nullables: HashSet<usize> = (0..grammar.non_num())
+            .filter(|n| matches!(grammar.is_nullable(*n), Ok(true)))
+            .collect();
+
+        // Perform nulling and remove_epsilon at the same time.
+        nfa.closure(
+            |label| {
+                if let Some(label) = *label.get_value() {
+                    matches!(label, TNT::Non(n) if nullables.contains(&n))
+                } else {
+                    true
                 }
-            } else {
-                true
-            }
-        })?;
-        nfa.remove_epsilon(|label| label.is_none())?;
+            },
+            true,
+            |two_edges| grammar.transform_label_null_epsilon(two_edges),
+            |label| label.get_value().is_none(),
+        )?;
+
         nfa.remove_dead(|node| accumulators_set.contains(&node))?;
 
-        // now add the virtual nodes
+        // Now add the virtual nodes.
         let mut virtual_nodes: VirtualMap = Default::default();
 
         let nt_num = grammar.non_num();
 
         assert!(nt_num <= accumulators.len());
 
-        // Convert an error telling us that an index is out of bounds.
-        //
-        // Panics if the error is not of the expected kind.
+        /// Convert an error telling us that an index is out of bounds.
+        ///
+        /// # Panics
+        ///
+        /// The function panics if the error is not of the expected
+        /// kind.
         fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError {
             match ge {
                 GraphError::IndexOutOfBounds(index, bound) => {
@@ -287,24 +299,34 @@ impl DefaultAtom {
         }
 
         for nt in 0..nt_num {
-            let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa
-                // this is safe because of the above assertion.
+            let children: std::collections::HashMap<_, _> = nfa
+                // This is safe because of the above assertion.
                 .labels_of(*accumulators.get(nt).unwrap())
                 .map_err(index_out_of_bounds_conversion)?
-                .map(|(label, target_iter)| (*label, target_iter.collect()))
+                .map(|(label, target_iter)| (*label, target_iter))
                 .collect();
 
-            for (label, children_vec) in children.into_iter() {
-                if let Some(TNT::Ter(t)) = *label {
-                    let new_index = nfa
-                        .extend(children_vec.into_iter().map(|target| (label, target)))
-                        .map_err(index_out_of_bounds_conversion)?;
+            let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> =
+                HashMap::new();
 
-                    let virtual_node = VirtualNode::new(nt, t);
-
-                    virtual_nodes.insert(virtual_node, new_index);
+            for (label, children_iter) in children.into_iter() {
+                if let Some(TNT::Ter(t)) = *label.get_value() {
+                    terminals_map
+                        .entry(t)
+                        .or_insert_with(|| HashSet::with_capacity(children_iter.len()))
+                        .extend(children_iter.map(|target| (label, target)));
                 }
             }
+
+            for (t, set) in terminals_map.into_iter() {
+                let new_index = nfa
+                    .extend(set.into_iter())
+                    .map_err(index_out_of_bounds_conversion)?;
+
+                let virtual_node = VirtualNode::new(nt, t);
+
+                virtual_nodes.insert(virtual_node, new_index);
+            }
         }
 
         Ok(Self {
@@ -335,8 +357,6 @@ impl Atom for DefaultAtom {
     }
 
     fn empty(&self) -> usize {
-        assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2);
-
-        self.nfa.nodes_len() - 2
+        self.grammar.total() << 1
     }
 }
-- 
cgit v1.2.3-18-g5258