Record left-linear expansion and forest format

Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
author: JSDurand <mmemmew@gmail.com> 2023-01-11 23:47:26 +0800
committer: JSDurand <mmemmew@gmail.com> 2023-01-11 23:47:26 +0800
commit: 1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree: ab8812f8094d096c68aee53cf516e986cc9a273a /grammar/src/first_set.rs
parent: f27d604d93ce583d4404e1874664e08382ea2f00 (diff)
1 files changed, 451 insertions, 0 deletions
diff --git a/grammar/src/first_set.rs b/grammar/src/first_set.rs
new file mode 100644
index 0000000..2a5ee3c
--- /dev/null
+++ b/grammar/src/first_set.rs
@@ -0,0 +1,451 @@
+//! This file implements a function for computing the set of first
+//! terminals for a grammar.
+
+use super::*;
+
+impl Grammar {
+    /// Compute the set of terminals that can appear as the first
+    /// terminal in some left-linear derivation of a non-terminal, for
+    /// every non-terminal.
+    ///
+    /// This is an algorithm that computes the transitive closure,
+    /// which is a common approach for this task.  But perhaps there
+    /// are more efficient approaches?
+    ///
+    /// Also the function computes the set of "reachable nodes" in the
+    /// process, and records the information in the `first_nodes`
+    /// attribute.
+    pub fn compute_firsts(&mut self) -> Result<(), Error> {
+        match self.state {
+            GrammarState::Initial => {
+                self.state = GrammarState::AfterComputeFirst;
+            }
+            _ => {
+                // This has been called already.
+                return Ok(());
+            }
+        }
+
+        let mut updated = true;
+
+        let non_len = self.non_num();
+
+        use StackElement::{Seen, Unseen};
+
+        while updated {
+            updated = false;
+
+            for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
+                let root = if let Some(root) = regex.root() {
+                    root
+                } else {
+                    if !self.is_nullable(n)? {
+                        updated = true;
+
+                        self.firsts.get_mut(n).unwrap().insert(None);
+
+                        // The default construction of a grammar
+                        // reserves some space for each vector, so
+                        // explicitly setting this can reduce some
+                        // minor memory overhead.
+                        let pointer = self.first_nodes.get_mut(n).unwrap();
+
+                        pointer.clear();
+                        pointer.shrink_to_fit();
+                    }
+
+                    continue;
+                };
+
+                let regex_len = regex.len();
+
+                let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
+
+                stack.push(Unseen(root));
+
+                let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
+                    Vec::with_capacity(regex_len);
+
+                let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
+
+                while let Some(top) = stack.pop() {
+                    let top_index = top.index();
+                    let is_seen = top.is_seen();
+
+                    match regex
+                        .vertex_label(top_index)
+                        .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
+                    {
+                        RegexType::Kleene => {
+                            if !is_seen {
+                                stack.push(Seen(top_index));
+
+                                for child in regex.children_of(top_index).unwrap() {
+                                    stack.push(Unseen(child));
+                                }
+                            } else {
+                                let degree = regex.degree(top_index).unwrap();
+                                let children_stack_len = children_sets_stack.len();
+                                let children_nodes_len = children_nodes_stack.len();
+
+                                assert!(
+                                    children_stack_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                assert!(
+                                    children_nodes_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                let mut this_set = HashSet::new();
+
+                                this_set.insert(None);
+
+                                let mut this_nodes = HashSet::new();
+
+                                this_nodes.insert(top_index);
+
+                                if degree == 0 {
+                                    children_sets_stack.push(this_set);
+                                    children_nodes_stack.push(this_nodes);
+
+                                    continue;
+                                }
+
+                                let mut stop = false;
+
+                                for (child_set, child_nodes) in children_sets_stack
+                                    .drain((children_stack_len - degree)..)
+                                    .zip(
+                                        children_nodes_stack.drain((children_nodes_len - degree)..),
+                                    )
+                                {
+                                    if stop {
+                                        break;
+                                    }
+
+                                    if !child_set.contains(&None) {
+                                        stop = true;
+                                    }
+
+                                    this_set.extend(child_set);
+                                    this_nodes.extend(child_nodes);
+                                }
+
+                                children_sets_stack.push(this_set);
+                                children_nodes_stack.push(this_nodes);
+                            }
+                        }
+                        RegexType::Plus => {
+                            if !is_seen {
+                                stack.push(Seen(top_index));
+
+                                for child in regex.children_of(top_index).unwrap() {
+                                    stack.push(Unseen(child));
+                                }
+                            } else {
+                                let degree = regex.degree(top_index).unwrap();
+                                let children_stack_len = children_sets_stack.len();
+                                let children_nodes_len = children_nodes_stack.len();
+
+                                assert!(
+                                    children_stack_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                assert!(
+                                    children_nodes_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                let mut this_set = HashSet::new();
+
+                                let mut this_nodes = HashSet::new();
+
+                                this_nodes.insert(top_index);
+
+                                if degree == 0 {
+                                    this_set.insert(None);
+
+                                    children_sets_stack.push(this_set);
+                                    children_nodes_stack.push(this_nodes);
+
+                                    continue;
+                                }
+
+                                let mut stop = false;
+
+                                for (child_set, child_nodes) in children_sets_stack
+                                    .drain((children_stack_len - degree)..)
+                                    .zip(
+                                        children_nodes_stack.drain((children_nodes_len - degree)..),
+                                    )
+                                {
+                                    if stop {
+                                        break;
+                                    }
+
+                                    if !child_set.contains(&None) {
+                                        stop = true;
+                                    }
+
+                                    this_set.extend(child_set);
+                                    this_nodes.extend(child_nodes);
+                                }
+
+                                if stop {
+                                    this_set.remove(&None);
+                                }
+
+                                children_sets_stack.push(this_set);
+                                children_nodes_stack.push(this_nodes);
+                            }
+                        }
+                        RegexType::Optional => {
+                            if !is_seen {
+                                stack.push(Seen(top_index));
+
+                                for child in regex.children_of(top_index).unwrap() {
+                                    stack.push(Unseen(child));
+                                }
+                            } else {
+                                let degree = regex.degree(top_index).unwrap();
+                                let children_stack_len = children_sets_stack.len();
+                                let children_nodes_len = children_nodes_stack.len();
+
+                                assert!(
+                                    children_stack_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                assert!(
+                                    children_nodes_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                let mut this_set = HashSet::new();
+
+                                this_set.insert(None);
+
+                                let mut this_nodes = HashSet::new();
+
+                                this_nodes.insert(top_index);
+
+                                if degree == 0 {
+                                    children_sets_stack.push(this_set);
+                                    children_nodes_stack.push(this_nodes);
+
+                                    continue;
+                                }
+
+                                let mut stop = false;
+
+                                for (child_set, child_nodes) in children_sets_stack
+                                    .drain((children_stack_len - degree)..)
+                                    .zip(
+                                        children_nodes_stack.drain((children_nodes_len - degree)..),
+                                    )
+                                {
+                                    if stop {
+                                        break;
+                                    }
+
+                                    if !child_set.contains(&None) {
+                                        stop = true;
+                                    }
+
+                                    this_set.extend(child_set.iter().copied());
+                                    this_nodes.extend(child_nodes.iter().copied());
+                                }
+
+                                children_sets_stack.push(this_set);
+                                children_nodes_stack.push(this_nodes);
+                            }
+                        }
+                        RegexType::Or => {
+                            if !is_seen {
+                                stack.push(Seen(top_index));
+
+                                for child in regex.children_of(top_index).unwrap() {
+                                    stack.push(Unseen(child));
+                                }
+                            } else {
+                                let degree = regex.degree(top_index).unwrap();
+                                let children_stack_len = children_sets_stack.len();
+                                let children_nodes_len = children_nodes_stack.len();
+
+                                assert!(
+                                    children_stack_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                assert!(
+                                    children_nodes_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                let mut this_set = HashSet::new();
+
+                                let mut this_nodes = HashSet::new();
+
+                                this_nodes.insert(top_index);
+
+                                if degree == 0 {
+                                    this_set.insert(None);
+
+                                    children_sets_stack.push(this_set);
+                                    children_nodes_stack.push(this_nodes);
+
+                                    continue;
+                                }
+
+                                for (child_set, child_nodes) in children_sets_stack
+                                    .drain((children_stack_len - degree)..)
+                                    .zip(
+                                        children_nodes_stack.drain((children_nodes_len - degree)..),
+                                    )
+                                {
+                                    this_set.extend(child_set.iter().copied());
+                                    this_nodes.extend(child_nodes.iter().copied());
+                                }
+
+                                children_sets_stack.push(this_set);
+                                children_nodes_stack.push(this_nodes);
+                            }
+                        }
+                        RegexType::Paren => {
+                            // Only for printing purposes
+                            let mut this_set = HashSet::new();
+
+                            this_set.insert(None);
+
+                            children_sets_stack.push(this_set);
+
+                            let mut this_nodes = HashSet::new();
+
+                            this_nodes.insert(top_index);
+
+                            children_nodes_stack.push(this_nodes);
+                        }
+                        RegexType::Empty => {
+                            if !is_seen {
+                                stack.push(Seen(top_index));
+
+                                for child in regex.children_of(top_index).unwrap().rev() {
+                                    stack.push(Unseen(child));
+                                }
+                            } else {
+                                let degree = regex.degree(top_index).unwrap();
+                                let children_stack_len = children_sets_stack.len();
+                                let children_nodes_len = children_nodes_stack.len();
+
+                                assert!(
+                                    children_stack_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                assert!(
+                                    children_nodes_len >= degree,
+                                    "not enough stack elements for {top_index}"
+                                );
+
+                                let mut this_set = HashSet::new();
+
+                                let mut this_nodes = HashSet::new();
+
+                                this_nodes.insert(top_index);
+
+                                if degree == 0 {
+                                    this_set.insert(None);
+
+                                    children_sets_stack.push(this_set);
+                                    children_nodes_stack.push(this_nodes);
+
+                                    continue;
+                                }
+
+                                let mut stop = false;
+
+                                for (child_set, child_nodes) in children_sets_stack
+                                    .drain((children_stack_len - degree)..)
+                                    .zip(
+                                        children_nodes_stack.drain((children_nodes_len - degree)..),
+                                    )
+                                {
+                                    if stop {
+                                        break;
+                                    }
+
+                                    if !child_set.contains(&None) {
+                                        stop = true;
+                                    }
+
+                                    this_set.extend(child_set.iter().copied());
+                                    this_nodes.extend(child_nodes.iter().copied());
+                                }
+
+                                if stop {
+                                    this_set.remove(&None);
+                                }
+
+                                children_sets_stack.push(this_set);
+                                children_nodes_stack.push(this_nodes);
+                            }
+                        }
+                        RegexType::Lit(tnt) => {
+                            match tnt {
+                                TNT::Ter(t) => {
+                                    let mut this_set = HashSet::with_capacity(1);
+
+                                    this_set.insert(Some(t));
+
+                                    children_sets_stack.push(this_set);
+                                }
+                                TNT::Non(non) => {
+                                    let this_set = self
+                                        .firsts
+                                        .get(non)
+                                        .ok_or(Error::IndexOutOfBounds(non, non_len))?
+                                        .clone();
+
+                                    children_sets_stack.push(this_set);
+                                }
+                            }
+
+                            let mut this_nodes = HashSet::with_capacity(1);
+                            this_nodes.insert(top_index);
+
+                            children_nodes_stack.push(this_nodes);
+                        }
+                    }
+                }
+
+                assert_eq!(
+                    children_sets_stack.len(),
+                    1,
+                    "Too many elements left at the end"
+                );
+
+                assert_eq!(
+                    children_nodes_stack.len(),
+                    1,
+                    "Too many elements left at the end"
+                );
+
+                for first in children_sets_stack.pop().unwrap().into_iter() {
+                    if !self.firsts.get(n).unwrap().contains(&first) {
+                        updated = true;
+
+                        self.firsts.get_mut(n).unwrap().insert(first);
+                    }
+                }
+
+                *self.first_nodes.get_mut(n).unwrap() =
+                    children_nodes_stack.pop().unwrap().into_iter().collect();
+            }
+        }
+
+        Ok(())
+    }
+}
author	JSDurand <mmemmew@gmail.com>	2023-01-11 23:47:26 +0800
committer	JSDurand <mmemmew@gmail.com>	2023-01-11 23:47:26 +0800
commit	1a3d346f413325ed37848a6b2526e8e729269833 (patch)
tree	ab8812f8094d096c68aee53cf516e986cc9a273a /grammar/src/first_set.rs
parent	f27d604d93ce583d4404e1874664e08382ea2f00 (diff)