summaryrefslogtreecommitdiff
path: root/grammar/src/first_set.rs
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
commit1a3d346f413325ed37848a6b2526e8e729269833 (patch)
treeab8812f8094d096c68aee53cf516e986cc9a273a /grammar/src/first_set.rs
parentf27d604d93ce583d4404e1874664e08382ea2f00 (diff)
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
Diffstat (limited to 'grammar/src/first_set.rs')
-rw-r--r--grammar/src/first_set.rs451
1 files changed, 451 insertions, 0 deletions
diff --git a/grammar/src/first_set.rs b/grammar/src/first_set.rs
new file mode 100644
index 0000000..2a5ee3c
--- /dev/null
+++ b/grammar/src/first_set.rs
@@ -0,0 +1,451 @@
+//! This file implements a function for computing the set of first
+//! terminals for a grammar.
+
+use super::*;
+
+impl Grammar {
+ /// Compute the set of terminals that can appear as the first
+ /// terminal in some left-linear derivation of a non-terminal, for
+ /// every non-terminal.
+ ///
+ /// This is an algorithm that computes the transitive closure,
+ /// which is a common approach for this task. But perhaps there
+ /// are more efficient approaches?
+ ///
+ /// Also the function computes the set of "reachable nodes" in the
+ /// process, and records the information in the `first_nodes`
+ /// attribute.
+ pub fn compute_firsts(&mut self) -> Result<(), Error> {
+ match self.state {
+ GrammarState::Initial => {
+ self.state = GrammarState::AfterComputeFirst;
+ }
+ _ => {
+ // This has been called already.
+ return Ok(());
+ }
+ }
+
+ let mut updated = true;
+
+ let non_len = self.non_num();
+
+ use StackElement::{Seen, Unseen};
+
+ while updated {
+ updated = false;
+
+ for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
+ let root = if let Some(root) = regex.root() {
+ root
+ } else {
+ if !self.is_nullable(n)? {
+ updated = true;
+
+ self.firsts.get_mut(n).unwrap().insert(None);
+
+ // The default construction of a grammar
+ // reserves some space for each vector, so
+ // explicitly setting this can reduce some
+ // minor memory overhead.
+ let pointer = self.first_nodes.get_mut(n).unwrap();
+
+ pointer.clear();
+ pointer.shrink_to_fit();
+ }
+
+ continue;
+ };
+
+ let regex_len = regex.len();
+
+ let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
+
+ stack.push(Unseen(root));
+
+ let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
+ Vec::with_capacity(regex_len);
+
+ let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
+
+ while let Some(top) = stack.pop() {
+ let top_index = top.index();
+ let is_seen = top.is_seen();
+
+ match regex
+ .vertex_label(top_index)
+ .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
+ {
+ RegexType::Kleene => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set);
+ this_nodes.extend(child_nodes);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Plus => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set);
+ this_nodes.extend(child_nodes);
+ }
+
+ if stop {
+ this_set.remove(&None);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Optional => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Or => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Paren => {
+ // Only for printing purposes
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ children_nodes_stack.push(this_nodes);
+ }
+ RegexType::Empty => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap().rev() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ if stop {
+ this_set.remove(&None);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Lit(tnt) => {
+ match tnt {
+ TNT::Ter(t) => {
+ let mut this_set = HashSet::with_capacity(1);
+
+ this_set.insert(Some(t));
+
+ children_sets_stack.push(this_set);
+ }
+ TNT::Non(non) => {
+ let this_set = self
+ .firsts
+ .get(non)
+ .ok_or(Error::IndexOutOfBounds(non, non_len))?
+ .clone();
+
+ children_sets_stack.push(this_set);
+ }
+ }
+
+ let mut this_nodes = HashSet::with_capacity(1);
+ this_nodes.insert(top_index);
+
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ }
+
+ assert_eq!(
+ children_sets_stack.len(),
+ 1,
+ "Too many elements left at the end"
+ );
+
+ assert_eq!(
+ children_nodes_stack.len(),
+ 1,
+ "Too many elements left at the end"
+ );
+
+ for first in children_sets_stack.pop().unwrap().into_iter() {
+ if !self.firsts.get(n).unwrap().contains(&first) {
+ updated = true;
+
+ self.firsts.get_mut(n).unwrap().insert(first);
+ }
+ }
+
+ *self.first_nodes.get_mut(n).unwrap() =
+ children_nodes_stack.pop().unwrap().into_iter().collect();
+ }
+ }
+
+ Ok(())
+ }
+}