diff options
Diffstat (limited to 'grammar/src/first_set.rs')
-rw-r--r-- | grammar/src/first_set.rs | 451 |
1 files changed, 451 insertions, 0 deletions
diff --git a/grammar/src/first_set.rs b/grammar/src/first_set.rs new file mode 100644 index 0000000..2a5ee3c --- /dev/null +++ b/grammar/src/first_set.rs @@ -0,0 +1,451 @@ +//! This file implements a function for computing the set of first +//! terminals for a grammar. + +use super::*; + +impl Grammar { + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + match self.state { + GrammarState::Initial => { + self.state = GrammarState::AfterComputeFirst; + } + _ => { + // This has been called already. + return Ok(()); + } + } + + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec<HashSet<Option<usize>>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? + { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? + .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } +} |