summaryrefslogtreecommitdiff
path: root/grammar
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
commit1a3d346f413325ed37848a6b2526e8e729269833 (patch)
treeab8812f8094d096c68aee53cf516e986cc9a273a /grammar
parentf27d604d93ce583d4404e1874664e08382ea2f00 (diff)
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
Diffstat (limited to 'grammar')
-rw-r--r--grammar/src/first_set.rs451
-rw-r--r--grammar/src/left_closure.rs324
-rw-r--r--grammar/src/lib.rs906
-rw-r--r--grammar/src/test_grammar_helper.rs1
-rw-r--r--grammar/src/tests/test_grammar_left_closure.rs43
5 files changed, 977 insertions, 748 deletions
diff --git a/grammar/src/first_set.rs b/grammar/src/first_set.rs
new file mode 100644
index 0000000..2a5ee3c
--- /dev/null
+++ b/grammar/src/first_set.rs
@@ -0,0 +1,451 @@
+//! This file implements a function for computing the set of first
+//! terminals for a grammar.
+
+use super::*;
+
+impl Grammar {
+ /// Compute the set of terminals that can appear as the first
+ /// terminal in some left-linear derivation of a non-terminal, for
+ /// every non-terminal.
+ ///
+ /// This is an algorithm that computes the transitive closure,
+ /// which is a common approach for this task. But perhaps there
+ /// are more efficient approaches?
+ ///
+ /// Also the function computes the set of "reachable nodes" in the
+ /// process, and records the information in the `first_nodes`
+ /// attribute.
+ pub fn compute_firsts(&mut self) -> Result<(), Error> {
+ match self.state {
+ GrammarState::Initial => {
+ self.state = GrammarState::AfterComputeFirst;
+ }
+ _ => {
+ // This has been called already.
+ return Ok(());
+ }
+ }
+
+ let mut updated = true;
+
+ let non_len = self.non_num();
+
+ use StackElement::{Seen, Unseen};
+
+ while updated {
+ updated = false;
+
+ for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
+ let root = if let Some(root) = regex.root() {
+ root
+ } else {
+ if !self.is_nullable(n)? {
+ updated = true;
+
+ self.firsts.get_mut(n).unwrap().insert(None);
+
+ // The default construction of a grammar
+ // reserves some space for each vector, so
+ // explicitly setting this can reduce some
+ // minor memory overhead.
+ let pointer = self.first_nodes.get_mut(n).unwrap();
+
+ pointer.clear();
+ pointer.shrink_to_fit();
+ }
+
+ continue;
+ };
+
+ let regex_len = regex.len();
+
+ let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
+
+ stack.push(Unseen(root));
+
+ let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
+ Vec::with_capacity(regex_len);
+
+ let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
+
+ while let Some(top) = stack.pop() {
+ let top_index = top.index();
+ let is_seen = top.is_seen();
+
+ match regex
+ .vertex_label(top_index)
+ .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
+ {
+ RegexType::Kleene => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set);
+ this_nodes.extend(child_nodes);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Plus => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set);
+ this_nodes.extend(child_nodes);
+ }
+
+ if stop {
+ this_set.remove(&None);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Optional => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Or => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Paren => {
+ // Only for printing purposes
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ children_nodes_stack.push(this_nodes);
+ }
+ RegexType::Empty => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap().rev() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ if stop {
+ this_set.remove(&None);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Lit(tnt) => {
+ match tnt {
+ TNT::Ter(t) => {
+ let mut this_set = HashSet::with_capacity(1);
+
+ this_set.insert(Some(t));
+
+ children_sets_stack.push(this_set);
+ }
+ TNT::Non(non) => {
+ let this_set = self
+ .firsts
+ .get(non)
+ .ok_or(Error::IndexOutOfBounds(non, non_len))?
+ .clone();
+
+ children_sets_stack.push(this_set);
+ }
+ }
+
+ let mut this_nodes = HashSet::with_capacity(1);
+ this_nodes.insert(top_index);
+
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ }
+
+ assert_eq!(
+ children_sets_stack.len(),
+ 1,
+ "Too many elements left at the end"
+ );
+
+ assert_eq!(
+ children_nodes_stack.len(),
+ 1,
+ "Too many elements left at the end"
+ );
+
+ for first in children_sets_stack.pop().unwrap().into_iter() {
+ if !self.firsts.get(n).unwrap().contains(&first) {
+ updated = true;
+
+ self.firsts.get_mut(n).unwrap().insert(first);
+ }
+ }
+
+ *self.first_nodes.get_mut(n).unwrap() =
+ children_nodes_stack.pop().unwrap().into_iter().collect();
+ }
+ }
+
+ Ok(())
+ }
+}
diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs
new file mode 100644
index 0000000..1630881
--- /dev/null
+++ b/grammar/src/left_closure.rs
@@ -0,0 +1,324 @@
+//! This file implements some functions to compute the regular
+//! language of left-linear closure of a grammar.
+
+use super::*;
+
+use nfa::LabelType;
+
+impl Grammar {
+ /// Return the regular language of the left-linear closures of
+ /// non-terminals in the grammar.
+ ///
+ /// The resulting vector is guaranteed to be of the same length as
+ /// the number of non-terminals.
+ ///
+ /// The resulting regular language is not "self-contained". That
+ /// is to say, its terminals indices are packed indices and are
+ /// meaningless without the interpretation of the grammar. They
+ /// should be converted to a nondeterministic finite automaton and
+ /// then to its null closure later on.
+ pub fn left_closure(&mut self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
+ match self.state {
+ GrammarState::Initial => {
+ return Err(Error::WrongState(
+ self.state,
+ GrammarState::AfterComputeFirst,
+ ))
+ }
+ GrammarState::AfterLeftClosure
+ | GrammarState::AfterNFA
+ | GrammarState::AfterComputeFirst => {}
+ }
+
+ let non_len = self.non_num();
+
+ let mut result = Vec::with_capacity(non_len);
+
+ for (n, rule) in self.rules.iter().enumerate() {
+ let regex = &rule.regex;
+
+ let regex_root = if let Some(root) = regex.root() {
+ root
+ } else {
+ result.push(Default::default());
+
+ continue;
+ };
+
+ let regex_len = regex.len();
+
+ /// A convenient macro to retrieve the children from the
+ /// original regular expression with error propagation.
+ macro_rules! children {
+ ($n:expr) => {
+ regex
+ .children_of($n)
+ .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
+ };
+ }
+
+ /// A convenient macro to retrieve the label from the
+ /// original regular expression with error propagation.
+ macro_rules! label {
+ ($n:expr) => {
+ regex
+ .vertex_label($n)
+ .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
+ };
+ }
+
+ let parents = regex.parents_array().map_err(|e| match e {
+ nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
+ nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
+ _ => unreachable!(),
+ })?;
+
+ use RegexType::*;
+ use TNT::*;
+
+ let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);
+ let mut builder = ALGBuilder::default();
+
+ /// Perform a depth-first copy
+ macro_rules! df_copy {
+ ($parent:expr, $index:expr) => {
+ match label!($index) {
+ Kleene | Plus | Optional | Or | Paren | Empty => {
+ let mut stack = vec![($parent, $index)];
+
+ while let Some((top_parent, top_index)) = stack.pop() {
+ let label = label!(top_index);
+ let label = match label {
+ Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
+ _ => label,
+ };
+
+ local_result.push(label);
+
+ let new = builder.add_vertex();
+
+ builder.add_edge(top_parent, new, ()).unwrap();
+
+ stack.extend(children!(top_index).map(|child| (new, child)));
+ }
+ }
+ Lit(remain_tnt) => {
+ local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
+ let new = builder.add_vertex();
+ builder.add_edge($parent, new, ()).unwrap();
+ }
+ }
+ };
+ }
+
+ local_result.push(Or);
+ builder.add_vertex();
+
+ local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
+ let non_lit_index = builder.add_vertex();
+
+ builder.add_edge(0, non_lit_index, ()).unwrap();
+
+ // If this non-terminal is nullable, add an empty variant.
+ if self.is_nullable(n)? {
+ local_result.push(Empty);
+ let empty_index = builder.add_vertex();
+ builder.add_edge(0, empty_index, ()).unwrap();
+ }
+
+ for first_node in self.first_nodes_of(n)?.copied() {
+ assert!(first_node < parents.len());
+
+ let tnt = match label!(first_node) {
+ Lit(tnt) => Lit(tnt),
+ _ => continue,
+ };
+
+ let mut parents_chain = {
+ let mut result = Vec::new();
+ let mut stack = Vec::with_capacity(parents.len());
+
+ stack.push(first_node);
+
+ while let Some(top) = stack.pop() {
+ assert!(top < parents.len());
+ if let Some(parent) = parents.get(top).copied().unwrap() {
+ result.push(parent);
+ stack.push(parent.0);
+ }
+ }
+
+ result.reverse();
+
+ result
+ };
+
+ if let Some((first, _)) = parents_chain.first() {
+ assert_eq!(*first, regex_root);
+ } else {
+ local_result.push(tnt);
+ let lit_index = builder.add_vertex();
+ builder.add_edge(0, lit_index, ()).unwrap();
+
+ continue;
+ }
+
+ // A different, "more local", root.
+ let mut local_root: usize;
+
+ // Handle the direct parent
+ let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();
+
+ match label!(parent_node) {
+ Kleene | Plus => {
+ // TODO: If parent_edge_index is 0, make a
+ // Plus node instead.
+ local_result.extend([Empty, tnt]);
+
+ local_root = builder.add_vertex();
+ let lit_index = builder.add_vertex();
+ builder.add_edge(local_root, lit_index, ()).unwrap();
+
+ let iterator = children!(parent_node);
+
+ for index in iterator.clone().skip(parent_edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+
+ local_result.push(Kleene);
+ let new_parent = builder.add_vertex();
+ builder.add_edge(local_root, new_parent, ()).unwrap();
+
+ for index in iterator {
+ df_copy!(new_parent, index);
+ }
+ }
+
+ Or => {
+ local_result.push(tnt);
+ local_root = builder.add_vertex();
+ }
+ Optional | Empty => {
+ // If this path is taken, it should not be
+ // optional.
+ local_result.extend([Empty, tnt]);
+ local_root = builder.add_vertex();
+ let lit_index = builder.add_vertex();
+ builder.add_edge(local_root, lit_index, ()).unwrap();
+
+ for index in children!(parent_node).skip(parent_edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+ }
+ Paren | Lit(_) => unreachable!(),
+ }
+
+ // Handle successive parents
+
+ for (node, edge_index) in parents_chain.into_iter() {
+ let node_type = label!(node);
+
+ match node_type {
+ Kleene | Plus => {
+ // TODO: If edge_index is 0, then just
+ // make this a Plus node.
+
+ local_result.push(Empty);
+ let new_index = builder.add_vertex();
+ builder.add_edge(new_index, local_root, ()).unwrap();
+
+ local_root = new_index;
+
+ let iterator = children!(node);
+
+ for index in iterator.clone().skip(edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+
+ local_result.push(Kleene);
+ let new_parent = builder.add_vertex();
+ builder.add_edge(local_root, new_parent, ()).unwrap();
+
+ for index in iterator {
+ df_copy!(new_parent, index);
+ }
+ }
+ RegexType::Or => {}
+ RegexType::Optional | RegexType::Empty => {
+ local_result.push(Empty);
+ let new_index = builder.add_vertex();
+ builder.add_edge(new_index, local_root, ()).unwrap();
+ local_root = new_index;
+
+ for index in children!(node).skip(edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+ }
+ RegexType::Paren | RegexType::Lit(_) => unreachable!(),
+ }
+ }
+
+ builder.add_edge(0, local_root, ()).unwrap();
+ }
+
+ local_result.shrink_to_fit();
+
+ let graph = builder.build();
+
+ assert_eq!(graph.nodes_len(), local_result.len());
+
+ result.push(
+ DefaultRegex::new(graph, local_result)
+ .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
+ );
+ }
+
+ assert_eq!(result.len(), non_len);
+
+ self.accumulators = {
+ let mut acc_result = Vec::with_capacity(result.len() + 1);
+ acc_result.push(0);
+
+ for rule in result.iter() {
+ acc_result.push(rule.len() + *acc_result.last().unwrap());
+ }
+
+ acc_result
+ };
+
+ Ok(result)
+ }
+
+ /// Convert the regular language of left-linear closures to its
+ /// equivalent nondeterministic finite automaton.
+ ///
+ /// In the generation of the left-linear closure, the terminals
+ /// and non-terminals are packed into an unsigned integer. We
+ /// unpack them in converting to nondeterministic finite
+ /// automaton.
+ ///
+ /// The resulting nondeterministic finite automaton should be
+ /// transformed to its null-closure for use in our algorithm.
+ pub fn left_closure_to_nfa(
+ &self,
+ closure: &[DefaultRegex<TNT>],
+ ) -> Result<DefaultNFA<LabelType<TNT>>, Error> {
+ let label_transform = |tnt| match tnt {
+ TNT::Ter(t) => {
+ let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
+ Error::IndexOutOfBounds(index, bound) => {
+ graph::error::Error::IndexOutOfBounds(index, bound)
+ }
+ _ => unreachable!(),
+ })?;
+
+ Ok(SoC::Carry(new_tnt))
+ }
+ TNT::Non(n) => Ok(SoC::Sub(n)),
+ };
+
+ let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?;
+
+ Ok(nfa)
+ }
+}
diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs
index 4e544c9..627ae6f 100644
--- a/grammar/src/lib.rs
+++ b/grammar/src/lib.rs
@@ -15,12 +15,15 @@ use nfa::{
nfa::DefaultNFA,
regex::{DefaultRegex, ParseError, RegexType},
},
- DOption, Nfa, Regex, SoC,
+ LabelType, Nfa, NfaLabel, Regex, SoC, TwoEdges,
};
use graph::{adlist::ALGBuilder, builder::Builder, Graph};
-use std::{collections::HashSet, fmt::Display};
+use std::{
+ collections::{HashMap, HashSet},
+ fmt::Display,
+};
/// The type of a terminal.
///
@@ -88,6 +91,9 @@ impl Display for TNT {
#[derive(Debug, Copy, Clone)]
#[non_exhaustive]
pub enum Error {
+ /// The operation requires the grammar to be after a certain
+ /// state, but the grammar is not after that state yet.
+ WrongState(GrammarState, GrammarState),
/// The first component is the index, and the second the bound.
IndexOutOfBounds(usize, usize),
/// Fail to build the N-th regular expression, due to the
@@ -112,6 +118,9 @@ impl Display for Error {
"Failed to build the {n}-th regular expression due to error: {pe}"
),
Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
+ Error::WrongState(current, threshold) => {
+ write!(f, "require state {threshold}, but in state {current}")
+ }
}
}
}
@@ -146,6 +155,36 @@ impl Rule {
}
}
+/// The state of Grammar.
+///
+/// This is used to ensure that the grammar preparation methods are
+/// called in the correct order.
+#[derive(Debug, Copy, Clone, Default)]
+pub enum GrammarState {
+ /// Just initialized
+ #[default]
+ Initial,
+ /// compute_firsts has been called
+ AfterComputeFirst,
+ /// left_closure has been called.
+ AfterLeftClosure,
+ /// left_closure_to_nfa has been called.
+ AfterNFA,
+}
+
+impl Display for GrammarState {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ use GrammarState::*;
+
+ match self {
+ Initial => write!(f, "initial"),
+ AfterComputeFirst => write!(f, "after computation of first set"),
+ AfterLeftClosure => write!(f, "after computation of closure"),
+ AfterNFA => write!(f, "after computation of NFA"),
+ }
+ }
+}
+
/// The type of a grammar.
#[derive(Debug, Clone, Default)]
pub struct Grammar {
@@ -158,6 +197,8 @@ pub struct Grammar {
/// The length of the list must match that of the list of
/// non-terminals.
rules: Vec<Rule>,
+ /// The list of successive sums of lengths of rules.
+ accumulators: Vec<usize>,
// The following two attributes are empty until we call
// `compute_firsts` on the grammar.
/// The list of sets of "first terminals".
@@ -169,9 +210,18 @@ pub struct Grammar {
///
/// The length must match that of the list of non-terminals.
first_nodes: Vec<Vec<usize>>,
- // The following attribute is empty until we call `left_closure`
- // on the grammar.
- left_closure_branches: HashSet<usize>,
+ // The following attribute is empty until we call `closure` on the
+ // NFA with `transform_label_null_epsilon` as the transformer.
+ /// A hash map that maps a tuple `(pos1, pos2)` of positions
+ /// `pos1` and `pos2` in the rules to a vector of rule positions.
+ /// This vector means that in order to expand from `pos1` to
+ /// `pos`, it is necessary to expand according to the positions in
+ /// the vector, so we need to add all these expansions into the
+ /// parse forest later.
+ expansion_map: HashMap<(usize, usize), Vec<usize>>,
+ /// The state of the grammar, which tells us what information has
+ /// been computed for this grammar.
+ state: GrammarState,
}
/// A private type to aid the recursive looping of rergular
@@ -216,7 +266,14 @@ impl Grammar {
.map(|rule| Vec::with_capacity(rule.len()))
.collect();
- let left_closure_branches = HashSet::default();
+ let state = Default::default();
+
+ let expansion_map = Default::default();
+
+ // NOTE: We cannot calculate accumulators here, as we want the
+ // accumulators of the regular expression of the left-closure,
+ // not of the original one.
+ let accumulators = Vec::new();
Self {
ter,
@@ -224,7 +281,9 @@ impl Grammar {
rules,
firsts,
first_nodes,
- left_closure_branches,
+ state,
+ expansion_map,
+ accumulators,
}
}
@@ -258,7 +317,29 @@ impl Grammar {
/// Return the total length of all rules.
#[inline]
pub fn total(&self) -> usize {
- self.rules.iter().map(Rule::len).sum()
+ self.accumulators.last().copied().unwrap_or(0)
+ }
+
+ /// Query if a position is the starting position of a
+ /// non-terminal. If it is, return the non-terminal, else return
+ /// `None` .
+ #[inline]
+ pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option<usize> {
+ for (index, accumulator) in self.accumulators.iter().copied().enumerate() {
+ let shifted_accumulator = accumulator << 1;
+
+ // NOTE: Clippy suggests to call `cmp`, but it seems
+ // compiler might not yet be smart enough to inline that
+ // call, so I just silence clippy here.
+ #[allow(clippy::comparison_chain)]
+ if pos == shifted_accumulator {
+ return Some(index);
+ } else if pos < shifted_accumulator {
+ break;
+ }
+ }
+
+ None
}
/// Return the number of terminals.
@@ -353,754 +434,119 @@ impl Grammar {
.contains(&None))
}
- /// For a NON_TERMINAL, return an iterator that goes over the
- /// nodes that are reachable from the non-terminal through an
- /// empty transition of the nondeterministic finite automaton.
+ /// Query the expansion information from the position `pos1` to
+ /// the position `pos2` .
#[inline]
- pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> {
- Ok(self
- .first_nodes
- .get(non_terminal)
- .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
- .iter())
- }
-
- /// Return a hash set that contains all nodes in the set of
- /// left-closure regular languages that are added because of the
- /// left-linear expansion.
- pub fn left_closure_branches(&self) -> &HashSet<usize> {
- &self.left_closure_branches
- }
-
- /// Compute the set of terminals that can appear as the first
- /// terminal in some left-linear derivation of a non-terminal, for
- /// every non-terminal.
- ///
- /// This is an algorithm that computes the transitive closure,
- /// which is a common approach for this task. But perhaps there
- /// are more efficient approaches?
- ///
- /// Also the function computes the set of "reachable nodes" in the
- /// process, and records the information in the `first_nodes`
- /// attribute.
- pub fn compute_firsts(&mut self) -> Result<(), Error> {
- let mut updated = true;
-
- let non_len = self.non_num();
-
- use StackElement::{Seen, Unseen};
-
- while updated {
- updated = false;
-
- for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
- let root = if let Some(root) = regex.root() {
- root
- } else {
- if !self.is_nullable(n)? {
- updated = true;
-
- self.firsts.get_mut(n).unwrap().insert(None);
-
- // The default construction of a grammar
- // reserves some space for each vector, so
- // explicitly setting this can reduce some
- // minor memory overhead.
- let pointer = self.first_nodes.get_mut(n).unwrap();
-
- pointer.clear();
- pointer.shrink_to_fit();
- }
-
- continue;
- };
-
- let regex_len = regex.len();
-
- let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
-
- stack.push(Unseen(root));
-
- let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
- Vec::with_capacity(regex_len);
-
- let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
-
- while let Some(top) = stack.pop() {
- let top_index = top.index();
- let is_seen = top.is_seen();
-
- match regex
- .vertex_label(top_index)
- .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
- {
- RegexType::Kleene => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set);
- this_nodes.extend(child_nodes);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Plus => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set);
- this_nodes.extend(child_nodes);
- }
-
- if stop {
- this_set.remove(&None);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Optional => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Or => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Paren => {
- // Only for printing purposes
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- children_nodes_stack.push(this_nodes);
- }
- RegexType::Empty => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap().rev() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- if stop {
- this_set.remove(&None);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Lit(tnt) => {
- match tnt {
- TNT::Ter(t) => {
- let mut this_set = HashSet::with_capacity(1);
-
- this_set.insert(Some(t));
-
- children_sets_stack.push(this_set);
- }
- TNT::Non(non) => {
- let this_set = self
- .firsts
- .get(non)
- .ok_or(Error::IndexOutOfBounds(non, non_len))?
- .clone();
-
- children_sets_stack.push(this_set);
- }
- }
-
- let mut this_nodes = HashSet::with_capacity(1);
- this_nodes.insert(top_index);
-
- children_nodes_stack.push(this_nodes);
- }
- }
- }
-
- assert_eq!(
- children_sets_stack.len(),
- 1,
- "Too many elements left at the end"
- );
-
- assert_eq!(
- children_nodes_stack.len(),
- 1,
- "Too many elements left at the end"
- );
-
- for first in children_sets_stack.pop().unwrap().into_iter() {
- if !self.firsts.get(n).unwrap().contains(&first) {
- updated = true;
+ pub fn query_expansion(&self, pos1: usize, pos2: usize) -> Result<Option<&[usize]>, Error> {
+ if pos1 >= self.total() {
+ return Err(Error::IndexOutOfBounds(pos1, self.total()));
+ }
- self.firsts.get_mut(n).unwrap().insert(first);
- }
- }
+ if pos2 >= self.total() {
+ return Err(Error::IndexOutOfBounds(pos2, self.total()));
+ }
- *self.first_nodes.get_mut(n).unwrap() =
- children_nodes_stack.pop().unwrap().into_iter().collect();
+ match self.state {
+ GrammarState::AfterLeftClosure => {}
+ _ => {
+ return Err(Error::WrongState(
+ self.state,
+ GrammarState::AfterLeftClosure,
+ ));
}
}
- Ok(())
+ Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..]))
}
- /// Return the regular language of the left-linear closures of
- /// non-terminals in the grammar.
- ///
- /// The resulting vector is guaranteed to be of the same length as
- /// the number of non-terminals.
- ///
- /// The resulting regular language is not "self-contained". That
- /// is to say, its terminals indices are packed indices and are
- /// meaningless without the interpretation of the grammar. They
- /// should be converted to a nondeterministic finite automaton and
- /// then to its null closure later on.
- pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
- let non_len = self.non_num();
-
- let mut result = Vec::with_capacity(non_len);
-
- for (n, rule) in self.rules.iter().enumerate() {
- let regex = &rule.regex;
-
- let regex_root = if let Some(root) = regex.root() {
- root
- } else {
- result.push(Default::default());
-
- continue;
- };
-
- let regex_len = regex.len();
-
- /// A convenient macro to retrieve the children from the
- /// original regular expression with error propagation.
- macro_rules! children {
- ($n:expr) => {
- regex
- .children_of($n)
- .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
- };
- }
-
- /// A convenient macro to retrieve the label from the
- /// original regular expression with error propagation.
- macro_rules! label {
- ($n:expr) => {
- regex
- .vertex_label($n)
- .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
- };
- }
-
- let parents = regex.parents_array().map_err(|e| match e {
- nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
- nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
- _ => unreachable!(),
- })?;
-
- use RegexType::*;
- use TNT::*;
-
- let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);
- let mut builder = ALGBuilder::default();
-
- /// Perform a depth-first copy
- macro_rules! df_copy {
- ($parent:expr, $index:expr) => {
- match label!($index) {
- Kleene | Plus | Optional | Or | Paren | Empty => {
- let mut stack = vec![($parent, $index)];
-
- while let Some((top_parent, top_index)) = stack.pop() {
- let label = label!(top_index);
- let label = match label {
- Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
- _ => label,
- };
-
- local_result.push(label);
-
- let new = builder.add_vertex();
-
- builder.add_edge(top_parent, new, ()).unwrap();
-
- stack.extend(children!(top_index).map(|child| (new, child)));
- }
- }
- Lit(remain_tnt) => {
- local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
- let new = builder.add_vertex();
- builder.add_edge($parent, new, ()).unwrap();
- }
- }
- };
- }
-
- local_result.push(Or);
- builder.add_vertex();
-
- local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
- let non_lit_index = builder.add_vertex();
-
- builder.add_edge(0, non_lit_index, ()).unwrap();
-
- // If this non-terminal is nullable, add an empty variant.
- if self.is_nullable(n)? {
- local_result.push(Empty);
- let empty_index = builder.add_vertex();
- builder.add_edge(0, empty_index, ()).unwrap();
+ // REVIEW: Do we have a better way to record expansion information
+ // than to compute the transitive closure?
+
+ /// A transformer of labels to be fed into
+ /// [`closure`][nfa::default::nfa::DefaultNFA::closure], with the
+ /// predicate that returns true if and only if the label of the
+ /// first edge is either empty or a nullable non-terminal.
+ pub fn transform_label_null_epsilon(
+ &mut self,
+ two_edges: TwoEdges<LabelType<TNT>>,
+ ) -> LabelType<TNT> {
+ let (first_source, first_target, first_label) = two_edges.first_edge();
+ let (second_source, second_target, second_label) = two_edges.second_edge();
+
+ #[cfg(debug_assertions)]
+ {
+ assert_eq!(first_target, second_source);
+
+ if let Some(tnt) = *first_label.get_value() {
+ assert!(matches!(tnt, TNT::Non(n) if matches!(self.is_nullable(n), Ok(true))));
}
+ }
- for first_node in self.first_nodes_of(n)?.copied() {
- assert!(first_node < parents.len());
-
- let tnt = match label!(first_node) {
- Lit(tnt) => Lit(tnt),
- _ => continue,
- };
-
- let mut parents_chain = {
- let mut result = Vec::new();
- let mut stack = Vec::with_capacity(parents.len());
-
- stack.push(first_node);
-
- while let Some(top) = stack.pop() {
- assert!(top < parents.len());
- if let Some(parent) = parents.get(top).copied().unwrap() {
- result.push(parent);
- stack.push(parent.0);
- }
- }
+ // Compute if this is from left-linear expansion: it is so if
+ // and only if one if either the edges comes from left-linear
+ // expansion or we are moving across a non-terminal expansion,
+ // that is to say, the source of the second edge is the
+ // starting edge of a non-terminal.
- result.reverse();
+ let mut left_p = first_label.get_left_p() || second_label.get_left_p();
- result
- };
+ // Record left-linear expansion information.
- if parents_chain.is_empty() {
- local_result.push(tnt);
- let lit_index = builder.add_vertex();
- builder.add_edge(0, lit_index, ()).unwrap();
+ if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) {
+ left_p = true;
- continue;
- }
+ if !self
+ .expansion_map
+ .contains_key(&(first_source, second_target))
+ {
+ let original_expansion = self.expansion_map.get(&(second_source, second_target));
- assert_eq!(parents_chain.first().unwrap().0, regex_root);
-
- // A different, "more local", root.
- let mut local_root: usize;
-
- // Handle the direct parent
- let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();
-
- match label!(parent_node) {
- Kleene | Plus => {
- // TODO: If parent_edge_index is 0, make a
- // Plus node instead.
- local_result.extend([Empty, tnt]);
-
- local_root = builder.add_vertex();
- let lit_index = builder.add_vertex();
- builder.add_edge(local_root, lit_index, ()).unwrap();
-
- let iterator = children!(parent_node);
-
- for index in iterator.clone().skip(parent_edge_index + 1) {
- df_copy!(local_root, index);
- }
-
- local_result.push(Kleene);
- let new_parent = builder.add_vertex();
- builder.add_edge(local_root, new_parent, ()).unwrap();
-
- for index in iterator {
- df_copy!(new_parent, index);
- }
- }
-
- Or => {
- local_result.push(tnt);
- local_root = builder.add_vertex();
- }
- Optional | Empty => {
- // If this path is taken, it should not be
- // optional.
- local_result.extend([Empty, tnt]);
- local_root = builder.add_vertex();
- let lit_index = builder.add_vertex();
- builder.add_edge(local_root, lit_index, ()).unwrap();
-
- for index in children!(parent_node).skip(parent_edge_index + 1) {
- df_copy!(local_root, index);
- }
- }
- Paren | Lit(_) => unreachable!(),
- }
+ self.expansion_map.insert(
+ (first_source, second_target),
+ if let Some(original_expansion) = original_expansion {
+ let mut result = original_expansion.clone();
+ result.push(second_nt);
- // Handle successive parents
-
- for (node, edge_index) in parents_chain.into_iter() {
- let node_type = label!(node);
-
- match node_type {
- Kleene | Plus => {
- // TODO: If edge_index is 0, then just
- // make this a Plus node.
-
- local_result.push(Empty);
- let new_index = builder.add_vertex();
- builder.add_edge(new_index, local_root, ()).unwrap();
-
- local_root = new_index;
-
- let iterator = children!(node);
-
- for index in iterator.clone().skip(edge_index + 1) {
- df_copy!(local_root, index);
- }
-
- local_result.push(Kleene);
- let new_parent = builder.add_vertex();
- builder.add_edge(local_root, new_parent, ()).unwrap();
-
- for index in iterator {
- df_copy!(new_parent, index);
- }
- }
- RegexType::Or => {}
- RegexType::Optional | RegexType::Empty => {
- local_result.push(Empty);
- let new_index = builder.add_vertex();
- builder.add_edge(new_index, local_root, ()).unwrap();
- local_root = new_index;
-
- for index in children!(node).skip(edge_index + 1) {
- df_copy!(local_root, index);
- }
- }
- RegexType::Paren | RegexType::Lit(_) => unreachable!(),
- }
- }
-
- builder.add_edge(0, local_root, ()).unwrap();
+ result
+ } else {
+ vec![second_nt]
+ },
+ );
}
-
- local_result.shrink_to_fit();
-
- let graph = builder.build();
-
- assert_eq!(graph.nodes_len(), local_result.len());
-
- result.push(
- DefaultRegex::new(graph, local_result)
- .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
- );
}
- assert_eq!(result.len(), non_len);
-
- Ok(result)
+ NfaLabel::new(second_label.get_value(), second_label.get_moved(), left_p)
}
- /// Convert the regular language of left-linear closures to its
- /// equivalent nondeterministic finite automaton.
- ///
- /// In the generation of the left-linear closure, the terminals
- /// and non-terminals are packed into an unsigned integer. We
- /// unpack them in converting to nondeterministic finite
- /// automaton.
- ///
- /// The resulting nondeterministic finite automaton should be
- /// transformed to its null-closure for use in our algorithm.
- pub fn left_closure_to_nfa(
- &self,
- closure: &[DefaultRegex<TNT>],
- ) -> Result<DefaultNFA<DOption<TNT>>, Error> {
- let label_transform = |tnt| match tnt {
- TNT::Ter(t) => {
- let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
- Error::IndexOutOfBounds(index, bound) => {
- graph::error::Error::IndexOutOfBounds(index, bound)
- }
- _ => unreachable!(),
- })?;
-
- Ok(SoC::Carry(new_tnt))
+ /// For a NON_TERMINAL, return an iterator that goes over the
+ /// nodes that are reachable from the non-terminal through an
+ /// empty transition of the nondeterministic finite automaton.
+ #[inline]
+ pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> {
+ match self.state {
+ GrammarState::Initial => {
+ return Err(Error::WrongState(
+ self.state,
+ GrammarState::AfterComputeFirst,
+ ));
}
- TNT::Non(n) => Ok(SoC::Sub(n)),
- };
-
- let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?;
+ GrammarState::AfterComputeFirst
+ | GrammarState::AfterLeftClosure
+ | GrammarState::AfterNFA => {}
+ }
- Ok(nfa)
+ Ok(self
+ .first_nodes
+ .get(non_terminal)
+ .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
+ .iter())
}
}
+pub mod first_set;
+
+pub mod left_closure;
+
impl Display for Grammar {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
assert_eq!(self.non.len(), self.rules.len());
diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs
index c236952..89f9844 100644
--- a/grammar/src/test_grammar_helper.rs
+++ b/grammar/src/test_grammar_helper.rs
@@ -275,7 +275,6 @@ pub fn new_paren_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
}
/// Return a left recursive grammar.
-#[allow(dead_code)]
pub fn new_left_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())];
let non = vec![
diff --git a/grammar/src/tests/test_grammar_left_closure.rs b/grammar/src/tests/test_grammar_left_closure.rs
index 0bc9f4d..003c211 100644
--- a/grammar/src/tests/test_grammar_left_closure.rs
+++ b/grammar/src/tests/test_grammar_left_closure.rs
@@ -1,5 +1,6 @@
use crate::test_grammar_helper::*;
use crate::*;
+use graph::LabelGraph;
use nfa::Nfa;
use std::{
collections::HashSet,
@@ -124,7 +125,7 @@ fn test_remove_epsilon() -> Result<(), Box<dyn std::error::Error>> {
nfa.print_viz("nfa_orig.gv")?;
- nfa.remove_epsilon(|label| label.is_none())?;
+ nfa.remove_epsilon(|label| label.get_value().is_none())?;
nfa.print_viz("nfa_no_epsilon.gv")?;
@@ -174,7 +175,7 @@ fn test_remove_dead() -> Result<(), Box<dyn std::error::Error>> {
nfa.print_viz("nfa_orig.gv")?;
- nfa.remove_epsilon(|label| label.is_none())?;
+ nfa.remove_epsilon(|label| label.get_value().is_none())?;
let accumulators: HashSet<usize> = accumulators.into_iter().collect();
@@ -192,7 +193,8 @@ fn test_remove_dead() -> Result<(), Box<dyn std::error::Error>> {
#[test]
#[ignore]
fn test_nulling() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_left_recursive_grammar()?;
+ // TODO: Test more grammars.
+ let mut grammar = new_right_recursive_grammar()?;
let closure = new_closure_regex(&mut grammar)?;
let mut lock = stdout().lock();
@@ -244,24 +246,31 @@ fn test_nulling() -> Result<(), Box<dyn std::error::Error>> {
let mut nfa = grammar.left_closure_to_nfa(&closure)?;
- nfa.nulling(|label| {
- if let Some(label) = *label {
- match label {
- TNT::Ter(_) => false,
- // Panics if a non-terminal references an invalid node
- // here.
- TNT::Non(n) => grammar.is_nullable(n).unwrap(),
- }
- } else {
- true
- }
- })?;
-
let grammar_reserve_nodes = |node| accumulators.contains(&node);
writeln!(lock, "accumulators are {accumulators:?}")?;
- nfa.remove_epsilon(|label| label.is_none())?;
+ let nullables: HashSet<usize> = (0..grammar.non_num())
+ .filter(|n| matches!(grammar.is_nullable(*n), Ok(true)))
+ .collect();
+
+ nfa.closure(
+ |label| {
+ if let Some(label) = *label.get_value() {
+ matches!(label, TNT::Non(n) if nullables.contains(&n))
+ } else {
+ true
+ }
+ },
+ true,
+ |two_edges| grammar.transform_label_null_epsilon(two_edges),
+ |label| label.get_value().is_none(),
+ )?;
+
+ for (label, child_iter) in nfa.labels_of(18)? {
+ writeln!(lock, "{label}: {child_iter:?}")?;
+ }
+
nfa.remove_dead(grammar_reserve_nodes)?;
writeln!(lock, "Printing nfa to nfa.gv")?;