From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 11 Jan 2023 23:47:26 +0800 Subject: Record left-linear expansion and forest format Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine". --- grammar/src/first_set.rs | 451 ++++++++++++ grammar/src/left_closure.rs | 324 +++++++++ grammar/src/lib.rs | 906 +++++-------------------- grammar/src/test_grammar_helper.rs | 1 - grammar/src/tests/test_grammar_left_closure.rs | 43 +- 5 files changed, 977 insertions(+), 748 deletions(-) create mode 100644 grammar/src/first_set.rs create mode 100644 grammar/src/left_closure.rs (limited to 'grammar') diff --git a/grammar/src/first_set.rs b/grammar/src/first_set.rs new file mode 100644 index 0000000..2a5ee3c --- /dev/null +++ b/grammar/src/first_set.rs @@ -0,0 +1,451 @@ +//! This file implements a function for computing the set of first +//! terminals for a grammar. + +use super::*; + +impl Grammar { + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + match self.state { + GrammarState::Initial => { + self.state = GrammarState::AfterComputeFirst; + } + _ => { + // This has been called already. + return Ok(()); + } + } + + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? + { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? + .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } +} diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs new file mode 100644 index 0000000..1630881 --- /dev/null +++ b/grammar/src/left_closure.rs @@ -0,0 +1,324 @@ +//! This file implements some functions to compute the regular +//! language of left-linear closure of a grammar. + +use super::*; + +use nfa::LabelType; + +impl Grammar { + /// Return the regular language of the left-linear closures of + /// non-terminals in the grammar. + /// + /// The resulting vector is guaranteed to be of the same length as + /// the number of non-terminals. + /// + /// The resulting regular language is not "self-contained". That + /// is to say, its terminals indices are packed indices and are + /// meaningless without the interpretation of the grammar. They + /// should be converted to a nondeterministic finite automaton and + /// then to its null closure later on. + pub fn left_closure(&mut self) -> Result>, Error> { + match self.state { + GrammarState::Initial => { + return Err(Error::WrongState( + self.state, + GrammarState::AfterComputeFirst, + )) + } + GrammarState::AfterLeftClosure + | GrammarState::AfterNFA + | GrammarState::AfterComputeFirst => {} + } + + let non_len = self.non_num(); + + let mut result = Vec::with_capacity(non_len); + + for (n, rule) in self.rules.iter().enumerate() { + let regex = &rule.regex; + + let regex_root = if let Some(root) = regex.root() { + root + } else { + result.push(Default::default()); + + continue; + }; + + let regex_len = regex.len(); + + /// A convenient macro to retrieve the children from the + /// original regular expression with error propagation. + macro_rules! children { + ($n:expr) => { + regex + .children_of($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + /// A convenient macro to retrieve the label from the + /// original regular expression with error propagation. + macro_rules! label { + ($n:expr) => { + regex + .vertex_label($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + let parents = regex.parents_array().map_err(|e| match e { + nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), + nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), + _ => unreachable!(), + })?; + + use RegexType::*; + use TNT::*; + + let mut local_result: Vec> = Vec::with_capacity(regex_len * 2); + let mut builder = ALGBuilder::default(); + + /// Perform a depth-first copy + macro_rules! df_copy { + ($parent:expr, $index:expr) => { + match label!($index) { + Kleene | Plus | Optional | Or | Paren | Empty => { + let mut stack = vec![($parent, $index)]; + + while let Some((top_parent, top_index)) = stack.pop() { + let label = label!(top_index); + let label = match label { + Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), + _ => label, + }; + + local_result.push(label); + + let new = builder.add_vertex(); + + builder.add_edge(top_parent, new, ()).unwrap(); + + stack.extend(children!(top_index).map(|child| (new, child))); + } + } + Lit(remain_tnt) => { + local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); + let new = builder.add_vertex(); + builder.add_edge($parent, new, ()).unwrap(); + } + } + }; + } + + local_result.push(Or); + builder.add_vertex(); + + local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + let non_lit_index = builder.add_vertex(); + + builder.add_edge(0, non_lit_index, ()).unwrap(); + + // If this non-terminal is nullable, add an empty variant. + if self.is_nullable(n)? { + local_result.push(Empty); + let empty_index = builder.add_vertex(); + builder.add_edge(0, empty_index, ()).unwrap(); + } + + for first_node in self.first_nodes_of(n)?.copied() { + assert!(first_node < parents.len()); + + let tnt = match label!(first_node) { + Lit(tnt) => Lit(tnt), + _ => continue, + }; + + let mut parents_chain = { + let mut result = Vec::new(); + let mut stack = Vec::with_capacity(parents.len()); + + stack.push(first_node); + + while let Some(top) = stack.pop() { + assert!(top < parents.len()); + if let Some(parent) = parents.get(top).copied().unwrap() { + result.push(parent); + stack.push(parent.0); + } + } + + result.reverse(); + + result + }; + + if let Some((first, _)) = parents_chain.first() { + assert_eq!(*first, regex_root); + } else { + local_result.push(tnt); + let lit_index = builder.add_vertex(); + builder.add_edge(0, lit_index, ()).unwrap(); + + continue; + } + + // A different, "more local", root. + let mut local_root: usize; + + // Handle the direct parent + let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); + + match label!(parent_node) { + Kleene | Plus => { + // TODO: If parent_edge_index is 0, make a + // Plus node instead. + local_result.extend([Empty, tnt]); + + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + let iterator = children!(parent_node); + + for index in iterator.clone().skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + + Or => { + local_result.push(tnt); + local_root = builder.add_vertex(); + } + Optional | Empty => { + // If this path is taken, it should not be + // optional. + local_result.extend([Empty, tnt]); + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + for index in children!(parent_node).skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + } + Paren | Lit(_) => unreachable!(), + } + + // Handle successive parents + + for (node, edge_index) in parents_chain.into_iter() { + let node_type = label!(node); + + match node_type { + Kleene | Plus => { + // TODO: If edge_index is 0, then just + // make this a Plus node. + + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + + local_root = new_index; + + let iterator = children!(node); + + for index in iterator.clone().skip(edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + RegexType::Or => {} + RegexType::Optional | RegexType::Empty => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + local_root = new_index; + + for index in children!(node).skip(edge_index + 1) { + df_copy!(local_root, index); + } + } + RegexType::Paren | RegexType::Lit(_) => unreachable!(), + } + } + + builder.add_edge(0, local_root, ()).unwrap(); + } + + local_result.shrink_to_fit(); + + let graph = builder.build(); + + assert_eq!(graph.nodes_len(), local_result.len()); + + result.push( + DefaultRegex::new(graph, local_result) + .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, + ); + } + + assert_eq!(result.len(), non_len); + + self.accumulators = { + let mut acc_result = Vec::with_capacity(result.len() + 1); + acc_result.push(0); + + for rule in result.iter() { + acc_result.push(rule.len() + *acc_result.last().unwrap()); + } + + acc_result + }; + + Ok(result) + } + + /// Convert the regular language of left-linear closures to its + /// equivalent nondeterministic finite automaton. + /// + /// In the generation of the left-linear closure, the terminals + /// and non-terminals are packed into an unsigned integer. We + /// unpack them in converting to nondeterministic finite + /// automaton. + /// + /// The resulting nondeterministic finite automaton should be + /// transformed to its null-closure for use in our algorithm. + pub fn left_closure_to_nfa( + &self, + closure: &[DefaultRegex], + ) -> Result>, Error> { + let label_transform = |tnt| match tnt { + TNT::Ter(t) => { + let new_tnt = self.unpack_tnt(t).map_err(|e| match e { + Error::IndexOutOfBounds(index, bound) => { + graph::error::Error::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + })?; + + Ok(SoC::Carry(new_tnt)) + } + TNT::Non(n) => Ok(SoC::Sub(n)), + }; + + let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + + Ok(nfa) + } +} diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs index 4e544c9..627ae6f 100644 --- a/grammar/src/lib.rs +++ b/grammar/src/lib.rs @@ -15,12 +15,15 @@ use nfa::{ nfa::DefaultNFA, regex::{DefaultRegex, ParseError, RegexType}, }, - DOption, Nfa, Regex, SoC, + LabelType, Nfa, NfaLabel, Regex, SoC, TwoEdges, }; use graph::{adlist::ALGBuilder, builder::Builder, Graph}; -use std::{collections::HashSet, fmt::Display}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Display, +}; /// The type of a terminal. /// @@ -88,6 +91,9 @@ impl Display for TNT { #[derive(Debug, Copy, Clone)] #[non_exhaustive] pub enum Error { + /// The operation requires the grammar to be after a certain + /// state, but the grammar is not after that state yet. + WrongState(GrammarState, GrammarState), /// The first component is the index, and the second the bound. IndexOutOfBounds(usize, usize), /// Fail to build the N-th regular expression, due to the @@ -112,6 +118,9 @@ impl Display for Error { "Failed to build the {n}-th regular expression due to error: {pe}" ), Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + Error::WrongState(current, threshold) => { + write!(f, "require state {threshold}, but in state {current}") + } } } } @@ -146,6 +155,36 @@ impl Rule { } } +/// The state of Grammar. +/// +/// This is used to ensure that the grammar preparation methods are +/// called in the correct order. +#[derive(Debug, Copy, Clone, Default)] +pub enum GrammarState { + /// Just initialized + #[default] + Initial, + /// compute_firsts has been called + AfterComputeFirst, + /// left_closure has been called. + AfterLeftClosure, + /// left_closure_to_nfa has been called. + AfterNFA, +} + +impl Display for GrammarState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use GrammarState::*; + + match self { + Initial => write!(f, "initial"), + AfterComputeFirst => write!(f, "after computation of first set"), + AfterLeftClosure => write!(f, "after computation of closure"), + AfterNFA => write!(f, "after computation of NFA"), + } + } +} + /// The type of a grammar. #[derive(Debug, Clone, Default)] pub struct Grammar { @@ -158,6 +197,8 @@ pub struct Grammar { /// The length of the list must match that of the list of /// non-terminals. rules: Vec, + /// The list of successive sums of lengths of rules. + accumulators: Vec, // The following two attributes are empty until we call // `compute_firsts` on the grammar. /// The list of sets of "first terminals". @@ -169,9 +210,18 @@ pub struct Grammar { /// /// The length must match that of the list of non-terminals. first_nodes: Vec>, - // The following attribute is empty until we call `left_closure` - // on the grammar. - left_closure_branches: HashSet, + // The following attribute is empty until we call `closure` on the + // NFA with `transform_label_null_epsilon` as the transformer. + /// A hash map that maps a tuple `(pos1, pos2)` of positions + /// `pos1` and `pos2` in the rules to a vector of rule positions. + /// This vector means that in order to expand from `pos1` to + /// `pos`, it is necessary to expand according to the positions in + /// the vector, so we need to add all these expansions into the + /// parse forest later. + expansion_map: HashMap<(usize, usize), Vec>, + /// The state of the grammar, which tells us what information has + /// been computed for this grammar. + state: GrammarState, } /// A private type to aid the recursive looping of rergular @@ -216,7 +266,14 @@ impl Grammar { .map(|rule| Vec::with_capacity(rule.len())) .collect(); - let left_closure_branches = HashSet::default(); + let state = Default::default(); + + let expansion_map = Default::default(); + + // NOTE: We cannot calculate accumulators here, as we want the + // accumulators of the regular expression of the left-closure, + // not of the original one. + let accumulators = Vec::new(); Self { ter, @@ -224,7 +281,9 @@ impl Grammar { rules, firsts, first_nodes, - left_closure_branches, + state, + expansion_map, + accumulators, } } @@ -258,7 +317,29 @@ impl Grammar { /// Return the total length of all rules. #[inline] pub fn total(&self) -> usize { - self.rules.iter().map(Rule::len).sum() + self.accumulators.last().copied().unwrap_or(0) + } + + /// Query if a position is the starting position of a + /// non-terminal. If it is, return the non-terminal, else return + /// `None` . + #[inline] + pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option { + for (index, accumulator) in self.accumulators.iter().copied().enumerate() { + let shifted_accumulator = accumulator << 1; + + // NOTE: Clippy suggests to call `cmp`, but it seems + // compiler might not yet be smart enough to inline that + // call, so I just silence clippy here. + #[allow(clippy::comparison_chain)] + if pos == shifted_accumulator { + return Some(index); + } else if pos < shifted_accumulator { + break; + } + } + + None } /// Return the number of terminals. @@ -353,754 +434,119 @@ impl Grammar { .contains(&None)) } - /// For a NON_TERMINAL, return an iterator that goes over the - /// nodes that are reachable from the non-terminal through an - /// empty transition of the nondeterministic finite automaton. + /// Query the expansion information from the position `pos1` to + /// the position `pos2` . #[inline] - pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { - Ok(self - .first_nodes - .get(non_terminal) - .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? - .iter()) - } - - /// Return a hash set that contains all nodes in the set of - /// left-closure regular languages that are added because of the - /// left-linear expansion. - pub fn left_closure_branches(&self) -> &HashSet { - &self.left_closure_branches - } - - /// Compute the set of terminals that can appear as the first - /// terminal in some left-linear derivation of a non-terminal, for - /// every non-terminal. - /// - /// This is an algorithm that computes the transitive closure, - /// which is a common approach for this task. But perhaps there - /// are more efficient approaches? - /// - /// Also the function computes the set of "reachable nodes" in the - /// process, and records the information in the `first_nodes` - /// attribute. - pub fn compute_firsts(&mut self) -> Result<(), Error> { - let mut updated = true; - - let non_len = self.non_num(); - - use StackElement::{Seen, Unseen}; - - while updated { - updated = false; - - for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { - let root = if let Some(root) = regex.root() { - root - } else { - if !self.is_nullable(n)? { - updated = true; - - self.firsts.get_mut(n).unwrap().insert(None); - - // The default construction of a grammar - // reserves some space for each vector, so - // explicitly setting this can reduce some - // minor memory overhead. - let pointer = self.first_nodes.get_mut(n).unwrap(); - - pointer.clear(); - pointer.shrink_to_fit(); - } - - continue; - }; - - let regex_len = regex.len(); - - let mut stack: Vec = Vec::with_capacity(regex_len); - - stack.push(Unseen(root)); - - let mut children_sets_stack: Vec>> = - Vec::with_capacity(regex_len); - - let mut children_nodes_stack: Vec> = Vec::with_capacity(regex_len); - - while let Some(top) = stack.pop() { - let top_index = top.index(); - let is_seen = top.is_seen(); - - match regex - .vertex_label(top_index) - .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? - { - RegexType::Kleene => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Plus => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Optional => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Or => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Paren => { - // Only for printing purposes - let mut this_set = HashSet::new(); - - this_set.insert(None); - - children_sets_stack.push(this_set); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - RegexType::Empty => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap().rev() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Lit(tnt) => { - match tnt { - TNT::Ter(t) => { - let mut this_set = HashSet::with_capacity(1); - - this_set.insert(Some(t)); - - children_sets_stack.push(this_set); - } - TNT::Non(non) => { - let this_set = self - .firsts - .get(non) - .ok_or(Error::IndexOutOfBounds(non, non_len))? - .clone(); - - children_sets_stack.push(this_set); - } - } - - let mut this_nodes = HashSet::with_capacity(1); - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - } - } - - assert_eq!( - children_sets_stack.len(), - 1, - "Too many elements left at the end" - ); - - assert_eq!( - children_nodes_stack.len(), - 1, - "Too many elements left at the end" - ); - - for first in children_sets_stack.pop().unwrap().into_iter() { - if !self.firsts.get(n).unwrap().contains(&first) { - updated = true; + pub fn query_expansion(&self, pos1: usize, pos2: usize) -> Result, Error> { + if pos1 >= self.total() { + return Err(Error::IndexOutOfBounds(pos1, self.total())); + } - self.firsts.get_mut(n).unwrap().insert(first); - } - } + if pos2 >= self.total() { + return Err(Error::IndexOutOfBounds(pos2, self.total())); + } - *self.first_nodes.get_mut(n).unwrap() = - children_nodes_stack.pop().unwrap().into_iter().collect(); + match self.state { + GrammarState::AfterLeftClosure => {} + _ => { + return Err(Error::WrongState( + self.state, + GrammarState::AfterLeftClosure, + )); } } - Ok(()) + Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..])) } - /// Return the regular language of the left-linear closures of - /// non-terminals in the grammar. - /// - /// The resulting vector is guaranteed to be of the same length as - /// the number of non-terminals. - /// - /// The resulting regular language is not "self-contained". That - /// is to say, its terminals indices are packed indices and are - /// meaningless without the interpretation of the grammar. They - /// should be converted to a nondeterministic finite automaton and - /// then to its null closure later on. - pub fn left_closure(&self) -> Result>, Error> { - let non_len = self.non_num(); - - let mut result = Vec::with_capacity(non_len); - - for (n, rule) in self.rules.iter().enumerate() { - let regex = &rule.regex; - - let regex_root = if let Some(root) = regex.root() { - root - } else { - result.push(Default::default()); - - continue; - }; - - let regex_len = regex.len(); - - /// A convenient macro to retrieve the children from the - /// original regular expression with error propagation. - macro_rules! children { - ($n:expr) => { - regex - .children_of($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - /// A convenient macro to retrieve the label from the - /// original regular expression with error propagation. - macro_rules! label { - ($n:expr) => { - regex - .vertex_label($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - let parents = regex.parents_array().map_err(|e| match e { - nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), - nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), - _ => unreachable!(), - })?; - - use RegexType::*; - use TNT::*; - - let mut local_result: Vec> = Vec::with_capacity(regex_len * 2); - let mut builder = ALGBuilder::default(); - - /// Perform a depth-first copy - macro_rules! df_copy { - ($parent:expr, $index:expr) => { - match label!($index) { - Kleene | Plus | Optional | Or | Paren | Empty => { - let mut stack = vec![($parent, $index)]; - - while let Some((top_parent, top_index)) = stack.pop() { - let label = label!(top_index); - let label = match label { - Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), - _ => label, - }; - - local_result.push(label); - - let new = builder.add_vertex(); - - builder.add_edge(top_parent, new, ()).unwrap(); - - stack.extend(children!(top_index).map(|child| (new, child))); - } - } - Lit(remain_tnt) => { - local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); - let new = builder.add_vertex(); - builder.add_edge($parent, new, ()).unwrap(); - } - } - }; - } - - local_result.push(Or); - builder.add_vertex(); - - local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); - let non_lit_index = builder.add_vertex(); - - builder.add_edge(0, non_lit_index, ()).unwrap(); - - // If this non-terminal is nullable, add an empty variant. - if self.is_nullable(n)? { - local_result.push(Empty); - let empty_index = builder.add_vertex(); - builder.add_edge(0, empty_index, ()).unwrap(); + // REVIEW: Do we have a better way to record expansion information + // than to compute the transitive closure? + + /// A transformer of labels to be fed into + /// [`closure`][nfa::default::nfa::DefaultNFA::closure], with the + /// predicate that returns true if and only if the label of the + /// first edge is either empty or a nullable non-terminal. + pub fn transform_label_null_epsilon( + &mut self, + two_edges: TwoEdges>, + ) -> LabelType { + let (first_source, first_target, first_label) = two_edges.first_edge(); + let (second_source, second_target, second_label) = two_edges.second_edge(); + + #[cfg(debug_assertions)] + { + assert_eq!(first_target, second_source); + + if let Some(tnt) = *first_label.get_value() { + assert!(matches!(tnt, TNT::Non(n) if matches!(self.is_nullable(n), Ok(true)))); } + } - for first_node in self.first_nodes_of(n)?.copied() { - assert!(first_node < parents.len()); - - let tnt = match label!(first_node) { - Lit(tnt) => Lit(tnt), - _ => continue, - }; - - let mut parents_chain = { - let mut result = Vec::new(); - let mut stack = Vec::with_capacity(parents.len()); - - stack.push(first_node); - - while let Some(top) = stack.pop() { - assert!(top < parents.len()); - if let Some(parent) = parents.get(top).copied().unwrap() { - result.push(parent); - stack.push(parent.0); - } - } + // Compute if this is from left-linear expansion: it is so if + // and only if one if either the edges comes from left-linear + // expansion or we are moving across a non-terminal expansion, + // that is to say, the source of the second edge is the + // starting edge of a non-terminal. - result.reverse(); + let mut left_p = first_label.get_left_p() || second_label.get_left_p(); - result - }; + // Record left-linear expansion information. - if parents_chain.is_empty() { - local_result.push(tnt); - let lit_index = builder.add_vertex(); - builder.add_edge(0, lit_index, ()).unwrap(); + if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) { + left_p = true; - continue; - } + if !self + .expansion_map + .contains_key(&(first_source, second_target)) + { + let original_expansion = self.expansion_map.get(&(second_source, second_target)); - assert_eq!(parents_chain.first().unwrap().0, regex_root); - - // A different, "more local", root. - let mut local_root: usize; - - // Handle the direct parent - let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); - - match label!(parent_node) { - Kleene | Plus => { - // TODO: If parent_edge_index is 0, make a - // Plus node instead. - local_result.extend([Empty, tnt]); - - local_root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(local_root, lit_index, ()).unwrap(); - - let iterator = children!(parent_node); - - for index in iterator.clone().skip(parent_edge_index + 1) { - df_copy!(local_root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(local_root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - - Or => { - local_result.push(tnt); - local_root = builder.add_vertex(); - } - Optional | Empty => { - // If this path is taken, it should not be - // optional. - local_result.extend([Empty, tnt]); - local_root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(local_root, lit_index, ()).unwrap(); - - for index in children!(parent_node).skip(parent_edge_index + 1) { - df_copy!(local_root, index); - } - } - Paren | Lit(_) => unreachable!(), - } + self.expansion_map.insert( + (first_source, second_target), + if let Some(original_expansion) = original_expansion { + let mut result = original_expansion.clone(); + result.push(second_nt); - // Handle successive parents - - for (node, edge_index) in parents_chain.into_iter() { - let node_type = label!(node); - - match node_type { - Kleene | Plus => { - // TODO: If edge_index is 0, then just - // make this a Plus node. - - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, local_root, ()).unwrap(); - - local_root = new_index; - - let iterator = children!(node); - - for index in iterator.clone().skip(edge_index + 1) { - df_copy!(local_root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(local_root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - RegexType::Or => {} - RegexType::Optional | RegexType::Empty => { - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, local_root, ()).unwrap(); - local_root = new_index; - - for index in children!(node).skip(edge_index + 1) { - df_copy!(local_root, index); - } - } - RegexType::Paren | RegexType::Lit(_) => unreachable!(), - } - } - - builder.add_edge(0, local_root, ()).unwrap(); + result + } else { + vec![second_nt] + }, + ); } - - local_result.shrink_to_fit(); - - let graph = builder.build(); - - assert_eq!(graph.nodes_len(), local_result.len()); - - result.push( - DefaultRegex::new(graph, local_result) - .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, - ); } - assert_eq!(result.len(), non_len); - - Ok(result) + NfaLabel::new(second_label.get_value(), second_label.get_moved(), left_p) } - /// Convert the regular language of left-linear closures to its - /// equivalent nondeterministic finite automaton. - /// - /// In the generation of the left-linear closure, the terminals - /// and non-terminals are packed into an unsigned integer. We - /// unpack them in converting to nondeterministic finite - /// automaton. - /// - /// The resulting nondeterministic finite automaton should be - /// transformed to its null-closure for use in our algorithm. - pub fn left_closure_to_nfa( - &self, - closure: &[DefaultRegex], - ) -> Result>, Error> { - let label_transform = |tnt| match tnt { - TNT::Ter(t) => { - let new_tnt = self.unpack_tnt(t).map_err(|e| match e { - Error::IndexOutOfBounds(index, bound) => { - graph::error::Error::IndexOutOfBounds(index, bound) - } - _ => unreachable!(), - })?; - - Ok(SoC::Carry(new_tnt)) + /// For a NON_TERMINAL, return an iterator that goes over the + /// nodes that are reachable from the non-terminal through an + /// empty transition of the nondeterministic finite automaton. + #[inline] + pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { + match self.state { + GrammarState::Initial => { + return Err(Error::WrongState( + self.state, + GrammarState::AfterComputeFirst, + )); } - TNT::Non(n) => Ok(SoC::Sub(n)), - }; - - let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + GrammarState::AfterComputeFirst + | GrammarState::AfterLeftClosure + | GrammarState::AfterNFA => {} + } - Ok(nfa) + Ok(self + .first_nodes + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .iter()) } } +pub mod first_set; + +pub mod left_closure; + impl Display for Grammar { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { assert_eq!(self.non.len(), self.rules.len()); diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs index c236952..89f9844 100644 --- a/grammar/src/test_grammar_helper.rs +++ b/grammar/src/test_grammar_helper.rs @@ -275,7 +275,6 @@ pub fn new_paren_grammar() -> Result> { } /// Return a left recursive grammar. -#[allow(dead_code)] pub fn new_left_recursive_grammar() -> Result> { let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; let non = vec![ diff --git a/grammar/src/tests/test_grammar_left_closure.rs b/grammar/src/tests/test_grammar_left_closure.rs index 0bc9f4d..003c211 100644 --- a/grammar/src/tests/test_grammar_left_closure.rs +++ b/grammar/src/tests/test_grammar_left_closure.rs @@ -1,5 +1,6 @@ use crate::test_grammar_helper::*; use crate::*; +use graph::LabelGraph; use nfa::Nfa; use std::{ collections::HashSet, @@ -124,7 +125,7 @@ fn test_remove_epsilon() -> Result<(), Box> { nfa.print_viz("nfa_orig.gv")?; - nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_epsilon(|label| label.get_value().is_none())?; nfa.print_viz("nfa_no_epsilon.gv")?; @@ -174,7 +175,7 @@ fn test_remove_dead() -> Result<(), Box> { nfa.print_viz("nfa_orig.gv")?; - nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_epsilon(|label| label.get_value().is_none())?; let accumulators: HashSet = accumulators.into_iter().collect(); @@ -192,7 +193,8 @@ fn test_remove_dead() -> Result<(), Box> { #[test] #[ignore] fn test_nulling() -> Result<(), Box> { - let mut grammar = new_left_recursive_grammar()?; + // TODO: Test more grammars. + let mut grammar = new_right_recursive_grammar()?; let closure = new_closure_regex(&mut grammar)?; let mut lock = stdout().lock(); @@ -244,24 +246,31 @@ fn test_nulling() -> Result<(), Box> { let mut nfa = grammar.left_closure_to_nfa(&closure)?; - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. - TNT::Non(n) => grammar.is_nullable(n).unwrap(), - } - } else { - true - } - })?; - let grammar_reserve_nodes = |node| accumulators.contains(&node); writeln!(lock, "accumulators are {accumulators:?}")?; - nfa.remove_epsilon(|label| label.is_none())?; + let nullables: HashSet = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true + } + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + + for (label, child_iter) in nfa.labels_of(18)? { + writeln!(lock, "{label}: {child_iter:?}")?; + } + nfa.remove_dead(grammar_reserve_nodes)?; writeln!(lock, "Printing nfa to nfa.gv")?; -- cgit v1.2.3-18-g5258