summaryrefslogtreecommitdiff
path: root/grammar/src/left_closure.rs
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-11 23:47:26 +0800
commit1a3d346f413325ed37848a6b2526e8e729269833 (patch)
treeab8812f8094d096c68aee53cf516e986cc9a273a /grammar/src/left_closure.rs
parentf27d604d93ce583d4404e1874664e08382ea2f00 (diff)
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine".
Diffstat (limited to 'grammar/src/left_closure.rs')
-rw-r--r--grammar/src/left_closure.rs324
1 files changed, 324 insertions, 0 deletions
diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs
new file mode 100644
index 0000000..1630881
--- /dev/null
+++ b/grammar/src/left_closure.rs
@@ -0,0 +1,324 @@
+//! This file implements some functions to compute the regular
+//! language of left-linear closure of a grammar.
+
+use super::*;
+
+use nfa::LabelType;
+
+impl Grammar {
+ /// Return the regular language of the left-linear closures of
+ /// non-terminals in the grammar.
+ ///
+ /// The resulting vector is guaranteed to be of the same length as
+ /// the number of non-terminals.
+ ///
+ /// The resulting regular language is not "self-contained". That
+ /// is to say, its terminals indices are packed indices and are
+ /// meaningless without the interpretation of the grammar. They
+ /// should be converted to a nondeterministic finite automaton and
+ /// then to its null closure later on.
+ pub fn left_closure(&mut self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
+ match self.state {
+ GrammarState::Initial => {
+ return Err(Error::WrongState(
+ self.state,
+ GrammarState::AfterComputeFirst,
+ ))
+ }
+ GrammarState::AfterLeftClosure
+ | GrammarState::AfterNFA
+ | GrammarState::AfterComputeFirst => {}
+ }
+
+ let non_len = self.non_num();
+
+ let mut result = Vec::with_capacity(non_len);
+
+ for (n, rule) in self.rules.iter().enumerate() {
+ let regex = &rule.regex;
+
+ let regex_root = if let Some(root) = regex.root() {
+ root
+ } else {
+ result.push(Default::default());
+
+ continue;
+ };
+
+ let regex_len = regex.len();
+
+ /// A convenient macro to retrieve the children from the
+ /// original regular expression with error propagation.
+ macro_rules! children {
+ ($n:expr) => {
+ regex
+ .children_of($n)
+ .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
+ };
+ }
+
+ /// A convenient macro to retrieve the label from the
+ /// original regular expression with error propagation.
+ macro_rules! label {
+ ($n:expr) => {
+ regex
+ .vertex_label($n)
+ .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
+ };
+ }
+
+ let parents = regex.parents_array().map_err(|e| match e {
+ nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
+ nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
+ _ => unreachable!(),
+ })?;
+
+ use RegexType::*;
+ use TNT::*;
+
+ let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);
+ let mut builder = ALGBuilder::default();
+
+ /// Perform a depth-first copy
+ macro_rules! df_copy {
+ ($parent:expr, $index:expr) => {
+ match label!($index) {
+ Kleene | Plus | Optional | Or | Paren | Empty => {
+ let mut stack = vec![($parent, $index)];
+
+ while let Some((top_parent, top_index)) = stack.pop() {
+ let label = label!(top_index);
+ let label = match label {
+ Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
+ _ => label,
+ };
+
+ local_result.push(label);
+
+ let new = builder.add_vertex();
+
+ builder.add_edge(top_parent, new, ()).unwrap();
+
+ stack.extend(children!(top_index).map(|child| (new, child)));
+ }
+ }
+ Lit(remain_tnt) => {
+ local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
+ let new = builder.add_vertex();
+ builder.add_edge($parent, new, ()).unwrap();
+ }
+ }
+ };
+ }
+
+ local_result.push(Or);
+ builder.add_vertex();
+
+ local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
+ let non_lit_index = builder.add_vertex();
+
+ builder.add_edge(0, non_lit_index, ()).unwrap();
+
+ // If this non-terminal is nullable, add an empty variant.
+ if self.is_nullable(n)? {
+ local_result.push(Empty);
+ let empty_index = builder.add_vertex();
+ builder.add_edge(0, empty_index, ()).unwrap();
+ }
+
+ for first_node in self.first_nodes_of(n)?.copied() {
+ assert!(first_node < parents.len());
+
+ let tnt = match label!(first_node) {
+ Lit(tnt) => Lit(tnt),
+ _ => continue,
+ };
+
+ let mut parents_chain = {
+ let mut result = Vec::new();
+ let mut stack = Vec::with_capacity(parents.len());
+
+ stack.push(first_node);
+
+ while let Some(top) = stack.pop() {
+ assert!(top < parents.len());
+ if let Some(parent) = parents.get(top).copied().unwrap() {
+ result.push(parent);
+ stack.push(parent.0);
+ }
+ }
+
+ result.reverse();
+
+ result
+ };
+
+ if let Some((first, _)) = parents_chain.first() {
+ assert_eq!(*first, regex_root);
+ } else {
+ local_result.push(tnt);
+ let lit_index = builder.add_vertex();
+ builder.add_edge(0, lit_index, ()).unwrap();
+
+ continue;
+ }
+
+ // A different, "more local", root.
+ let mut local_root: usize;
+
+ // Handle the direct parent
+ let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();
+
+ match label!(parent_node) {
+ Kleene | Plus => {
+ // TODO: If parent_edge_index is 0, make a
+ // Plus node instead.
+ local_result.extend([Empty, tnt]);
+
+ local_root = builder.add_vertex();
+ let lit_index = builder.add_vertex();
+ builder.add_edge(local_root, lit_index, ()).unwrap();
+
+ let iterator = children!(parent_node);
+
+ for index in iterator.clone().skip(parent_edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+
+ local_result.push(Kleene);
+ let new_parent = builder.add_vertex();
+ builder.add_edge(local_root, new_parent, ()).unwrap();
+
+ for index in iterator {
+ df_copy!(new_parent, index);
+ }
+ }
+
+ Or => {
+ local_result.push(tnt);
+ local_root = builder.add_vertex();
+ }
+ Optional | Empty => {
+ // If this path is taken, it should not be
+ // optional.
+ local_result.extend([Empty, tnt]);
+ local_root = builder.add_vertex();
+ let lit_index = builder.add_vertex();
+ builder.add_edge(local_root, lit_index, ()).unwrap();
+
+ for index in children!(parent_node).skip(parent_edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+ }
+ Paren | Lit(_) => unreachable!(),
+ }
+
+ // Handle successive parents
+
+ for (node, edge_index) in parents_chain.into_iter() {
+ let node_type = label!(node);
+
+ match node_type {
+ Kleene | Plus => {
+ // TODO: If edge_index is 0, then just
+ // make this a Plus node.
+
+ local_result.push(Empty);
+ let new_index = builder.add_vertex();
+ builder.add_edge(new_index, local_root, ()).unwrap();
+
+ local_root = new_index;
+
+ let iterator = children!(node);
+
+ for index in iterator.clone().skip(edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+
+ local_result.push(Kleene);
+ let new_parent = builder.add_vertex();
+ builder.add_edge(local_root, new_parent, ()).unwrap();
+
+ for index in iterator {
+ df_copy!(new_parent, index);
+ }
+ }
+ RegexType::Or => {}
+ RegexType::Optional | RegexType::Empty => {
+ local_result.push(Empty);
+ let new_index = builder.add_vertex();
+ builder.add_edge(new_index, local_root, ()).unwrap();
+ local_root = new_index;
+
+ for index in children!(node).skip(edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+ }
+ RegexType::Paren | RegexType::Lit(_) => unreachable!(),
+ }
+ }
+
+ builder.add_edge(0, local_root, ()).unwrap();
+ }
+
+ local_result.shrink_to_fit();
+
+ let graph = builder.build();
+
+ assert_eq!(graph.nodes_len(), local_result.len());
+
+ result.push(
+ DefaultRegex::new(graph, local_result)
+ .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
+ );
+ }
+
+ assert_eq!(result.len(), non_len);
+
+ self.accumulators = {
+ let mut acc_result = Vec::with_capacity(result.len() + 1);
+ acc_result.push(0);
+
+ for rule in result.iter() {
+ acc_result.push(rule.len() + *acc_result.last().unwrap());
+ }
+
+ acc_result
+ };
+
+ Ok(result)
+ }
+
+ /// Convert the regular language of left-linear closures to its
+ /// equivalent nondeterministic finite automaton.
+ ///
+ /// In the generation of the left-linear closure, the terminals
+ /// and non-terminals are packed into an unsigned integer. We
+ /// unpack them in converting to nondeterministic finite
+ /// automaton.
+ ///
+ /// The resulting nondeterministic finite automaton should be
+ /// transformed to its null-closure for use in our algorithm.
+ pub fn left_closure_to_nfa(
+ &self,
+ closure: &[DefaultRegex<TNT>],
+ ) -> Result<DefaultNFA<LabelType<TNT>>, Error> {
+ let label_transform = |tnt| match tnt {
+ TNT::Ter(t) => {
+ let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
+ Error::IndexOutOfBounds(index, bound) => {
+ graph::error::Error::IndexOutOfBounds(index, bound)
+ }
+ _ => unreachable!(),
+ })?;
+
+ Ok(SoC::Carry(new_tnt))
+ }
+ TNT::Non(n) => Ok(SoC::Sub(n)),
+ };
+
+ let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?;
+
+ Ok(nfa)
+ }
+}