diff options
author | JSDurand <mmemmew@gmail.com> | 2023-01-11 23:47:26 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-01-11 23:47:26 +0800 |
commit | 1a3d346f413325ed37848a6b2526e8e729269833 (patch) | |
tree | ab8812f8094d096c68aee53cf516e986cc9a273a | |
parent | f27d604d93ce583d4404e1874664e08382ea2f00 (diff) |
Record left-linear expansion and forest format
Now the grammar will record the left-linear expansions when generating
the nondeterministic finite automaton frmo its rules, and will record
whether an edge in the nondeterministic finite automaton comes from a
left-linear expansion. The latter is needed because while performing
a chain-rule derivation, we do not need the left-linear expanded
derivations in the "first layer". This might well have been the root
cause of the bad performance of the previous version of this package.
Also I have figured out how to properly generate and handle parse
forests while manipulating the "chain-rule machine".
-rw-r--r-- | chain/src/atom/default.rs | 140 | ||||
-rw-r--r-- | chain/src/atom/mod.rs | 4 | ||||
-rw-r--r-- | chain/src/plan.org | 52 | ||||
-rw-r--r-- | forest/src/default.rs | 144 | ||||
-rw-r--r-- | forest/src/design.org | 95 | ||||
-rw-r--r-- | forest/src/lib.rs | 116 | ||||
-rw-r--r-- | grammar/src/first_set.rs | 451 | ||||
-rw-r--r-- | grammar/src/left_closure.rs | 324 | ||||
-rw-r--r-- | grammar/src/lib.rs | 906 | ||||
-rw-r--r-- | grammar/src/test_grammar_helper.rs | 1 | ||||
-rw-r--r-- | grammar/src/tests/test_grammar_left_closure.rs | 43 | ||||
-rw-r--r-- | graph/src/adlist.rs | 2 | ||||
-rw-r--r-- | graph/src/adset.rs | 2 | ||||
-rw-r--r-- | graph/src/builder.rs | 4 | ||||
-rw-r--r-- | graph/src/labelled/binary.rs | 406 | ||||
-rw-r--r-- | graph/src/labelled/double.rs | 9 | ||||
-rw-r--r-- | graph/src/lib.rs | 31 | ||||
-rw-r--r-- | nfa/src/default/nfa.rs | 210 | ||||
-rw-r--r-- | nfa/src/lib.rs | 155 |
19 files changed, 1903 insertions, 1192 deletions
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs index 72989b3..90133f4 100644 --- a/chain/src/atom/default.rs +++ b/chain/src/atom/default.rs @@ -4,7 +4,10 @@ use super::*; use grammar::Grammar; use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; -use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; +use nfa::{ + default::{nfa::DefaultNFA, regex::DefaultRegex}, + LabelType, +}; use core::fmt::Display; use std::collections::BTreeMap as Map; @@ -35,7 +38,7 @@ type VirtualMap = Map<VirtualNode, usize>; #[derive(Debug, Clone, Default)] pub struct DefaultAtom { grammar: Grammar, - nfa: DefaultNFA<DOption<TNT>>, + nfa: DefaultNFA<LabelType<TNT>>, // NOTE: This is mostly for printing and debugging regexp: Vec<DefaultRegex<TNT>>, virtual_nodes: VirtualMap, @@ -95,7 +98,7 @@ impl Display for DefaultAtom { // LabelGraph, in order to implement Nfa. impl Graph for DefaultAtom { - type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b> + type Iter<'b> = <DefaultNFA<LabelType<TNT>> as Graph>::Iter<'b> where Self: 'b; @@ -130,23 +133,23 @@ impl Graph for DefaultAtom { } } -impl LabelGraph<DOption<TNT>> for DefaultAtom { - type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b> +impl LabelGraph<LabelType<TNT>> for DefaultAtom { + type Iter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::Iter<'b> where Self: 'b; - type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b> + type LabelIter<'b> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::LabelIter<'b> where Self: 'b, DOption<TNT>: 'b; - type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a> + type EdgeLabelIter<'a> = <DefaultNFA<LabelType<TNT>> as LabelGraph<LabelType<TNT>>>::EdgeLabelIter<'a> where Self: 'a, DOption<TNT>: 'a; #[inline] - fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> { + fn vertex_label(&self, node_id: usize) -> Result<Option<LabelType<TNT>>, GraphError> { self.nfa.vertex_label(node_id) } @@ -163,8 +166,8 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom { fn find_children_with_label( &self, node_id: usize, - label: &DOption<TNT>, - ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> { + label: &LabelType<TNT>, + ) -> Result<<Self as LabelGraph<LabelType<TNT>>>::Iter<'_>, GraphError> { self.nfa.find_children_with_label(node_id, label) } @@ -177,39 +180,31 @@ impl LabelGraph<DOption<TNT>> for DefaultAtom { fn has_edge_label( &self, node_id: usize, - label: &DOption<TNT>, + label: &LabelType<TNT>, target: usize, ) -> Result<bool, GraphError> { self.nfa.has_edge_label(node_id, label, target) } } -impl LabelExtGraph<DOption<TNT>> for DefaultAtom { +impl LabelExtGraph<LabelType<TNT>> for DefaultAtom { #[inline] fn extend( &mut self, - edges: impl IntoIterator<Item = (DOption<TNT>, usize)>, + edges: impl IntoIterator<Item = (LabelType<TNT>, usize)>, ) -> Result<usize, GraphError> { self.nfa.extend(edges) } } -impl Nfa<DOption<TNT>> for DefaultAtom { - #[inline] - fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error> - where - F: Fn(DOption<TNT>) -> bool, - { - self.nfa.remove_epsilon(f) - } - +impl Nfa<LabelType<TNT>> for DefaultAtom { type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = (); #[inline] fn to_nfa( - _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>], - _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>, 
- _default: Option<DOption<TNT>>, + _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<LabelType<TNT>>>], + _sub_pred: impl Fn(LabelType<TNT>) -> Result<nfa::SoC<LabelType<TNT>>, nfa::error::Error>, + _default: Option<LabelType<TNT>>, ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> { // NOTE: We cannot construct an atom from a set of regular // languages alone. So it is appropriate to panic here, if @@ -218,13 +213,20 @@ impl Nfa<DOption<TNT>> for DefaultAtom { } #[inline] - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), nfa::error::Error> { self.nfa.remove_dead(reserve) } #[inline] - fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> { - self.nfa.nulling(f) + fn closure( + &mut self, + predicate: impl FnMut(LabelType<TNT>) -> bool, + remove_after_p: bool, + transform: impl FnMut(nfa::TwoEdges<LabelType<TNT>>) -> LabelType<TNT>, + remove_predicate: impl FnMut(LabelType<TNT>) -> bool, + ) -> Result<(), nfa::error::Error> { + self.nfa + .closure(predicate, remove_after_p, transform, remove_predicate) } } @@ -237,46 +239,56 @@ impl DefaultAtom { let mut nfa = grammar.left_closure_to_nfa(®exp)?; - use std::collections::HashSet; + use std::collections::{HashMap, HashSet}; let accumulators: Vec<usize> = { let mut result = Vec::with_capacity(regexp.len() + 1); result.push(0); for regex in regexp.iter() { + // Calling `unwrap` here is safe as `result` is always + // non-empty. result.push(regex.nodes_len() * 2 + result.last().unwrap()); } - result.into_iter().collect() + result }; let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect(); - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. - TNT::Non(n) => grammar.is_nullable(n).unwrap(), + let nullables: HashSet<usize> = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + // Perform nulling and remove_epsilon at the same time. + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true } - } else { - true - } - })?; - nfa.remove_epsilon(|label| label.is_none())?; + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; - // now add the virtual nodes + // Now add the virtual nodes. let mut virtual_nodes: VirtualMap = Default::default(); let nt_num = grammar.non_num(); assert!(nt_num <= accumulators.len()); - // Convert an error telling us that an index is out of bounds. - // - // Panics if the error is not of the expected kind. + /// Convert an error telling us that an index is out of bounds. + /// + /// # Panics + /// + /// The function panics if the error is not of the expected + /// kind. fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { match ge { GraphError::IndexOutOfBounds(index, bound) => { @@ -287,24 +299,34 @@ impl DefaultAtom { } for nt in 0..nt_num { - let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa - // this is safe because of the above assertion. + let children: std::collections::HashMap<_, _> = nfa + // This is safe because of the above assertion. 
.labels_of(*accumulators.get(nt).unwrap()) .map_err(index_out_of_bounds_conversion)? - .map(|(label, target_iter)| (*label, target_iter.collect())) + .map(|(label, target_iter)| (*label, target_iter)) .collect(); - for (label, children_vec) in children.into_iter() { - if let Some(TNT::Ter(t)) = *label { - let new_index = nfa - .extend(children_vec.into_iter().map(|target| (label, target))) - .map_err(index_out_of_bounds_conversion)?; + let mut terminals_map: HashMap<usize, HashSet<(LabelType<TNT>, usize)>> = + HashMap::new(); - let virtual_node = VirtualNode::new(nt, t); - - virtual_nodes.insert(virtual_node, new_index); + for (label, children_iter) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label.get_value() { + terminals_map + .entry(t) + .or_insert_with(|| HashSet::with_capacity(children_iter.len())) + .extend(children_iter.map(|target| (label, target))); } } + + for (t, set) in terminals_map.into_iter() { + let new_index = nfa + .extend(set.into_iter()) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } } Ok(Self { @@ -335,8 +357,6 @@ impl Atom for DefaultAtom { } fn empty(&self) -> usize { - assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); - - self.nfa.nodes_len() - 2 + self.grammar.total() << 1 } } diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs index 084acca..065640b 100644 --- a/chain/src/atom/mod.rs +++ b/chain/src/atom/mod.rs @@ -7,10 +7,10 @@ //! I have better ideas in the future. use grammar::{Error as GrammarError, TNT}; -use nfa::{DOption, Nfa}; +use nfa::{DOption, LabelType, Nfa}; /// The expected behaviours of an atomic language. -pub trait Atom: Nfa<DOption<TNT>> { +pub trait Atom: Nfa<LabelType<TNT>> { /// Return the index of a node representing the derivative of the /// left-linear null closure of `nt` with respect to `t`. fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>; diff --git a/chain/src/plan.org b/chain/src/plan.org index bbd6683..b708413 100644 --- a/chain/src/plan.org +++ b/chain/src/plan.org @@ -2,7 +2,7 @@ #+AUTHOR: Durand #+DATE: <2022-11-18 Ven 19:57> -* Things to do [5/10] +* Things to do [6/10] - [X] Implement builders for graphs - [X] Find sets of the first terminals for each non-terminal, in the @@ -38,18 +38,30 @@ finite automata. * [X] Test the removal of dead states, where a state is dead if and only if no other states have an edge into that state. -- [-] Refactor [2/8] +- [X] Refactor [7/7] + [X] Implement a data type for graphs with labels on vertices and edges, but do not need to index edges by labels, which can index vertices by labels instead. * [X] Test it. + [X] Implement a builder that borrows a graph mutably. * [X] Test it. - + [ ] Implement a data type for graphs in which every node knows its + + [X] Implement a data type for graphs in which every node knows its parents, and every node has a unique label but edges have no labels. - * [ ] Test it. - + [ ] We need to record the expansions of those "virtual nodes". + * [X] Test it. + + [X] When we pull in some regular language because of the + left-linear expansion, we need to mark that branch as coming from + left-linear expansions. This is necessary because we cannot + follow a left-linearly expanded branch when we take the derivative + of a language. We only expand left-linearly when we try to access + the atomic languages [s]^{(t)}. 
+ + [X] An edge of the NFA should carry a label that can be more + informative than solely a terminal or a non-terminal. + + [X] Add a mechanism for a grammar to attach labels to edges of NFA + which are not necessarily Copy-able, and which should be stored in a + separate array, such that the labels on edges of NFA point to the + elements of the array. + + [X] We need to record the expansions of those "virtual nodes". That is, the virtual nodes represent atomic languages such as [s]^{(t)} where s is a non-terminal and t is a terminal. To be more specific, it represents the derivative of the left-linear closure @@ -59,32 +71,21 @@ alone, without consuming any inputs, by expanding according to the grammar rules. This means that we need to know which non-terminals were expanded in order to get to a state in [s]^{(t)}. - + [ ] Each edge in the chain-rule machine needs to be labelled also - with a position in the forest. This perfectly solves the problem - of those "plugs"! - + [ ] When we pull in some regular language because of the - left-linear expansion, we need to mark that branch as coming from - left-linear expansions. This is necessary because we cannot - follow a left-linearly expanded branch when we take the derivative - of a language. We only expand left-linearly when we try to access - the atomic languages [s]^{(t)}. We can mark by returning a set of - nodes which are the beginnings of left-linearly expanded branches. - + [ ] An edge of the NFA should carry a label that can be more - informative than solely a terminal or a non-terminal. - + [ ] Add a mechanism for a grammar to attach labels to edges of NFA - which are not necessarily Copy-able, and which should be stored in a - separate array, such that the labels on edges of NFA point to the - elements of the array. + * [X] Test + * [X] Test more - [X] Atom [3/3] + [X] Define the Atom trait. + [X] Implement virtual nodes for each derivative of each atom: The lack of this step might be the root cause of the failure of the previous version of this project. + [X] Test atoms -- [-] Implement languages. [1/3] +- [-] Implement languages. [1/4] + [X] First define a trait with the expected behaviours. + [ ] Then implement them as doubly labelled graphs. - + [ ] Thenimplement finding derivatives by use of the chain rule. + + [ ] Then implement finding derivatives by use of the chain rule. + + [ ] Each edge in the chain-rule machine needs to be labelled also + with a position in the forest. This perfectly solves the problem + of those "plugs"! - [-] Implement forests [2/3] + [X] Design a format of forests. This should be the most crucial thing to do, in order to have a better understanding of the whole @@ -92,10 +93,7 @@ the binarised shared packed parsed forests, that reflects the regular expressions in the grammar equations. + [X] Define a trait with the expected behaviours. - + [-] Implement them using adjacency map: we only need one label per - edge, and we do not wish to exclude duplicate edges, and we do not - need to index edges by the labels. All in all, we implement them - using a vector of hashmaps. + + [-] Implement them as parents-knowing graphs. - [ ] Implement semiring. [0/5] + [ ] Define the trait. + [ ] Implement the boolean semiring. diff --git a/forest/src/default.rs b/forest/src/default.rs index 5e438d4..d3970e9 100644 --- a/forest/src/default.rs +++ b/forest/src/default.rs @@ -1,141 +1,21 @@ //! This file provides a default implementation for the //! [`Forest`][crate::Forest] trait. 
+#[allow(unused_imports)] use super::*; +#[allow(unused_imports)] use graph::{error::Error as GraphError, ALGraph, Graph}; +#[allow(unused_imports)] use std::collections::{hash_set::Iter, HashMap, HashSet}; -#[derive(Debug, Clone)] -pub struct DefaultForest<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> { - graph: ALGraph, - vertex_labels: HashMap<usize, NodeLabel>, - edge_labels: HashMap<(usize, usize), EdgeLabel>, - plugins: HashSet<usize>, - plugouts: HashSet<usize>, -} +// TODO: Use PLGraph instead. -impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> Default for DefaultForest<NodeLabel, EdgeLabel> { - fn default() -> Self { - Self { - graph: Default::default(), - vertex_labels: Default::default(), - edge_labels: Default::default(), - plugins: Default::default(), - plugouts: Default::default(), - } - } -} - -impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> Graph for DefaultForest<NodeLabel, EdgeLabel> { - type Iter<'a> = <ALGraph as Graph>::Iter<'a> - where - Self: 'a; - - fn is_empty(&self) -> bool { - self.graph.is_empty() - } - - fn nodes_len(&self) -> usize { - self.graph.nodes_len() - } - - fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GraphError> { - self.graph.children_of(node_id) - } - - fn degree(&self, node_id: usize) -> Result<usize, GraphError> { - self.graph.degree(node_id) - } - - fn is_empty_node(&self, node_id: usize) -> Result<bool, GraphError> { - self.graph.is_empty_node(node_id) - } - - fn has_edge(&self, source: usize, target: usize) -> Result<bool, GraphError> { - self.graph.has_edge(source, target) - } - - fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { - unimplemented!() - } -} - -#[derive(Debug)] -pub struct LabelIter<'a> { - set_iter: Iter<'a, usize>, -} - -impl<'a> Iterator for LabelIter<'a> { - type Item = usize; - - fn next(&mut self) -> Option<Self::Item> { - self.set_iter.next().copied() - } - - fn size_hint(&self) -> (usize, Option<usize>) { - self.set_iter.size_hint() - } -} - -impl<'a> ExactSizeIterator for LabelIter<'a> { - fn len(&self) -> usize { - self.set_iter.len() - } -} - -impl<'a> From<Iter<'a, usize>> for LabelIter<'a> { - fn from(set_iter: Iter<'a, usize>) -> Self { - Self { set_iter } - } -} - -impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> Forest<NodeLabel, EdgeLabel> - for DefaultForest<NodeLabel, EdgeLabel> -{ - type PluginIter<'a> = LabelIter<'a> - where - Self: 'a; - - type PlugoutIter<'a> = LabelIter<'a> - where - Self: 'a; - - fn plugins(&self) -> Self::PluginIter<'_> { - self.plugins.iter().into() - } - - fn plugouts(&self) -> Self::PlugoutIter<'_> { - self.plugouts.iter().into() - } - - fn plug(&mut self, other: &Self) -> Result<(), Error> { - // PLAN: Produce a BuilderMut, adjust the indices for edges, - // and then add edges between plugs. - // - // Since I cannot touch the underlying nodes directly, I have - // to add the nodes and edges individually. 
- - todo!() - } - - fn vertex_label(&self, node_id: usize) -> Result<Option<NodeLabel>, Error> { - if node_id >= self.nodes_len() { - return Err(Error::IndexOutOfBounds(node_id, self.nodes_len())); - } - - Ok(self.vertex_labels.get(&node_id).copied()) - } - - fn edge_label(&self, source: usize, target: usize) -> Result<Option<EdgeLabel>, Error> { - if source >= self.nodes_len() { - return Err(Error::IndexOutOfBounds(source, self.nodes_len())); - } - - if target >= self.nodes_len() { - return Err(Error::IndexOutOfBounds(target, self.nodes_len())); - } - - Ok(self.edge_labels.get(&(source, target)).copied()) - } -} +// #[derive(Debug, Clone)] +// pub struct DefaultForest<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> { +// graph: ALGraph, +// vertex_labels: HashMap<usize, NodeLabel>, +// edge_labels: HashMap<(usize, usize), EdgeLabel>, +// plugins: HashSet<usize>, +// plugouts: HashSet<usize>, +// } diff --git a/forest/src/design.org b/forest/src/design.org index 771ca4b..09db113 100644 --- a/forest/src/design.org +++ b/forest/src/design.org @@ -28,7 +28,7 @@ topic sooner or later, why not deal with it now? ;P represent this alternation. Packed nodes are not labelled. They just serve the role of putting nodes together. -* Some thoughts [1/3] +* Some thoughts We do not need to attach forest fragments to nodes of nondeterministic finite automata: we just attach to them lists of grammar slots, which @@ -44,23 +44,98 @@ non-terminals. Some caveats: -- [X] Every node in the forest needs to know its parents. This is - needed for "reductions". That is, after we have parsed the entire +- Every node in the forest needs to know its parents. This is needed + for "reductions". That is, after we have parsed the entire expansion for a non-terminal, we need to go back to where that non-terminal was and continue from there. -- [ ] We need to record the expansions of those "virtual nodes". That - is, the virtual nodes represent atomic languages such as [s]^{(t)} - where s is a non-terminal and t is a terminal. To be more specific, - it represents the derivative of the left-linear closure of the +- We need to record the expansions of those "virtual nodes". That is, + the virtual nodes represent atomic languages such as [s]^{(t)} where s + is a non-terminal and t is a terminal. To be more specific, it + represents the derivative of the left-linear closure of the non-terminal s with respect to the terminal t. The left-linear closure of s is the set of all strings (consisting of terminals and non-terminals alike) that are derivable from s alone, without consuming any inputs, by expanding according to the grammar rules. This means that we need to know if which non-terminals were expanded in order to get to a state in [s]^{(t)}. -- [ ] Each edge in the chain-rule machine needs to be labelled also - with a position in the forest. This perfectly solves the problem of - those "plugs"! +- Each edge in the chain-rule machine needs to be labelled also with a + position in the forest. This perfectly solves the problem of those + "plugs"! +* Formal Definition +We need to figure out the definition of the forest. That it to say, +what will the resulting forest look like for a grammar. +Moreover, we need to know how to correspond each step in the +chain-rule machine with the definition of the forest. + +After these two steps, we can easily proceed with the construction of +the chain-rule machine. + +** Axioms + +There are two *axioms* from which the definition of the forests +follows. + +1. 
Each node has a unique label, given (principally) by three + components: + 1) input range start + 2) input range end + 3) grammar label +2. If different subtrees share the same node label as the root, a + "clone" of the root should be made, and there should be a + "representative" of all clones, which is treated as a normal node + with that label, and all clones are its children. + +The first axiom ensures that the size of the forest is bounded by the +cube of \(n\), \(n\) is the length of the input sequence. And the +second axiom specifies what to do when subtrees share a parent. + +Since the clones are introduced only when subtrees share a parent, +their number cannot exceed the number of nodes of a forest without +clones. This shows that adding clones does not affect the growth rate +of the size of forests. + +A remark: the /clones/ are called /packed nodes/ in the traditional +terminology, but the terminology of a clone makes more sense to me. + +** Correspondence with chain-rule machine + +Every edge in the chain-rule machine corresponds to an edge in the +forest; denote this correspondence by \(f\). + +The chain-rule operation can be described as follows: + +1. Start with an edge \(e\) with children \(d_1, \ldots, d_n\) in the + chain-rule machine. +2. Prepare a new forest fragment as follows. + 1) For every child \(g\) of \(e\) in the atomic language, if \(g\) + is the terminal that appears as the current input \(t\), let the + new fragment be defined as the node moved by \(g\), alone. + 2) If \(g\) is a non-terminal and its first-set contains \(t\), + then for every left-linearly closed child of \(g\) that is + labelled \(t\), denoted as \(h\), then let the fragment be + defined as the node moved by \(g\), with the pre-recorded + left-linear expansion information from \(g\) to \(h\) appended + as children. +3. For \(i=1,\ldots,n\), consider the edge \(f(d_i)\) in the forest. + There are three cases: + 1) If the next edge is empty, that means we have found something + totally new to add to the forest, so we just add that. + 2) If the next edge after \(f(e)\) contains the new fragment as a + sub-tree already, this means the current branch has already been + explored before, and we have nothing else to do. + 3) If the next edge is non-empty and does not match the new + fragment, this means we have a conflict with a previously found + branch. We solve it simply by following the second axiom above: + we make the node that is the source of \(f(e)\) a clone, and add + the new fragment under a new clone. Note that if the node is + already a clone, we just make a clone under the parent of that + clone. +4. Update the correspondence \(f\) by updating \(f(g)\) and \(f(h)\) + accordingly. + +In this process we always follow the first axiom by ensuring that the +labels of the forest are unique to each node, except for those clones, +which are only created by following the second axiom. diff --git a/forest/src/lib.rs b/forest/src/lib.rs index 3925bd5..c2f4988 100644 --- a/forest/src/lib.rs +++ b/forest/src/lib.rs @@ -1,3 +1,4 @@ +#![warn(missing_docs)] //! This file defines the expected behaviours of forests, and provides //! a default implementation for forests. //! @@ -10,12 +11,16 @@ //! out-coming and in-coming plugs. These plugs are used to join //! different fragments of forests together. -use graph::{Graph, GraphLabel}; +use graph::{GraphLabel, LabelGraph, ParentsGraph}; use core::fmt::Display; +// TODO: move this to default + +/// The type of errors for forest operations. 
#[derive(Debug)] pub enum Error { + /// An index is out of bounds. IndexOutOfBounds(usize, usize), } @@ -31,75 +36,52 @@ impl Display for Error { impl std::error::Error for Error {} -/// A builder of a forest. -pub trait ForestBuilder<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> { - /// The type of the resulting forest. - type Output: Forest<NodeLabel, EdgeLabel>; - - /// Construct a new builder with only one node with the given - /// label. - fn new_leaf(label: NodeLabel) -> Self; - - /// Add a child to the builder the given labels for the new node - /// and the added edge. - /// - /// All plug-out nodes within the builder should have a new child - /// with the specified labels, and hence multiple children might - /// be added, and the plug-out nodes should be modified - /// accordingly. - fn add_children(&mut self, vertex_label: NodeLabel, edge_labe: EdgeLabel); - - /// Build the forest. - fn build(self) -> Self::Output; - - /// Build the forest from a reference. - fn build_ref(&self) -> Self::Output; -} +// /// A builder of a forest. +// pub trait ForestBuilder<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> { +// /// The type of the resulting forest. +// type Output: Forest<NodeLabel, EdgeLabel>; + +// /// Construct a new builder with only one node with the given +// /// label. +// fn new_leaf(label: NodeLabel) -> Self; + +// /// Add a child to the builder the given labels for the new node +// /// and the added edge. +// /// +// /// All plug-out nodes within the builder should have a new child +// /// with the specified labels, and hence multiple children might +// /// be added, and the plug-out nodes should be modified +// /// accordingly. +// fn add_children(&mut self, vertex_label: NodeLabel, edge_labe: EdgeLabel); + +// /// Build the forest. +// fn build(self) -> Self::Output; + +// /// Build the forest from a reference. +// fn build_ref(&self) -> Self::Output; +// } + +// FIXME: The trait should be re-designed. /// The expected behaviours of a forest. /// -/// Note that it contains a "striped down" version of the labelled -/// graphs. -pub trait Forest<NodeLabel: GraphLabel, EdgeLabel: GraphLabel>: Graph { - /// Type of iterator of plug-in nodes. - type PluginIter<'a>: Iterator<Item = usize> + 'a - where - Self: 'a; - - /// Type of iterator of plug-out nodes. - type PlugoutIter<'a>: Iterator<Item = usize> + 'a - where - Self: 'a; - - /// Return the plug-in nodes - fn plugins(&self) -> Self::PluginIter<'_>; - - /// Return the plug-out nodes - fn plugouts(&self) -> Self::PlugoutIter<'_>; - - /// Plug another forest onto this forest. - /// - /// The plug-out nodes of this forest will be joined onto the - /// plug-in nodes of the joining forest. - /// - /// # Performance warning - /// - /// It is recommended to only call this function with a "small" - /// `other`, as this function might copy the whole graph - /// individually, node by node and edge by edge. - fn plug(&mut self, other: &Self) -> Result<(), Error>; - - /// Return the vertex label. - /// - /// A vertex may not have labels. - fn vertex_label(&self, node_id: usize) -> Result<Option<NodeLabel>, Error>; - - /// Return the edge label. - /// - /// An edge may have no labels. If there is no edge from the - /// given source to the given target, then `Ok(None)` is returned - /// as well. - fn edge_label(&self, source: usize, target: usize) -> Result<Option<EdgeLabel>, Error>; +/// Note that it requires only a subset of the functionalities of +/// labelled graphs. 
+pub trait Forest<T: GraphLabel>: ParentsGraph + LabelGraph<T> { + /// The type of errors for operations on the forest. + type Error: std::error::Error + From<graph::error::Error>; + + /// Detect if the fragment is a prefix of the sub-forest rooted at + /// `node_id`. + fn is_prefix<F: AsRef<Self>>(&self, node_id: usize, fragment: F) -> Result<bool, Self::Error>; + + /// Extend the forest by adjoining another forest at a given node. + fn plant<F: AsRef<Self>>(&mut self, node_id: usize, fragment: F) -> Result<(), Self::Error>; + + /// Clone a node by making a new node and making all the nodes + /// that previously pointed to the old node now point to the new + /// node, and the new node points to the old node. + fn clone(&mut self, node_id: usize) -> Result<(), Self::Error>; } pub mod default; diff --git a/grammar/src/first_set.rs b/grammar/src/first_set.rs new file mode 100644 index 0000000..2a5ee3c --- /dev/null +++ b/grammar/src/first_set.rs @@ -0,0 +1,451 @@ +//! This file implements a function for computing the set of first +//! terminals for a grammar. + +use super::*; + +impl Grammar { + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + match self.state { + GrammarState::Initial => { + self.state = GrammarState::AfterComputeFirst; + } + _ => { + // This has been called already. + return Ok(()); + } + } + + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec<HashSet<Option<usize>>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? 
+ { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) 
+ .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? 
+ .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } +} diff --git a/grammar/src/left_closure.rs b/grammar/src/left_closure.rs new file mode 100644 index 0000000..1630881 --- /dev/null +++ b/grammar/src/left_closure.rs @@ -0,0 +1,324 @@ +//! This file implements some functions to compute the regular +//! language of left-linear closure of a grammar. + +use super::*; + +use nfa::LabelType; + +impl Grammar { + /// Return the regular language of the left-linear closures of + /// non-terminals in the grammar. + /// + /// The resulting vector is guaranteed to be of the same length as + /// the number of non-terminals. + /// + /// The resulting regular language is not "self-contained". That + /// is to say, its terminals indices are packed indices and are + /// meaningless without the interpretation of the grammar. They + /// should be converted to a nondeterministic finite automaton and + /// then to its null closure later on. + pub fn left_closure(&mut self) -> Result<Vec<DefaultRegex<TNT>>, Error> { + match self.state { + GrammarState::Initial => { + return Err(Error::WrongState( + self.state, + GrammarState::AfterComputeFirst, + )) + } + GrammarState::AfterLeftClosure + | GrammarState::AfterNFA + | GrammarState::AfterComputeFirst => {} + } + + let non_len = self.non_num(); + + let mut result = Vec::with_capacity(non_len); + + for (n, rule) in self.rules.iter().enumerate() { + let regex = &rule.regex; + + let regex_root = if let Some(root) = regex.root() { + root + } else { + result.push(Default::default()); + + continue; + }; + + let regex_len = regex.len(); + + /// A convenient macro to retrieve the children from the + /// original regular expression with error propagation. + macro_rules! children { + ($n:expr) => { + regex + .children_of($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + /// A convenient macro to retrieve the label from the + /// original regular expression with error propagation. + macro_rules! label { + ($n:expr) => { + regex + .vertex_label($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + let parents = regex.parents_array().map_err(|e| match e { + nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), + nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), + _ => unreachable!(), + })?; + + use RegexType::*; + use TNT::*; + + let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2); + let mut builder = ALGBuilder::default(); + + /// Perform a depth-first copy + macro_rules! 
df_copy { + ($parent:expr, $index:expr) => { + match label!($index) { + Kleene | Plus | Optional | Or | Paren | Empty => { + let mut stack = vec![($parent, $index)]; + + while let Some((top_parent, top_index)) = stack.pop() { + let label = label!(top_index); + let label = match label { + Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), + _ => label, + }; + + local_result.push(label); + + let new = builder.add_vertex(); + + builder.add_edge(top_parent, new, ()).unwrap(); + + stack.extend(children!(top_index).map(|child| (new, child))); + } + } + Lit(remain_tnt) => { + local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); + let new = builder.add_vertex(); + builder.add_edge($parent, new, ()).unwrap(); + } + } + }; + } + + local_result.push(Or); + builder.add_vertex(); + + local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + let non_lit_index = builder.add_vertex(); + + builder.add_edge(0, non_lit_index, ()).unwrap(); + + // If this non-terminal is nullable, add an empty variant. + if self.is_nullable(n)? { + local_result.push(Empty); + let empty_index = builder.add_vertex(); + builder.add_edge(0, empty_index, ()).unwrap(); + } + + for first_node in self.first_nodes_of(n)?.copied() { + assert!(first_node < parents.len()); + + let tnt = match label!(first_node) { + Lit(tnt) => Lit(tnt), + _ => continue, + }; + + let mut parents_chain = { + let mut result = Vec::new(); + let mut stack = Vec::with_capacity(parents.len()); + + stack.push(first_node); + + while let Some(top) = stack.pop() { + assert!(top < parents.len()); + if let Some(parent) = parents.get(top).copied().unwrap() { + result.push(parent); + stack.push(parent.0); + } + } + + result.reverse(); + + result + }; + + if let Some((first, _)) = parents_chain.first() { + assert_eq!(*first, regex_root); + } else { + local_result.push(tnt); + let lit_index = builder.add_vertex(); + builder.add_edge(0, lit_index, ()).unwrap(); + + continue; + } + + // A different, "more local", root. + let mut local_root: usize; + + // Handle the direct parent + let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); + + match label!(parent_node) { + Kleene | Plus => { + // TODO: If parent_edge_index is 0, make a + // Plus node instead. + local_result.extend([Empty, tnt]); + + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + let iterator = children!(parent_node); + + for index in iterator.clone().skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + + Or => { + local_result.push(tnt); + local_root = builder.add_vertex(); + } + Optional | Empty => { + // If this path is taken, it should not be + // optional. + local_result.extend([Empty, tnt]); + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + for index in children!(parent_node).skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + } + Paren | Lit(_) => unreachable!(), + } + + // Handle successive parents + + for (node, edge_index) in parents_chain.into_iter() { + let node_type = label!(node); + + match node_type { + Kleene | Plus => { + // TODO: If edge_index is 0, then just + // make this a Plus node. 
+ + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + + local_root = new_index; + + let iterator = children!(node); + + for index in iterator.clone().skip(edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + RegexType::Or => {} + RegexType::Optional | RegexType::Empty => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + local_root = new_index; + + for index in children!(node).skip(edge_index + 1) { + df_copy!(local_root, index); + } + } + RegexType::Paren | RegexType::Lit(_) => unreachable!(), + } + } + + builder.add_edge(0, local_root, ()).unwrap(); + } + + local_result.shrink_to_fit(); + + let graph = builder.build(); + + assert_eq!(graph.nodes_len(), local_result.len()); + + result.push( + DefaultRegex::new(graph, local_result) + .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, + ); + } + + assert_eq!(result.len(), non_len); + + self.accumulators = { + let mut acc_result = Vec::with_capacity(result.len() + 1); + acc_result.push(0); + + for rule in result.iter() { + acc_result.push(rule.len() + *acc_result.last().unwrap()); + } + + acc_result + }; + + Ok(result) + } + + /// Convert the regular language of left-linear closures to its + /// equivalent nondeterministic finite automaton. + /// + /// In the generation of the left-linear closure, the terminals + /// and non-terminals are packed into an unsigned integer. We + /// unpack them in converting to nondeterministic finite + /// automaton. + /// + /// The resulting nondeterministic finite automaton should be + /// transformed to its null-closure for use in our algorithm. + pub fn left_closure_to_nfa( + &self, + closure: &[DefaultRegex<TNT>], + ) -> Result<DefaultNFA<LabelType<TNT>>, Error> { + let label_transform = |tnt| match tnt { + TNT::Ter(t) => { + let new_tnt = self.unpack_tnt(t).map_err(|e| match e { + Error::IndexOutOfBounds(index, bound) => { + graph::error::Error::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + })?; + + Ok(SoC::Carry(new_tnt)) + } + TNT::Non(n) => Ok(SoC::Sub(n)), + }; + + let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + + Ok(nfa) + } +} diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs index 4e544c9..627ae6f 100644 --- a/grammar/src/lib.rs +++ b/grammar/src/lib.rs @@ -15,12 +15,15 @@ use nfa::{ nfa::DefaultNFA, regex::{DefaultRegex, ParseError, RegexType}, }, - DOption, Nfa, Regex, SoC, + LabelType, Nfa, NfaLabel, Regex, SoC, TwoEdges, }; use graph::{adlist::ALGBuilder, builder::Builder, Graph}; -use std::{collections::HashSet, fmt::Display}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Display, +}; /// The type of a terminal. /// @@ -88,6 +91,9 @@ impl Display for TNT { #[derive(Debug, Copy, Clone)] #[non_exhaustive] pub enum Error { + /// The operation requires the grammar to be after a certain + /// state, but the grammar is not after that state yet. + WrongState(GrammarState, GrammarState), /// The first component is the index, and the second the bound. 
IndexOutOfBounds(usize, usize), /// Fail to build the N-th regular expression, due to the @@ -112,6 +118,9 @@ impl Display for Error { "Failed to build the {n}-th regular expression due to error: {pe}" ), Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + Error::WrongState(current, threshold) => { + write!(f, "require state {threshold}, but in state {current}") + } } } } @@ -146,6 +155,36 @@ impl Rule { } } +/// The state of Grammar. +/// +/// This is used to ensure that the grammar preparation methods are +/// called in the correct order. +#[derive(Debug, Copy, Clone, Default)] +pub enum GrammarState { + /// Just initialized + #[default] + Initial, + /// compute_firsts has been called + AfterComputeFirst, + /// left_closure has been called. + AfterLeftClosure, + /// left_closure_to_nfa has been called. + AfterNFA, +} + +impl Display for GrammarState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use GrammarState::*; + + match self { + Initial => write!(f, "initial"), + AfterComputeFirst => write!(f, "after computation of first set"), + AfterLeftClosure => write!(f, "after computation of closure"), + AfterNFA => write!(f, "after computation of NFA"), + } + } +} + /// The type of a grammar. #[derive(Debug, Clone, Default)] pub struct Grammar { @@ -158,6 +197,8 @@ pub struct Grammar { /// The length of the list must match that of the list of /// non-terminals. rules: Vec<Rule>, + /// The list of successive sums of lengths of rules. + accumulators: Vec<usize>, // The following two attributes are empty until we call // `compute_firsts` on the grammar. /// The list of sets of "first terminals". @@ -169,9 +210,18 @@ pub struct Grammar { /// /// The length must match that of the list of non-terminals. first_nodes: Vec<Vec<usize>>, - // The following attribute is empty until we call `left_closure` - // on the grammar. - left_closure_branches: HashSet<usize>, + // The following attribute is empty until we call `closure` on the + // NFA with `transform_label_null_epsilon` as the transformer. + /// A hash map that maps a tuple `(pos1, pos2)` of positions + /// `pos1` and `pos2` in the rules to a vector of rule positions. + /// This vector means that in order to expand from `pos1` to + /// `pos`, it is necessary to expand according to the positions in + /// the vector, so we need to add all these expansions into the + /// parse forest later. + expansion_map: HashMap<(usize, usize), Vec<usize>>, + /// The state of the grammar, which tells us what information has + /// been computed for this grammar. + state: GrammarState, } /// A private type to aid the recursive looping of rergular @@ -216,7 +266,14 @@ impl Grammar { .map(|rule| Vec::with_capacity(rule.len())) .collect(); - let left_closure_branches = HashSet::default(); + let state = Default::default(); + + let expansion_map = Default::default(); + + // NOTE: We cannot calculate accumulators here, as we want the + // accumulators of the regular expression of the left-closure, + // not of the original one. + let accumulators = Vec::new(); Self { ter, @@ -224,7 +281,9 @@ impl Grammar { rules, firsts, first_nodes, - left_closure_branches, + state, + expansion_map, + accumulators, } } @@ -258,7 +317,29 @@ impl Grammar { /// Return the total length of all rules. #[inline] pub fn total(&self) -> usize { - self.rules.iter().map(Rule::len).sum() + self.accumulators.last().copied().unwrap_or(0) + } + + /// Query if a position is the starting position of a + /// non-terminal. 
If it is, return the non-terminal, else return + /// `None` . + #[inline] + pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option<usize> { + for (index, accumulator) in self.accumulators.iter().copied().enumerate() { + let shifted_accumulator = accumulator << 1; + + // NOTE: Clippy suggests to call `cmp`, but it seems + // compiler might not yet be smart enough to inline that + // call, so I just silence clippy here. + #[allow(clippy::comparison_chain)] + if pos == shifted_accumulator { + return Some(index); + } else if pos < shifted_accumulator { + break; + } + } + + None } /// Return the number of terminals. @@ -353,754 +434,119 @@ impl Grammar { .contains(&None)) } - /// For a NON_TERMINAL, return an iterator that goes over the - /// nodes that are reachable from the non-terminal through an - /// empty transition of the nondeterministic finite automaton. + /// Query the expansion information from the position `pos1` to + /// the position `pos2` . #[inline] - pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> { - Ok(self - .first_nodes - .get(non_terminal) - .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? - .iter()) - } - - /// Return a hash set that contains all nodes in the set of - /// left-closure regular languages that are added because of the - /// left-linear expansion. - pub fn left_closure_branches(&self) -> &HashSet<usize> { - &self.left_closure_branches - } - - /// Compute the set of terminals that can appear as the first - /// terminal in some left-linear derivation of a non-terminal, for - /// every non-terminal. - /// - /// This is an algorithm that computes the transitive closure, - /// which is a common approach for this task. But perhaps there - /// are more efficient approaches? - /// - /// Also the function computes the set of "reachable nodes" in the - /// process, and records the information in the `first_nodes` - /// attribute. - pub fn compute_firsts(&mut self) -> Result<(), Error> { - let mut updated = true; - - let non_len = self.non_num(); - - use StackElement::{Seen, Unseen}; - - while updated { - updated = false; - - for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { - let root = if let Some(root) = regex.root() { - root - } else { - if !self.is_nullable(n)? { - updated = true; - - self.firsts.get_mut(n).unwrap().insert(None); - - // The default construction of a grammar - // reserves some space for each vector, so - // explicitly setting this can reduce some - // minor memory overhead. - let pointer = self.first_nodes.get_mut(n).unwrap(); - - pointer.clear(); - pointer.shrink_to_fit(); - } - - continue; - }; - - let regex_len = regex.len(); - - let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len); - - stack.push(Unseen(root)); - - let mut children_sets_stack: Vec<HashSet<Option<usize>>> = - Vec::with_capacity(regex_len); - - let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len); - - while let Some(top) = stack.pop() { - let top_index = top.index(); - let is_seen = top.is_seen(); - - match regex - .vertex_label(top_index) - .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? 
- { - RegexType::Kleene => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Plus => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Optional => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) 
- .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Or => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Paren => { - // Only for printing purposes - let mut this_set = HashSet::new(); - - this_set.insert(None); - - children_sets_stack.push(this_set); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - RegexType::Empty => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap().rev() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Lit(tnt) => { - match tnt { - TNT::Ter(t) => { - let mut this_set = HashSet::with_capacity(1); - - this_set.insert(Some(t)); - - children_sets_stack.push(this_set); - } - TNT::Non(non) => { - let this_set = self - .firsts - .get(non) - .ok_or(Error::IndexOutOfBounds(non, non_len))? 
- .clone(); - - children_sets_stack.push(this_set); - } - } - - let mut this_nodes = HashSet::with_capacity(1); - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - } - } - - assert_eq!( - children_sets_stack.len(), - 1, - "Too many elements left at the end" - ); - - assert_eq!( - children_nodes_stack.len(), - 1, - "Too many elements left at the end" - ); - - for first in children_sets_stack.pop().unwrap().into_iter() { - if !self.firsts.get(n).unwrap().contains(&first) { - updated = true; + pub fn query_expansion(&self, pos1: usize, pos2: usize) -> Result<Option<&[usize]>, Error> { + if pos1 >= self.total() { + return Err(Error::IndexOutOfBounds(pos1, self.total())); + } - self.firsts.get_mut(n).unwrap().insert(first); - } - } + if pos2 >= self.total() { + return Err(Error::IndexOutOfBounds(pos2, self.total())); + } - *self.first_nodes.get_mut(n).unwrap() = - children_nodes_stack.pop().unwrap().into_iter().collect(); + match self.state { + GrammarState::AfterLeftClosure => {} + _ => { + return Err(Error::WrongState( + self.state, + GrammarState::AfterLeftClosure, + )); } } - Ok(()) + Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..])) } - /// Return the regular language of the left-linear closures of - /// non-terminals in the grammar. - /// - /// The resulting vector is guaranteed to be of the same length as - /// the number of non-terminals. - /// - /// The resulting regular language is not "self-contained". That - /// is to say, its terminals indices are packed indices and are - /// meaningless without the interpretation of the grammar. They - /// should be converted to a nondeterministic finite automaton and - /// then to its null closure later on. - pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> { - let non_len = self.non_num(); - - let mut result = Vec::with_capacity(non_len); - - for (n, rule) in self.rules.iter().enumerate() { - let regex = &rule.regex; - - let regex_root = if let Some(root) = regex.root() { - root - } else { - result.push(Default::default()); - - continue; - }; - - let regex_len = regex.len(); - - /// A convenient macro to retrieve the children from the - /// original regular expression with error propagation. - macro_rules! children { - ($n:expr) => { - regex - .children_of($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - /// A convenient macro to retrieve the label from the - /// original regular expression with error propagation. - macro_rules! label { - ($n:expr) => { - regex - .vertex_label($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - let parents = regex.parents_array().map_err(|e| match e { - nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), - nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), - _ => unreachable!(), - })?; - - use RegexType::*; - use TNT::*; - - let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2); - let mut builder = ALGBuilder::default(); - - /// Perform a depth-first copy - macro_rules! 
df_copy {
-            ($parent:expr, $index:expr) => {
-                match label!($index) {
-                    Kleene | Plus | Optional | Or | Paren | Empty => {
-                        let mut stack = vec![($parent, $index)];
-
-                        while let Some((top_parent, top_index)) = stack.pop() {
-                            let label = label!(top_index);
-                            let label = match label {
-                                Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
-                                _ => label,
-                            };
-
-                            local_result.push(label);
-
-                            let new = builder.add_vertex();
-
-                            builder.add_edge(top_parent, new, ()).unwrap();
-
-                            stack.extend(children!(top_index).map(|child| (new, child)));
-                        }
-                    }
-                    Lit(remain_tnt) => {
-                        local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
-                        let new = builder.add_vertex();
-                        builder.add_edge($parent, new, ()).unwrap();
-                    }
-                }
-            };
-        }
-
-        local_result.push(Or);
-        builder.add_vertex();
-
-        local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
-        let non_lit_index = builder.add_vertex();
-
-        builder.add_edge(0, non_lit_index, ()).unwrap();
-
-        // If this non-terminal is nullable, add an empty variant.
-        if self.is_nullable(n)? {
-            local_result.push(Empty);
-            let empty_index = builder.add_vertex();
-            builder.add_edge(0, empty_index, ()).unwrap();
+    // REVIEW: Do we have a better way to record expansion information
+    // than to compute the transitive closure?
+
+    /// A transformer of labels to be fed into
+    /// [`closure`][nfa::default::nfa::DefaultNFA::closure], with the
+    /// predicate that returns true if and only if the label of the
+    /// first edge is either empty or a nullable non-terminal.
+    pub fn transform_label_null_epsilon(
+        &mut self,
+        two_edges: TwoEdges<LabelType<TNT>>,
+    ) -> LabelType<TNT> {
+        let (first_source, first_target, first_label) = two_edges.first_edge();
+        let (second_source, second_target, second_label) = two_edges.second_edge();
+
+        #[cfg(debug_assertions)]
+        {
+            assert_eq!(first_target, second_source);
+
+            if let Some(tnt) = *first_label.get_value() {
+                assert!(matches!(tnt, TNT::Non(n) if matches!(self.is_nullable(n), Ok(true))));
            }
+        }

-        for first_node in self.first_nodes_of(n)?.copied() {
-            assert!(first_node < parents.len());
-
-            let tnt = match label!(first_node) {
-                Lit(tnt) => Lit(tnt),
-                _ => continue,
-            };
-
-            let mut parents_chain = {
-                let mut result = Vec::new();
-                let mut stack = Vec::with_capacity(parents.len());
-
-                stack.push(first_node);
-
-                while let Some(top) = stack.pop() {
-                    assert!(top < parents.len());
-                    if let Some(parent) = parents.get(top).copied().unwrap() {
-                        result.push(parent);
-                        stack.push(parent.0);
-                    }
-                }
+        // Compute whether this edge comes from a left-linear
+        // expansion: it does if and only if either of the two edges
+        // comes from a left-linear expansion, or we are moving across
+        // a non-terminal expansion, that is to say, the source of the
+        // second edge is the starting node of a non-terminal.

-                result.reverse();
+        let mut left_p = first_label.get_left_p() || second_label.get_left_p();

-                result
-            };
+        // Record left-linear expansion information.

-            if parents_chain.is_empty() {
-                local_result.push(tnt);
-                let lit_index = builder.add_vertex();
-                builder.add_edge(0, lit_index, ()).unwrap();
+        if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) {
+            left_p = true;

-                continue;
-            }
+            if !self
+                .expansion_map
+                .contains_key(&(first_source, second_target))
+            {
+                let original_expansion = self.expansion_map.get(&(second_source, second_target));

-            assert_eq!(parents_chain.first().unwrap().0, regex_root);
-
-            // A different, "more local", root.
- let mut local_root: usize; - - // Handle the direct parent - let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); - - match label!(parent_node) { - Kleene | Plus => { - // TODO: If parent_edge_index is 0, make a - // Plus node instead. - local_result.extend([Empty, tnt]); - - local_root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(local_root, lit_index, ()).unwrap(); - - let iterator = children!(parent_node); - - for index in iterator.clone().skip(parent_edge_index + 1) { - df_copy!(local_root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(local_root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - - Or => { - local_result.push(tnt); - local_root = builder.add_vertex(); - } - Optional | Empty => { - // If this path is taken, it should not be - // optional. - local_result.extend([Empty, tnt]); - local_root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(local_root, lit_index, ()).unwrap(); - - for index in children!(parent_node).skip(parent_edge_index + 1) { - df_copy!(local_root, index); - } - } - Paren | Lit(_) => unreachable!(), - } + self.expansion_map.insert( + (first_source, second_target), + if let Some(original_expansion) = original_expansion { + let mut result = original_expansion.clone(); + result.push(second_nt); - // Handle successive parents - - for (node, edge_index) in parents_chain.into_iter() { - let node_type = label!(node); - - match node_type { - Kleene | Plus => { - // TODO: If edge_index is 0, then just - // make this a Plus node. - - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, local_root, ()).unwrap(); - - local_root = new_index; - - let iterator = children!(node); - - for index in iterator.clone().skip(edge_index + 1) { - df_copy!(local_root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(local_root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - RegexType::Or => {} - RegexType::Optional | RegexType::Empty => { - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, local_root, ()).unwrap(); - local_root = new_index; - - for index in children!(node).skip(edge_index + 1) { - df_copy!(local_root, index); - } - } - RegexType::Paren | RegexType::Lit(_) => unreachable!(), - } - } - - builder.add_edge(0, local_root, ()).unwrap(); + result + } else { + vec![second_nt] + }, + ); } - - local_result.shrink_to_fit(); - - let graph = builder.build(); - - assert_eq!(graph.nodes_len(), local_result.len()); - - result.push( - DefaultRegex::new(graph, local_result) - .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, - ); } - assert_eq!(result.len(), non_len); - - Ok(result) + NfaLabel::new(second_label.get_value(), second_label.get_moved(), left_p) } - /// Convert the regular language of left-linear closures to its - /// equivalent nondeterministic finite automaton. - /// - /// In the generation of the left-linear closure, the terminals - /// and non-terminals are packed into an unsigned integer. We - /// unpack them in converting to nondeterministic finite - /// automaton. - /// - /// The resulting nondeterministic finite automaton should be - /// transformed to its null-closure for use in our algorithm. 
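// A minimal sketch of the packing convention the documentation above
// refers to, assuming (as the uses of `pack_tnt`/`unpack_tnt` in this
// file suggest) that terminals occupy the indices [0, ter_num) and
// non-terminals follow them.  The helper names below are hypothetical
// and only illustrate the idea.
//
//     fn pack(tnt: TNT, ter_num: usize) -> usize {
//         match tnt {
//             TNT::Ter(t) => t,
//             TNT::Non(n) => ter_num + n,
//         }
//     }
//
//     fn unpack(packed: usize, ter_num: usize) -> TNT {
//         if packed < ter_num {
//             TNT::Ter(packed)
//         } else {
//             TNT::Non(packed - ter_num)
//         }
//     }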
- pub fn left_closure_to_nfa( - &self, - closure: &[DefaultRegex<TNT>], - ) -> Result<DefaultNFA<DOption<TNT>>, Error> { - let label_transform = |tnt| match tnt { - TNT::Ter(t) => { - let new_tnt = self.unpack_tnt(t).map_err(|e| match e { - Error::IndexOutOfBounds(index, bound) => { - graph::error::Error::IndexOutOfBounds(index, bound) - } - _ => unreachable!(), - })?; - - Ok(SoC::Carry(new_tnt)) + /// For a NON_TERMINAL, return an iterator that goes over the + /// nodes that are reachable from the non-terminal through an + /// empty transition of the nondeterministic finite automaton. + #[inline] + pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> { + match self.state { + GrammarState::Initial => { + return Err(Error::WrongState( + self.state, + GrammarState::AfterComputeFirst, + )); } - TNT::Non(n) => Ok(SoC::Sub(n)), - }; - - let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + GrammarState::AfterComputeFirst + | GrammarState::AfterLeftClosure + | GrammarState::AfterNFA => {} + } - Ok(nfa) + Ok(self + .first_nodes + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .iter()) } } +pub mod first_set; + +pub mod left_closure; + impl Display for Grammar { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { assert_eq!(self.non.len(), self.rules.len()); diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs index c236952..89f9844 100644 --- a/grammar/src/test_grammar_helper.rs +++ b/grammar/src/test_grammar_helper.rs @@ -275,7 +275,6 @@ pub fn new_paren_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { } /// Return a left recursive grammar. -#[allow(dead_code)] pub fn new_left_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; let non = vec![ diff --git a/grammar/src/tests/test_grammar_left_closure.rs b/grammar/src/tests/test_grammar_left_closure.rs index 0bc9f4d..003c211 100644 --- a/grammar/src/tests/test_grammar_left_closure.rs +++ b/grammar/src/tests/test_grammar_left_closure.rs @@ -1,5 +1,6 @@ use crate::test_grammar_helper::*; use crate::*; +use graph::LabelGraph; use nfa::Nfa; use std::{ collections::HashSet, @@ -124,7 +125,7 @@ fn test_remove_epsilon() -> Result<(), Box<dyn std::error::Error>> { nfa.print_viz("nfa_orig.gv")?; - nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_epsilon(|label| label.get_value().is_none())?; nfa.print_viz("nfa_no_epsilon.gv")?; @@ -174,7 +175,7 @@ fn test_remove_dead() -> Result<(), Box<dyn std::error::Error>> { nfa.print_viz("nfa_orig.gv")?; - nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_epsilon(|label| label.get_value().is_none())?; let accumulators: HashSet<usize> = accumulators.into_iter().collect(); @@ -192,7 +193,8 @@ fn test_remove_dead() -> Result<(), Box<dyn std::error::Error>> { #[test] #[ignore] fn test_nulling() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_left_recursive_grammar()?; + // TODO: Test more grammars. + let mut grammar = new_right_recursive_grammar()?; let closure = new_closure_regex(&mut grammar)?; let mut lock = stdout().lock(); @@ -244,24 +246,31 @@ fn test_nulling() -> Result<(), Box<dyn std::error::Error>> { let mut nfa = grammar.left_closure_to_nfa(&closure)?; - nfa.nulling(|label| { - if let Some(label) = *label { - match label { - TNT::Ter(_) => false, - // Panics if a non-terminal references an invalid node - // here. 
- TNT::Non(n) => grammar.is_nullable(n).unwrap(), - } - } else { - true - } - })?; - let grammar_reserve_nodes = |node| accumulators.contains(&node); writeln!(lock, "accumulators are {accumulators:?}")?; - nfa.remove_epsilon(|label| label.is_none())?; + let nullables: HashSet<usize> = (0..grammar.non_num()) + .filter(|n| matches!(grammar.is_nullable(*n), Ok(true))) + .collect(); + + nfa.closure( + |label| { + if let Some(label) = *label.get_value() { + matches!(label, TNT::Non(n) if nullables.contains(&n)) + } else { + true + } + }, + true, + |two_edges| grammar.transform_label_null_epsilon(two_edges), + |label| label.get_value().is_none(), + )?; + + for (label, child_iter) in nfa.labels_of(18)? { + writeln!(lock, "{label}: {child_iter:?}")?; + } + nfa.remove_dead(grammar_reserve_nodes)?; writeln!(lock, "Printing nfa to nfa.gv")?; diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs index ba9afb8..ba3077e 100644 --- a/graph/src/adlist.rs +++ b/graph/src/adlist.rs @@ -201,7 +201,7 @@ impl Builder for ALGBuilder { /// implement a customized builder. fn remove_edge<F>(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> where - F: Fn(Self::Label) -> bool, + F: FnMut(Self::Label) -> bool, { let nodes_len = self.nodes.len(); diff --git a/graph/src/adset.rs b/graph/src/adset.rs index adf2aaf..a72935a 100644 --- a/graph/src/adset.rs +++ b/graph/src/adset.rs @@ -215,7 +215,7 @@ impl Builder for ASGBuilder { fn remove_edge<F>(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> where - F: Fn(Self::Label) -> bool, + F: FnMut(Self::Label) -> bool, { let nodes_len = self.nodes.len(); diff --git a/graph/src/builder.rs b/graph/src/builder.rs index 5505e4f..b04e7f6 100644 --- a/graph/src/builder.rs +++ b/graph/src/builder.rs @@ -40,7 +40,7 @@ pub trait Builder: Default { /// target should be removed. fn remove_edge<F>(&mut self, source: usize, target: usize, predicate: F) -> Result<(), Error> where - F: Fn(Self::Label) -> bool; + F: FnMut(Self::Label) -> bool; /// Convert the builder into a graph. /// @@ -89,5 +89,5 @@ pub trait BuilderMut { /// target should be removed. fn remove_edge<F>(&mut self, source: usize, target: usize, predicate: F) -> Result<(), Error> where - F: Fn(Self::Label) -> bool; + F: FnMut(Self::Label) -> bool; } diff --git a/graph/src/labelled/binary.rs b/graph/src/labelled/binary.rs index 439505d..67d86f9 100644 --- a/graph/src/labelled/binary.rs +++ b/graph/src/labelled/binary.rs @@ -2,24 +2,89 @@ //! labels and each node knows about its parents. use super::*; -use crate::ParentsGraph; +use crate::{Parent, ParentsGraph}; -use std::collections::{HashMap as Map, HashSet as Set}; +use std::{ + collections::{hash_map::Iter as MapIter, HashMap as Map}, + slice::Iter, +}; use crate::builder::BuilderMut; +/// An ad-hoc `IndexSet` for children. +#[derive(Debug, Clone, Default)] +struct PLChildren { + children: Vec<usize>, + indices: Map<usize, usize>, +} + +impl PLChildren { + fn iter(&self) -> Iter<'_, usize> { + self.children.iter() + } + + fn len(&self) -> usize { + self.children.len() + } + + fn is_empty(&self) -> bool { + self.children.is_empty() + } + + fn contains(&self, key: &usize) -> bool { + self.indices.contains_key(key) + } + + // The return value matters not for me. 
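+    // A minimal sketch of the behaviour intended for this ad-hoc
+    // `IndexSet`: iteration follows insertion order, membership tests
+    // go through the `indices` map, and the invariant
+    // `indices[children[i]] == i` is maintained throughout.
+    //
+    //     let mut s = PLChildren::default();
+    //     s.insert(3);
+    //     s.insert(1);
+    //     s.insert(3); // no effect: 3 is already present
+    //     assert_eq!(s.iter().copied().collect::<Vec<_>>(), vec![3, 1]);
+    //     assert!(s.contains(&1));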
+    fn insert(&mut self, key: usize) {
+        if let Some(key_index) = self.indices.get(&key) {
+            debug_assert!(*key_index < self.children.len());
+
+            *self.children.get_mut(*key_index).unwrap() = key;
+        } else {
+            let key_index = self.children.len();
+            self.indices.insert(key, key_index);
+            self.children.push(key);
+        }
+    }
+
+    /// Remove an element from children.
+    ///
+    /// We must preserve the order of elements, so we have to shift
+    /// every element that follows it, which makes removal slow.  So
+    /// try to avoid removing edges, if possible.
+    fn remove(&mut self, key: usize) {
+        let key_index = if let Some(key_index_result) = self.indices.get(&key) {
+            *key_index_result
+        } else {
+            // If the key is not contained in children, we have
+            // nothing to do.
+            return;
+        };
+
+        let children_len = self.children.len();
+
+        debug_assert!(key_index < children_len);
+
+        for i in (key_index + 1)..children_len {
+            let key = self.children.get(i).unwrap();
+            *self.indices.get_mut(key).unwrap() -= 1;
+        }
+
+        self.children.remove(key_index);
+
+        // Also remove the key from the `indices` map; otherwise
+        // `contains` would keep reporting a stale entry for this key.
+        self.indices.remove(&key);
+    }
+}
+
 /// A node has some children, some parents, and a label.
 #[derive(Debug, Clone)]
 struct PLNode<T: GraphLabel> {
-    children: Set<usize>,
-    // REVIEW: If performance for removing edges is a bottleneck, use
-    // a hash set here.
-    parents: Vec<usize>,
+    children: PLChildren,
+    parents: Map<usize, usize>,
     label: T,
 }
 
 impl<T: GraphLabel> PLNode<T> {
-    fn new(children: Set<usize>, parents: Vec<usize>, label: T) -> Self {
+    fn new(children: PLChildren, parents: Map<usize, usize>, label: T) -> Self {
         Self {
             children,
             parents,
@@ -62,7 +127,7 @@ impl<T: GraphLabel> Default for PLGraph<T> {
 }
 
 impl<T: GraphLabel> Graph for PLGraph<T> {
-    type Iter<'a> = std::iter::Copied<std::collections::hash_set::Iter<'a, usize>>
+    type Iter<'a> = std::iter::Copied<Iter<'a, usize>>
     where
         Self: 'a;
 
@@ -103,6 +168,7 @@
         }
     }
 
+    #[inline]
     fn has_edge(&self, source: usize, target: usize) -> Result<bool, Error> {
         if let Some(node) = self.nodes.get(source) {
             if !self.has_node(target) {
@@ -116,7 +182,7 @@
         }
     }
 
     fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) {
-        unimplemented!("for later")
+        todo!()
     }
 
     fn print_viz(&self, _filename: &str) -> Result<(), std::io::Error> {
@@ -124,14 +190,53 @@
     }
 }
 
+/// An iterator of parents.
+///
+/// This is to avoid a boxed allocation.
+#[derive(Debug, Clone)]
+pub struct ParentIter<'a> {
+    /// MapIter yields (&usize, &usize), so we need to dereference
+    /// that.
+ parents: MapIter<'a, usize, usize>, +} + +impl<'a> ParentIter<'a> { + fn new(parents: MapIter<'a, usize, usize>) -> Self { + Self { parents } + } +} + +impl<'a> Iterator for ParentIter<'a> { + type Item = Parent; + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + self.parents + .next() + .map(|(key, value)| Parent::new(*key, *value)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.parents.size_hint() + } +} + +impl<'a> ExactSizeIterator for ParentIter<'a> { + #[inline] + fn len(&self) -> usize { + self.parents.len() + } +} + impl<T: GraphLabel> ParentsGraph for PLGraph<T> { - type Iter<'a> = std::iter::Copied<std::slice::Iter<'a, usize>> + type Iter<'a> = ParentIter<'a> where Self: 'a; #[inline] fn parents_of(&self, node_id: usize) -> Result<<Self as ParentsGraph>::Iter<'_>, Error> { if let Some(node) = self.nodes.get(node_id) { - Ok(node.parents.iter().copied()) + Ok(ParentIter::new(node.parents.iter())) } else { Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) } @@ -203,10 +308,12 @@ impl<'a, T: GraphLabel> BuilderMut for PLGBuilderMut<'a, T> { where Self::Label: 'b; + #[inline] fn from_graph_mut(graph: &mut Self::Graph) -> Self::ResultBuilder<'_> { PLGBuilderMut { graph } } + #[inline] fn add_vertex(&mut self, label: Self::Label) -> usize { if let Some(old) = self.graph.label_index_map.get(&label) { *old @@ -222,11 +329,16 @@ impl<'a, T: GraphLabel> BuilderMut for PLGBuilderMut<'a, T> { } } + #[inline] fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { if self.graph.has_edge(source, target)? { - return Err(Error::DuplicatedEdge(source, target)); + return Ok(()); } + // The validity of source and target is guaranteed now. + + let parent_child_index = self.graph.degree(source).unwrap(); + self.graph .nodes .get_mut(source) @@ -239,35 +351,295 @@ impl<'a, T: GraphLabel> BuilderMut for PLGBuilderMut<'a, T> { .get_mut(target) .unwrap() .parents - .push(source); + .insert(source, parent_child_index); Ok(()) } + /// Remove an edge from the source to the target. + /// + /// Since some graphs are labelled, the users are allowed to pass + /// a predicate to determine if an edge from the source to the + /// target should be removed. + /// + /// # Performance + /// + /// Removal is slow since we need to keep the order of the elements. fn remove_edge<F>(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> where - F: Fn(Self::Label) -> bool, + F: FnMut(Self::Label) -> bool, { if !self.graph.has_edge(source, target)? { return Ok(()); } + // Both source and target are valid indices now. + + let child_index = *self + .graph + .nodes + .get(target) + .unwrap() + .parents + .get(&source) + .unwrap(); + + let source_degree = self.graph.degree(source).unwrap(); + + // Decrement the relevant children's parents index. + for i in (child_index + 1)..source_degree { + let child = *self + .graph + .nodes + .get(source) + .unwrap() + .children + .children + .get(i) + .unwrap(); + + *self + .graph + .nodes + .get_mut(child) + .unwrap() + .parents + .get_mut(&source) + .unwrap() -= 1; + } + self.graph .nodes .get_mut(source) .unwrap() .children - .remove(&target); + .remove(target); self.graph .nodes .get_mut(target) .unwrap() .parents - .retain(|parent| *parent != source); + .remove(&source); + + Ok(()) + } +} + +#[cfg(test)] +mod binary_test { + use super::*; + use std::collections::HashSet as Set; + + macro_rules! 
set { + ($type:tt) => { Set::<$type>::default() }; + ($($num:expr),*) => { + { + let mut set: Set<_> = Set::default(); + $(set.insert($num);)* + set + } + }; + } + + #[test] + fn test_graph_apis() -> Result<(), Error> { + let mut graph: PLGraph<usize> = Default::default(); + + // testing empty graph + assert!(graph.is_empty()); + + let mut builder = PLGBuilderMut::from_graph_mut(&mut graph); + + // testing adding an empty node + assert_eq!(builder.add_vertex(0), 0); + + // testing nodes_len + assert_eq!(graph.nodes_len(), 1); + + let mut builder = PLGBuilderMut::from_graph_mut(&mut graph); + + // testing more vertices and edges + + builder.add_vertex(1); + builder.add_vertex(2); + builder.add_vertex(3); + builder.add_vertex(4); + builder.add_vertex(5); + + builder.add_edge(1, 0, 0)?; + builder.add_edge(2, 0, 0)?; + builder.add_edge(2, 1, 0)?; + builder.add_edge(3, 0, 0)?; + builder.add_edge(3, 2, 0)?; + builder.add_edge(4, 1, 0)?; + builder.add_edge(4, 2, 0)?; + builder.add_edge(5, 2, 0)?; + builder.add_edge(5, 3, 0)?; + builder.add_edge(5, 1, 0)?; + + // testing adding a duplicatedly labelled node + assert_eq!(builder.add_vertex(0), 0); + + let graph = graph; + + // ensuring the correct length + assert_eq!(graph.nodes_len(), 6); + + // testing children_of + assert_eq!(graph.children_of(5)?.collect::<Set<_>>(), set!(1, 3, 2)); + + // testing parents_of + + assert_eq!( + graph.parents_of(0)?.collect::<Set<_>>(), + set!(Parent::new(1, 0), Parent::new(2, 0), Parent::new(3, 0)) + ); + + assert_eq!( + graph.parents_of(1)?.collect::<Set<_>>(), + set!(Parent::new(2, 1), Parent::new(4, 0), Parent::new(5, 2)) + ); + + assert_eq!(graph.parents_of(5)?.len(), 0); + + // testing degree + assert_eq!(graph.degree(4)?, 2); + + // testing is_empty_node + assert!(graph.is_empty_node(0)?); + assert!(!graph.is_empty_node(1)?); + + // testing has_edge + assert!(graph.has_edge(3, 2)?); + assert!(!graph.has_edge(3, 1)?); + assert!(matches!( + graph.has_edge(3, 6), + Err(Error::IndexOutOfBounds(6, 6)) + )); Ok(()) } } -// TODO: tests +#[cfg(test)] +mod test_plgraph_builder { + use super::*; + + #[test] + fn test_builder() -> Result<(), Box<dyn std::error::Error>> { + let mut graph = PLGraph::<usize>::default(); + let mut builder = PLGBuilderMut::from_graph_mut(&mut graph); + + // Add five nodes + builder.add_vertex(0); + builder.add_vertex(1); + builder.add_vertex(2); + builder.add_vertex(3); + builder.add_vertex(4); + + // println!("five empty nodes: {builder:?}"); + + // Link each node to its successor and link the last node with + // the first one to form a cycle. + for i in 0..5 { + builder.add_edge(i, if i < 4 { i + 1 } else { 0 }, 0)?; + } + + // println!("a cycle of five nodes: {builder:?}"); + + // Remove the link from the last node to the first node. 
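+        // (A note on the builder semantics exercised here, as
+        // implemented above: `remove_edge` keeps the remaining
+        // children of the source in order by shifting them, so it is
+        // linear in the out-degree, and removing an edge that does
+        // not exist simply returns Ok(()).)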
+        builder.remove_edge(4, 0, |_| true)?;
+
+        // println!("a line of five nodes: {builder:?}");
+
+        // build a graph
+
+        let graph = graph;
+
+        println!("final graph: {graph:?}");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_errors() -> Result<(), Box<dyn std::error::Error>> {
+        let mut graph = PLGraph::<usize>::default();
+        let mut builder = PLGBuilderMut::from_graph_mut(&mut graph);
+
+        // Add five nodes
+        builder.add_vertex(0);
+        builder.add_vertex(1);
+        builder.add_vertex(2);
+        builder.add_vertex(3);
+        builder.add_vertex(4);
+
+        // println!("five empty nodes: {builder:?}");
+
+        // Errors in add_edge
+
+        // println!();
+        // println!("Testing errors in add_edge:");
+        // println!();
+
+        assert!(matches!(
+            builder.add_edge(0, 5, 0),
+            Err(Error::IndexOutOfBounds(5, 5))
+        ));
+
+        // println!("Right error for an index out of bounds as the target");
+
+        assert!(matches!(
+            builder.add_edge(10, 5, 0),
+            Err(Error::IndexOutOfBounds(10, 5))
+        ));
+
+        // println!("Right error for an index out of bounds as the source");
+
+        assert!(matches!(
+            builder.add_edge(10, 50, 0),
+            Err(Error::IndexOutOfBounds(10, 5))
+        ));
+
+        // println!("Right error for both indices out of bounds");
+
+        // Errors in remove_edge
+
+        // println!();
+        // println!("Testing errors in remove_edge:");
+        // println!();
+
+        assert!(matches!(
+            builder.remove_edge(0, 5, |_| true),
+            Err(Error::IndexOutOfBounds(5, 5))
+        ));
+
+        // println!("Right error for an index out of bounds as the target");
+
+        assert!(matches!(
+            builder.remove_edge(10, 5, |_| true),
+            Err(Error::IndexOutOfBounds(10, 5))
+        ));
+
+        // println!("Right error for an index out of bounds as the source");
+
+        assert!(matches!(
+            builder.remove_edge(10, 50, |_| true),
+            Err(Error::IndexOutOfBounds(10, 5))
+        ));
+
+        // println!("Right error for both indices out of bounds");
+
+        assert!(builder.remove_edge(0, 1, |_| true).is_ok());
+
+        // println!("No errors for removing a non-existing edge");
+
+        // println!();
+
+        let graph = graph;
+
+        println!("final graph: {graph:?}");
+
+        Ok(())
+    }
+}
diff --git a/graph/src/labelled/double.rs b/graph/src/labelled/double.rs
index 53b5dc8..4ab8a38 100644
--- a/graph/src/labelled/double.rs
+++ b/graph/src/labelled/double.rs
@@ -554,9 +554,14 @@ impl<T: GraphLabel> Builder for DLGBuilder<T> {
         Ok(())
     }
 
-    fn remove_edge<F>(&mut self, source: usize, target: usize, predicate: F) -> Result<(), Error>
+    fn remove_edge<F>(
+        &mut self,
+        source: usize,
+        target: usize,
+        mut predicate: F,
+    ) -> Result<(), Error>
     where
-        F: Fn(Self::Label) -> bool,
+        F: FnMut(Self::Label) -> bool,
     {
         let nodes_len = self.nodes.len();
 
diff --git a/graph/src/lib.rs b/graph/src/lib.rs
index 26159c6..6813df3 100644
--- a/graph/src/lib.rs
+++ b/graph/src/lib.rs
@@ -214,12 +214,37 @@ impl<T: Hash + Eq + PartialEq + Clone + Copy + Ord + PartialOrd + Display + Debu
 {
 }
 
+/// We need both the node index of a parent and the edge index of the
+/// child that points to this parent.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
+pub struct Parent(usize, usize);
+
+impl Parent {
+    /// Return the index of the parent node.
+    #[inline]
+    pub fn node(&self) -> usize {
+        self.0
+    }
+
+    /// Return the index of the edge that points to the child.
+    #[inline]
+    pub fn edge(&self) -> usize {
+        self.1
+    }
+
+    /// Construct a parent structure.
+    #[inline]
+    pub fn new(node: usize, edge: usize) -> Self {
+        Self(node, edge)
+    }
+}
+
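+// A small usage sketch for `Parent`, assuming a hypothetical graph in
+// which node 1 was added as the 0th child of node 2: the pair records
+// both endpoints of that parent-child relation.
+//
+//     let p = Parent::new(2, 0);
+//     assert_eq!(p.node(), 2); // the parent node
+//     assert_eq!(p.edge(), 0); // the edge index among the parent's children
+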
 /// A parents-knowing graph can return an iterator of parents for any
 /// node.
 pub trait ParentsGraph: Graph {
     /// The type of an iterator that goes through the parents of a
     /// node.
-    type Iter<'a>: Iterator<Item = usize> + 'a
+    type Iter<'a>: Iterator<Item = Parent> + 'a
     where
         Self: 'a;
 
@@ -340,14 +365,14 @@ mod test_cycle_detection {
 
         let graph = builder.build_ref();
 
-        assert_eq!(graph.contains_cycles()?, true);
+        assert!(graph.contains_cycles()?);
 
         // Remove the link from the last node to the first node.
         builder.remove_edge(4, 0, |_| true)?;
 
         let graph = builder.build();
 
-        assert_eq!(graph.contains_cycles()?, false);
+        assert!(!graph.contains_cycles()?);
 
         Ok(())
     }
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index e642218..a23c056 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -5,7 +5,10 @@ use graph::{
     LabelExtGraph, LabelGraph,
 };
 
-use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
+use crate::{
+    default::regex::RegexType, error::Error, DOption, LabelType, Nfa, NfaLabel, Regex, SoC,
+    TwoEdges,
+};
 
 use core::fmt::{Debug, Display};
 
@@ -123,8 +126,6 @@ impl<T: GraphLabel + Display> LabelExtGraph<T> for DefaultNFA<T> {
     }
 }
 
-// TODO: Define a type for the labels of DefaultNFA.
-
 impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
     type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
 
@@ -134,7 +135,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         regexps: &[impl Regex<RegexType<T>>],
         sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
         default: Option<T>,
-    ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+    ) -> Result<Self::FromRegex<LabelType<T>>, Error> {
         let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
 
         if total_regexps_len == 0 {
@@ -144,18 +145,16 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         // We reserve two more rooms for the default edge.
         let nfa_len = total_regexps_len * 2 + 2;
 
-        let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
-
-        // Since DOption<T> implements Copy when T does, we can use a
-        // variable to hold the empty label to avoid repetitions.
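// (With `NfaLabel` there is no longer a single shared empty label to
// reuse: each edge now also records the position it was "moved" from,
// computed below as `(offset >> 1) + top_index`, so the labels are
// built per node.)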
- let empty_label: DOption<T> = Default::default(); + let mut builder: DLGBuilder<LabelType<T>> = Builder::with_capacity(nfa_len); for _ in 0..nfa_len { builder.add_vertex(); } + let default = LabelType::new(DOption(default), total_regexps_len, false); + // add a default edge - builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + builder.add_edge(nfa_len - 2, nfa_len - 1, default)?; let accumulators: Vec<usize> = { let mut result = Vec::with_capacity(regexps.len()); @@ -206,6 +205,10 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { stack.push(root); while let Some(top_index) = stack.pop() { + let false_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, false); + + let true_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, true); + let top_label = regex.vertex_label(top_index)?; let nfa_start = offset + 2 * top_index; @@ -213,7 +216,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { match top_label { RegexType::Kleene => { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; @@ -224,14 +227,14 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; - builder.add_edge(nfa_end, nfa_start, empty_label)?; + builder.add_edge(nfa_end, nfa_start, false_label)?; } } RegexType::Plus => { @@ -244,20 +247,20 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; - builder.add_edge(nfa_end, nfa_start, empty_label)?; + builder.add_edge(nfa_end, nfa_start, false_label)?; } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Optional => { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; @@ -268,12 +271,12 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; } } RegexType::Or => { @@ -284,11 +287,11 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - builder.add_edge(nfa_start, child_start, empty_label)?; - builder.add_edge(child_end, nfa_end, empty_label)?; + builder.add_edge(nfa_start, child_start, false_label)?; + builder.add_edge(child_end, nfa_end, false_label)?; } } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Paren => { @@ -305,14 +308,14 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; - 
builder.add_edge(source, child_start, empty_label)?; + builder.add_edge(source, child_start, false_label)?; source = child_end; } - builder.add_edge(source, nfa_end, empty_label)?; + builder.add_edge(source, nfa_end, false_label)?; } else { - builder.add_edge(nfa_start, nfa_end, empty_label)?; + builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Lit(value) => { @@ -327,13 +330,34 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { let sub_nfa_start = get_offset_safe!(sub_non); let sub_nfa_end = sub_nfa_start + 1; - builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; - builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + // We also need a label for the + // original label and non-left-linear + // expansion. + builder.add_edge( + nfa_start, + nfa_end, + NfaLabel::new( + DOption(Some(value)), + (offset >> 1) + top_index, + false, + ), + )?; + + builder.add_edge(nfa_start, sub_nfa_start, true_label)?; + builder.add_edge(sub_nfa_end, nfa_end, false_label)?; } SoC::Carry(new_value) => { // a terminal - builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + builder.add_edge( + nfa_start, + nfa_end, + NfaLabel::new( + DOption(Some(new_value)), + (offset >> 1) + top_index, + false, + ), + )?; } } } @@ -346,56 +370,7 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { Ok(DefaultNFA { graph }) } - fn remove_epsilon<F: Fn(T) -> bool>(&mut self, f: F) -> Result<(), Error> { - let mut builder = self.graph.builder_ref(); - - let mut updated = true; - - let nodes_len = self.nodes_len(); - - while updated { - updated = false; - - let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); - - for source in builder.nodes() { - for (label, target_iter) in builder.labels_of(source)? { - if f(*label) { - // empty label found - for target in target_iter { - for (label, children_iter) in builder.labels_of(target)? { - for child in children_iter { - if !builder.has_edge_label(source, label, child)? { - updated = true; - - to_add.push((source, child, *label)); - } - } - } - } - } - } - } - - for (source, target, label) in to_add.into_iter() { - builder.add_edge(source, target, label)?; - } - } - - // Then remove all epsilon edges - - let sources_and_targets: Vec<_> = builder.edges().collect(); - - for (source, target) in sources_and_targets.into_iter() { - builder.remove_edge(source, target, &f)?; - } - - self.graph = builder.build(); - - Ok(()) - } - - fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + fn remove_dead(&mut self, mut reserve: impl FnMut(usize) -> bool) -> Result<(), Error> { let mut builder = self.graph.builder_ref(); let mut changed = true; @@ -439,48 +414,69 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { Ok(()) } - // REVIEW: Combine nulling and remove_epsilon into the same - // method, or not. + fn closure( + &mut self, + mut predicate: impl FnMut(T) -> bool, + remove_after_p: bool, + mut transform: impl FnMut(TwoEdges<T>) -> T, + mut remove_predicate: impl FnMut(T) -> bool, + ) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); - fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; - let mut builder = self.graph.builder_ref(); + + let nodes_len = self.nodes_len(); while updated { updated = false; - let mut nullable = false; - - let mut to_add = Vec::new(); - - for (source, target) in builder.edges() { - for label in builder.edge_label(source, target)? 
{ - if f(label) { - nullable = true; + // Just a rough estimate + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); - break; - } - } + // REVIEW: I do not like nested if statements, but I do + // not know how to do this otherwise. - if nullable { - for (label, child_iter) in builder.labels_of(target)? { - for child in child_iter { - if !builder.has_edge_label(source, label, child)? { - updated = true; + for source in builder.nodes() { + for (first_label, target_iter) in builder.labels_of(source)? { + if predicate(*first_label) { + // a null label found + for target in target_iter { + for (second_label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + let new_label = transform(TwoEdges( + source, + target, + child, + *first_label, + *second_label, + )); + + if !builder.has_edge_label(source, &new_label, child)? { + updated = true; - to_add.push((source, child, *label)); + to_add.push((source, child, new_label)); + } + } } } } } } - for (source, child, label) in to_add { - builder.add_edge(source, child, label)?; + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all nullable edges if demanded + + if remove_after_p { + for (source, target) in builder.edges().collect::<Vec<_>>().into_iter() { + builder.remove_edge(source, target, &mut remove_predicate)?; } } - self.graph.replace_by_builder(builder); + self.graph = builder.build(); Ok(()) } @@ -532,7 +528,7 @@ mod test_to_nfa { Ok( DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)? - .unwrap_or(Default::default()) + .unwrap_or_default() .0, ) } @@ -543,13 +539,13 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA<DOption<char>> = DefaultNFA::to_nfa( + let _nfa: DefaultNFA<_> = DefaultNFA::to_nfa( &[regex], |label| Ok(SoC::Carry(label)), Some(char::default()), )?; - nfa.print_viz("nfa.gv")?; + // nfa.print_viz("nfa.gv")?; Ok(()) } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index 31bd69a..c1906e1 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -60,8 +60,12 @@ pub trait Regex<T: GraphLabel>: Graph + Display + Clone { /// Since `Option<T>` does not inherit the `Display` from `T`, we wrap /// it to provide an automatic implementation of `Display`. +/// +/// # Convert to `Option` +/// +/// One can dereference a `DOption` to obtain an `Option`. #[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] -pub struct DOption<T>(Option<T>); +pub struct DOption<T>(pub Option<T>); impl<T> Default for DOption<T> { fn default() -> Self { @@ -117,17 +121,104 @@ pub enum SoC<T> { Carry(T), } +/// This type records some information that is obtained from the +/// process of converting a regular expression to a nondeterministic +/// finite automaton. +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash, Default)] +pub struct NfaLabel<T: GraphLabel> { + /// A terminal or a non-terminal. + value: T, + /// The corresponding position in the rules. + moved: usize, + /// Whether this comes from left-linear expansion. + left_p: bool, +} + +impl<T: GraphLabel> NfaLabel<T> { + /// Construct a new label. + #[inline] + pub fn new(value: T, moved: usize, left_p: bool) -> Self { + Self { + value, + moved, + left_p, + } + } + + /// Retrieve the value from the label. + #[inline] + pub fn get_value(&self) -> T { + self.value + } + /// Retrieve the moved position from the label. 
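+    /// (For instance, a label built as `NfaLabel::new(value, 3, false)`
+    /// reports `3` here.)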
+    #[inline]
+    pub fn get_moved(&self) -> usize {
+        self.moved
+    }
+    /// Retrieve whether or not the label comes from left-linear
+    /// expansions.
+    #[inline]
+    pub fn get_left_p(&self) -> bool {
+        self.left_p
+    }
+}
+
+impl<T: GraphLabel> Display for NfaLabel<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "edge {} at {}{}", self.value, self.moved, {
+            if self.left_p {
+                ", by left"
+            } else {
+                ""
+            }
+        })
+    }
+}
+
+/// A convenient alias of the information of two edges.
+///
+/// If the tuple is (a, b, c, la, lb), then the first edge goes from a
+/// to b and is labelled la, and the second edge goes from b to c and
+/// is labelled lb.
+#[derive(Debug, Clone, Copy)]
+pub struct TwoEdges<T: Copy>(usize, usize, usize, T, T);
+
+impl<T: Copy> TwoEdges<T> {
+    /// Extract the first edge.
+    pub fn first_edge(&self) -> (usize, usize, T) {
+        (self.0, self.1, self.3)
+    }
+
+    /// Extract the second edge.
+    pub fn second_edge(&self) -> (usize, usize, T) {
+        (self.1, self.2, self.4)
+    }
+}
+
+/// The type of nondeterministic finite automata that is obtained from
+/// a regular expression, via the method [`to_nfa`][Nfa::to_nfa].
+pub type LabelType<T> = NfaLabel<DOption<T>>;
+
 /// The expected behaviour of a nondeterministic finite automaton.
 ///
 /// Every NFA is a special labelled graph.
 pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
-    // TODO: This trait should have a type for the labels.
+    /// When we convert a regular expression to a nondeterministic
+    /// finite automaton, the label should be optional, so we use a
+    /// different type for the result type.
+    type FromRegex<S: GraphLabel + Display + Default>;
 
+    // FIXME: This should not be needed.
     /// Remove all empty transitions from the nondeterministic finite
     /// automaton.
-    fn remove_epsilon<F>(&mut self, f: F) -> Result<(), Error>
+    #[inline]
+    fn remove_epsilon<F>(&mut self, _f: F) -> Result<(), Error>
     where
-        F: Fn(T) -> bool;
+        F: FnMut(T) -> bool,
+    {
+        // self.closure(f, true, |two_edges| two_edges.second_edge().2)
+        unimplemented!("deprecated")
+    }
 
     /// Return a state-minimal NFA equivalent with the original one.
     ///
@@ -174,11 +265,6 @@
         Ok(result)
     }
 
-    /// When we convert a regular expression to a nondeterministic
-    /// finite automaton, the label should be optional, so we use a
-    /// different type for the result type.
-    type FromRegex<S: GraphLabel + Display + Default>;
-
     /// Build a nondeterministic finite automaton out of a set
     /// `regexps` of regular expressions.
     ///
@@ -194,10 +280,9 @@
     /// basically.
     fn to_nfa(
         regexps: &[impl Regex<RegexType<T>>],
-        // TODO: The transformation should produce more information.
         sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
         default: Option<T>,
-    ) -> Result<Self::FromRegex<DOption<T>>, Error>;
+    ) -> Result<Self::FromRegex<LabelType<T>>, Error>;
 
     /// Remove all dead states from the nondeterministic finite
     /// automaton.
@@ -211,8 +296,9 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> {
     /// out of the dead nodes.  Then these nodes are considered gone
     /// from the graph, and we don't need to re-index the existing
     /// edges.  We can call this "a poor man's removal of nodes".
-    fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>;
+    fn remove_dead(&mut self, reserve: impl FnMut(usize) -> bool) -> Result<(), Error>;
 
+    // FIXME: This should not be needed.
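+    // (Both deprecated methods are special cases of `closure` below:
+    // `remove_epsilon(f)` corresponds to `closure(f, true, ..)`, and
+    // `nulling(f)` to `closure(f, false, ..)`, in each case with the
+    // transform `|two_edges| two_edges.second_edge().2`, which simply
+    // keeps the label of the second edge.)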
/// For each edge from A to B whose edge is considered nullable by /// a function `f`, and for every edge from B to C, add an edge /// from A to C. @@ -220,7 +306,50 @@ pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> { /// This is needed specifically by the algorithm to produce a set /// of atomic languages that represent "null-closed" derived /// languages. - fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>; + #[inline] + fn nulling(&mut self, _f: impl FnMut(T) -> bool) -> Result<(), Error> { + // self.closure(f, false, |two_edges| two_edges.second_edge().2) + unimplemented!("deprecated") + } + + /// Return the *closure* of the nondeterministic finite automaton. + /// + /// # Definition + /// + /// The closure of a nondeterministic finite automaton N is + /// defined as the unique *minimal* nondeterministic finite + /// automaton N+ that can be obtained by adjoining some edges to N + /// such that if there are edges a -> b and b -> c, and if the + /// edge a -> b is deemed as *nullable* by some function, then + /// there is an edge a -> c, where the minimality is the + /// minimality of the set of edges: if there is another such + /// nondeterministic finite automaton M satisfying the above + /// property, then the set of edges of N+ is a subset of the set + /// of edges of M. + /// + /// # Remove edges afterwards + /// + /// If `remove_after_p` is true, remove all those nullable edges. + /// + /// # Transformation of labels + /// + /// We can apply a transformer to labels, to be able to combine + /// labels in non-trivial ways. If one just wants the *new* label + /// that is the label of the edge from b to c in the above + /// definition, one can use the function that sends `two_edges` to + /// `two_edges.second_edge().2`. + /// + /// # Error + /// + /// The function should emit errors if the edges of the + /// nondeterministic finite automaton point to invalid nodes. + fn closure( + &mut self, + predicate: impl FnMut(T) -> bool, + remove_after_p: bool, + transform: impl FnMut(TwoEdges<T>) -> T, + remove_predicate: impl FnMut(T) -> bool, + ) -> Result<(), Error>; } pub mod default; |
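// A minimal sketch of driving `closure`, mirroring its use in
// `test_nulling` above: first saturate the automaton across empty and
// nullable non-terminal edges, then strip the empty edges.
//
//     let nullables: HashSet<usize> = (0..grammar.non_num())
//         .filter(|n| matches!(grammar.is_nullable(*n), Ok(true)))
//         .collect();
//
//     nfa.closure(
//         |label| {
//             if let Some(tnt) = *label.get_value() {
//                 matches!(tnt, TNT::Non(n) if nullables.contains(&n))
//             } else {
//                 true
//             }
//         },
//         true,
//         |two_edges| grammar.transform_label_null_epsilon(two_edges),
//         |label| label.get_value().is_none(),
//     )?;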