 Cargo.toml                                              |   6
 chain/Cargo.toml                                        |   7
 chain/src/atom/default.rs                               | 342
 chain/src/atom/mod.rs                                   |  41
 chain/src/default.rs                                    |  46
 chain/src/lib.rs                                        |  49
 chain/src/plan.org                                      | 154
 forest/Cargo.toml                                       |   9
 forest/src/default.rs                                   | 153
 forest/src/lib.rs                                       | 105
 grammar/Cargo.toml                                      |  20
 grammar/src/lib.rs (renamed from chain/src/grammar.rs)  | 302
 grammar/src/test_grammar_helper.rs                      | 368
 grammar/src/tests/mod.rs                                |  30
 grammar/src/tests/test_grammar_left_closure.rs          | 272
 graph/src/adlist.rs                                     |   6
 graph/src/adset.rs                                      |   6
 graph/src/builder.rs                                    |   3
 graph/src/labelled.rs                                   | 135
 graph/src/lib.rs                                        |  14
 nfa/src/default/mod.rs                                  |   2
 nfa/src/default/nfa.rs                                  | 160
 nfa/src/default/regex.rs                                |  20
 nfa/src/error.rs                                        |  18
 nfa/src/lib.rs                                          |  91
 receme/src/lib.rs                                       |   2
 receme/src/tree.rs                                      |   2
 semiring/Cargo.toml                                     |   8
 semiring/src/lib.rs                                     |  14
29 files changed, 2042 insertions(+), 343 deletions(-)
@@ -10,10 +10,12 @@ keywords = ["emacs", "parser"] # generic associated types, which are not stablized until version # 1.65. rust-version = "1.65" -# testing the new resolver, even though this has no dependencies ;p [workspace] -members = ["graph", "receme", "nfa", "chain", "viz"] +members = [ "graph", "receme", "nfa", "chain", "viz", "grammar", + "forest", "semiring"] + +# testing the new resolver, even though this has no dependencies ;p resolver = "2" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/chain/Cargo.toml b/chain/Cargo.toml index 32eed8b..265954c 100644 --- a/chain/Cargo.toml +++ b/chain/Cargo.toml @@ -7,4 +7,9 @@ edition = "2021" [dependencies] nfa = { path = "../nfa" } -graph = { path = "../graph" }
\ No newline at end of file +graph = { path = "../graph" } +grammar = { path = "../grammar" } +forest = { path = "../forest" } + +[dev-dependencies] +grammar = { path = "../grammar", features = ["test-helper"] } diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs new file mode 100644 index 0000000..72989b3 --- /dev/null +++ b/chain/src/atom/default.rs @@ -0,0 +1,342 @@ +//! This file provides a default implementation of the +//! [`Atom`][super::Atom] trait. + +use super::*; +use grammar::Grammar; +use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; +use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; + +use core::fmt::Display; +use std::collections::BTreeMap as Map; + +/// A virtual node represents the derivative of a non-terminal symbol +/// `S` with respect to a terminal symbol `t`. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)] +struct VirtualNode { + s: usize, + t: usize, +} + +impl Display for VirtualNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "VN[{}]^({})", self.s, self.t) + } +} + +impl VirtualNode { + fn new(s: usize, t: usize) -> Self { + Self { s, t } + } +} + +type VirtualMap = Map<VirtualNode, usize>; + +/// The type of atomic languages. +#[derive(Debug, Clone, Default)] +pub struct DefaultAtom { + grammar: Grammar, + nfa: DefaultNFA<DOption<TNT>>, + // NOTE: This is mostly for printing and debugging + regexp: Vec<DefaultRegex<TNT>>, + virtual_nodes: VirtualMap, +} + +impl Display for DefaultAtom { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let grammar = &self.grammar; + + let error_to_string = |err| format!("{err}"); + let tnt_to_string = |tnt, deco| { + grammar + .name_of_tnt(tnt) + .map(|name| format!("{deco}({name})")) + .unwrap_or_else(error_to_string) + }; + + let display_tnt = |tnt| match tnt { + TNT::Ter(t) => format!( + "({})", + grammar + .unpack_tnt(t) + .map(|tnt| tnt_to_string(tnt, "")) + .unwrap_or_else(error_to_string) + ), + TNT::Non(_) => tnt_to_string(tnt, "H"), + }; + + writeln!(f, "regular expressions:")?; + + let mut accumulators = Vec::with_capacity(self.regexp.len() + 1); + accumulators.push(0usize); + + for regex in self.regexp.iter() { + writeln!(f, "accumulator: {}", accumulators.last().unwrap())?; + + accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); + + let string = regex.to_string_with(display_tnt)?; + + writeln!(f, "{string}")?; + } + + writeln!(f, "total = {}", accumulators.last().unwrap())?; + + writeln!(f, "virtual nodes:")?; + + for (virtual_node, index) in self.virtual_nodes.iter() { + writeln!(f, "{virtual_node}: {index}")?; + } + + Ok(()) + } +} + +// Some boiler-plate delegation implementations for Graph and +// LabelGraph, in order to implement Nfa. 
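+//
+// They all take the same shape: each method simply forwards the call
+// to the underlying `self.nfa`, as in the bodies below, e.g.
+//
+//     fn nodes_len(&self) -> usize { self.nfa.nodes_len() }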
+ +impl Graph for DefaultAtom { + type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b> + where + Self: 'b; + + fn is_empty(&self) -> bool { + self.nfa.is_empty() + } + + fn nodes_len(&self) -> usize { + self.nfa.nodes_len() + } + + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GraphError> { + self.nfa.children_of(node_id) + } + + fn degree(&self, node_id: usize) -> Result<usize, GraphError> { + self.nfa.degree(node_id) + } + + fn is_empty_node(&self, node_id: usize) -> Result<bool, GraphError> { + self.nfa.is_empty_node(node_id) + } + + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GraphError> { + self.nfa.has_edge(source, target) + } + + fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { + // NOTE: We cannot replace by a builder whose result is an + // atom, not the underlying graph type. + unimplemented!() + } +} + +impl LabelGraph<DOption<TNT>> for DefaultAtom { + type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b> + where + Self: 'b; + + type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b> + where + Self: 'b, + DOption<TNT>: 'b; + + type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a> + where + Self: 'a, + DOption<TNT>: 'a; + + #[inline] + fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> { + self.nfa.vertex_label(node_id) + } + + #[inline] + fn edge_label( + &self, + source: usize, + target: usize, + ) -> Result<Self::EdgeLabelIter<'_>, GraphError> { + self.nfa.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &DOption<TNT>, + ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> { + self.nfa.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GraphError> { + self.nfa.labels_of(node_id) + } + + #[inline] + fn has_edge_label( + &self, + node_id: usize, + label: &DOption<TNT>, + target: usize, + ) -> Result<bool, GraphError> { + self.nfa.has_edge_label(node_id, label, target) + } +} + +impl LabelExtGraph<DOption<TNT>> for DefaultAtom { + #[inline] + fn extend( + &mut self, + edges: impl IntoIterator<Item = (DOption<TNT>, usize)>, + ) -> Result<usize, GraphError> { + self.nfa.extend(edges) + } +} + +impl Nfa<DOption<TNT>> for DefaultAtom { + #[inline] + fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error> + where + F: Fn(DOption<TNT>) -> bool, + { + self.nfa.remove_epsilon(f) + } + + type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = (); + + #[inline] + fn to_nfa( + _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>], + _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>, + _default: Option<DOption<TNT>>, + ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> { + // NOTE: We cannot construct an atom from a set of regular + // languages alone. So it is appropriate to panic here, if + // one tries to do so, for some reason. + unimplemented!() + } + + #[inline] + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + self.nfa.remove_dead(reserve) + } + + #[inline] + fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> { + self.nfa.nulling(f) + } +} + +impl DefaultAtom { + /// Construct an atomic language from a grammar. 
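+    ///
+    /// A sketch of the intended call pattern (mirroring the test in
+    /// the parent module, with the `test-helper` feature of the
+    /// `grammar` crate enabled):
+    ///
+    /// ```ignore
+    /// use grammar::test_grammar_helper::new_notes_grammar;
+    ///
+    /// let atom = DefaultAtom::from_grammar(new_notes_grammar()?)?;
+    /// println!("{atom}");
+    /// ```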
+ pub fn from_grammar(mut grammar: Grammar) -> Result<Self, GrammarError> { + grammar.compute_firsts()?; + + let regexp = grammar.left_closure()?; + + let mut nfa = grammar.left_closure_to_nfa(®exp)?; + + use std::collections::HashSet; + + let accumulators: Vec<usize> = { + let mut result = Vec::with_capacity(regexp.len() + 1); + result.push(0); + + for regex in regexp.iter() { + result.push(regex.nodes_len() * 2 + result.last().unwrap()); + } + + result.into_iter().collect() + }; + + let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect(); + + nfa.nulling(|label| { + if let Some(label) = *label { + match label { + TNT::Ter(_) => false, + // Panics if a non-terminal references an invalid node + // here. + TNT::Non(n) => grammar.is_nullable(n).unwrap(), + } + } else { + true + } + })?; + nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; + + // now add the virtual nodes + let mut virtual_nodes: VirtualMap = Default::default(); + + let nt_num = grammar.non_num(); + + assert!(nt_num <= accumulators.len()); + + // Convert an error telling us that an index is out of bounds. + // + // Panics if the error is not of the expected kind. + fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { + match ge { + GraphError::IndexOutOfBounds(index, bound) => { + GrammarError::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + } + } + + for nt in 0..nt_num { + let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa + // this is safe because of the above assertion. + .labels_of(*accumulators.get(nt).unwrap()) + .map_err(index_out_of_bounds_conversion)? + .map(|(label, target_iter)| (*label, target_iter.collect())) + .collect(); + + for (label, children_vec) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label { + let new_index = nfa + .extend(children_vec.into_iter().map(|target| (label, target))) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } + } + } + + Ok(Self { + grammar, + nfa, + regexp, + virtual_nodes, + }) + } +} + +/// A convenient getter for the map of virtual nodes. +fn query(map: &VirtualMap, nt: usize, t: usize) -> Option<usize> { + map.get(&VirtualNode::new(nt, t)).copied() +} + +impl Atom for DefaultAtom { + fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError> { + if nt >= self.grammar.non_num() { + return Err(GrammarError::IndexOutOfBounds(nt, self.grammar.non_num())); + } + + if t >= self.grammar.ter_num() { + return Err(GrammarError::IndexOutOfBounds(t, self.grammar.ter_num())); + } + + Ok(query(&self.virtual_nodes, nt, t)) + } + + fn empty(&self) -> usize { + assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); + + self.nfa.nodes_len() - 2 + } +} diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs new file mode 100644 index 0000000..084acca --- /dev/null +++ b/chain/src/atom/mod.rs @@ -0,0 +1,41 @@ +//! This file defines the behaviour of the Atomic languages, and +//! provides a default implementation. +//! +//! Here I do not to substitute external packages' implementations in +//! the future, so why define a trait for the atomic languages? +//! Because this way I can easily substitute other implementations if +//! I have better ideas in the future. + +use grammar::{Error as GrammarError, TNT}; +use nfa::{DOption, Nfa}; + +/// The expected behaviours of an atomic language. 
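+///
+/// As an illustration, for a rule like `S ::= t S u`, `atom(S, t)`
+/// should, roughly speaking, find the state from which the remainder
+/// `S u` of the left-linear closure can be recognised, whereas
+/// `atom(S, u)` should find nothing, as `u` cannot begin a
+/// derivation of `S`.  (The indices involved are implementation
+/// details; this is only meant to fix the intuition.)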
+pub trait Atom: Nfa<DOption<TNT>> { + /// Return the index of a node representing the derivative of the + /// left-linear null closure of `nt` with respect to `t`. + fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>; + + /// Return the index of the empty state. + fn empty(&self) -> usize; +} + +pub mod default; + +pub use default::DefaultAtom; + +#[cfg(test)] +mod tests { + use super::*; + use grammar::test_grammar_helper::*; + + #[test] + fn atom() -> Result<(), Box<dyn std::error::Error>> { + let grammar = new_notes_grammar()?; + + let atom = DefaultAtom::from_grammar(grammar)?; + + println!("atom = {atom}"); + + Ok(()) + } +} diff --git a/chain/src/default.rs b/chain/src/default.rs new file mode 100644 index 0000000..e04be9f --- /dev/null +++ b/chain/src/default.rs @@ -0,0 +1,46 @@ +//! This file provides a default implementation of the +//! [`Chain`][crate::Chain] trait. +//! +//! The reason for using a trait is that I might want to experiment +//! with different implementation ideas in the future, and this +//! modular design makes that easy. + +use super::*; +use core::fmt::Display; + +/// The errors related to taking derivatives by chain rule. +#[derive(Debug)] +pub enum Error { + /// An invalid situation happens. + Invalid, +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Invalid => write!(f, "invalid"), + } + } +} + +impl std::error::Error for Error {} + +/// A default implementation for the [`Chain`] trait. +#[derive(Debug, Clone, Default)] +pub struct DefaultChain {} + +impl Chain for DefaultChain { + type Error = Error; + + fn unit() -> Self { + todo!() + } + + fn chain(&mut self, _t: usize) { + todo!() + } + + fn epsilon(&self) -> bool { + todo!() + } +} diff --git a/chain/src/lib.rs b/chain/src/lib.rs index 0ec4d4c..4e21e1d 100644 --- a/chain/src/lib.rs +++ b/chain/src/lib.rs @@ -1,3 +1,4 @@ +#![warn(missing_docs)] //! This package implements the core algorithm of the entire //! workspace: parsing with derivatives by means of chain rule and //! regular nulling languages. @@ -7,19 +8,43 @@ //! think is the essence of this algorithm, the chain-rule for //! derivatives of languages. -pub mod grammar; +pub mod atom; -pub fn add(left: usize, right: usize) -> usize { - left + right -} +// TODO: Define errors. + +/// The expected behaviours of a language which can take derivatives +/// by chain rule. +pub trait Chain: Default { + /// The implementations should choose a type to represent errors. + type Error: std::error::Error; -#[cfg(test)] -mod tests { - use super::*; + /// Represents the language that is present after we parse the + /// empty string, that is the initial configuration of the + /// language. This may or may not be different from what + /// `Default::default` gives. + fn unit() -> Self; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } + /// Take the derivative by a terminal symbol. + /// + /// This takes care of the union and the prepending operations. + /// + /// # A little remark about the design + /// + /// I have thought to separate different operations (like the + /// union, the prepending, and single derivatives) and then define + /// a function to put everything together. But I think that + /// design is not convenient to use. Also, I really do not need + /// those operations other than to provide this derivative + /// operation, so why define them? 
And putting all things
+    /// together may reduce the number of bugs caused by wrong uses of
+    /// those component functions, and can reduce the amount of
+    /// documentation strings a library user needs to read, in order
+    /// to make use of this trait.  So I ended up with this design.
+    fn chain(&mut self, t: usize);
+
+    /// Return true if and only if the language contains the empty
+    /// string.
+    fn epsilon(&self) -> bool;
 }
+
+pub mod default;
diff --git a/chain/src/plan.org b/chain/src/plan.org
index 8c63492..61738a9 100644
--- a/chain/src/plan.org
+++ b/chain/src/plan.org
@@ -2,29 +2,73 @@
 #+AUTHOR: Durand
 #+DATE: <2022-11-18 Ven 19:57>
 
-* Things to do [2/7]
+* Things to do [5/10]
 
 - [X] Implement builders for graphs
 - [X] Find sets of the first terminals for each non-terminal, in the
   grammar module.
-- [-] NFA [4/5]
+- [-] Establish a testing framework of various grammars. [5/6]
+  + [X] One simple grammar
+  + [X] Notes
+  + [X] Parentheses
+  + [X] Pathological left-recursive
+  + [X] Pathological right-recursive
+  + [ ] Pathologically ambiguous
+  # More should be added
+- [X] NFA [4/4]
   + [X] Add regular expression into NFA.
   + [X] Convert a set of regular expressions into a nondeterministic
     finite automaton, where non-terminals denote the indices of
     regular expressions in the set to substitute.
-  + [X] Convert a grammar into its grammar of left-linear closures of
-    non-temrinals and their derivatives.
-  + [X] Convert nondeterministic finite automata to their null
-    closures.
-  + [ ] Test more grammars to ensure it works correctly.
-- [ ] Define the Atom trait.
-- [ ] Implement virtual nodes for each derivative of each atom: The
-  lack of this step might be the root cause of the failure of the
-  previous version of this package.
-- [ ] Implement languages. [0/2]
-  + [ ] First implement them as doubly labelled (directed acyclic)
-    graphs.
-  + [ ] Implement finding derivatives.
+  + [X] Convert a grammar to the language of atomic languages. [2/2]
+    * [X] Convert a grammar into its grammar of left-linear closures
+      of non-terminals and their derivatives.
+    * [X] Convert nondeterministic finite automata to their null
+      closures.
+  + [X] Test more grammars to ensure it works correctly. [5/5]
+    * [X] Test the conversion to left-closure as a set of regular
+      expressions.
+    * [X] Test the conversion of a set of regular expressions into a
+      nondeterministic finite automaton.
+    * [X] Test the closure of null edges, where the nullity of an edge
+      is defined by a function.  In our specific case, an edge is null
+      if the grammar symbol that edge represents, either a terminal or
+      a non-terminal, can match an empty string.
+    * [X] Test the removal of empty transitions from nondeterministic
+      finite automata.
+    * [X] Test the removal of dead states, where a state is dead if
+      and only if no other states have an edge into that state.
+- [ ] Refactor [0/3]
+  + [ ] When we pull in some regular language because of the
+    left-linear expansion, we need to mark that branch as coming from
+    left-linear expansions.  This is necessary because we cannot
+    follow a left-linearly expanded branch when we take the derivative
+    of a language.  We only expand left-linearly when we try to access
+    the atomic languages [s]^{(t)}.  We can mark them by returning a
+    set of nodes which are the beginnings of left-linearly expanded
+    branches.
+  + [ ] An edge of the NFA should carry a label that can be more
+    informative than solely a terminal or a non-terminal.
+  + [ ] Add a mechanism for a grammar to attach labels to edges of NFA
+    which are not necessarily Copy-able, and which should be stored in a
+    separate array, such that the labels on edges of NFA point to the
+    elements of the array.
+- [X] Atom [3/3]
+  + [X] Define the Atom trait.
+  + [X] Implement virtual nodes for each derivative of each atom: The
+    lack of this step might be the root cause of the failure of the
+    previous version of this project.
+  + [X] Test atoms
+- [-] Implement languages. [1/3]
+  + [X] First define a trait with the expected behaviours.
+  + [ ] Then implement them as doubly labelled graphs.
+  + [ ] Then implement finding derivatives by use of the chain rule.
+- [-] Implement forests [0/2]
+  + [-] Define a trait with the expected behaviours.
+  + [ ] Implement them using an adjacency list: we only need one label
+    per edge, and we do not wish to exclude duplicate edges, and we do
+    not need to index edges by the labels.  All in all, the labels can
+    be implemented by a simple hash map that maps edges to labels, so
+    a special data type is not needed here.
 - [ ] Implement semiring. [0/5]
   + [ ] Define the trait.
   + [ ] Implement the boolean semiring.
@@ -32,6 +76,9 @@
   + [ ] Implement the free semiring.
   + [ ] Compute and record a semiring value when computing
     derivatives.
+- [X] Miscellaneous [1/1]
+  + [X] Implement a function for NFA that checks if a given predicate
+    is satisfied for each edge label.
 
 * Atomic Languages
 
@@ -39,10 +86,6 @@
 This describes the behaviours of atomic languages.  The atomic
 language consists of the null closure of any non-terminal symbol in
 the grammar, and their deriavtives by terminals and non-terminal.
 
-* Script for creating GIF animation
-
-[[https://gist.github.com/maelvls/5379127][a gist]]
-
 * Derivative Languages
 
 This is the main driving device of the algorithm.  Basically, the
@@ -82,6 +125,61 @@
 should be easy enough, since we already have the mechanism of graphs,
 nondeterministic automata, and semirings.  All we need to do is to
 combine them together.
 
+* Lexers
+
+I got this idea from [[https://lists.gnu.org/archive/html/emacs-devel/2022-12/msg01127.html][a discussion on emacs-devel]].  The idea can be
+formulated as
+
+#+begin_quote
+Potentially it allows invocation of a parser with different variants
+of lexers - one mode with block tokens for the exploration of
+project's structure, and another mode for indentation and error
+checking purposes.
+#+end_quote
+
+I think this is the "right" use of lexers.  That is to say, the parser
+by default should operate on an abstract type of tokens, which are but
+unsigned integers, and rely on the user application to provide a lexer
+for turning the actual input, like a string, into an iterator of
+tokens.  The lexer can choose to do any sort of preprocessing that it
+sees fit, or it can do nothing and just turn characters to unsigned
+integers.  In the latter case, this parser generator works on the
+character level.  But, when one needs, one can instruct the parser
+generator to work on the level of preprocessed tokens easily.
+
+This has the advantage of allowing a certain degree of flexibility in
+the type of tokens accepted, while keeping the implementation simple
+at the same time: the internal core only has to deal with unsigned
+integers, after all.  I sketch this interface below.
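+
+A minimal sketch of this interface, with purely illustrative names
+(nothing here is implemented yet):
+
+#+begin_src rust
+/// What the parser will consume: a stream of abstract tokens, which
+/// are nothing but unsigned integers.
+pub trait Lexer {
+    /// Turn the raw input into a stream of token indices.
+    fn tokenize<'a>(&self, input: &'a str)
+        -> Box<dyn Iterator<Item = usize> + 'a>;
+}
+
+/// The do-nothing lexer: it works on the character level by mapping
+/// every character to its code point.
+struct CharLexer;
+
+impl Lexer for CharLexer {
+    fn tokenize<'a>(&self, input: &'a str)
+        -> Box<dyn Iterator<Item = usize> + 'a> {
+        Box::new(input.chars().map(|c| c as usize))
+    }
+}
+#+end_src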
+
+* Library for drawing graphs
+
+In the past, I thought that drawing graphs was only a development
+tool for my package, so I could rely on such external tools as
+GraphViz, as that would not be needed for my end-users.
+
+But recently I realized that this is not only a need for the
+development of the package, but a need for the users as well.  To be
+more specific, the target users are those who want to "play" with
+grammars.  So if one cannot view the resulting parse forest diagrams
+easily and interactively, it would be hard to "play" with grammars.
+
+Moreover, the internal workings of the parsing machine might also be
+interesting for users to inspect, whether to debug the program or to
+know more under the hood.  As such it is required to view the
+diagrams of the internal directed acyclic graphs.
+
+For all these reasons, I now regard the ability to draw graphs and to
+interact with those graphs as essential to the user experience of this
+package.  As a consequence, I would like to develop a small default
+library for this purpose.  I follow the convention adopted by this
+package here as well, which is to provide a default implementation for
+the functionality, while still leaving the room for users to
+substitute with other external packages, if so desired, for whatever
+reason.  For example, an external package may provide an unprecedented
+performance for drawing graphs in Emacs.  If such a package appears, I
+can easily avail myself of that external package to draw graphs.
+
 * Testing ground
 
 I am in a strong need to test things out.  The most important one is
@@ -92,3 +190,21 @@
 understanding of the main algorithm is plain wrong.
 
 This is the main reason I started this rewrite of the package.
+
+* To figure out
+
+I need to figure out how to construct the parse forest from a
+left-most null-closed derivation graph.
+
+Reading through the original paper again, I found that the author
+augmented the atomic languages with annotations of partial derivation
+steps on each edge of the nondeterministic finite automaton.  In
+brief, this is what I tried to achieve with some obscure constructs
+such as expanding reduction information carried by the
+nondeterministic finite automata, in one of the previous versions.  To
+sum up, I really need to carry that information in the first place,
+otherwise the parse forests cannot be reliably constructed later on.
+
+But I really do not want to adopt the approach of the original author.
+I still prefer the idea of a recorder.  That is to say, instead of
+constructing the parse forest after the recognition, we shall
+construct the parse forest simultaneously while we are recognizing.
diff --git a/forest/Cargo.toml b/forest/Cargo.toml
new file mode 100644
index 0000000..b88451d
--- /dev/null
+++ b/forest/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "forest"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+graph = { path = "../graph" }
diff --git a/forest/src/default.rs b/forest/src/default.rs
new file mode 100644
index 0000000..36145f4
--- /dev/null
+++ b/forest/src/default.rs
@@ -0,0 +1,153 @@
+//! This file provides a default implementation for the
+//! [`Forest`][crate::Forest] trait.
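+//!
+//! Concretely, it couples an adjacency-list graph with two hash maps
+//! for the vertex and edge labels, along with the plug-in and
+//! plug-out sets required by the trait.  A trivial usage sketch:
+//!
+//! ```ignore
+//! let forest: DefaultForest<usize, usize> = Default::default();
+//! assert!(forest.is_empty());
+//! ```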
+ +use super::*; +use graph::{error::Error as GraphError, ALGraph, Graph}; + +use std::collections::{hash_set::Iter, HashMap, HashSet}; + +#[derive(Debug, Clone)] +pub struct DefaultForest<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> { + graph: ALGraph, + vertex_labels: HashMap<usize, NodeLabel>, + edge_labels: HashMap<(usize, usize), EdgeLabel>, + plugins: HashSet<usize>, + plugouts: HashSet<usize>, +} + +impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> Default for DefaultForest<NodeLabel, EdgeLabel> { + fn default() -> Self { + Self { + graph: Default::default(), + vertex_labels: Default::default(), + edge_labels: Default::default(), + plugins: Default::default(), + plugouts: Default::default(), + } + } +} + +impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> Graph for DefaultForest<NodeLabel, EdgeLabel> { + type Iter<'a> = <ALGraph as Graph>::Iter<'a> + where + Self: 'a; + + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GraphError> { + self.graph.children_of(node_id) + } + + fn degree(&self, node_id: usize) -> Result<usize, GraphError> { + self.graph.degree(node_id) + } + + fn is_empty_node(&self, node_id: usize) -> Result<bool, GraphError> { + self.graph.is_empty_node(node_id) + } + + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GraphError> { + self.graph.has_edge(source, target) + } + + fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { + unimplemented!() + } +} + +impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> ExtGraph + for DefaultForest<NodeLabel, EdgeLabel> +{ + fn extend(&mut self, edges: impl IntoIterator<Item = usize>) -> Result<usize, GraphError> { + self.graph.extend(edges) + } +} + +#[derive(Debug)] +pub struct LabelIter<'a> { + set_iter: Iter<'a, usize>, +} + +impl<'a> Iterator for LabelIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<Self::Item> { + self.set_iter.next().copied() + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.set_iter.size_hint() + } +} + +impl<'a> ExactSizeIterator for LabelIter<'a> { + fn len(&self) -> usize { + let (lower, upper) = self.size_hint(); + // Note: This assertion is overly defensive, but it checks the + // invariant guaranteed by the trait. + debug_assert!(upper == Some(lower)); + lower + } +} + +impl<'a> From<Iter<'a, usize>> for LabelIter<'a> { + fn from(set_iter: Iter<'a, usize>) -> Self { + Self { set_iter } + } +} + +impl<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> Forest<NodeLabel, EdgeLabel> + for DefaultForest<NodeLabel, EdgeLabel> +{ + type PluginIter<'a> = LabelIter<'a> + where + Self: 'a; + + type PlugoutIter<'a> = LabelIter<'a> + where + Self: 'a; + + fn plugins(&self) -> Self::PluginIter<'_> { + self.plugins.iter().into() + } + + fn plugouts(&self) -> Self::PlugoutIter<'_> { + self.plugouts.iter().into() + } + + fn plug(&mut self, other: &Self) -> Result<(), Error> { + // PLAN: Clone the `other` forest to `self`, adjust the + // indices for edges, and then add edges between plugs. + // + // Since I cannot touch the underlying nodes directly, I have + // to add the nodes and edges individually. 
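+        //
+        // A sketch of that plan, in commented-out code; the bounds
+        // checks and the plug bookkeeping are elided, and whether
+        // `extend` accepts forward references is still to be
+        // decided:
+        //
+        //     let offset = self.nodes_len();
+        //
+        //     for node in 0..other.nodes_len() {
+        //         // Clone the node, shifting its children by the
+        //         // offset.
+        //         self.graph.extend(
+        //             other.children_of(node)?.map(|c| c + offset),
+        //         )?;
+        //     }
+        //
+        //     // Then join every plug-out of `self` to every plug-in
+        //     // of `other` (shifted by the offset), and merge the
+        //     // label maps similarly.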
+ + todo!() + } + + fn vertex_label(&self, node_id: usize) -> Result<Option<NodeLabel>, Error> { + if node_id >= self.nodes_len() { + return Err(Error::IndexOutOfBounds(node_id, self.nodes_len())); + } + + Ok(self.vertex_labels.get(&node_id).copied()) + } + + fn edge_label(&self, source: usize, target: usize) -> Result<Option<EdgeLabel>, Error> { + if source >= self.nodes_len() { + return Err(Error::IndexOutOfBounds(source, self.nodes_len())); + } + + if target >= self.nodes_len() { + return Err(Error::IndexOutOfBounds(target, self.nodes_len())); + } + + Ok(self.edge_labels.get(&(source, target)).copied()) + } +} diff --git a/forest/src/lib.rs b/forest/src/lib.rs new file mode 100644 index 0000000..527a78c --- /dev/null +++ b/forest/src/lib.rs @@ -0,0 +1,105 @@ +//! This file defines the expected behaviours of forests, and provides +//! a default implementation for forests. +//! +//! The default forest actually implements the so-called shared packed +//! parse forest, or SPPF. It packs sub-trees with the same parents +//! under the same branch, and lets nodes from different branches +//! share the same children, and hence the name. +//! +//! What is special here is that the forests are marked with some +//! out-coming and in-coming plugs. These plugs are used to join +//! different fragments of forests together. + +use graph::{ExtGraph, GraphLabel}; + +use core::fmt::Display; + +#[derive(Debug)] +pub enum Error { + IndexOutOfBounds(usize, usize), +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::IndexOutOfBounds(index, bound) => { + write!(f, "index {index} is out of bound {bound}") + } + } + } +} + +impl std::error::Error for Error {} + +/// A builder of a forest. +pub trait ForestBuilder<NodeLabel: GraphLabel, EdgeLabel: GraphLabel> { + /// The type of the resulting forest. + type Output: Forest<NodeLabel, EdgeLabel>; + + /// Construct a new builder with only one node with the given + /// label. + fn new_leaf(label: NodeLabel) -> Self; + + /// Add a child to the builder the given labels for the new node + /// and the added edge. + /// + /// All plug-out nodes within the builder should have a new child + /// with the specified labels, and hence multiple children might + /// be added, and the plug-out nodes should be modified + /// accordingly. + fn add_children(&mut self, vertex_label: NodeLabel, edge_labe: EdgeLabel); + + /// Build the forest. + fn build(self) -> Self::Output; + + /// Build the forest from a reference. + fn build_ref(&self) -> Self::Output; +} + +/// The expected behaviours of a forest. +/// +/// Note that it contains a "striped down" version of the labelled +/// graphs. +pub trait Forest<NodeLabel: GraphLabel, EdgeLabel: GraphLabel>: ExtGraph { + /// Type of iterator of plug-in nodes. + type PluginIter<'a>: Iterator<Item = usize> + 'a + where + Self: 'a; + + /// Type of iterator of plug-out nodes. + type PlugoutIter<'a>: Iterator<Item = usize> + 'a + where + Self: 'a; + + /// Return the plug-in nodes + fn plugins(&self) -> Self::PluginIter<'_>; + + /// Return the plug-out nodes + fn plugouts(&self) -> Self::PlugoutIter<'_>; + + /// Plug another forest onto this forest. + /// + /// The plug-out nodes of this forest will be joined onto the + /// plug-in nodes of the joining forest. + /// + /// # Performance warning + /// + /// It is recommended to only call this function with a "small" + /// `other`, as this function might copy the whole graph + /// individually, node by node and edge by edge. 
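+    ///
+    /// In pictures: if `self` has plug-outs `{a, b}` and `other` has
+    /// plug-ins `{c}`, then after `self.plug(&other)` the combined
+    /// forest should contain the edges `a -> c'` and `b -> c'`,
+    /// where `c'` denotes the copy of `c` in the combined forest.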
+ fn plug(&mut self, other: &Self) -> Result<(), Error>; + + /// Return the vertex label. + /// + /// A vertex may not have labels. + fn vertex_label(&self, node_id: usize) -> Result<Option<NodeLabel>, Error>; + + /// Return the edge label. + /// + /// An edge may have no labels. If there is no edge from the + /// given source to the given target, then `Ok(None)` is returned + /// as well. + fn edge_label(&self, source: usize, target: usize) -> Result<Option<EdgeLabel>, Error>; +} + +pub mod default; diff --git a/grammar/Cargo.toml b/grammar/Cargo.toml new file mode 100644 index 0000000..20c4b48 --- /dev/null +++ b/grammar/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "grammar" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +default = [] + +# This flag exposes the module "test_grammar_helper" which provides +# some grammars for testing. +test-helper = [] + +[dependencies] +nfa = { path = "../nfa" } +graph = { path = "../graph" } + +[dev-dependencies] +grammar = { path = ".", features = ["test-helper"] } diff --git a/chain/src/grammar.rs b/grammar/src/lib.rs index 287fbca..55adc9a 100644 --- a/chain/src/grammar.rs +++ b/grammar/src/lib.rs @@ -8,21 +8,19 @@ // words, the current focus is not on the optimisations, whereas // scanners are for optimisations only, so to speak. -#![allow(unused_imports)] +// TODO: Separate contents into modules. + use nfa::{ default::{ nfa::DefaultNFA, - regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + regex::{DefaultRegex, ParseError, RegexType}, }, - DOption, DesRec, Nfa, Regex, SoC, + DOption, Nfa, Regex, SoC, }; use graph::{adlist::ALGBuilder, builder::Builder, Graph}; -use std::{ - collections::HashSet, - fmt::{Display, Write}, -}; +use std::{collections::HashSet, fmt::Display}; /// The type of a terminal. /// @@ -118,7 +116,15 @@ impl Display for Error { } } -impl std::error::Error for Error {} +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let Error::NFAFail(error) = self { + Some(error) + } else { + None + } + } +} /// A rule is a regular expression of terminals or non-terminals. #[derive(Debug, Clone)] @@ -143,11 +149,29 @@ impl Rule { /// The type of a grammar. #[derive(Debug, Clone, Default)] pub struct Grammar { + /// A list of terminals. ter: Vec<Terminal>, + /// A list of non-terminals. non: Vec<Nonterminal>, + /// A list of rules. + /// + /// The length of the list must match that of the list of + /// non-terminals. rules: Vec<Rule>, + // The following two attributes are empty until we call + // `compute_firsts` on the grammar. + /// The list of sets of "first terminals". + /// + /// The length must match that of the list of non-terminals. firsts: Vec<HashSet<Option<usize>>>, + /// The list of lists of nodes that are reachable after a nullable + /// transition in the regular expression. + /// + /// The length must match that of the list of non-terminals. first_nodes: Vec<Vec<usize>>, + // The following attribute is empty until we call `left_closure` + // on the grammar. 
+ left_closure_branches: HashSet<usize>, } /// A private type to aid the recursive looping of rergular @@ -192,12 +216,15 @@ impl Grammar { .map(|rule| Vec::with_capacity(rule.len())) .collect(); + let left_closure_branches = HashSet::default(); + Self { ter, non, rules, firsts, first_nodes, + left_closure_branches, } } @@ -293,6 +320,15 @@ impl Grammar { /// # Related /// /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. + /// + /// # Errors + /// + /// This function is supposed to return only one type of errors, + /// namely, the IndexOutOfBounds error that results from a bounds + /// check. Breaking this is breaking the guarantee of this + /// function, and is considered a bug. This behaviour can and + /// should be tested. But I have not found a convenient test + /// method for testing various grammars. #[inline] pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> { let ter_num = self.ter.len(); @@ -329,6 +365,13 @@ impl Grammar { .iter()) } + /// Return a hash set that contains all nodes in the set of + /// left-closure regular languages that are added because of the + /// left-linear expansion. + pub fn left_closure_branches(&self) -> &HashSet<usize> { + &self.left_closure_branches + } + /// Compute the set of terminals that can appear as the first /// terminal in some left-linear derivation of a non-terminal, for /// every non-terminal. @@ -865,11 +908,18 @@ impl Grammar { builder.add_edge(0, non_lit_index, ()).unwrap(); + // If this non-terminal is nullable, add an empty variant. + if self.is_nullable(n)? { + local_result.push(Empty); + let empty_index = builder.add_vertex(); + builder.add_edge(0, empty_index, ()).unwrap(); + } + for first_node in self.first_nodes_of(n)?.copied() { assert!(first_node < parents.len()); let tnt = match label!(first_node) { - Lit(tnt) => tnt, + Lit(tnt) => Lit(tnt), _ => continue, }; @@ -893,38 +943,40 @@ impl Grammar { }; if parents_chain.is_empty() { - local_result.push(Lit(tnt)); + local_result.push(tnt); let lit_index = builder.add_vertex(); builder.add_edge(0, lit_index, ()).unwrap(); continue; } - assert!(parents_chain.first().unwrap().0 == regex_root); + assert_eq!(parents_chain.first().unwrap().0, regex_root); // A different, "more local", root. - let mut root: usize; + let mut local_root: usize; // Handle the direct parent let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); match label!(parent_node) { Kleene | Plus => { - local_result.extend([Empty, Lit(tnt)]); + // TODO: If parent_edge_index is 0, make a + // Plus node instead. + local_result.extend([Empty, tnt]); - root = builder.add_vertex(); + local_root = builder.add_vertex(); let lit_index = builder.add_vertex(); - builder.add_edge(root, lit_index, ()).unwrap(); + builder.add_edge(local_root, lit_index, ()).unwrap(); let iterator = children!(parent_node); for index in iterator.clone().skip(parent_edge_index + 1) { - df_copy!(root, index); + df_copy!(local_root, index); } local_result.push(Kleene); let new_parent = builder.add_vertex(); - builder.add_edge(root, new_parent, ()).unwrap(); + builder.add_edge(local_root, new_parent, ()).unwrap(); for index in iterator { df_copy!(new_parent, index); @@ -932,19 +984,19 @@ impl Grammar { } Or => { - local_result.push(Lit(tnt)); - root = builder.add_vertex(); + local_result.push(tnt); + local_root = builder.add_vertex(); } Optional | Empty => { // If this path is taken, it should not be // optional. 
- local_result.extend([Empty, Lit(tnt)]); - root = builder.add_vertex(); + local_result.extend([Empty, tnt]); + local_root = builder.add_vertex(); let lit_index = builder.add_vertex(); - builder.add_edge(root, lit_index, ()).unwrap(); + builder.add_edge(local_root, lit_index, ()).unwrap(); for index in children!(parent_node).skip(parent_edge_index + 1) { - df_copy!(root, index); + df_copy!(local_root, index); } } Paren | Lit(_) => unreachable!(), @@ -957,21 +1009,24 @@ impl Grammar { match node_type { Kleene | Plus => { + // TODO: If edge_index is 0, then just + // make this a Plus node. + local_result.push(Empty); let new_index = builder.add_vertex(); - builder.add_edge(new_index, root, ()).unwrap(); + builder.add_edge(new_index, local_root, ()).unwrap(); - root = new_index; + local_root = new_index; let iterator = children!(node); for index in iterator.clone().skip(edge_index + 1) { - df_copy!(root, index); + df_copy!(local_root, index); } local_result.push(Kleene); let new_parent = builder.add_vertex(); - builder.add_edge(root, new_parent, ()).unwrap(); + builder.add_edge(local_root, new_parent, ()).unwrap(); for index in iterator { df_copy!(new_parent, index); @@ -981,18 +1036,18 @@ impl Grammar { RegexType::Optional | RegexType::Empty => { local_result.push(Empty); let new_index = builder.add_vertex(); - builder.add_edge(new_index, root, ()).unwrap(); - root = new_index; + builder.add_edge(new_index, local_root, ()).unwrap(); + local_root = new_index; for index in children!(node).skip(edge_index + 1) { - df_copy!(root, index); + df_copy!(local_root, index); } } RegexType::Paren | RegexType::Lit(_) => unreachable!(), } } - builder.add_edge(0, root, ()).unwrap(); + builder.add_edge(0, local_root, ()).unwrap(); } local_result.shrink_to_fit(); @@ -1040,7 +1095,9 @@ impl Grammar { TNT::Non(n) => Ok(SoC::Sub(n)), }; - DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into) + let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + + Ok(nfa) } } @@ -1066,182 +1123,9 @@ impl Display for Grammar { } } -#[cfg(test)] -mod test_grammar_helper { - use super::*; - use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}; - use std::fmt::Write; - - // Construct a grammar to test - pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { - let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; - let non = vec![ - Nonterminal("start".to_owned()), - Nonterminal("end".to_owned()), - ]; - - /// A function to scan the inputs. - fn scan_tnt( - parser: &DefaultRegParser<TNT>, - input: &str, - ) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> { - use ParseDirection::*; - use RegexType::*; - use TNT::*; - - let mut chars = input.chars(); - - if let Some(first) = chars.next() { - match first { - '*' => Ok(Some((1, Kleene, Right))), - '+' => Ok(Some((1, Plus, Right))), - '?' 
=> Ok(Some((1, Optional, Right))), - '|' => Ok(Some((1, Empty, Up))), - '(' => Ok(Some((1, Or, Down))), - ')' => Ok(Some((1, Paren, Up))), - 'T' => { - let mut name = String::new(); - let mut len = 1; - - while let Some(c) = chars.next() { - if ('a'..='z').contains(&c) { - len += 1; - write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; - } else { - break; - } - } - - if let Some(t) = parser.query(&name, true) { - Ok(Some((len, Lit(Ter(t)), Right))) - } else { - Err(ParseError::InvalidCharacter(first)) - } - } - 'N' => { - let mut name = String::new(); - let mut len = 1; - - while let Some(c) = chars.next() { - if ('a'..='z').contains(&c) { - len += 1; - write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; - } else { - break; - } - } - - if let Some(n) = parser.query(&name, false) { - Ok(Some((len, Lit(Non(n)), Right))) - } else { - Err(ParseError::InvalidCharacter(first)) - } - } - _ => Err(ParseError::InvalidCharacter(first)), - } - } else { - Ok(None) - } - } - - let mut regex_parser: DefaultRegParser<TNT> = Default::default(); - - regex_parser.add_tnt("a", true); - regex_parser.add_tnt("b", true); - regex_parser.add_tnt("start", false); - regex_parser.add_tnt("end", false); - - let regex_parser = regex_parser; - - let rule1 = Rule { - regex: regex_parser - .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? - .ok_or(ParseError::Invalid)? - .0, - }; - - let rule2 = Rule { - regex: regex_parser - .parse("Nstart?Nend*", Box::new(scan_tnt), true)? - .ok_or(ParseError::Invalid)? - .0, - }; - - let rules = vec![rule1, rule2]; - - Ok(Grammar::new(ter, non, rules)) - } -} - -#[cfg(test)] -mod test_grammar_display { - use super::test_grammar_helper::new_grammar; - - #[test] - fn test_display() -> Result<(), Box<dyn std::error::Error>> { - println!("{}", new_grammar()?); - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_firsts { - use super::test_grammar_helper::new_grammar; - use super::*; - - #[test] - fn test_firsts() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_grammar()?; - - grammar.compute_firsts()?; - - println!("grammar firsts: {:?}", grammar.firsts); - println!("grammar first nodes: {:?}", grammar.first_nodes); - - Ok(()) - } -} +// A helper module that provides some grammars for testing. 
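+//
+// A dependent crate enables the feature explicitly, in the way the
+// `chain` crate does in its manifest:
+//
+//     [dev-dependencies]
+//     grammar = { path = "../grammar", features = ["test-helper"] }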
+#[cfg(feature = "test-helper")] +pub mod test_grammar_helper; #[cfg(test)] -mod test_grammar_left_closure { - use super::test_grammar_helper::new_grammar; - use super::*; - - pub fn new_closure_regex( - grammar: &mut Grammar, - ) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> { - grammar.compute_firsts()?; - - println!("grammar firsts: {:?}", grammar.firsts); - println!("grammar first nodes: {:?}", grammar.first_nodes); - println!("grammar:"); - println!("{grammar}"); - - grammar.left_closure().map_err(Into::into) - } - - #[test] - fn test_regex() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_grammar()?; - - let vec_of_regexps = new_closure_regex(&mut grammar)?; - - for regex in &vec_of_regexps { - println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?); - // println!("regex: {regex:?}",); - println!("regex len = {}", regex.nodes_len()); - } - - Ok(()) - } - - #[test] - fn test_nfa() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_grammar()?; - let closure = new_closure_regex(&mut grammar)?; - let nfa = grammar.left_closure_to_nfa(&closure)?; - // TODO: print the nfa out - Ok(()) - } -} +mod tests; diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs new file mode 100644 index 0000000..c236952 --- /dev/null +++ b/grammar/src/test_grammar_helper.rs @@ -0,0 +1,368 @@ +//! This module provides some grammars for testing. + +use super::*; +use nfa::{ + default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}, + DesRec, +}; +use std::fmt::Write; + +/// A helper function to compute the first sets of a grammar and +/// return the left-closure of that grammar. +pub fn new_closure_regex( + grammar: &mut Grammar, +) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> { + grammar.compute_firsts()?; + + grammar.left_closure().map_err(Into::into) +} + +/// A function to scan the inputs. +fn scan_tnt( + parser: &DefaultRegParser<TNT>, + input: &str, +) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + use TNT::*; + + let mut chars = input.chars(); + + let mut len = 1; + + while let Some(first) = chars.next() { + match first { + ' ' => { + // ignore whitespaces + len += 1; + } + '*' => return Ok(Some((len, Kleene, Right))), + '+' => return Ok(Some((len, Plus, Right))), + '?' => return Ok(Some((len, Optional, Right))), + '|' => return Ok(Some((len, Empty, Up))), + '(' => return Ok(Some((len, Or, Down))), + ')' => return Ok(Some((len, Paren, Up))), + 'T' => { + let mut name = String::new(); + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(t) = parser.query(&name, true) { + return Ok(Some((len, Lit(Ter(t)), Right))); + } else { + return Err(ParseError::InvalidCharacter(first)); + } + } + 'N' => { + let mut name = String::new(); + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(n) = parser.query(&name, false) { + return Ok(Some((len, Lit(Non(n)), Right))); + } else { + return Err(ParseError::InvalidCharacter(first)); + } + } + _ => { + return Err(ParseError::InvalidCharacter(first)); + } + } + } + + Ok(None) +} + +/// Return a simple testing grammar. 
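+///
+/// Spelled out, the two rules below read:
+///
+/// ```text
+/// start ::= a* b+ end+
+/// end   ::= start? end*
+/// ```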
+#[allow(dead_code)] +pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("end".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("a", true); + regex_parser.add_tnt("b", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("end", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nstart?Nend*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that might serve as the grammar for my notes, +/// somehow. +#[allow(dead_code)] +pub fn new_notes_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![ + Terminal::new("NL".to_owned()), + Terminal::new("SP".to_owned()), + Terminal::new("CON".to_owned()), + Terminal::new("STAR".to_owned()), + Terminal::new("NOTE".to_owned()), + Terminal::new("PRICE".to_owned()), + Terminal::new("DIGIT".to_owned()), + ]; + let non = vec![ + Nonterminal("document".to_owned()), + Nonterminal("item".to_owned()), + Nonterminal("header".to_owned()), + Nonterminal("title".to_owned()), + Nonterminal("note".to_owned()), + Nonterminal("note-content".to_owned()), + Nonterminal("price".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("NL", true); + regex_parser.add_tnt("SP", true); + regex_parser.add_tnt("CON", true); + regex_parser.add_tnt("STAR", true); + regex_parser.add_tnt("note", true); + regex_parser.add_tnt("price", true); + regex_parser.add_tnt("DIGIT", true); + regex_parser.add_tnt("document", false); + regex_parser.add_tnt("item", false); + regex_parser.add_tnt("header", false); + regex_parser.add_tnt("title", false); + regex_parser.add_tnt("note", false); + regex_parser.add_tnt("notecontent", false); + regex_parser.add_tnt("price", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Nitem+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nheader Nprice?Nnote?", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("TSTAR?TSP Ntitle TNL (TSP|TNL)*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule4 = Rule { + regex: regex_parser + .parse("TCON+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule5 = Rule { + regex: regex_parser + .parse( + "Tnote Nnotecontent TNL (TSP|TNL)*", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule6 = Rule { + regex: regex_parser + .parse("TCON+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule7 = Rule { + regex: regex_parser + .parse( + "Tprice TSP TDIGIT+ TNL(TSP | TNL)+", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3, rule4, rule5, rule6, rule7]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that can express parentheses. 
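+///
+/// Spelled out, the rules below read:
+///
+/// ```text
+/// start   ::= LEFT start RIGHT | content start | ε
+/// content ::= A+
+/// ```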
+#[allow(dead_code)] +pub fn new_paren_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![ + Terminal::new("LEFT".to_owned()), + Terminal::new("RIGHT".to_owned()), + Terminal::new("A".to_owned()), + ]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("content".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("LEFT", true); + regex_parser.add_tnt("RIGHT", true); + regex_parser.add_tnt("A", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("content", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse( + "TLEFT Nstart TRIGHT | Ncontent Nstart | ", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TA +", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a left recursive grammar. +#[allow(dead_code)] +pub fn new_left_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("S".to_owned()), + Nonterminal("A".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("B", true); + regex_parser.add_tnt("C", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("S", false); + regex_parser.add_tnt("A", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("NA NS TC", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TB | Nstart", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("()", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a right recursive grammar. +#[allow(dead_code)] +pub fn new_right_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("S".to_owned()), + Nonterminal("A".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("B", true); + regex_parser.add_tnt("C", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("S", false); + regex_parser.add_tnt("A", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("NS TC NA|TB Nstart", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TB", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("NA|", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? 
+ .0, + }; + + let rules = vec![rule1, rule2, rule3]; + + Ok(Grammar::new(ter, non, rules)) +} +// TODO: more grammars diff --git a/grammar/src/tests/mod.rs b/grammar/src/tests/mod.rs new file mode 100644 index 0000000..55eedff --- /dev/null +++ b/grammar/src/tests/mod.rs @@ -0,0 +1,30 @@ +#[cfg(test)] +mod test_grammar_display { + use crate::test_grammar_helper::new_grammar; + + #[test] + fn test_display() -> Result<(), Box<dyn std::error::Error>> { + new_grammar().map(|g| println!("{g}")) + } +} + +#[cfg(test)] +mod test_grammar_firsts { + use crate::test_grammar_helper::new_grammar; + use std::io::Write; + + #[test] + fn test_firsts() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_grammar()?; + + grammar.compute_firsts()?; + + let mut lock = std::io::stdout().lock(); + + writeln!(lock, "grammar firsts: {:?}", grammar.firsts)?; + writeln!(lock, "grammar first nodes: {:?}", grammar.first_nodes).map_err(Into::into) + } +} + +#[cfg(test)] +mod test_grammar_left_closure; diff --git a/grammar/src/tests/test_grammar_left_closure.rs b/grammar/src/tests/test_grammar_left_closure.rs new file mode 100644 index 0000000..0bc9f4d --- /dev/null +++ b/grammar/src/tests/test_grammar_left_closure.rs @@ -0,0 +1,272 @@ +use crate::test_grammar_helper::*; +use crate::*; +use nfa::Nfa; +use std::{ + collections::HashSet, + io::{stdout, Write}, +}; + +#[test] +fn test_regex() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_grammar()?; + + let vec_of_regexps = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + writeln!(lock, "grammar firsts: {:?}", grammar.firsts)?; + writeln!(lock, "grammar first nodes: {:?}", grammar.first_nodes)?; + writeln!(lock, "grammar:")?; + writeln!(lock, "{grammar}")?; + + for regex in vec_of_regexps.into_iter().skip(1) { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| format!("{tnt}"))? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + } + + Ok(()) +} + +// We ignore this test by default as it is possibly involved with +// printing to a graphviz file. +#[ignore] +#[test] +fn test_nfa() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_notes_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("H({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? 
+ )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + } + + grammar + .left_closure_to_nfa(&closure) + .map(|_| ()) + .map_err(Into::into) + + // let _nfa = grammar.left_closure_to_nfa(&closure)?; + + // writeln!(lock, "Not printing nfa to nfa.gv")?; + + // nfa.print_viz("nfa.gv").map_err(Into::into) + + // Ok(()) +} + +#[test] +#[ignore] +fn test_remove_epsilon() -> Result<(), Box<dyn std::error::Error>> { + let mut lock = stdout().lock(); + + let mut grammar = new_paren_grammar()?; + + writeln!(lock, "grammar:")?; + writeln!(lock, "{grammar}")?; + + let closure = new_closure_regex(&mut grammar)?; + + let mut accumulator_value: usize = 0; + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + writeln!(lock, "regex len = {}", regex.nodes_len())?; + writeln!(lock, "offset = {accumulator_value}")?; + + accumulator_value += regex.nodes_len(); + } + + writeln!(lock, "total = {accumulator_value}")?; + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.print_viz("nfa_orig.gv")?; + + nfa.remove_epsilon(|label| label.is_none())?; + + nfa.print_viz("nfa_no_epsilon.gv")?; + + Ok(()) +} + +#[test] +#[ignore] +fn test_remove_dead() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_paren_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + let mut accumulator = 0usize; + let mut accumulators = Vec::with_capacity(closure.len() + 1); + accumulators.push(accumulator); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + + accumulator += regex.nodes_len() * 2; + + accumulators.push(accumulator); + } + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.print_viz("nfa_orig.gv")?; + + nfa.remove_epsilon(|label| label.is_none())?; + + let accumulators: HashSet<usize> = accumulators.into_iter().collect(); + + println!("accumulators = {accumulators:?}"); + + let grammar_reserve_node = |node| accumulators.contains(&node); + + nfa.remove_dead(grammar_reserve_node)?; + + nfa.print_viz("nfa_no_dead.gv")?; + + Ok(()) +} + +#[test] +#[ignore] +fn test_nulling() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_left_recursive_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + let mut accumulators = Vec::with_capacity(closure.len() + 1); + accumulators.push(0); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("H({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? 
+ )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + + accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); + } + + write!(lock, "nullables:")?; + let mut first_time = true; + for i in 0..(grammar.non_num()) { + if grammar.is_nullable(i)? { + if first_time { + write!(lock, " ")?; + } else { + write!(lock, ", ")?; + } + write!(lock, " {i}")?; + first_time = false; + } + } + writeln!(lock)?; + + let accumulators: HashSet<usize> = accumulators.into_iter().collect(); + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.nulling(|label| { + if let Some(label) = *label { + match label { + TNT::Ter(_) => false, + // Panics if a non-terminal references an invalid node + // here. + TNT::Non(n) => grammar.is_nullable(n).unwrap(), + } + } else { + true + } + })?; + + let grammar_reserve_nodes = |node| accumulators.contains(&node); + + writeln!(lock, "accumulators are {accumulators:?}")?; + + nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_dead(grammar_reserve_nodes)?; + + writeln!(lock, "Printing nfa to nfa.gv")?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) +} diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs index 6c1dcd0..ba9afb8 100644 --- a/graph/src/adlist.rs +++ b/graph/src/adlist.rs @@ -155,6 +155,12 @@ impl Builder for ALGBuilder { self.nodes.len() - 1 } + #[inline] + fn add_vertices(&mut self, n: usize) { + self.nodes + .extend(std::iter::repeat_with(ALNode::default).take(n)); + } + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { let nodes_len = self.nodes.len(); diff --git a/graph/src/adset.rs b/graph/src/adset.rs index c225986..adf2aaf 100644 --- a/graph/src/adset.rs +++ b/graph/src/adset.rs @@ -190,6 +190,12 @@ impl Builder for ASGBuilder { self.nodes.len() - 1 } + #[inline] + fn add_vertices(&mut self, n: usize) { + self.nodes + .extend(std::iter::repeat_with(ASNode::default).take(n)); + } + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { let nodes_len = self.nodes.len(); diff --git a/graph/src/builder.rs b/graph/src/builder.rs index ce85cce..9ab5895 100644 --- a/graph/src/builder.rs +++ b/graph/src/builder.rs @@ -27,6 +27,9 @@ pub trait Builder: Default { /// Add a vertex without children. fn add_vertex(&mut self) -> usize; + /// Add a number of vertices at the same time. + fn add_vertices(&mut self, n: usize); + /// Add an edge from the source to the target. fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error>; diff --git a/graph/src/labelled.rs b/graph/src/labelled.rs index d0a02d0..6341787 100644 --- a/graph/src/labelled.rs +++ b/graph/src/labelled.rs @@ -142,10 +142,11 @@ impl<T: GraphLabel> Graph for DLGraph<T> { match self.nodes.get(source) { Some(source_node) => { if self.nodes.get(target).is_none() { - return Err(Error::IndexOutOfBounds(target, self.nodes.len())); + Err(Error::IndexOutOfBounds(target, self.nodes.len())) + } else { + Ok(source_node.by_target.contains_key(&target) + && !source_node.by_target.get(&target).unwrap().is_empty()) } - - Ok(source_node.by_target.contains_key(&target)) } None => Err(Error::IndexOutOfBounds(source, self.nodes.len())), } @@ -198,7 +199,7 @@ impl<T: GraphLabel> Graph for DLGraph<T> { /// A delegation of iterators. /// /// This is used to avoid a boxed pointer to an iterator. 
-#[derive(Default, Debug)] +#[derive(Default, Debug, Clone)] pub struct LabelIndexIter<'a> { iter: Option<std::iter::Copied<Iter<'a, usize>>>, } @@ -279,25 +280,81 @@ impl<'a, T> Iterator for LabelIter<'a, T> { } } +/// This is used to avoid a boxed pointer to an iterator. +#[derive(Debug)] +pub struct EdgeLabelIter<'a, T> { + iter: Option<Iter<'a, T>>, +} + +impl<'a, T> Default for EdgeLabelIter<'a, T> { + #[inline] + fn default() -> Self { + Self { iter: None } + } +} + +impl<'a, T: Copy + Clone> ExactSizeIterator for EdgeLabelIter<'a, T> { + #[inline] + fn len(&self) -> usize { + if let Some(iter) = &self.iter { + iter.len() + } else { + 0 + } + } +} + +impl<'a, T: Copy + Clone> EdgeLabelIter<'a, T> { + fn new(iter: Iter<'a, T>) -> Self { + Self { iter: Some(iter) } + } +} + +impl<'a, T: Copy + Clone> Iterator for EdgeLabelIter<'a, T> { + type Item = T; + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + if let Some(iter) = &mut self.iter { + iter.next().copied() + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + if let Some(iter) = &self.iter { + iter.size_hint() + } else { + (0, Some(0)) + } + } +} + impl<T: GraphLabel> LabelGraph<T> for DLGraph<T> { type Iter<'a> = LabelIndexIter<'a> where T: 'a; type LabelIter<'a> = LabelIter<'a,T> where T: 'a; - fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, Error> { + type EdgeLabelIter<'a> = EdgeLabelIter<'a,T> + where + Self: 'a, + T: 'a + Copy + Clone; + + fn edge_label(&self, source: usize, target: usize) -> Result<Self::EdgeLabelIter<'_>, Error> { if self.has_edge(source, target)? { - Ok(self - .nodes - .get(source) - .unwrap() - .by_target - .get(&target) - .unwrap() - .iter() - .copied() - .collect()) + Ok(EdgeLabelIter::new( + self.nodes + .get(source) + .unwrap() + .by_target + .get(&target) + .unwrap() + .iter(), + )) } else { - Ok(Vec::new()) + Ok(Default::default()) } } @@ -335,11 +392,11 @@ impl<T: GraphLabel> LabelGraph<T> for DLGraph<T> { return Err(Error::IndexOutOfBounds(target, nodes_len)); } - if let Some(labels) = node.by_target.get(&target) { - Ok(labels.contains(label)) - } else { - Ok(false) - } + Ok(node + .by_target + .get(&target) + .map(|labels| labels.contains(label)) + .unwrap_or(false)) } None => Err(Error::IndexOutOfBounds(node_id, nodes_len)), } @@ -443,6 +500,12 @@ impl<T: GraphLabel> Builder for DLGBuilder<T> { self.nodes.len() - 1 } + #[inline] + fn add_vertices(&mut self, n: usize) { + self.nodes + .extend(std::iter::repeat_with(Default::default).take(n)); + } + fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error> { let nodes_len = self.nodes.len(); @@ -541,14 +604,29 @@ impl<T: GraphLabel> Builder for DLGBuilder<T> { // If after removal no labels remain for the target, // we remove the edge entirely, to avoid false empty // edges. 
- if source_node.flat.is_empty() { - source_node.by_target.remove(&target); - for label in to_remove.iter() { - source_node.by_label.remove(label); + if let Some(set) = source_node.by_target.get(&target) { + if set.is_empty() { + source_node.by_target.remove(&target); } } + for label in to_remove.iter() { + if let Some(set) = source_node.by_label.get(label) { + if set.is_empty() { + source_node.by_label.remove(label); + } + } + } + + // if source_node.flat.is_empty() { + // source_node.by_target.remove(&target); + + // for label in to_remove.iter() { + // source_node.by_label.remove(label); + // } + // } + let new_children_set: Set<(T, usize)> = source_node.flat.iter().copied().collect(); // When source_node is in use, we cannot borrow self @@ -639,10 +717,7 @@ mod label_test { ); // testing edge_label - assert_eq!( - graph.edge_label(5, 2)?.into_iter().collect::<Set<_>>(), - set!(3) - ); + assert_eq!(graph.edge_label(5, 2)?.collect::<Set<_>>(), set!(3)); assert!(matches!( graph.edge_label(6, 2), Err(Error::IndexOutOfBounds(6, 6)) @@ -683,8 +758,6 @@ mod label_test { } } -// TODO: Test print_viz - #[cfg(test)] mod test_dlgraph_builder { use super::*; diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 2d23af3..d4f6d7c 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -233,6 +233,12 @@ pub trait LabelGraph<T: GraphLabel>: Graph { Self: 'a, T: 'a; + /// A type that iterates over labels of an edge. + type EdgeLabelIter<'a>: Iterator<Item = T> + 'a + where + Self: 'a, + T: 'a; + #[inline] /// Return the label of a vertex or an error if the node is /// invalid. @@ -247,15 +253,9 @@ pub trait LabelGraph<T: GraphLabel>: Graph { } } - #[inline] /// Return the label of an edge or an error if some node is /// invalid. - /// - /// The default implementation always returns an empty vector for - /// valid nodes. - fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, Error> { - self.has_edge(source, target).map(|_| Vec::new()) - } + fn edge_label(&self, source: usize, target: usize) -> Result<Self::EdgeLabelIter<'_>, Error>; /// Return an iterator of edges out of a node, whose associated /// label is as given. diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index b9ee398..115bea5 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,5 +1,5 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`NFA`][super::Nfa] and another that implements the trait //! [`Regex`][super::Regex]. pub mod nfa; diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 229ba4d..e642218 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,17 +1,8 @@ //! This file provides a default implementation of NFA. -// TODO: The current focus of the project is to understand the growth -// rate of the algorithm, to know whether I made a mistake in the -// previous iteration of the implementation, or the algorithm is not -// as fast as the author estimated, which is not quite likely, of -// course. -// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. 
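Before the NFA changes below, a caller-side note on the `edge_label` API above: the method now hands back a lazy iterator instead of an eagerly collected `Vec`, so callers allocate only when they actually want a collection. A minimal illustrative sketch; `DLGraph<usize>` merely stands in for any `LabelGraph` implementation:

use graph::{error::Error, DLGraph, LabelGraph};
use std::collections::HashSet;

/// Collect the labels on the edge from `source` to `target`.
fn labels_between(
    g: &DLGraph<usize>,
    source: usize,
    target: usize,
) -> Result<HashSet<usize>, Error> {
    // `edge_label` is lazy now; the allocation happens at
    // `collect`, if the caller wants one at all.
    Ok(g.edge_label(source, target)?.collect())
}

This mirrors the updated `label_test` above, which now collects the returned iterator before comparing it against the expected set.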
-
 use graph::{
     builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel,
-    LabelGraph,
+    LabelExtGraph, LabelGraph,
 };

 use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};

@@ -32,6 +23,11 @@ impl<T: GraphLabel + Display> Default for DefaultNFA<T> {
     }
 }

+// Some boiler-plate delegation implementations.
+//
+// I deliberately avoid using [`Deref`][std::ops::Deref] here, as I do
+// not want to dereference an NFA to a Graph.
+
 impl<T: GraphLabel + Display> Graph for DefaultNFA<T> {
     type Iter<'a> = <DLGraph<T> as Graph>::Iter<'a> where T: 'a;

@@ -81,6 +77,11 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
     type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a;

+    type EdgeLabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::EdgeLabelIter<'a>
+    where
+        Self: 'a,
+        T: 'a;
+
     #[inline]
     fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> {
         if self.has_node(node_id) {
@@ -91,7 +92,7 @@
     }

     #[inline]
-    fn edge_label(&self, source: usize, target: usize) -> Result<Vec<T>, GError> {
+    fn edge_label(&self, source: usize, target: usize) -> Result<Self::EdgeLabelIter<'_>, GError> {
         self.graph.edge_label(source, target)
     }

@@ -115,12 +116,24 @@
     }
 }

+impl<T: GraphLabel + Display> LabelExtGraph<T> for DefaultNFA<T> {
+    #[inline]
+    fn extend(&mut self, edges: impl IntoIterator<Item = (T, usize)>) -> Result<usize, GError> {
+        self.graph.extend(edges)
+    }
+}
+
+// TODO: Define a type for the labels of DefaultNFA.
+
 impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
     type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;

+    // Reminder: This is an adapted version of Thompson's
+    // construction.
     fn to_nfa(
         regexps: &[impl Regex<RegexType<T>>],
         sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+        default: Option<T>,
     ) -> Result<Self::FromRegex<DOption<T>>, Error> {
         let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();

         if total_regexps_len == 0 {
             return Ok(Default::default());
         }

-        // We reserve two more rooms for later uses.
+        // We reserve two more rooms for the default edge.
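+        //
+        // For example, two regular expressions with 3 and 4 nodes
+        // respectively give nfa_len = (3 + 4) * 2 + 2 = 16: each
+        // regex node gets a pair of states, so states 0..6 belong
+        // to the first expression, states 6..14 to the second, and
+        // the default edge runs from state 14 to state 15.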
let nfa_len = total_regexps_len * 2 + 2; let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len); @@ -141,20 +154,18 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { builder.add_vertex(); } + // add a default edge + builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + let accumulators: Vec<usize> = { - let mut result = Vec::with_capacity(regexps.len() + 1); + let mut result = Vec::with_capacity(regexps.len()); result.push(0); - let mut accumulator = 0; - - for regexp in regexps.iter() { - accumulator += regexp.nodes_len() * 2; - result.push(accumulator); + for regexp in regexps.iter().take(regexps.len() - 1) { + result.push(result.last().unwrap() + regexp.nodes_len() * 2); } - result.pop(); - result }; @@ -313,9 +324,8 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { SoC::Sub(sub_non) => { // a non-terminal - let sub_offset = get_offset_safe!(sub_non); - let sub_nfa_start = sub_offset + 2 * sub_non; - let sub_nfa_end = sub_offset + 2 * sub_non + 1; + let sub_nfa_start = get_offset_safe!(sub_non); + let sub_nfa_end = sub_nfa_start + 1; builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; @@ -336,14 +346,102 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> { Ok(DefaultNFA { graph }) } - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() + fn remove_epsilon<F: Fn(T) -> bool>(&mut self, f: F) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut updated = true; + + let nodes_len = self.nodes_len(); + + while updated { + updated = false; + + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); + + for source in builder.nodes() { + for (label, target_iter) in builder.labels_of(source)? { + if f(*label) { + // empty label found + for target in target_iter { + for (label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + } + } + + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all epsilon edges + + let sources_and_targets: Vec<_> = builder.edges().collect(); + + for (source, target) in sources_and_targets.into_iter() { + builder.remove_edge(source, target, &f)?; + } + + self.graph = builder.build(); + + Ok(()) } - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut changed = true; + + // Use a counting vector to avoid calculating which nodes are + // dead at each iteration. + let mut target_counts: Vec<usize> = + std::iter::repeat(0).take(self.graph.nodes_len()).collect(); + + for (_, target) in self.graph.edges() { + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? += 1; + } + + while changed { + changed = false; + + for node in self.nodes() { + let deadp = !matches!(target_counts.get(node).copied(), Some(n) if n > 0); + + if deadp && !reserve(node) { + let to_remove: Vec<usize> = builder.children_of(node)?.collect(); + + if !to_remove.is_empty() { + changed = true; + + for target in to_remove { + builder.remove_edge(node, target, |_| true)?; + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? 
-= 1; + } + } + } + } + } + + self.graph = builder.build(); + + Ok(()) } + // REVIEW: Combine nulling and remove_epsilon into the same + // method, or not. + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; let mut builder = self.graph.builder_ref(); @@ -396,7 +494,6 @@ impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> { #[cfg(test)] mod test_to_nfa { - #![allow(unused_imports)] use super::*; use crate::SoC; @@ -446,8 +543,11 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA<DOption<char>> = - DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + let nfa: DefaultNFA<DOption<char>> = DefaultNFA::to_nfa( + &[regex], + |label| Ok(SoC::Carry(label)), + Some(char::default()), + )?; nfa.print_viz("nfa.gv")?; diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index 2670e32..c8ad370 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -260,9 +260,7 @@ impl<T: GraphLabel> DefaultRegex<T> { stack.push(Unseen(0)); while let Some(top) = stack.pop() { - let node_type = types[top.index()]; - - // TODO: Do not use unwrap here. + let node_type = types.get(top.index()).unwrap(); match node_type { RegexType::Kleene => { @@ -350,8 +348,12 @@ impl<T: GraphLabel> DefaultRegex<T> { .map(Unseen) .rev(), ); + + if self.graph.is_empty_node(top.index()).unwrap() { + write!(result, "ε")?; + } } - RegexType::Lit(label) => write!(result, "{}", f(label))?, + RegexType::Lit(label) => write!(result, "{}", f(*label))?, } } @@ -452,7 +454,15 @@ impl Display for ParseError { } } -impl std::error::Error for ParseError {} +impl std::error::Error for ParseError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let ParseError::Graph(gerr) = self { + Some(gerr) + } else { + None + } + } +} impl From<GError> for ParseError { fn from(ge: GError) -> Self { diff --git a/nfa/src/error.rs b/nfa/src/error.rs index ad75077..7160555 100644 --- a/nfa/src/error.rs +++ b/nfa/src/error.rs @@ -1,6 +1,6 @@ //! This file implements the error type for the crate. -use graph::error::Error as GError; +use graph::error::Error as GraphError; use core::fmt::{Display, Formatter}; @@ -17,7 +17,7 @@ pub enum Error { /// There is no root in the underlying regular expression. NoRoot, /// This error comes from some underlying graph operation. - Graph(GError), + Graph(GraphError), /// A cycle is found when constructing regular expressions. Cycle, } @@ -34,10 +34,18 @@ impl Display for Error { } } -impl std::error::Error for Error {} +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let Self::Graph(gerr) = self { + Some(gerr) + } else { + None + } + } +} -impl From<GError> for Error { - fn from(e: GError) -> Self { +impl From<GraphError> for Error { + fn from(e: GraphError) -> Self { Self::Graph(e) } } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index b1d62b3..31bd69a 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -14,7 +14,7 @@ use core::fmt::Display; use std::ops::{Deref, DerefMut}; -use graph::{Graph, GraphLabel, LabelGraph}; +use graph::{Graph, GraphLabel, LabelExtGraph}; use error::Error; @@ -52,14 +52,14 @@ pub trait Regex<T: GraphLabel>: Graph + Display + Clone { } } - // TODO: add functions that determine if certain "positions" in a + // TODO: Add functions that determine if certain "positions" in a // regular language satisfy some special properties, like at the // end of a Kleene star, or at the end of a regular language, et // cetera. 
These might be needed later. } -/// Since Option<T> does not inherit the Display from T, we wrap it to -/// provide an automatic implementation of Display. +/// Since `Option<T>` does not inherit the `Display` from `T`, we wrap +/// it to provide an automatic implementation of `Display`. #[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] pub struct DOption<T>(Option<T>); @@ -120,42 +120,98 @@ pub enum SoC<T> { /// The expected behvaiour of a nondeterministic finite automaton. /// /// Every NFA is a special labelled graph. -pub trait Nfa<T: GraphLabel>: LabelGraph<T> { +pub trait Nfa<T: GraphLabel>: LabelExtGraph<T> { + // TODO: This trait should have a type for the labels. + /// Remove all empty transitions from the nondeterministic finite /// automaton. - fn remove_epsilon(&mut self) -> Result<(), Error>; + fn remove_epsilon<F>(&mut self, f: F) -> Result<(), Error> + where + F: Fn(T) -> bool; /// Return a state-minimal NFA equivalent with the original one. /// /// This is not required. It is just to allow me to experiment /// with NFA optimization algorithms. - fn minimize(&self) -> Result<Self, Error> { + fn minimize(&self) -> Result<Box<Self>, Error> { Err(Error::UnsupportedOperation) } + /// Check every node or edge by a given predicate. + /// + /// This should also verify that every node and edge has correct + /// indices, so that we can safely use `unwrap` later. A + /// consequence is that, if one only wants to check the validity + /// of nodes and edges, one can pass a function that always + /// returns `true`. + #[inline] + fn labels_satisfy(&self, f: impl Fn(T) -> bool) -> Result<bool, Error> { + let nodes_len = self.nodes_len(); + let mut result = true; + + for node in self.nodes() { + for (label, children_iter) in self.labels_of(node)? { + for child in children_iter { + if child >= nodes_len { + dbg!(node, label); + return Err(graph::error::Error::IndexOutOfBounds(child, nodes_len).into()); + } + } + + // NOTE: Avoid executing `f` if `result` is already + // false. But still proceed in verifying that nodes + // and edges are correct: the correctness of nodes and + // edges is more important than the function execution + // results, as the former directly affects the safety + // of many algorithms. + if result && !f(*label) { + dbg!(node, label); + result = false; + } + } + } + + Ok(result) + } + /// When we convert a regular expression to a nondeterministic /// finite automaton, the label should be optional, so we use a /// different type for the result type. type FromRegex<S: GraphLabel + Display + Default>; - /// Build a nondeterministic finite automaton out of a set of - /// regular expressions. + /// Build a nondeterministic finite automaton out of a set + /// `regexps` of regular expressions. + /// + /// The `sub_pred` is a predicate function that returns an + /// indication whether to carry the value around or to substitute + /// by another regular language in the given set. + /// + /// The `default` parameter specifies the label of a default edge, + /// that goes from a node to another, both of which are not + /// associated with the given regular languages. /// - /// The SUB_PRED is a predicate function that returns an optional - /// index for a label. If it returns some index, then this means - /// we should substitute the index-th regular expression in the - /// set, whereever that label occurs in the set of regular - /// expressions. + /// This function should perform Thompson's construction, + /// basically. 
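+    ///
+    /// As a sketch only: over plain `char` labels, a `sub_pred`
+    /// that substitutes nothing is `|label| Ok(SoC::Carry(label))`,
+    /// as in `test_to_nfa` above, whereas a grammar-aware one maps
+    /// a label standing for the `n`-th regular language to
+    /// `SoC::Sub(n)`.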
fn to_nfa( regexps: &[impl Regex<RegexType<T>>], + // TODO: The transformation should produce more information. sub_pred: impl Fn(T) -> Result<SoC<T>, Error>, + default: Option<T>, ) -> Result<Self::FromRegex<DOption<T>>, Error>; /// Remove all dead states from the nondeterministic finite /// automaton. /// - /// A state is dead if there are no edges going to the state. - fn remove_dead(&mut self) -> Result<(), Error>; + /// A state is dead if there are no edges going to the state, and + /// if it is not reserved. + /// + /// # Note + /// + /// Actually an implementation is allowed to just remove all edges + /// out of the dead nodes. Then these nodes are considered gone + /// from the graph, and we don't need to re-index the existing + /// edges. We can call this "a poor man's removal of nodes". + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>; /// For each edge from A to B whose edge is considered nullable by /// a function `f`, and for every edge from B to C, add an edge @@ -169,6 +225,3 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> { pub mod default; pub mod desrec; - -#[cfg(test)] -mod nfa_tests {} diff --git a/receme/src/lib.rs b/receme/src/lib.rs index be1f028..d9e6249 100644 --- a/receme/src/lib.rs +++ b/receme/src/lib.rs @@ -48,7 +48,7 @@ pub mod ralgebra; // The following modules are for default implementations. pub mod tree; -// TODO: benchmarks +// REVIEW: Do we really need this crate? #[cfg(test)] mod test_expr_tree { diff --git a/receme/src/tree.rs b/receme/src/tree.rs index eb64f31..521f913 100644 --- a/receme/src/tree.rs +++ b/receme/src/tree.rs @@ -365,4 +365,4 @@ where } } -// TODO: Para, Apo, Histo, and Futu await us. +// REVIEW: Para, Apo, Histo, and Futu await us. diff --git a/semiring/Cargo.toml b/semiring/Cargo.toml new file mode 100644 index 0000000..0a82e32 --- /dev/null +++ b/semiring/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "semiring" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/semiring/src/lib.rs b/semiring/src/lib.rs new file mode 100644 index 0000000..7d12d9a --- /dev/null +++ b/semiring/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} |
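Taken together, the new tests exercise one pipeline: build a grammar, compute the left-closure regular expressions, run the Thompson-style construction, null the edges labelled by nullable non-terminals, then strip empty transitions and dead states. A condensed sketch of that flow, assuming the `grammar` crate's `test-helper` feature and the same re-exports the tests above rely on (`new_closure_regex`, `TNT`, `test_grammar_helper`):

use grammar::test_grammar_helper::new_left_recursive_grammar;
use grammar::{new_closure_regex, TNT};
use nfa::Nfa;
use std::collections::HashSet;

fn pipeline() -> Result<(), Box<dyn std::error::Error>> {
    let mut grammar = new_left_recursive_grammar()?;
    let closure = new_closure_regex(&mut grammar)?;

    // Each regex node turns into a pair of states, so the region
    // of every expression starts at twice the accumulated node
    // count.  Reserve these boundary states, as the tests do.
    let mut reserved: HashSet<usize> = HashSet::new();
    let mut offset = 0usize;
    reserved.insert(offset);
    for regex in closure.iter() {
        offset += regex.nodes_len() * 2;
        reserved.insert(offset);
    }

    let mut nfa = grammar.left_closure_to_nfa(&closure)?;

    // An edge is nullable if its label is empty or is a nullable
    // non-terminal.
    nfa.nulling(|label| match *label {
        Some(TNT::Ter(_)) => false,
        // Panics on an invalid non-terminal index, as in the
        // tests above.
        Some(TNT::Non(n)) => grammar.is_nullable(n).unwrap(),
        None => true,
    })?;

    nfa.remove_epsilon(|label| label.is_none())?;
    nfa.remove_dead(|node| reserved.contains(&node))?;

    Ok(())
}

Keeping the region boundaries in a reserve set follows `test_remove_dead` above: those states must survive even when no edge points to them.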