From bdbd4b4dc21af09711c97d3f903877443199af06 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Tue, 3 Jan 2023 23:44:02 +0800 Subject: structural change: separate crates out I put functionalities that are not strictly core to separate crates, so that the whole package becomes more modular, and makes it easier to try other parsing algorithms in the future. Also I have to figure the forests out before finishing the core chain-rule algorithm, as the part about forests affects the labels of the grammars directly. From my experiences in writing the previous version, it is asking for trouble to change the labels type dramatically at a later point: too many places need to be changed. Thus I decide to figure the rough part of forests out. Actually I only have to figure out how to attach forests fragments to edges of the underlying atomic languages, and the more complex parts of putting forests together can be left to the recorders, which is my vision of assembling semi-ring values during the chain-rule machine. It should be relatively easy to produce forests fragments from grammars since we are just trying to extract some information from the grammar, not to manipulate those information in some complicated way. We have to do some manipulations in the process, though, in order to make sure that the nulling and epsilon-removal processes do not invalidate these fragments. --- Cargo.toml | 6 +- chain/Cargo.toml | 7 +- chain/src/atom/default.rs | 342 +++++++ chain/src/atom/mod.rs | 41 + chain/src/default.rs | 46 + chain/src/grammar.rs | 1247 ------------------------ chain/src/lib.rs | 49 +- chain/src/plan.org | 154 ++- forest/Cargo.toml | 9 + forest/src/default.rs | 153 +++ forest/src/lib.rs | 105 ++ grammar/Cargo.toml | 20 + grammar/src/lib.rs | 1131 +++++++++++++++++++++ grammar/src/test_grammar_helper.rs | 368 +++++++ grammar/src/tests/mod.rs | 30 + grammar/src/tests/test_grammar_left_closure.rs | 272 ++++++ graph/src/adlist.rs | 6 + graph/src/adset.rs | 6 + graph/src/builder.rs | 3 + graph/src/labelled.rs | 135 ++- graph/src/lib.rs | 14 +- nfa/src/default/mod.rs | 2 +- nfa/src/default/nfa.rs | 160 ++- nfa/src/default/regex.rs | 20 +- nfa/src/error.rs | 18 +- nfa/src/lib.rs | 91 +- receme/src/lib.rs | 2 +- receme/src/tree.rs | 2 +- semiring/Cargo.toml | 8 + semiring/src/lib.rs | 14 + 30 files changed, 3080 insertions(+), 1381 deletions(-) create mode 100644 chain/src/atom/default.rs create mode 100644 chain/src/atom/mod.rs create mode 100644 chain/src/default.rs delete mode 100644 chain/src/grammar.rs create mode 100644 forest/Cargo.toml create mode 100644 forest/src/default.rs create mode 100644 forest/src/lib.rs create mode 100644 grammar/Cargo.toml create mode 100644 grammar/src/lib.rs create mode 100644 grammar/src/test_grammar_helper.rs create mode 100644 grammar/src/tests/mod.rs create mode 100644 grammar/src/tests/test_grammar_left_closure.rs create mode 100644 semiring/Cargo.toml create mode 100644 semiring/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 77de27f..e34a8cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,10 +10,12 @@ keywords = ["emacs", "parser"] # generic associated types, which are not stablized until version # 1.65. rust-version = "1.65" -# testing the new resolver, even though this has no dependencies ;p [workspace] -members = ["graph", "receme", "nfa", "chain", "viz"] +members = [ "graph", "receme", "nfa", "chain", "viz", "grammar", + "forest", "semiring"] + +# testing the new resolver, even though this has no dependencies ;p resolver = "2" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/chain/Cargo.toml b/chain/Cargo.toml index 32eed8b..265954c 100644 --- a/chain/Cargo.toml +++ b/chain/Cargo.toml @@ -7,4 +7,9 @@ edition = "2021" [dependencies] nfa = { path = "../nfa" } -graph = { path = "../graph" } \ No newline at end of file +graph = { path = "../graph" } +grammar = { path = "../grammar" } +forest = { path = "../forest" } + +[dev-dependencies] +grammar = { path = "../grammar", features = ["test-helper"] } diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs new file mode 100644 index 0000000..72989b3 --- /dev/null +++ b/chain/src/atom/default.rs @@ -0,0 +1,342 @@ +//! This file provides a default implementation of the +//! [`Atom`][super::Atom] trait. + +use super::*; +use grammar::Grammar; +use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; +use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; + +use core::fmt::Display; +use std::collections::BTreeMap as Map; + +/// A virtual node represents the derivative of a non-terminal symbol +/// `S` with respect to a terminal symbol `t`. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)] +struct VirtualNode { + s: usize, + t: usize, +} + +impl Display for VirtualNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "VN[{}]^({})", self.s, self.t) + } +} + +impl VirtualNode { + fn new(s: usize, t: usize) -> Self { + Self { s, t } + } +} + +type VirtualMap = Map; + +/// The type of atomic languages. +#[derive(Debug, Clone, Default)] +pub struct DefaultAtom { + grammar: Grammar, + nfa: DefaultNFA>, + // NOTE: This is mostly for printing and debugging + regexp: Vec>, + virtual_nodes: VirtualMap, +} + +impl Display for DefaultAtom { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let grammar = &self.grammar; + + let error_to_string = |err| format!("{err}"); + let tnt_to_string = |tnt, deco| { + grammar + .name_of_tnt(tnt) + .map(|name| format!("{deco}({name})")) + .unwrap_or_else(error_to_string) + }; + + let display_tnt = |tnt| match tnt { + TNT::Ter(t) => format!( + "({})", + grammar + .unpack_tnt(t) + .map(|tnt| tnt_to_string(tnt, "")) + .unwrap_or_else(error_to_string) + ), + TNT::Non(_) => tnt_to_string(tnt, "H"), + }; + + writeln!(f, "regular expressions:")?; + + let mut accumulators = Vec::with_capacity(self.regexp.len() + 1); + accumulators.push(0usize); + + for regex in self.regexp.iter() { + writeln!(f, "accumulator: {}", accumulators.last().unwrap())?; + + accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); + + let string = regex.to_string_with(display_tnt)?; + + writeln!(f, "{string}")?; + } + + writeln!(f, "total = {}", accumulators.last().unwrap())?; + + writeln!(f, "virtual nodes:")?; + + for (virtual_node, index) in self.virtual_nodes.iter() { + writeln!(f, "{virtual_node}: {index}")?; + } + + Ok(()) + } +} + +// Some boiler-plate delegation implementations for Graph and +// LabelGraph, in order to implement Nfa. + +impl Graph for DefaultAtom { + type Iter<'b> = > as Graph>::Iter<'b> + where + Self: 'b; + + fn is_empty(&self) -> bool { + self.nfa.is_empty() + } + + fn nodes_len(&self) -> usize { + self.nfa.nodes_len() + } + + fn children_of(&self, node_id: usize) -> Result, GraphError> { + self.nfa.children_of(node_id) + } + + fn degree(&self, node_id: usize) -> Result { + self.nfa.degree(node_id) + } + + fn is_empty_node(&self, node_id: usize) -> Result { + self.nfa.is_empty_node(node_id) + } + + fn has_edge(&self, source: usize, target: usize) -> Result { + self.nfa.has_edge(source, target) + } + + fn replace_by_builder(&mut self, _builder: impl graph::Builder) { + // NOTE: We cannot replace by a builder whose result is an + // atom, not the underlying graph type. + unimplemented!() + } +} + +impl LabelGraph> for DefaultAtom { + type Iter<'b> = > as LabelGraph>>::Iter<'b> + where + Self: 'b; + + type LabelIter<'b> = > as LabelGraph>>::LabelIter<'b> + where + Self: 'b, + DOption: 'b; + + type EdgeLabelIter<'a> = > as LabelGraph>>::EdgeLabelIter<'a> + where + Self: 'a, + DOption: 'a; + + #[inline] + fn vertex_label(&self, node_id: usize) -> Result>, GraphError> { + self.nfa.vertex_label(node_id) + } + + #[inline] + fn edge_label( + &self, + source: usize, + target: usize, + ) -> Result, GraphError> { + self.nfa.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &DOption, + ) -> Result<>>::Iter<'_>, GraphError> { + self.nfa.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result, GraphError> { + self.nfa.labels_of(node_id) + } + + #[inline] + fn has_edge_label( + &self, + node_id: usize, + label: &DOption, + target: usize, + ) -> Result { + self.nfa.has_edge_label(node_id, label, target) + } +} + +impl LabelExtGraph> for DefaultAtom { + #[inline] + fn extend( + &mut self, + edges: impl IntoIterator, usize)>, + ) -> Result { + self.nfa.extend(edges) + } +} + +impl Nfa> for DefaultAtom { + #[inline] + fn remove_epsilon(&mut self, f: F) -> Result<(), nfa::error::Error> + where + F: Fn(DOption) -> bool, + { + self.nfa.remove_epsilon(f) + } + + type FromRegex = (); + + #[inline] + fn to_nfa( + _regexps: &[impl nfa::Regex>>], + _sub_pred: impl Fn(DOption) -> Result>, nfa::error::Error>, + _default: Option>, + ) -> Result>>, nfa::error::Error> { + // NOTE: We cannot construct an atom from a set of regular + // languages alone. So it is appropriate to panic here, if + // one tries to do so, for some reason. + unimplemented!() + } + + #[inline] + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + self.nfa.remove_dead(reserve) + } + + #[inline] + fn nulling(&mut self, f: impl Fn(DOption) -> bool) -> Result<(), nfa::error::Error> { + self.nfa.nulling(f) + } +} + +impl DefaultAtom { + /// Construct an atomic language from a grammar. + pub fn from_grammar(mut grammar: Grammar) -> Result { + grammar.compute_firsts()?; + + let regexp = grammar.left_closure()?; + + let mut nfa = grammar.left_closure_to_nfa(®exp)?; + + use std::collections::HashSet; + + let accumulators: Vec = { + let mut result = Vec::with_capacity(regexp.len() + 1); + result.push(0); + + for regex in regexp.iter() { + result.push(regex.nodes_len() * 2 + result.last().unwrap()); + } + + result.into_iter().collect() + }; + + let accumulators_set: HashSet = accumulators.iter().copied().collect(); + + nfa.nulling(|label| { + if let Some(label) = *label { + match label { + TNT::Ter(_) => false, + // Panics if a non-terminal references an invalid node + // here. + TNT::Non(n) => grammar.is_nullable(n).unwrap(), + } + } else { + true + } + })?; + nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_dead(|node| accumulators_set.contains(&node))?; + + // now add the virtual nodes + let mut virtual_nodes: VirtualMap = Default::default(); + + let nt_num = grammar.non_num(); + + assert!(nt_num <= accumulators.len()); + + // Convert an error telling us that an index is out of bounds. + // + // Panics if the error is not of the expected kind. + fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError { + match ge { + GraphError::IndexOutOfBounds(index, bound) => { + GrammarError::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + } + } + + for nt in 0..nt_num { + let children: std::collections::HashMap, Vec<_>> = nfa + // this is safe because of the above assertion. + .labels_of(*accumulators.get(nt).unwrap()) + .map_err(index_out_of_bounds_conversion)? + .map(|(label, target_iter)| (*label, target_iter.collect())) + .collect(); + + for (label, children_vec) in children.into_iter() { + if let Some(TNT::Ter(t)) = *label { + let new_index = nfa + .extend(children_vec.into_iter().map(|target| (label, target))) + .map_err(index_out_of_bounds_conversion)?; + + let virtual_node = VirtualNode::new(nt, t); + + virtual_nodes.insert(virtual_node, new_index); + } + } + } + + Ok(Self { + grammar, + nfa, + regexp, + virtual_nodes, + }) + } +} + +/// A convenient getter for the map of virtual nodes. +fn query(map: &VirtualMap, nt: usize, t: usize) -> Option { + map.get(&VirtualNode::new(nt, t)).copied() +} + +impl Atom for DefaultAtom { + fn atom(&self, nt: usize, t: usize) -> Result, GrammarError> { + if nt >= self.grammar.non_num() { + return Err(GrammarError::IndexOutOfBounds(nt, self.grammar.non_num())); + } + + if t >= self.grammar.ter_num() { + return Err(GrammarError::IndexOutOfBounds(t, self.grammar.ter_num())); + } + + Ok(query(&self.virtual_nodes, nt, t)) + } + + fn empty(&self) -> usize { + assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2); + + self.nfa.nodes_len() - 2 + } +} diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs new file mode 100644 index 0000000..084acca --- /dev/null +++ b/chain/src/atom/mod.rs @@ -0,0 +1,41 @@ +//! This file defines the behaviour of the Atomic languages, and +//! provides a default implementation. +//! +//! Here I do not to substitute external packages' implementations in +//! the future, so why define a trait for the atomic languages? +//! Because this way I can easily substitute other implementations if +//! I have better ideas in the future. + +use grammar::{Error as GrammarError, TNT}; +use nfa::{DOption, Nfa}; + +/// The expected behaviours of an atomic language. +pub trait Atom: Nfa> { + /// Return the index of a node representing the derivative of the + /// left-linear null closure of `nt` with respect to `t`. + fn atom(&self, nt: usize, t: usize) -> Result, GrammarError>; + + /// Return the index of the empty state. + fn empty(&self) -> usize; +} + +pub mod default; + +pub use default::DefaultAtom; + +#[cfg(test)] +mod tests { + use super::*; + use grammar::test_grammar_helper::*; + + #[test] + fn atom() -> Result<(), Box> { + let grammar = new_notes_grammar()?; + + let atom = DefaultAtom::from_grammar(grammar)?; + + println!("atom = {atom}"); + + Ok(()) + } +} diff --git a/chain/src/default.rs b/chain/src/default.rs new file mode 100644 index 0000000..e04be9f --- /dev/null +++ b/chain/src/default.rs @@ -0,0 +1,46 @@ +//! This file provides a default implementation of the +//! [`Chain`][crate::Chain] trait. +//! +//! The reason for using a trait is that I might want to experiment +//! with different implementation ideas in the future, and this +//! modular design makes that easy. + +use super::*; +use core::fmt::Display; + +/// The errors related to taking derivatives by chain rule. +#[derive(Debug)] +pub enum Error { + /// An invalid situation happens. + Invalid, +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Invalid => write!(f, "invalid"), + } + } +} + +impl std::error::Error for Error {} + +/// A default implementation for the [`Chain`] trait. +#[derive(Debug, Clone, Default)] +pub struct DefaultChain {} + +impl Chain for DefaultChain { + type Error = Error; + + fn unit() -> Self { + todo!() + } + + fn chain(&mut self, _t: usize) { + todo!() + } + + fn epsilon(&self) -> bool { + todo!() + } +} diff --git a/chain/src/grammar.rs b/chain/src/grammar.rs deleted file mode 100644 index 287fbca..0000000 --- a/chain/src/grammar.rs +++ /dev/null @@ -1,1247 +0,0 @@ -#![warn(missing_docs)] -//! This file implements the extected behaviours of grammars. - -// NOTE: We shall first start with a parser that works at the level of -// characters. The purpose is to first experiment with the workings -// and the performance of the algorithms, before optimising by using -// regular expressions to classify inputs into tokens. In other -// words, the current focus is not on the optimisations, whereas -// scanners are for optimisations only, so to speak. - -#![allow(unused_imports)] -use nfa::{ - default::{ - nfa::DefaultNFA, - regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, - }, - DOption, DesRec, Nfa, Regex, SoC, -}; - -use graph::{adlist::ALGBuilder, builder::Builder, Graph}; - -use std::{ - collections::HashSet, - fmt::{Display, Write}, -}; - -/// The type of a terminal. -/// -/// For the time being this is a wrapper around a string, but in the -/// future it may hold more information of scanners. -#[derive(Debug, Clone, Eq, PartialEq)] -pub struct Terminal { - // If we want to use scanners, per chance add them as a new field - // here. - name: String, -} - -impl Terminal { - /// Create a terminal with the given name. - #[inline] - pub fn new(name: String) -> Self { - Self { name } - } - - /// Return the name of the terminal. - #[inline] - pub fn name(&self) -> &str { - &self.name - } -} - -/// The type of a non-terminal. -/// -/// This is just a wrapper around a string. -#[derive(Debug, Clone)] -pub struct Nonterminal(String); - -impl Nonterminal { - /// Return the name of the nonterminal. - /// - /// Just to improve readability. - #[inline] - pub fn name(&self) -> &str { - &self.0 - } -} - -/// The type of a terminal or a non-terminal. -/// -/// Only an index is stored here. Actual data are stored in two other -/// arrays. -#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum TNT { - /// Terminal variant - Ter(usize), - /// Nonterminal variant - Non(usize), -} - -impl Display for TNT { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TNT::Ter(t) => write!(f, "T({t})"), - TNT::Non(n) => write!(f, "N({n})"), - } - } -} - -/// Errors related to grammar operations. -#[derive(Debug, Copy, Clone)] -#[non_exhaustive] -pub enum Error { - /// The first component is the index, and the second the bound. - IndexOutOfBounds(usize, usize), - /// Fail to build the N-th regular expression, due to the - /// ParseError. - BuildFail(usize, ParseError), - /// fail to build NFA - NFAFail(nfa::error::Error), -} - -impl From for Error { - fn from(nfae: nfa::error::Error) -> Self { - Self::NFAFail(nfae) - } -} - -impl Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), - Error::BuildFail(n, pe) => write!( - f, - "Failed to build the {n}-th regular expression due to error: {pe}" - ), - Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), - } - } -} - -impl std::error::Error for Error {} - -/// A rule is a regular expression of terminals or non-terminals. -#[derive(Debug, Clone)] -pub struct Rule { - regex: DefaultRegex, -} - -impl Rule { - /// Return true if and only if the rule is empty. - #[inline] - pub fn is_empty(&self) -> bool { - self.regex.is_empty() - } - - /// Return the length of the rule. - #[inline] - pub fn len(&self) -> usize { - self.regex.len() - } -} - -/// The type of a grammar. -#[derive(Debug, Clone, Default)] -pub struct Grammar { - ter: Vec, - non: Vec, - rules: Vec, - firsts: Vec>>, - first_nodes: Vec>, -} - -/// A private type to aid the recursive looping of rergular -/// expressions. -#[derive(Copy, Clone)] -enum StackElement { - Seen(usize), - Unseen(usize), -} - -impl StackElement { - fn index(self) -> usize { - match self { - Self::Seen(index) => index, - Self::Unseen(index) => index, - } - } - - fn is_seen(self) -> bool { - matches!(self, Self::Seen(_)) - } -} - -impl Grammar { - /// Construct a grammar from a vector of terminals, a vector of - /// non-terminals, and a vector of rules for the non-temrinals. - /// - /// # Panic - /// - /// If the length of `non` is not equal to that of `rules`, then - /// the function panics. - pub fn new(ter: Vec, non: Vec, rules: Vec) -> Self { - assert_eq!(non.len(), rules.len()); - - // One more room is reserved for the `None` value. - let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) - .take(non.len()) - .collect(); - - let first_nodes = rules - .iter() - .map(|rule| Vec::with_capacity(rule.len())) - .collect(); - - Self { - ter, - non, - rules, - firsts, - first_nodes, - } - } - - /// Return the name of a terminal or a non-terminal. - pub fn name_of_tnt(&self, tnt: TNT) -> Result { - match tnt { - TNT::Ter(t) => Ok(format!( - "T{}", - self.ter - .get(t) - .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? - .name() - )), - TNT::Non(n) => Ok(format!( - "N{}", - self.non - .get(n) - .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? - .name() - )), - } - } - - /// Return true if and only if there are no non-terminals in the - /// grammar. - #[inline] - pub fn is_empty(&self) -> bool { - self.non.is_empty() - } - - /// Return the total length of all rules. - #[inline] - pub fn total(&self) -> usize { - self.rules.iter().map(Rule::len).sum() - } - - /// Return the number of terminals. - #[inline] - pub fn ter_num(&self) -> usize { - self.ter.len() - } - - /// Return the number of non-terminals. - #[inline] - pub fn non_num(&self) -> usize { - self.non.len() - } - - /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a - /// single number to represent terminals and non-terminals. - /// - /// # Bounds - /// - /// If a terminal index is greater than or equal to the number of - /// terminals, then this signals an error; mutatis mutandis for - /// non-terminals. - /// - /// # Related - /// - /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. - #[inline] - pub fn pack_tnt(&self, tnt: TNT) -> Result { - let ter_num = self.ter.len(); - let non_num = self.non.len(); - - match tnt { - TNT::Ter(t) => { - if t >= ter_num { - Err(Error::IndexOutOfBounds(t, ter_num)) - } else { - Ok(t) - } - } - TNT::Non(n) => { - if n >= non_num { - Err(Error::IndexOutOfBounds(n, non_num)) - } else { - Ok(n + ter_num) - } - } - } - } - - /// Convert a single number to either a terminal or a - /// non-terminal. - /// - /// # Bounds - /// - /// If the number is greater than or equal to the sum of the - /// numbers of terminals and of non-terminals, then this signals - /// an error. - /// - /// # Related - /// - /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. - #[inline] - pub fn unpack_tnt(&self, flat: usize) -> Result { - let ter_num = self.ter.len(); - let non_num = self.non.len(); - - if flat < ter_num { - Ok(TNT::Ter(flat)) - } else if flat < ter_num + non_num { - Ok(TNT::Non(flat - ter_num)) - } else { - Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) - } - } - - /// Return true if and only if the non-terminal is nullable. - #[inline] - pub fn is_nullable(&self, non_terminal: usize) -> Result { - Ok(self - .firsts - .get(non_terminal) - .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? - .contains(&None)) - } - - /// For a NON_TERMINAL, return an iterator that goes over the - /// nodes that are reachable from the non-terminal through an - /// empty transition of the nondeterministic finite automaton. - #[inline] - pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { - Ok(self - .first_nodes - .get(non_terminal) - .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? - .iter()) - } - - /// Compute the set of terminals that can appear as the first - /// terminal in some left-linear derivation of a non-terminal, for - /// every non-terminal. - /// - /// This is an algorithm that computes the transitive closure, - /// which is a common approach for this task. But perhaps there - /// are more efficient approaches? - /// - /// Also the function computes the set of "reachable nodes" in the - /// process, and records the information in the `first_nodes` - /// attribute. - pub fn compute_firsts(&mut self) -> Result<(), Error> { - let mut updated = true; - - let non_len = self.non_num(); - - use StackElement::{Seen, Unseen}; - - while updated { - updated = false; - - for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { - let root = if let Some(root) = regex.root() { - root - } else { - if !self.is_nullable(n)? { - updated = true; - - self.firsts.get_mut(n).unwrap().insert(None); - - // The default construction of a grammar - // reserves some space for each vector, so - // explicitly setting this can reduce some - // minor memory overhead. - let pointer = self.first_nodes.get_mut(n).unwrap(); - - pointer.clear(); - pointer.shrink_to_fit(); - } - - continue; - }; - - let regex_len = regex.len(); - - let mut stack: Vec = Vec::with_capacity(regex_len); - - stack.push(Unseen(root)); - - let mut children_sets_stack: Vec>> = - Vec::with_capacity(regex_len); - - let mut children_nodes_stack: Vec> = Vec::with_capacity(regex_len); - - while let Some(top) = stack.pop() { - let top_index = top.index(); - let is_seen = top.is_seen(); - - match regex - .vertex_label(top_index) - .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? - { - RegexType::Kleene => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Plus => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Optional => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Or => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Paren => { - // Only for printing purposes - let mut this_set = HashSet::new(); - - this_set.insert(None); - - children_sets_stack.push(this_set); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - RegexType::Empty => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap().rev() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Lit(tnt) => { - match tnt { - TNT::Ter(t) => { - let mut this_set = HashSet::with_capacity(1); - - this_set.insert(Some(t)); - - children_sets_stack.push(this_set); - } - TNT::Non(non) => { - let this_set = self - .firsts - .get(non) - .ok_or(Error::IndexOutOfBounds(non, non_len))? - .clone(); - - children_sets_stack.push(this_set); - } - } - - let mut this_nodes = HashSet::with_capacity(1); - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - } - } - - assert_eq!( - children_sets_stack.len(), - 1, - "Too many elements left at the end" - ); - - assert_eq!( - children_nodes_stack.len(), - 1, - "Too many elements left at the end" - ); - - for first in children_sets_stack.pop().unwrap().into_iter() { - if !self.firsts.get(n).unwrap().contains(&first) { - updated = true; - - self.firsts.get_mut(n).unwrap().insert(first); - } - } - - *self.first_nodes.get_mut(n).unwrap() = - children_nodes_stack.pop().unwrap().into_iter().collect(); - } - } - - Ok(()) - } - - /// Return the regular language of the left-linear closures of - /// non-terminals in the grammar. - /// - /// The resulting vector is guaranteed to be of the same length as - /// the number of non-terminals. - /// - /// The resulting regular language is not "self-contained". That - /// is to say, its terminals indices are packed indices and are - /// meaningless without the interpretation of the grammar. They - /// should be converted to a nondeterministic finite automaton and - /// then to its null closure later on. - pub fn left_closure(&self) -> Result>, Error> { - let non_len = self.non_num(); - - let mut result = Vec::with_capacity(non_len); - - for (n, rule) in self.rules.iter().enumerate() { - let regex = &rule.regex; - - let regex_root = if let Some(root) = regex.root() { - root - } else { - result.push(Default::default()); - - continue; - }; - - let regex_len = regex.len(); - - /// A convenient macro to retrieve the children from the - /// original regular expression with error propagation. - macro_rules! children { - ($n:expr) => { - regex - .children_of($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - /// A convenient macro to retrieve the label from the - /// original regular expression with error propagation. - macro_rules! label { - ($n:expr) => { - regex - .vertex_label($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - let parents = regex.parents_array().map_err(|e| match e { - nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), - nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), - _ => unreachable!(), - })?; - - use RegexType::*; - use TNT::*; - - let mut local_result: Vec> = Vec::with_capacity(regex_len * 2); - let mut builder = ALGBuilder::default(); - - /// Perform a depth-first copy - macro_rules! df_copy { - ($parent:expr, $index:expr) => { - match label!($index) { - Kleene | Plus | Optional | Or | Paren | Empty => { - let mut stack = vec![($parent, $index)]; - - while let Some((top_parent, top_index)) = stack.pop() { - let label = label!(top_index); - let label = match label { - Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), - _ => label, - }; - - local_result.push(label); - - let new = builder.add_vertex(); - - builder.add_edge(top_parent, new, ()).unwrap(); - - stack.extend(children!(top_index).map(|child| (new, child))); - } - } - Lit(remain_tnt) => { - local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); - let new = builder.add_vertex(); - builder.add_edge($parent, new, ()).unwrap(); - } - } - }; - } - - local_result.push(Or); - builder.add_vertex(); - - local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); - let non_lit_index = builder.add_vertex(); - - builder.add_edge(0, non_lit_index, ()).unwrap(); - - for first_node in self.first_nodes_of(n)?.copied() { - assert!(first_node < parents.len()); - - let tnt = match label!(first_node) { - Lit(tnt) => tnt, - _ => continue, - }; - - let mut parents_chain = { - let mut result = Vec::new(); - let mut stack = Vec::with_capacity(parents.len()); - - stack.push(first_node); - - while let Some(top) = stack.pop() { - assert!(top < parents.len()); - if let Some(parent) = parents.get(top).copied().unwrap() { - result.push(parent); - stack.push(parent.0); - } - } - - result.reverse(); - - result - }; - - if parents_chain.is_empty() { - local_result.push(Lit(tnt)); - let lit_index = builder.add_vertex(); - builder.add_edge(0, lit_index, ()).unwrap(); - - continue; - } - - assert!(parents_chain.first().unwrap().0 == regex_root); - - // A different, "more local", root. - let mut root: usize; - - // Handle the direct parent - let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); - - match label!(parent_node) { - Kleene | Plus => { - local_result.extend([Empty, Lit(tnt)]); - - root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(root, lit_index, ()).unwrap(); - - let iterator = children!(parent_node); - - for index in iterator.clone().skip(parent_edge_index + 1) { - df_copy!(root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - - Or => { - local_result.push(Lit(tnt)); - root = builder.add_vertex(); - } - Optional | Empty => { - // If this path is taken, it should not be - // optional. - local_result.extend([Empty, Lit(tnt)]); - root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(root, lit_index, ()).unwrap(); - - for index in children!(parent_node).skip(parent_edge_index + 1) { - df_copy!(root, index); - } - } - Paren | Lit(_) => unreachable!(), - } - - // Handle successive parents - - for (node, edge_index) in parents_chain.into_iter() { - let node_type = label!(node); - - match node_type { - Kleene | Plus => { - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, root, ()).unwrap(); - - root = new_index; - - let iterator = children!(node); - - for index in iterator.clone().skip(edge_index + 1) { - df_copy!(root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - RegexType::Or => {} - RegexType::Optional | RegexType::Empty => { - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, root, ()).unwrap(); - root = new_index; - - for index in children!(node).skip(edge_index + 1) { - df_copy!(root, index); - } - } - RegexType::Paren | RegexType::Lit(_) => unreachable!(), - } - } - - builder.add_edge(0, root, ()).unwrap(); - } - - local_result.shrink_to_fit(); - - let graph = builder.build(); - - assert_eq!(graph.nodes_len(), local_result.len()); - - result.push( - DefaultRegex::new(graph, local_result) - .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, - ); - } - - assert_eq!(result.len(), non_len); - - Ok(result) - } - - /// Convert the regular language of left-linear closures to its - /// equivalent nondeterministic finite automaton. - /// - /// In the generation of the left-linear closure, the terminals - /// and non-terminals are packed into an unsigned integer. We - /// unpack them in converting to nondeterministic finite - /// automaton. - /// - /// The resulting nondeterministic finite automaton should be - /// transformed to its null-closure for use in our algorithm. - pub fn left_closure_to_nfa( - &self, - closure: &[DefaultRegex], - ) -> Result>, Error> { - let label_transform = |tnt| match tnt { - TNT::Ter(t) => { - let new_tnt = self.unpack_tnt(t).map_err(|e| match e { - Error::IndexOutOfBounds(index, bound) => { - graph::error::Error::IndexOutOfBounds(index, bound) - } - _ => unreachable!(), - })?; - - Ok(SoC::Carry(new_tnt)) - } - TNT::Non(n) => Ok(SoC::Sub(n)), - }; - - DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into) - } -} - -impl Display for Grammar { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - assert_eq!(self.non.len(), self.rules.len()); - - for (nt, rule) in self.non.iter().zip(self.rules.iter()) { - write!(f, "{}: ", nt.name())?; - - writeln!( - f, - "{}", - rule.regex.to_string_with(|tnt| format!( - "({})", - self.name_of_tnt(tnt) - .unwrap_or_else(|_| format!("Unknown {tnt:?}")) - ))? - )?; - } - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_helper { - use super::*; - use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}; - use std::fmt::Write; - - // Construct a grammar to test - pub fn new_grammar() -> Result> { - let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; - let non = vec![ - Nonterminal("start".to_owned()), - Nonterminal("end".to_owned()), - ]; - - /// A function to scan the inputs. - fn scan_tnt( - parser: &DefaultRegParser, - input: &str, - ) -> Result, ParseDirection)>, ParseError> { - use ParseDirection::*; - use RegexType::*; - use TNT::*; - - let mut chars = input.chars(); - - if let Some(first) = chars.next() { - match first { - '*' => Ok(Some((1, Kleene, Right))), - '+' => Ok(Some((1, Plus, Right))), - '?' => Ok(Some((1, Optional, Right))), - '|' => Ok(Some((1, Empty, Up))), - '(' => Ok(Some((1, Or, Down))), - ')' => Ok(Some((1, Paren, Up))), - 'T' => { - let mut name = String::new(); - let mut len = 1; - - while let Some(c) = chars.next() { - if ('a'..='z').contains(&c) { - len += 1; - write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; - } else { - break; - } - } - - if let Some(t) = parser.query(&name, true) { - Ok(Some((len, Lit(Ter(t)), Right))) - } else { - Err(ParseError::InvalidCharacter(first)) - } - } - 'N' => { - let mut name = String::new(); - let mut len = 1; - - while let Some(c) = chars.next() { - if ('a'..='z').contains(&c) { - len += 1; - write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; - } else { - break; - } - } - - if let Some(n) = parser.query(&name, false) { - Ok(Some((len, Lit(Non(n)), Right))) - } else { - Err(ParseError::InvalidCharacter(first)) - } - } - _ => Err(ParseError::InvalidCharacter(first)), - } - } else { - Ok(None) - } - } - - let mut regex_parser: DefaultRegParser = Default::default(); - - regex_parser.add_tnt("a", true); - regex_parser.add_tnt("b", true); - regex_parser.add_tnt("start", false); - regex_parser.add_tnt("end", false); - - let regex_parser = regex_parser; - - let rule1 = Rule { - regex: regex_parser - .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? - .ok_or(ParseError::Invalid)? - .0, - }; - - let rule2 = Rule { - regex: regex_parser - .parse("Nstart?Nend*", Box::new(scan_tnt), true)? - .ok_or(ParseError::Invalid)? - .0, - }; - - let rules = vec![rule1, rule2]; - - Ok(Grammar::new(ter, non, rules)) - } -} - -#[cfg(test)] -mod test_grammar_display { - use super::test_grammar_helper::new_grammar; - - #[test] - fn test_display() -> Result<(), Box> { - println!("{}", new_grammar()?); - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_firsts { - use super::test_grammar_helper::new_grammar; - use super::*; - - #[test] - fn test_firsts() -> Result<(), Box> { - let mut grammar = new_grammar()?; - - grammar.compute_firsts()?; - - println!("grammar firsts: {:?}", grammar.firsts); - println!("grammar first nodes: {:?}", grammar.first_nodes); - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_left_closure { - use super::test_grammar_helper::new_grammar; - use super::*; - - pub fn new_closure_regex( - grammar: &mut Grammar, - ) -> Result>, Box> { - grammar.compute_firsts()?; - - println!("grammar firsts: {:?}", grammar.firsts); - println!("grammar first nodes: {:?}", grammar.first_nodes); - println!("grammar:"); - println!("{grammar}"); - - grammar.left_closure().map_err(Into::into) - } - - #[test] - fn test_regex() -> Result<(), Box> { - let mut grammar = new_grammar()?; - - let vec_of_regexps = new_closure_regex(&mut grammar)?; - - for regex in &vec_of_regexps { - println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?); - // println!("regex: {regex:?}",); - println!("regex len = {}", regex.nodes_len()); - } - - Ok(()) - } - - #[test] - fn test_nfa() -> Result<(), Box> { - let mut grammar = new_grammar()?; - let closure = new_closure_regex(&mut grammar)?; - let nfa = grammar.left_closure_to_nfa(&closure)?; - // TODO: print the nfa out - Ok(()) - } -} diff --git a/chain/src/lib.rs b/chain/src/lib.rs index 0ec4d4c..4e21e1d 100644 --- a/chain/src/lib.rs +++ b/chain/src/lib.rs @@ -1,3 +1,4 @@ +#![warn(missing_docs)] //! This package implements the core algorithm of the entire //! workspace: parsing with derivatives by means of chain rule and //! regular nulling languages. @@ -7,19 +8,43 @@ //! think is the essence of this algorithm, the chain-rule for //! derivatives of languages. -pub mod grammar; +pub mod atom; -pub fn add(left: usize, right: usize) -> usize { - left + right -} +// TODO: Define errors. + +/// The expected behaviours of a language which can take derivatives +/// by chain rule. +pub trait Chain: Default { + /// The implementations should choose a type to represent errors. + type Error: std::error::Error; -#[cfg(test)] -mod tests { - use super::*; + /// Represents the language that is present after we parse the + /// empty string, that is the initial configuration of the + /// language. This may or may not be different from what + /// `Default::default` gives. + fn unit() -> Self; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } + /// Take the derivative by a terminal symbol. + /// + /// This takes care of the union and the prepending operations. + /// + /// # A little remark about the design + /// + /// I have thought to separate different operations (like the + /// union, the prepending, and single derivatives) and then define + /// a function to put everything together. But I think that + /// design is not convenient to use. Also, I really do not need + /// those operations other than to provide this derivative + /// operation, so why define them? And putting all things + /// together may reduce the number of bugs caused by wrong uses of + /// those component functions, and can reduce the amount of + /// documentation strings a library user needs to read, in order + /// to make use of this trait. So I ended up with this design. + fn chain(&mut self, t: usize); + + /// Return true if and only if the language contains the empty + /// string. + fn epsilon(&self) -> bool; } + +pub mod default; diff --git a/chain/src/plan.org b/chain/src/plan.org index 8c63492..61738a9 100644 --- a/chain/src/plan.org +++ b/chain/src/plan.org @@ -2,29 +2,73 @@ #+AUTHOR: Durand #+DATE: <2022-11-18 Ven 19:57> -* Things to do [2/7] +* Things to do [5/10] - [X] Implement builders for graphs - [X] Find sets of the first terminals for each non-terminal, in the grammar module. -- [-] NFA [4/5] +- [-] Establish a testing framework of various grammars. [5/6] + + [X] One simple grammar + + [X] Notes + + [X] Parentheses + + [X] Pathelogical left-recursive + + [X] Pathelogical right-recursive + + [ ] Pathelogically ambiguous + # More should be added +- [X] NFA [4/4] + [X] Add regular expression into NFA. + [X] Convert a set of regular expressions into a nondeterministic finite automaton, where non-terminals denote the indices of regular expressions in the set to substitute. - + [X] Convert a grammar into its grammar of left-linear closures of - non-temrinals and their derivatives. - + [X] Convert nondeterministic finite automata to their null - closures. - + [ ] Test more grammars to ensure it works correctly. -- [ ] Define the Atom trait. -- [ ] Implement virtual nodes for each derivative of each atom: The - lack of this step might be the root cause of the failure of the - previous version of this package. -- [ ] Implement languages. [0/2] - + [ ] First implement them as doubly labelled (directed acyclic) - graphs. - + [ ] Implement finding derivatives. + + [X] Convert a grammar to the language of atomic languages. [2/2] + * [X] Convert a grammar into its grammar of left-linear closures + of non-temrinals and their derivatives. + * [X] Convert nondeterministic finite automata to their null + closures. + + [X] Test more grammars to ensure it works correctly. [5/5] + * [X] Test the conversion to left-closure as a set of regular + expressions. + * [X] Test the conversion of a set of regular expressions into a + nondeterministic finite automaton. + * [X] Test the closure of null edges, where the nullity of an edge + is defined by a function. In our specific case, an edge is null + if the grammar symbol that edge represents, either a terminal or + a non-terminal, can match an empty string. + * [X] Test the removal of empty transitions from nondeterministic + finite automata. + * [X] Test the removal of dead states, where a state is dead if + and only if no other states have an edge into that state. +- [ ] Refactor [0/3] + + [ ] When we pull in some regular language because of the + left-linear expansion, we need to mark that branch as coming from + left-linear expansions. This is necessary because we cannot + follow a left-linearly expanded branch when we take the derivative + of a language. We only expand left-linearly when we try to access + the atomic languages [s]^{(t)}. We can mark by returning a set of + nodes which are the beginnings of left-linearly expanded branches. + + [ ] An edge of the NFA should carry a label that can be more + informative than solely a terminal or a non-terminal. + + [ ] Add a mechanism for a grammar to attach labels to edges of NFA + which are not necessarily Copy-able, and which should be stored in a + separate array, such that the labels on edges of NFA point to the + elements of the array. +- [X] Atom [3/3] + + [X] Define the Atom trait. + + [X] Implement virtual nodes for each derivative of each atom: The + lack of this step might be the root cause of the failure of the + previous version of this project. + + [X] Test atoms +- [-] Implement languages. [1/3] + + [X] First define a trait with the expected behaviours. + + [ ] Then implement them as doubly labelled graphs. + + [ ] Thenimplement finding derivatives by use of the chain rule. +- [-] Implement forests [0/2] + + [-] Define a trait with the expected behaviours. + + [ ] Implement them using adjacency list: we only need one label + per edge, and we do not wish to exclude duplicate edges, and we do + not need to index edges by the labels. All in all, the labels can + be implemented by a simple hash map that maps edges to labels, so + a special data type is not needed here. - [ ] Implement semiring. [0/5] + [ ] Define the trait. + [ ] Implement the boolean semiring. @@ -32,6 +76,9 @@ + [ ] Implement the free semiring. + [ ] Compute and record a semiring value when computing derivatives. +- [X] Miscellaneous [1/1] + + [X] Implement a function for NFA that checks if a given predicate + is satisfied for each edge label. * Atomic Languages @@ -39,10 +86,6 @@ This describes the behaviours of atomic languages. The atomic language consists of the null closure of any non-terminal symbol in the grammar, and their deriavtives by terminals and non-terminal. -* Script for creating GIF animation - -[[https://gist.github.com/maelvls/5379127][a gist]] - * Derivative Languages This is the main driving device of the algorithm. Basically, the @@ -82,6 +125,61 @@ should be easy enough, since we already have the mechanism of graphs, nondeterministic automata, and semirings. All we need to do is to combine them together. +* Lexers + +I got this idea from [[https://lists.gnu.org/archive/html/emacs-devel/2022-12/msg01127.html][a discussion on emacs-devel]]. The idea can be +formulated as + +#+begin_quote +Potentially it allows invocation of a parser with different variants +of lexers - one mode with block tokens for the exploration of +project's structure, and another mode for indentation and error +checking purposes. +#+end_quote + +I think this is the "right" use of lexers. That is to say, the parser +by default should operate on an abstract type of tokens, which are but +unsigned integers, and rely on the user application to provide a lexer +for turning the actual input, like a string, into an iterator of +tokens. The lexer can choose to do any sort of preprocessing that it +sees fit, or it can do nothing and just turn characters to unsigned +integers. In the latter case, this parser generator works on the +character level. But, when one neeeds, one can instruct the parser +generator to work on the level of preprocessed tokens easily. + +This has the advantage of allowing a certain degree of flexibility in +the type of tokens accepted, while keeping the implementation simple +at the same time: the internal core only has to deal with unsigned +integers, after all. + +* Library for drawing graphs + +In the past, I thought that drawing graphs is only a development tool +for my package, so I can rely on such external tools as GraphViz, as +that would not be needed for my end-users. + +But recently I realized that this is not a need only for the +development of the package, but also for the users as well. To be +more specific, the target users are those who want to "play" with +grammars. So if one cannot view the resulting parse forest diagrams +easily and interactively, it would be hard to "play" with grammars. + +Moreover, the internal workings of the parsing machine might also be +interesting for users to inspect, whether to debug the program or to +know more under the hood. As such it is required to view the diagrams +of the internal directed acyclic graphs. + +For all these reasons, I know regard the ability to draw graphs and to +interact with those graphs is essential to the user experience of this +package. As a consequence, I would like to develop a small default +library for this purpose. I follow the convention adapted by this +package here as well, which is to provide a default implementation for +the functionality, while still leaving the room for users to +substitute with other external packages, if so desired, for whatever +reason. For example, an external package may provide an unprecedented +performance for drawing graphs in Emacs. If such a package appears, I +can easily avail of that external package to draw graphs. + * Testing ground I am in a strong need to test things out. The most important one is @@ -92,3 +190,21 @@ understanding of the main algorithm is plain wrong. This is the main reason I started this rewrite of the package. +* To figure out + +I need to figure out how to construct the parse forest from a +left-most null-closed derivation graph. + +Reading through the original paper again, I found that the author +augmented the atomic languages with annotations of partial derivation +steps on each edge of the nondeterministic finite automaton. In +brief, this is what I tried to achieve with some obscure constructs +such as expanding reduction informations carried by the +nondeterministic finite automata, in one of the previous versions. To +sum up, I really need to carry those informations in the first place, +otherwise the parse forests cannot be reliably construced later on. + +But I really do not want to adopt the approach of the original author. +I still prefer the idea of a recorder. That is to say, instead of +constructing the parse forest after the recognition, we shall +construct the parse forest simultaneously while we are recognizing. diff --git a/forest/Cargo.toml b/forest/Cargo.toml new file mode 100644 index 0000000..b88451d --- /dev/null +++ b/forest/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "forest" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +graph = { path = "../graph" } diff --git a/forest/src/default.rs b/forest/src/default.rs new file mode 100644 index 0000000..36145f4 --- /dev/null +++ b/forest/src/default.rs @@ -0,0 +1,153 @@ +//! This file provides a default implementation for the +//! [`Forest`][crate::Forest] trait. + +use super::*; +use graph::{error::Error as GraphError, ALGraph, Graph}; + +use std::collections::{hash_set::Iter, HashMap, HashSet}; + +#[derive(Debug, Clone)] +pub struct DefaultForest { + graph: ALGraph, + vertex_labels: HashMap, + edge_labels: HashMap<(usize, usize), EdgeLabel>, + plugins: HashSet, + plugouts: HashSet, +} + +impl Default for DefaultForest { + fn default() -> Self { + Self { + graph: Default::default(), + vertex_labels: Default::default(), + edge_labels: Default::default(), + plugins: Default::default(), + plugouts: Default::default(), + } + } +} + +impl Graph for DefaultForest { + type Iter<'a> = ::Iter<'a> + where + Self: 'a; + + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + fn children_of(&self, node_id: usize) -> Result, GraphError> { + self.graph.children_of(node_id) + } + + fn degree(&self, node_id: usize) -> Result { + self.graph.degree(node_id) + } + + fn is_empty_node(&self, node_id: usize) -> Result { + self.graph.is_empty_node(node_id) + } + + fn has_edge(&self, source: usize, target: usize) -> Result { + self.graph.has_edge(source, target) + } + + fn replace_by_builder(&mut self, _builder: impl graph::Builder) { + unimplemented!() + } +} + +impl ExtGraph + for DefaultForest +{ + fn extend(&mut self, edges: impl IntoIterator) -> Result { + self.graph.extend(edges) + } +} + +#[derive(Debug)] +pub struct LabelIter<'a> { + set_iter: Iter<'a, usize>, +} + +impl<'a> Iterator for LabelIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + self.set_iter.next().copied() + } + + fn size_hint(&self) -> (usize, Option) { + self.set_iter.size_hint() + } +} + +impl<'a> ExactSizeIterator for LabelIter<'a> { + fn len(&self) -> usize { + let (lower, upper) = self.size_hint(); + // Note: This assertion is overly defensive, but it checks the + // invariant guaranteed by the trait. + debug_assert!(upper == Some(lower)); + lower + } +} + +impl<'a> From> for LabelIter<'a> { + fn from(set_iter: Iter<'a, usize>) -> Self { + Self { set_iter } + } +} + +impl Forest + for DefaultForest +{ + type PluginIter<'a> = LabelIter<'a> + where + Self: 'a; + + type PlugoutIter<'a> = LabelIter<'a> + where + Self: 'a; + + fn plugins(&self) -> Self::PluginIter<'_> { + self.plugins.iter().into() + } + + fn plugouts(&self) -> Self::PlugoutIter<'_> { + self.plugouts.iter().into() + } + + fn plug(&mut self, other: &Self) -> Result<(), Error> { + // PLAN: Clone the `other` forest to `self`, adjust the + // indices for edges, and then add edges between plugs. + // + // Since I cannot touch the underlying nodes directly, I have + // to add the nodes and edges individually. + + todo!() + } + + fn vertex_label(&self, node_id: usize) -> Result, Error> { + if node_id >= self.nodes_len() { + return Err(Error::IndexOutOfBounds(node_id, self.nodes_len())); + } + + Ok(self.vertex_labels.get(&node_id).copied()) + } + + fn edge_label(&self, source: usize, target: usize) -> Result, Error> { + if source >= self.nodes_len() { + return Err(Error::IndexOutOfBounds(source, self.nodes_len())); + } + + if target >= self.nodes_len() { + return Err(Error::IndexOutOfBounds(target, self.nodes_len())); + } + + Ok(self.edge_labels.get(&(source, target)).copied()) + } +} diff --git a/forest/src/lib.rs b/forest/src/lib.rs new file mode 100644 index 0000000..527a78c --- /dev/null +++ b/forest/src/lib.rs @@ -0,0 +1,105 @@ +//! This file defines the expected behaviours of forests, and provides +//! a default implementation for forests. +//! +//! The default forest actually implements the so-called shared packed +//! parse forest, or SPPF. It packs sub-trees with the same parents +//! under the same branch, and lets nodes from different branches +//! share the same children, and hence the name. +//! +//! What is special here is that the forests are marked with some +//! out-coming and in-coming plugs. These plugs are used to join +//! different fragments of forests together. + +use graph::{ExtGraph, GraphLabel}; + +use core::fmt::Display; + +#[derive(Debug)] +pub enum Error { + IndexOutOfBounds(usize, usize), +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::IndexOutOfBounds(index, bound) => { + write!(f, "index {index} is out of bound {bound}") + } + } + } +} + +impl std::error::Error for Error {} + +/// A builder of a forest. +pub trait ForestBuilder { + /// The type of the resulting forest. + type Output: Forest; + + /// Construct a new builder with only one node with the given + /// label. + fn new_leaf(label: NodeLabel) -> Self; + + /// Add a child to the builder the given labels for the new node + /// and the added edge. + /// + /// All plug-out nodes within the builder should have a new child + /// with the specified labels, and hence multiple children might + /// be added, and the plug-out nodes should be modified + /// accordingly. + fn add_children(&mut self, vertex_label: NodeLabel, edge_labe: EdgeLabel); + + /// Build the forest. + fn build(self) -> Self::Output; + + /// Build the forest from a reference. + fn build_ref(&self) -> Self::Output; +} + +/// The expected behaviours of a forest. +/// +/// Note that it contains a "striped down" version of the labelled +/// graphs. +pub trait Forest: ExtGraph { + /// Type of iterator of plug-in nodes. + type PluginIter<'a>: Iterator + 'a + where + Self: 'a; + + /// Type of iterator of plug-out nodes. + type PlugoutIter<'a>: Iterator + 'a + where + Self: 'a; + + /// Return the plug-in nodes + fn plugins(&self) -> Self::PluginIter<'_>; + + /// Return the plug-out nodes + fn plugouts(&self) -> Self::PlugoutIter<'_>; + + /// Plug another forest onto this forest. + /// + /// The plug-out nodes of this forest will be joined onto the + /// plug-in nodes of the joining forest. + /// + /// # Performance warning + /// + /// It is recommended to only call this function with a "small" + /// `other`, as this function might copy the whole graph + /// individually, node by node and edge by edge. + fn plug(&mut self, other: &Self) -> Result<(), Error>; + + /// Return the vertex label. + /// + /// A vertex may not have labels. + fn vertex_label(&self, node_id: usize) -> Result, Error>; + + /// Return the edge label. + /// + /// An edge may have no labels. If there is no edge from the + /// given source to the given target, then `Ok(None)` is returned + /// as well. + fn edge_label(&self, source: usize, target: usize) -> Result, Error>; +} + +pub mod default; diff --git a/grammar/Cargo.toml b/grammar/Cargo.toml new file mode 100644 index 0000000..20c4b48 --- /dev/null +++ b/grammar/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "grammar" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[features] +default = [] + +# This flag exposes the module "test_grammar_helper" which provides +# some grammars for testing. +test-helper = [] + +[dependencies] +nfa = { path = "../nfa" } +graph = { path = "../graph" } + +[dev-dependencies] +grammar = { path = ".", features = ["test-helper"] } diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs new file mode 100644 index 0000000..55adc9a --- /dev/null +++ b/grammar/src/lib.rs @@ -0,0 +1,1131 @@ +#![warn(missing_docs)] +//! This file implements the extected behaviours of grammars. + +// NOTE: We shall first start with a parser that works at the level of +// characters. The purpose is to first experiment with the workings +// and the performance of the algorithms, before optimising by using +// regular expressions to classify inputs into tokens. In other +// words, the current focus is not on the optimisations, whereas +// scanners are for optimisations only, so to speak. + +// TODO: Separate contents into modules. + +use nfa::{ + default::{ + nfa::DefaultNFA, + regex::{DefaultRegex, ParseError, RegexType}, + }, + DOption, Nfa, Regex, SoC, +}; + +use graph::{adlist::ALGBuilder, builder::Builder, Graph}; + +use std::{collections::HashSet, fmt::Display}; + +/// The type of a terminal. +/// +/// For the time being this is a wrapper around a string, but in the +/// future it may hold more information of scanners. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Terminal { + // If we want to use scanners, per chance add them as a new field + // here. + name: String, +} + +impl Terminal { + /// Create a terminal with the given name. + #[inline] + pub fn new(name: String) -> Self { + Self { name } + } + + /// Return the name of the terminal. + #[inline] + pub fn name(&self) -> &str { + &self.name + } +} + +/// The type of a non-terminal. +/// +/// This is just a wrapper around a string. +#[derive(Debug, Clone)] +pub struct Nonterminal(String); + +impl Nonterminal { + /// Return the name of the nonterminal. + /// + /// Just to improve readability. + #[inline] + pub fn name(&self) -> &str { + &self.0 + } +} + +/// The type of a terminal or a non-terminal. +/// +/// Only an index is stored here. Actual data are stored in two other +/// arrays. +#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum TNT { + /// Terminal variant + Ter(usize), + /// Nonterminal variant + Non(usize), +} + +impl Display for TNT { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TNT::Ter(t) => write!(f, "T({t})"), + TNT::Non(n) => write!(f, "N({n})"), + } + } +} + +/// Errors related to grammar operations. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum Error { + /// The first component is the index, and the second the bound. + IndexOutOfBounds(usize, usize), + /// Fail to build the N-th regular expression, due to the + /// ParseError. + BuildFail(usize, ParseError), + /// fail to build NFA + NFAFail(nfa::error::Error), +} + +impl From for Error { + fn from(nfae: nfa::error::Error) -> Self { + Self::NFAFail(nfae) + } +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), + Error::BuildFail(n, pe) => write!( + f, + "Failed to build the {n}-th regular expression due to error: {pe}" + ), + Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + } + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let Error::NFAFail(error) = self { + Some(error) + } else { + None + } + } +} + +/// A rule is a regular expression of terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct Rule { + regex: DefaultRegex, +} + +impl Rule { + /// Return true if and only if the rule is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.regex.is_empty() + } + + /// Return the length of the rule. + #[inline] + pub fn len(&self) -> usize { + self.regex.len() + } +} + +/// The type of a grammar. +#[derive(Debug, Clone, Default)] +pub struct Grammar { + /// A list of terminals. + ter: Vec, + /// A list of non-terminals. + non: Vec, + /// A list of rules. + /// + /// The length of the list must match that of the list of + /// non-terminals. + rules: Vec, + // The following two attributes are empty until we call + // `compute_firsts` on the grammar. + /// The list of sets of "first terminals". + /// + /// The length must match that of the list of non-terminals. + firsts: Vec>>, + /// The list of lists of nodes that are reachable after a nullable + /// transition in the regular expression. + /// + /// The length must match that of the list of non-terminals. + first_nodes: Vec>, + // The following attribute is empty until we call `left_closure` + // on the grammar. + left_closure_branches: HashSet, +} + +/// A private type to aid the recursive looping of rergular +/// expressions. +#[derive(Copy, Clone)] +enum StackElement { + Seen(usize), + Unseen(usize), +} + +impl StackElement { + fn index(self) -> usize { + match self { + Self::Seen(index) => index, + Self::Unseen(index) => index, + } + } + + fn is_seen(self) -> bool { + matches!(self, Self::Seen(_)) + } +} + +impl Grammar { + /// Construct a grammar from a vector of terminals, a vector of + /// non-terminals, and a vector of rules for the non-temrinals. + /// + /// # Panic + /// + /// If the length of `non` is not equal to that of `rules`, then + /// the function panics. + pub fn new(ter: Vec, non: Vec, rules: Vec) -> Self { + assert_eq!(non.len(), rules.len()); + + // One more room is reserved for the `None` value. + let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) + .take(non.len()) + .collect(); + + let first_nodes = rules + .iter() + .map(|rule| Vec::with_capacity(rule.len())) + .collect(); + + let left_closure_branches = HashSet::default(); + + Self { + ter, + non, + rules, + firsts, + first_nodes, + left_closure_branches, + } + } + + /// Return the name of a terminal or a non-terminal. + pub fn name_of_tnt(&self, tnt: TNT) -> Result { + match tnt { + TNT::Ter(t) => Ok(format!( + "T{}", + self.ter + .get(t) + .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? + .name() + )), + TNT::Non(n) => Ok(format!( + "N{}", + self.non + .get(n) + .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? + .name() + )), + } + } + + /// Return true if and only if there are no non-terminals in the + /// grammar. + #[inline] + pub fn is_empty(&self) -> bool { + self.non.is_empty() + } + + /// Return the total length of all rules. + #[inline] + pub fn total(&self) -> usize { + self.rules.iter().map(Rule::len).sum() + } + + /// Return the number of terminals. + #[inline] + pub fn ter_num(&self) -> usize { + self.ter.len() + } + + /// Return the number of non-terminals. + #[inline] + pub fn non_num(&self) -> usize { + self.non.len() + } + + /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a + /// single number to represent terminals and non-terminals. + /// + /// # Bounds + /// + /// If a terminal index is greater than or equal to the number of + /// terminals, then this signals an error; mutatis mutandis for + /// non-terminals. + /// + /// # Related + /// + /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. + #[inline] + pub fn pack_tnt(&self, tnt: TNT) -> Result { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + match tnt { + TNT::Ter(t) => { + if t >= ter_num { + Err(Error::IndexOutOfBounds(t, ter_num)) + } else { + Ok(t) + } + } + TNT::Non(n) => { + if n >= non_num { + Err(Error::IndexOutOfBounds(n, non_num)) + } else { + Ok(n + ter_num) + } + } + } + } + + /// Convert a single number to either a terminal or a + /// non-terminal. + /// + /// # Bounds + /// + /// If the number is greater than or equal to the sum of the + /// numbers of terminals and of non-terminals, then this signals + /// an error. + /// + /// # Related + /// + /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. + /// + /// # Errors + /// + /// This function is supposed to return only one type of errors, + /// namely, the IndexOutOfBounds error that results from a bounds + /// check. Breaking this is breaking the guarantee of this + /// function, and is considered a bug. This behaviour can and + /// should be tested. But I have not found a convenient test + /// method for testing various grammars. + #[inline] + pub fn unpack_tnt(&self, flat: usize) -> Result { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + if flat < ter_num { + Ok(TNT::Ter(flat)) + } else if flat < ter_num + non_num { + Ok(TNT::Non(flat - ter_num)) + } else { + Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) + } + } + + /// Return true if and only if the non-terminal is nullable. + #[inline] + pub fn is_nullable(&self, non_terminal: usize) -> Result { + Ok(self + .firsts + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .contains(&None)) + } + + /// For a NON_TERMINAL, return an iterator that goes over the + /// nodes that are reachable from the non-terminal through an + /// empty transition of the nondeterministic finite automaton. + #[inline] + pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { + Ok(self + .first_nodes + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .iter()) + } + + /// Return a hash set that contains all nodes in the set of + /// left-closure regular languages that are added because of the + /// left-linear expansion. + pub fn left_closure_branches(&self) -> &HashSet { + &self.left_closure_branches + } + + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? + { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? + .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } + + /// Return the regular language of the left-linear closures of + /// non-terminals in the grammar. + /// + /// The resulting vector is guaranteed to be of the same length as + /// the number of non-terminals. + /// + /// The resulting regular language is not "self-contained". That + /// is to say, its terminals indices are packed indices and are + /// meaningless without the interpretation of the grammar. They + /// should be converted to a nondeterministic finite automaton and + /// then to its null closure later on. + pub fn left_closure(&self) -> Result>, Error> { + let non_len = self.non_num(); + + let mut result = Vec::with_capacity(non_len); + + for (n, rule) in self.rules.iter().enumerate() { + let regex = &rule.regex; + + let regex_root = if let Some(root) = regex.root() { + root + } else { + result.push(Default::default()); + + continue; + }; + + let regex_len = regex.len(); + + /// A convenient macro to retrieve the children from the + /// original regular expression with error propagation. + macro_rules! children { + ($n:expr) => { + regex + .children_of($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + /// A convenient macro to retrieve the label from the + /// original regular expression with error propagation. + macro_rules! label { + ($n:expr) => { + regex + .vertex_label($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + let parents = regex.parents_array().map_err(|e| match e { + nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), + nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), + _ => unreachable!(), + })?; + + use RegexType::*; + use TNT::*; + + let mut local_result: Vec> = Vec::with_capacity(regex_len * 2); + let mut builder = ALGBuilder::default(); + + /// Perform a depth-first copy + macro_rules! df_copy { + ($parent:expr, $index:expr) => { + match label!($index) { + Kleene | Plus | Optional | Or | Paren | Empty => { + let mut stack = vec![($parent, $index)]; + + while let Some((top_parent, top_index)) = stack.pop() { + let label = label!(top_index); + let label = match label { + Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), + _ => label, + }; + + local_result.push(label); + + let new = builder.add_vertex(); + + builder.add_edge(top_parent, new, ()).unwrap(); + + stack.extend(children!(top_index).map(|child| (new, child))); + } + } + Lit(remain_tnt) => { + local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); + let new = builder.add_vertex(); + builder.add_edge($parent, new, ()).unwrap(); + } + } + }; + } + + local_result.push(Or); + builder.add_vertex(); + + local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + let non_lit_index = builder.add_vertex(); + + builder.add_edge(0, non_lit_index, ()).unwrap(); + + // If this non-terminal is nullable, add an empty variant. + if self.is_nullable(n)? { + local_result.push(Empty); + let empty_index = builder.add_vertex(); + builder.add_edge(0, empty_index, ()).unwrap(); + } + + for first_node in self.first_nodes_of(n)?.copied() { + assert!(first_node < parents.len()); + + let tnt = match label!(first_node) { + Lit(tnt) => Lit(tnt), + _ => continue, + }; + + let mut parents_chain = { + let mut result = Vec::new(); + let mut stack = Vec::with_capacity(parents.len()); + + stack.push(first_node); + + while let Some(top) = stack.pop() { + assert!(top < parents.len()); + if let Some(parent) = parents.get(top).copied().unwrap() { + result.push(parent); + stack.push(parent.0); + } + } + + result.reverse(); + + result + }; + + if parents_chain.is_empty() { + local_result.push(tnt); + let lit_index = builder.add_vertex(); + builder.add_edge(0, lit_index, ()).unwrap(); + + continue; + } + + assert_eq!(parents_chain.first().unwrap().0, regex_root); + + // A different, "more local", root. + let mut local_root: usize; + + // Handle the direct parent + let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); + + match label!(parent_node) { + Kleene | Plus => { + // TODO: If parent_edge_index is 0, make a + // Plus node instead. + local_result.extend([Empty, tnt]); + + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + let iterator = children!(parent_node); + + for index in iterator.clone().skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + + Or => { + local_result.push(tnt); + local_root = builder.add_vertex(); + } + Optional | Empty => { + // If this path is taken, it should not be + // optional. + local_result.extend([Empty, tnt]); + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + for index in children!(parent_node).skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + } + Paren | Lit(_) => unreachable!(), + } + + // Handle successive parents + + for (node, edge_index) in parents_chain.into_iter() { + let node_type = label!(node); + + match node_type { + Kleene | Plus => { + // TODO: If edge_index is 0, then just + // make this a Plus node. + + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + + local_root = new_index; + + let iterator = children!(node); + + for index in iterator.clone().skip(edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + RegexType::Or => {} + RegexType::Optional | RegexType::Empty => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + local_root = new_index; + + for index in children!(node).skip(edge_index + 1) { + df_copy!(local_root, index); + } + } + RegexType::Paren | RegexType::Lit(_) => unreachable!(), + } + } + + builder.add_edge(0, local_root, ()).unwrap(); + } + + local_result.shrink_to_fit(); + + let graph = builder.build(); + + assert_eq!(graph.nodes_len(), local_result.len()); + + result.push( + DefaultRegex::new(graph, local_result) + .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, + ); + } + + assert_eq!(result.len(), non_len); + + Ok(result) + } + + /// Convert the regular language of left-linear closures to its + /// equivalent nondeterministic finite automaton. + /// + /// In the generation of the left-linear closure, the terminals + /// and non-terminals are packed into an unsigned integer. We + /// unpack them in converting to nondeterministic finite + /// automaton. + /// + /// The resulting nondeterministic finite automaton should be + /// transformed to its null-closure for use in our algorithm. + pub fn left_closure_to_nfa( + &self, + closure: &[DefaultRegex], + ) -> Result>, Error> { + let label_transform = |tnt| match tnt { + TNT::Ter(t) => { + let new_tnt = self.unpack_tnt(t).map_err(|e| match e { + Error::IndexOutOfBounds(index, bound) => { + graph::error::Error::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + })?; + + Ok(SoC::Carry(new_tnt)) + } + TNT::Non(n) => Ok(SoC::Sub(n)), + }; + + let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + + Ok(nfa) + } +} + +impl Display for Grammar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + assert_eq!(self.non.len(), self.rules.len()); + + for (nt, rule) in self.non.iter().zip(self.rules.iter()) { + write!(f, "{}: ", nt.name())?; + + writeln!( + f, + "{}", + rule.regex.to_string_with(|tnt| format!( + "({})", + self.name_of_tnt(tnt) + .unwrap_or_else(|_| format!("Unknown {tnt:?}")) + ))? + )?; + } + + Ok(()) + } +} + +// A helper module that provides some grammars for testing. +#[cfg(feature = "test-helper")] +pub mod test_grammar_helper; + +#[cfg(test)] +mod tests; diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs new file mode 100644 index 0000000..c236952 --- /dev/null +++ b/grammar/src/test_grammar_helper.rs @@ -0,0 +1,368 @@ +//! This module provides some grammars for testing. + +use super::*; +use nfa::{ + default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}, + DesRec, +}; +use std::fmt::Write; + +/// A helper function to compute the first sets of a grammar and +/// return the left-closure of that grammar. +pub fn new_closure_regex( + grammar: &mut Grammar, +) -> Result>, Box> { + grammar.compute_firsts()?; + + grammar.left_closure().map_err(Into::into) +} + +/// A function to scan the inputs. +fn scan_tnt( + parser: &DefaultRegParser, + input: &str, +) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + use TNT::*; + + let mut chars = input.chars(); + + let mut len = 1; + + while let Some(first) = chars.next() { + match first { + ' ' => { + // ignore whitespaces + len += 1; + } + '*' => return Ok(Some((len, Kleene, Right))), + '+' => return Ok(Some((len, Plus, Right))), + '?' => return Ok(Some((len, Optional, Right))), + '|' => return Ok(Some((len, Empty, Up))), + '(' => return Ok(Some((len, Or, Down))), + ')' => return Ok(Some((len, Paren, Up))), + 'T' => { + let mut name = String::new(); + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(t) = parser.query(&name, true) { + return Ok(Some((len, Lit(Ter(t)), Right))); + } else { + return Err(ParseError::InvalidCharacter(first)); + } + } + 'N' => { + let mut name = String::new(); + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(n) = parser.query(&name, false) { + return Ok(Some((len, Lit(Non(n)), Right))); + } else { + return Err(ParseError::InvalidCharacter(first)); + } + } + _ => { + return Err(ParseError::InvalidCharacter(first)); + } + } + } + + Ok(None) +} + +/// Return a simple testing grammar. +#[allow(dead_code)] +pub fn new_grammar() -> Result> { + let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("end".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("a", true); + regex_parser.add_tnt("b", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("end", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nstart?Nend*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that might serve as the grammar for my notes, +/// somehow. +#[allow(dead_code)] +pub fn new_notes_grammar() -> Result> { + let ter = vec![ + Terminal::new("NL".to_owned()), + Terminal::new("SP".to_owned()), + Terminal::new("CON".to_owned()), + Terminal::new("STAR".to_owned()), + Terminal::new("NOTE".to_owned()), + Terminal::new("PRICE".to_owned()), + Terminal::new("DIGIT".to_owned()), + ]; + let non = vec![ + Nonterminal("document".to_owned()), + Nonterminal("item".to_owned()), + Nonterminal("header".to_owned()), + Nonterminal("title".to_owned()), + Nonterminal("note".to_owned()), + Nonterminal("note-content".to_owned()), + Nonterminal("price".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("NL", true); + regex_parser.add_tnt("SP", true); + regex_parser.add_tnt("CON", true); + regex_parser.add_tnt("STAR", true); + regex_parser.add_tnt("note", true); + regex_parser.add_tnt("price", true); + regex_parser.add_tnt("DIGIT", true); + regex_parser.add_tnt("document", false); + regex_parser.add_tnt("item", false); + regex_parser.add_tnt("header", false); + regex_parser.add_tnt("title", false); + regex_parser.add_tnt("note", false); + regex_parser.add_tnt("notecontent", false); + regex_parser.add_tnt("price", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Nitem+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nheader Nprice?Nnote?", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("TSTAR?TSP Ntitle TNL (TSP|TNL)*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule4 = Rule { + regex: regex_parser + .parse("TCON+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule5 = Rule { + regex: regex_parser + .parse( + "Tnote Nnotecontent TNL (TSP|TNL)*", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule6 = Rule { + regex: regex_parser + .parse("TCON+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule7 = Rule { + regex: regex_parser + .parse( + "Tprice TSP TDIGIT+ TNL(TSP | TNL)+", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3, rule4, rule5, rule6, rule7]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that can express parentheses. +#[allow(dead_code)] +pub fn new_paren_grammar() -> Result> { + let ter = vec![ + Terminal::new("LEFT".to_owned()), + Terminal::new("RIGHT".to_owned()), + Terminal::new("A".to_owned()), + ]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("content".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("LEFT", true); + regex_parser.add_tnt("RIGHT", true); + regex_parser.add_tnt("A", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("content", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse( + "TLEFT Nstart TRIGHT | Ncontent Nstart | ", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TA +", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a left recursive grammar. +#[allow(dead_code)] +pub fn new_left_recursive_grammar() -> Result> { + let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("S".to_owned()), + Nonterminal("A".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("B", true); + regex_parser.add_tnt("C", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("S", false); + regex_parser.add_tnt("A", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("NA NS TC", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TB | Nstart", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("()", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a right recursive grammar. +#[allow(dead_code)] +pub fn new_right_recursive_grammar() -> Result> { + let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("S".to_owned()), + Nonterminal("A".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("B", true); + regex_parser.add_tnt("C", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("S", false); + regex_parser.add_tnt("A", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("NS TC NA|TB Nstart", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TB", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("NA|", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3]; + + Ok(Grammar::new(ter, non, rules)) +} +// TODO: more grammars diff --git a/grammar/src/tests/mod.rs b/grammar/src/tests/mod.rs new file mode 100644 index 0000000..55eedff --- /dev/null +++ b/grammar/src/tests/mod.rs @@ -0,0 +1,30 @@ +#[cfg(test)] +mod test_grammar_display { + use crate::test_grammar_helper::new_grammar; + + #[test] + fn test_display() -> Result<(), Box> { + new_grammar().map(|g| println!("{g}")) + } +} + +#[cfg(test)] +mod test_grammar_firsts { + use crate::test_grammar_helper::new_grammar; + use std::io::Write; + + #[test] + fn test_firsts() -> Result<(), Box> { + let mut grammar = new_grammar()?; + + grammar.compute_firsts()?; + + let mut lock = std::io::stdout().lock(); + + writeln!(lock, "grammar firsts: {:?}", grammar.firsts)?; + writeln!(lock, "grammar first nodes: {:?}", grammar.first_nodes).map_err(Into::into) + } +} + +#[cfg(test)] +mod test_grammar_left_closure; diff --git a/grammar/src/tests/test_grammar_left_closure.rs b/grammar/src/tests/test_grammar_left_closure.rs new file mode 100644 index 0000000..0bc9f4d --- /dev/null +++ b/grammar/src/tests/test_grammar_left_closure.rs @@ -0,0 +1,272 @@ +use crate::test_grammar_helper::*; +use crate::*; +use nfa::Nfa; +use std::{ + collections::HashSet, + io::{stdout, Write}, +}; + +#[test] +fn test_regex() -> Result<(), Box> { + let mut grammar = new_grammar()?; + + let vec_of_regexps = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + writeln!(lock, "grammar firsts: {:?}", grammar.firsts)?; + writeln!(lock, "grammar first nodes: {:?}", grammar.first_nodes)?; + writeln!(lock, "grammar:")?; + writeln!(lock, "{grammar}")?; + + for regex in vec_of_regexps.into_iter().skip(1) { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| format!("{tnt}"))? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + } + + Ok(()) +} + +// We ignore this test by default as it is possibly involved with +// printing to a graphviz file. +#[ignore] +#[test] +fn test_nfa() -> Result<(), Box> { + let mut grammar = new_notes_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("H({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + } + + grammar + .left_closure_to_nfa(&closure) + .map(|_| ()) + .map_err(Into::into) + + // let _nfa = grammar.left_closure_to_nfa(&closure)?; + + // writeln!(lock, "Not printing nfa to nfa.gv")?; + + // nfa.print_viz("nfa.gv").map_err(Into::into) + + // Ok(()) +} + +#[test] +#[ignore] +fn test_remove_epsilon() -> Result<(), Box> { + let mut lock = stdout().lock(); + + let mut grammar = new_paren_grammar()?; + + writeln!(lock, "grammar:")?; + writeln!(lock, "{grammar}")?; + + let closure = new_closure_regex(&mut grammar)?; + + let mut accumulator_value: usize = 0; + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + writeln!(lock, "regex len = {}", regex.nodes_len())?; + writeln!(lock, "offset = {accumulator_value}")?; + + accumulator_value += regex.nodes_len(); + } + + writeln!(lock, "total = {accumulator_value}")?; + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.print_viz("nfa_orig.gv")?; + + nfa.remove_epsilon(|label| label.is_none())?; + + nfa.print_viz("nfa_no_epsilon.gv")?; + + Ok(()) +} + +#[test] +#[ignore] +fn test_remove_dead() -> Result<(), Box> { + let mut grammar = new_paren_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + let mut accumulator = 0usize; + let mut accumulators = Vec::with_capacity(closure.len() + 1); + accumulators.push(accumulator); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + + accumulator += regex.nodes_len() * 2; + + accumulators.push(accumulator); + } + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.print_viz("nfa_orig.gv")?; + + nfa.remove_epsilon(|label| label.is_none())?; + + let accumulators: HashSet = accumulators.into_iter().collect(); + + println!("accumulators = {accumulators:?}"); + + let grammar_reserve_node = |node| accumulators.contains(&node); + + nfa.remove_dead(grammar_reserve_node)?; + + nfa.print_viz("nfa_no_dead.gv")?; + + Ok(()) +} + +#[test] +#[ignore] +fn test_nulling() -> Result<(), Box> { + let mut grammar = new_left_recursive_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + let mut accumulators = Vec::with_capacity(closure.len() + 1); + accumulators.push(0); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("H({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + + accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); + } + + write!(lock, "nullables:")?; + let mut first_time = true; + for i in 0..(grammar.non_num()) { + if grammar.is_nullable(i)? { + if first_time { + write!(lock, " ")?; + } else { + write!(lock, ", ")?; + } + write!(lock, " {i}")?; + first_time = false; + } + } + writeln!(lock)?; + + let accumulators: HashSet = accumulators.into_iter().collect(); + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.nulling(|label| { + if let Some(label) = *label { + match label { + TNT::Ter(_) => false, + // Panics if a non-terminal references an invalid node + // here. + TNT::Non(n) => grammar.is_nullable(n).unwrap(), + } + } else { + true + } + })?; + + let grammar_reserve_nodes = |node| accumulators.contains(&node); + + writeln!(lock, "accumulators are {accumulators:?}")?; + + nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_dead(grammar_reserve_nodes)?; + + writeln!(lock, "Printing nfa to nfa.gv")?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) +} diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs index 6c1dcd0..ba9afb8 100644 --- a/graph/src/adlist.rs +++ b/graph/src/adlist.rs @@ -155,6 +155,12 @@ impl Builder for ALGBuilder { self.nodes.len() - 1 } + #[inline] + fn add_vertices(&mut self, n: usize) { + self.nodes + .extend(std::iter::repeat_with(ALNode::default).take(n)); + } + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { let nodes_len = self.nodes.len(); diff --git a/graph/src/adset.rs b/graph/src/adset.rs index c225986..adf2aaf 100644 --- a/graph/src/adset.rs +++ b/graph/src/adset.rs @@ -190,6 +190,12 @@ impl Builder for ASGBuilder { self.nodes.len() - 1 } + #[inline] + fn add_vertices(&mut self, n: usize) { + self.nodes + .extend(std::iter::repeat_with(ASNode::default).take(n)); + } + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { let nodes_len = self.nodes.len(); diff --git a/graph/src/builder.rs b/graph/src/builder.rs index ce85cce..9ab5895 100644 --- a/graph/src/builder.rs +++ b/graph/src/builder.rs @@ -27,6 +27,9 @@ pub trait Builder: Default { /// Add a vertex without children. fn add_vertex(&mut self) -> usize; + /// Add a number of vertices at the same time. + fn add_vertices(&mut self, n: usize); + /// Add an edge from the source to the target. fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error>; diff --git a/graph/src/labelled.rs b/graph/src/labelled.rs index d0a02d0..6341787 100644 --- a/graph/src/labelled.rs +++ b/graph/src/labelled.rs @@ -142,10 +142,11 @@ impl Graph for DLGraph { match self.nodes.get(source) { Some(source_node) => { if self.nodes.get(target).is_none() { - return Err(Error::IndexOutOfBounds(target, self.nodes.len())); + Err(Error::IndexOutOfBounds(target, self.nodes.len())) + } else { + Ok(source_node.by_target.contains_key(&target) + && !source_node.by_target.get(&target).unwrap().is_empty()) } - - Ok(source_node.by_target.contains_key(&target)) } None => Err(Error::IndexOutOfBounds(source, self.nodes.len())), } @@ -198,7 +199,7 @@ impl Graph for DLGraph { /// A delegation of iterators. /// /// This is used to avoid a boxed pointer to an iterator. -#[derive(Default, Debug)] +#[derive(Default, Debug, Clone)] pub struct LabelIndexIter<'a> { iter: Option>>, } @@ -279,25 +280,81 @@ impl<'a, T> Iterator for LabelIter<'a, T> { } } +/// This is used to avoid a boxed pointer to an iterator. +#[derive(Debug)] +pub struct EdgeLabelIter<'a, T> { + iter: Option>, +} + +impl<'a, T> Default for EdgeLabelIter<'a, T> { + #[inline] + fn default() -> Self { + Self { iter: None } + } +} + +impl<'a, T: Copy + Clone> ExactSizeIterator for EdgeLabelIter<'a, T> { + #[inline] + fn len(&self) -> usize { + if let Some(iter) = &self.iter { + iter.len() + } else { + 0 + } + } +} + +impl<'a, T: Copy + Clone> EdgeLabelIter<'a, T> { + fn new(iter: Iter<'a, T>) -> Self { + Self { iter: Some(iter) } + } +} + +impl<'a, T: Copy + Clone> Iterator for EdgeLabelIter<'a, T> { + type Item = T; + + #[inline] + fn next(&mut self) -> Option { + if let Some(iter) = &mut self.iter { + iter.next().copied() + } else { + None + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + if let Some(iter) = &self.iter { + iter.size_hint() + } else { + (0, Some(0)) + } + } +} + impl LabelGraph for DLGraph { type Iter<'a> = LabelIndexIter<'a> where T: 'a; type LabelIter<'a> = LabelIter<'a,T> where T: 'a; - fn edge_label(&self, source: usize, target: usize) -> Result, Error> { + type EdgeLabelIter<'a> = EdgeLabelIter<'a,T> + where + Self: 'a, + T: 'a + Copy + Clone; + + fn edge_label(&self, source: usize, target: usize) -> Result, Error> { if self.has_edge(source, target)? { - Ok(self - .nodes - .get(source) - .unwrap() - .by_target - .get(&target) - .unwrap() - .iter() - .copied() - .collect()) + Ok(EdgeLabelIter::new( + self.nodes + .get(source) + .unwrap() + .by_target + .get(&target) + .unwrap() + .iter(), + )) } else { - Ok(Vec::new()) + Ok(Default::default()) } } @@ -335,11 +392,11 @@ impl LabelGraph for DLGraph { return Err(Error::IndexOutOfBounds(target, nodes_len)); } - if let Some(labels) = node.by_target.get(&target) { - Ok(labels.contains(label)) - } else { - Ok(false) - } + Ok(node + .by_target + .get(&target) + .map(|labels| labels.contains(label)) + .unwrap_or(false)) } None => Err(Error::IndexOutOfBounds(node_id, nodes_len)), } @@ -443,6 +500,12 @@ impl Builder for DLGBuilder { self.nodes.len() - 1 } + #[inline] + fn add_vertices(&mut self, n: usize) { + self.nodes + .extend(std::iter::repeat_with(Default::default).take(n)); + } + fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error> { let nodes_len = self.nodes.len(); @@ -541,14 +604,29 @@ impl Builder for DLGBuilder { // If after removal no labels remain for the target, // we remove the edge entirely, to avoid false empty // edges. - if source_node.flat.is_empty() { - source_node.by_target.remove(&target); - for label in to_remove.iter() { - source_node.by_label.remove(label); + if let Some(set) = source_node.by_target.get(&target) { + if set.is_empty() { + source_node.by_target.remove(&target); } } + for label in to_remove.iter() { + if let Some(set) = source_node.by_label.get(label) { + if set.is_empty() { + source_node.by_label.remove(label); + } + } + } + + // if source_node.flat.is_empty() { + // source_node.by_target.remove(&target); + + // for label in to_remove.iter() { + // source_node.by_label.remove(label); + // } + // } + let new_children_set: Set<(T, usize)> = source_node.flat.iter().copied().collect(); // When source_node is in use, we cannot borrow self @@ -639,10 +717,7 @@ mod label_test { ); // testing edge_label - assert_eq!( - graph.edge_label(5, 2)?.into_iter().collect::>(), - set!(3) - ); + assert_eq!(graph.edge_label(5, 2)?.collect::>(), set!(3)); assert!(matches!( graph.edge_label(6, 2), Err(Error::IndexOutOfBounds(6, 6)) @@ -683,8 +758,6 @@ mod label_test { } } -// TODO: Test print_viz - #[cfg(test)] mod test_dlgraph_builder { use super::*; diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 2d23af3..d4f6d7c 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -233,6 +233,12 @@ pub trait LabelGraph: Graph { Self: 'a, T: 'a; + /// A type that iterates over labels of an edge. + type EdgeLabelIter<'a>: Iterator + 'a + where + Self: 'a, + T: 'a; + #[inline] /// Return the label of a vertex or an error if the node is /// invalid. @@ -247,15 +253,9 @@ pub trait LabelGraph: Graph { } } - #[inline] /// Return the label of an edge or an error if some node is /// invalid. - /// - /// The default implementation always returns an empty vector for - /// valid nodes. - fn edge_label(&self, source: usize, target: usize) -> Result, Error> { - self.has_edge(source, target).map(|_| Vec::new()) - } + fn edge_label(&self, source: usize, target: usize) -> Result, Error>; /// Return an iterator of edges out of a node, whose associated /// label is as given. diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index b9ee398..115bea5 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,5 +1,5 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`NFA`][super::Nfa] and another that implements the trait //! [`Regex`][super::Regex]. pub mod nfa; diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 229ba4d..e642218 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,17 +1,8 @@ //! This file provides a default implementation of NFA. -// TODO: The current focus of the project is to understand the growth -// rate of the algorithm, to know whether I made a mistake in the -// previous iteration of the implementation, or the algorithm is not -// as fast as the author estimated, which is not quite likely, of -// course. -// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. - use graph::{ builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, - LabelGraph, + LabelExtGraph, LabelGraph, }; use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; @@ -32,6 +23,11 @@ impl Default for DefaultNFA { } } +// Some boiler-plate delegation implementations. +// +// I deliberately avoid using [`Deref`][std::ops::Deref] here, as I do +// not want to dereference an NFA to a Graph. + impl Graph for DefaultNFA { type Iter<'a> = as Graph>::Iter<'a> where T: 'a; @@ -81,6 +77,11 @@ impl LabelGraph for DefaultNFA { type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; + type EdgeLabelIter<'a> = as LabelGraph>::EdgeLabelIter<'a> + where + Self: 'a, + T: 'a; + #[inline] fn vertex_label(&self, node_id: usize) -> Result, GError> { if self.has_node(node_id) { @@ -91,7 +92,7 @@ impl LabelGraph for DefaultNFA { } #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result, GError> { + fn edge_label(&self, source: usize, target: usize) -> Result, GError> { self.graph.edge_label(source, target) } @@ -115,12 +116,24 @@ impl LabelGraph for DefaultNFA { } } +impl LabelExtGraph for DefaultNFA { + #[inline] + fn extend(&mut self, edges: impl IntoIterator) -> Result { + self.graph.extend(edges) + } +} + +// TODO: Define a type for the labels of DefaultNFA. + impl Nfa for DefaultNFA { type FromRegex = DefaultNFA; + // Reminder: This is an adopted version of Thompson's + // construction. fn to_nfa( regexps: &[impl Regex>], sub_pred: impl Fn(T) -> Result, Error>, + default: Option, ) -> Result>, Error> { let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); @@ -128,7 +141,7 @@ impl Nfa for DefaultNFA { return Ok(Default::default()); } - // We reserve two more rooms for later uses. + // We reserve two more rooms for the default edge. let nfa_len = total_regexps_len * 2 + 2; let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); @@ -141,20 +154,18 @@ impl Nfa for DefaultNFA { builder.add_vertex(); } + // add a default edge + builder.add_edge(nfa_len - 2, nfa_len - 1, DOption(default))?; + let accumulators: Vec = { - let mut result = Vec::with_capacity(regexps.len() + 1); + let mut result = Vec::with_capacity(regexps.len()); result.push(0); - let mut accumulator = 0; - - for regexp in regexps.iter() { - accumulator += regexp.nodes_len() * 2; - result.push(accumulator); + for regexp in regexps.iter().take(regexps.len() - 1) { + result.push(result.last().unwrap() + regexp.nodes_len() * 2); } - result.pop(); - result }; @@ -313,9 +324,8 @@ impl Nfa for DefaultNFA { SoC::Sub(sub_non) => { // a non-terminal - let sub_offset = get_offset_safe!(sub_non); - let sub_nfa_start = sub_offset + 2 * sub_non; - let sub_nfa_end = sub_offset + 2 * sub_non + 1; + let sub_nfa_start = get_offset_safe!(sub_non); + let sub_nfa_end = sub_nfa_start + 1; builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; @@ -336,14 +346,102 @@ impl Nfa for DefaultNFA { Ok(DefaultNFA { graph }) } - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() + fn remove_epsilon bool>(&mut self, f: F) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut updated = true; + + let nodes_len = self.nodes_len(); + + while updated { + updated = false; + + let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); + + for source in builder.nodes() { + for (label, target_iter) in builder.labels_of(source)? { + if f(*label) { + // empty label found + for target in target_iter { + for (label, children_iter) in builder.labels_of(target)? { + for child in children_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + } + } + + for (source, target, label) in to_add.into_iter() { + builder.add_edge(source, target, label)?; + } + } + + // Then remove all epsilon edges + + let sources_and_targets: Vec<_> = builder.edges().collect(); + + for (source, target) in sources_and_targets.into_iter() { + builder.remove_edge(source, target, &f)?; + } + + self.graph = builder.build(); + + Ok(()) } - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error> { + let mut builder = self.graph.builder_ref(); + + let mut changed = true; + + // Use a counting vector to avoid calculating which nodes are + // dead at each iteration. + let mut target_counts: Vec = + std::iter::repeat(0).take(self.graph.nodes_len()).collect(); + + for (_, target) in self.graph.edges() { + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? += 1; + } + + while changed { + changed = false; + + for node in self.nodes() { + let deadp = !matches!(target_counts.get(node).copied(), Some(n) if n > 0); + + if deadp && !reserve(node) { + let to_remove: Vec = builder.children_of(node)?.collect(); + + if !to_remove.is_empty() { + changed = true; + + for target in to_remove { + builder.remove_edge(node, target, |_| true)?; + *target_counts + .get_mut(target) + .ok_or(Error::UnknownNode(target))? -= 1; + } + } + } + } + } + + self.graph = builder.build(); + + Ok(()) } + // REVIEW: Combine nulling and remove_epsilon into the same + // method, or not. + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { let mut updated = true; let mut builder = self.graph.builder_ref(); @@ -396,7 +494,6 @@ impl Display for DefaultNFA { #[cfg(test)] mod test_to_nfa { - #![allow(unused_imports)] use super::*; use crate::SoC; @@ -446,8 +543,11 @@ mod test_to_nfa { println!("regex = {regex}"); - let nfa: DefaultNFA> = - DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + let nfa: DefaultNFA> = DefaultNFA::to_nfa( + &[regex], + |label| Ok(SoC::Carry(label)), + Some(char::default()), + )?; nfa.print_viz("nfa.gv")?; diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index 2670e32..c8ad370 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -260,9 +260,7 @@ impl DefaultRegex { stack.push(Unseen(0)); while let Some(top) = stack.pop() { - let node_type = types[top.index()]; - - // TODO: Do not use unwrap here. + let node_type = types.get(top.index()).unwrap(); match node_type { RegexType::Kleene => { @@ -350,8 +348,12 @@ impl DefaultRegex { .map(Unseen) .rev(), ); + + if self.graph.is_empty_node(top.index()).unwrap() { + write!(result, "ε")?; + } } - RegexType::Lit(label) => write!(result, "{}", f(label))?, + RegexType::Lit(label) => write!(result, "{}", f(*label))?, } } @@ -452,7 +454,15 @@ impl Display for ParseError { } } -impl std::error::Error for ParseError {} +impl std::error::Error for ParseError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let ParseError::Graph(gerr) = self { + Some(gerr) + } else { + None + } + } +} impl From for ParseError { fn from(ge: GError) -> Self { diff --git a/nfa/src/error.rs b/nfa/src/error.rs index ad75077..7160555 100644 --- a/nfa/src/error.rs +++ b/nfa/src/error.rs @@ -1,6 +1,6 @@ //! This file implements the error type for the crate. -use graph::error::Error as GError; +use graph::error::Error as GraphError; use core::fmt::{Display, Formatter}; @@ -17,7 +17,7 @@ pub enum Error { /// There is no root in the underlying regular expression. NoRoot, /// This error comes from some underlying graph operation. - Graph(GError), + Graph(GraphError), /// A cycle is found when constructing regular expressions. Cycle, } @@ -34,10 +34,18 @@ impl Display for Error { } } -impl std::error::Error for Error {} +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let Self::Graph(gerr) = self { + Some(gerr) + } else { + None + } + } +} -impl From for Error { - fn from(e: GError) -> Self { +impl From for Error { + fn from(e: GraphError) -> Self { Self::Graph(e) } } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index b1d62b3..31bd69a 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -14,7 +14,7 @@ use core::fmt::Display; use std::ops::{Deref, DerefMut}; -use graph::{Graph, GraphLabel, LabelGraph}; +use graph::{Graph, GraphLabel, LabelExtGraph}; use error::Error; @@ -52,14 +52,14 @@ pub trait Regex: Graph + Display + Clone { } } - // TODO: add functions that determine if certain "positions" in a + // TODO: Add functions that determine if certain "positions" in a // regular language satisfy some special properties, like at the // end of a Kleene star, or at the end of a regular language, et // cetera. These might be needed later. } -/// Since Option does not inherit the Display from T, we wrap it to -/// provide an automatic implementation of Display. +/// Since `Option` does not inherit the `Display` from `T`, we wrap +/// it to provide an automatic implementation of `Display`. #[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] pub struct DOption(Option); @@ -120,42 +120,98 @@ pub enum SoC { /// The expected behvaiour of a nondeterministic finite automaton. /// /// Every NFA is a special labelled graph. -pub trait Nfa: LabelGraph { +pub trait Nfa: LabelExtGraph { + // TODO: This trait should have a type for the labels. + /// Remove all empty transitions from the nondeterministic finite /// automaton. - fn remove_epsilon(&mut self) -> Result<(), Error>; + fn remove_epsilon(&mut self, f: F) -> Result<(), Error> + where + F: Fn(T) -> bool; /// Return a state-minimal NFA equivalent with the original one. /// /// This is not required. It is just to allow me to experiment /// with NFA optimization algorithms. - fn minimize(&self) -> Result { + fn minimize(&self) -> Result, Error> { Err(Error::UnsupportedOperation) } + /// Check every node or edge by a given predicate. + /// + /// This should also verify that every node and edge has correct + /// indices, so that we can safely use `unwrap` later. A + /// consequence is that, if one only wants to check the validity + /// of nodes and edges, one can pass a function that always + /// returns `true`. + #[inline] + fn labels_satisfy(&self, f: impl Fn(T) -> bool) -> Result { + let nodes_len = self.nodes_len(); + let mut result = true; + + for node in self.nodes() { + for (label, children_iter) in self.labels_of(node)? { + for child in children_iter { + if child >= nodes_len { + dbg!(node, label); + return Err(graph::error::Error::IndexOutOfBounds(child, nodes_len).into()); + } + } + + // NOTE: Avoid executing `f` if `result` is already + // false. But still proceed in verifying that nodes + // and edges are correct: the correctness of nodes and + // edges is more important than the function execution + // results, as the former directly affects the safety + // of many algorithms. + if result && !f(*label) { + dbg!(node, label); + result = false; + } + } + } + + Ok(result) + } + /// When we convert a regular expression to a nondeterministic /// finite automaton, the label should be optional, so we use a /// different type for the result type. type FromRegex; - /// Build a nondeterministic finite automaton out of a set of - /// regular expressions. + /// Build a nondeterministic finite automaton out of a set + /// `regexps` of regular expressions. + /// + /// The `sub_pred` is a predicate function that returns an + /// indication whether to carry the value around or to substitute + /// by another regular language in the given set. + /// + /// The `default` parameter specifies the label of a default edge, + /// that goes from a node to another, both of which are not + /// associated with the given regular languages. /// - /// The SUB_PRED is a predicate function that returns an optional - /// index for a label. If it returns some index, then this means - /// we should substitute the index-th regular expression in the - /// set, whereever that label occurs in the set of regular - /// expressions. + /// This function should perform Thompson's construction, + /// basically. fn to_nfa( regexps: &[impl Regex>], + // TODO: The transformation should produce more information. sub_pred: impl Fn(T) -> Result, Error>, + default: Option, ) -> Result>, Error>; /// Remove all dead states from the nondeterministic finite /// automaton. /// - /// A state is dead if there are no edges going to the state. - fn remove_dead(&mut self) -> Result<(), Error>; + /// A state is dead if there are no edges going to the state, and + /// if it is not reserved. + /// + /// # Note + /// + /// Actually an implementation is allowed to just remove all edges + /// out of the dead nodes. Then these nodes are considered gone + /// from the graph, and we don't need to re-index the existing + /// edges. We can call this "a poor man's removal of nodes". + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), Error>; /// For each edge from A to B whose edge is considered nullable by /// a function `f`, and for every edge from B to C, add an edge @@ -169,6 +225,3 @@ pub trait Nfa: LabelGraph { pub mod default; pub mod desrec; - -#[cfg(test)] -mod nfa_tests {} diff --git a/receme/src/lib.rs b/receme/src/lib.rs index be1f028..d9e6249 100644 --- a/receme/src/lib.rs +++ b/receme/src/lib.rs @@ -48,7 +48,7 @@ pub mod ralgebra; // The following modules are for default implementations. pub mod tree; -// TODO: benchmarks +// REVIEW: Do we really need this crate? #[cfg(test)] mod test_expr_tree { diff --git a/receme/src/tree.rs b/receme/src/tree.rs index eb64f31..521f913 100644 --- a/receme/src/tree.rs +++ b/receme/src/tree.rs @@ -365,4 +365,4 @@ where } } -// TODO: Para, Apo, Histo, and Futu await us. +// REVIEW: Para, Apo, Histo, and Futu await us. diff --git a/semiring/Cargo.toml b/semiring/Cargo.toml new file mode 100644 index 0000000..0a82e32 --- /dev/null +++ b/semiring/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "semiring" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/semiring/src/lib.rs b/semiring/src/lib.rs new file mode 100644 index 0000000..7d12d9a --- /dev/null +++ b/semiring/src/lib.rs @@ -0,0 +1,14 @@ +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} -- cgit v1.2.3-18-g5258