From 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 23 Dec 2022 00:36:31 +0800 Subject: renaming core to chain and some other changes Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations. --- Cargo.toml | 13 +- benches/bench_receme.rs | 293 ----------- chain/Cargo.toml | 10 + chain/src/grammar.rs | 1247 ++++++++++++++++++++++++++++++++++++++++++++++ chain/src/lib.rs | 25 + chain/src/plan.org | 94 ++++ graph/src/adlist.rs | 242 ++++++++- graph/src/adset.rs | 216 +++++++- graph/src/builder.rs | 53 ++ graph/src/error.rs | 9 +- graph/src/labelled.rs | 390 ++++++++++++++- graph/src/lib.rs | 133 ++++- nfa/src/default/nfa.rs | 374 +++++++++++++- nfa/src/default/regex.rs | 313 ++++++++++-- nfa/src/desrec.rs | 22 +- nfa/src/error.rs | 11 +- nfa/src/lib.rs | 94 +++- repcore/Cargo.toml | 8 - repcore/src/grammar.rs | 59 --- repcore/src/lib.rs | 20 - repcore/src/plan.org | 59 --- viz/Cargo.toml | 9 + viz/src/lib.rs | 31 ++ 23 files changed, 3187 insertions(+), 538 deletions(-) delete mode 100644 benches/bench_receme.rs create mode 100644 chain/Cargo.toml create mode 100644 chain/src/grammar.rs create mode 100644 chain/src/lib.rs create mode 100644 chain/src/plan.org create mode 100644 graph/src/builder.rs delete mode 100644 repcore/Cargo.toml delete mode 100644 repcore/src/grammar.rs delete mode 100644 repcore/src/lib.rs delete mode 100644 repcore/src/plan.org create mode 100644 viz/Cargo.toml create mode 100644 viz/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index e672e74..77de27f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,24 +13,13 @@ rust-version = "1.65" # testing the new resolver, even though this has no dependencies ;p [workspace] -members = ["graph", "receme", "nfa", "repcore"] +members = ["graph", "receme", "nfa", "chain", "viz"] resolver = "2" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -receme = { path = "receme" } - -[dev-dependencies] -criterion = "0.4" [features] default = [] tokenizer = [] - -[profile.bench] -debug = true - -[[bench]] -name = "bench_receme" -harness = false diff --git a/benches/bench_receme.rs b/benches/bench_receme.rs deleted file mode 100644 index c9583a9..0000000 --- a/benches/bench_receme.rs +++ /dev/null @@ -1,293 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -use receme::{ - catana::{Ana, Cata}, - functor::Functor, - hylo::Hylo, - tree::{DFTree, TEStrategy, Tree, TreeIndex}, -}; - -#[derive(Debug, Clone)] -enum Expr { - Add(T, T), - Lit(isize), -} - -impl Functor for Expr { - type Target = Expr; - - fn fmap(self, mut f: impl FnMut(T) -> S) -> Self::Target { - match self { - Expr::Add(a, b) => Expr::Add(f(a), f(b)), - Expr::Lit(value) => Expr::Lit(value), - } - } -} - -#[derive(Debug, Clone)] -enum ExBox { - Add(Box, Box), - Lit(isize), -} - -impl ExBox { - fn lit(value: isize) -> Self { - Self::Lit(value) - } - - fn add(a: ExBox, b: ExBox) -> Self { - Self::Add(Box::new(a), Box::new(b)) - } -} - -pub fn bench(c: &mut Criterion) { - fn construct_tree() -> Tree> { - let strategy: TEStrategy = TEStrategy::UnsafeArena; - - let elements: Vec> = vec![ - Expr::Add(1, 2).fmap(TreeIndex::new), - Expr::Lit(1), - Expr::Add(3, 4).fmap(TreeIndex::new), - Expr::Lit(3), - Expr::Add(5, 6).fmap(TreeIndex::new), - Expr::Lit(10), - Expr::Lit(-14), - ]; - - Tree::new(elements, strategy) - } - - fn construct_box_tree() -> Box { - Box::new(ExBox::add( - ExBox::lit(1), - ExBox::add(ExBox::lit(3), ExBox::add(ExBox::lit(10), ExBox::lit(-14))), - )) - } - - let tree = construct_tree(); - - let safe_tree = { - let mut tree = tree.clone(); - tree.set_strategy(Default::default()); - tree - }; - - let depth_first_tree = { - let mut tree = tree.clone(); - tree.set_strategy(TEStrategy::DepthFirst); - tree - }; - - let boxtree = construct_box_tree(); - - c.bench_function("bench cata", |b| { - b.iter(|| { - let result = tree.clone().cata(|expr: Expr| match expr { - Expr::Add(a, b) => a + b, - Expr::Lit(v) => v, - }); - - black_box(result); - }) - }); - - c.bench_function("bench cata safe", |b| { - b.iter(|| { - let result = safe_tree.clone().cata(|expr: Expr| match expr { - Expr::Add(a, b) => a + b, - Expr::Lit(v) => v, - }); - - black_box(result); - }) - }); - - c.bench_function("bench cata depth first", |b| { - b.iter(|| { - let result = depth_first_tree - .clone() - .cata(|expr: Expr| match expr { - Expr::Add(a, b) => a + b, - Expr::Lit(v) => v, - }); - - black_box(result); - }) - }); - - c.bench_function("bench cata loop", |b| { - b.iter(|| { - let tree = tree.clone(); - - let mut stack: Vec> = vec![Ok(0)]; - let mut result_stack: Vec = vec![]; - - while let Some(top) = stack.pop() { - let top_is_ok_p = top.is_ok(); - - let top = match top { - Ok(top) => top, - Err(top) => top, - }; - - let top_node = tree.nth(top).unwrap(); - - match top_node { - Expr::Add(a, b) => { - if top_is_ok_p { - stack.push(Err(top)); - stack.push(Ok(**b)); - stack.push(Ok(**a)); - } else { - let a = result_stack.pop().unwrap(); - let b = result_stack.pop().unwrap(); - - result_stack.push(a + b); - } - } - Expr::Lit(v) => result_stack.push(*v), - } - } - - let result = result_stack.pop().unwrap(); - - black_box(result); - }) - }); - - c.bench_function("bench ana box loop", |b| { - b.iter(|| { - let boxtree = boxtree.clone(); - - let mut elements = vec![]; - - let mut stack = vec![boxtree]; - - while let Some(bt) = stack.pop() { - match *bt { - ExBox::Add(a, b) => { - let len = elements.len(); - - elements.push(Expr::Add(TreeIndex::new(len + 1), TreeIndex::new(len + 2))); - - stack.push(b); - stack.push(a); - } - ExBox::Lit(v) => elements.push(Expr::Lit(v)), - } - } - - let tree: Tree> = Tree::new(elements, Default::default()); - - black_box(tree); - }) - }); - - c.bench_function("bench ana boxed", |b| { - b.iter(|| { - let boxtree = boxtree.clone(); - - let tree = Tree::ana(boxtree, |bt: Box| match *bt { - ExBox::Add(a, b) => Expr::Add(a, b), - ExBox::Lit(v) => Expr::Lit(v), - }); - - black_box(tree); - }) - }); - - c.bench_function("bench ana depth first boxed", |b| { - b.iter(|| { - let boxtree = boxtree.clone(); - - let tree = DFTree::ana(boxtree, |bt: Box| match *bt { - ExBox::Add(a, b) => Expr::Add(a, b), - ExBox::Lit(v) => Expr::Lit(v), - }); - - black_box(tree); - }) - }); - - c.bench_function("bench hylo", |b| { - b.iter(|| { - let boxtree = boxtree.clone(); - - black_box(Tree::hylo( - boxtree, - |expr| match expr { - Expr::Add(a, b) => a + b, - Expr::Lit(v) => v, - }, - |bt: Box| match *bt { - ExBox::Add(a, b) => Expr::Add(a, b), - ExBox::Lit(v) => Expr::Lit(v), - }, - )) - }) - }); - - c.bench_function("bench hylo loop", |b| { - b.iter(|| { - let boxtree = boxtree.clone(); - - let mut elements = vec![]; - - let mut stack = vec![boxtree]; - - while let Some(bt) = stack.pop() { - match *bt { - ExBox::Add(a, b) => { - let len = elements.len(); - - elements.push(Expr::Add(TreeIndex::new(len + 1), TreeIndex::new(len + 2))); - - stack.push(b); - stack.push(a); - } - ExBox::Lit(v) => elements.push(Expr::Lit(v)), - } - } - - let tree: Tree> = Tree::new(elements, Default::default()); - - let mut stack: Vec> = vec![Ok(0)]; - let mut result_stack: Vec = vec![]; - - while let Some(top) = stack.pop() { - let top_is_ok_p = top.is_ok(); - let top = match top { - Ok(top) => top, - Err(top) => top, - }; - - let top_node = tree.nth(top).unwrap(); - - match top_node { - Expr::Add(a, b) => { - if top_is_ok_p { - stack.push(Err(top)); - stack.push(Ok(**b)); - stack.push(Ok(**a)); - } else { - let a = result_stack.pop().unwrap(); - let b = result_stack.pop().unwrap(); - - result_stack.push(a + b); - } - } - Expr::Lit(v) => result_stack.push(*v), - } - } - - let result = result_stack.pop().unwrap(); - - assert_eq!(result, 0); - - black_box(result); - }) - }); -} - -criterion_group!(benches, bench); -criterion_main!(benches); diff --git a/chain/Cargo.toml b/chain/Cargo.toml new file mode 100644 index 0000000..32eed8b --- /dev/null +++ b/chain/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "chain" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nfa = { path = "../nfa" } +graph = { path = "../graph" } \ No newline at end of file diff --git a/chain/src/grammar.rs b/chain/src/grammar.rs new file mode 100644 index 0000000..287fbca --- /dev/null +++ b/chain/src/grammar.rs @@ -0,0 +1,1247 @@ +#![warn(missing_docs)] +//! This file implements the extected behaviours of grammars. + +// NOTE: We shall first start with a parser that works at the level of +// characters. The purpose is to first experiment with the workings +// and the performance of the algorithms, before optimising by using +// regular expressions to classify inputs into tokens. In other +// words, the current focus is not on the optimisations, whereas +// scanners are for optimisations only, so to speak. + +#![allow(unused_imports)] +use nfa::{ + default::{ + nfa::DefaultNFA, + regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + }, + DOption, DesRec, Nfa, Regex, SoC, +}; + +use graph::{adlist::ALGBuilder, builder::Builder, Graph}; + +use std::{ + collections::HashSet, + fmt::{Display, Write}, +}; + +/// The type of a terminal. +/// +/// For the time being this is a wrapper around a string, but in the +/// future it may hold more information of scanners. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Terminal { + // If we want to use scanners, per chance add them as a new field + // here. + name: String, +} + +impl Terminal { + /// Create a terminal with the given name. + #[inline] + pub fn new(name: String) -> Self { + Self { name } + } + + /// Return the name of the terminal. + #[inline] + pub fn name(&self) -> &str { + &self.name + } +} + +/// The type of a non-terminal. +/// +/// This is just a wrapper around a string. +#[derive(Debug, Clone)] +pub struct Nonterminal(String); + +impl Nonterminal { + /// Return the name of the nonterminal. + /// + /// Just to improve readability. + #[inline] + pub fn name(&self) -> &str { + &self.0 + } +} + +/// The type of a terminal or a non-terminal. +/// +/// Only an index is stored here. Actual data are stored in two other +/// arrays. +#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum TNT { + /// Terminal variant + Ter(usize), + /// Nonterminal variant + Non(usize), +} + +impl Display for TNT { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TNT::Ter(t) => write!(f, "T({t})"), + TNT::Non(n) => write!(f, "N({n})"), + } + } +} + +/// Errors related to grammar operations. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum Error { + /// The first component is the index, and the second the bound. + IndexOutOfBounds(usize, usize), + /// Fail to build the N-th regular expression, due to the + /// ParseError. + BuildFail(usize, ParseError), + /// fail to build NFA + NFAFail(nfa::error::Error), +} + +impl From for Error { + fn from(nfae: nfa::error::Error) -> Self { + Self::NFAFail(nfae) + } +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), + Error::BuildFail(n, pe) => write!( + f, + "Failed to build the {n}-th regular expression due to error: {pe}" + ), + Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + } + } +} + +impl std::error::Error for Error {} + +/// A rule is a regular expression of terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct Rule { + regex: DefaultRegex, +} + +impl Rule { + /// Return true if and only if the rule is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.regex.is_empty() + } + + /// Return the length of the rule. + #[inline] + pub fn len(&self) -> usize { + self.regex.len() + } +} + +/// The type of a grammar. +#[derive(Debug, Clone, Default)] +pub struct Grammar { + ter: Vec, + non: Vec, + rules: Vec, + firsts: Vec>>, + first_nodes: Vec>, +} + +/// A private type to aid the recursive looping of rergular +/// expressions. +#[derive(Copy, Clone)] +enum StackElement { + Seen(usize), + Unseen(usize), +} + +impl StackElement { + fn index(self) -> usize { + match self { + Self::Seen(index) => index, + Self::Unseen(index) => index, + } + } + + fn is_seen(self) -> bool { + matches!(self, Self::Seen(_)) + } +} + +impl Grammar { + /// Construct a grammar from a vector of terminals, a vector of + /// non-terminals, and a vector of rules for the non-temrinals. + /// + /// # Panic + /// + /// If the length of `non` is not equal to that of `rules`, then + /// the function panics. + pub fn new(ter: Vec, non: Vec, rules: Vec) -> Self { + assert_eq!(non.len(), rules.len()); + + // One more room is reserved for the `None` value. + let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) + .take(non.len()) + .collect(); + + let first_nodes = rules + .iter() + .map(|rule| Vec::with_capacity(rule.len())) + .collect(); + + Self { + ter, + non, + rules, + firsts, + first_nodes, + } + } + + /// Return the name of a terminal or a non-terminal. + pub fn name_of_tnt(&self, tnt: TNT) -> Result { + match tnt { + TNT::Ter(t) => Ok(format!( + "T{}", + self.ter + .get(t) + .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? + .name() + )), + TNT::Non(n) => Ok(format!( + "N{}", + self.non + .get(n) + .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? + .name() + )), + } + } + + /// Return true if and only if there are no non-terminals in the + /// grammar. + #[inline] + pub fn is_empty(&self) -> bool { + self.non.is_empty() + } + + /// Return the total length of all rules. + #[inline] + pub fn total(&self) -> usize { + self.rules.iter().map(Rule::len).sum() + } + + /// Return the number of terminals. + #[inline] + pub fn ter_num(&self) -> usize { + self.ter.len() + } + + /// Return the number of non-terminals. + #[inline] + pub fn non_num(&self) -> usize { + self.non.len() + } + + /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a + /// single number to represent terminals and non-terminals. + /// + /// # Bounds + /// + /// If a terminal index is greater than or equal to the number of + /// terminals, then this signals an error; mutatis mutandis for + /// non-terminals. + /// + /// # Related + /// + /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. + #[inline] + pub fn pack_tnt(&self, tnt: TNT) -> Result { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + match tnt { + TNT::Ter(t) => { + if t >= ter_num { + Err(Error::IndexOutOfBounds(t, ter_num)) + } else { + Ok(t) + } + } + TNT::Non(n) => { + if n >= non_num { + Err(Error::IndexOutOfBounds(n, non_num)) + } else { + Ok(n + ter_num) + } + } + } + } + + /// Convert a single number to either a terminal or a + /// non-terminal. + /// + /// # Bounds + /// + /// If the number is greater than or equal to the sum of the + /// numbers of terminals and of non-terminals, then this signals + /// an error. + /// + /// # Related + /// + /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. + #[inline] + pub fn unpack_tnt(&self, flat: usize) -> Result { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + if flat < ter_num { + Ok(TNT::Ter(flat)) + } else if flat < ter_num + non_num { + Ok(TNT::Non(flat - ter_num)) + } else { + Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) + } + } + + /// Return true if and only if the non-terminal is nullable. + #[inline] + pub fn is_nullable(&self, non_terminal: usize) -> Result { + Ok(self + .firsts + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .contains(&None)) + } + + /// For a NON_TERMINAL, return an iterator that goes over the + /// nodes that are reachable from the non-terminal through an + /// empty transition of the nondeterministic finite automaton. + #[inline] + pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { + Ok(self + .first_nodes + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .iter()) + } + + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? + { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? + .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } + + /// Return the regular language of the left-linear closures of + /// non-terminals in the grammar. + /// + /// The resulting vector is guaranteed to be of the same length as + /// the number of non-terminals. + /// + /// The resulting regular language is not "self-contained". That + /// is to say, its terminals indices are packed indices and are + /// meaningless without the interpretation of the grammar. They + /// should be converted to a nondeterministic finite automaton and + /// then to its null closure later on. + pub fn left_closure(&self) -> Result>, Error> { + let non_len = self.non_num(); + + let mut result = Vec::with_capacity(non_len); + + for (n, rule) in self.rules.iter().enumerate() { + let regex = &rule.regex; + + let regex_root = if let Some(root) = regex.root() { + root + } else { + result.push(Default::default()); + + continue; + }; + + let regex_len = regex.len(); + + /// A convenient macro to retrieve the children from the + /// original regular expression with error propagation. + macro_rules! children { + ($n:expr) => { + regex + .children_of($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + /// A convenient macro to retrieve the label from the + /// original regular expression with error propagation. + macro_rules! label { + ($n:expr) => { + regex + .vertex_label($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + let parents = regex.parents_array().map_err(|e| match e { + nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), + nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), + _ => unreachable!(), + })?; + + use RegexType::*; + use TNT::*; + + let mut local_result: Vec> = Vec::with_capacity(regex_len * 2); + let mut builder = ALGBuilder::default(); + + /// Perform a depth-first copy + macro_rules! df_copy { + ($parent:expr, $index:expr) => { + match label!($index) { + Kleene | Plus | Optional | Or | Paren | Empty => { + let mut stack = vec![($parent, $index)]; + + while let Some((top_parent, top_index)) = stack.pop() { + let label = label!(top_index); + let label = match label { + Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), + _ => label, + }; + + local_result.push(label); + + let new = builder.add_vertex(); + + builder.add_edge(top_parent, new, ()).unwrap(); + + stack.extend(children!(top_index).map(|child| (new, child))); + } + } + Lit(remain_tnt) => { + local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); + let new = builder.add_vertex(); + builder.add_edge($parent, new, ()).unwrap(); + } + } + }; + } + + local_result.push(Or); + builder.add_vertex(); + + local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + let non_lit_index = builder.add_vertex(); + + builder.add_edge(0, non_lit_index, ()).unwrap(); + + for first_node in self.first_nodes_of(n)?.copied() { + assert!(first_node < parents.len()); + + let tnt = match label!(first_node) { + Lit(tnt) => tnt, + _ => continue, + }; + + let mut parents_chain = { + let mut result = Vec::new(); + let mut stack = Vec::with_capacity(parents.len()); + + stack.push(first_node); + + while let Some(top) = stack.pop() { + assert!(top < parents.len()); + if let Some(parent) = parents.get(top).copied().unwrap() { + result.push(parent); + stack.push(parent.0); + } + } + + result.reverse(); + + result + }; + + if parents_chain.is_empty() { + local_result.push(Lit(tnt)); + let lit_index = builder.add_vertex(); + builder.add_edge(0, lit_index, ()).unwrap(); + + continue; + } + + assert!(parents_chain.first().unwrap().0 == regex_root); + + // A different, "more local", root. + let mut root: usize; + + // Handle the direct parent + let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); + + match label!(parent_node) { + Kleene | Plus => { + local_result.extend([Empty, Lit(tnt)]); + + root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(root, lit_index, ()).unwrap(); + + let iterator = children!(parent_node); + + for index in iterator.clone().skip(parent_edge_index + 1) { + df_copy!(root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + + Or => { + local_result.push(Lit(tnt)); + root = builder.add_vertex(); + } + Optional | Empty => { + // If this path is taken, it should not be + // optional. + local_result.extend([Empty, Lit(tnt)]); + root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(root, lit_index, ()).unwrap(); + + for index in children!(parent_node).skip(parent_edge_index + 1) { + df_copy!(root, index); + } + } + Paren | Lit(_) => unreachable!(), + } + + // Handle successive parents + + for (node, edge_index) in parents_chain.into_iter() { + let node_type = label!(node); + + match node_type { + Kleene | Plus => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, root, ()).unwrap(); + + root = new_index; + + let iterator = children!(node); + + for index in iterator.clone().skip(edge_index + 1) { + df_copy!(root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + RegexType::Or => {} + RegexType::Optional | RegexType::Empty => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, root, ()).unwrap(); + root = new_index; + + for index in children!(node).skip(edge_index + 1) { + df_copy!(root, index); + } + } + RegexType::Paren | RegexType::Lit(_) => unreachable!(), + } + } + + builder.add_edge(0, root, ()).unwrap(); + } + + local_result.shrink_to_fit(); + + let graph = builder.build(); + + assert_eq!(graph.nodes_len(), local_result.len()); + + result.push( + DefaultRegex::new(graph, local_result) + .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, + ); + } + + assert_eq!(result.len(), non_len); + + Ok(result) + } + + /// Convert the regular language of left-linear closures to its + /// equivalent nondeterministic finite automaton. + /// + /// In the generation of the left-linear closure, the terminals + /// and non-terminals are packed into an unsigned integer. We + /// unpack them in converting to nondeterministic finite + /// automaton. + /// + /// The resulting nondeterministic finite automaton should be + /// transformed to its null-closure for use in our algorithm. + pub fn left_closure_to_nfa( + &self, + closure: &[DefaultRegex], + ) -> Result>, Error> { + let label_transform = |tnt| match tnt { + TNT::Ter(t) => { + let new_tnt = self.unpack_tnt(t).map_err(|e| match e { + Error::IndexOutOfBounds(index, bound) => { + graph::error::Error::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + })?; + + Ok(SoC::Carry(new_tnt)) + } + TNT::Non(n) => Ok(SoC::Sub(n)), + }; + + DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into) + } +} + +impl Display for Grammar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + assert_eq!(self.non.len(), self.rules.len()); + + for (nt, rule) in self.non.iter().zip(self.rules.iter()) { + write!(f, "{}: ", nt.name())?; + + writeln!( + f, + "{}", + rule.regex.to_string_with(|tnt| format!( + "({})", + self.name_of_tnt(tnt) + .unwrap_or_else(|_| format!("Unknown {tnt:?}")) + ))? + )?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod test_grammar_helper { + use super::*; + use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}; + use std::fmt::Write; + + // Construct a grammar to test + pub fn new_grammar() -> Result> { + let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("end".to_owned()), + ]; + + /// A function to scan the inputs. + fn scan_tnt( + parser: &DefaultRegParser, + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + use TNT::*; + + let mut chars = input.chars(); + + if let Some(first) = chars.next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + 'T' => { + let mut name = String::new(); + let mut len = 1; + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(t) = parser.query(&name, true) { + Ok(Some((len, Lit(Ter(t)), Right))) + } else { + Err(ParseError::InvalidCharacter(first)) + } + } + 'N' => { + let mut name = String::new(); + let mut len = 1; + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(n) = parser.query(&name, false) { + Ok(Some((len, Lit(Non(n)), Right))) + } else { + Err(ParseError::InvalidCharacter(first)) + } + } + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("a", true); + regex_parser.add_tnt("b", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("end", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nstart?Nend*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) + } +} + +#[cfg(test)] +mod test_grammar_display { + use super::test_grammar_helper::new_grammar; + + #[test] + fn test_display() -> Result<(), Box> { + println!("{}", new_grammar()?); + + Ok(()) + } +} + +#[cfg(test)] +mod test_grammar_firsts { + use super::test_grammar_helper::new_grammar; + use super::*; + + #[test] + fn test_firsts() -> Result<(), Box> { + let mut grammar = new_grammar()?; + + grammar.compute_firsts()?; + + println!("grammar firsts: {:?}", grammar.firsts); + println!("grammar first nodes: {:?}", grammar.first_nodes); + + Ok(()) + } +} + +#[cfg(test)] +mod test_grammar_left_closure { + use super::test_grammar_helper::new_grammar; + use super::*; + + pub fn new_closure_regex( + grammar: &mut Grammar, + ) -> Result>, Box> { + grammar.compute_firsts()?; + + println!("grammar firsts: {:?}", grammar.firsts); + println!("grammar first nodes: {:?}", grammar.first_nodes); + println!("grammar:"); + println!("{grammar}"); + + grammar.left_closure().map_err(Into::into) + } + + #[test] + fn test_regex() -> Result<(), Box> { + let mut grammar = new_grammar()?; + + let vec_of_regexps = new_closure_regex(&mut grammar)?; + + for regex in &vec_of_regexps { + println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?); + // println!("regex: {regex:?}",); + println!("regex len = {}", regex.nodes_len()); + } + + Ok(()) + } + + #[test] + fn test_nfa() -> Result<(), Box> { + let mut grammar = new_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + let nfa = grammar.left_closure_to_nfa(&closure)?; + // TODO: print the nfa out + Ok(()) + } +} diff --git a/chain/src/lib.rs b/chain/src/lib.rs new file mode 100644 index 0000000..0ec4d4c --- /dev/null +++ b/chain/src/lib.rs @@ -0,0 +1,25 @@ +//! This package implements the core algorithm of the entire +//! workspace: parsing with derivatives by means of chain rule and +//! regular nulling languages. +//! +//! Since I shall not name my crate "core" to avoid collisions with +//! the Rust's own core, I decided to name this crate after what I +//! think is the essence of this algorithm, the chain-rule for +//! derivatives of languages. + +pub mod grammar; + +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/chain/src/plan.org b/chain/src/plan.org new file mode 100644 index 0000000..8c63492 --- /dev/null +++ b/chain/src/plan.org @@ -0,0 +1,94 @@ +#+TITLE: Plan of the package +#+AUTHOR: Durand +#+DATE: <2022-11-18 Ven 19:57> + +* Things to do [2/7] + +- [X] Implement builders for graphs +- [X] Find sets of the first terminals for each non-terminal, in the + grammar module. +- [-] NFA [4/5] + + [X] Add regular expression into NFA. + + [X] Convert a set of regular expressions into a nondeterministic + finite automaton, where non-terminals denote the indices of + regular expressions in the set to substitute. + + [X] Convert a grammar into its grammar of left-linear closures of + non-temrinals and their derivatives. + + [X] Convert nondeterministic finite automata to their null + closures. + + [ ] Test more grammars to ensure it works correctly. +- [ ] Define the Atom trait. +- [ ] Implement virtual nodes for each derivative of each atom: The + lack of this step might be the root cause of the failure of the + previous version of this package. +- [ ] Implement languages. [0/2] + + [ ] First implement them as doubly labelled (directed acyclic) + graphs. + + [ ] Implement finding derivatives. +- [ ] Implement semiring. [0/5] + + [ ] Define the trait. + + [ ] Implement the boolean semiring. + + [ ] Implement the natural number semiring. + + [ ] Implement the free semiring. + + [ ] Compute and record a semiring value when computing + derivatives. + +* Atomic Languages + +This describes the behaviours of atomic languages. The atomic +language consists of the null closure of any non-terminal symbol in +the grammar, and their deriavtives by terminals and non-terminal. + +* Script for creating GIF animation + +[[https://gist.github.com/maelvls/5379127][a gist]] + +* Derivative Languages + +This is the main driving device of the algorithm. Basically, the +algorithm works by taking successive derivatives, according to the +input symbol. At each step, we calculate the derivative language. In +this process, we also compute some semiring value and store in a +carrier. The end result of the algorithm is the final semiring +value. + +If one wants simply to determine if the input string belongs to the +grammar, one chooses the semiring to be the field with two elements, +the boolean field. If one wants to find how many ways are there to +derive a given input string, then one uses the semiring of natural +numbers instead. If one wants, moreover, to find all the possible +ways to derive a particular input string, then one can use the +free semiring on the set of terminals and non-terminals of the +grammar. Here the free semiring is the left-adjoint functor to the +forgetful functor from the category of semirings to the category of +sets. + +To be more specific, the free semiring on a set is given by sets of +sequences of elements in the set. The addition of the semiring is the +set union operation, and the multiplication is taking the respective +concatenations. + +** Semirings + +So we need a module to define the behaviours of semirings, and provide +some common semirings implementations. Then in the main driving force +we can freely substitute different semirings, according to the +particular needs. + +** Languages + +Then the main part is to define the behaviour of languages. This +should be easy enough, since we already have the mechanism of graphs, +nondeterministic automata, and semirings. All we need to do is to +combine them together. + +* Testing ground + +I am in a strong need to test things out. The most important one is +to visualize each step of the derivation, in a human-friendly manner. +I need this to examine whether my atomic languages are wrongly +implemented, or my atomic languages are wrongly derived, or my +understanding of the main algorithm is plain wrong. + +This is the main reason I started this rewrite of the package. + diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs index 18ad770..6c1dcd0 100644 --- a/graph/src/adlist.rs +++ b/graph/src/adlist.rs @@ -3,8 +3,9 @@ //! [`Graph`][super::Graph]. This data type represents graphs using //! adjacency lists internally. -use super::{ExtGraph, Graph}; -use crate::error::Error; +use std::ops::{Deref, DerefMut}; + +use crate::{builder::Builder, error::Error, ExtGraph, Graph}; // #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] // struct ALEdge { @@ -80,6 +81,13 @@ impl Graph for ALGraph { Ok(self.nodes.get(source).unwrap().children.contains(&target)) } } + + #[inline] + fn replace_by_builder(&mut self, builder: impl Builder) { + let graph = builder.build(); + + self.nodes = graph.nodes; + } } impl ExtGraph for ALGraph { @@ -102,11 +110,115 @@ impl ExtGraph for ALGraph { } } -// TODO: Add a way to build a graph by its raw adjacency list representation. impl From>> for ALGraph { fn from(adlist: Vec>) -> Self { - let nodes: Vec = adlist.iter().cloned().map(ALNode::new).collect(); - Self { nodes } + Self { + nodes: adlist.into_iter().map(ALNode::new).collect(), + } + } +} + +// Builder for this implementation of graph + +/// A builder for adjacency list graphs. +#[derive(Debug, Default, Clone)] +pub struct ALGBuilder(ALGraph); + +impl Deref for ALGBuilder { + type Target = ALGraph; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for ALGBuilder { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Builder for ALGBuilder { + type Label = (); + + type Result = ALGraph; + + fn with_capacity(size: usize) -> Self { + Self(ALGraph { + nodes: Vec::with_capacity(size), + }) + } + + #[inline] + fn add_vertex(&mut self) -> usize { + self.nodes.push(ALNode::default()); + self.nodes.len() - 1 + } + + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { + let nodes_len = self.nodes.len(); + + let source_children = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + source_children.children.push(target); + + Ok(()) + } + + /// Remove an edge from the source to the target. + /// + /// Since some graphs are labelled, the users are allowed to pass + /// a predicate to determine if an edge from the source to the + /// target should be removed. + /// + /// # Performance note + /// + /// It is possible that the target appears more than once in the + /// vector of children, so we have to remove every instance of + /// target. + /// + /// Since I do not think builders should be used for performance + /// critical tasks, this is fine. + /// + /// If one desires more performance, maybe + /// [`ASGraph`][crate::ASGraph] is a good choice. + /// + /// Of course, if one just wants to use adjacency list + /// representation and wants a performant builder, one can + /// implement a customized builder. + fn remove_edge(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> + where + F: Fn(Self::Label) -> bool, + { + let nodes_len = self.nodes.len(); + + let source_children = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + source_children.children.retain(|child| *child != target); + + Ok(()) + } + + fn build(self) -> Self::Result { + self.0 + } + + fn build_ref(&self) -> Self::Result { + self.0.clone() } } @@ -187,3 +299,123 @@ mod algraph_test { Ok(()) } } + +#[cfg(test)] +mod test_algraph_builder { + use super::*; + + #[test] + fn test_builder() -> Result<(), Box> { + let mut builder = ALGBuilder::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + println!("five empty nodes: {builder:?}"); + + // Link each node to its successor and link the last node with + // the first one to form a cycle. + for i in 0..5 { + builder.add_edge(i, if i == 4 { 0 } else { i + 1 }, ())?; + } + + println!("a cycle of five nodes: {builder:?}"); + + // Remove the link from the last node to the first node. + builder.remove_edge(4, 0, |_| true)?; + + println!("a line of five nodes: {builder:?}"); + + // build a graph + + let graph = builder.build(); + + println!("final graph: {graph:?}"); + + Ok(()) + } + + #[test] + fn test_errors() -> Result<(), Box> { + let mut builder = ALGBuilder::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + println!("five empty nodes: {builder:?}"); + + // Errors in add_edge + + println!(); + println!("Testing errors in add_edge:"); + println!(); + + assert!(matches!( + builder.add_edge(0, 5, ()), + Err(Error::IndexOutOfBounds(5, 5)) + )); + + println!("Right error for an index out of bounds as the target"); + + assert!(matches!( + builder.add_edge(10, 5, ()), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for an index out of bounds as the source"); + + assert!(matches!( + builder.add_edge(10, 50, ()), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for both indices out of bounds"); + + // Errors in remove_edge + + println!(); + println!("Testing errors in remove_edge:"); + println!(); + + assert!(matches!( + builder.remove_edge(0, 5, |_| true), + Err(Error::IndexOutOfBounds(5, 5)) + )); + + println!("Right error for an index out of bounds as the target"); + + assert!(matches!( + builder.remove_edge(10, 5, |_| true), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for an index out of bounds as the source"); + + assert!(matches!( + builder.remove_edge(10, 50, |_| true), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for both indices out of bounds"); + + assert!(matches!(builder.remove_edge(0, 1, |_| true), Ok(()),)); + + println!("No errors for removing a non-existing edge"); + + println!(); + + let graph = builder.build(); + + println!("final graph: {graph:?}"); + + Ok(()) + } +} diff --git a/graph/src/adset.rs b/graph/src/adset.rs index 58fed4c..c225986 100644 --- a/graph/src/adset.rs +++ b/graph/src/adset.rs @@ -7,8 +7,9 @@ //! duplications of languages, so it is more convenient if the //! underlying graph type **cannot** represent duplicate edges. -use super::{ExtGraph, Graph}; -use crate::error::Error; +use std::ops::{Deref, DerefMut}; + +use crate::{builder::Builder, error::Error, ExtGraph, Graph}; // If one wants to use another implementation for a set, import that // as Set, and nothing else needs to be changed, ideally. @@ -122,6 +123,13 @@ impl Graph for ASGraph { .contains(&ASEdge::new(target))) } } + + #[inline] + fn replace_by_builder(&mut self, builder: impl Builder) { + let graph = builder.build(); + + self.nodes = graph.nodes; + } } impl ExtGraph for ASGraph { @@ -144,6 +152,90 @@ impl ExtGraph for ASGraph { } } +// Builder for this implementation of graph + +/// A builder for adjacency set graphs. +#[derive(Debug, Default, Clone)] +pub struct ASGBuilder(ASGraph); + +impl Deref for ASGBuilder { + type Target = ASGraph; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for ASGBuilder { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Builder for ASGBuilder { + type Label = (); + + type Result = ASGraph; + + #[inline] + fn with_capacity(size: usize) -> Self { + Self(ASGraph { + nodes: Vec::with_capacity(size), + }) + } + + #[inline] + fn add_vertex(&mut self) -> usize { + self.nodes.push(ASNode::default()); + self.nodes.len() - 1 + } + + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { + let nodes_len = self.nodes.len(); + + let source_children = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + source_children.children.insert(ASEdge::new(target)); + + Ok(()) + } + + fn remove_edge(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> + where + F: Fn(Self::Label) -> bool, + { + let nodes_len = self.nodes.len(); + + let source_children = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + source_children.children.remove(&ASEdge::new(target)); + + Ok(()) + } + + fn build(self) -> Self::Result { + self.0 + } + + fn build_ref(&self) -> Self::Result { + self.0.clone() + } +} + #[cfg(test)] mod asgraph_test { use super::*; @@ -227,3 +319,123 @@ mod asgraph_test { Ok(()) } } + +#[cfg(test)] +mod test_asgraph_builder { + use super::*; + + #[test] + fn test_builder() -> Result<(), Box> { + let mut builder = ASGBuilder::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + println!("five empty nodes: {builder:?}"); + + // Link each node to its successor and link the last node with + // the first one to form a cycle. + for i in 0..5 { + builder.add_edge(i, if i == 4 { 0 } else { i + 1 }, ())?; + } + + println!("a cycle of five nodes: {builder:?}"); + + // Remove the link from the last node to the first node. + builder.remove_edge(4, 0, |_| true)?; + + println!("a line of five nodes: {builder:?}"); + + // build a graph + + let graph = builder.build(); + + println!("final graph: {graph:?}"); + + Ok(()) + } + + #[test] + fn test_errors() -> Result<(), Box> { + let mut builder = ASGBuilder::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + println!("five empty nodes: {builder:?}"); + + // Errors in add_edge + + println!(); + println!("Testing errors in add_edge:"); + println!(); + + assert!(matches!( + builder.add_edge(0, 5, ()), + Err(Error::IndexOutOfBounds(5, 5)) + )); + + println!("Right error for an index out of bounds as the target"); + + assert!(matches!( + builder.add_edge(10, 5, ()), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for an index out of bounds as the source"); + + assert!(matches!( + builder.add_edge(10, 50, ()), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for both indices out of bounds"); + + // Errors in remove_edge + + println!(); + println!("Testing errors in remove_edge:"); + println!(); + + assert!(matches!( + builder.remove_edge(0, 5, |_| true), + Err(Error::IndexOutOfBounds(5, 5)) + )); + + println!("Right error for an index out of bounds as the target"); + + assert!(matches!( + builder.remove_edge(10, 5, |_| true), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for an index out of bounds as the source"); + + assert!(matches!( + builder.remove_edge(10, 50, |_| true), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for both indices out of bounds"); + + assert!(matches!(builder.remove_edge(0, 1, |_| true), Ok(()),)); + + println!("No errors for removing a non-existing edge"); + + println!(); + + let graph = builder.build(); + + println!("final graph: {graph:?}"); + + Ok(()) + } +} diff --git a/graph/src/builder.rs b/graph/src/builder.rs new file mode 100644 index 0000000..ce85cce --- /dev/null +++ b/graph/src/builder.rs @@ -0,0 +1,53 @@ +//! This file defines the expected behaviours of a builder of graphs. + +use crate::{error::Error, Graph}; + +/// A builder is actually just a graph that can be altered in more +/// ways than an extension graph can. It should not have any methods +/// from the Graph trait, though, as we shall convert a builder to a +/// normal final graph before using it. +pub trait Builder: Default { + /// Some graphs are labelled. This type should be the type of the + /// labels. + type Label; + + /// The type of the result graph. + type Result: Graph; + + /// Construct an empty builder with the capacity to hold a given + /// number of nodes. + /// + /// Implementations may ignore this method, where the default + /// implementation just calls `Default::default`. + #[inline] + fn with_capacity(_size: usize) -> Self { + Self::default() + } + + /// Add a vertex without children. + fn add_vertex(&mut self) -> usize; + + /// Add an edge from the source to the target. + fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error>; + + /// Remove an edge from the source to the target. + /// + /// Since some graphs are labelled, the users are allowed to pass + /// a predicate to determine if an edge from the source to the + /// target should be removed. + fn remove_edge(&mut self, source: usize, target: usize, predicate: F) -> Result<(), Error> + where + F: Fn(Self::Label) -> bool; + + /// Convert the builder into a graph. + /// + /// This is the purpose of having a builder. + fn build(self) -> Self::Result; + + /// Convert the builder into a graph using a reference. + /// + /// This is similar to [`build`][Builder::build], but takes an + /// immutable reference of the builder, so that the builder can + /// still be used later on. + fn build_ref(&self) -> Self::Result; +} diff --git a/graph/src/error.rs b/graph/src/error.rs index 2162685..3600005 100644 --- a/graph/src/error.rs +++ b/graph/src/error.rs @@ -1,16 +1,20 @@ #![warn(missing_docs)] //! This file implements the error data type of the graph library. -use std::fmt::{self, Display}; +use core::fmt::{self, Display}; /// The error type for methods of the trait [`Graph`][`super::Graph`]. -#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd)] +#[non_exhaustive] pub enum Error { /// The index is out of bounds. /// /// The first component is the index that is out of bounds, and /// the second component is the current length of nodes. IndexOutOfBounds(usize, usize), + /// The graph does not permit duplicate nodes but encounters a + /// repeated node + DuplicatedNode, } impl Display for Error { @@ -19,6 +23,7 @@ impl Display for Error { Error::IndexOutOfBounds(index, len) => { write!(f, "index {index} out of bounds {len} ") } + Error::DuplicatedNode => write!(f, "No duplicate nodes permitted, but found one"), } } } diff --git a/graph/src/labelled.rs b/graph/src/labelled.rs index d02e301..d0a02d0 100644 --- a/graph/src/labelled.rs +++ b/graph/src/labelled.rs @@ -7,10 +7,9 @@ //! needs to be implemented efficiently, we store the mappings between //! labels and edges in both directions. -#[allow(unused_imports)] -use super::{Graph, GraphLabel, LabelExtGraph, LabelGraph}; -#[allow(unused_imports)] -use crate::error::Error; +use std::ops::{Deref, DerefMut}; + +use crate::{builder::Builder, error::Error, Graph, GraphLabel, LabelExtGraph, LabelGraph}; // We use BTreeMap and BTreeSet here as we need to exclude duplicate // edge sets, while an ordinary hashmap and hashset do not allow @@ -21,13 +20,23 @@ use std::collections::{ BTreeMap as Map, BTreeSet as Set, HashMap as HMap, }; -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone)] struct DLNode { by_target: Map>, by_label: Map>, flat: Vec<(T, usize)>, } +impl Default for DLNode { + fn default() -> Self { + Self { + by_target: Default::default(), + by_label: Default::default(), + flat: Default::default(), + } + } +} + impl DLNode { fn new( by_target: Map>, @@ -66,6 +75,18 @@ impl DLGraph { edges_table: HMap::default(), } } + + /// Return a builder. + #[inline] + pub fn builder(self) -> DLGBuilder { + DLGBuilder(self) + } + + /// Return a builder from a reference. + #[inline] + pub fn builder_ref(&self) -> DLGBuilder { + DLGBuilder(self.clone()) + } } impl Default for DLGraph { @@ -129,6 +150,49 @@ impl Graph for DLGraph { None => Err(Error::IndexOutOfBounds(source, self.nodes.len())), } } + + fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + let preamble = "digraph nfa { + fontname=\"Helvetica,Arial,sans-serif\" + node [fontname=\"Helvetica,Arial,sans-serif\"] + edge [fontname=\"Helvetica,Arial,sans-serif\"] + rankdir=LR;\n"; + + let mut post = String::new(); + + use std::fmt::Write as FWrite; + + for (source, target) in self.edges() { + for label in self.edge_label(source, target).unwrap() { + let _ = writeln!(post, " {source} -> {target} [label = \"{label}\"]"); + } + } + + post.push_str("}\n"); + + let result = format!("{preamble}{post}"); + + if std::fs::metadata(filename).is_ok() { + std::fs::remove_file(filename)?; + } + + let mut file = std::fs::File::options() + .write(true) + .create(true) + .open(filename)?; + + use std::io::Write; + + file.write_all(result.as_bytes()) + } + + #[inline] + fn replace_by_builder(&mut self, builder: impl Builder) { + let new_graph = builder.build(); + + self.nodes = new_graph.nodes; + self.edges_table = new_graph.edges_table; + } } /// A delegation of iterators. @@ -261,6 +325,25 @@ impl LabelGraph for DLGraph { None => Err(Error::IndexOutOfBounds(node_id, self.nodes.len())), } } + + fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result { + let nodes_len = self.nodes.len(); + + match self.nodes.get(node_id) { + Some(node) => { + if target >= nodes_len { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + if let Some(labels) = node.by_target.get(&target) { + Ok(labels.contains(label)) + } else { + Ok(false) + } + } + None => Err(Error::IndexOutOfBounds(node_id, nodes_len)), + } + } } impl LabelExtGraph for DLGraph { @@ -315,6 +398,181 @@ impl LabelExtGraph for DLGraph { } } +// Builder for this implementation of graph + +/// A builder for labelled adjacency doubly linked graphs. +#[derive(Debug, Clone)] +pub struct DLGBuilder(DLGraph); + +impl Default for DLGBuilder { + fn default() -> Self { + Self(Default::default()) + } +} + +impl Deref for DLGBuilder { + type Target = DLGraph; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DLGBuilder { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Builder for DLGBuilder { + type Label = T; + + type Result = DLGraph; + + #[inline] + fn with_capacity(size: usize) -> Self { + Self(DLGraph { + nodes: Vec::with_capacity(size), + edges_table: Default::default(), + }) + } + + #[inline] + fn add_vertex(&mut self) -> usize { + self.nodes.push(DLNode::default()); + self.nodes.len() - 1 + } + + fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error> { + let nodes_len = self.nodes.len(); + + let source_node = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + let new_edge_p = !matches!( + source_node + .by_target + .get(&target) + .map(|set| set.contains(&label)), + Some(true) + ); + + if new_edge_p { + let old_children_set: Set<(T, usize)> = source_node.flat.iter().copied().collect(); + + source_node + .by_target + .entry(target) + .or_insert_with(Set::default) + .insert(label); + + source_node + .by_label + .entry(label) + .or_insert_with(Set::default) + .insert(target); + + source_node.flat.push((label, target)); + + let new_children_set: Set<(T, usize)> = source_node.flat.iter().copied().collect(); + + // When source_node is in use, we cannot borrow self + // mutably again, so we move the following two statements + // to after all uses of source_node. + + self.edges_table.remove(&old_children_set); + + self.edges_table.insert(new_children_set, source); + } + + Ok(()) + } + + fn remove_edge(&mut self, source: usize, target: usize, predicate: F) -> Result<(), Error> + where + F: Fn(Self::Label) -> bool, + { + let nodes_len = self.nodes.len(); + + let source_node = self + .nodes + .get_mut(source) + .ok_or(Error::IndexOutOfBounds(source, nodes_len))?; + + if !(0..nodes_len).contains(&target) { + return Err(Error::IndexOutOfBounds(target, nodes_len)); + } + + if let Some(target_labels_set) = source_node.by_target.get(&target) { + let mut to_remove = Vec::new(); + + for label in target_labels_set.iter() { + if predicate(*label) { + to_remove.push(*label); + } + } + + if !to_remove.is_empty() { + let old_children_set: Set<(T, usize)> = source_node.flat.iter().copied().collect(); + + for label in to_remove.iter().copied() { + // This must be "Some", as the outer "if" checks + source_node + .by_target + .get_mut(&target) + .map(|set| set.remove(&label)); + + source_node + .by_label + .get_mut(&label) + .map(|set| set.remove(&target)); + + source_node.flat.retain(|(child_label, child_target)| { + (*child_label, *child_target) != (label, target) + }); + } + + // If after removal no labels remain for the target, + // we remove the edge entirely, to avoid false empty + // edges. + if source_node.flat.is_empty() { + source_node.by_target.remove(&target); + + for label in to_remove.iter() { + source_node.by_label.remove(label); + } + } + + let new_children_set: Set<(T, usize)> = source_node.flat.iter().copied().collect(); + + // When source_node is in use, we cannot borrow self + // mutably again, so we move the following two + // statements to after all uses of source_node. + + self.edges_table.remove(&old_children_set); + + self.edges_table.insert(new_children_set, source); + } + } + + Ok(()) + } + + fn build_ref(&self) -> Self::Result { + self.0.clone() + } + + fn build(self) -> Self::Result { + self.0 + } +} + #[cfg(test)] mod label_test { use super::*; @@ -424,3 +682,125 @@ mod label_test { Ok(()) } } + +// TODO: Test print_viz + +#[cfg(test)] +mod test_dlgraph_builder { + use super::*; + + #[test] + fn test_builder() -> Result<(), Box> { + let mut builder = DLGBuilder::::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + println!("five empty nodes: {builder:?}"); + + // Link each node to its successor and link the last node with + // the first one to form a cycle. + for i in 0..5 { + builder.add_edge(i, if i < 4 { i + 1 } else { 0 }, i)?; + } + + println!("a cycle of five nodes: {builder:?}"); + + // Remove the link from the last node to the first node. + builder.remove_edge(4, 0, |_| true)?; + + println!("a line of five nodes: {builder:?}"); + + // build a graph + + let graph = builder.build(); + + println!("final graph: {graph:?}"); + + Ok(()) + } + + #[test] + fn test_errors() -> Result<(), Box> { + let mut builder = DLGBuilder::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + println!("five empty nodes: {builder:?}"); + + // Errors in add_edge + + println!(); + println!("Testing errors in add_edge:"); + println!(); + + assert!(matches!( + builder.add_edge(0, 5, 0), + Err(Error::IndexOutOfBounds(5, 5)) + )); + + println!("Right error for an index out of bounds as the target"); + + assert!(matches!( + builder.add_edge(10, 5, 0), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for an index out of bounds as the source"); + + assert!(matches!( + builder.add_edge(10, 50, 0), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for both indices out of bounds"); + + // Errors in remove_edge + + println!(); + println!("Testing errors in remove_edge:"); + println!(); + + assert!(matches!( + builder.remove_edge(0, 5, |_| true), + Err(Error::IndexOutOfBounds(5, 5)) + )); + + println!("Right error for an index out of bounds as the target"); + + assert!(matches!( + builder.remove_edge(10, 5, |_| true), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for an index out of bounds as the source"); + + assert!(matches!( + builder.remove_edge(10, 50, |_| true), + Err(Error::IndexOutOfBounds(10, 5)) + )); + + println!("Right error for both indices out of bounds"); + + assert!(matches!(builder.remove_edge(0, 1, |_| true), Ok(()),)); + + println!("No errors for removing a non-existing edge"); + + println!(); + + let graph = builder.build(); + + println!("final graph: {graph:?}"); + + Ok(()) + } +} diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 7b74ee1..2d23af3 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -8,6 +8,8 @@ use std::hash::Hash; +use core::fmt::{Debug, Display}; + pub mod error; pub mod adset; @@ -22,6 +24,10 @@ pub mod labelled; pub use labelled::DLGraph; +pub mod builder; + +pub use builder::Builder; + use error::Error; /// The expected behaviour of an immutable graph. @@ -101,6 +107,83 @@ pub trait Graph: Default { /// Return an error if either the source or the target is an /// invalid node. fn has_edge(&self, source: usize, target: usize) -> Result; + + /// Output the edges into a graphviz file. + fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + let preamble = "digraph nfa { + fontname=\"Helvetica,Arial,sans-serif\" + node [fontname=\"Helvetica,Arial,sans-serif\"] + edge [fontname=\"Helvetica,Arial,sans-serif\"] + rankdir=LR;\n"; + + let mut post = String::new(); + + use std::fmt::Write as FWrite; + + for (source, target) in self.edges() { + let _ = writeln!(post, "{source} -> {target}"); + } + + post.push_str("}\n"); + + let result = format!("{preamble}{post}"); + + if std::fs::metadata(filename).is_ok() { + std::fs::remove_file(filename)?; + } + + let mut file = std::fs::File::options() + .write(true) + .create(true) + .open(filename)?; + + use std::io::Write; + + file.write_all(result.as_bytes()) + } + + /// Returns whether or not the graph contains cycles. + /// + /// If, and only if, the graph contains invalid nodes, an error + /// will be signalled. + fn contains_cycles(&self) -> Result { + use std::collections::HashSet; + + let mut seen_nodes: HashSet = HashSet::with_capacity(self.nodes_len()); + + for node in self.nodes() { + if seen_nodes.contains(&node) { + continue; + } + + let mut edges_seen_nodes: HashSet = HashSet::with_capacity(self.nodes_len()); + + let mut stack = Vec::with_capacity(self.nodes_len()); + stack.push(node); + + while let Some(top) = stack.pop() { + if edges_seen_nodes.contains(&top) { + // a cycle is found + return Ok(true); + } + + edges_seen_nodes.insert(top); + + let mut local_stack: Vec = self.children_of(top)?.collect(); + + local_stack.reverse(); + + stack.extend(local_stack); + } + + seen_nodes.extend(edges_seen_nodes); + } + + Ok(false) + } + + /// Replace the contents of the graph by a builder. + fn replace_by_builder(&mut self, builder: impl Builder); } /// A graph that can be extended, but not mutated, in the sense that @@ -121,9 +204,15 @@ pub trait ExtGraph: Graph { } /// The type of labels should be comparable and hashable. -pub trait GraphLabel: Hash + Eq + PartialEq + Clone + Copy + Ord + PartialOrd {} +pub trait GraphLabel: + Hash + Eq + PartialEq + Clone + Copy + Ord + PartialOrd + Display + Debug +{ +} -impl GraphLabel for T {} +impl GraphLabel + for T +{ +} /// A labelled graph is just a graph with labels associated to /// vertices and / or edges. @@ -182,6 +271,10 @@ pub trait LabelGraph: Graph { /// /// The efficiency of this method matters in implementations. fn labels_of(&self, node_id: usize) -> Result, Error>; + + /// Return true if and only if the node has an edge with the given + /// label and target. + fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result; } /// A labelled graph that can be extended, but not mutated, in the @@ -201,3 +294,39 @@ pub trait LabelExtGraph: LabelGraph { /// an error. fn extend(&mut self, edges: impl IntoIterator) -> Result; } + +#[cfg(test)] +mod test_cycle_detection { + use super::{builder::Builder, labelled::DLGBuilder, Graph}; + + #[test] + fn test() -> Result<(), Box> { + let mut builder: DLGBuilder = DLGBuilder::default(); + + // Add five nodes + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + builder.add_vertex(); + + // Link each node to its successor and link the last node with + // the first one to form a cycle. + for i in 0..5 { + builder.add_edge(i, if i < 4 { i + 1 } else { 0 }, i)?; + } + + let graph = builder.build_ref(); + + assert_eq!(graph.contains_cycles()?, true); + + // Remove the link from the last node to the first node. + builder.remove_edge(4, 0, |_| true)?; + + let graph = builder.build(); + + assert_eq!(graph.contains_cycles()?, false); + + Ok(()) + } +} diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs index 3c2bd83..229ba4d 100644 --- a/nfa/src/default/nfa.rs +++ b/nfa/src/default/nfa.rs @@ -1,25 +1,26 @@ //! This file provides a default implementation of NFA. -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. +// TODO: The current focus of the project is to understand the growth +// rate of the algorithm, to know whether I made a mistake in the +// previous iteration of the implementation, or the algorithm is not +// as fast as the author estimated, which is not quite likely, of +// course. // // Thus I shall establish a friendly interface that allows me to view // and debug the atomic languages and the languages, transparently. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +use graph::{ + builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, + LabelGraph, +}; -use crate::{error::Error, Nfa, Regex}; +use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC}; -use std::fmt::Display; +use core::fmt::{Debug, Display}; -#[non_exhaustive] -#[derive(Debug)] /// Default NFA implementation. -pub struct DefaultNFA { +#[derive(Debug, Clone)] +pub struct DefaultNFA { graph: DLGraph, } @@ -63,6 +64,16 @@ impl Graph for DefaultNFA { fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } + + #[inline] + fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { + self.graph.print_viz(filename) + } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl Builder) { + unimplemented!() + } } impl LabelGraph for DefaultNFA { @@ -70,11 +81,10 @@ impl LabelGraph for DefaultNFA { type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; - // TODO: Return the label from the contained regular language. #[inline] fn vertex_label(&self, node_id: usize) -> Result, GError> { if self.has_node(node_id) { - todo!() + unimplemented!() } else { Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) } @@ -98,12 +108,232 @@ impl LabelGraph for DefaultNFA { fn labels_of(&self, node_id: usize) -> Result, GError> { self.graph.labels_of(node_id) } + + #[inline] + fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result { + self.graph.has_edge_label(node_id, label, target) + } } impl Nfa for DefaultNFA { - #[allow(unused)] - fn to_nfa(regex: impl Regex) -> Self { - todo!() + type FromRegex = DefaultNFA; + + fn to_nfa( + regexps: &[impl Regex>], + sub_pred: impl Fn(T) -> Result, Error>, + ) -> Result>, Error> { + let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); + + if total_regexps_len == 0 { + return Ok(Default::default()); + } + + // We reserve two more rooms for later uses. + let nfa_len = total_regexps_len * 2 + 2; + + let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); + + // Since DOption implements Copy when T does, we can use a + // variable to hold the empty label to avoid repetitions. + let empty_label: DOption = Default::default(); + + for _ in 0..nfa_len { + builder.add_vertex(); + } + + let accumulators: Vec = { + let mut result = Vec::with_capacity(regexps.len() + 1); + + result.push(0); + + let mut accumulator = 0; + + for regexp in regexps.iter() { + accumulator += regexp.nodes_len() * 2; + result.push(accumulator); + } + + result.pop(); + + result + }; + + assert_eq!(accumulators.len(), regexps.len()); + + /// Get offset from `accumulators` safely. + macro_rules! get_offset_safe ( + ($num:expr) => { + *accumulators.get($num).ok_or(Error::UnknownNode($num))? + } + ); + + /// Get offset from `accumulators` without checking bounds. + macro_rules! get_offset_unsafe ( + ($num:expr) => { + { unsafe { *accumulators.get_unchecked($num) } } + } + ); + + for (index, regex) in regexps.iter().enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + // A regular expression without roots is empty, so we + // skip it. + continue; + }; + + let regex_len = regex.nodes_len(); + + // It is safe here to call `get_offset_unsafe`, as `index` + // is guaranteed to be strictly less than the length of + // `accumulators` by an assertion above. + let offset = get_offset_unsafe!(index); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(root); + + while let Some(top_index) = stack.pop() { + let top_label = regex.vertex_label(top_index)?; + + let nfa_start = offset + 2 * top_index; + let nfa_end = offset + 2 * top_index + 1; + + match top_label { + RegexType::Kleene => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } + } + RegexType::Plus => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + + builder.add_edge(nfa_end, nfa_start, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Optional => { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } + } + RegexType::Or => { + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(nfa_start, child_start, empty_label)?; + builder.add_edge(child_end, nfa_end, empty_label)?; + } + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Paren => { + // Ignore Paren nodes since currently these + // are used only for printing purposes. + } + RegexType::Empty => { + let mut source = nfa_start; + + if !regex.is_empty_node(top_index)? { + for child in regex.children_of(top_index)? { + stack.push(child); + + let child_start = offset + 2 * child; + let child_end = offset + 2 * child + 1; + + builder.add_edge(source, child_start, empty_label)?; + + source = child_end; + } + + builder.add_edge(source, nfa_end, empty_label)?; + } else { + builder.add_edge(nfa_start, nfa_end, empty_label)?; + } + } + RegexType::Lit(value) => { + // The children would be ignored even if for + // some reason this literal node had some + // children. + + match sub_pred(value)? { + SoC::Sub(sub_non) => { + // a non-terminal + + let sub_offset = get_offset_safe!(sub_non); + let sub_nfa_start = sub_offset + 2 * sub_non; + let sub_nfa_end = sub_offset + 2 * sub_non + 1; + + builder.add_edge(nfa_start, sub_nfa_start, empty_label)?; + builder.add_edge(sub_nfa_end, nfa_end, empty_label)?; + } + SoC::Carry(new_value) => { + // a terminal + + builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?; + } + } + } + } + } + } + + let graph = builder.build(); + + Ok(DefaultNFA { graph }) } fn remove_epsilon(&mut self) -> Result<(), Error> { @@ -114,7 +344,113 @@ impl Nfa for DefaultNFA { todo!() } - fn nulling(&mut self) -> Result<(), Error> { - todo!() + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> { + let mut updated = true; + let mut builder = self.graph.builder_ref(); + + while updated { + updated = false; + + let mut nullable = false; + + let mut to_add = Vec::new(); + + for (source, target) in builder.edges() { + for label in builder.edge_label(source, target)? { + if f(label) { + nullable = true; + + break; + } + } + + if nullable { + for (label, child_iter) in builder.labels_of(target)? { + for child in child_iter { + if !builder.has_edge_label(source, label, child)? { + updated = true; + + to_add.push((source, child, *label)); + } + } + } + } + } + + for (source, child, label) in to_add { + builder.add_edge(source, child, label)?; + } + } + + self.graph.replace_by_builder(builder); + + Ok(()) + } +} + +impl Display for DefaultNFA { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + Debug::fmt(self, f) + } +} + +#[cfg(test)] +mod test_to_nfa { + #![allow(unused_imports)] + use super::*; + use crate::SoC; + + use crate::{ + default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + desrec::DesRec, + }; + + fn new_regex() -> Result, ParseError> { + let parser = DefaultRegParser::::default(); + + fn test_scanner( + _parser: &DefaultRegParser, + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + Ok( + DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? + .unwrap_or(Default::default()) + .0, + ) + } + + #[test] + fn test_to_nfa() -> Result<(), Box> { + let regex = new_regex()?; + + println!("regex = {regex}"); + + let nfa: DefaultNFA> = + DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) } } diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index a60f801..2670e32 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; -use std::fmt::{Debug, Display}; +use std::{ + collections::HashMap, + fmt::{Debug, Display, Write}, + marker::PhantomData, +}; /// The type of a node in a regular expression. /// @@ -25,7 +29,7 @@ use std::fmt::{Debug, Display}; /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum RegexType { +pub enum RegexType { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals @@ -44,13 +48,31 @@ pub enum RegexType { Lit(T), } +impl Display for RegexType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RegexType::Kleene => write!(f, "*"), + RegexType::Plus => write!(f, "+"), + RegexType::Optional => write!(f, "?"), + RegexType::Or => write!(f, "|"), + RegexType::Paren => write!(f, "()"), + RegexType::Empty => write!(f, "empty"), + RegexType::Lit(value) => write!(f, "{value}"), + } + } +} + /// A default implementation of regular expressions. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DefaultRegex { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec>, + /// The root of the graph. + /// + /// If it is None, it indicates the regular expression is empty. + root: Option, } impl Default for DefaultRegex { @@ -58,11 +80,39 @@ impl Default for DefaultRegex { Self { graph: Default::default(), types: Default::default(), + root: Default::default(), } } } -impl DefaultRegex { +impl DefaultRegex { + /// Set the root of the regular expression. + #[inline] + pub fn set_root(&mut self, root: Option) { + self.root = root; + } + + /// Construct a regular expression from a raw adjacency list graph + /// and an array of labels. + /// + /// # Error + /// + /// If the graph contains cycles, this function will return an + /// error. This check is added to make sure that the regular + /// expression contains no cycles, otherwise operations with + /// regular expressions might enter infinite loops later. + pub fn new(graph: ALGraph, types: Vec>) -> Result { + if graph.contains_cycles()? { + Err(Error::Cycle) + } else { + Ok(Self { + graph, + types, + root: Some(0), + }) + } + } + /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { @@ -86,10 +136,65 @@ impl DefaultRegex { Ok(()) } + + /// Return the internal graph. + /// + /// Normally this should not be used. + pub fn internal_graph(&self) -> ALGraph { + self.graph.clone() + } + + /// Return the internal types array. + /// + /// Normally this should not be used. + pub fn internal_types(&self) -> Vec> { + self.types.clone() + } + + /// Return the array of parents. + /// + /// The element at index N of the returned array is the parent of + /// that N-th element. If an element has no parents, a `None` is + /// placed at its place. + /// + /// # Error + /// + /// If some edge points to an invalid node, an erro will be + /// returned. + /// + /// Also, if the graph contains cycles, an error is also returned. + pub fn parents_array(&self) -> Result>, Error> { + if self.graph.contains_cycles().map_err(|e| match e { + GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n), + _ => unreachable!(), + })? { + return Err(Error::Cycle); + } + + let len = self.len(); + + let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect(); + + for source in self.graph.nodes() { + for (index, target) in self + .graph + .children_of(source) + .map_err(|_| Error::UnknownNode(source))? + .enumerate() + { + result + .get_mut(target) + .ok_or(Error::UnknownNode(target))? + .replace((source, index)); + } + } + + Ok(result) + } } // REVIEW: This may not be needed. -impl Cata, A> for &DefaultRegex +impl Cata, A> for &DefaultRegex where A: Algebra>, { @@ -118,8 +223,13 @@ where } } -impl Display for DefaultRegex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl DefaultRegex { + /// Use a function to format the labels of the regular expressions + /// and format the entire regular expression with this aid. + pub fn to_string_with(&self, mut f: F) -> Result + where + F: FnMut(T) -> String, + { #[derive(Copy, Clone)] enum StackElement { Seen(usize), @@ -134,16 +244,15 @@ impl Display for DefaultRegex { } } - fn is_seen(&self) -> bool { - match self { - Seen(_) => true, - Unseen(_) => false, - } + fn is_seen(self) -> bool { + matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; + let mut result = String::new(); + let mut stack: Vec = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); @@ -159,6 +268,12 @@ impl Display for DefaultRegex { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); + + if self.degree(top.index()).unwrap() > 1 { + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); + } + stack.extend( self.graph .children_of(top.index()) @@ -167,7 +282,7 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "*")?; + write!(result, "*")?; } } RegexType::Plus => { @@ -181,7 +296,7 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "+")?; + write!(result, "+")?; } } RegexType::Optional => { @@ -195,12 +310,12 @@ impl Display for DefaultRegex { .rev(), ); } else { - write!(f, "?")?; + write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { - write!(f, "(")?; + write!(result, "(")?; let len = self.len(); @@ -221,11 +336,11 @@ impl Display for DefaultRegex { } } } else { - write!(f, "|")?; + write!(result, "|")?; } } RegexType::Paren => { - write!(f, ")")?; + write!(result, ")")?; } RegexType::Empty => { stack.extend( @@ -236,11 +351,17 @@ impl Display for DefaultRegex { .rev(), ); } - RegexType::Lit(label) => write!(f, "{label}")?, + RegexType::Lit(label) => write!(result, "{}", f(label))?, } } - Ok(()) + Ok(result) + } +} + +impl Display for DefaultRegex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string_with(|t| format!("{t}"))?) } } @@ -278,9 +399,20 @@ impl Graph for DefaultRegex { fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl graph::Builder) { + unimplemented!() + } } -impl Regex> for DefaultRegex { +impl Regex> for DefaultRegex { + /// Return the root of the regular expression. + #[inline] + fn root(&self) -> Option { + self.root + } + #[inline] fn vertex_label(&self, node_id: usize) -> Result, Error> { self.types @@ -291,8 +423,10 @@ impl Regex> for DefaultRegex { } /// An error type for holding parsing errors. -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum ParseError { + /// A cycle is encountered. + Cycle, /// Encounter an invalid state. Invalid, /// An error from graph operations. @@ -354,23 +488,77 @@ impl Display for ParseDirection { } } -impl DesRec for DefaultRegex { +/// A default recursive descent parser for regular expressions of +/// terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct DefaultRegParser { + ter_map: HashMap, + non_map: HashMap, + _phantom: PhantomData, +} + +impl DefaultRegParser { + /// Query if a terminal or a non-terminal is already found. + /// + /// If found, return the associated index of the terminal or + /// non-terminal. + pub fn query(&self, tnt: &str, terminal_p: bool) -> Option { + if terminal_p { + self.ter_map.get(tnt).copied() + } else { + self.non_map.get(tnt).copied() + } + } + + /// Add a terminal or a non-terminal. + pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) { + if terminal_p { + let ter_len = self.ter_map.len(); + + self.ter_map.entry(tnt.to_owned()).or_insert(ter_len); + } else { + let non_len = self.non_map.len(); + + self.non_map.entry(tnt.to_owned()).or_insert(non_len); + } + } +} + +impl Default for DefaultRegParser { + fn default() -> Self { + Self { + ter_map: Default::default(), + non_map: Default::default(), + _phantom: PhantomData, + } + } +} + +impl DesRec for DefaultRegParser { type Label = RegexType; - type Regex = Self; + type Regex = DefaultRegex; type Error = ParseError; - type Aux = ParseDirection; + type Inter = ParseDirection; - type Scanner<'a> = - Box Result, Self::Error>>; + type Scanner<'a, 'b> = Box< + dyn FnMut( + &'b Self, + &'a str, + ) -> Result, Self::Error>, + > where RegexType:'b; - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, mut input: &'a str, - mut scanner: Self::Scanner<'a>, + mut scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result, &'a str)>, Self::Error> { + ) -> Result, &'a str)>, Self::Error> + where + Self::Label: 'b, + { use ParseDirection::*; use RegexType::*; @@ -380,7 +568,7 @@ impl DesRec for DefaultRegex { // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { - if let Some((len, label, direction)) = scanner(input)? { + if let Some((len, label, direction)) = scanner(self, input)? { if len == 0 { break; } @@ -523,7 +711,12 @@ impl DesRec for DefaultRegex { let graph = list_of_children.into(); - let result = DefaultRegex { graph, types }; + // Check there are no cycles + let result = DefaultRegex::new(graph, types).map_err(|e| match e { + Error::Graph(ge) => ParseError::Graph(ge), + Error::Cycle => ParseError::Cycle, + _ => unreachable!(), + })?; Ok(Some((result, input))) } @@ -533,9 +726,17 @@ impl DesRec for DefaultRegex { mod test_des_rec { use super::*; - fn test_scanner( - input: &str, - ) -> Result, ParseDirection)>, ParseError> { + use crate::desrec::DesRec; + + #[allow(dead_code, unused)] + fn test_scanner<'a, 'b, T>( + parser: &'b DefaultRegParser, + input: &'a str, + ) -> Result, ParseDirection)>, ParseError> + where + T: GraphLabel + Display + Debug, + T: 'b, + { use ParseDirection::*; use RegexType::*; @@ -557,19 +758,57 @@ mod test_des_rec { #[test] fn test_des_rec() -> Result<(), Box> { - let input_string = "a*b?c+|(d*| +)?".to_owned(); + let input_string = "(ade)*b?c+|(d*| +)?".to_owned(); + + let parser: DefaultRegParser = Default::default(); if let Some((regex, remain)) = - DefaultRegex::::parse(&input_string, Box::new(test_scanner), true)? + DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); + let parents = regex.parents_array()?; + + println!("parents = {parents:?}"); + Ok(()) } else { unreachable!() } } + + #[test] + fn test_display() -> Result<(), Box> { + use graph::builder::Builder; + use RegexType::*; + + let mut builder = graph::adlist::ALGBuilder::default(); + let mut types: Vec> = Vec::with_capacity(4); + + types.push(Kleene); + builder.add_vertex(); + + types.push(Lit(0)); + builder.add_vertex(); + builder.add_edge(0, 1, ())?; + + types.push(Lit(1)); + builder.add_vertex(); + builder.add_edge(0, 2, ())?; + + types.push(Lit(2)); + builder.add_vertex(); + builder.add_edge(0, 3, ())?; + + let graph = builder.build(); + + let regex = DefaultRegex::new(graph, types)?; + + println!("regex = {regex}"); + + Ok(()) + } } diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs index c57d313..ac45c2d 100644 --- a/nfa/src/desrec.rs +++ b/nfa/src/desrec.rs @@ -20,8 +20,8 @@ pub trait DesRec { /// The type of errors encountered when parsing. type Error: std::error::Error; - /// Auxiliary data when parsing - type Aux; + /// Intermediate data when parsing + type Inter; /// The type of a scanner that classifies inputs into tokens. /// @@ -34,7 +34,14 @@ pub trait DesRec { /// /// - `Ok(Some(_))`: the classification succeeds and the parsing /// should continue. - type Scanner<'a>: FnMut(&'a str) -> Result, Self::Error>; + type Scanner<'a, 'b>: FnMut( + &'b Self, + &'a str, + ) + -> Result, Self::Error> + where + Self: 'b, + Self::Label: 'b; /// Parse a string into a regular expression with the aid of this /// type. @@ -42,9 +49,12 @@ pub trait DesRec { /// Accept a slice of string and return either a parsing error, or /// a pair of correctly parsed regular expression and the /// remaining slice. - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, input: &'a str, - scanner: Self::Scanner<'a>, + scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result, Self::Error>; + ) -> Result, Self::Error> + where + Self::Label: 'b; } diff --git a/nfa/src/error.rs b/nfa/src/error.rs index 0c6bb3c..ad75077 100644 --- a/nfa/src/error.rs +++ b/nfa/src/error.rs @@ -2,10 +2,11 @@ use graph::error::Error as GError; -use std::fmt::{Display, Formatter}; +use core::fmt::{Display, Formatter}; -#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] /// The error type for NFA operations. +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[non_exhaustive] pub enum Error { /// An unknown node id is encountered. UnknownNode(usize), @@ -13,8 +14,12 @@ pub enum Error { /// /// Everything is a trade-off, wink wink. UnsupportedOperation, + /// There is no root in the underlying regular expression. + NoRoot, /// This error comes from some underlying graph operation. Graph(GError), + /// A cycle is found when constructing regular expressions. + Cycle, } impl Display for Error { @@ -23,6 +28,8 @@ impl Display for Error { Error::UnknownNode(id) => write!(f, "unknown node: {id}"), Error::UnsupportedOperation => write!(f, "unsupported operation"), Error::Graph(e) => write!(f, "graph error: {e}"), + Error::NoRoot => write!(f, "no root found"), + Error::Cycle => write!(f, "a cycle is found when constructing regular expressions"), } } } diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs index 1e2a30c..b1d62b3 100644 --- a/nfa/src/lib.rs +++ b/nfa/src/lib.rs @@ -12,17 +12,23 @@ extern crate graph; use core::fmt::Display; +use std::ops::{Deref, DerefMut}; + use graph::{Graph, GraphLabel, LabelGraph}; use error::Error; +pub use desrec::DesRec; + +use default::regex::RegexType; + /// The expected behaviour of a regular language. /// /// Nondeterministic finite automata are equivalent to regular /// languages. Since regular languages are easier to understand for a /// human being, nondeterministic finite automata include the data for /// the equivalent regular languages. -pub trait Regex: Graph + Display { +pub trait Regex: Graph + Display + Clone { /// Return the label of a vertex, or an error if the node is /// invalid. fn vertex_label(&self, node_id: usize) -> Result; @@ -52,6 +58,65 @@ pub trait Regex: Graph + Display { // cetera. These might be needed later. } +/// Since Option does not inherit the Display from T, we wrap it to +/// provide an automatic implementation of Display. +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct DOption(Option); + +impl Default for DOption { + fn default() -> Self { + Self(None) + } +} + +impl Deref for DOption { + type Target = Option; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DOption { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl Display for DOption { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.deref() { + Some(value) => Display::fmt(value, f), + None => write!(f, "ε"), + } + } +} + +/// Substitute or Carry +/// +/// This enumeration indicates whether a label from a regular +/// expression should be substituted by another regular expression, or +/// to be carried around in the resulting nondeterministic finite +/// automaton, in the process of the [`to_nfa`][Nfa::to_nfa] function. +/// +/// # Transform labels +/// +/// The label that is returned to be carried can be different from the +/// original label, as a way to transform the labels. +/// +/// # Remark on the abbreviation +/// +/// It happens "by chance" that this abbreviation coincides with the +/// abbreviation of "system on chip". Since I know nothing about this +/// topic, this is just a meaningless pun. +#[derive(Debug, Copy, Clone)] +pub enum SoC { + /// To be substituted by another regular expression. + Sub(usize), + /// To carry around this label. + Carry(T), +} + /// The expected behvaiour of a nondeterministic finite automaton. /// /// Every NFA is a special labelled graph. @@ -68,9 +133,23 @@ pub trait Nfa: LabelGraph { Err(Error::UnsupportedOperation) } - /// Build a nondeterministic finite automaton out of a regular - /// language. - fn to_nfa(regex: impl Regex) -> Self; + /// When we convert a regular expression to a nondeterministic + /// finite automaton, the label should be optional, so we use a + /// different type for the result type. + type FromRegex; + + /// Build a nondeterministic finite automaton out of a set of + /// regular expressions. + /// + /// The SUB_PRED is a predicate function that returns an optional + /// index for a label. If it returns some index, then this means + /// we should substitute the index-th regular expression in the + /// set, whereever that label occurs in the set of regular + /// expressions. + fn to_nfa( + regexps: &[impl Regex>], + sub_pred: impl Fn(T) -> Result, Error>, + ) -> Result>, Error>; /// Remove all dead states from the nondeterministic finite /// automaton. @@ -78,13 +157,14 @@ pub trait Nfa: LabelGraph { /// A state is dead if there are no edges going to the state. fn remove_dead(&mut self) -> Result<(), Error>; - /// For each empty transition from A to B, and for every edge from - /// B to C, say, add an edge from A to C. + /// For each edge from A to B whose edge is considered nullable by + /// a function `f`, and for every edge from B to C, add an edge + /// from A to C. /// /// This is needed specifically by the algorithm to produce a set /// of atomic languages that represent "null-closed" derived /// languages. - fn nulling(&mut self) -> Result<(), Error>; + fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>; } pub mod default; diff --git a/repcore/Cargo.toml b/repcore/Cargo.toml deleted file mode 100644 index 7416ad5..0000000 --- a/repcore/Cargo.toml +++ /dev/null @@ -1,8 +0,0 @@ -[package] -name = "repcore" -version = "0.1.0" -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] diff --git a/repcore/src/grammar.rs b/repcore/src/grammar.rs deleted file mode 100644 index ee9f033..0000000 --- a/repcore/src/grammar.rs +++ /dev/null @@ -1,59 +0,0 @@ -//! This file implements the extected behaviours of grammars. - -// NOTE: We shall first start with a parser that works at the level of -// characters. The purpose is to first experiment with the workings -// and the performance of the algorithms, before optimising by using -// regular expressions to classify inputs into tokens. In other -// words, the current focus is not on the optimisations, whereas -// scanners are for optimisations only, so to speak. - -/// The type of a terminal. -/// -/// For the time being this is a wrapper around a string, but in the -/// future it may hold more information of scanners. -pub struct Terminal { - // If we want to use scanners, per chance add them as a new field - // here. - name: String, -} - -impl Terminal { - /// Create a terminal with the given name. - #[inline] - pub fn new(name: String) -> Self { - Self { name } - } - - /// Return the name of the terminal. - #[inline] - pub fn name(&self) -> &str { - &self.name - } -} - -/// The type of a non-terminal. -/// -/// This is just a wrapper around a string. -pub struct Nonterminal(String); - -impl Nonterminal { - /// Return the name of the nonterminal. - /// - /// Just to improve readability. - #[inline] - pub fn name(&self) -> &str { - &self.0 - } -} - -/// The type of a terminal or a non-terminal. -/// -/// Only an index is stored here. Actual data are stored in two other -/// arrays. -#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum TNT { - /// Terminal variant - Ter(usize), - /// Nonterminal variant - Non(usize), -} diff --git a/repcore/src/lib.rs b/repcore/src/lib.rs deleted file mode 100644 index 589b61c..0000000 --- a/repcore/src/lib.rs +++ /dev/null @@ -1,20 +0,0 @@ -//! This package implements the core algorithm of the entire -//! workspace: parsing with derivatives by means of regular nulling -//! languages. - -pub mod grammar; - -pub fn add(left: usize, right: usize) -> usize { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} diff --git a/repcore/src/plan.org b/repcore/src/plan.org deleted file mode 100644 index 13c2ee0..0000000 --- a/repcore/src/plan.org +++ /dev/null @@ -1,59 +0,0 @@ -#+TITLE: Plan of the package -#+AUTHOR: Durand -#+DATE: <2022-11-18 Ven 19:57> - -* Atomic Languages - -This describes the behaviours of atomic languages. The atomic -language consists of the null closure of any non-terminal symbol in -the grammar, and their deriavtives by terminals and non-terminal. - -* Derivative Languages - -This is the main driving device of the algorithm. Basically, the -algorithm works by taking successive derivatives, according to the -input symbol. At each step, we calculate the derivative language. In -this process, we also compute some semiring value and store in a -carrier. The end result of the algorithm is the final semiring -value. - -If one wants simply to determine if the input string belongs to the -grammar, one chooses the semiring to be the field with two elements, -the boolean field. If one wants to find how many ways are there to -derive a given input string, then one uses the semiring of natural -numbers instead. If one wants, moreover, to find all the possible -ways to derive a particular input string, then one can use the -free semiring on the set of terminals and non-terminals of the -grammar. Here the free semiring is the left-adjoint functor to the -forgetful functor from the category of semirings to the category of -sets. - -To be more specific, the free semiring on a set is given by sets of -sequences of elements in the set. The addition of the semiring is the -set union operation, and the multiplication is taking the respective -concatenations. - -** Semirings - -So we need a module to define the behaviours of semirings, and provide -some common semirings implementations. Then in the main driving force -we can freely substitute different semirings, according to the -particular needs. - -** Languages - -Then the main part is to define the behaviour of languages. This -should be easy enough, since we already have the mechanism of graphs, -nondeterministic automata, and semirings. All we need to do is to -combine them together. - -* Testing ground - -I am in a strong need to test things out. The most important one is -to visualize each step of the derivation, in a human-friendly manner. -I need this to examine whether my atomic languages are wrongly -implemented, or my atomic languages are wrongly derived, or my -understanding of the main algorithm is plain wrong. - -This is the main reason I started this rewrite of the package. - diff --git a/viz/Cargo.toml b/viz/Cargo.toml new file mode 100644 index 0000000..2894c21 --- /dev/null +++ b/viz/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "viz" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +graph = { path = "../graph" } \ No newline at end of file diff --git a/viz/src/lib.rs b/viz/src/lib.rs new file mode 100644 index 0000000..b40c78a --- /dev/null +++ b/viz/src/lib.rs @@ -0,0 +1,31 @@ +//! This library defines a binary format to store graph data such that +//! a client which understands the format can easily print the graph, +//! centered at any specific node, with any scope. The library also +//! provides functions to print graphs in this format. +//! +//! This way, one does not need a server to print and interact with +//! graphs. +//! +//! # Graph representation +//! +//! The library uses the [`Graph`][graph::Graph] trait from the crate +//! [graph]. Only the functions exposed from the trait are used by +//! this library. So the users can also implement the trait and then +//! print the graph data represented by other graph crates. + +extern crate graph; + +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} -- cgit v1.2.3-18-g5258