From 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 23 Dec 2022 00:36:31 +0800 Subject: renaming core to chain and some other changes Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations. --- chain/Cargo.toml | 10 + chain/src/grammar.rs | 1247 ++++++++++++++++++++++++++++++++++++++++++++++++++ chain/src/lib.rs | 25 + chain/src/plan.org | 94 ++++ 4 files changed, 1376 insertions(+) create mode 100644 chain/Cargo.toml create mode 100644 chain/src/grammar.rs create mode 100644 chain/src/lib.rs create mode 100644 chain/src/plan.org (limited to 'chain') diff --git a/chain/Cargo.toml b/chain/Cargo.toml new file mode 100644 index 0000000..32eed8b --- /dev/null +++ b/chain/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "chain" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nfa = { path = "../nfa" } +graph = { path = "../graph" } \ No newline at end of file diff --git a/chain/src/grammar.rs b/chain/src/grammar.rs new file mode 100644 index 0000000..287fbca --- /dev/null +++ b/chain/src/grammar.rs @@ -0,0 +1,1247 @@ +#![warn(missing_docs)] +//! This file implements the extected behaviours of grammars. + +// NOTE: We shall first start with a parser that works at the level of +// characters. The purpose is to first experiment with the workings +// and the performance of the algorithms, before optimising by using +// regular expressions to classify inputs into tokens. In other +// words, the current focus is not on the optimisations, whereas +// scanners are for optimisations only, so to speak. + +#![allow(unused_imports)] +use nfa::{ + default::{ + nfa::DefaultNFA, + regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, + }, + DOption, DesRec, Nfa, Regex, SoC, +}; + +use graph::{adlist::ALGBuilder, builder::Builder, Graph}; + +use std::{ + collections::HashSet, + fmt::{Display, Write}, +}; + +/// The type of a terminal. +/// +/// For the time being this is a wrapper around a string, but in the +/// future it may hold more information of scanners. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Terminal { + // If we want to use scanners, per chance add them as a new field + // here. + name: String, +} + +impl Terminal { + /// Create a terminal with the given name. + #[inline] + pub fn new(name: String) -> Self { + Self { name } + } + + /// Return the name of the terminal. + #[inline] + pub fn name(&self) -> &str { + &self.name + } +} + +/// The type of a non-terminal. +/// +/// This is just a wrapper around a string. +#[derive(Debug, Clone)] +pub struct Nonterminal(String); + +impl Nonterminal { + /// Return the name of the nonterminal. + /// + /// Just to improve readability. + #[inline] + pub fn name(&self) -> &str { + &self.0 + } +} + +/// The type of a terminal or a non-terminal. +/// +/// Only an index is stored here. Actual data are stored in two other +/// arrays. +#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum TNT { + /// Terminal variant + Ter(usize), + /// Nonterminal variant + Non(usize), +} + +impl Display for TNT { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TNT::Ter(t) => write!(f, "T({t})"), + TNT::Non(n) => write!(f, "N({n})"), + } + } +} + +/// Errors related to grammar operations. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum Error { + /// The first component is the index, and the second the bound. + IndexOutOfBounds(usize, usize), + /// Fail to build the N-th regular expression, due to the + /// ParseError. + BuildFail(usize, ParseError), + /// fail to build NFA + NFAFail(nfa::error::Error), +} + +impl From for Error { + fn from(nfae: nfa::error::Error) -> Self { + Self::NFAFail(nfae) + } +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), + Error::BuildFail(n, pe) => write!( + f, + "Failed to build the {n}-th regular expression due to error: {pe}" + ), + Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + } + } +} + +impl std::error::Error for Error {} + +/// A rule is a regular expression of terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct Rule { + regex: DefaultRegex, +} + +impl Rule { + /// Return true if and only if the rule is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.regex.is_empty() + } + + /// Return the length of the rule. + #[inline] + pub fn len(&self) -> usize { + self.regex.len() + } +} + +/// The type of a grammar. +#[derive(Debug, Clone, Default)] +pub struct Grammar { + ter: Vec, + non: Vec, + rules: Vec, + firsts: Vec>>, + first_nodes: Vec>, +} + +/// A private type to aid the recursive looping of rergular +/// expressions. +#[derive(Copy, Clone)] +enum StackElement { + Seen(usize), + Unseen(usize), +} + +impl StackElement { + fn index(self) -> usize { + match self { + Self::Seen(index) => index, + Self::Unseen(index) => index, + } + } + + fn is_seen(self) -> bool { + matches!(self, Self::Seen(_)) + } +} + +impl Grammar { + /// Construct a grammar from a vector of terminals, a vector of + /// non-terminals, and a vector of rules for the non-temrinals. + /// + /// # Panic + /// + /// If the length of `non` is not equal to that of `rules`, then + /// the function panics. + pub fn new(ter: Vec, non: Vec, rules: Vec) -> Self { + assert_eq!(non.len(), rules.len()); + + // One more room is reserved for the `None` value. + let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) + .take(non.len()) + .collect(); + + let first_nodes = rules + .iter() + .map(|rule| Vec::with_capacity(rule.len())) + .collect(); + + Self { + ter, + non, + rules, + firsts, + first_nodes, + } + } + + /// Return the name of a terminal or a non-terminal. + pub fn name_of_tnt(&self, tnt: TNT) -> Result { + match tnt { + TNT::Ter(t) => Ok(format!( + "T{}", + self.ter + .get(t) + .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? + .name() + )), + TNT::Non(n) => Ok(format!( + "N{}", + self.non + .get(n) + .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? + .name() + )), + } + } + + /// Return true if and only if there are no non-terminals in the + /// grammar. + #[inline] + pub fn is_empty(&self) -> bool { + self.non.is_empty() + } + + /// Return the total length of all rules. + #[inline] + pub fn total(&self) -> usize { + self.rules.iter().map(Rule::len).sum() + } + + /// Return the number of terminals. + #[inline] + pub fn ter_num(&self) -> usize { + self.ter.len() + } + + /// Return the number of non-terminals. + #[inline] + pub fn non_num(&self) -> usize { + self.non.len() + } + + /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a + /// single number to represent terminals and non-terminals. + /// + /// # Bounds + /// + /// If a terminal index is greater than or equal to the number of + /// terminals, then this signals an error; mutatis mutandis for + /// non-terminals. + /// + /// # Related + /// + /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. + #[inline] + pub fn pack_tnt(&self, tnt: TNT) -> Result { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + match tnt { + TNT::Ter(t) => { + if t >= ter_num { + Err(Error::IndexOutOfBounds(t, ter_num)) + } else { + Ok(t) + } + } + TNT::Non(n) => { + if n >= non_num { + Err(Error::IndexOutOfBounds(n, non_num)) + } else { + Ok(n + ter_num) + } + } + } + } + + /// Convert a single number to either a terminal or a + /// non-terminal. + /// + /// # Bounds + /// + /// If the number is greater than or equal to the sum of the + /// numbers of terminals and of non-terminals, then this signals + /// an error. + /// + /// # Related + /// + /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. + #[inline] + pub fn unpack_tnt(&self, flat: usize) -> Result { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + if flat < ter_num { + Ok(TNT::Ter(flat)) + } else if flat < ter_num + non_num { + Ok(TNT::Non(flat - ter_num)) + } else { + Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) + } + } + + /// Return true if and only if the non-terminal is nullable. + #[inline] + pub fn is_nullable(&self, non_terminal: usize) -> Result { + Ok(self + .firsts + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .contains(&None)) + } + + /// For a NON_TERMINAL, return an iterator that goes over the + /// nodes that are reachable from the non-terminal through an + /// empty transition of the nondeterministic finite automaton. + #[inline] + pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { + Ok(self + .first_nodes + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .iter()) + } + + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? + { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? + .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } + + /// Return the regular language of the left-linear closures of + /// non-terminals in the grammar. + /// + /// The resulting vector is guaranteed to be of the same length as + /// the number of non-terminals. + /// + /// The resulting regular language is not "self-contained". That + /// is to say, its terminals indices are packed indices and are + /// meaningless without the interpretation of the grammar. They + /// should be converted to a nondeterministic finite automaton and + /// then to its null closure later on. + pub fn left_closure(&self) -> Result>, Error> { + let non_len = self.non_num(); + + let mut result = Vec::with_capacity(non_len); + + for (n, rule) in self.rules.iter().enumerate() { + let regex = &rule.regex; + + let regex_root = if let Some(root) = regex.root() { + root + } else { + result.push(Default::default()); + + continue; + }; + + let regex_len = regex.len(); + + /// A convenient macro to retrieve the children from the + /// original regular expression with error propagation. + macro_rules! children { + ($n:expr) => { + regex + .children_of($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + /// A convenient macro to retrieve the label from the + /// original regular expression with error propagation. + macro_rules! label { + ($n:expr) => { + regex + .vertex_label($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + let parents = regex.parents_array().map_err(|e| match e { + nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), + nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), + _ => unreachable!(), + })?; + + use RegexType::*; + use TNT::*; + + let mut local_result: Vec> = Vec::with_capacity(regex_len * 2); + let mut builder = ALGBuilder::default(); + + /// Perform a depth-first copy + macro_rules! df_copy { + ($parent:expr, $index:expr) => { + match label!($index) { + Kleene | Plus | Optional | Or | Paren | Empty => { + let mut stack = vec![($parent, $index)]; + + while let Some((top_parent, top_index)) = stack.pop() { + let label = label!(top_index); + let label = match label { + Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), + _ => label, + }; + + local_result.push(label); + + let new = builder.add_vertex(); + + builder.add_edge(top_parent, new, ()).unwrap(); + + stack.extend(children!(top_index).map(|child| (new, child))); + } + } + Lit(remain_tnt) => { + local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); + let new = builder.add_vertex(); + builder.add_edge($parent, new, ()).unwrap(); + } + } + }; + } + + local_result.push(Or); + builder.add_vertex(); + + local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + let non_lit_index = builder.add_vertex(); + + builder.add_edge(0, non_lit_index, ()).unwrap(); + + for first_node in self.first_nodes_of(n)?.copied() { + assert!(first_node < parents.len()); + + let tnt = match label!(first_node) { + Lit(tnt) => tnt, + _ => continue, + }; + + let mut parents_chain = { + let mut result = Vec::new(); + let mut stack = Vec::with_capacity(parents.len()); + + stack.push(first_node); + + while let Some(top) = stack.pop() { + assert!(top < parents.len()); + if let Some(parent) = parents.get(top).copied().unwrap() { + result.push(parent); + stack.push(parent.0); + } + } + + result.reverse(); + + result + }; + + if parents_chain.is_empty() { + local_result.push(Lit(tnt)); + let lit_index = builder.add_vertex(); + builder.add_edge(0, lit_index, ()).unwrap(); + + continue; + } + + assert!(parents_chain.first().unwrap().0 == regex_root); + + // A different, "more local", root. + let mut root: usize; + + // Handle the direct parent + let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); + + match label!(parent_node) { + Kleene | Plus => { + local_result.extend([Empty, Lit(tnt)]); + + root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(root, lit_index, ()).unwrap(); + + let iterator = children!(parent_node); + + for index in iterator.clone().skip(parent_edge_index + 1) { + df_copy!(root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + + Or => { + local_result.push(Lit(tnt)); + root = builder.add_vertex(); + } + Optional | Empty => { + // If this path is taken, it should not be + // optional. + local_result.extend([Empty, Lit(tnt)]); + root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(root, lit_index, ()).unwrap(); + + for index in children!(parent_node).skip(parent_edge_index + 1) { + df_copy!(root, index); + } + } + Paren | Lit(_) => unreachable!(), + } + + // Handle successive parents + + for (node, edge_index) in parents_chain.into_iter() { + let node_type = label!(node); + + match node_type { + Kleene | Plus => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, root, ()).unwrap(); + + root = new_index; + + let iterator = children!(node); + + for index in iterator.clone().skip(edge_index + 1) { + df_copy!(root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + RegexType::Or => {} + RegexType::Optional | RegexType::Empty => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, root, ()).unwrap(); + root = new_index; + + for index in children!(node).skip(edge_index + 1) { + df_copy!(root, index); + } + } + RegexType::Paren | RegexType::Lit(_) => unreachable!(), + } + } + + builder.add_edge(0, root, ()).unwrap(); + } + + local_result.shrink_to_fit(); + + let graph = builder.build(); + + assert_eq!(graph.nodes_len(), local_result.len()); + + result.push( + DefaultRegex::new(graph, local_result) + .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, + ); + } + + assert_eq!(result.len(), non_len); + + Ok(result) + } + + /// Convert the regular language of left-linear closures to its + /// equivalent nondeterministic finite automaton. + /// + /// In the generation of the left-linear closure, the terminals + /// and non-terminals are packed into an unsigned integer. We + /// unpack them in converting to nondeterministic finite + /// automaton. + /// + /// The resulting nondeterministic finite automaton should be + /// transformed to its null-closure for use in our algorithm. + pub fn left_closure_to_nfa( + &self, + closure: &[DefaultRegex], + ) -> Result>, Error> { + let label_transform = |tnt| match tnt { + TNT::Ter(t) => { + let new_tnt = self.unpack_tnt(t).map_err(|e| match e { + Error::IndexOutOfBounds(index, bound) => { + graph::error::Error::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + })?; + + Ok(SoC::Carry(new_tnt)) + } + TNT::Non(n) => Ok(SoC::Sub(n)), + }; + + DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into) + } +} + +impl Display for Grammar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + assert_eq!(self.non.len(), self.rules.len()); + + for (nt, rule) in self.non.iter().zip(self.rules.iter()) { + write!(f, "{}: ", nt.name())?; + + writeln!( + f, + "{}", + rule.regex.to_string_with(|tnt| format!( + "({})", + self.name_of_tnt(tnt) + .unwrap_or_else(|_| format!("Unknown {tnt:?}")) + ))? + )?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod test_grammar_helper { + use super::*; + use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}; + use std::fmt::Write; + + // Construct a grammar to test + pub fn new_grammar() -> Result> { + let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("end".to_owned()), + ]; + + /// A function to scan the inputs. + fn scan_tnt( + parser: &DefaultRegParser, + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + use TNT::*; + + let mut chars = input.chars(); + + if let Some(first) = chars.next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + 'T' => { + let mut name = String::new(); + let mut len = 1; + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(t) = parser.query(&name, true) { + Ok(Some((len, Lit(Ter(t)), Right))) + } else { + Err(ParseError::InvalidCharacter(first)) + } + } + 'N' => { + let mut name = String::new(); + let mut len = 1; + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(n) = parser.query(&name, false) { + Ok(Some((len, Lit(Non(n)), Right))) + } else { + Err(ParseError::InvalidCharacter(first)) + } + } + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + let mut regex_parser: DefaultRegParser = Default::default(); + + regex_parser.add_tnt("a", true); + regex_parser.add_tnt("b", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("end", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nstart?Nend*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) + } +} + +#[cfg(test)] +mod test_grammar_display { + use super::test_grammar_helper::new_grammar; + + #[test] + fn test_display() -> Result<(), Box> { + println!("{}", new_grammar()?); + + Ok(()) + } +} + +#[cfg(test)] +mod test_grammar_firsts { + use super::test_grammar_helper::new_grammar; + use super::*; + + #[test] + fn test_firsts() -> Result<(), Box> { + let mut grammar = new_grammar()?; + + grammar.compute_firsts()?; + + println!("grammar firsts: {:?}", grammar.firsts); + println!("grammar first nodes: {:?}", grammar.first_nodes); + + Ok(()) + } +} + +#[cfg(test)] +mod test_grammar_left_closure { + use super::test_grammar_helper::new_grammar; + use super::*; + + pub fn new_closure_regex( + grammar: &mut Grammar, + ) -> Result>, Box> { + grammar.compute_firsts()?; + + println!("grammar firsts: {:?}", grammar.firsts); + println!("grammar first nodes: {:?}", grammar.first_nodes); + println!("grammar:"); + println!("{grammar}"); + + grammar.left_closure().map_err(Into::into) + } + + #[test] + fn test_regex() -> Result<(), Box> { + let mut grammar = new_grammar()?; + + let vec_of_regexps = new_closure_regex(&mut grammar)?; + + for regex in &vec_of_regexps { + println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?); + // println!("regex: {regex:?}",); + println!("regex len = {}", regex.nodes_len()); + } + + Ok(()) + } + + #[test] + fn test_nfa() -> Result<(), Box> { + let mut grammar = new_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + let nfa = grammar.left_closure_to_nfa(&closure)?; + // TODO: print the nfa out + Ok(()) + } +} diff --git a/chain/src/lib.rs b/chain/src/lib.rs new file mode 100644 index 0000000..0ec4d4c --- /dev/null +++ b/chain/src/lib.rs @@ -0,0 +1,25 @@ +//! This package implements the core algorithm of the entire +//! workspace: parsing with derivatives by means of chain rule and +//! regular nulling languages. +//! +//! Since I shall not name my crate "core" to avoid collisions with +//! the Rust's own core, I decided to name this crate after what I +//! think is the essence of this algorithm, the chain-rule for +//! derivatives of languages. + +pub mod grammar; + +pub fn add(left: usize, right: usize) -> usize { + left + right +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn it_works() { + let result = add(2, 2); + assert_eq!(result, 4); + } +} diff --git a/chain/src/plan.org b/chain/src/plan.org new file mode 100644 index 0000000..8c63492 --- /dev/null +++ b/chain/src/plan.org @@ -0,0 +1,94 @@ +#+TITLE: Plan of the package +#+AUTHOR: Durand +#+DATE: <2022-11-18 Ven 19:57> + +* Things to do [2/7] + +- [X] Implement builders for graphs +- [X] Find sets of the first terminals for each non-terminal, in the + grammar module. +- [-] NFA [4/5] + + [X] Add regular expression into NFA. + + [X] Convert a set of regular expressions into a nondeterministic + finite automaton, where non-terminals denote the indices of + regular expressions in the set to substitute. + + [X] Convert a grammar into its grammar of left-linear closures of + non-temrinals and their derivatives. + + [X] Convert nondeterministic finite automata to their null + closures. + + [ ] Test more grammars to ensure it works correctly. +- [ ] Define the Atom trait. +- [ ] Implement virtual nodes for each derivative of each atom: The + lack of this step might be the root cause of the failure of the + previous version of this package. +- [ ] Implement languages. [0/2] + + [ ] First implement them as doubly labelled (directed acyclic) + graphs. + + [ ] Implement finding derivatives. +- [ ] Implement semiring. [0/5] + + [ ] Define the trait. + + [ ] Implement the boolean semiring. + + [ ] Implement the natural number semiring. + + [ ] Implement the free semiring. + + [ ] Compute and record a semiring value when computing + derivatives. + +* Atomic Languages + +This describes the behaviours of atomic languages. The atomic +language consists of the null closure of any non-terminal symbol in +the grammar, and their deriavtives by terminals and non-terminal. + +* Script for creating GIF animation + +[[https://gist.github.com/maelvls/5379127][a gist]] + +* Derivative Languages + +This is the main driving device of the algorithm. Basically, the +algorithm works by taking successive derivatives, according to the +input symbol. At each step, we calculate the derivative language. In +this process, we also compute some semiring value and store in a +carrier. The end result of the algorithm is the final semiring +value. + +If one wants simply to determine if the input string belongs to the +grammar, one chooses the semiring to be the field with two elements, +the boolean field. If one wants to find how many ways are there to +derive a given input string, then one uses the semiring of natural +numbers instead. If one wants, moreover, to find all the possible +ways to derive a particular input string, then one can use the +free semiring on the set of terminals and non-terminals of the +grammar. Here the free semiring is the left-adjoint functor to the +forgetful functor from the category of semirings to the category of +sets. + +To be more specific, the free semiring on a set is given by sets of +sequences of elements in the set. The addition of the semiring is the +set union operation, and the multiplication is taking the respective +concatenations. + +** Semirings + +So we need a module to define the behaviours of semirings, and provide +some common semirings implementations. Then in the main driving force +we can freely substitute different semirings, according to the +particular needs. + +** Languages + +Then the main part is to define the behaviour of languages. This +should be easy enough, since we already have the mechanism of graphs, +nondeterministic automata, and semirings. All we need to do is to +combine them together. + +* Testing ground + +I am in a strong need to test things out. The most important one is +to visualize each step of the derivation, in a human-friendly manner. +I need this to examine whether my atomic languages are wrongly +implemented, or my atomic languages are wrongly derived, or my +understanding of the main algorithm is plain wrong. + +This is the main reason I started this rewrite of the package. + -- cgit v1.2.3-18-g5258