Diffstat (limited to 'chain')
-rw-r--r-- | chain/Cargo.toml          |    7 |
-rw-r--r-- | chain/src/atom/default.rs |  342 |
-rw-r--r-- | chain/src/atom/mod.rs     |   41 |
-rw-r--r-- | chain/src/default.rs      |   46 |
-rw-r--r-- | chain/src/grammar.rs      | 1247 |
-rw-r--r-- | chain/src/lib.rs          |   49 |
-rw-r--r-- | chain/src/plan.org        |  154 |
7 files changed, 607 insertions(+), 1279 deletions(-)
diff --git a/chain/Cargo.toml b/chain/Cargo.toml
index 32eed8b..265954c 100644
--- a/chain/Cargo.toml
+++ b/chain/Cargo.toml
@@ -7,4 +7,9 @@ edition = "2021"
 
 [dependencies]
 nfa = { path = "../nfa" }
-graph = { path = "../graph" }
\ No newline at end of file +graph = { path = "../graph" } +grammar = { path = "../grammar" } +forest = { path = "../forest" } + +[dev-dependencies] +grammar = { path = "../grammar", features = ["test-helper"] } diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs new file mode 100644 index 0000000..72989b3 --- /dev/null +++ b/chain/src/atom/default.rs @@ -0,0 +1,342 @@ +//! This file provides a default implementation of the +//! [`Atom`][super::Atom] trait. + +use super::*; +use grammar::Grammar; +use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph}; +use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex}; + +use core::fmt::Display; +use std::collections::BTreeMap as Map; + +/// A virtual node represents the derivative of a non-terminal symbol +/// `S` with respect to a terminal symbol `t`. +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)] +struct VirtualNode { + s: usize, + t: usize, +} + +impl Display for VirtualNode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "VN[{}]^({})", self.s, self.t) + } +} + +impl VirtualNode { + fn new(s: usize, t: usize) -> Self { + Self { s, t } + } +} + +type VirtualMap = Map<VirtualNode, usize>; + +/// The type of atomic languages. +#[derive(Debug, Clone, Default)] +pub struct DefaultAtom { + grammar: Grammar, + nfa: DefaultNFA<DOption<TNT>>, + // NOTE: This is mostly for printing and debugging + regexp: Vec<DefaultRegex<TNT>>, + virtual_nodes: VirtualMap, +} + +impl Display for DefaultAtom { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let grammar = &self.grammar; + + let error_to_string = |err| format!("{err}"); + let tnt_to_string = |tnt, deco| { + grammar + .name_of_tnt(tnt) + .map(|name| format!("{deco}({name})")) + .unwrap_or_else(error_to_string) + }; + + let display_tnt = |tnt| match tnt { + TNT::Ter(t) => format!( + "({})", + grammar + .unpack_tnt(t) + .map(|tnt| tnt_to_string(tnt, "")) + .unwrap_or_else(error_to_string) + ), + TNT::Non(_) => tnt_to_string(tnt, "H"), + }; + + writeln!(f, "regular expressions:")?; + + let mut accumulators = Vec::with_capacity(self.regexp.len() + 1); + accumulators.push(0usize); + + for regex in self.regexp.iter() { + writeln!(f, "accumulator: {}", accumulators.last().unwrap())?; + + accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); + + let string = regex.to_string_with(display_tnt)?; + + writeln!(f, "{string}")?; + } + + writeln!(f, "total = {}", accumulators.last().unwrap())?; + + writeln!(f, "virtual nodes:")?; + + for (virtual_node, index) in self.virtual_nodes.iter() { + writeln!(f, "{virtual_node}: {index}")?; + } + + Ok(()) + } +} + +// Some boiler-plate delegation implementations for Graph and +// LabelGraph, in order to implement Nfa. 
+ +impl Graph for DefaultAtom { + type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b> + where + Self: 'b; + + fn is_empty(&self) -> bool { + self.nfa.is_empty() + } + + fn nodes_len(&self) -> usize { + self.nfa.nodes_len() + } + + fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GraphError> { + self.nfa.children_of(node_id) + } + + fn degree(&self, node_id: usize) -> Result<usize, GraphError> { + self.nfa.degree(node_id) + } + + fn is_empty_node(&self, node_id: usize) -> Result<bool, GraphError> { + self.nfa.is_empty_node(node_id) + } + + fn has_edge(&self, source: usize, target: usize) -> Result<bool, GraphError> { + self.nfa.has_edge(source, target) + } + + fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { + // NOTE: We cannot replace by a builder whose result is an + // atom, not the underlying graph type. + unimplemented!() + } +} + +impl LabelGraph<DOption<TNT>> for DefaultAtom { + type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b> + where + Self: 'b; + + type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b> + where + Self: 'b, + DOption<TNT>: 'b; + + type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a> + where + Self: 'a, + DOption<TNT>: 'a; + + #[inline] + fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> { + self.nfa.vertex_label(node_id) + } + + #[inline] + fn edge_label( + &self, + source: usize, + target: usize, + ) -> Result<Self::EdgeLabelIter<'_>, GraphError> { + self.nfa.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &DOption<TNT>, + ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> { + self.nfa.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GraphError> { + self.nfa.labels_of(node_id) + } + + #[inline] + fn has_edge_label( + &self, + node_id: usize, + label: &DOption<TNT>, + target: usize, + ) -> Result<bool, GraphError> { + self.nfa.has_edge_label(node_id, label, target) + } +} + +impl LabelExtGraph<DOption<TNT>> for DefaultAtom { + #[inline] + fn extend( + &mut self, + edges: impl IntoIterator<Item = (DOption<TNT>, usize)>, + ) -> Result<usize, GraphError> { + self.nfa.extend(edges) + } +} + +impl Nfa<DOption<TNT>> for DefaultAtom { + #[inline] + fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error> + where + F: Fn(DOption<TNT>) -> bool, + { + self.nfa.remove_epsilon(f) + } + + type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = (); + + #[inline] + fn to_nfa( + _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>], + _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>, + _default: Option<DOption<TNT>>, + ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> { + // NOTE: We cannot construct an atom from a set of regular + // languages alone. So it is appropriate to panic here, if + // one tries to do so, for some reason. + unimplemented!() + } + + #[inline] + fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> { + self.nfa.remove_dead(reserve) + } + + #[inline] + fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> { + self.nfa.nulling(f) + } +} + +impl DefaultAtom { + /// Construct an atomic language from a grammar. 
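+    ///
+    /// A sketch of the intended use; `new_notes_grammar` is the test
+    /// helper that the tests in [`crate::atom`] use, and any grammar
+    /// works here:
+    ///
+    /// ```ignore
+    /// use grammar::test_grammar_helper::new_notes_grammar;
+    ///
+    /// let grammar = new_notes_grammar()?;
+    /// let atom = DefaultAtom::from_grammar(grammar)?;
+    /// println!("{atom}");
+    /// ```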
+    pub fn from_grammar(mut grammar: Grammar) -> Result<Self, GrammarError> {
+        grammar.compute_firsts()?;
+
+        let regexp = grammar.left_closure()?;
+
+        let mut nfa = grammar.left_closure_to_nfa(&regexp)?;
+
+        use std::collections::HashSet;
+
+        let accumulators: Vec<usize> = {
+            let mut result = Vec::with_capacity(regexp.len() + 1);
+            result.push(0);
+
+            for regex in regexp.iter() {
+                result.push(regex.nodes_len() * 2 + result.last().unwrap());
+            }
+
+            result.into_iter().collect()
+        };
+
+        let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect();
+
+        nfa.nulling(|label| {
+            if let Some(label) = *label {
+                match label {
+                    TNT::Ter(_) => false,
+                    // Panics if a non-terminal references an invalid node
+                    // here.
+                    TNT::Non(n) => grammar.is_nullable(n).unwrap(),
+                }
+            } else {
+                true
+            }
+        })?;
+        nfa.remove_epsilon(|label| label.is_none())?;
+        nfa.remove_dead(|node| accumulators_set.contains(&node))?;
+
+        // Now add the virtual nodes.
+        let mut virtual_nodes: VirtualMap = Default::default();
+
+        let nt_num = grammar.non_num();
+
+        assert!(nt_num <= accumulators.len());
+
+        // Convert an error telling us that an index is out of bounds.
+        //
+        // Panics if the error is not of the expected kind.
+        fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError {
+            match ge {
+                GraphError::IndexOutOfBounds(index, bound) => {
+                    GrammarError::IndexOutOfBounds(index, bound)
+                }
+                _ => unreachable!(),
+            }
+        }
+
+        for nt in 0..nt_num {
+            let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa
+                // this is safe because of the above assertion.
+                .labels_of(*accumulators.get(nt).unwrap())
+                .map_err(index_out_of_bounds_conversion)?
+                .map(|(label, target_iter)| (*label, target_iter.collect()))
+                .collect();
+
+            for (label, children_vec) in children.into_iter() {
+                if let Some(TNT::Ter(t)) = *label {
+                    let new_index = nfa
+                        .extend(children_vec.into_iter().map(|target| (label, target)))
+                        .map_err(index_out_of_bounds_conversion)?;
+
+                    let virtual_node = VirtualNode::new(nt, t);
+
+                    virtual_nodes.insert(virtual_node, new_index);
+                }
+            }
+        }
+
+        Ok(Self {
+            grammar,
+            nfa,
+            regexp,
+            virtual_nodes,
+        })
+    }
+}
+
+/// A convenient getter for the map of virtual nodes.
+fn query(map: &VirtualMap, nt: usize, t: usize) -> Option<usize> {
+    map.get(&VirtualNode::new(nt, t)).copied()
+}
+
+impl Atom for DefaultAtom {
+    fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError> {
+        if nt >= self.grammar.non_num() {
+            return Err(GrammarError::IndexOutOfBounds(nt, self.grammar.non_num()));
+        }
+
+        if t >= self.grammar.ter_num() {
+            return Err(GrammarError::IndexOutOfBounds(t, self.grammar.ter_num()));
+        }
+
+        Ok(query(&self.virtual_nodes, nt, t))
+    }
+
+    fn empty(&self) -> usize {
+        assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2);
+
+        self.nfa.nodes_len() - 2
+    }
+}
diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs
new file mode 100644
index 0000000..084acca
--- /dev/null
+++ b/chain/src/atom/mod.rs
@@ -0,0 +1,41 @@
+//! This file defines the behaviour of the atomic languages, and
+//! provides a default implementation.
+//!
+//! Here I do not plan to substitute external packages'
+//! implementations in the future, so why define a trait for the
+//! atomic languages?  Because this way I can easily substitute other
+//! implementations if I have better ideas in the future.
+
+use grammar::{Error as GrammarError, TNT};
+use nfa::{DOption, Nfa};
+
+/// The expected behaviours of an atomic language.
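+///
+/// A hypothetical usage sketch; the `atom` value is assumed to be an
+/// implementation built from a grammar with at least one terminal
+/// and one non-terminal:
+///
+/// ```ignore
+/// // The derivative of the left-linear null closure of non-terminal
+/// // 0 with respect to terminal 0, if present, is a node of the
+/// // underlying nondeterministic finite automaton.
+/// if let Some(node) = atom.atom(0, 0)? {
+///     let children = atom.children_of(node)?;
+/// }
+/// ```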
+pub trait Atom: Nfa<DOption<TNT>> { + /// Return the index of a node representing the derivative of the + /// left-linear null closure of `nt` with respect to `t`. + fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>; + + /// Return the index of the empty state. + fn empty(&self) -> usize; +} + +pub mod default; + +pub use default::DefaultAtom; + +#[cfg(test)] +mod tests { + use super::*; + use grammar::test_grammar_helper::*; + + #[test] + fn atom() -> Result<(), Box<dyn std::error::Error>> { + let grammar = new_notes_grammar()?; + + let atom = DefaultAtom::from_grammar(grammar)?; + + println!("atom = {atom}"); + + Ok(()) + } +} diff --git a/chain/src/default.rs b/chain/src/default.rs new file mode 100644 index 0000000..e04be9f --- /dev/null +++ b/chain/src/default.rs @@ -0,0 +1,46 @@ +//! This file provides a default implementation of the +//! [`Chain`][crate::Chain] trait. +//! +//! The reason for using a trait is that I might want to experiment +//! with different implementation ideas in the future, and this +//! modular design makes that easy. + +use super::*; +use core::fmt::Display; + +/// The errors related to taking derivatives by chain rule. +#[derive(Debug)] +pub enum Error { + /// An invalid situation happens. + Invalid, +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Invalid => write!(f, "invalid"), + } + } +} + +impl std::error::Error for Error {} + +/// A default implementation for the [`Chain`] trait. +#[derive(Debug, Clone, Default)] +pub struct DefaultChain {} + +impl Chain for DefaultChain { + type Error = Error; + + fn unit() -> Self { + todo!() + } + + fn chain(&mut self, _t: usize) { + todo!() + } + + fn epsilon(&self) -> bool { + todo!() + } +} diff --git a/chain/src/grammar.rs b/chain/src/grammar.rs deleted file mode 100644 index 287fbca..0000000 --- a/chain/src/grammar.rs +++ /dev/null @@ -1,1247 +0,0 @@ -#![warn(missing_docs)] -//! This file implements the extected behaviours of grammars. - -// NOTE: We shall first start with a parser that works at the level of -// characters. The purpose is to first experiment with the workings -// and the performance of the algorithms, before optimising by using -// regular expressions to classify inputs into tokens. In other -// words, the current focus is not on the optimisations, whereas -// scanners are for optimisations only, so to speak. - -#![allow(unused_imports)] -use nfa::{ - default::{ - nfa::DefaultNFA, - regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, - }, - DOption, DesRec, Nfa, Regex, SoC, -}; - -use graph::{adlist::ALGBuilder, builder::Builder, Graph}; - -use std::{ - collections::HashSet, - fmt::{Display, Write}, -}; - -/// The type of a terminal. -/// -/// For the time being this is a wrapper around a string, but in the -/// future it may hold more information of scanners. -#[derive(Debug, Clone, Eq, PartialEq)] -pub struct Terminal { - // If we want to use scanners, per chance add them as a new field - // here. - name: String, -} - -impl Terminal { - /// Create a terminal with the given name. - #[inline] - pub fn new(name: String) -> Self { - Self { name } - } - - /// Return the name of the terminal. - #[inline] - pub fn name(&self) -> &str { - &self.name - } -} - -/// The type of a non-terminal. -/// -/// This is just a wrapper around a string. -#[derive(Debug, Clone)] -pub struct Nonterminal(String); - -impl Nonterminal { - /// Return the name of the nonterminal. 
- /// - /// Just to improve readability. - #[inline] - pub fn name(&self) -> &str { - &self.0 - } -} - -/// The type of a terminal or a non-terminal. -/// -/// Only an index is stored here. Actual data are stored in two other -/// arrays. -#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum TNT { - /// Terminal variant - Ter(usize), - /// Nonterminal variant - Non(usize), -} - -impl Display for TNT { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - TNT::Ter(t) => write!(f, "T({t})"), - TNT::Non(n) => write!(f, "N({n})"), - } - } -} - -/// Errors related to grammar operations. -#[derive(Debug, Copy, Clone)] -#[non_exhaustive] -pub enum Error { - /// The first component is the index, and the second the bound. - IndexOutOfBounds(usize, usize), - /// Fail to build the N-th regular expression, due to the - /// ParseError. - BuildFail(usize, ParseError), - /// fail to build NFA - NFAFail(nfa::error::Error), -} - -impl From<nfa::error::Error> for Error { - fn from(nfae: nfa::error::Error) -> Self { - Self::NFAFail(nfae) - } -} - -impl Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), - Error::BuildFail(n, pe) => write!( - f, - "Failed to build the {n}-th regular expression due to error: {pe}" - ), - Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), - } - } -} - -impl std::error::Error for Error {} - -/// A rule is a regular expression of terminals or non-terminals. -#[derive(Debug, Clone)] -pub struct Rule { - regex: DefaultRegex<TNT>, -} - -impl Rule { - /// Return true if and only if the rule is empty. - #[inline] - pub fn is_empty(&self) -> bool { - self.regex.is_empty() - } - - /// Return the length of the rule. - #[inline] - pub fn len(&self) -> usize { - self.regex.len() - } -} - -/// The type of a grammar. -#[derive(Debug, Clone, Default)] -pub struct Grammar { - ter: Vec<Terminal>, - non: Vec<Nonterminal>, - rules: Vec<Rule>, - firsts: Vec<HashSet<Option<usize>>>, - first_nodes: Vec<Vec<usize>>, -} - -/// A private type to aid the recursive looping of rergular -/// expressions. -#[derive(Copy, Clone)] -enum StackElement { - Seen(usize), - Unseen(usize), -} - -impl StackElement { - fn index(self) -> usize { - match self { - Self::Seen(index) => index, - Self::Unseen(index) => index, - } - } - - fn is_seen(self) -> bool { - matches!(self, Self::Seen(_)) - } -} - -impl Grammar { - /// Construct a grammar from a vector of terminals, a vector of - /// non-terminals, and a vector of rules for the non-temrinals. - /// - /// # Panic - /// - /// If the length of `non` is not equal to that of `rules`, then - /// the function panics. - pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self { - assert_eq!(non.len(), rules.len()); - - // One more room is reserved for the `None` value. - let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) - .take(non.len()) - .collect(); - - let first_nodes = rules - .iter() - .map(|rule| Vec::with_capacity(rule.len())) - .collect(); - - Self { - ter, - non, - rules, - firsts, - first_nodes, - } - } - - /// Return the name of a terminal or a non-terminal. - pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> { - match tnt { - TNT::Ter(t) => Ok(format!( - "T{}", - self.ter - .get(t) - .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? 
- .name() - )), - TNT::Non(n) => Ok(format!( - "N{}", - self.non - .get(n) - .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? - .name() - )), - } - } - - /// Return true if and only if there are no non-terminals in the - /// grammar. - #[inline] - pub fn is_empty(&self) -> bool { - self.non.is_empty() - } - - /// Return the total length of all rules. - #[inline] - pub fn total(&self) -> usize { - self.rules.iter().map(Rule::len).sum() - } - - /// Return the number of terminals. - #[inline] - pub fn ter_num(&self) -> usize { - self.ter.len() - } - - /// Return the number of non-terminals. - #[inline] - pub fn non_num(&self) -> usize { - self.non.len() - } - - /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a - /// single number to represent terminals and non-terminals. - /// - /// # Bounds - /// - /// If a terminal index is greater than or equal to the number of - /// terminals, then this signals an error; mutatis mutandis for - /// non-terminals. - /// - /// # Related - /// - /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. - #[inline] - pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> { - let ter_num = self.ter.len(); - let non_num = self.non.len(); - - match tnt { - TNT::Ter(t) => { - if t >= ter_num { - Err(Error::IndexOutOfBounds(t, ter_num)) - } else { - Ok(t) - } - } - TNT::Non(n) => { - if n >= non_num { - Err(Error::IndexOutOfBounds(n, non_num)) - } else { - Ok(n + ter_num) - } - } - } - } - - /// Convert a single number to either a terminal or a - /// non-terminal. - /// - /// # Bounds - /// - /// If the number is greater than or equal to the sum of the - /// numbers of terminals and of non-terminals, then this signals - /// an error. - /// - /// # Related - /// - /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. - #[inline] - pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> { - let ter_num = self.ter.len(); - let non_num = self.non.len(); - - if flat < ter_num { - Ok(TNT::Ter(flat)) - } else if flat < ter_num + non_num { - Ok(TNT::Non(flat - ter_num)) - } else { - Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) - } - } - - /// Return true if and only if the non-terminal is nullable. - #[inline] - pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> { - Ok(self - .firsts - .get(non_terminal) - .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? - .contains(&None)) - } - - /// For a NON_TERMINAL, return an iterator that goes over the - /// nodes that are reachable from the non-terminal through an - /// empty transition of the nondeterministic finite automaton. - #[inline] - pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> { - Ok(self - .first_nodes - .get(non_terminal) - .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? - .iter()) - } - - /// Compute the set of terminals that can appear as the first - /// terminal in some left-linear derivation of a non-terminal, for - /// every non-terminal. - /// - /// This is an algorithm that computes the transitive closure, - /// which is a common approach for this task. But perhaps there - /// are more efficient approaches? - /// - /// Also the function computes the set of "reachable nodes" in the - /// process, and records the information in the `first_nodes` - /// attribute. 
- pub fn compute_firsts(&mut self) -> Result<(), Error> { - let mut updated = true; - - let non_len = self.non_num(); - - use StackElement::{Seen, Unseen}; - - while updated { - updated = false; - - for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { - let root = if let Some(root) = regex.root() { - root - } else { - if !self.is_nullable(n)? { - updated = true; - - self.firsts.get_mut(n).unwrap().insert(None); - - // The default construction of a grammar - // reserves some space for each vector, so - // explicitly setting this can reduce some - // minor memory overhead. - let pointer = self.first_nodes.get_mut(n).unwrap(); - - pointer.clear(); - pointer.shrink_to_fit(); - } - - continue; - }; - - let regex_len = regex.len(); - - let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len); - - stack.push(Unseen(root)); - - let mut children_sets_stack: Vec<HashSet<Option<usize>>> = - Vec::with_capacity(regex_len); - - let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len); - - while let Some(top) = stack.pop() { - let top_index = top.index(); - let is_seen = top.is_seen(); - - match regex - .vertex_label(top_index) - .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? - { - RegexType::Kleene => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Plus => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) 
- .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set); - this_nodes.extend(child_nodes); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Optional => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - this_set.insert(None); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Or => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) 
- .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Paren => { - // Only for printing purposes - let mut this_set = HashSet::new(); - - this_set.insert(None); - - children_sets_stack.push(this_set); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - RegexType::Empty => { - if !is_seen { - stack.push(Seen(top_index)); - - for child in regex.children_of(top_index).unwrap().rev() { - stack.push(Unseen(child)); - } - } else { - let degree = regex.degree(top_index).unwrap(); - let children_stack_len = children_sets_stack.len(); - let children_nodes_len = children_nodes_stack.len(); - - assert!( - children_stack_len >= degree, - "not enough stack elements for {top_index}" - ); - - assert!( - children_nodes_len >= degree, - "not enough stack elements for {top_index}" - ); - - let mut this_set = HashSet::new(); - - let mut this_nodes = HashSet::new(); - - this_nodes.insert(top_index); - - if degree == 0 { - this_set.insert(None); - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - - continue; - } - - let mut stop = false; - - for (child_set, child_nodes) in children_sets_stack - .drain((children_stack_len - degree)..) - .zip( - children_nodes_stack.drain((children_nodes_len - degree)..), - ) - { - if stop { - break; - } - - if !child_set.contains(&None) { - stop = true; - } - - this_set.extend(child_set.iter().copied()); - this_nodes.extend(child_nodes.iter().copied()); - } - - if stop { - this_set.remove(&None); - } - - children_sets_stack.push(this_set); - children_nodes_stack.push(this_nodes); - } - } - RegexType::Lit(tnt) => { - match tnt { - TNT::Ter(t) => { - let mut this_set = HashSet::with_capacity(1); - - this_set.insert(Some(t)); - - children_sets_stack.push(this_set); - } - TNT::Non(non) => { - let this_set = self - .firsts - .get(non) - .ok_or(Error::IndexOutOfBounds(non, non_len))? - .clone(); - - children_sets_stack.push(this_set); - } - } - - let mut this_nodes = HashSet::with_capacity(1); - this_nodes.insert(top_index); - - children_nodes_stack.push(this_nodes); - } - } - } - - assert_eq!( - children_sets_stack.len(), - 1, - "Too many elements left at the end" - ); - - assert_eq!( - children_nodes_stack.len(), - 1, - "Too many elements left at the end" - ); - - for first in children_sets_stack.pop().unwrap().into_iter() { - if !self.firsts.get(n).unwrap().contains(&first) { - updated = true; - - self.firsts.get_mut(n).unwrap().insert(first); - } - } - - *self.first_nodes.get_mut(n).unwrap() = - children_nodes_stack.pop().unwrap().into_iter().collect(); - } - } - - Ok(()) - } - - /// Return the regular language of the left-linear closures of - /// non-terminals in the grammar. - /// - /// The resulting vector is guaranteed to be of the same length as - /// the number of non-terminals. - /// - /// The resulting regular language is not "self-contained". That - /// is to say, its terminals indices are packed indices and are - /// meaningless without the interpretation of the grammar. They - /// should be converted to a nondeterministic finite automaton and - /// then to its null closure later on. 
- pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> { - let non_len = self.non_num(); - - let mut result = Vec::with_capacity(non_len); - - for (n, rule) in self.rules.iter().enumerate() { - let regex = &rule.regex; - - let regex_root = if let Some(root) = regex.root() { - root - } else { - result.push(Default::default()); - - continue; - }; - - let regex_len = regex.len(); - - /// A convenient macro to retrieve the children from the - /// original regular expression with error propagation. - macro_rules! children { - ($n:expr) => { - regex - .children_of($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - /// A convenient macro to retrieve the label from the - /// original regular expression with error propagation. - macro_rules! label { - ($n:expr) => { - regex - .vertex_label($n) - .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? - }; - } - - let parents = regex.parents_array().map_err(|e| match e { - nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), - nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), - _ => unreachable!(), - })?; - - use RegexType::*; - use TNT::*; - - let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2); - let mut builder = ALGBuilder::default(); - - /// Perform a depth-first copy - macro_rules! df_copy { - ($parent:expr, $index:expr) => { - match label!($index) { - Kleene | Plus | Optional | Or | Paren | Empty => { - let mut stack = vec![($parent, $index)]; - - while let Some((top_parent, top_index)) = stack.pop() { - let label = label!(top_index); - let label = match label { - Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), - _ => label, - }; - - local_result.push(label); - - let new = builder.add_vertex(); - - builder.add_edge(top_parent, new, ()).unwrap(); - - stack.extend(children!(top_index).map(|child| (new, child))); - } - } - Lit(remain_tnt) => { - local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); - let new = builder.add_vertex(); - builder.add_edge($parent, new, ()).unwrap(); - } - } - }; - } - - local_result.push(Or); - builder.add_vertex(); - - local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); - let non_lit_index = builder.add_vertex(); - - builder.add_edge(0, non_lit_index, ()).unwrap(); - - for first_node in self.first_nodes_of(n)?.copied() { - assert!(first_node < parents.len()); - - let tnt = match label!(first_node) { - Lit(tnt) => tnt, - _ => continue, - }; - - let mut parents_chain = { - let mut result = Vec::new(); - let mut stack = Vec::with_capacity(parents.len()); - - stack.push(first_node); - - while let Some(top) = stack.pop() { - assert!(top < parents.len()); - if let Some(parent) = parents.get(top).copied().unwrap() { - result.push(parent); - stack.push(parent.0); - } - } - - result.reverse(); - - result - }; - - if parents_chain.is_empty() { - local_result.push(Lit(tnt)); - let lit_index = builder.add_vertex(); - builder.add_edge(0, lit_index, ()).unwrap(); - - continue; - } - - assert!(parents_chain.first().unwrap().0 == regex_root); - - // A different, "more local", root. 
- let mut root: usize; - - // Handle the direct parent - let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); - - match label!(parent_node) { - Kleene | Plus => { - local_result.extend([Empty, Lit(tnt)]); - - root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(root, lit_index, ()).unwrap(); - - let iterator = children!(parent_node); - - for index in iterator.clone().skip(parent_edge_index + 1) { - df_copy!(root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - - Or => { - local_result.push(Lit(tnt)); - root = builder.add_vertex(); - } - Optional | Empty => { - // If this path is taken, it should not be - // optional. - local_result.extend([Empty, Lit(tnt)]); - root = builder.add_vertex(); - let lit_index = builder.add_vertex(); - builder.add_edge(root, lit_index, ()).unwrap(); - - for index in children!(parent_node).skip(parent_edge_index + 1) { - df_copy!(root, index); - } - } - Paren | Lit(_) => unreachable!(), - } - - // Handle successive parents - - for (node, edge_index) in parents_chain.into_iter() { - let node_type = label!(node); - - match node_type { - Kleene | Plus => { - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, root, ()).unwrap(); - - root = new_index; - - let iterator = children!(node); - - for index in iterator.clone().skip(edge_index + 1) { - df_copy!(root, index); - } - - local_result.push(Kleene); - let new_parent = builder.add_vertex(); - builder.add_edge(root, new_parent, ()).unwrap(); - - for index in iterator { - df_copy!(new_parent, index); - } - } - RegexType::Or => {} - RegexType::Optional | RegexType::Empty => { - local_result.push(Empty); - let new_index = builder.add_vertex(); - builder.add_edge(new_index, root, ()).unwrap(); - root = new_index; - - for index in children!(node).skip(edge_index + 1) { - df_copy!(root, index); - } - } - RegexType::Paren | RegexType::Lit(_) => unreachable!(), - } - } - - builder.add_edge(0, root, ()).unwrap(); - } - - local_result.shrink_to_fit(); - - let graph = builder.build(); - - assert_eq!(graph.nodes_len(), local_result.len()); - - result.push( - DefaultRegex::new(graph, local_result) - .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, - ); - } - - assert_eq!(result.len(), non_len); - - Ok(result) - } - - /// Convert the regular language of left-linear closures to its - /// equivalent nondeterministic finite automaton. - /// - /// In the generation of the left-linear closure, the terminals - /// and non-terminals are packed into an unsigned integer. We - /// unpack them in converting to nondeterministic finite - /// automaton. - /// - /// The resulting nondeterministic finite automaton should be - /// transformed to its null-closure for use in our algorithm. 
- pub fn left_closure_to_nfa( - &self, - closure: &[DefaultRegex<TNT>], - ) -> Result<DefaultNFA<DOption<TNT>>, Error> { - let label_transform = |tnt| match tnt { - TNT::Ter(t) => { - let new_tnt = self.unpack_tnt(t).map_err(|e| match e { - Error::IndexOutOfBounds(index, bound) => { - graph::error::Error::IndexOutOfBounds(index, bound) - } - _ => unreachable!(), - })?; - - Ok(SoC::Carry(new_tnt)) - } - TNT::Non(n) => Ok(SoC::Sub(n)), - }; - - DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into) - } -} - -impl Display for Grammar { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - assert_eq!(self.non.len(), self.rules.len()); - - for (nt, rule) in self.non.iter().zip(self.rules.iter()) { - write!(f, "{}: ", nt.name())?; - - writeln!( - f, - "{}", - rule.regex.to_string_with(|tnt| format!( - "({})", - self.name_of_tnt(tnt) - .unwrap_or_else(|_| format!("Unknown {tnt:?}")) - ))? - )?; - } - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_helper { - use super::*; - use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}; - use std::fmt::Write; - - // Construct a grammar to test - pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { - let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; - let non = vec![ - Nonterminal("start".to_owned()), - Nonterminal("end".to_owned()), - ]; - - /// A function to scan the inputs. - fn scan_tnt( - parser: &DefaultRegParser<TNT>, - input: &str, - ) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> { - use ParseDirection::*; - use RegexType::*; - use TNT::*; - - let mut chars = input.chars(); - - if let Some(first) = chars.next() { - match first { - '*' => Ok(Some((1, Kleene, Right))), - '+' => Ok(Some((1, Plus, Right))), - '?' => Ok(Some((1, Optional, Right))), - '|' => Ok(Some((1, Empty, Up))), - '(' => Ok(Some((1, Or, Down))), - ')' => Ok(Some((1, Paren, Up))), - 'T' => { - let mut name = String::new(); - let mut len = 1; - - while let Some(c) = chars.next() { - if ('a'..='z').contains(&c) { - len += 1; - write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; - } else { - break; - } - } - - if let Some(t) = parser.query(&name, true) { - Ok(Some((len, Lit(Ter(t)), Right))) - } else { - Err(ParseError::InvalidCharacter(first)) - } - } - 'N' => { - let mut name = String::new(); - let mut len = 1; - - while let Some(c) = chars.next() { - if ('a'..='z').contains(&c) { - len += 1; - write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; - } else { - break; - } - } - - if let Some(n) = parser.query(&name, false) { - Ok(Some((len, Lit(Non(n)), Right))) - } else { - Err(ParseError::InvalidCharacter(first)) - } - } - _ => Err(ParseError::InvalidCharacter(first)), - } - } else { - Ok(None) - } - } - - let mut regex_parser: DefaultRegParser<TNT> = Default::default(); - - regex_parser.add_tnt("a", true); - regex_parser.add_tnt("b", true); - regex_parser.add_tnt("start", false); - regex_parser.add_tnt("end", false); - - let regex_parser = regex_parser; - - let rule1 = Rule { - regex: regex_parser - .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? - .ok_or(ParseError::Invalid)? - .0, - }; - - let rule2 = Rule { - regex: regex_parser - .parse("Nstart?Nend*", Box::new(scan_tnt), true)? - .ok_or(ParseError::Invalid)? 
- .0, - }; - - let rules = vec![rule1, rule2]; - - Ok(Grammar::new(ter, non, rules)) - } -} - -#[cfg(test)] -mod test_grammar_display { - use super::test_grammar_helper::new_grammar; - - #[test] - fn test_display() -> Result<(), Box<dyn std::error::Error>> { - println!("{}", new_grammar()?); - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_firsts { - use super::test_grammar_helper::new_grammar; - use super::*; - - #[test] - fn test_firsts() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_grammar()?; - - grammar.compute_firsts()?; - - println!("grammar firsts: {:?}", grammar.firsts); - println!("grammar first nodes: {:?}", grammar.first_nodes); - - Ok(()) - } -} - -#[cfg(test)] -mod test_grammar_left_closure { - use super::test_grammar_helper::new_grammar; - use super::*; - - pub fn new_closure_regex( - grammar: &mut Grammar, - ) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> { - grammar.compute_firsts()?; - - println!("grammar firsts: {:?}", grammar.firsts); - println!("grammar first nodes: {:?}", grammar.first_nodes); - println!("grammar:"); - println!("{grammar}"); - - grammar.left_closure().map_err(Into::into) - } - - #[test] - fn test_regex() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_grammar()?; - - let vec_of_regexps = new_closure_regex(&mut grammar)?; - - for regex in &vec_of_regexps { - println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?); - // println!("regex: {regex:?}",); - println!("regex len = {}", regex.nodes_len()); - } - - Ok(()) - } - - #[test] - fn test_nfa() -> Result<(), Box<dyn std::error::Error>> { - let mut grammar = new_grammar()?; - let closure = new_closure_regex(&mut grammar)?; - let nfa = grammar.left_closure_to_nfa(&closure)?; - // TODO: print the nfa out - Ok(()) - } -} diff --git a/chain/src/lib.rs b/chain/src/lib.rs index 0ec4d4c..4e21e1d 100644 --- a/chain/src/lib.rs +++ b/chain/src/lib.rs @@ -1,3 +1,4 @@ +#![warn(missing_docs)] //! This package implements the core algorithm of the entire //! workspace: parsing with derivatives by means of chain rule and //! regular nulling languages. @@ -7,19 +8,43 @@ //! think is the essence of this algorithm, the chain-rule for //! derivatives of languages. -pub mod grammar; +pub mod atom; -pub fn add(left: usize, right: usize) -> usize { - left + right -} +// TODO: Define errors. + +/// The expected behaviours of a language which can take derivatives +/// by chain rule. +pub trait Chain: Default { + /// The implementations should choose a type to represent errors. + type Error: std::error::Error; -#[cfg(test)] -mod tests { - use super::*; + /// Represents the language that is present after we parse the + /// empty string, that is the initial configuration of the + /// language. This may or may not be different from what + /// `Default::default` gives. + fn unit() -> Self; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } + /// Take the derivative by a terminal symbol. + /// + /// This takes care of the union and the prepending operations. + /// + /// # A little remark about the design + /// + /// I have thought to separate different operations (like the + /// union, the prepending, and single derivatives) and then define + /// a function to put everything together. But I think that + /// design is not convenient to use. Also, I really do not need + /// those operations other than to provide this derivative + /// operation, so why define them? 
And putting all things
+    /// together may reduce the number of bugs caused by wrong uses of
+    /// those component functions, and can reduce the number of
+    /// documentation strings a library user needs to read, in order
+    /// to make use of this trait.  So I ended up with this design.
+    fn chain(&mut self, t: usize);
+
+    /// Return true if and only if the language contains the empty
+    /// string.
+    fn epsilon(&self) -> bool;
 }
+
+pub mod default;
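+
+// A sketch of how an implementation of `Chain` is meant to be
+// driven, once `DefaultChain` is complete.  The `input` iterator is
+// an assumption for illustration, not part of this crate:
+//
+//     let mut language = DefaultChain::unit();
+//
+//     for t in input {
+//         language.chain(t);
+//     }
+//
+//     if language.epsilon() {
+//         println!("the input belongs to the language");
+//     }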
diff --git a/chain/src/plan.org b/chain/src/plan.org
index 8c63492..61738a9 100644
--- a/chain/src/plan.org
+++ b/chain/src/plan.org
@@ -2,29 +2,73 @@
 #+AUTHOR: Durand
 #+DATE: <2022-11-18 Ven 19:57>
 
-* Things to do [2/7]
+* Things to do [5/10]
 
 - [X] Implement builders for graphs
 - [X] Find sets of the first terminals for each non-terminal, in the
   grammar module.
-- [-] NFA [4/5]
+- [-] Establish a testing framework of various grammars. [5/6]
+  + [X] One simple grammar
+  + [X] Notes
+  + [X] Parentheses
+  + [X] Pathological left-recursive
+  + [X] Pathological right-recursive
+  + [ ] Pathologically ambiguous
+  # More should be added
+- [X] NFA [4/4]
   + [X] Add regular expression into NFA.
   + [X] Convert a set of regular expressions into a nondeterministic
     finite automaton, where non-terminals denote the indices of
     regular expressions in the set to substitute.
-  + [X] Convert a grammar into its grammar of left-linear closures of
-    non-temrinals and their derivatives.
-  + [X] Convert nondeterministic finite automata to their null
-    closures.
-  + [ ] Test more grammars to ensure it works correctly.
-- [ ] Define the Atom trait.
-- [ ] Implement virtual nodes for each derivative of each atom: The
-  lack of this step might be the root cause of the failure of the
-  previous version of this package.
-- [ ] Implement languages. [0/2]
-  + [ ] First implement them as doubly labelled (directed acyclic)
-    graphs.
-  + [ ] Implement finding derivatives.
+  + [X] Convert a grammar to the language of atomic languages. [2/2]
+    * [X] Convert a grammar into its grammar of left-linear closures
+      of non-terminals and their derivatives.
+    * [X] Convert nondeterministic finite automata to their null
+      closures.
+  + [X] Test more grammars to ensure it works correctly. [5/5]
+    * [X] Test the conversion to left-closure as a set of regular
+      expressions.
+    * [X] Test the conversion of a set of regular expressions into a
+      nondeterministic finite automaton.
+    * [X] Test the closure of null edges, where the nullity of an edge
+      is defined by a function.  In our specific case, an edge is null
+      if the grammar symbol that edge represents, either a terminal or
+      a non-terminal, can match an empty string.
+    * [X] Test the removal of empty transitions from nondeterministic
+      finite automata.
+    * [X] Test the removal of dead states, where a state is dead if
+      and only if no other states have an edge into that state.
+- [ ] Refactor [0/3]
+  + [ ] When we pull in some regular language because of the
+    left-linear expansion, we need to mark that branch as coming from
+    left-linear expansions.  This is necessary because we cannot
+    follow a left-linearly expanded branch when we take the derivative
+    of a language.  We only expand left-linearly when we try to access
+    the atomic languages [s]^{(t)}.  We can mark by returning a set of
+    nodes which are the beginnings of left-linearly expanded branches.
+  + [ ] An edge of the NFA should carry a label that can be more
+    informative than solely a terminal or a non-terminal.
+  + [ ] Add a mechanism for a grammar to attach labels to edges of NFA
+    which are not necessarily Copy-able, and which should be stored in a
+    separate array, such that the labels on edges of NFA point to the
+    elements of the array.
+- [X] Atom [3/3]
+  + [X] Define the Atom trait.
+  + [X] Implement virtual nodes for each derivative of each atom: The
+    lack of this step might be the root cause of the failure of the
+    previous version of this project.
+  + [X] Test atoms
+- [-] Implement languages. [1/3]
+  + [X] First define a trait with the expected behaviours.
+  + [ ] Then implement them as doubly labelled graphs.
+  + [ ] Then implement finding derivatives by use of the chain rule.
+- [-] Implement forests [0/2]
+  + [-] Define a trait with the expected behaviours.
+  + [ ] Implement them using adjacency list: we only need one label
+    per edge, and we do not wish to exclude duplicate edges, and we do
+    not need to index edges by the labels.  All in all, the labels can
+    be implemented by a simple hash map that maps edges to labels, so
+    a special data type is not needed here.
 - [ ] Implement semiring. [0/5]
   + [ ] Define the trait.
   + [ ] Implement the boolean semiring.
@@ -32,6 +76,9 @@
   + [ ] Implement the free semiring.
   + [ ] Compute and record a semiring value when computing
     derivatives.
+- [X] Miscellaneous [1/1]
+  + [X] Implement a function for NFA that checks if a given predicate
+    is satisfied for each edge label.
 
 * Atomic Languages
 
@@ -39,10 +86,6 @@
 This describes the behaviours of atomic languages.  The atomic
 language consists of the null closure of any non-terminal symbol in
 the grammar, and their derivatives by terminals and non-terminals.
-* Script for creating GIF animation
-
-[[https://gist.github.com/maelvls/5379127][a gist]]
-
 * Derivative Languages
 
 This is the main driving device of the algorithm.  Basically, the
@@ -82,6 +125,61 @@
 should be easy enough, since we already have the mechanism of graphs,
 nondeterministic automata, and semirings.  All we need to do is to
 combine them together.
 
+* Lexers
+
+I got this idea from [[https://lists.gnu.org/archive/html/emacs-devel/2022-12/msg01127.html][a discussion on emacs-devel]].  The idea can be
+formulated as
+
+#+begin_quote
+Potentially it allows invocation of a parser with different variants
+of lexers - one mode with block tokens for the exploration of
+project's structure, and another mode for indentation and error
+checking purposes.
+#+end_quote
+
+I think this is the "right" use of lexers.  That is to say, the parser
+by default should operate on an abstract type of tokens, which are but
+unsigned integers, and rely on the user application to provide a lexer
+for turning the actual input, like a string, into an iterator of
+tokens.  The lexer can choose to do any sort of preprocessing that it
+sees fit, or it can do nothing and just turn characters to unsigned
+integers.  In the latter case, this parser generator works on the
+character level.  But, when needed, one can instruct the parser
+generator to work on the level of preprocessed tokens easily.
+
+This has the advantage of allowing a certain degree of flexibility in
+the type of tokens accepted, while keeping the implementation simple
+at the same time: the internal core only has to deal with unsigned
+integers, after all.
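+
+To make the character-level case concrete, here is a minimal sketch
+of such a do-nothing lexer in Rust; the function is an illustration,
+not part of the code base:
+
+#+begin_src rust
+/// Turn each character into an abstract token, i.e. an unsigned
+/// integer, by taking its Unicode scalar value.
+fn char_lexer(input: &str) -> impl Iterator<Item = usize> + '_ {
+    input.chars().map(|c| c as usize)
+}
+#+end_src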
+
+* Library for drawing graphs
+
+In the past, I thought that drawing graphs is only a development tool
+for my package, so I can rely on such external tools as GraphViz, as
+that would not be needed for my end-users.
+
+But recently I realized that this is not only a need for the
+development of the package, but also for the users.  To be more
+specific, the target users are those who want to "play" with
+grammars.  So if one cannot view the resulting parse forest diagrams
+easily and interactively, it would be hard to "play" with grammars.
+
+Moreover, the internal workings of the parsing machine might also be
+interesting for users to inspect, whether to debug the program or to
+know more about what happens under the hood.  As such it is required
+to view the diagrams of the internal directed acyclic graphs.
+
+For all these reasons, I now regard the ability to draw graphs and to
+interact with those graphs as essential to the user experience of
+this package.  As a consequence, I would like to develop a small
+default library for this purpose.  I follow the convention adopted by
+this package here as well, which is to provide a default
+implementation for the functionality, while still leaving room for
+users to substitute other external packages, if so desired, for
+whatever reason.  For example, an external package may provide
+unprecedented performance for drawing graphs in Emacs.  If such a
+package appears, I can easily avail myself of that external package
+to draw graphs.
+
 * Testing ground
 
 I am in strong need to test things out.  The most important one is
@@ -92,3 +190,21 @@
 understanding of the main algorithm is plain wrong.
 
 This is the main reason I started this rewrite of the package.
+
+* To figure out
+
+I need to figure out how to construct the parse forest from a
+left-most null-closed derivation graph.
+
+Reading through the original paper again, I found that the author
+augmented the atomic languages with annotations of partial derivation
+steps on each edge of the nondeterministic finite automaton.  In
+brief, this is what I tried to achieve with some obscure constructs,
+such as expanding the reduction information carried by the
+nondeterministic finite automata, in one of the previous versions.
+To sum up, I really need to carry that information in the first
+place; otherwise the parse forests cannot be reliably constructed
+later on.
+
+But I really do not want to adopt the approach of the original
+author.  I still prefer the idea of a recorder.  That is to say,
+instead of constructing the parse forest after the recognition, we
+shall construct the parse forest simultaneously while we are
+recognizing.
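+
+To make the recorder idea concrete, here is a minimal sketch in
+Rust.  The trait and its methods are hypothetical names for
+illustration; nothing like this exists in the code base yet:
+
+#+begin_src rust
+/// A hypothetical recorder: the recognizer notifies it of every
+/// derivative step it takes, so that the parse forest is built while
+/// we are recognizing, instead of being reconstructed afterwards.
+pub trait Recorder {
+    /// The type of the finished record, e.g. a parse forest.
+    type Output;
+
+    /// Called whenever the chain rule consumes the terminal `t` and
+    /// passes through the node `node` of the atomic language.
+    fn record(&mut self, t: usize, node: usize);
+
+    /// Called once the input is exhausted.
+    fn finish(self) -> Self::Output;
+}
+#+end_src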