diff options
author | JSDurand <mmemmew@gmail.com> | 2023-01-03 23:44:02 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-01-03 23:44:02 +0800 |
commit | bdbd4b4dc21af09711c97d3f903877443199af06 (patch) | |
tree | c6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /grammar/src | |
parent | 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff) |
structural change: separate crates out
I put functionalities that are not strictly core to separate crates,
so that the whole package becomes more modular, and makes it easier to
try other parsing algorithms in the future.
Also I have to figure the forests out before finishing the core
chain-rule algorithm, as the part about forests affects the labels of
the grammars directly. From my experiences in writing the previous
version, it is asking for trouble to change the labels type
dramatically at a later point: too many places need to be changed.
Thus I decide to figure the rough part of forests out.
Actually I only have to figure out how to attach forests fragments to
edges of the underlying atomic languages, and the more complex parts
of putting forests together can be left to the recorders, which is my
vision of assembling semi-ring values during the chain-rule machine.
It should be relatively easy to produce forests fragments from
grammars since we are just trying to extract some information from the
grammar, not to manipulate those information in some complicated way.
We have to do some manipulations in the process, though, in order to
make sure that the nulling and epsilon-removal processes do not
invalidate these fragments.
Diffstat (limited to 'grammar/src')
-rw-r--r-- | grammar/src/lib.rs | 1131 | ||||
-rw-r--r-- | grammar/src/test_grammar_helper.rs | 368 | ||||
-rw-r--r-- | grammar/src/tests/mod.rs | 30 | ||||
-rw-r--r-- | grammar/src/tests/test_grammar_left_closure.rs | 272 |
4 files changed, 1801 insertions, 0 deletions
diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs new file mode 100644 index 0000000..55adc9a --- /dev/null +++ b/grammar/src/lib.rs @@ -0,0 +1,1131 @@ +#![warn(missing_docs)] +//! This file implements the extected behaviours of grammars. + +// NOTE: We shall first start with a parser that works at the level of +// characters. The purpose is to first experiment with the workings +// and the performance of the algorithms, before optimising by using +// regular expressions to classify inputs into tokens. In other +// words, the current focus is not on the optimisations, whereas +// scanners are for optimisations only, so to speak. + +// TODO: Separate contents into modules. + +use nfa::{ + default::{ + nfa::DefaultNFA, + regex::{DefaultRegex, ParseError, RegexType}, + }, + DOption, Nfa, Regex, SoC, +}; + +use graph::{adlist::ALGBuilder, builder::Builder, Graph}; + +use std::{collections::HashSet, fmt::Display}; + +/// The type of a terminal. +/// +/// For the time being this is a wrapper around a string, but in the +/// future it may hold more information of scanners. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Terminal { + // If we want to use scanners, per chance add them as a new field + // here. + name: String, +} + +impl Terminal { + /// Create a terminal with the given name. + #[inline] + pub fn new(name: String) -> Self { + Self { name } + } + + /// Return the name of the terminal. + #[inline] + pub fn name(&self) -> &str { + &self.name + } +} + +/// The type of a non-terminal. +/// +/// This is just a wrapper around a string. +#[derive(Debug, Clone)] +pub struct Nonterminal(String); + +impl Nonterminal { + /// Return the name of the nonterminal. + /// + /// Just to improve readability. + #[inline] + pub fn name(&self) -> &str { + &self.0 + } +} + +/// The type of a terminal or a non-terminal. +/// +/// Only an index is stored here. Actual data are stored in two other +/// arrays. +#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum TNT { + /// Terminal variant + Ter(usize), + /// Nonterminal variant + Non(usize), +} + +impl Display for TNT { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TNT::Ter(t) => write!(f, "T({t})"), + TNT::Non(n) => write!(f, "N({n})"), + } + } +} + +/// Errors related to grammar operations. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum Error { + /// The first component is the index, and the second the bound. + IndexOutOfBounds(usize, usize), + /// Fail to build the N-th regular expression, due to the + /// ParseError. + BuildFail(usize, ParseError), + /// fail to build NFA + NFAFail(nfa::error::Error), +} + +impl From<nfa::error::Error> for Error { + fn from(nfae: nfa::error::Error) -> Self { + Self::NFAFail(nfae) + } +} + +impl Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), + Error::BuildFail(n, pe) => write!( + f, + "Failed to build the {n}-th regular expression due to error: {pe}" + ), + Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), + } + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + if let Error::NFAFail(error) = self { + Some(error) + } else { + None + } + } +} + +/// A rule is a regular expression of terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct Rule { + regex: DefaultRegex<TNT>, +} + +impl Rule { + /// Return true if and only if the rule is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.regex.is_empty() + } + + /// Return the length of the rule. + #[inline] + pub fn len(&self) -> usize { + self.regex.len() + } +} + +/// The type of a grammar. +#[derive(Debug, Clone, Default)] +pub struct Grammar { + /// A list of terminals. + ter: Vec<Terminal>, + /// A list of non-terminals. + non: Vec<Nonterminal>, + /// A list of rules. + /// + /// The length of the list must match that of the list of + /// non-terminals. + rules: Vec<Rule>, + // The following two attributes are empty until we call + // `compute_firsts` on the grammar. + /// The list of sets of "first terminals". + /// + /// The length must match that of the list of non-terminals. + firsts: Vec<HashSet<Option<usize>>>, + /// The list of lists of nodes that are reachable after a nullable + /// transition in the regular expression. + /// + /// The length must match that of the list of non-terminals. + first_nodes: Vec<Vec<usize>>, + // The following attribute is empty until we call `left_closure` + // on the grammar. + left_closure_branches: HashSet<usize>, +} + +/// A private type to aid the recursive looping of rergular +/// expressions. +#[derive(Copy, Clone)] +enum StackElement { + Seen(usize), + Unseen(usize), +} + +impl StackElement { + fn index(self) -> usize { + match self { + Self::Seen(index) => index, + Self::Unseen(index) => index, + } + } + + fn is_seen(self) -> bool { + matches!(self, Self::Seen(_)) + } +} + +impl Grammar { + /// Construct a grammar from a vector of terminals, a vector of + /// non-terminals, and a vector of rules for the non-temrinals. + /// + /// # Panic + /// + /// If the length of `non` is not equal to that of `rules`, then + /// the function panics. + pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self { + assert_eq!(non.len(), rules.len()); + + // One more room is reserved for the `None` value. + let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) + .take(non.len()) + .collect(); + + let first_nodes = rules + .iter() + .map(|rule| Vec::with_capacity(rule.len())) + .collect(); + + let left_closure_branches = HashSet::default(); + + Self { + ter, + non, + rules, + firsts, + first_nodes, + left_closure_branches, + } + } + + /// Return the name of a terminal or a non-terminal. + pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> { + match tnt { + TNT::Ter(t) => Ok(format!( + "T{}", + self.ter + .get(t) + .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? + .name() + )), + TNT::Non(n) => Ok(format!( + "N{}", + self.non + .get(n) + .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? + .name() + )), + } + } + + /// Return true if and only if there are no non-terminals in the + /// grammar. + #[inline] + pub fn is_empty(&self) -> bool { + self.non.is_empty() + } + + /// Return the total length of all rules. + #[inline] + pub fn total(&self) -> usize { + self.rules.iter().map(Rule::len).sum() + } + + /// Return the number of terminals. + #[inline] + pub fn ter_num(&self) -> usize { + self.ter.len() + } + + /// Return the number of non-terminals. + #[inline] + pub fn non_num(&self) -> usize { + self.non.len() + } + + /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a + /// single number to represent terminals and non-terminals. + /// + /// # Bounds + /// + /// If a terminal index is greater than or equal to the number of + /// terminals, then this signals an error; mutatis mutandis for + /// non-terminals. + /// + /// # Related + /// + /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. + #[inline] + pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + match tnt { + TNT::Ter(t) => { + if t >= ter_num { + Err(Error::IndexOutOfBounds(t, ter_num)) + } else { + Ok(t) + } + } + TNT::Non(n) => { + if n >= non_num { + Err(Error::IndexOutOfBounds(n, non_num)) + } else { + Ok(n + ter_num) + } + } + } + } + + /// Convert a single number to either a terminal or a + /// non-terminal. + /// + /// # Bounds + /// + /// If the number is greater than or equal to the sum of the + /// numbers of terminals and of non-terminals, then this signals + /// an error. + /// + /// # Related + /// + /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. + /// + /// # Errors + /// + /// This function is supposed to return only one type of errors, + /// namely, the IndexOutOfBounds error that results from a bounds + /// check. Breaking this is breaking the guarantee of this + /// function, and is considered a bug. This behaviour can and + /// should be tested. But I have not found a convenient test + /// method for testing various grammars. + #[inline] + pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> { + let ter_num = self.ter.len(); + let non_num = self.non.len(); + + if flat < ter_num { + Ok(TNT::Ter(flat)) + } else if flat < ter_num + non_num { + Ok(TNT::Non(flat - ter_num)) + } else { + Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) + } + } + + /// Return true if and only if the non-terminal is nullable. + #[inline] + pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> { + Ok(self + .firsts + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .contains(&None)) + } + + /// For a NON_TERMINAL, return an iterator that goes over the + /// nodes that are reachable from the non-terminal through an + /// empty transition of the nondeterministic finite automaton. + #[inline] + pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> { + Ok(self + .first_nodes + .get(non_terminal) + .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? + .iter()) + } + + /// Return a hash set that contains all nodes in the set of + /// left-closure regular languages that are added because of the + /// left-linear expansion. + pub fn left_closure_branches(&self) -> &HashSet<usize> { + &self.left_closure_branches + } + + /// Compute the set of terminals that can appear as the first + /// terminal in some left-linear derivation of a non-terminal, for + /// every non-terminal. + /// + /// This is an algorithm that computes the transitive closure, + /// which is a common approach for this task. But perhaps there + /// are more efficient approaches? + /// + /// Also the function computes the set of "reachable nodes" in the + /// process, and records the information in the `first_nodes` + /// attribute. + pub fn compute_firsts(&mut self) -> Result<(), Error> { + let mut updated = true; + + let non_len = self.non_num(); + + use StackElement::{Seen, Unseen}; + + while updated { + updated = false; + + for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() { + let root = if let Some(root) = regex.root() { + root + } else { + if !self.is_nullable(n)? { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(None); + + // The default construction of a grammar + // reserves some space for each vector, so + // explicitly setting this can reduce some + // minor memory overhead. + let pointer = self.first_nodes.get_mut(n).unwrap(); + + pointer.clear(); + pointer.shrink_to_fit(); + } + + continue; + }; + + let regex_len = regex.len(); + + let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len); + + stack.push(Unseen(root)); + + let mut children_sets_stack: Vec<HashSet<Option<usize>>> = + Vec::with_capacity(regex_len); + + let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len); + + while let Some(top) = stack.pop() { + let top_index = top.index(); + let is_seen = top.is_seen(); + + match regex + .vertex_label(top_index) + .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))? + { + RegexType::Kleene => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Plus => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set); + this_nodes.extend(child_nodes); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Optional => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + this_set.insert(None); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Or => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Paren => { + // Only for printing purposes + let mut this_set = HashSet::new(); + + this_set.insert(None); + + children_sets_stack.push(this_set); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + RegexType::Empty => { + if !is_seen { + stack.push(Seen(top_index)); + + for child in regex.children_of(top_index).unwrap().rev() { + stack.push(Unseen(child)); + } + } else { + let degree = regex.degree(top_index).unwrap(); + let children_stack_len = children_sets_stack.len(); + let children_nodes_len = children_nodes_stack.len(); + + assert!( + children_stack_len >= degree, + "not enough stack elements for {top_index}" + ); + + assert!( + children_nodes_len >= degree, + "not enough stack elements for {top_index}" + ); + + let mut this_set = HashSet::new(); + + let mut this_nodes = HashSet::new(); + + this_nodes.insert(top_index); + + if degree == 0 { + this_set.insert(None); + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + + continue; + } + + let mut stop = false; + + for (child_set, child_nodes) in children_sets_stack + .drain((children_stack_len - degree)..) + .zip( + children_nodes_stack.drain((children_nodes_len - degree)..), + ) + { + if stop { + break; + } + + if !child_set.contains(&None) { + stop = true; + } + + this_set.extend(child_set.iter().copied()); + this_nodes.extend(child_nodes.iter().copied()); + } + + if stop { + this_set.remove(&None); + } + + children_sets_stack.push(this_set); + children_nodes_stack.push(this_nodes); + } + } + RegexType::Lit(tnt) => { + match tnt { + TNT::Ter(t) => { + let mut this_set = HashSet::with_capacity(1); + + this_set.insert(Some(t)); + + children_sets_stack.push(this_set); + } + TNT::Non(non) => { + let this_set = self + .firsts + .get(non) + .ok_or(Error::IndexOutOfBounds(non, non_len))? + .clone(); + + children_sets_stack.push(this_set); + } + } + + let mut this_nodes = HashSet::with_capacity(1); + this_nodes.insert(top_index); + + children_nodes_stack.push(this_nodes); + } + } + } + + assert_eq!( + children_sets_stack.len(), + 1, + "Too many elements left at the end" + ); + + assert_eq!( + children_nodes_stack.len(), + 1, + "Too many elements left at the end" + ); + + for first in children_sets_stack.pop().unwrap().into_iter() { + if !self.firsts.get(n).unwrap().contains(&first) { + updated = true; + + self.firsts.get_mut(n).unwrap().insert(first); + } + } + + *self.first_nodes.get_mut(n).unwrap() = + children_nodes_stack.pop().unwrap().into_iter().collect(); + } + } + + Ok(()) + } + + /// Return the regular language of the left-linear closures of + /// non-terminals in the grammar. + /// + /// The resulting vector is guaranteed to be of the same length as + /// the number of non-terminals. + /// + /// The resulting regular language is not "self-contained". That + /// is to say, its terminals indices are packed indices and are + /// meaningless without the interpretation of the grammar. They + /// should be converted to a nondeterministic finite automaton and + /// then to its null closure later on. + pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> { + let non_len = self.non_num(); + + let mut result = Vec::with_capacity(non_len); + + for (n, rule) in self.rules.iter().enumerate() { + let regex = &rule.regex; + + let regex_root = if let Some(root) = regex.root() { + root + } else { + result.push(Default::default()); + + continue; + }; + + let regex_len = regex.len(); + + /// A convenient macro to retrieve the children from the + /// original regular expression with error propagation. + macro_rules! children { + ($n:expr) => { + regex + .children_of($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + /// A convenient macro to retrieve the label from the + /// original regular expression with error propagation. + macro_rules! label { + ($n:expr) => { + regex + .vertex_label($n) + .map_err(|_| Error::IndexOutOfBounds($n, regex_len))? + }; + } + + let parents = regex.parents_array().map_err(|e| match e { + nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len), + nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle), + _ => unreachable!(), + })?; + + use RegexType::*; + use TNT::*; + + let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2); + let mut builder = ALGBuilder::default(); + + /// Perform a depth-first copy + macro_rules! df_copy { + ($parent:expr, $index:expr) => { + match label!($index) { + Kleene | Plus | Optional | Or | Paren | Empty => { + let mut stack = vec![($parent, $index)]; + + while let Some((top_parent, top_index)) = stack.pop() { + let label = label!(top_index); + let label = match label { + Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())), + _ => label, + }; + + local_result.push(label); + + let new = builder.add_vertex(); + + builder.add_edge(top_parent, new, ()).unwrap(); + + stack.extend(children!(top_index).map(|child| (new, child))); + } + } + Lit(remain_tnt) => { + local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap()))); + let new = builder.add_vertex(); + builder.add_edge($parent, new, ()).unwrap(); + } + } + }; + } + + local_result.push(Or); + builder.add_vertex(); + + local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap()))); + let non_lit_index = builder.add_vertex(); + + builder.add_edge(0, non_lit_index, ()).unwrap(); + + // If this non-terminal is nullable, add an empty variant. + if self.is_nullable(n)? { + local_result.push(Empty); + let empty_index = builder.add_vertex(); + builder.add_edge(0, empty_index, ()).unwrap(); + } + + for first_node in self.first_nodes_of(n)?.copied() { + assert!(first_node < parents.len()); + + let tnt = match label!(first_node) { + Lit(tnt) => Lit(tnt), + _ => continue, + }; + + let mut parents_chain = { + let mut result = Vec::new(); + let mut stack = Vec::with_capacity(parents.len()); + + stack.push(first_node); + + while let Some(top) = stack.pop() { + assert!(top < parents.len()); + if let Some(parent) = parents.get(top).copied().unwrap() { + result.push(parent); + stack.push(parent.0); + } + } + + result.reverse(); + + result + }; + + if parents_chain.is_empty() { + local_result.push(tnt); + let lit_index = builder.add_vertex(); + builder.add_edge(0, lit_index, ()).unwrap(); + + continue; + } + + assert_eq!(parents_chain.first().unwrap().0, regex_root); + + // A different, "more local", root. + let mut local_root: usize; + + // Handle the direct parent + let (parent_node, parent_edge_index) = parents_chain.pop().unwrap(); + + match label!(parent_node) { + Kleene | Plus => { + // TODO: If parent_edge_index is 0, make a + // Plus node instead. + local_result.extend([Empty, tnt]); + + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + let iterator = children!(parent_node); + + for index in iterator.clone().skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + + Or => { + local_result.push(tnt); + local_root = builder.add_vertex(); + } + Optional | Empty => { + // If this path is taken, it should not be + // optional. + local_result.extend([Empty, tnt]); + local_root = builder.add_vertex(); + let lit_index = builder.add_vertex(); + builder.add_edge(local_root, lit_index, ()).unwrap(); + + for index in children!(parent_node).skip(parent_edge_index + 1) { + df_copy!(local_root, index); + } + } + Paren | Lit(_) => unreachable!(), + } + + // Handle successive parents + + for (node, edge_index) in parents_chain.into_iter() { + let node_type = label!(node); + + match node_type { + Kleene | Plus => { + // TODO: If edge_index is 0, then just + // make this a Plus node. + + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + + local_root = new_index; + + let iterator = children!(node); + + for index in iterator.clone().skip(edge_index + 1) { + df_copy!(local_root, index); + } + + local_result.push(Kleene); + let new_parent = builder.add_vertex(); + builder.add_edge(local_root, new_parent, ()).unwrap(); + + for index in iterator { + df_copy!(new_parent, index); + } + } + RegexType::Or => {} + RegexType::Optional | RegexType::Empty => { + local_result.push(Empty); + let new_index = builder.add_vertex(); + builder.add_edge(new_index, local_root, ()).unwrap(); + local_root = new_index; + + for index in children!(node).skip(edge_index + 1) { + df_copy!(local_root, index); + } + } + RegexType::Paren | RegexType::Lit(_) => unreachable!(), + } + } + + builder.add_edge(0, local_root, ()).unwrap(); + } + + local_result.shrink_to_fit(); + + let graph = builder.build(); + + assert_eq!(graph.nodes_len(), local_result.len()); + + result.push( + DefaultRegex::new(graph, local_result) + .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?, + ); + } + + assert_eq!(result.len(), non_len); + + Ok(result) + } + + /// Convert the regular language of left-linear closures to its + /// equivalent nondeterministic finite automaton. + /// + /// In the generation of the left-linear closure, the terminals + /// and non-terminals are packed into an unsigned integer. We + /// unpack them in converting to nondeterministic finite + /// automaton. + /// + /// The resulting nondeterministic finite automaton should be + /// transformed to its null-closure for use in our algorithm. + pub fn left_closure_to_nfa( + &self, + closure: &[DefaultRegex<TNT>], + ) -> Result<DefaultNFA<DOption<TNT>>, Error> { + let label_transform = |tnt| match tnt { + TNT::Ter(t) => { + let new_tnt = self.unpack_tnt(t).map_err(|e| match e { + Error::IndexOutOfBounds(index, bound) => { + graph::error::Error::IndexOutOfBounds(index, bound) + } + _ => unreachable!(), + })?; + + Ok(SoC::Carry(new_tnt)) + } + TNT::Non(n) => Ok(SoC::Sub(n)), + }; + + let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?; + + Ok(nfa) + } +} + +impl Display for Grammar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + assert_eq!(self.non.len(), self.rules.len()); + + for (nt, rule) in self.non.iter().zip(self.rules.iter()) { + write!(f, "{}: ", nt.name())?; + + writeln!( + f, + "{}", + rule.regex.to_string_with(|tnt| format!( + "({})", + self.name_of_tnt(tnt) + .unwrap_or_else(|_| format!("Unknown {tnt:?}")) + ))? + )?; + } + + Ok(()) + } +} + +// A helper module that provides some grammars for testing. +#[cfg(feature = "test-helper")] +pub mod test_grammar_helper; + +#[cfg(test)] +mod tests; diff --git a/grammar/src/test_grammar_helper.rs b/grammar/src/test_grammar_helper.rs new file mode 100644 index 0000000..c236952 --- /dev/null +++ b/grammar/src/test_grammar_helper.rs @@ -0,0 +1,368 @@ +//! This module provides some grammars for testing. + +use super::*; +use nfa::{ + default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType}, + DesRec, +}; +use std::fmt::Write; + +/// A helper function to compute the first sets of a grammar and +/// return the left-closure of that grammar. +pub fn new_closure_regex( + grammar: &mut Grammar, +) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> { + grammar.compute_firsts()?; + + grammar.left_closure().map_err(Into::into) +} + +/// A function to scan the inputs. +fn scan_tnt( + parser: &DefaultRegParser<TNT>, + input: &str, +) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + use TNT::*; + + let mut chars = input.chars(); + + let mut len = 1; + + while let Some(first) = chars.next() { + match first { + ' ' => { + // ignore whitespaces + len += 1; + } + '*' => return Ok(Some((len, Kleene, Right))), + '+' => return Ok(Some((len, Plus, Right))), + '?' => return Ok(Some((len, Optional, Right))), + '|' => return Ok(Some((len, Empty, Up))), + '(' => return Ok(Some((len, Or, Down))), + ')' => return Ok(Some((len, Paren, Up))), + 'T' => { + let mut name = String::new(); + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(t) = parser.query(&name, true) { + return Ok(Some((len, Lit(Ter(t)), Right))); + } else { + return Err(ParseError::InvalidCharacter(first)); + } + } + 'N' => { + let mut name = String::new(); + + while let Some(c) = chars.next() { + if ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) { + len += 1; + write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?; + } else { + break; + } + } + + if let Some(n) = parser.query(&name, false) { + return Ok(Some((len, Lit(Non(n)), Right))); + } else { + return Err(ParseError::InvalidCharacter(first)); + } + } + _ => { + return Err(ParseError::InvalidCharacter(first)); + } + } + } + + Ok(None) +} + +/// Return a simple testing grammar. +#[allow(dead_code)] +pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("end".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("a", true); + regex_parser.add_tnt("b", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("end", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nstart?Nend*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that might serve as the grammar for my notes, +/// somehow. +#[allow(dead_code)] +pub fn new_notes_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![ + Terminal::new("NL".to_owned()), + Terminal::new("SP".to_owned()), + Terminal::new("CON".to_owned()), + Terminal::new("STAR".to_owned()), + Terminal::new("NOTE".to_owned()), + Terminal::new("PRICE".to_owned()), + Terminal::new("DIGIT".to_owned()), + ]; + let non = vec![ + Nonterminal("document".to_owned()), + Nonterminal("item".to_owned()), + Nonterminal("header".to_owned()), + Nonterminal("title".to_owned()), + Nonterminal("note".to_owned()), + Nonterminal("note-content".to_owned()), + Nonterminal("price".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("NL", true); + regex_parser.add_tnt("SP", true); + regex_parser.add_tnt("CON", true); + regex_parser.add_tnt("STAR", true); + regex_parser.add_tnt("note", true); + regex_parser.add_tnt("price", true); + regex_parser.add_tnt("DIGIT", true); + regex_parser.add_tnt("document", false); + regex_parser.add_tnt("item", false); + regex_parser.add_tnt("header", false); + regex_parser.add_tnt("title", false); + regex_parser.add_tnt("note", false); + regex_parser.add_tnt("notecontent", false); + regex_parser.add_tnt("price", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("Nitem+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("Nheader Nprice?Nnote?", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("TSTAR?TSP Ntitle TNL (TSP|TNL)*", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule4 = Rule { + regex: regex_parser + .parse("TCON+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule5 = Rule { + regex: regex_parser + .parse( + "Tnote Nnotecontent TNL (TSP|TNL)*", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule6 = Rule { + regex: regex_parser + .parse("TCON+", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule7 = Rule { + regex: regex_parser + .parse( + "Tprice TSP TDIGIT+ TNL(TSP | TNL)+", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3, rule4, rule5, rule6, rule7]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a grammar that can express parentheses. +#[allow(dead_code)] +pub fn new_paren_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![ + Terminal::new("LEFT".to_owned()), + Terminal::new("RIGHT".to_owned()), + Terminal::new("A".to_owned()), + ]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("content".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("LEFT", true); + regex_parser.add_tnt("RIGHT", true); + regex_parser.add_tnt("A", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("content", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse( + "TLEFT Nstart TRIGHT | Ncontent Nstart | ", + Box::new(scan_tnt), + true, + )? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TA +", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a left recursive grammar. +#[allow(dead_code)] +pub fn new_left_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("S".to_owned()), + Nonterminal("A".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("B", true); + regex_parser.add_tnt("C", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("S", false); + regex_parser.add_tnt("A", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("NA NS TC", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TB | Nstart", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("()", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3]; + + Ok(Grammar::new(ter, non, rules)) +} + +/// Return a right recursive grammar. +#[allow(dead_code)] +pub fn new_right_recursive_grammar() -> Result<Grammar, Box<dyn std::error::Error>> { + let ter = vec![Terminal::new("B".to_owned()), Terminal::new("C".to_owned())]; + let non = vec![ + Nonterminal("start".to_owned()), + Nonterminal("S".to_owned()), + Nonterminal("A".to_owned()), + ]; + + let mut regex_parser: DefaultRegParser<TNT> = Default::default(); + + regex_parser.add_tnt("B", true); + regex_parser.add_tnt("C", true); + regex_parser.add_tnt("start", false); + regex_parser.add_tnt("S", false); + regex_parser.add_tnt("A", false); + + let regex_parser = regex_parser; + + let rule1 = Rule { + regex: regex_parser + .parse("NS TC NA|TB Nstart", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule2 = Rule { + regex: regex_parser + .parse("TB", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rule3 = Rule { + regex: regex_parser + .parse("NA|", Box::new(scan_tnt), true)? + .ok_or(ParseError::Invalid)? + .0, + }; + + let rules = vec![rule1, rule2, rule3]; + + Ok(Grammar::new(ter, non, rules)) +} +// TODO: more grammars diff --git a/grammar/src/tests/mod.rs b/grammar/src/tests/mod.rs new file mode 100644 index 0000000..55eedff --- /dev/null +++ b/grammar/src/tests/mod.rs @@ -0,0 +1,30 @@ +#[cfg(test)] +mod test_grammar_display { + use crate::test_grammar_helper::new_grammar; + + #[test] + fn test_display() -> Result<(), Box<dyn std::error::Error>> { + new_grammar().map(|g| println!("{g}")) + } +} + +#[cfg(test)] +mod test_grammar_firsts { + use crate::test_grammar_helper::new_grammar; + use std::io::Write; + + #[test] + fn test_firsts() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_grammar()?; + + grammar.compute_firsts()?; + + let mut lock = std::io::stdout().lock(); + + writeln!(lock, "grammar firsts: {:?}", grammar.firsts)?; + writeln!(lock, "grammar first nodes: {:?}", grammar.first_nodes).map_err(Into::into) + } +} + +#[cfg(test)] +mod test_grammar_left_closure; diff --git a/grammar/src/tests/test_grammar_left_closure.rs b/grammar/src/tests/test_grammar_left_closure.rs new file mode 100644 index 0000000..0bc9f4d --- /dev/null +++ b/grammar/src/tests/test_grammar_left_closure.rs @@ -0,0 +1,272 @@ +use crate::test_grammar_helper::*; +use crate::*; +use nfa::Nfa; +use std::{ + collections::HashSet, + io::{stdout, Write}, +}; + +#[test] +fn test_regex() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_grammar()?; + + let vec_of_regexps = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + writeln!(lock, "grammar firsts: {:?}", grammar.firsts)?; + writeln!(lock, "grammar first nodes: {:?}", grammar.first_nodes)?; + writeln!(lock, "grammar:")?; + writeln!(lock, "{grammar}")?; + + for regex in vec_of_regexps.into_iter().skip(1) { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| format!("{tnt}"))? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + } + + Ok(()) +} + +// We ignore this test by default as it is possibly involved with +// printing to a graphviz file. +#[ignore] +#[test] +fn test_nfa() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_notes_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("H({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + } + + grammar + .left_closure_to_nfa(&closure) + .map(|_| ()) + .map_err(Into::into) + + // let _nfa = grammar.left_closure_to_nfa(&closure)?; + + // writeln!(lock, "Not printing nfa to nfa.gv")?; + + // nfa.print_viz("nfa.gv").map_err(Into::into) + + // Ok(()) +} + +#[test] +#[ignore] +fn test_remove_epsilon() -> Result<(), Box<dyn std::error::Error>> { + let mut lock = stdout().lock(); + + let mut grammar = new_paren_grammar()?; + + writeln!(lock, "grammar:")?; + writeln!(lock, "{grammar}")?; + + let closure = new_closure_regex(&mut grammar)?; + + let mut accumulator_value: usize = 0; + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + writeln!(lock, "regex len = {}", regex.nodes_len())?; + writeln!(lock, "offset = {accumulator_value}")?; + + accumulator_value += regex.nodes_len(); + } + + writeln!(lock, "total = {accumulator_value}")?; + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.print_viz("nfa_orig.gv")?; + + nfa.remove_epsilon(|label| label.is_none())?; + + nfa.print_viz("nfa_no_epsilon.gv")?; + + Ok(()) +} + +#[test] +#[ignore] +fn test_remove_dead() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_paren_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + let mut accumulator = 0usize; + let mut accumulators = Vec::with_capacity(closure.len() + 1); + accumulators.push(accumulator); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + + accumulator += regex.nodes_len() * 2; + + accumulators.push(accumulator); + } + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.print_viz("nfa_orig.gv")?; + + nfa.remove_epsilon(|label| label.is_none())?; + + let accumulators: HashSet<usize> = accumulators.into_iter().collect(); + + println!("accumulators = {accumulators:?}"); + + let grammar_reserve_node = |node| accumulators.contains(&node); + + nfa.remove_dead(grammar_reserve_node)?; + + nfa.print_viz("nfa_no_dead.gv")?; + + Ok(()) +} + +#[test] +#[ignore] +fn test_nulling() -> Result<(), Box<dyn std::error::Error>> { + let mut grammar = new_left_recursive_grammar()?; + let closure = new_closure_regex(&mut grammar)?; + + let mut lock = stdout().lock(); + + let mut accumulators = Vec::with_capacity(closure.len() + 1); + accumulators.push(0); + + for regex in closure.iter() { + writeln!( + lock, + "regex: {}", + regex.to_string_with(|tnt| { + match tnt { + TNT::Ter(t) => { + format!( + "({})", + grammar.name_of_tnt(grammar.unpack_tnt(t).unwrap()).unwrap() + ) + } + TNT::Non(_) => { + // hyper non-terminal + format!("H({})", grammar.name_of_tnt(tnt).unwrap()) + } + } + })? + )?; + // println!("regex: {regex:?}",); + writeln!(lock, "regex len = {}", regex.nodes_len())?; + + accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap()); + } + + write!(lock, "nullables:")?; + let mut first_time = true; + for i in 0..(grammar.non_num()) { + if grammar.is_nullable(i)? { + if first_time { + write!(lock, " ")?; + } else { + write!(lock, ", ")?; + } + write!(lock, " {i}")?; + first_time = false; + } + } + writeln!(lock)?; + + let accumulators: HashSet<usize> = accumulators.into_iter().collect(); + + let mut nfa = grammar.left_closure_to_nfa(&closure)?; + + nfa.nulling(|label| { + if let Some(label) = *label { + match label { + TNT::Ter(_) => false, + // Panics if a non-terminal references an invalid node + // here. + TNT::Non(n) => grammar.is_nullable(n).unwrap(), + } + } else { + true + } + })?; + + let grammar_reserve_nodes = |node| accumulators.contains(&node); + + writeln!(lock, "accumulators are {accumulators:?}")?; + + nfa.remove_epsilon(|label| label.is_none())?; + nfa.remove_dead(grammar_reserve_nodes)?; + + writeln!(lock, "Printing nfa to nfa.gv")?; + + nfa.print_viz("nfa.gv")?; + + Ok(()) +} |