#![warn(missing_docs)]
//! This file implements the expected behaviours of grammars.

// NOTE: We shall first start with a parser that works at the level of
// characters.  The purpose is to first experiment with the workings
// and the performance of the algorithms, before optimising by using
// regular expressions to classify inputs into tokens.  In other
// words, the current focus is not on the optimisations, whereas
// scanners are for optimisations only, so to speak.

#![allow(unused_imports)]

use nfa::{
    default::{
        nfa::DefaultNFA,
        regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
    },
    DOption, DesRec, Nfa, Regex, SoC,
};

use graph::{adlist::ALGBuilder, builder::Builder, Graph};

use std::{
    collections::HashSet,
    fmt::{Display, Write},
};

/// The type of a terminal.
///
/// For the time being this is a wrapper around a string, but in the
/// future it may hold more information about scanners.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Terminal {
    // If we want to use scanners, perhaps add them as a new field
    // here.
    name: String,
}

impl Terminal {
    /// Create a terminal with the given name.
    #[inline]
    pub fn new(name: String) -> Self {
        Self { name }
    }

    /// Return the name of the terminal.
    #[inline]
    pub fn name(&self) -> &str {
        &self.name
    }
}

/// The type of a non-terminal.
///
/// This is just a wrapper around a string.
#[derive(Debug, Clone)]
pub struct Nonterminal(String);

impl Nonterminal {
    /// Return the name of the nonterminal.
    ///
    /// Just to improve readability.
    #[inline]
    pub fn name(&self) -> &str {
        &self.0
    }
}

/// The type of a terminal or a non-terminal.
///
/// Only an index is stored here.  The actual data are stored in two
/// other arrays.
#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
pub enum TNT {
    /// Terminal variant
    Ter(usize),
    /// Nonterminal variant
    Non(usize),
}

impl Display for TNT {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            TNT::Ter(t) => write!(f, "T({t})"),
            TNT::Non(n) => write!(f, "N({n})"),
        }
    }
}

/// Errors related to grammar operations.
#[derive(Debug, Copy, Clone)]
#[non_exhaustive]
pub enum Error {
    /// The first component is the index, and the second the bound.
    IndexOutOfBounds(usize, usize),
    /// Failed to build the N-th regular expression, due to the
    /// `ParseError`.
    BuildFail(usize, ParseError),
    /// Failed to build the NFA.
    NFAFail(nfa::error::Error),
}

impl From<nfa::error::Error> for Error {
    fn from(nfae: nfa::error::Error) -> Self {
        Self::NFAFail(nfae)
    }
}

impl Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"),
            Error::BuildFail(n, pe) => write!(
                f,
                "failed to build the {n}-th regular expression due to error: {pe}"
            ),
            Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
        }
    }
}

impl std::error::Error for Error {}

/// A rule is a regular expression of terminals or non-terminals.
#[derive(Debug, Clone)]
pub struct Rule {
    regex: DefaultRegex<TNT>,
}

impl Rule {
    /// Return true if and only if the rule is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.regex.is_empty()
    }

    /// Return the length of the rule.
    #[inline]
    pub fn len(&self) -> usize {
        self.regex.len()
    }
}

/// The type of a grammar.
#[derive(Debug, Clone, Default)]
pub struct Grammar {
    // The list of terminals.
    ter: Vec<Terminal>,
    // The list of non-terminals.
    non: Vec<Nonterminal>,
    // The list of rules, one per non-terminal.
    rules: Vec<Rule>,
    // The computed FIRST sets; `None` records that the non-terminal
    // is nullable.
    firsts: Vec<HashSet<Option<usize>>>,
    // For each non-terminal, the nodes of its rule that are reachable
    // from the root through an empty transition of the
    // nondeterministic finite automaton.
    first_nodes: Vec<Vec<usize>>,
}
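// An illustrative sketch of the index-based representation described
// above: a `TNT` prints only its index, while the names are resolved
// through the wrapper types stored in the grammar's arrays.
#[cfg(test)]
mod test_tnt_representation {
    use super::*;

    #[test]
    fn display_shows_indices_only() {
        assert_eq!(format!("{}", TNT::Ter(0)), "T(0)");
        assert_eq!(format!("{}", TNT::Non(1)), "N(1)");

        // The names live in the wrapper types instead.
        assert_eq!(Terminal::new("a".to_owned()).name(), "a");
        assert_eq!(Nonterminal("start".to_owned()).name(), "start");
    }
}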
/// A private type to aid the recursive looping of regular
/// expressions.
#[derive(Copy, Clone)]
enum StackElement {
    Seen(usize),
    Unseen(usize),
}

impl StackElement {
    fn index(self) -> usize {
        match self {
            Self::Seen(index) => index,
            Self::Unseen(index) => index,
        }
    }

    fn is_seen(self) -> bool {
        matches!(self, Self::Seen(_))
    }
}
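// A small sketch of the convention used by `compute_firsts` below: a
// node is first pushed as `Unseen`, in order to schedule its
// children, and is pushed back as `Seen`, so that it is processed
// again after all of its children have been handled.
#[cfg(test)]
mod test_stack_element {
    use super::StackElement::{Seen, Unseen};

    #[test]
    fn seen_and_unseen_share_the_index() {
        assert_eq!(Seen(3).index(), 3);
        assert_eq!(Unseen(3).index(), 3);
        assert!(Seen(0).is_seen());
        assert!(!Unseen(0).is_seen());
    }
}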
impl Grammar {
    /// Construct a grammar from a vector of terminals, a vector of
    /// non-terminals, and a vector of rules for the non-terminals.
    ///
    /// # Panics
    ///
    /// The function panics if the length of `non` is not equal to
    /// that of `rules`.
    pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self {
        assert_eq!(non.len(), rules.len());

        // One more room is reserved for the `None` value.
        let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1))
            .take(non.len())
            .collect();

        let first_nodes = rules
            .iter()
            .map(|rule| Vec::with_capacity(rule.len()))
            .collect();

        Self {
            ter,
            non,
            rules,
            firsts,
            first_nodes,
        }
    }

    /// Return the name of a terminal or a non-terminal.
    pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
        match tnt {
            TNT::Ter(t) => Ok(format!(
                "T{}",
                self.ter
                    .get(t)
                    .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
                    .name()
            )),
            TNT::Non(n) => Ok(format!(
                "N{}",
                self.non
                    .get(n)
                    .ok_or(Error::IndexOutOfBounds(n, self.non.len()))?
                    .name()
            )),
        }
    }

    /// Return true if and only if there are no non-terminals in the
    /// grammar.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.non.is_empty()
    }

    /// Return the total length of all rules.
    #[inline]
    pub fn total(&self) -> usize {
        self.rules.iter().map(Rule::len).sum()
    }

    /// Return the number of terminals.
    #[inline]
    pub fn ter_num(&self) -> usize {
        self.ter.len()
    }

    /// Return the number of non-terminals.
    #[inline]
    pub fn non_num(&self) -> usize {
        self.non.len()
    }

    /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a
    /// single number to represent terminals and non-terminals.
    ///
    /// # Bounds
    ///
    /// If a terminal index is greater than or equal to the number of
    /// terminals, then this signals an error; mutatis mutandis for
    /// non-terminals.
    ///
    /// # Related
    ///
    /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt].
    #[inline]
    pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> {
        let ter_num = self.ter.len();
        let non_num = self.non.len();

        match tnt {
            TNT::Ter(t) => {
                if t >= ter_num {
                    Err(Error::IndexOutOfBounds(t, ter_num))
                } else {
                    Ok(t)
                }
            }
            TNT::Non(n) => {
                if n >= non_num {
                    Err(Error::IndexOutOfBounds(n, non_num))
                } else {
                    Ok(n + ter_num)
                }
            }
        }
    }

    /// Convert a single number to either a terminal or a
    /// non-terminal.
    ///
    /// # Bounds
    ///
    /// If the number is greater than or equal to the sum of the
    /// numbers of terminals and of non-terminals, then this signals
    /// an error.
    ///
    /// # Related
    ///
    /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt].
    #[inline]
    pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> {
        let ter_num = self.ter.len();
        let non_num = self.non.len();

        if flat < ter_num {
            Ok(TNT::Ter(flat))
        } else if flat < ter_num + non_num {
            Ok(TNT::Non(flat - ter_num))
        } else {
            Err(Error::IndexOutOfBounds(flat, ter_num + non_num))
        }
    }

    /// Return true if and only if the non-terminal is nullable.
    #[inline]
    pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> {
        Ok(self
            .firsts
            .get(non_terminal)
            .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
            .contains(&None))
    }

    /// For a NON_TERMINAL, return an iterator that goes over the
    /// nodes that are reachable from the non-terminal through an
    /// empty transition of the nondeterministic finite automaton.
    #[inline]
    pub fn first_nodes_of(
        &self,
        non_terminal: usize,
    ) -> Result<std::slice::Iter<'_, usize>, Error> {
        Ok(self
            .first_nodes
            .get(non_terminal)
            .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
            .iter())
    }

    /// Compute, for every non-terminal, the set of terminals that can
    /// appear as the first terminal in some left-linear derivation of
    /// that non-terminal.
    ///
    /// This is a fixed-point iteration that computes the transitive
    /// closure, which is a common approach for this task.  But
    /// perhaps there are more efficient approaches?
    ///
    /// The function also computes the set of "reachable nodes" in the
    /// process, and records the information in the `first_nodes`
    /// attribute.
    pub fn compute_firsts(&mut self) -> Result<(), Error> {
        let mut updated = true;

        let non_len = self.non_num();

        use StackElement::{Seen, Unseen};

        while updated {
            updated = false;

            for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
                let root = if let Some(root) = regex.root() {
                    root
                } else {
                    if !self.is_nullable(n)? {
                        updated = true;

                        self.firsts.get_mut(n).unwrap().insert(None);

                        // The default construction of a grammar
                        // reserves some space for each vector, so
                        // explicitly setting this can reduce some
                        // minor memory overhead.
                        let pointer = self.first_nodes.get_mut(n).unwrap();
                        pointer.clear();
                        pointer.shrink_to_fit();
                    }

                    continue;
                };

                let regex_len = regex.len();

                let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);

                stack.push(Unseen(root));

                let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
                    Vec::with_capacity(regex_len);

                let mut children_nodes_stack: Vec<HashSet<usize>> =
                    Vec::with_capacity(regex_len);

                while let Some(top) = stack.pop() {
                    let top_index = top.index();
                    let is_seen = top.is_seen();

                    match regex
                        .vertex_label(top_index)
                        .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
                    {
                        RegexType::Kleene => {
                            if !is_seen {
                                stack.push(Seen(top_index));

                                for child in regex.children_of(top_index).unwrap() {
                                    stack.push(Unseen(child));
                                }
                            } else {
                                let degree = regex.degree(top_index).unwrap();
                                let children_stack_len = children_sets_stack.len();
                                let children_nodes_len = children_nodes_stack.len();

                                assert!(
                                    children_stack_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                assert!(
                                    children_nodes_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                let mut this_set = HashSet::new();

                                this_set.insert(None);

                                let mut this_nodes = HashSet::new();

                                this_nodes.insert(top_index);

                                if degree == 0 {
                                    children_sets_stack.push(this_set);
                                    children_nodes_stack.push(this_nodes);
                                    continue;
                                }

                                let mut stop = false;

                                for (child_set, child_nodes) in children_sets_stack
                                    .drain((children_stack_len - degree)..)
                                    .zip(
                                        children_nodes_stack
                                            .drain((children_nodes_len - degree)..),
                                    )
                                {
                                    if stop {
                                        break;
                                    }

                                    if !child_set.contains(&None) {
                                        stop = true;
                                    }

                                    this_set.extend(child_set);
                                    this_nodes.extend(child_nodes);
                                }

                                children_sets_stack.push(this_set);
                                children_nodes_stack.push(this_nodes);
                            }
                        }
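                        // The `Plus` arm mirrors the `Kleene` arm,
                        // with one difference: a `Kleene` node can
                        // always match the empty string, so `None` is
                        // inserted unconditionally above, whereas a
                        // `Plus` node is nullable only if its chain
                        // of children is, so `None` is removed again
                        // when the merge stops at a non-nullable
                        // child.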
                        RegexType::Plus => {
                            if !is_seen {
                                stack.push(Seen(top_index));

                                for child in regex.children_of(top_index).unwrap() {
                                    stack.push(Unseen(child));
                                }
                            } else {
                                let degree = regex.degree(top_index).unwrap();
                                let children_stack_len = children_sets_stack.len();
                                let children_nodes_len = children_nodes_stack.len();

                                assert!(
                                    children_stack_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                assert!(
                                    children_nodes_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                let mut this_set = HashSet::new();
                                let mut this_nodes = HashSet::new();

                                this_nodes.insert(top_index);

                                if degree == 0 {
                                    this_set.insert(None);
                                    children_sets_stack.push(this_set);
                                    children_nodes_stack.push(this_nodes);
                                    continue;
                                }

                                let mut stop = false;

                                for (child_set, child_nodes) in children_sets_stack
                                    .drain((children_stack_len - degree)..)
                                    .zip(
                                        children_nodes_stack
                                            .drain((children_nodes_len - degree)..),
                                    )
                                {
                                    if stop {
                                        break;
                                    }

                                    if !child_set.contains(&None) {
                                        stop = true;
                                    }

                                    this_set.extend(child_set);
                                    this_nodes.extend(child_nodes);
                                }

                                if stop {
                                    this_set.remove(&None);
                                }

                                children_sets_stack.push(this_set);
                                children_nodes_stack.push(this_nodes);
                            }
                        }
                        RegexType::Optional => {
                            if !is_seen {
                                stack.push(Seen(top_index));

                                for child in regex.children_of(top_index).unwrap() {
                                    stack.push(Unseen(child));
                                }
                            } else {
                                let degree = regex.degree(top_index).unwrap();
                                let children_stack_len = children_sets_stack.len();
                                let children_nodes_len = children_nodes_stack.len();

                                assert!(
                                    children_stack_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                assert!(
                                    children_nodes_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                let mut this_set = HashSet::new();

                                this_set.insert(None);

                                let mut this_nodes = HashSet::new();

                                this_nodes.insert(top_index);

                                if degree == 0 {
                                    children_sets_stack.push(this_set);
                                    children_nodes_stack.push(this_nodes);
                                    continue;
                                }

                                let mut stop = false;

                                for (child_set, child_nodes) in children_sets_stack
                                    .drain((children_stack_len - degree)..)
                                    .zip(
                                        children_nodes_stack
                                            .drain((children_nodes_len - degree)..),
                                    )
                                {
                                    if stop {
                                        break;
                                    }

                                    if !child_set.contains(&None) {
                                        stop = true;
                                    }

                                    this_set.extend(child_set.iter().copied());
                                    this_nodes.extend(child_nodes.iter().copied());
                                }

                                children_sets_stack.push(this_set);
                                children_nodes_stack.push(this_nodes);
                            }
                        }
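                        // Unlike the sequential merges above, the
                        // `Or` arm below simply takes the union of
                        // the children's sets: any branch of an
                        // alternation can begin the derivation, so
                        // there is no early stop at a non-nullable
                        // child.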
                        RegexType::Or => {
                            if !is_seen {
                                stack.push(Seen(top_index));

                                for child in regex.children_of(top_index).unwrap() {
                                    stack.push(Unseen(child));
                                }
                            } else {
                                let degree = regex.degree(top_index).unwrap();
                                let children_stack_len = children_sets_stack.len();
                                let children_nodes_len = children_nodes_stack.len();

                                assert!(
                                    children_stack_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                assert!(
                                    children_nodes_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                let mut this_set = HashSet::new();
                                let mut this_nodes = HashSet::new();

                                this_nodes.insert(top_index);

                                if degree == 0 {
                                    this_set.insert(None);
                                    children_sets_stack.push(this_set);
                                    children_nodes_stack.push(this_nodes);
                                    continue;
                                }

                                for (child_set, child_nodes) in children_sets_stack
                                    .drain((children_stack_len - degree)..)
                                    .zip(
                                        children_nodes_stack
                                            .drain((children_nodes_len - degree)..),
                                    )
                                {
                                    this_set.extend(child_set.iter().copied());
                                    this_nodes.extend(child_nodes.iter().copied());
                                }

                                children_sets_stack.push(this_set);
                                children_nodes_stack.push(this_nodes);
                            }
                        }
                        RegexType::Paren => {
                            // Only for printing purposes.
                            let mut this_set = HashSet::new();

                            this_set.insert(None);

                            children_sets_stack.push(this_set);

                            let mut this_nodes = HashSet::new();

                            this_nodes.insert(top_index);

                            children_nodes_stack.push(this_nodes);
                        }
                        RegexType::Empty => {
                            if !is_seen {
                                stack.push(Seen(top_index));

                                // Push the children in reverse order,
                                // so that they are popped in the
                                // original order.
                                for child in regex.children_of(top_index).unwrap().rev() {
                                    stack.push(Unseen(child));
                                }
                            } else {
                                let degree = regex.degree(top_index).unwrap();
                                let children_stack_len = children_sets_stack.len();
                                let children_nodes_len = children_nodes_stack.len();

                                assert!(
                                    children_stack_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                assert!(
                                    children_nodes_len >= degree,
                                    "not enough stack elements for {top_index}"
                                );

                                let mut this_set = HashSet::new();
                                let mut this_nodes = HashSet::new();

                                this_nodes.insert(top_index);

                                if degree == 0 {
                                    this_set.insert(None);
                                    children_sets_stack.push(this_set);
                                    children_nodes_stack.push(this_nodes);
                                    continue;
                                }

                                let mut stop = false;

                                for (child_set, child_nodes) in children_sets_stack
                                    .drain((children_stack_len - degree)..)
                                    .zip(
                                        children_nodes_stack
                                            .drain((children_nodes_len - degree)..),
                                    )
                                {
                                    if stop {
                                        break;
                                    }

                                    if !child_set.contains(&None) {
                                        stop = true;
                                    }

                                    this_set.extend(child_set.iter().copied());
                                    this_nodes.extend(child_nodes.iter().copied());
                                }

                                if stop {
                                    this_set.remove(&None);
                                }

                                children_sets_stack.push(this_set);
                                children_nodes_stack.push(this_nodes);
                            }
                        }
                        RegexType::Lit(tnt) => {
                            match tnt {
                                TNT::Ter(t) => {
                                    let mut this_set = HashSet::with_capacity(1);

                                    this_set.insert(Some(t));

                                    children_sets_stack.push(this_set);
                                }
                                TNT::Non(non) => {
                                    let this_set = self
                                        .firsts
                                        .get(non)
                                        .ok_or(Error::IndexOutOfBounds(non, non_len))?
                                        .clone();

                                    children_sets_stack.push(this_set);
                                }
                            }

                            let mut this_nodes = HashSet::with_capacity(1);

                            this_nodes.insert(top_index);

                            children_nodes_stack.push(this_nodes);
                        }
                    }
                }

                assert_eq!(
                    children_sets_stack.len(),
                    1,
                    "Too many elements left at the end"
                );

                assert_eq!(
                    children_nodes_stack.len(),
                    1,
                    "Too many elements left at the end"
                );

                for first in children_sets_stack.pop().unwrap().into_iter() {
                    if !self.firsts.get(n).unwrap().contains(&first) {
                        updated = true;

                        self.firsts.get_mut(n).unwrap().insert(first);
                    }
                }

                *self.first_nodes.get_mut(n).unwrap() =
                    children_nodes_stack.pop().unwrap().into_iter().collect();
            }
        }

        Ok(())
    }
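    // As an illustration, matching the example grammar in the tests
    // below, for the rules
    //
    //     start: Ta*Tb+Nend+
    //     end:   Nstart?Nend*
    //
    // the iteration stabilises with FIRST(start) = {a, b}: the `Ta*`
    // prefix may be skipped but `Tb+` may not, so `start` is not
    // nullable.  For `end`, both parts are nullable, so FIRST(end) =
    // {a, b, None}, where `None` records nullability.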
    /// Return the regular language of the left-linear closures of
    /// non-terminals in the grammar.
    ///
    /// The resulting vector is guaranteed to be of the same length as
    /// the number of non-terminals.
    ///
    /// The resulting regular language is not "self-contained".  That
    /// is to say, its terminal indices are packed indices, and are
    /// meaningless without the interpretation of the grammar.  They
    /// should be converted to a nondeterministic finite automaton and
    /// then to its null closure later on.
    pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
        let non_len = self.non_num();

        let mut result = Vec::with_capacity(non_len);

        for (n, rule) in self.rules.iter().enumerate() {
            let regex = &rule.regex;

            let regex_root = if let Some(root) = regex.root() {
                root
            } else {
                result.push(Default::default());
                continue;
            };

            let regex_len = regex.len();

            /// A convenient macro to retrieve the children from the
            /// original regular expression with error propagation.
            macro_rules! children {
                ($n:expr) => {
                    regex
                        .children_of($n)
                        .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
                };
            }

            /// A convenient macro to retrieve the label from the
            /// original regular expression with error propagation.
            macro_rules! label {
                ($n:expr) => {
                    regex
                        .vertex_label($n)
                        .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
                };
            }

            let parents = regex.parents_array().map_err(|e| match e {
                nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
                nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
                _ => unreachable!(),
            })?;

            use RegexType::*;
            use TNT::*;

            let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);

            let mut builder = ALGBuilder::default();

            /// Perform a depth-first copy.
            macro_rules! df_copy {
                ($parent:expr, $index:expr) => {
                    match label!($index) {
                        Kleene | Plus | Optional | Or | Paren | Empty => {
                            let mut stack = vec![($parent, $index)];

                            while let Some((top_parent, top_index)) = stack.pop() {
                                let label = label!(top_index);
                                let label = match label {
                                    Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
                                    _ => label,
                                };

                                local_result.push(label);

                                let new = builder.add_vertex();

                                builder.add_edge(top_parent, new, ()).unwrap();

                                stack.extend(children!(top_index).map(|child| (new, child)));
                            }
                        }
                        Lit(remain_tnt) => {
                            local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));

                            let new = builder.add_vertex();

                            builder.add_edge($parent, new, ()).unwrap();
                        }
                    }
                };
            }

            local_result.push(Or);
            builder.add_vertex();

            local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));

            let non_lit_index = builder.add_vertex();

            builder.add_edge(0, non_lit_index, ()).unwrap();

            for first_node in self.first_nodes_of(n)?.copied() {
                assert!(first_node < parents.len());

                let tnt = match label!(first_node) {
                    Lit(tnt) => tnt,
                    _ => continue,
                };

                let mut parents_chain = {
                    let mut result = Vec::new();
                    let mut stack = Vec::with_capacity(parents.len());

                    stack.push(first_node);

                    while let Some(top) = stack.pop() {
                        assert!(top < parents.len());

                        if let Some(parent) = parents.get(top).copied().unwrap() {
                            result.push(parent);
                            stack.push(parent.0);
                        }
                    }

                    result.reverse();

                    result
                };

                if parents_chain.is_empty() {
                    local_result.push(Lit(tnt));

                    let lit_index = builder.add_vertex();

                    builder.add_edge(0, lit_index, ()).unwrap();

                    continue;
                }

                assert_eq!(parents_chain.first().unwrap().0, regex_root);

                // A different, "more local", root.
                let mut root: usize;

                // Handle the direct parent.
                let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();

                match label!(parent_node) {
                    Kleene | Plus => {
                        local_result.extend([Empty, Lit(tnt)]);

                        root = builder.add_vertex();

                        let lit_index = builder.add_vertex();

                        builder.add_edge(root, lit_index, ()).unwrap();

                        let iterator = children!(parent_node);

                        for index in iterator.clone().skip(parent_edge_index + 1) {
                            df_copy!(root, index);
                        }

                        local_result.push(Kleene);

                        let new_parent = builder.add_vertex();

                        builder.add_edge(root, new_parent, ()).unwrap();

                        for index in iterator {
                            df_copy!(new_parent, index);
                        }
                    }
                    Or => {
                        local_result.push(Lit(tnt));

                        root = builder.add_vertex();
                    }
                    Optional | Empty => {
                        // If this path is taken, the node should not
                        // be optional.
                        local_result.extend([Empty, Lit(tnt)]);

                        root = builder.add_vertex();

                        let lit_index = builder.add_vertex();

                        builder.add_edge(root, lit_index, ()).unwrap();

                        for index in children!(parent_node).skip(parent_edge_index + 1) {
                            df_copy!(root, index);
                        }
                    }
                    Paren | Lit(_) => unreachable!(),
                }
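                // Each remaining ancestor in the chain contributes
                // the siblings to the right of the path just taken;
                // for a repetition node the whole construct may
                // repeat afterwards, hence the extra `Kleene` copy
                // below.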
                // Handle successive parents.
                for (node, edge_index) in parents_chain.into_iter() {
                    let node_type = label!(node);

                    match node_type {
                        Kleene | Plus => {
                            local_result.push(Empty);

                            let new_index = builder.add_vertex();

                            builder.add_edge(new_index, root, ()).unwrap();

                            root = new_index;

                            let iterator = children!(node);

                            for index in iterator.clone().skip(edge_index + 1) {
                                df_copy!(root, index);
                            }

                            local_result.push(Kleene);

                            let new_parent = builder.add_vertex();

                            builder.add_edge(root, new_parent, ()).unwrap();

                            for index in iterator {
                                df_copy!(new_parent, index);
                            }
                        }
                        RegexType::Or => {}
                        RegexType::Optional | RegexType::Empty => {
                            local_result.push(Empty);

                            let new_index = builder.add_vertex();

                            builder.add_edge(new_index, root, ()).unwrap();

                            root = new_index;

                            for index in children!(node).skip(edge_index + 1) {
                                df_copy!(root, index);
                            }
                        }
                        RegexType::Paren | RegexType::Lit(_) => unreachable!(),
                    }
                }

                builder.add_edge(0, root, ()).unwrap();
            }

            local_result.shrink_to_fit();

            let graph = builder.build();

            assert_eq!(graph.nodes_len(), local_result.len());

            result.push(
                DefaultRegex::new(graph, local_result)
                    .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
            );
        }

        assert_eq!(result.len(), non_len);

        Ok(result)
    }

    /// Convert the regular language of left-linear closures to its
    /// equivalent nondeterministic finite automaton.
    ///
    /// In the generation of the left-linear closure, the terminals
    /// and non-terminals are packed into an unsigned integer.  We
    /// unpack them when converting to the nondeterministic finite
    /// automaton.
    ///
    /// The resulting nondeterministic finite automaton should be
    /// transformed to its null closure for use in our algorithm.
    pub fn left_closure_to_nfa(
        &self,
        closure: &[DefaultRegex<TNT>],
    ) -> Result<DefaultNFA<DOption<TNT>>, Error> {
        let label_transform = |tnt| match tnt {
            TNT::Ter(t) => {
                let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
                    Error::IndexOutOfBounds(index, bound) => {
                        graph::error::Error::IndexOutOfBounds(index, bound)
                    }
                    _ => unreachable!(),
                })?;

                Ok(SoC::Carry(new_tnt))
            }
            TNT::Non(n) => Ok(SoC::Sub(n)),
        };

        DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into)
    }
}

impl Display for Grammar {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        assert_eq!(self.non.len(), self.rules.len());

        for (nt, rule) in self.non.iter().zip(self.rules.iter()) {
            write!(f, "{}: ", nt.name())?;

            writeln!(
                f,
                "{}",
                rule.regex.to_string_with(|tnt| format!(
                    "({})",
                    self.name_of_tnt(tnt)
                        .unwrap_or_else(|_| format!("Unknown {tnt:?}"))
                ))?
            )?;
        }

        Ok(())
    }
}
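// A minimal sketch of the packing scheme used throughout this file.
// It relies on `DefaultRegex` implementing `Default` (an empty
// rule), just as `left_closure` above does.
#[cfg(test)]
mod test_pack_unpack {
    use super::*;

    #[test]
    fn pack_and_unpack_roundtrip() -> Result<(), Error> {
        let grammar = Grammar::new(
            vec![Terminal::new("a".to_owned())],
            vec![Nonterminal("s".to_owned())],
            vec![Rule {
                regex: Default::default(),
            }],
        );

        // With one terminal, the non-terminal 0 packs to 0 + 1 = 1.
        assert_eq!(grammar.pack_tnt(TNT::Ter(0))?, 0);
        assert_eq!(grammar.pack_tnt(TNT::Non(0))?, 1);

        // Unpacking inverts the packing.
        assert_eq!(grammar.unpack_tnt(0)?, TNT::Ter(0));
        assert_eq!(grammar.unpack_tnt(1)?, TNT::Non(0));

        // Indices at or above ter_num + non_num are rejected.
        assert!(grammar.unpack_tnt(2).is_err());

        Ok(())
    }
}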
#[cfg(test)]
mod test_grammar_helper {
    use super::*;
    use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType};
    use std::fmt::Write;

    // Construct a grammar to test.
    pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
        let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())];
        let non = vec![
            Nonterminal("start".to_owned()),
            Nonterminal("end".to_owned()),
        ];

        /// A function to scan the inputs.
        fn scan_tnt(
            parser: &DefaultRegParser<TNT>,
            input: &str,
        ) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> {
            use ParseDirection::*;
            use RegexType::*;
            use TNT::*;

            let mut chars = input.chars();

            if let Some(first) = chars.next() {
                match first {
                    '*' => Ok(Some((1, Kleene, Right))),
                    '+' => Ok(Some((1, Plus, Right))),
                    '?' => Ok(Some((1, Optional, Right))),
                    '|' => Ok(Some((1, Empty, Up))),
                    '(' => Ok(Some((1, Or, Down))),
                    ')' => Ok(Some((1, Paren, Up))),
                    'T' => {
                        let mut name = String::new();
                        let mut len = 1;

                        while let Some(c) = chars.next() {
                            if ('a'..='z').contains(&c) {
                                len += 1;
                                write!(name, "{c}")
                                    .map_err(|_| ParseError::InvalidCharacter(c))?;
                            } else {
                                break;
                            }
                        }

                        if let Some(t) = parser.query(&name, true) {
                            Ok(Some((len, Lit(Ter(t)), Right)))
                        } else {
                            Err(ParseError::InvalidCharacter(first))
                        }
                    }
                    'N' => {
                        let mut name = String::new();
                        let mut len = 1;

                        while let Some(c) = chars.next() {
                            if ('a'..='z').contains(&c) {
                                len += 1;
                                write!(name, "{c}")
                                    .map_err(|_| ParseError::InvalidCharacter(c))?;
                            } else {
                                break;
                            }
                        }

                        if let Some(n) = parser.query(&name, false) {
                            Ok(Some((len, Lit(Non(n)), Right)))
                        } else {
                            Err(ParseError::InvalidCharacter(first))
                        }
                    }
                    _ => Err(ParseError::InvalidCharacter(first)),
                }
            } else {
                Ok(None)
            }
        }

        let mut regex_parser: DefaultRegParser<TNT> = Default::default();

        regex_parser.add_tnt("a", true);
        regex_parser.add_tnt("b", true);
        regex_parser.add_tnt("start", false);
        regex_parser.add_tnt("end", false);

        let regex_parser = regex_parser;

        let rule1 = Rule {
            regex: regex_parser
                .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)?
                .ok_or(ParseError::Invalid)?
                .0,
        };

        let rule2 = Rule {
            regex: regex_parser
                .parse("Nstart?Nend*", Box::new(scan_tnt), true)?
                .ok_or(ParseError::Invalid)?
                .0,
        };

        let rules = vec![rule1, rule2];

        Ok(Grammar::new(ter, non, rules))
    }
}

#[cfg(test)]
mod test_grammar_display {
    use super::test_grammar_helper::new_grammar;

    #[test]
    fn test_display() -> Result<(), Box<dyn std::error::Error>> {
        println!("{}", new_grammar()?);

        Ok(())
    }
}

#[cfg(test)]
mod test_grammar_firsts {
    use super::test_grammar_helper::new_grammar;
    use super::*;

    #[test]
    fn test_firsts() -> Result<(), Box<dyn std::error::Error>> {
        let mut grammar = new_grammar()?;

        grammar.compute_firsts()?;

        println!("grammar firsts: {:?}", grammar.firsts);
        println!("grammar first nodes: {:?}", grammar.first_nodes);

        Ok(())
    }
}

#[cfg(test)]
mod test_grammar_left_closure {
    use super::test_grammar_helper::new_grammar;
    use super::*;

    pub fn new_closure_regex(
        grammar: &mut Grammar,
    ) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> {
        grammar.compute_firsts()?;

        println!("grammar firsts: {:?}", grammar.firsts);
        println!("grammar first nodes: {:?}", grammar.first_nodes);
        println!("grammar:");
        println!("{grammar}");

        grammar.left_closure().map_err(Into::into)
    }

    #[test]
    fn test_regex() -> Result<(), Box<dyn std::error::Error>> {
        let mut grammar = new_grammar()?;
        let vec_of_regexps = new_closure_regex(&mut grammar)?;

        for regex in &vec_of_regexps {
            println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?);
            // println!("regex: {regex:?}",);
            println!("regex len = {}", regex.nodes_len());
        }

        Ok(())
    }

    #[test]
    fn test_nfa() -> Result<(), Box<dyn std::error::Error>> {
        let mut grammar = new_grammar()?;
        let closure = new_closure_regex(&mut grammar)?;
        let _nfa = grammar.left_closure_to_nfa(&closure)?;

        // TODO: Print the NFA out.

        Ok(())
    }
}
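// A small check of nullability on the example grammar above: `start`
// must derive at least one `b` (the `Tb+` part), while `end` can
// derive the empty string.  This sketch depends on the shape of
// `new_grammar` staying as it is.
#[cfg(test)]
mod test_grammar_nullable {
    use super::test_grammar_helper::new_grammar;

    #[test]
    fn test_nullable() -> Result<(), Box<dyn std::error::Error>> {
        let mut grammar = new_grammar()?;

        grammar.compute_firsts()?;

        // Index 0 is `start`, index 1 is `end`.
        assert!(!grammar.is_nullable(0)?);
        assert!(grammar.is_nullable(1)?);

        Ok(())
    }
}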