#![warn(missing_docs)] //! This file implements the extected behaviours of grammars. // NOTE: We shall first start with a parser that works at the level of // characters. The purpose is to first experiment with the workings // and the performance of the algorithms, before optimising by using // regular expressions to classify inputs into tokens. In other // words, the current focus is not on the optimisations, whereas // scanners are for optimisations only, so to speak. // REVIEW: Separate contents into modules. use nfa::{ default::{ nfa::DefaultNFA, regex::{DefaultRegex, ParseError, RegexType}, }, LabelType, Nfa, NfaLabel, Regex, SoC, TwoEdges, }; use graph::{adlist::ALGBuilder, builder::Builder, Graph}; use std::{ collections::{HashMap, HashSet}, fmt::Display, }; /// The type of a terminal. /// /// For the time being this is a wrapper around a string, but in the /// future it may hold more information of scanners. #[derive(Debug, Clone, Eq, PartialEq)] pub struct Terminal { // If we want to use scanners, per chance add them as a new field // here. name: String, } impl Terminal { /// Create a terminal with the given name. #[inline] pub fn new(name: String) -> Self { Self { name } } /// Return the name of the terminal. #[inline] pub fn name(&self) -> &str { &self.name } } /// The type of a non-terminal. /// /// This is just a wrapper around a string. #[derive(Debug, Clone)] pub struct Nonterminal(String); impl Nonterminal { /// Return the name of the nonterminal. /// /// Just to improve readability. #[inline] pub fn name(&self) -> &str { &self.0 } } /// The type of a terminal or a non-terminal. /// /// Only an index is stored here. Actual data are stored in two other /// arrays. #[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] pub enum TNT { /// Terminal variant Ter(usize), /// Nonterminal variant Non(usize), } impl Display for TNT { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { TNT::Ter(t) => write!(f, "T({t})"), TNT::Non(n) => write!(f, "N({n})"), } } } /// Errors related to grammar operations. #[derive(Debug, Copy, Clone)] #[non_exhaustive] pub enum Error { /// The operation requires the grammar to be after a certain /// state, but the grammar is not after that state yet. WrongState(GrammarState, GrammarState), /// The first component is the index, and the second the bound. IndexOutOfBounds(usize, usize), /// Fail to build the N-th regular expression, due to the /// ParseError. BuildFail(usize, ParseError), /// fail to build NFA NFAFail(nfa::error::Error), } impl From for Error { fn from(nfae: nfa::error::Error) -> Self { Self::NFAFail(nfae) } } impl Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"), Error::BuildFail(n, pe) => write!( f, "Failed to build the {n}-th regular expression due to error: {pe}" ), Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"), Error::WrongState(current, threshold) => { write!(f, "require state {threshold}, but in state {current}") } } } } impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { if let Error::NFAFail(error) = self { Some(error) } else { None } } } /// A rule is a regular expression of terminals or non-terminals. #[derive(Debug, Clone)] pub struct Rule { regex: DefaultRegex, } impl Rule { /// Return true if and only if the rule is empty. #[inline] pub fn is_empty(&self) -> bool { self.regex.is_empty() } /// Return the length of the rule. #[inline] pub fn len(&self) -> usize { self.regex.len() } } /// The state of Grammar. /// /// This is used to ensure that the grammar preparation methods are /// called in the correct order. #[derive(Debug, Copy, Clone, Default)] pub enum GrammarState { /// Just initialized #[default] Initial, /// compute_firsts has been called AfterComputeFirst, /// left_closure has been called. AfterLeftClosure, /// left_closure_to_nfa has been called. AfterNFA, } impl Display for GrammarState { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use GrammarState::*; match self { Initial => write!(f, "initial"), AfterComputeFirst => write!(f, "after computation of first set"), AfterLeftClosure => write!(f, "after computation of closure"), AfterNFA => write!(f, "after computation of NFA"), } } } /// The type of a grammar. #[derive(Debug, Clone, Default)] pub struct Grammar { /// A list of terminals. ter: Vec, /// A list of non-terminals. non: Vec, /// A list of rules. /// /// The length of the list must match that of the list of /// non-terminals. rules: Vec, /// The list of successive sums of lengths of rules. accumulators: Vec, // The following two attributes are empty until we call // `compute_firsts` on the grammar. /// The list of sets of "first terminals". /// /// The length must match that of the list of non-terminals. firsts: Vec>>, /// The list of lists of nodes that are reachable after a nullable /// transition in the regular expression. /// /// The length must match that of the list of non-terminals. first_nodes: Vec>, // The following attribute is empty until we call `closure` on the // NFA with `transform_label_null_epsilon` as the transformer. /// A hash map that maps a tuple `(pos1, pos2)` of positions /// `pos1` and `pos2` in the rules to a vector of rule positions. /// This vector means that in order to expand from `pos1` to /// `pos`, it is necessary to expand according to the positions in /// the vector, so we need to add all these expansions into the /// parse forest later. expansion_map: HashMap<(usize, usize), Vec>, /// The state of the grammar, which tells us what information has /// been computed for this grammar. state: GrammarState, } /// A private type to aid the recursive looping of rergular /// expressions. #[derive(Copy, Clone)] enum StackElement { Seen(usize), Unseen(usize), } impl StackElement { fn index(self) -> usize { match self { Self::Seen(index) => index, Self::Unseen(index) => index, } } fn is_seen(self) -> bool { matches!(self, Self::Seen(_)) } } impl Grammar { /// Construct a grammar from a vector of terminals, a vector of /// non-terminals, and a vector of rules for the non-temrinals. /// /// # Panic /// /// If the length of `non` is not equal to that of `rules`, then /// the function panics. pub fn new(ter: Vec, non: Vec, rules: Vec) -> Self { assert_eq!(non.len(), rules.len()); // One more room is reserved for the `None` value. let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1)) .take(non.len()) .collect(); let first_nodes = rules .iter() .map(|rule| Vec::with_capacity(rule.len())) .collect(); let state = Default::default(); let expansion_map = Default::default(); // NOTE: We cannot calculate accumulators here, as we want the // accumulators of the regular expression of the left-closure, // not of the original one. let accumulators = Vec::new(); Self { ter, non, rules, firsts, first_nodes, state, expansion_map, accumulators, } } /// Return the name of a terminal or a non-terminal. pub fn name_of_tnt(&self, tnt: TNT) -> Result { match tnt { TNT::Ter(t) => Ok(format!( "T{}", self.ter .get(t) .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))? .name() )), TNT::Non(n) => Ok(format!( "N{}", self.non .get(n) .ok_or(Error::IndexOutOfBounds(n, self.non.len()))? .name() )), } } /// Return true if and only if there are no non-terminals in the /// grammar. #[inline] pub fn is_empty(&self) -> bool { self.non.is_empty() } /// Return the total length of all rules. #[inline] pub fn total(&self) -> usize { self.accumulators.last().copied().unwrap_or(0) } /// Query if a position is the starting position of a /// non-terminal. If it is, return the non-terminal, else return /// `None` . #[inline] pub fn is_nt_start_in_nfa_p(&self, pos: usize) -> Option { for (index, accumulator) in self.accumulators.iter().copied().enumerate() { let shifted_accumulator = accumulator << 1; // NOTE: Clippy suggests to call `cmp`, but it seems // compiler might not yet be smart enough to inline that // call, so I just silence clippy here. #[allow(clippy::comparison_chain)] if pos == shifted_accumulator { return Some(index); } else if pos < shifted_accumulator { break; } } None } /// Return the number of terminals. #[inline] pub fn ter_num(&self) -> usize { self.ter.len() } /// Return the number of non-terminals. #[inline] pub fn non_num(&self) -> usize { self.non.len() } /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a /// single number to represent terminals and non-terminals. /// /// # Bounds /// /// If a terminal index is greater than or equal to the number of /// terminals, then this signals an error; mutatis mutandis for /// non-terminals. /// /// # Related /// /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt]. #[inline] pub fn pack_tnt(&self, tnt: TNT) -> Result { let ter_num = self.ter.len(); let non_num = self.non.len(); match tnt { TNT::Ter(t) => { if t >= ter_num { Err(Error::IndexOutOfBounds(t, ter_num)) } else { Ok(t) } } TNT::Non(n) => { if n >= non_num { Err(Error::IndexOutOfBounds(n, non_num)) } else { Ok(n + ter_num) } } } } /// Convert a single number to either a terminal or a /// non-terminal. /// /// # Bounds /// /// If the number is greater than or equal to the sum of the /// numbers of terminals and of non-terminals, then this signals /// an error. /// /// # Related /// /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt]. /// /// # Errors /// /// This function is supposed to return only one type of errors, /// namely, the IndexOutOfBounds error that results from a bounds /// check. Breaking this is breaking the guarantee of this /// function, and is considered a bug. This behaviour can and /// should be tested. But I have not found a convenient test /// method for testing various grammars. #[inline] pub fn unpack_tnt(&self, flat: usize) -> Result { let ter_num = self.ter.len(); let non_num = self.non.len(); if flat < ter_num { Ok(TNT::Ter(flat)) } else if flat < ter_num + non_num { Ok(TNT::Non(flat - ter_num)) } else { Err(Error::IndexOutOfBounds(flat, ter_num + non_num)) } } /// Return true if and only if the non-terminal is nullable. #[inline] pub fn is_nullable(&self, non_terminal: usize) -> Result { Ok(self .firsts .get(non_terminal) .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? .contains(&None)) } /// Query the expansion information from the position `pos1` to /// the position `pos2` . #[inline] pub fn query_expansion(&self, pos1: usize, pos2: usize) -> Result, Error> { if pos1 >= self.total() { return Err(Error::IndexOutOfBounds(pos1, self.total())); } if pos2 >= self.total() { return Err(Error::IndexOutOfBounds(pos2, self.total())); } match self.state { GrammarState::AfterLeftClosure => {} _ => { return Err(Error::WrongState( self.state, GrammarState::AfterLeftClosure, )); } } Ok(self.expansion_map.get(&(pos1, pos2)).map(|v| &v[..])) } // REVIEW: Do we have a better way to record expansion information // than to compute the transitive closure? /// A transformer of labels to be fed into /// [`closure`][nfa::default::nfa::DefaultNFA::closure], with the /// predicate that returns true if and only if the label of the /// first edge is either empty or a nullable non-terminal. pub fn transform_label_null_epsilon( &mut self, two_edges: TwoEdges>, ) -> LabelType { let (first_source, first_target, first_label) = two_edges.first_edge(); let (second_source, second_target, second_label) = two_edges.second_edge(); #[cfg(debug_assertions)] { assert_eq!(first_target, second_source); if let Some(tnt) = *first_label.get_value() { assert!(matches!(tnt, TNT::Non(n) if matches!(self.is_nullable(n), Ok(true)))); } } // Compute if this is from left-linear expansion: it is so if // and only if one if either the edges comes from left-linear // expansion or we are moving across a non-terminal expansion, // that is to say, the source of the second edge is the // starting edge of a non-terminal. let mut left_p = first_label.get_left_p() || second_label.get_left_p(); // Record left-linear expansion information. if let Some(second_nt) = self.is_nt_start_in_nfa_p(second_source) { left_p = true; if !self .expansion_map .contains_key(&(first_source, second_target)) { let original_expansion = self.expansion_map.get(&(second_source, second_target)); self.expansion_map.insert( (first_source, second_target), if let Some(original_expansion) = original_expansion { let mut result = original_expansion.clone(); result.push(second_nt); result } else { vec![second_nt] }, ); } } NfaLabel::new(second_label.get_value(), second_label.get_moved(), left_p) } /// For a NON_TERMINAL, return an iterator that goes over the /// nodes that are reachable from the non-terminal through an /// empty transition of the nondeterministic finite automaton. #[inline] pub fn first_nodes_of(&self, non_terminal: usize) -> Result, Error> { match self.state { GrammarState::Initial => { return Err(Error::WrongState( self.state, GrammarState::AfterComputeFirst, )); } GrammarState::AfterComputeFirst | GrammarState::AfterLeftClosure | GrammarState::AfterNFA => {} } Ok(self .first_nodes .get(non_terminal) .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))? .iter()) } } pub mod first_set; pub mod left_closure; impl Display for Grammar { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { assert_eq!(self.non.len(), self.rules.len()); for (nt, rule) in self.non.iter().zip(self.rules.iter()) { write!(f, "{}: ", nt.name())?; writeln!( f, "{}", rule.regex.to_string_with(|tnt| format!( "({})", self.name_of_tnt(tnt) .unwrap_or_else(|_| format!("Unknown {tnt:?}")) ))? )?; } Ok(()) } } // A helper module that provides some grammars for testing. #[cfg(feature = "test-helper")] pub mod test_grammar_helper; #[cfg(test)] mod tests;