summaryrefslogtreecommitdiff
path: root/grammar/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'grammar/src/lib.rs')
-rw-r--r--grammar/src/lib.rs1131
1 files changed, 1131 insertions, 0 deletions
diff --git a/grammar/src/lib.rs b/grammar/src/lib.rs
new file mode 100644
index 0000000..55adc9a
--- /dev/null
+++ b/grammar/src/lib.rs
@@ -0,0 +1,1131 @@
+#![warn(missing_docs)]
+//! This file implements the extected behaviours of grammars.
+
+// NOTE: We shall first start with a parser that works at the level of
+// characters. The purpose is to first experiment with the workings
+// and the performance of the algorithms, before optimising by using
+// regular expressions to classify inputs into tokens. In other
+// words, the current focus is not on the optimisations, whereas
+// scanners are for optimisations only, so to speak.
+
+// TODO: Separate contents into modules.
+
+use nfa::{
+ default::{
+ nfa::DefaultNFA,
+ regex::{DefaultRegex, ParseError, RegexType},
+ },
+ DOption, Nfa, Regex, SoC,
+};
+
+use graph::{adlist::ALGBuilder, builder::Builder, Graph};
+
+use std::{collections::HashSet, fmt::Display};
+
+/// The type of a terminal.
+///
+/// For the time being this is a wrapper around a string, but in the
+/// future it may hold more information of scanners.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct Terminal {
+ // If we want to use scanners, per chance add them as a new field
+ // here.
+ name: String,
+}
+
+impl Terminal {
+ /// Create a terminal with the given name.
+ #[inline]
+ pub fn new(name: String) -> Self {
+ Self { name }
+ }
+
+ /// Return the name of the terminal.
+ #[inline]
+ pub fn name(&self) -> &str {
+ &self.name
+ }
+}
+
+/// The type of a non-terminal.
+///
+/// This is just a wrapper around a string.
+#[derive(Debug, Clone)]
+pub struct Nonterminal(String);
+
+impl Nonterminal {
+ /// Return the name of the nonterminal.
+ ///
+ /// Just to improve readability.
+ #[inline]
+ pub fn name(&self) -> &str {
+ &self.0
+ }
+}
+
+/// The type of a terminal or a non-terminal.
+///
+/// Only an index is stored here. Actual data are stored in two other
+/// arrays.
+#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
+pub enum TNT {
+ /// Terminal variant
+ Ter(usize),
+ /// Nonterminal variant
+ Non(usize),
+}
+
+impl Display for TNT {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ TNT::Ter(t) => write!(f, "T({t})"),
+ TNT::Non(n) => write!(f, "N({n})"),
+ }
+ }
+}
+
+/// Errors related to grammar operations.
+#[derive(Debug, Copy, Clone)]
+#[non_exhaustive]
+pub enum Error {
+ /// The first component is the index, and the second the bound.
+ IndexOutOfBounds(usize, usize),
+ /// Fail to build the N-th regular expression, due to the
+ /// ParseError.
+ BuildFail(usize, ParseError),
+ /// fail to build NFA
+ NFAFail(nfa::error::Error),
+}
+
+impl From<nfa::error::Error> for Error {
+ fn from(nfae: nfa::error::Error) -> Self {
+ Self::NFAFail(nfae)
+ }
+}
+
+impl Display for Error {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"),
+ Error::BuildFail(n, pe) => write!(
+ f,
+ "Failed to build the {n}-th regular expression due to error: {pe}"
+ ),
+ Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
+ }
+ }
+}
+
+impl std::error::Error for Error {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ if let Error::NFAFail(error) = self {
+ Some(error)
+ } else {
+ None
+ }
+ }
+}
+
+/// A rule is a regular expression of terminals or non-terminals.
+#[derive(Debug, Clone)]
+pub struct Rule {
+ regex: DefaultRegex<TNT>,
+}
+
+impl Rule {
+ /// Return true if and only if the rule is empty.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.regex.is_empty()
+ }
+
+ /// Return the length of the rule.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.regex.len()
+ }
+}
+
+/// The type of a grammar.
+#[derive(Debug, Clone, Default)]
+pub struct Grammar {
+ /// A list of terminals.
+ ter: Vec<Terminal>,
+ /// A list of non-terminals.
+ non: Vec<Nonterminal>,
+ /// A list of rules.
+ ///
+ /// The length of the list must match that of the list of
+ /// non-terminals.
+ rules: Vec<Rule>,
+ // The following two attributes are empty until we call
+ // `compute_firsts` on the grammar.
+ /// The list of sets of "first terminals".
+ ///
+ /// The length must match that of the list of non-terminals.
+ firsts: Vec<HashSet<Option<usize>>>,
+ /// The list of lists of nodes that are reachable after a nullable
+ /// transition in the regular expression.
+ ///
+ /// The length must match that of the list of non-terminals.
+ first_nodes: Vec<Vec<usize>>,
+ // The following attribute is empty until we call `left_closure`
+ // on the grammar.
+ left_closure_branches: HashSet<usize>,
+}
+
+/// A private type to aid the recursive looping of rergular
+/// expressions.
+#[derive(Copy, Clone)]
+enum StackElement {
+ Seen(usize),
+ Unseen(usize),
+}
+
+impl StackElement {
+ fn index(self) -> usize {
+ match self {
+ Self::Seen(index) => index,
+ Self::Unseen(index) => index,
+ }
+ }
+
+ fn is_seen(self) -> bool {
+ matches!(self, Self::Seen(_))
+ }
+}
+
+impl Grammar {
+ /// Construct a grammar from a vector of terminals, a vector of
+ /// non-terminals, and a vector of rules for the non-temrinals.
+ ///
+ /// # Panic
+ ///
+ /// If the length of `non` is not equal to that of `rules`, then
+ /// the function panics.
+ pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self {
+ assert_eq!(non.len(), rules.len());
+
+ // One more room is reserved for the `None` value.
+ let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1))
+ .take(non.len())
+ .collect();
+
+ let first_nodes = rules
+ .iter()
+ .map(|rule| Vec::with_capacity(rule.len()))
+ .collect();
+
+ let left_closure_branches = HashSet::default();
+
+ Self {
+ ter,
+ non,
+ rules,
+ firsts,
+ first_nodes,
+ left_closure_branches,
+ }
+ }
+
+ /// Return the name of a terminal or a non-terminal.
+ pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
+ match tnt {
+ TNT::Ter(t) => Ok(format!(
+ "T{}",
+ self.ter
+ .get(t)
+ .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
+ .name()
+ )),
+ TNT::Non(n) => Ok(format!(
+ "N{}",
+ self.non
+ .get(n)
+ .ok_or(Error::IndexOutOfBounds(n, self.non.len()))?
+ .name()
+ )),
+ }
+ }
+
+ /// Return true if and only if there are no non-terminals in the
+ /// grammar.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.non.is_empty()
+ }
+
+ /// Return the total length of all rules.
+ #[inline]
+ pub fn total(&self) -> usize {
+ self.rules.iter().map(Rule::len).sum()
+ }
+
+ /// Return the number of terminals.
+ #[inline]
+ pub fn ter_num(&self) -> usize {
+ self.ter.len()
+ }
+
+ /// Return the number of non-terminals.
+ #[inline]
+ pub fn non_num(&self) -> usize {
+ self.non.len()
+ }
+
+ /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a
+ /// single number to represent terminals and non-terminals.
+ ///
+ /// # Bounds
+ ///
+ /// If a terminal index is greater than or equal to the number of
+ /// terminals, then this signals an error; mutatis mutandis for
+ /// non-terminals.
+ ///
+ /// # Related
+ ///
+ /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt].
+ #[inline]
+ pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> {
+ let ter_num = self.ter.len();
+ let non_num = self.non.len();
+
+ match tnt {
+ TNT::Ter(t) => {
+ if t >= ter_num {
+ Err(Error::IndexOutOfBounds(t, ter_num))
+ } else {
+ Ok(t)
+ }
+ }
+ TNT::Non(n) => {
+ if n >= non_num {
+ Err(Error::IndexOutOfBounds(n, non_num))
+ } else {
+ Ok(n + ter_num)
+ }
+ }
+ }
+ }
+
+ /// Convert a single number to either a terminal or a
+ /// non-terminal.
+ ///
+ /// # Bounds
+ ///
+ /// If the number is greater than or equal to the sum of the
+ /// numbers of terminals and of non-terminals, then this signals
+ /// an error.
+ ///
+ /// # Related
+ ///
+ /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt].
+ ///
+ /// # Errors
+ ///
+ /// This function is supposed to return only one type of errors,
+ /// namely, the IndexOutOfBounds error that results from a bounds
+ /// check. Breaking this is breaking the guarantee of this
+ /// function, and is considered a bug. This behaviour can and
+ /// should be tested. But I have not found a convenient test
+ /// method for testing various grammars.
+ #[inline]
+ pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> {
+ let ter_num = self.ter.len();
+ let non_num = self.non.len();
+
+ if flat < ter_num {
+ Ok(TNT::Ter(flat))
+ } else if flat < ter_num + non_num {
+ Ok(TNT::Non(flat - ter_num))
+ } else {
+ Err(Error::IndexOutOfBounds(flat, ter_num + non_num))
+ }
+ }
+
+ /// Return true if and only if the non-terminal is nullable.
+ #[inline]
+ pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> {
+ Ok(self
+ .firsts
+ .get(non_terminal)
+ .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
+ .contains(&None))
+ }
+
+ /// For a NON_TERMINAL, return an iterator that goes over the
+ /// nodes that are reachable from the non-terminal through an
+ /// empty transition of the nondeterministic finite automaton.
+ #[inline]
+ pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> {
+ Ok(self
+ .first_nodes
+ .get(non_terminal)
+ .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
+ .iter())
+ }
+
+ /// Return a hash set that contains all nodes in the set of
+ /// left-closure regular languages that are added because of the
+ /// left-linear expansion.
+ pub fn left_closure_branches(&self) -> &HashSet<usize> {
+ &self.left_closure_branches
+ }
+
+ /// Compute the set of terminals that can appear as the first
+ /// terminal in some left-linear derivation of a non-terminal, for
+ /// every non-terminal.
+ ///
+ /// This is an algorithm that computes the transitive closure,
+ /// which is a common approach for this task. But perhaps there
+ /// are more efficient approaches?
+ ///
+ /// Also the function computes the set of "reachable nodes" in the
+ /// process, and records the information in the `first_nodes`
+ /// attribute.
+ pub fn compute_firsts(&mut self) -> Result<(), Error> {
+ let mut updated = true;
+
+ let non_len = self.non_num();
+
+ use StackElement::{Seen, Unseen};
+
+ while updated {
+ updated = false;
+
+ for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
+ let root = if let Some(root) = regex.root() {
+ root
+ } else {
+ if !self.is_nullable(n)? {
+ updated = true;
+
+ self.firsts.get_mut(n).unwrap().insert(None);
+
+ // The default construction of a grammar
+ // reserves some space for each vector, so
+ // explicitly setting this can reduce some
+ // minor memory overhead.
+ let pointer = self.first_nodes.get_mut(n).unwrap();
+
+ pointer.clear();
+ pointer.shrink_to_fit();
+ }
+
+ continue;
+ };
+
+ let regex_len = regex.len();
+
+ let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
+
+ stack.push(Unseen(root));
+
+ let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
+ Vec::with_capacity(regex_len);
+
+ let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
+
+ while let Some(top) = stack.pop() {
+ let top_index = top.index();
+ let is_seen = top.is_seen();
+
+ match regex
+ .vertex_label(top_index)
+ .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
+ {
+ RegexType::Kleene => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set);
+ this_nodes.extend(child_nodes);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Plus => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set);
+ this_nodes.extend(child_nodes);
+ }
+
+ if stop {
+ this_set.remove(&None);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Optional => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Or => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Paren => {
+ // Only for printing purposes
+ let mut this_set = HashSet::new();
+
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ children_nodes_stack.push(this_nodes);
+ }
+ RegexType::Empty => {
+ if !is_seen {
+ stack.push(Seen(top_index));
+
+ for child in regex.children_of(top_index).unwrap().rev() {
+ stack.push(Unseen(child));
+ }
+ } else {
+ let degree = regex.degree(top_index).unwrap();
+ let children_stack_len = children_sets_stack.len();
+ let children_nodes_len = children_nodes_stack.len();
+
+ assert!(
+ children_stack_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ assert!(
+ children_nodes_len >= degree,
+ "not enough stack elements for {top_index}"
+ );
+
+ let mut this_set = HashSet::new();
+
+ let mut this_nodes = HashSet::new();
+
+ this_nodes.insert(top_index);
+
+ if degree == 0 {
+ this_set.insert(None);
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+
+ continue;
+ }
+
+ let mut stop = false;
+
+ for (child_set, child_nodes) in children_sets_stack
+ .drain((children_stack_len - degree)..)
+ .zip(
+ children_nodes_stack.drain((children_nodes_len - degree)..),
+ )
+ {
+ if stop {
+ break;
+ }
+
+ if !child_set.contains(&None) {
+ stop = true;
+ }
+
+ this_set.extend(child_set.iter().copied());
+ this_nodes.extend(child_nodes.iter().copied());
+ }
+
+ if stop {
+ this_set.remove(&None);
+ }
+
+ children_sets_stack.push(this_set);
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ RegexType::Lit(tnt) => {
+ match tnt {
+ TNT::Ter(t) => {
+ let mut this_set = HashSet::with_capacity(1);
+
+ this_set.insert(Some(t));
+
+ children_sets_stack.push(this_set);
+ }
+ TNT::Non(non) => {
+ let this_set = self
+ .firsts
+ .get(non)
+ .ok_or(Error::IndexOutOfBounds(non, non_len))?
+ .clone();
+
+ children_sets_stack.push(this_set);
+ }
+ }
+
+ let mut this_nodes = HashSet::with_capacity(1);
+ this_nodes.insert(top_index);
+
+ children_nodes_stack.push(this_nodes);
+ }
+ }
+ }
+
+ assert_eq!(
+ children_sets_stack.len(),
+ 1,
+ "Too many elements left at the end"
+ );
+
+ assert_eq!(
+ children_nodes_stack.len(),
+ 1,
+ "Too many elements left at the end"
+ );
+
+ for first in children_sets_stack.pop().unwrap().into_iter() {
+ if !self.firsts.get(n).unwrap().contains(&first) {
+ updated = true;
+
+ self.firsts.get_mut(n).unwrap().insert(first);
+ }
+ }
+
+ *self.first_nodes.get_mut(n).unwrap() =
+ children_nodes_stack.pop().unwrap().into_iter().collect();
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Return the regular language of the left-linear closures of
+ /// non-terminals in the grammar.
+ ///
+ /// The resulting vector is guaranteed to be of the same length as
+ /// the number of non-terminals.
+ ///
+ /// The resulting regular language is not "self-contained". That
+ /// is to say, its terminals indices are packed indices and are
+ /// meaningless without the interpretation of the grammar. They
+ /// should be converted to a nondeterministic finite automaton and
+ /// then to its null closure later on.
+ pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
+ let non_len = self.non_num();
+
+ let mut result = Vec::with_capacity(non_len);
+
+ for (n, rule) in self.rules.iter().enumerate() {
+ let regex = &rule.regex;
+
+ let regex_root = if let Some(root) = regex.root() {
+ root
+ } else {
+ result.push(Default::default());
+
+ continue;
+ };
+
+ let regex_len = regex.len();
+
+ /// A convenient macro to retrieve the children from the
+ /// original regular expression with error propagation.
+ macro_rules! children {
+ ($n:expr) => {
+ regex
+ .children_of($n)
+ .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
+ };
+ }
+
+ /// A convenient macro to retrieve the label from the
+ /// original regular expression with error propagation.
+ macro_rules! label {
+ ($n:expr) => {
+ regex
+ .vertex_label($n)
+ .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
+ };
+ }
+
+ let parents = regex.parents_array().map_err(|e| match e {
+ nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
+ nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
+ _ => unreachable!(),
+ })?;
+
+ use RegexType::*;
+ use TNT::*;
+
+ let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);
+ let mut builder = ALGBuilder::default();
+
+ /// Perform a depth-first copy
+ macro_rules! df_copy {
+ ($parent:expr, $index:expr) => {
+ match label!($index) {
+ Kleene | Plus | Optional | Or | Paren | Empty => {
+ let mut stack = vec![($parent, $index)];
+
+ while let Some((top_parent, top_index)) = stack.pop() {
+ let label = label!(top_index);
+ let label = match label {
+ Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
+ _ => label,
+ };
+
+ local_result.push(label);
+
+ let new = builder.add_vertex();
+
+ builder.add_edge(top_parent, new, ()).unwrap();
+
+ stack.extend(children!(top_index).map(|child| (new, child)));
+ }
+ }
+ Lit(remain_tnt) => {
+ local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
+ let new = builder.add_vertex();
+ builder.add_edge($parent, new, ()).unwrap();
+ }
+ }
+ };
+ }
+
+ local_result.push(Or);
+ builder.add_vertex();
+
+ local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
+ let non_lit_index = builder.add_vertex();
+
+ builder.add_edge(0, non_lit_index, ()).unwrap();
+
+ // If this non-terminal is nullable, add an empty variant.
+ if self.is_nullable(n)? {
+ local_result.push(Empty);
+ let empty_index = builder.add_vertex();
+ builder.add_edge(0, empty_index, ()).unwrap();
+ }
+
+ for first_node in self.first_nodes_of(n)?.copied() {
+ assert!(first_node < parents.len());
+
+ let tnt = match label!(first_node) {
+ Lit(tnt) => Lit(tnt),
+ _ => continue,
+ };
+
+ let mut parents_chain = {
+ let mut result = Vec::new();
+ let mut stack = Vec::with_capacity(parents.len());
+
+ stack.push(first_node);
+
+ while let Some(top) = stack.pop() {
+ assert!(top < parents.len());
+ if let Some(parent) = parents.get(top).copied().unwrap() {
+ result.push(parent);
+ stack.push(parent.0);
+ }
+ }
+
+ result.reverse();
+
+ result
+ };
+
+ if parents_chain.is_empty() {
+ local_result.push(tnt);
+ let lit_index = builder.add_vertex();
+ builder.add_edge(0, lit_index, ()).unwrap();
+
+ continue;
+ }
+
+ assert_eq!(parents_chain.first().unwrap().0, regex_root);
+
+ // A different, "more local", root.
+ let mut local_root: usize;
+
+ // Handle the direct parent
+ let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();
+
+ match label!(parent_node) {
+ Kleene | Plus => {
+ // TODO: If parent_edge_index is 0, make a
+ // Plus node instead.
+ local_result.extend([Empty, tnt]);
+
+ local_root = builder.add_vertex();
+ let lit_index = builder.add_vertex();
+ builder.add_edge(local_root, lit_index, ()).unwrap();
+
+ let iterator = children!(parent_node);
+
+ for index in iterator.clone().skip(parent_edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+
+ local_result.push(Kleene);
+ let new_parent = builder.add_vertex();
+ builder.add_edge(local_root, new_parent, ()).unwrap();
+
+ for index in iterator {
+ df_copy!(new_parent, index);
+ }
+ }
+
+ Or => {
+ local_result.push(tnt);
+ local_root = builder.add_vertex();
+ }
+ Optional | Empty => {
+ // If this path is taken, it should not be
+ // optional.
+ local_result.extend([Empty, tnt]);
+ local_root = builder.add_vertex();
+ let lit_index = builder.add_vertex();
+ builder.add_edge(local_root, lit_index, ()).unwrap();
+
+ for index in children!(parent_node).skip(parent_edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+ }
+ Paren | Lit(_) => unreachable!(),
+ }
+
+ // Handle successive parents
+
+ for (node, edge_index) in parents_chain.into_iter() {
+ let node_type = label!(node);
+
+ match node_type {
+ Kleene | Plus => {
+ // TODO: If edge_index is 0, then just
+ // make this a Plus node.
+
+ local_result.push(Empty);
+ let new_index = builder.add_vertex();
+ builder.add_edge(new_index, local_root, ()).unwrap();
+
+ local_root = new_index;
+
+ let iterator = children!(node);
+
+ for index in iterator.clone().skip(edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+
+ local_result.push(Kleene);
+ let new_parent = builder.add_vertex();
+ builder.add_edge(local_root, new_parent, ()).unwrap();
+
+ for index in iterator {
+ df_copy!(new_parent, index);
+ }
+ }
+ RegexType::Or => {}
+ RegexType::Optional | RegexType::Empty => {
+ local_result.push(Empty);
+ let new_index = builder.add_vertex();
+ builder.add_edge(new_index, local_root, ()).unwrap();
+ local_root = new_index;
+
+ for index in children!(node).skip(edge_index + 1) {
+ df_copy!(local_root, index);
+ }
+ }
+ RegexType::Paren | RegexType::Lit(_) => unreachable!(),
+ }
+ }
+
+ builder.add_edge(0, local_root, ()).unwrap();
+ }
+
+ local_result.shrink_to_fit();
+
+ let graph = builder.build();
+
+ assert_eq!(graph.nodes_len(), local_result.len());
+
+ result.push(
+ DefaultRegex::new(graph, local_result)
+ .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
+ );
+ }
+
+ assert_eq!(result.len(), non_len);
+
+ Ok(result)
+ }
+
+ /// Convert the regular language of left-linear closures to its
+ /// equivalent nondeterministic finite automaton.
+ ///
+ /// In the generation of the left-linear closure, the terminals
+ /// and non-terminals are packed into an unsigned integer. We
+ /// unpack them in converting to nondeterministic finite
+ /// automaton.
+ ///
+ /// The resulting nondeterministic finite automaton should be
+ /// transformed to its null-closure for use in our algorithm.
+ pub fn left_closure_to_nfa(
+ &self,
+ closure: &[DefaultRegex<TNT>],
+ ) -> Result<DefaultNFA<DOption<TNT>>, Error> {
+ let label_transform = |tnt| match tnt {
+ TNT::Ter(t) => {
+ let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
+ Error::IndexOutOfBounds(index, bound) => {
+ graph::error::Error::IndexOutOfBounds(index, bound)
+ }
+ _ => unreachable!(),
+ })?;
+
+ Ok(SoC::Carry(new_tnt))
+ }
+ TNT::Non(n) => Ok(SoC::Sub(n)),
+ };
+
+ let nfa = DefaultNFA::to_nfa(closure, label_transform, Some(TNT::Non(0)))?;
+
+ Ok(nfa)
+ }
+}
+
+impl Display for Grammar {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ assert_eq!(self.non.len(), self.rules.len());
+
+ for (nt, rule) in self.non.iter().zip(self.rules.iter()) {
+ write!(f, "{}: ", nt.name())?;
+
+ writeln!(
+ f,
+ "{}",
+ rule.regex.to_string_with(|tnt| format!(
+ "({})",
+ self.name_of_tnt(tnt)
+ .unwrap_or_else(|_| format!("Unknown {tnt:?}"))
+ ))?
+ )?;
+ }
+
+ Ok(())
+ }
+}
+
+// A helper module that provides some grammars for testing.
+#[cfg(feature = "test-helper")]
+pub mod test_grammar_helper;
+
+#[cfg(test)]
+mod tests;