summaryrefslogtreecommitdiff
path: root/chain/src/grammar.rs
diff options
context:
space:
mode:
Diffstat (limited to 'chain/src/grammar.rs')
-rw-r--r--chain/src/grammar.rs1247
1 files changed, 0 insertions, 1247 deletions
diff --git a/chain/src/grammar.rs b/chain/src/grammar.rs
deleted file mode 100644
index 287fbca..0000000
--- a/chain/src/grammar.rs
+++ /dev/null
@@ -1,1247 +0,0 @@
-#![warn(missing_docs)]
-//! This file implements the extected behaviours of grammars.
-
-// NOTE: We shall first start with a parser that works at the level of
-// characters. The purpose is to first experiment with the workings
-// and the performance of the algorithms, before optimising by using
-// regular expressions to classify inputs into tokens. In other
-// words, the current focus is not on the optimisations, whereas
-// scanners are for optimisations only, so to speak.
-
-#![allow(unused_imports)]
-use nfa::{
- default::{
- nfa::DefaultNFA,
- regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
- },
- DOption, DesRec, Nfa, Regex, SoC,
-};
-
-use graph::{adlist::ALGBuilder, builder::Builder, Graph};
-
-use std::{
- collections::HashSet,
- fmt::{Display, Write},
-};
-
-/// The type of a terminal.
-///
-/// For the time being this is a wrapper around a string, but in the
-/// future it may hold more information of scanners.
-#[derive(Debug, Clone, Eq, PartialEq)]
-pub struct Terminal {
- // If we want to use scanners, per chance add them as a new field
- // here.
- name: String,
-}
-
-impl Terminal {
- /// Create a terminal with the given name.
- #[inline]
- pub fn new(name: String) -> Self {
- Self { name }
- }
-
- /// Return the name of the terminal.
- #[inline]
- pub fn name(&self) -> &str {
- &self.name
- }
-}
-
-/// The type of a non-terminal.
-///
-/// This is just a wrapper around a string.
-#[derive(Debug, Clone)]
-pub struct Nonterminal(String);
-
-impl Nonterminal {
- /// Return the name of the nonterminal.
- ///
- /// Just to improve readability.
- #[inline]
- pub fn name(&self) -> &str {
- &self.0
- }
-}
-
-/// The type of a terminal or a non-terminal.
-///
-/// Only an index is stored here. Actual data are stored in two other
-/// arrays.
-#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
-pub enum TNT {
- /// Terminal variant
- Ter(usize),
- /// Nonterminal variant
- Non(usize),
-}
-
-impl Display for TNT {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- TNT::Ter(t) => write!(f, "T({t})"),
- TNT::Non(n) => write!(f, "N({n})"),
- }
- }
-}
-
-/// Errors related to grammar operations.
-#[derive(Debug, Copy, Clone)]
-#[non_exhaustive]
-pub enum Error {
- /// The first component is the index, and the second the bound.
- IndexOutOfBounds(usize, usize),
- /// Fail to build the N-th regular expression, due to the
- /// ParseError.
- BuildFail(usize, ParseError),
- /// fail to build NFA
- NFAFail(nfa::error::Error),
-}
-
-impl From<nfa::error::Error> for Error {
- fn from(nfae: nfa::error::Error) -> Self {
- Self::NFAFail(nfae)
- }
-}
-
-impl Display for Error {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"),
- Error::BuildFail(n, pe) => write!(
- f,
- "Failed to build the {n}-th regular expression due to error: {pe}"
- ),
- Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
- }
- }
-}
-
-impl std::error::Error for Error {}
-
-/// A rule is a regular expression of terminals or non-terminals.
-#[derive(Debug, Clone)]
-pub struct Rule {
- regex: DefaultRegex<TNT>,
-}
-
-impl Rule {
- /// Return true if and only if the rule is empty.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.regex.is_empty()
- }
-
- /// Return the length of the rule.
- #[inline]
- pub fn len(&self) -> usize {
- self.regex.len()
- }
-}
-
-/// The type of a grammar.
-#[derive(Debug, Clone, Default)]
-pub struct Grammar {
- ter: Vec<Terminal>,
- non: Vec<Nonterminal>,
- rules: Vec<Rule>,
- firsts: Vec<HashSet<Option<usize>>>,
- first_nodes: Vec<Vec<usize>>,
-}
-
-/// A private type to aid the recursive looping of rergular
-/// expressions.
-#[derive(Copy, Clone)]
-enum StackElement {
- Seen(usize),
- Unseen(usize),
-}
-
-impl StackElement {
- fn index(self) -> usize {
- match self {
- Self::Seen(index) => index,
- Self::Unseen(index) => index,
- }
- }
-
- fn is_seen(self) -> bool {
- matches!(self, Self::Seen(_))
- }
-}
-
-impl Grammar {
- /// Construct a grammar from a vector of terminals, a vector of
- /// non-terminals, and a vector of rules for the non-temrinals.
- ///
- /// # Panic
- ///
- /// If the length of `non` is not equal to that of `rules`, then
- /// the function panics.
- pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self {
- assert_eq!(non.len(), rules.len());
-
- // One more room is reserved for the `None` value.
- let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1))
- .take(non.len())
- .collect();
-
- let first_nodes = rules
- .iter()
- .map(|rule| Vec::with_capacity(rule.len()))
- .collect();
-
- Self {
- ter,
- non,
- rules,
- firsts,
- first_nodes,
- }
- }
-
- /// Return the name of a terminal or a non-terminal.
- pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
- match tnt {
- TNT::Ter(t) => Ok(format!(
- "T{}",
- self.ter
- .get(t)
- .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
- .name()
- )),
- TNT::Non(n) => Ok(format!(
- "N{}",
- self.non
- .get(n)
- .ok_or(Error::IndexOutOfBounds(n, self.non.len()))?
- .name()
- )),
- }
- }
-
- /// Return true if and only if there are no non-terminals in the
- /// grammar.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.non.is_empty()
- }
-
- /// Return the total length of all rules.
- #[inline]
- pub fn total(&self) -> usize {
- self.rules.iter().map(Rule::len).sum()
- }
-
- /// Return the number of terminals.
- #[inline]
- pub fn ter_num(&self) -> usize {
- self.ter.len()
- }
-
- /// Return the number of non-terminals.
- #[inline]
- pub fn non_num(&self) -> usize {
- self.non.len()
- }
-
- /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a
- /// single number to represent terminals and non-terminals.
- ///
- /// # Bounds
- ///
- /// If a terminal index is greater than or equal to the number of
- /// terminals, then this signals an error; mutatis mutandis for
- /// non-terminals.
- ///
- /// # Related
- ///
- /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt].
- #[inline]
- pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> {
- let ter_num = self.ter.len();
- let non_num = self.non.len();
-
- match tnt {
- TNT::Ter(t) => {
- if t >= ter_num {
- Err(Error::IndexOutOfBounds(t, ter_num))
- } else {
- Ok(t)
- }
- }
- TNT::Non(n) => {
- if n >= non_num {
- Err(Error::IndexOutOfBounds(n, non_num))
- } else {
- Ok(n + ter_num)
- }
- }
- }
- }
-
- /// Convert a single number to either a terminal or a
- /// non-terminal.
- ///
- /// # Bounds
- ///
- /// If the number is greater than or equal to the sum of the
- /// numbers of terminals and of non-terminals, then this signals
- /// an error.
- ///
- /// # Related
- ///
- /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt].
- #[inline]
- pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> {
- let ter_num = self.ter.len();
- let non_num = self.non.len();
-
- if flat < ter_num {
- Ok(TNT::Ter(flat))
- } else if flat < ter_num + non_num {
- Ok(TNT::Non(flat - ter_num))
- } else {
- Err(Error::IndexOutOfBounds(flat, ter_num + non_num))
- }
- }
-
- /// Return true if and only if the non-terminal is nullable.
- #[inline]
- pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> {
- Ok(self
- .firsts
- .get(non_terminal)
- .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
- .contains(&None))
- }
-
- /// For a NON_TERMINAL, return an iterator that goes over the
- /// nodes that are reachable from the non-terminal through an
- /// empty transition of the nondeterministic finite automaton.
- #[inline]
- pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> {
- Ok(self
- .first_nodes
- .get(non_terminal)
- .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
- .iter())
- }
-
- /// Compute the set of terminals that can appear as the first
- /// terminal in some left-linear derivation of a non-terminal, for
- /// every non-terminal.
- ///
- /// This is an algorithm that computes the transitive closure,
- /// which is a common approach for this task. But perhaps there
- /// are more efficient approaches?
- ///
- /// Also the function computes the set of "reachable nodes" in the
- /// process, and records the information in the `first_nodes`
- /// attribute.
- pub fn compute_firsts(&mut self) -> Result<(), Error> {
- let mut updated = true;
-
- let non_len = self.non_num();
-
- use StackElement::{Seen, Unseen};
-
- while updated {
- updated = false;
-
- for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
- let root = if let Some(root) = regex.root() {
- root
- } else {
- if !self.is_nullable(n)? {
- updated = true;
-
- self.firsts.get_mut(n).unwrap().insert(None);
-
- // The default construction of a grammar
- // reserves some space for each vector, so
- // explicitly setting this can reduce some
- // minor memory overhead.
- let pointer = self.first_nodes.get_mut(n).unwrap();
-
- pointer.clear();
- pointer.shrink_to_fit();
- }
-
- continue;
- };
-
- let regex_len = regex.len();
-
- let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
-
- stack.push(Unseen(root));
-
- let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
- Vec::with_capacity(regex_len);
-
- let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
-
- while let Some(top) = stack.pop() {
- let top_index = top.index();
- let is_seen = top.is_seen();
-
- match regex
- .vertex_label(top_index)
- .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
- {
- RegexType::Kleene => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set);
- this_nodes.extend(child_nodes);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Plus => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set);
- this_nodes.extend(child_nodes);
- }
-
- if stop {
- this_set.remove(&None);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Optional => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Or => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Paren => {
- // Only for printing purposes
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- children_nodes_stack.push(this_nodes);
- }
- RegexType::Empty => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap().rev() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- if stop {
- this_set.remove(&None);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Lit(tnt) => {
- match tnt {
- TNT::Ter(t) => {
- let mut this_set = HashSet::with_capacity(1);
-
- this_set.insert(Some(t));
-
- children_sets_stack.push(this_set);
- }
- TNT::Non(non) => {
- let this_set = self
- .firsts
- .get(non)
- .ok_or(Error::IndexOutOfBounds(non, non_len))?
- .clone();
-
- children_sets_stack.push(this_set);
- }
- }
-
- let mut this_nodes = HashSet::with_capacity(1);
- this_nodes.insert(top_index);
-
- children_nodes_stack.push(this_nodes);
- }
- }
- }
-
- assert_eq!(
- children_sets_stack.len(),
- 1,
- "Too many elements left at the end"
- );
-
- assert_eq!(
- children_nodes_stack.len(),
- 1,
- "Too many elements left at the end"
- );
-
- for first in children_sets_stack.pop().unwrap().into_iter() {
- if !self.firsts.get(n).unwrap().contains(&first) {
- updated = true;
-
- self.firsts.get_mut(n).unwrap().insert(first);
- }
- }
-
- *self.first_nodes.get_mut(n).unwrap() =
- children_nodes_stack.pop().unwrap().into_iter().collect();
- }
- }
-
- Ok(())
- }
-
- /// Return the regular language of the left-linear closures of
- /// non-terminals in the grammar.
- ///
- /// The resulting vector is guaranteed to be of the same length as
- /// the number of non-terminals.
- ///
- /// The resulting regular language is not "self-contained". That
- /// is to say, its terminals indices are packed indices and are
- /// meaningless without the interpretation of the grammar. They
- /// should be converted to a nondeterministic finite automaton and
- /// then to its null closure later on.
- pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
- let non_len = self.non_num();
-
- let mut result = Vec::with_capacity(non_len);
-
- for (n, rule) in self.rules.iter().enumerate() {
- let regex = &rule.regex;
-
- let regex_root = if let Some(root) = regex.root() {
- root
- } else {
- result.push(Default::default());
-
- continue;
- };
-
- let regex_len = regex.len();
-
- /// A convenient macro to retrieve the children from the
- /// original regular expression with error propagation.
- macro_rules! children {
- ($n:expr) => {
- regex
- .children_of($n)
- .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
- };
- }
-
- /// A convenient macro to retrieve the label from the
- /// original regular expression with error propagation.
- macro_rules! label {
- ($n:expr) => {
- regex
- .vertex_label($n)
- .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
- };
- }
-
- let parents = regex.parents_array().map_err(|e| match e {
- nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
- nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
- _ => unreachable!(),
- })?;
-
- use RegexType::*;
- use TNT::*;
-
- let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);
- let mut builder = ALGBuilder::default();
-
- /// Perform a depth-first copy
- macro_rules! df_copy {
- ($parent:expr, $index:expr) => {
- match label!($index) {
- Kleene | Plus | Optional | Or | Paren | Empty => {
- let mut stack = vec![($parent, $index)];
-
- while let Some((top_parent, top_index)) = stack.pop() {
- let label = label!(top_index);
- let label = match label {
- Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
- _ => label,
- };
-
- local_result.push(label);
-
- let new = builder.add_vertex();
-
- builder.add_edge(top_parent, new, ()).unwrap();
-
- stack.extend(children!(top_index).map(|child| (new, child)));
- }
- }
- Lit(remain_tnt) => {
- local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
- let new = builder.add_vertex();
- builder.add_edge($parent, new, ()).unwrap();
- }
- }
- };
- }
-
- local_result.push(Or);
- builder.add_vertex();
-
- local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
- let non_lit_index = builder.add_vertex();
-
- builder.add_edge(0, non_lit_index, ()).unwrap();
-
- for first_node in self.first_nodes_of(n)?.copied() {
- assert!(first_node < parents.len());
-
- let tnt = match label!(first_node) {
- Lit(tnt) => tnt,
- _ => continue,
- };
-
- let mut parents_chain = {
- let mut result = Vec::new();
- let mut stack = Vec::with_capacity(parents.len());
-
- stack.push(first_node);
-
- while let Some(top) = stack.pop() {
- assert!(top < parents.len());
- if let Some(parent) = parents.get(top).copied().unwrap() {
- result.push(parent);
- stack.push(parent.0);
- }
- }
-
- result.reverse();
-
- result
- };
-
- if parents_chain.is_empty() {
- local_result.push(Lit(tnt));
- let lit_index = builder.add_vertex();
- builder.add_edge(0, lit_index, ()).unwrap();
-
- continue;
- }
-
- assert!(parents_chain.first().unwrap().0 == regex_root);
-
- // A different, "more local", root.
- let mut root: usize;
-
- // Handle the direct parent
- let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();
-
- match label!(parent_node) {
- Kleene | Plus => {
- local_result.extend([Empty, Lit(tnt)]);
-
- root = builder.add_vertex();
- let lit_index = builder.add_vertex();
- builder.add_edge(root, lit_index, ()).unwrap();
-
- let iterator = children!(parent_node);
-
- for index in iterator.clone().skip(parent_edge_index + 1) {
- df_copy!(root, index);
- }
-
- local_result.push(Kleene);
- let new_parent = builder.add_vertex();
- builder.add_edge(root, new_parent, ()).unwrap();
-
- for index in iterator {
- df_copy!(new_parent, index);
- }
- }
-
- Or => {
- local_result.push(Lit(tnt));
- root = builder.add_vertex();
- }
- Optional | Empty => {
- // If this path is taken, it should not be
- // optional.
- local_result.extend([Empty, Lit(tnt)]);
- root = builder.add_vertex();
- let lit_index = builder.add_vertex();
- builder.add_edge(root, lit_index, ()).unwrap();
-
- for index in children!(parent_node).skip(parent_edge_index + 1) {
- df_copy!(root, index);
- }
- }
- Paren | Lit(_) => unreachable!(),
- }
-
- // Handle successive parents
-
- for (node, edge_index) in parents_chain.into_iter() {
- let node_type = label!(node);
-
- match node_type {
- Kleene | Plus => {
- local_result.push(Empty);
- let new_index = builder.add_vertex();
- builder.add_edge(new_index, root, ()).unwrap();
-
- root = new_index;
-
- let iterator = children!(node);
-
- for index in iterator.clone().skip(edge_index + 1) {
- df_copy!(root, index);
- }
-
- local_result.push(Kleene);
- let new_parent = builder.add_vertex();
- builder.add_edge(root, new_parent, ()).unwrap();
-
- for index in iterator {
- df_copy!(new_parent, index);
- }
- }
- RegexType::Or => {}
- RegexType::Optional | RegexType::Empty => {
- local_result.push(Empty);
- let new_index = builder.add_vertex();
- builder.add_edge(new_index, root, ()).unwrap();
- root = new_index;
-
- for index in children!(node).skip(edge_index + 1) {
- df_copy!(root, index);
- }
- }
- RegexType::Paren | RegexType::Lit(_) => unreachable!(),
- }
- }
-
- builder.add_edge(0, root, ()).unwrap();
- }
-
- local_result.shrink_to_fit();
-
- let graph = builder.build();
-
- assert_eq!(graph.nodes_len(), local_result.len());
-
- result.push(
- DefaultRegex::new(graph, local_result)
- .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
- );
- }
-
- assert_eq!(result.len(), non_len);
-
- Ok(result)
- }
-
- /// Convert the regular language of left-linear closures to its
- /// equivalent nondeterministic finite automaton.
- ///
- /// In the generation of the left-linear closure, the terminals
- /// and non-terminals are packed into an unsigned integer. We
- /// unpack them in converting to nondeterministic finite
- /// automaton.
- ///
- /// The resulting nondeterministic finite automaton should be
- /// transformed to its null-closure for use in our algorithm.
- pub fn left_closure_to_nfa(
- &self,
- closure: &[DefaultRegex<TNT>],
- ) -> Result<DefaultNFA<DOption<TNT>>, Error> {
- let label_transform = |tnt| match tnt {
- TNT::Ter(t) => {
- let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
- Error::IndexOutOfBounds(index, bound) => {
- graph::error::Error::IndexOutOfBounds(index, bound)
- }
- _ => unreachable!(),
- })?;
-
- Ok(SoC::Carry(new_tnt))
- }
- TNT::Non(n) => Ok(SoC::Sub(n)),
- };
-
- DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into)
- }
-}
-
-impl Display for Grammar {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- assert_eq!(self.non.len(), self.rules.len());
-
- for (nt, rule) in self.non.iter().zip(self.rules.iter()) {
- write!(f, "{}: ", nt.name())?;
-
- writeln!(
- f,
- "{}",
- rule.regex.to_string_with(|tnt| format!(
- "({})",
- self.name_of_tnt(tnt)
- .unwrap_or_else(|_| format!("Unknown {tnt:?}"))
- ))?
- )?;
- }
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod test_grammar_helper {
- use super::*;
- use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType};
- use std::fmt::Write;
-
- // Construct a grammar to test
- pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
- let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())];
- let non = vec![
- Nonterminal("start".to_owned()),
- Nonterminal("end".to_owned()),
- ];
-
- /// A function to scan the inputs.
- fn scan_tnt(
- parser: &DefaultRegParser<TNT>,
- input: &str,
- ) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> {
- use ParseDirection::*;
- use RegexType::*;
- use TNT::*;
-
- let mut chars = input.chars();
-
- if let Some(first) = chars.next() {
- match first {
- '*' => Ok(Some((1, Kleene, Right))),
- '+' => Ok(Some((1, Plus, Right))),
- '?' => Ok(Some((1, Optional, Right))),
- '|' => Ok(Some((1, Empty, Up))),
- '(' => Ok(Some((1, Or, Down))),
- ')' => Ok(Some((1, Paren, Up))),
- 'T' => {
- let mut name = String::new();
- let mut len = 1;
-
- while let Some(c) = chars.next() {
- if ('a'..='z').contains(&c) {
- len += 1;
- write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
- } else {
- break;
- }
- }
-
- if let Some(t) = parser.query(&name, true) {
- Ok(Some((len, Lit(Ter(t)), Right)))
- } else {
- Err(ParseError::InvalidCharacter(first))
- }
- }
- 'N' => {
- let mut name = String::new();
- let mut len = 1;
-
- while let Some(c) = chars.next() {
- if ('a'..='z').contains(&c) {
- len += 1;
- write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
- } else {
- break;
- }
- }
-
- if let Some(n) = parser.query(&name, false) {
- Ok(Some((len, Lit(Non(n)), Right)))
- } else {
- Err(ParseError::InvalidCharacter(first))
- }
- }
- _ => Err(ParseError::InvalidCharacter(first)),
- }
- } else {
- Ok(None)
- }
- }
-
- let mut regex_parser: DefaultRegParser<TNT> = Default::default();
-
- regex_parser.add_tnt("a", true);
- regex_parser.add_tnt("b", true);
- regex_parser.add_tnt("start", false);
- regex_parser.add_tnt("end", false);
-
- let regex_parser = regex_parser;
-
- let rule1 = Rule {
- regex: regex_parser
- .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)?
- .ok_or(ParseError::Invalid)?
- .0,
- };
-
- let rule2 = Rule {
- regex: regex_parser
- .parse("Nstart?Nend*", Box::new(scan_tnt), true)?
- .ok_or(ParseError::Invalid)?
- .0,
- };
-
- let rules = vec![rule1, rule2];
-
- Ok(Grammar::new(ter, non, rules))
- }
-}
-
-#[cfg(test)]
-mod test_grammar_display {
- use super::test_grammar_helper::new_grammar;
-
- #[test]
- fn test_display() -> Result<(), Box<dyn std::error::Error>> {
- println!("{}", new_grammar()?);
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod test_grammar_firsts {
- use super::test_grammar_helper::new_grammar;
- use super::*;
-
- #[test]
- fn test_firsts() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_grammar()?;
-
- grammar.compute_firsts()?;
-
- println!("grammar firsts: {:?}", grammar.firsts);
- println!("grammar first nodes: {:?}", grammar.first_nodes);
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod test_grammar_left_closure {
- use super::test_grammar_helper::new_grammar;
- use super::*;
-
- pub fn new_closure_regex(
- grammar: &mut Grammar,
- ) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> {
- grammar.compute_firsts()?;
-
- println!("grammar firsts: {:?}", grammar.firsts);
- println!("grammar first nodes: {:?}", grammar.first_nodes);
- println!("grammar:");
- println!("{grammar}");
-
- grammar.left_closure().map_err(Into::into)
- }
-
- #[test]
- fn test_regex() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_grammar()?;
-
- let vec_of_regexps = new_closure_regex(&mut grammar)?;
-
- for regex in &vec_of_regexps {
- println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?);
- // println!("regex: {regex:?}",);
- println!("regex len = {}", regex.nodes_len());
- }
-
- Ok(())
- }
-
- #[test]
- fn test_nfa() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_grammar()?;
- let closure = new_closure_regex(&mut grammar)?;
- let nfa = grammar.left_closure_to_nfa(&closure)?;
- // TODO: print the nfa out
- Ok(())
- }
-}