summaryrefslogtreecommitdiff
path: root/chain/src/grammar.rs
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-01-03 23:44:02 +0800
committerJSDurand <mmemmew@gmail.com>2023-01-03 23:44:02 +0800
commitbdbd4b4dc21af09711c97d3f903877443199af06 (patch)
treec6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /chain/src/grammar.rs
parent8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff)
structural change: separate crates out
I put functionalities that are not strictly core to separate crates, so that the whole package becomes more modular, and makes it easier to try other parsing algorithms in the future. Also I have to figure the forests out before finishing the core chain-rule algorithm, as the part about forests affects the labels of the grammars directly. From my experiences in writing the previous version, it is asking for trouble to change the labels type dramatically at a later point: too many places need to be changed. Thus I decide to figure the rough part of forests out. Actually I only have to figure out how to attach forests fragments to edges of the underlying atomic languages, and the more complex parts of putting forests together can be left to the recorders, which is my vision of assembling semi-ring values during the chain-rule machine. It should be relatively easy to produce forests fragments from grammars since we are just trying to extract some information from the grammar, not to manipulate those information in some complicated way. We have to do some manipulations in the process, though, in order to make sure that the nulling and epsilon-removal processes do not invalidate these fragments.
Diffstat (limited to 'chain/src/grammar.rs')
-rw-r--r--chain/src/grammar.rs1247
1 files changed, 0 insertions, 1247 deletions
diff --git a/chain/src/grammar.rs b/chain/src/grammar.rs
deleted file mode 100644
index 287fbca..0000000
--- a/chain/src/grammar.rs
+++ /dev/null
@@ -1,1247 +0,0 @@
-#![warn(missing_docs)]
-//! This file implements the extected behaviours of grammars.
-
-// NOTE: We shall first start with a parser that works at the level of
-// characters. The purpose is to first experiment with the workings
-// and the performance of the algorithms, before optimising by using
-// regular expressions to classify inputs into tokens. In other
-// words, the current focus is not on the optimisations, whereas
-// scanners are for optimisations only, so to speak.
-
-#![allow(unused_imports)]
-use nfa::{
- default::{
- nfa::DefaultNFA,
- regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
- },
- DOption, DesRec, Nfa, Regex, SoC,
-};
-
-use graph::{adlist::ALGBuilder, builder::Builder, Graph};
-
-use std::{
- collections::HashSet,
- fmt::{Display, Write},
-};
-
-/// The type of a terminal.
-///
-/// For the time being this is a wrapper around a string, but in the
-/// future it may hold more information of scanners.
-#[derive(Debug, Clone, Eq, PartialEq)]
-pub struct Terminal {
- // If we want to use scanners, per chance add them as a new field
- // here.
- name: String,
-}
-
-impl Terminal {
- /// Create a terminal with the given name.
- #[inline]
- pub fn new(name: String) -> Self {
- Self { name }
- }
-
- /// Return the name of the terminal.
- #[inline]
- pub fn name(&self) -> &str {
- &self.name
- }
-}
-
-/// The type of a non-terminal.
-///
-/// This is just a wrapper around a string.
-#[derive(Debug, Clone)]
-pub struct Nonterminal(String);
-
-impl Nonterminal {
- /// Return the name of the nonterminal.
- ///
- /// Just to improve readability.
- #[inline]
- pub fn name(&self) -> &str {
- &self.0
- }
-}
-
-/// The type of a terminal or a non-terminal.
-///
-/// Only an index is stored here. Actual data are stored in two other
-/// arrays.
-#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
-pub enum TNT {
- /// Terminal variant
- Ter(usize),
- /// Nonterminal variant
- Non(usize),
-}
-
-impl Display for TNT {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- TNT::Ter(t) => write!(f, "T({t})"),
- TNT::Non(n) => write!(f, "N({n})"),
- }
- }
-}
-
-/// Errors related to grammar operations.
-#[derive(Debug, Copy, Clone)]
-#[non_exhaustive]
-pub enum Error {
- /// The first component is the index, and the second the bound.
- IndexOutOfBounds(usize, usize),
- /// Fail to build the N-th regular expression, due to the
- /// ParseError.
- BuildFail(usize, ParseError),
- /// fail to build NFA
- NFAFail(nfa::error::Error),
-}
-
-impl From<nfa::error::Error> for Error {
- fn from(nfae: nfa::error::Error) -> Self {
- Self::NFAFail(nfae)
- }
-}
-
-impl Display for Error {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- Error::IndexOutOfBounds(i, b) => write!(f, "index {i} out of bound {b}"),
- Error::BuildFail(n, pe) => write!(
- f,
- "Failed to build the {n}-th regular expression due to error: {pe}"
- ),
- Error::NFAFail(nfae) => write!(f, "failed to build NFA because of {nfae}"),
- }
- }
-}
-
-impl std::error::Error for Error {}
-
-/// A rule is a regular expression of terminals or non-terminals.
-#[derive(Debug, Clone)]
-pub struct Rule {
- regex: DefaultRegex<TNT>,
-}
-
-impl Rule {
- /// Return true if and only if the rule is empty.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.regex.is_empty()
- }
-
- /// Return the length of the rule.
- #[inline]
- pub fn len(&self) -> usize {
- self.regex.len()
- }
-}
-
-/// The type of a grammar.
-#[derive(Debug, Clone, Default)]
-pub struct Grammar {
- ter: Vec<Terminal>,
- non: Vec<Nonterminal>,
- rules: Vec<Rule>,
- firsts: Vec<HashSet<Option<usize>>>,
- first_nodes: Vec<Vec<usize>>,
-}
-
-/// A private type to aid the recursive looping of rergular
-/// expressions.
-#[derive(Copy, Clone)]
-enum StackElement {
- Seen(usize),
- Unseen(usize),
-}
-
-impl StackElement {
- fn index(self) -> usize {
- match self {
- Self::Seen(index) => index,
- Self::Unseen(index) => index,
- }
- }
-
- fn is_seen(self) -> bool {
- matches!(self, Self::Seen(_))
- }
-}
-
-impl Grammar {
- /// Construct a grammar from a vector of terminals, a vector of
- /// non-terminals, and a vector of rules for the non-temrinals.
- ///
- /// # Panic
- ///
- /// If the length of `non` is not equal to that of `rules`, then
- /// the function panics.
- pub fn new(ter: Vec<Terminal>, non: Vec<Nonterminal>, rules: Vec<Rule>) -> Self {
- assert_eq!(non.len(), rules.len());
-
- // One more room is reserved for the `None` value.
- let firsts = std::iter::repeat_with(|| HashSet::with_capacity(ter.len() + 1))
- .take(non.len())
- .collect();
-
- let first_nodes = rules
- .iter()
- .map(|rule| Vec::with_capacity(rule.len()))
- .collect();
-
- Self {
- ter,
- non,
- rules,
- firsts,
- first_nodes,
- }
- }
-
- /// Return the name of a terminal or a non-terminal.
- pub fn name_of_tnt(&self, tnt: TNT) -> Result<String, Error> {
- match tnt {
- TNT::Ter(t) => Ok(format!(
- "T{}",
- self.ter
- .get(t)
- .ok_or(Error::IndexOutOfBounds(t, self.ter.len()))?
- .name()
- )),
- TNT::Non(n) => Ok(format!(
- "N{}",
- self.non
- .get(n)
- .ok_or(Error::IndexOutOfBounds(n, self.non.len()))?
- .name()
- )),
- }
- }
-
- /// Return true if and only if there are no non-terminals in the
- /// grammar.
- #[inline]
- pub fn is_empty(&self) -> bool {
- self.non.is_empty()
- }
-
- /// Return the total length of all rules.
- #[inline]
- pub fn total(&self) -> usize {
- self.rules.iter().map(Rule::len).sum()
- }
-
- /// Return the number of terminals.
- #[inline]
- pub fn ter_num(&self) -> usize {
- self.ter.len()
- }
-
- /// Return the number of non-terminals.
- #[inline]
- pub fn non_num(&self) -> usize {
- self.non.len()
- }
-
- /// Convert a non-terminal `N` to `N + TER_NUM`, so that we use a
- /// single number to represent terminals and non-terminals.
- ///
- /// # Bounds
- ///
- /// If a terminal index is greater than or equal to the number of
- /// terminals, then this signals an error; mutatis mutandis for
- /// non-terminals.
- ///
- /// # Related
- ///
- /// The inverse function is [`unpack_tnt`][Grammar::unpack_tnt].
- #[inline]
- pub fn pack_tnt(&self, tnt: TNT) -> Result<usize, Error> {
- let ter_num = self.ter.len();
- let non_num = self.non.len();
-
- match tnt {
- TNT::Ter(t) => {
- if t >= ter_num {
- Err(Error::IndexOutOfBounds(t, ter_num))
- } else {
- Ok(t)
- }
- }
- TNT::Non(n) => {
- if n >= non_num {
- Err(Error::IndexOutOfBounds(n, non_num))
- } else {
- Ok(n + ter_num)
- }
- }
- }
- }
-
- /// Convert a single number to either a terminal or a
- /// non-terminal.
- ///
- /// # Bounds
- ///
- /// If the number is greater than or equal to the sum of the
- /// numbers of terminals and of non-terminals, then this signals
- /// an error.
- ///
- /// # Related
- ///
- /// This is the inverse of [`pack_tnt`][Grammar::pack_tnt].
- #[inline]
- pub fn unpack_tnt(&self, flat: usize) -> Result<TNT, Error> {
- let ter_num = self.ter.len();
- let non_num = self.non.len();
-
- if flat < ter_num {
- Ok(TNT::Ter(flat))
- } else if flat < ter_num + non_num {
- Ok(TNT::Non(flat - ter_num))
- } else {
- Err(Error::IndexOutOfBounds(flat, ter_num + non_num))
- }
- }
-
- /// Return true if and only if the non-terminal is nullable.
- #[inline]
- pub fn is_nullable(&self, non_terminal: usize) -> Result<bool, Error> {
- Ok(self
- .firsts
- .get(non_terminal)
- .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
- .contains(&None))
- }
-
- /// For a NON_TERMINAL, return an iterator that goes over the
- /// nodes that are reachable from the non-terminal through an
- /// empty transition of the nondeterministic finite automaton.
- #[inline]
- pub fn first_nodes_of(&self, non_terminal: usize) -> Result<std::slice::Iter<usize>, Error> {
- Ok(self
- .first_nodes
- .get(non_terminal)
- .ok_or(Error::IndexOutOfBounds(non_terminal, self.non.len()))?
- .iter())
- }
-
- /// Compute the set of terminals that can appear as the first
- /// terminal in some left-linear derivation of a non-terminal, for
- /// every non-terminal.
- ///
- /// This is an algorithm that computes the transitive closure,
- /// which is a common approach for this task. But perhaps there
- /// are more efficient approaches?
- ///
- /// Also the function computes the set of "reachable nodes" in the
- /// process, and records the information in the `first_nodes`
- /// attribute.
- pub fn compute_firsts(&mut self) -> Result<(), Error> {
- let mut updated = true;
-
- let non_len = self.non_num();
-
- use StackElement::{Seen, Unseen};
-
- while updated {
- updated = false;
-
- for (n, regex) in self.rules.iter().map(|rule| &rule.regex).enumerate() {
- let root = if let Some(root) = regex.root() {
- root
- } else {
- if !self.is_nullable(n)? {
- updated = true;
-
- self.firsts.get_mut(n).unwrap().insert(None);
-
- // The default construction of a grammar
- // reserves some space for each vector, so
- // explicitly setting this can reduce some
- // minor memory overhead.
- let pointer = self.first_nodes.get_mut(n).unwrap();
-
- pointer.clear();
- pointer.shrink_to_fit();
- }
-
- continue;
- };
-
- let regex_len = regex.len();
-
- let mut stack: Vec<StackElement> = Vec::with_capacity(regex_len);
-
- stack.push(Unseen(root));
-
- let mut children_sets_stack: Vec<HashSet<Option<usize>>> =
- Vec::with_capacity(regex_len);
-
- let mut children_nodes_stack: Vec<HashSet<usize>> = Vec::with_capacity(regex_len);
-
- while let Some(top) = stack.pop() {
- let top_index = top.index();
- let is_seen = top.is_seen();
-
- match regex
- .vertex_label(top_index)
- .map_err(|_| Error::IndexOutOfBounds(top_index, regex_len))?
- {
- RegexType::Kleene => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set);
- this_nodes.extend(child_nodes);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Plus => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set);
- this_nodes.extend(child_nodes);
- }
-
- if stop {
- this_set.remove(&None);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Optional => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Or => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Paren => {
- // Only for printing purposes
- let mut this_set = HashSet::new();
-
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- children_nodes_stack.push(this_nodes);
- }
- RegexType::Empty => {
- if !is_seen {
- stack.push(Seen(top_index));
-
- for child in regex.children_of(top_index).unwrap().rev() {
- stack.push(Unseen(child));
- }
- } else {
- let degree = regex.degree(top_index).unwrap();
- let children_stack_len = children_sets_stack.len();
- let children_nodes_len = children_nodes_stack.len();
-
- assert!(
- children_stack_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- assert!(
- children_nodes_len >= degree,
- "not enough stack elements for {top_index}"
- );
-
- let mut this_set = HashSet::new();
-
- let mut this_nodes = HashSet::new();
-
- this_nodes.insert(top_index);
-
- if degree == 0 {
- this_set.insert(None);
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
-
- continue;
- }
-
- let mut stop = false;
-
- for (child_set, child_nodes) in children_sets_stack
- .drain((children_stack_len - degree)..)
- .zip(
- children_nodes_stack.drain((children_nodes_len - degree)..),
- )
- {
- if stop {
- break;
- }
-
- if !child_set.contains(&None) {
- stop = true;
- }
-
- this_set.extend(child_set.iter().copied());
- this_nodes.extend(child_nodes.iter().copied());
- }
-
- if stop {
- this_set.remove(&None);
- }
-
- children_sets_stack.push(this_set);
- children_nodes_stack.push(this_nodes);
- }
- }
- RegexType::Lit(tnt) => {
- match tnt {
- TNT::Ter(t) => {
- let mut this_set = HashSet::with_capacity(1);
-
- this_set.insert(Some(t));
-
- children_sets_stack.push(this_set);
- }
- TNT::Non(non) => {
- let this_set = self
- .firsts
- .get(non)
- .ok_or(Error::IndexOutOfBounds(non, non_len))?
- .clone();
-
- children_sets_stack.push(this_set);
- }
- }
-
- let mut this_nodes = HashSet::with_capacity(1);
- this_nodes.insert(top_index);
-
- children_nodes_stack.push(this_nodes);
- }
- }
- }
-
- assert_eq!(
- children_sets_stack.len(),
- 1,
- "Too many elements left at the end"
- );
-
- assert_eq!(
- children_nodes_stack.len(),
- 1,
- "Too many elements left at the end"
- );
-
- for first in children_sets_stack.pop().unwrap().into_iter() {
- if !self.firsts.get(n).unwrap().contains(&first) {
- updated = true;
-
- self.firsts.get_mut(n).unwrap().insert(first);
- }
- }
-
- *self.first_nodes.get_mut(n).unwrap() =
- children_nodes_stack.pop().unwrap().into_iter().collect();
- }
- }
-
- Ok(())
- }
-
- /// Return the regular language of the left-linear closures of
- /// non-terminals in the grammar.
- ///
- /// The resulting vector is guaranteed to be of the same length as
- /// the number of non-terminals.
- ///
- /// The resulting regular language is not "self-contained". That
- /// is to say, its terminals indices are packed indices and are
- /// meaningless without the interpretation of the grammar. They
- /// should be converted to a nondeterministic finite automaton and
- /// then to its null closure later on.
- pub fn left_closure(&self) -> Result<Vec<DefaultRegex<TNT>>, Error> {
- let non_len = self.non_num();
-
- let mut result = Vec::with_capacity(non_len);
-
- for (n, rule) in self.rules.iter().enumerate() {
- let regex = &rule.regex;
-
- let regex_root = if let Some(root) = regex.root() {
- root
- } else {
- result.push(Default::default());
-
- continue;
- };
-
- let regex_len = regex.len();
-
- /// A convenient macro to retrieve the children from the
- /// original regular expression with error propagation.
- macro_rules! children {
- ($n:expr) => {
- regex
- .children_of($n)
- .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
- };
- }
-
- /// A convenient macro to retrieve the label from the
- /// original regular expression with error propagation.
- macro_rules! label {
- ($n:expr) => {
- regex
- .vertex_label($n)
- .map_err(|_| Error::IndexOutOfBounds($n, regex_len))?
- };
- }
-
- let parents = regex.parents_array().map_err(|e| match e {
- nfa::error::Error::UnknownNode(n) => Error::IndexOutOfBounds(n, regex_len),
- nfa::error::Error::Cycle => Error::BuildFail(n, ParseError::Cycle),
- _ => unreachable!(),
- })?;
-
- use RegexType::*;
- use TNT::*;
-
- let mut local_result: Vec<RegexType<TNT>> = Vec::with_capacity(regex_len * 2);
- let mut builder = ALGBuilder::default();
-
- /// Perform a depth-first copy
- macro_rules! df_copy {
- ($parent:expr, $index:expr) => {
- match label!($index) {
- Kleene | Plus | Optional | Or | Paren | Empty => {
- let mut stack = vec![($parent, $index)];
-
- while let Some((top_parent, top_index)) = stack.pop() {
- let label = label!(top_index);
- let label = match label {
- Lit(top_tnt) => Lit(Ter(self.pack_tnt(top_tnt).unwrap())),
- _ => label,
- };
-
- local_result.push(label);
-
- let new = builder.add_vertex();
-
- builder.add_edge(top_parent, new, ()).unwrap();
-
- stack.extend(children!(top_index).map(|child| (new, child)));
- }
- }
- Lit(remain_tnt) => {
- local_result.push(Lit(Ter(self.pack_tnt(remain_tnt).unwrap())));
- let new = builder.add_vertex();
- builder.add_edge($parent, new, ()).unwrap();
- }
- }
- };
- }
-
- local_result.push(Or);
- builder.add_vertex();
-
- local_result.push(Lit(Ter(self.pack_tnt(Non(n)).unwrap())));
- let non_lit_index = builder.add_vertex();
-
- builder.add_edge(0, non_lit_index, ()).unwrap();
-
- for first_node in self.first_nodes_of(n)?.copied() {
- assert!(first_node < parents.len());
-
- let tnt = match label!(first_node) {
- Lit(tnt) => tnt,
- _ => continue,
- };
-
- let mut parents_chain = {
- let mut result = Vec::new();
- let mut stack = Vec::with_capacity(parents.len());
-
- stack.push(first_node);
-
- while let Some(top) = stack.pop() {
- assert!(top < parents.len());
- if let Some(parent) = parents.get(top).copied().unwrap() {
- result.push(parent);
- stack.push(parent.0);
- }
- }
-
- result.reverse();
-
- result
- };
-
- if parents_chain.is_empty() {
- local_result.push(Lit(tnt));
- let lit_index = builder.add_vertex();
- builder.add_edge(0, lit_index, ()).unwrap();
-
- continue;
- }
-
- assert!(parents_chain.first().unwrap().0 == regex_root);
-
- // A different, "more local", root.
- let mut root: usize;
-
- // Handle the direct parent
- let (parent_node, parent_edge_index) = parents_chain.pop().unwrap();
-
- match label!(parent_node) {
- Kleene | Plus => {
- local_result.extend([Empty, Lit(tnt)]);
-
- root = builder.add_vertex();
- let lit_index = builder.add_vertex();
- builder.add_edge(root, lit_index, ()).unwrap();
-
- let iterator = children!(parent_node);
-
- for index in iterator.clone().skip(parent_edge_index + 1) {
- df_copy!(root, index);
- }
-
- local_result.push(Kleene);
- let new_parent = builder.add_vertex();
- builder.add_edge(root, new_parent, ()).unwrap();
-
- for index in iterator {
- df_copy!(new_parent, index);
- }
- }
-
- Or => {
- local_result.push(Lit(tnt));
- root = builder.add_vertex();
- }
- Optional | Empty => {
- // If this path is taken, it should not be
- // optional.
- local_result.extend([Empty, Lit(tnt)]);
- root = builder.add_vertex();
- let lit_index = builder.add_vertex();
- builder.add_edge(root, lit_index, ()).unwrap();
-
- for index in children!(parent_node).skip(parent_edge_index + 1) {
- df_copy!(root, index);
- }
- }
- Paren | Lit(_) => unreachable!(),
- }
-
- // Handle successive parents
-
- for (node, edge_index) in parents_chain.into_iter() {
- let node_type = label!(node);
-
- match node_type {
- Kleene | Plus => {
- local_result.push(Empty);
- let new_index = builder.add_vertex();
- builder.add_edge(new_index, root, ()).unwrap();
-
- root = new_index;
-
- let iterator = children!(node);
-
- for index in iterator.clone().skip(edge_index + 1) {
- df_copy!(root, index);
- }
-
- local_result.push(Kleene);
- let new_parent = builder.add_vertex();
- builder.add_edge(root, new_parent, ()).unwrap();
-
- for index in iterator {
- df_copy!(new_parent, index);
- }
- }
- RegexType::Or => {}
- RegexType::Optional | RegexType::Empty => {
- local_result.push(Empty);
- let new_index = builder.add_vertex();
- builder.add_edge(new_index, root, ()).unwrap();
- root = new_index;
-
- for index in children!(node).skip(edge_index + 1) {
- df_copy!(root, index);
- }
- }
- RegexType::Paren | RegexType::Lit(_) => unreachable!(),
- }
- }
-
- builder.add_edge(0, root, ()).unwrap();
- }
-
- local_result.shrink_to_fit();
-
- let graph = builder.build();
-
- assert_eq!(graph.nodes_len(), local_result.len());
-
- result.push(
- DefaultRegex::new(graph, local_result)
- .map_err(|_| Error::BuildFail(n, ParseError::Cycle))?,
- );
- }
-
- assert_eq!(result.len(), non_len);
-
- Ok(result)
- }
-
- /// Convert the regular language of left-linear closures to its
- /// equivalent nondeterministic finite automaton.
- ///
- /// In the generation of the left-linear closure, the terminals
- /// and non-terminals are packed into an unsigned integer. We
- /// unpack them in converting to nondeterministic finite
- /// automaton.
- ///
- /// The resulting nondeterministic finite automaton should be
- /// transformed to its null-closure for use in our algorithm.
- pub fn left_closure_to_nfa(
- &self,
- closure: &[DefaultRegex<TNT>],
- ) -> Result<DefaultNFA<DOption<TNT>>, Error> {
- let label_transform = |tnt| match tnt {
- TNT::Ter(t) => {
- let new_tnt = self.unpack_tnt(t).map_err(|e| match e {
- Error::IndexOutOfBounds(index, bound) => {
- graph::error::Error::IndexOutOfBounds(index, bound)
- }
- _ => unreachable!(),
- })?;
-
- Ok(SoC::Carry(new_tnt))
- }
- TNT::Non(n) => Ok(SoC::Sub(n)),
- };
-
- DefaultNFA::to_nfa(closure, label_transform).map_err(Into::into)
- }
-}
-
-impl Display for Grammar {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- assert_eq!(self.non.len(), self.rules.len());
-
- for (nt, rule) in self.non.iter().zip(self.rules.iter()) {
- write!(f, "{}: ", nt.name())?;
-
- writeln!(
- f,
- "{}",
- rule.regex.to_string_with(|tnt| format!(
- "({})",
- self.name_of_tnt(tnt)
- .unwrap_or_else(|_| format!("Unknown {tnt:?}"))
- ))?
- )?;
- }
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod test_grammar_helper {
- use super::*;
- use nfa::default::regex::{DefaultRegParser, ParseDirection, ParseError, RegexType};
- use std::fmt::Write;
-
- // Construct a grammar to test
- pub fn new_grammar() -> Result<Grammar, Box<dyn std::error::Error>> {
- let ter = vec![Terminal::new("a".to_owned()), Terminal::new("b".to_owned())];
- let non = vec![
- Nonterminal("start".to_owned()),
- Nonterminal("end".to_owned()),
- ];
-
- /// A function to scan the inputs.
- fn scan_tnt(
- parser: &DefaultRegParser<TNT>,
- input: &str,
- ) -> Result<Option<(usize, RegexType<TNT>, ParseDirection)>, ParseError> {
- use ParseDirection::*;
- use RegexType::*;
- use TNT::*;
-
- let mut chars = input.chars();
-
- if let Some(first) = chars.next() {
- match first {
- '*' => Ok(Some((1, Kleene, Right))),
- '+' => Ok(Some((1, Plus, Right))),
- '?' => Ok(Some((1, Optional, Right))),
- '|' => Ok(Some((1, Empty, Up))),
- '(' => Ok(Some((1, Or, Down))),
- ')' => Ok(Some((1, Paren, Up))),
- 'T' => {
- let mut name = String::new();
- let mut len = 1;
-
- while let Some(c) = chars.next() {
- if ('a'..='z').contains(&c) {
- len += 1;
- write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
- } else {
- break;
- }
- }
-
- if let Some(t) = parser.query(&name, true) {
- Ok(Some((len, Lit(Ter(t)), Right)))
- } else {
- Err(ParseError::InvalidCharacter(first))
- }
- }
- 'N' => {
- let mut name = String::new();
- let mut len = 1;
-
- while let Some(c) = chars.next() {
- if ('a'..='z').contains(&c) {
- len += 1;
- write!(name, "{c}").map_err(|_| ParseError::InvalidCharacter(c))?;
- } else {
- break;
- }
- }
-
- if let Some(n) = parser.query(&name, false) {
- Ok(Some((len, Lit(Non(n)), Right)))
- } else {
- Err(ParseError::InvalidCharacter(first))
- }
- }
- _ => Err(ParseError::InvalidCharacter(first)),
- }
- } else {
- Ok(None)
- }
- }
-
- let mut regex_parser: DefaultRegParser<TNT> = Default::default();
-
- regex_parser.add_tnt("a", true);
- regex_parser.add_tnt("b", true);
- regex_parser.add_tnt("start", false);
- regex_parser.add_tnt("end", false);
-
- let regex_parser = regex_parser;
-
- let rule1 = Rule {
- regex: regex_parser
- .parse("Ta*Tb+Nend+", Box::new(scan_tnt), true)?
- .ok_or(ParseError::Invalid)?
- .0,
- };
-
- let rule2 = Rule {
- regex: regex_parser
- .parse("Nstart?Nend*", Box::new(scan_tnt), true)?
- .ok_or(ParseError::Invalid)?
- .0,
- };
-
- let rules = vec![rule1, rule2];
-
- Ok(Grammar::new(ter, non, rules))
- }
-}
-
-#[cfg(test)]
-mod test_grammar_display {
- use super::test_grammar_helper::new_grammar;
-
- #[test]
- fn test_display() -> Result<(), Box<dyn std::error::Error>> {
- println!("{}", new_grammar()?);
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod test_grammar_firsts {
- use super::test_grammar_helper::new_grammar;
- use super::*;
-
- #[test]
- fn test_firsts() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_grammar()?;
-
- grammar.compute_firsts()?;
-
- println!("grammar firsts: {:?}", grammar.firsts);
- println!("grammar first nodes: {:?}", grammar.first_nodes);
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod test_grammar_left_closure {
- use super::test_grammar_helper::new_grammar;
- use super::*;
-
- pub fn new_closure_regex(
- grammar: &mut Grammar,
- ) -> Result<Vec<DefaultRegex<TNT>>, Box<dyn std::error::Error>> {
- grammar.compute_firsts()?;
-
- println!("grammar firsts: {:?}", grammar.firsts);
- println!("grammar first nodes: {:?}", grammar.first_nodes);
- println!("grammar:");
- println!("{grammar}");
-
- grammar.left_closure().map_err(Into::into)
- }
-
- #[test]
- fn test_regex() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_grammar()?;
-
- let vec_of_regexps = new_closure_regex(&mut grammar)?;
-
- for regex in &vec_of_regexps {
- println!("regex: {}", regex.to_string_with(|tnt| format!("{tnt}"))?);
- // println!("regex: {regex:?}",);
- println!("regex len = {}", regex.nodes_len());
- }
-
- Ok(())
- }
-
- #[test]
- fn test_nfa() -> Result<(), Box<dyn std::error::Error>> {
- let mut grammar = new_grammar()?;
- let closure = new_closure_regex(&mut grammar)?;
- let nfa = grammar.left_closure_to_nfa(&closure)?;
- // TODO: print the nfa out
- Ok(())
- }
-}