summaryrefslogtreecommitdiff
path: root/nfa/src
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-12-23 00:36:31 +0800
committerJSDurand <mmemmew@gmail.com>2022-12-23 00:36:31 +0800
commit8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (patch)
tree343eea3c634efbbf72c64ed5dd778ecce60c3eea /nfa/src
parent9f1c88b863e247da3cd60d2792a7a13b18e25e53 (diff)
renaming core to chain and some other changes
Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations.
Diffstat (limited to 'nfa/src')
-rw-r--r--nfa/src/default/nfa.rs374
-rw-r--r--nfa/src/default/regex.rs313
-rw-r--r--nfa/src/desrec.rs22
-rw-r--r--nfa/src/error.rs11
-rw-r--r--nfa/src/lib.rs94
5 files changed, 743 insertions, 71 deletions
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index 3c2bd83..229ba4d 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -1,25 +1,26 @@
//! This file provides a default implementation of NFA.
-// TODO: Store the regular expression in the NFA as well.
-//
-// The current focus of the project is to understand the growth rate
-// of the algorithm, to know whether I made a mistake in the previous
-// iteration of the implementation, or the algorithm is not as fast as
-// the author estimated, which is not quite likely, of course.
+// TODO: The current focus of the project is to understand the growth
+// rate of the algorithm, to know whether I made a mistake in the
+// previous iteration of the implementation, or the algorithm is not
+// as fast as the author estimated, which is not quite likely, of
+// course.
//
// Thus I shall establish a friendly interface that allows me to view
// and debug the atomic languages and the languages, transparently.
-use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph};
+use graph::{
+ builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel,
+ LabelGraph,
+};
-use crate::{error::Error, Nfa, Regex};
+use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
-use std::fmt::Display;
+use core::fmt::{Debug, Display};
-#[non_exhaustive]
-#[derive(Debug)]
/// Default NFA implementation.
-pub struct DefaultNFA<T: GraphLabel + Display> {
+#[derive(Debug, Clone)]
+pub struct DefaultNFA<T: GraphLabel> {
graph: DLGraph<T>,
}
@@ -63,6 +64,16 @@ impl<T: GraphLabel + Display> Graph for DefaultNFA<T> {
fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
self.graph.has_edge(source, target)
}
+
+ #[inline]
+ fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> {
+ self.graph.print_viz(filename)
+ }
+
+ #[inline]
+ fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) {
+ unimplemented!()
+ }
}
impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
@@ -70,11 +81,10 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a;
- // TODO: Return the label from the contained regular language.
#[inline]
fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> {
if self.has_node(node_id) {
- todo!()
+ unimplemented!()
} else {
Err(GError::IndexOutOfBounds(node_id, self.nodes_len()))
}
@@ -98,12 +108,232 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> {
self.graph.labels_of(node_id)
}
+
+ #[inline]
+ fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result<bool, GError> {
+ self.graph.has_edge_label(node_id, label, target)
+ }
}
impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
- #[allow(unused)]
- fn to_nfa(regex: impl Regex<T>) -> Self {
- todo!()
+ type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
+
+ fn to_nfa(
+ regexps: &[impl Regex<RegexType<T>>],
+ sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+ ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+ let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
+
+ if total_regexps_len == 0 {
+ return Ok(Default::default());
+ }
+
+ // We reserve two more rooms for later uses.
+ let nfa_len = total_regexps_len * 2 + 2;
+
+ let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
+
+ // Since DOption<T> implements Copy when T does, we can use a
+ // variable to hold the empty label to avoid repetitions.
+ let empty_label: DOption<T> = Default::default();
+
+ for _ in 0..nfa_len {
+ builder.add_vertex();
+ }
+
+ let accumulators: Vec<usize> = {
+ let mut result = Vec::with_capacity(regexps.len() + 1);
+
+ result.push(0);
+
+ let mut accumulator = 0;
+
+ for regexp in regexps.iter() {
+ accumulator += regexp.nodes_len() * 2;
+ result.push(accumulator);
+ }
+
+ result.pop();
+
+ result
+ };
+
+ assert_eq!(accumulators.len(), regexps.len());
+
+ /// Get offset from `accumulators` safely.
+ macro_rules! get_offset_safe (
+ ($num:expr) => {
+ *accumulators.get($num).ok_or(Error::UnknownNode($num))?
+ }
+ );
+
+ /// Get offset from `accumulators` without checking bounds.
+ macro_rules! get_offset_unsafe (
+ ($num:expr) => {
+ { unsafe { *accumulators.get_unchecked($num) } }
+ }
+ );
+
+ for (index, regex) in regexps.iter().enumerate() {
+ let root = if let Some(root) = regex.root() {
+ root
+ } else {
+ // A regular expression without roots is empty, so we
+ // skip it.
+ continue;
+ };
+
+ let regex_len = regex.nodes_len();
+
+ // It is safe here to call `get_offset_unsafe`, as `index`
+ // is guaranteed to be strictly less than the length of
+ // `accumulators` by an assertion above.
+ let offset = get_offset_unsafe!(index);
+
+ let mut stack: Vec<usize> = Vec::with_capacity(regex_len);
+
+ stack.push(root);
+
+ while let Some(top_index) = stack.pop() {
+ let top_label = regex.vertex_label(top_index)?;
+
+ let nfa_start = offset + 2 * top_index;
+ let nfa_end = offset + 2 * top_index + 1;
+
+ match top_label {
+ RegexType::Kleene => {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+
+ builder.add_edge(nfa_end, nfa_start, empty_label)?;
+ }
+ }
+ RegexType::Plus => {
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+
+ builder.add_edge(nfa_end, nfa_start, empty_label)?;
+ } else {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Optional => {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Or => {
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(nfa_start, child_start, empty_label)?;
+ builder.add_edge(child_end, nfa_end, empty_label)?;
+ }
+ } else {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Paren => {
+ // Ignore Paren nodes since currently these
+ // are used only for printing purposes.
+ }
+ RegexType::Empty => {
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+ } else {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Lit(value) => {
+ // The children would be ignored even if for
+ // some reason this literal node had some
+ // children.
+
+ match sub_pred(value)? {
+ SoC::Sub(sub_non) => {
+ // a non-terminal
+
+ let sub_offset = get_offset_safe!(sub_non);
+ let sub_nfa_start = sub_offset + 2 * sub_non;
+ let sub_nfa_end = sub_offset + 2 * sub_non + 1;
+
+ builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
+ builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
+ }
+ SoC::Carry(new_value) => {
+ // a terminal
+
+ builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ let graph = builder.build();
+
+ Ok(DefaultNFA { graph })
}
fn remove_epsilon(&mut self) -> Result<(), Error> {
@@ -114,7 +344,113 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
todo!()
}
- fn nulling(&mut self) -> Result<(), Error> {
- todo!()
+ fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
+ let mut updated = true;
+ let mut builder = self.graph.builder_ref();
+
+ while updated {
+ updated = false;
+
+ let mut nullable = false;
+
+ let mut to_add = Vec::new();
+
+ for (source, target) in builder.edges() {
+ for label in builder.edge_label(source, target)? {
+ if f(label) {
+ nullable = true;
+
+ break;
+ }
+ }
+
+ if nullable {
+ for (label, child_iter) in builder.labels_of(target)? {
+ for child in child_iter {
+ if !builder.has_edge_label(source, label, child)? {
+ updated = true;
+
+ to_add.push((source, child, *label));
+ }
+ }
+ }
+ }
+ }
+
+ for (source, child, label) in to_add {
+ builder.add_edge(source, child, label)?;
+ }
+ }
+
+ self.graph.replace_by_builder(builder);
+
+ Ok(())
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ Debug::fmt(self, f)
+ }
+}
+
+#[cfg(test)]
+mod test_to_nfa {
+ #![allow(unused_imports)]
+ use super::*;
+ use crate::SoC;
+
+ use crate::{
+ default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
+ desrec::DesRec,
+ };
+
+ fn new_regex() -> Result<DefaultRegex<char>, ParseError> {
+ let parser = DefaultRegParser::<char>::default();
+
+ fn test_scanner<T: GraphLabel + Display + Debug>(
+ _parser: &DefaultRegParser<T>,
+ input: &str,
+ ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+ use ParseDirection::*;
+ use RegexType::*;
+
+ if let Some(first) = input.chars().next() {
+ match first {
+ '*' => Ok(Some((1, Kleene, Right))),
+ '+' => Ok(Some((1, Plus, Right))),
+ '?' => Ok(Some((1, Optional, Right))),
+ '|' => Ok(Some((1, Empty, Up))),
+ '(' => Ok(Some((1, Or, Down))),
+ ')' => Ok(Some((1, Paren, Up))),
+ ' '..='~' => Ok(Some((1, Lit(first), Right))),
+ _ => Err(ParseError::InvalidCharacter(first)),
+ }
+ } else {
+ Ok(None)
+ }
+ }
+
+ let input_string = "a*b?c+|(d*| +)?".to_owned();
+
+ Ok(
+ DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
+ .unwrap_or(Default::default())
+ .0,
+ )
+ }
+
+ #[test]
+ fn test_to_nfa() -> Result<(), Box<dyn std::error::Error>> {
+ let regex = new_regex()?;
+
+ println!("regex = {regex}");
+
+ let nfa: DefaultNFA<DOption<char>> =
+ DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?;
+
+ nfa.print_viz("nfa.gv")?;
+
+ Ok(())
}
}
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs
index a60f801..2670e32 100644
--- a/nfa/src/default/regex.rs
+++ b/nfa/src/default/regex.rs
@@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex};
use receme::{algebra::Algebra, catana::Cata};
-use std::fmt::{Debug, Display};
+use std::{
+ collections::HashMap,
+ fmt::{Debug, Display, Write},
+ marker::PhantomData,
+};
/// The type of a node in a regular expression.
///
@@ -25,7 +29,7 @@ use std::fmt::{Debug, Display};
/// This is possible because a regular expression has a root node.
/// For the sake of convenience, the root node has type "Or".
#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
-pub enum RegexType<T: GraphLabel + Display> {
+pub enum RegexType<T: GraphLabel> {
/// A star node is a node with an arbitrary number of repetitions.
Kleene,
/// A plus node is a node with at least one repetition: a+ equals
@@ -44,13 +48,31 @@ pub enum RegexType<T: GraphLabel + Display> {
Lit(T),
}
+impl<T: GraphLabel> Display for RegexType<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ RegexType::Kleene => write!(f, "*"),
+ RegexType::Plus => write!(f, "+"),
+ RegexType::Optional => write!(f, "?"),
+ RegexType::Or => write!(f, "|"),
+ RegexType::Paren => write!(f, "()"),
+ RegexType::Empty => write!(f, "empty"),
+ RegexType::Lit(value) => write!(f, "{value}"),
+ }
+ }
+}
+
/// A default implementation of regular expressions.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct DefaultRegex<T: GraphLabel + Display> {
/// The underlying graph is stored using adjacency lists.
graph: ALGraph,
/// The types of the underlying nodes.
types: Vec<RegexType<T>>,
+ /// The root of the graph.
+ ///
+ /// If it is None, it indicates the regular expression is empty.
+ root: Option<usize>,
}
impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
@@ -58,11 +80,39 @@ impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
Self {
graph: Default::default(),
types: Default::default(),
+ root: Default::default(),
}
}
}
-impl<T: GraphLabel + Display> DefaultRegex<T> {
+impl<T: GraphLabel> DefaultRegex<T> {
+ /// Set the root of the regular expression.
+ #[inline]
+ pub fn set_root(&mut self, root: Option<usize>) {
+ self.root = root;
+ }
+
+ /// Construct a regular expression from a raw adjacency list graph
+ /// and an array of labels.
+ ///
+ /// # Error
+ ///
+ /// If the graph contains cycles, this function will return an
+ /// error. This check is added to make sure that the regular
+ /// expression contains no cycles, otherwise operations with
+ /// regular expressions might enter infinite loops later.
+ pub fn new(graph: ALGraph, types: Vec<RegexType<T>>) -> Result<Self, Error> {
+ if graph.contains_cycles()? {
+ Err(Error::Cycle)
+ } else {
+ Ok(Self {
+ graph,
+ types,
+ root: Some(0),
+ })
+ }
+ }
+
/// Return the number of elements in this regular expression,
/// counting nodes.
pub fn len(&self) -> usize {
@@ -86,10 +136,65 @@ impl<T: GraphLabel + Display> DefaultRegex<T> {
Ok(())
}
+
+ /// Return the internal graph.
+ ///
+ /// Normally this should not be used.
+ pub fn internal_graph(&self) -> ALGraph {
+ self.graph.clone()
+ }
+
+ /// Return the internal types array.
+ ///
+ /// Normally this should not be used.
+ pub fn internal_types(&self) -> Vec<RegexType<T>> {
+ self.types.clone()
+ }
+
+ /// Return the array of parents.
+ ///
+ /// The element at index N of the returned array is the parent of
+ /// that N-th element. If an element has no parents, a `None` is
+ /// placed at its place.
+ ///
+ /// # Error
+ ///
+ /// If some edge points to an invalid node, an erro will be
+ /// returned.
+ ///
+ /// Also, if the graph contains cycles, an error is also returned.
+ pub fn parents_array(&self) -> Result<Vec<Option<(usize, usize)>>, Error> {
+ if self.graph.contains_cycles().map_err(|e| match e {
+ GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n),
+ _ => unreachable!(),
+ })? {
+ return Err(Error::Cycle);
+ }
+
+ let len = self.len();
+
+ let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect();
+
+ for source in self.graph.nodes() {
+ for (index, target) in self
+ .graph
+ .children_of(source)
+ .map_err(|_| Error::UnknownNode(source))?
+ .enumerate()
+ {
+ result
+ .get_mut(target)
+ .ok_or(Error::UnknownNode(target))?
+ .replace((source, index));
+ }
+ }
+
+ Ok(result)
+ }
}
// REVIEW: This may not be needed.
-impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
+impl<S: GraphLabel, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
where
A: Algebra<T, Vec<T>>,
{
@@ -118,8 +223,13 @@ where
}
}
-impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl<T: GraphLabel> DefaultRegex<T> {
+ /// Use a function to format the labels of the regular expressions
+ /// and format the entire regular expression with this aid.
+ pub fn to_string_with<F>(&self, mut f: F) -> Result<String, std::fmt::Error>
+ where
+ F: FnMut(T) -> String,
+ {
#[derive(Copy, Clone)]
enum StackElement {
Seen(usize),
@@ -134,16 +244,15 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
}
}
- fn is_seen(&self) -> bool {
- match self {
- Seen(_) => true,
- Unseen(_) => false,
- }
+ fn is_seen(self) -> bool {
+ matches!(self, Seen(_))
}
}
use StackElement::{Seen, Unseen};
+ let mut result = String::new();
+
let mut stack: Vec<StackElement> = Vec::new();
let mut types = self.types.clone();
types.push(RegexType::Paren);
@@ -159,6 +268,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
RegexType::Kleene => {
if !top.is_seen() {
stack.push(Seen(top.index()));
+
+ if self.degree(top.index()).unwrap() > 1 {
+ write!(result, "(")?;
+ stack.push(Unseen(types.len() - 1));
+ }
+
stack.extend(
self.graph
.children_of(top.index())
@@ -167,7 +282,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
} else {
- write!(f, "*")?;
+ write!(result, "*")?;
}
}
RegexType::Plus => {
@@ -181,7 +296,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
} else {
- write!(f, "+")?;
+ write!(result, "+")?;
}
}
RegexType::Optional => {
@@ -195,12 +310,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
} else {
- write!(f, "?")?;
+ write!(result, "?")?;
}
}
RegexType::Or => {
if !top.is_seen() {
- write!(f, "(")?;
+ write!(result, "(")?;
let len = self.len();
@@ -221,11 +336,11 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
}
}
} else {
- write!(f, "|")?;
+ write!(result, "|")?;
}
}
RegexType::Paren => {
- write!(f, ")")?;
+ write!(result, ")")?;
}
RegexType::Empty => {
stack.extend(
@@ -236,11 +351,17 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
}
- RegexType::Lit(label) => write!(f, "{label}")?,
+ RegexType::Lit(label) => write!(result, "{}", f(label))?,
}
}
- Ok(())
+ Ok(result)
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultRegex<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}", self.to_string_with(|t| format!("{t}"))?)
}
}
@@ -278,9 +399,20 @@ impl<T: GraphLabel + Display> Graph for DefaultRegex<T> {
fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
self.graph.has_edge(source, target)
}
+
+ #[inline]
+ fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) {
+ unimplemented!()
+ }
}
-impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
+impl<T: GraphLabel + Display + Debug> Regex<RegexType<T>> for DefaultRegex<T> {
+ /// Return the root of the regular expression.
+ #[inline]
+ fn root(&self) -> Option<usize> {
+ self.root
+ }
+
#[inline]
fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> {
self.types
@@ -291,8 +423,10 @@ impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
}
/// An error type for holding parsing errors.
-#[derive(Debug)]
+#[derive(Debug, Copy, Clone)]
pub enum ParseError {
+ /// A cycle is encountered.
+ Cycle,
/// Encounter an invalid state.
Invalid,
/// An error from graph operations.
@@ -354,23 +488,77 @@ impl Display for ParseDirection {
}
}
-impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
+/// A default recursive descent parser for regular expressions of
+/// terminals or non-terminals.
+#[derive(Debug, Clone)]
+pub struct DefaultRegParser<T: GraphLabel + Display> {
+ ter_map: HashMap<String, usize>,
+ non_map: HashMap<String, usize>,
+ _phantom: PhantomData<T>,
+}
+
+impl<T: GraphLabel + Display> DefaultRegParser<T> {
+ /// Query if a terminal or a non-terminal is already found.
+ ///
+ /// If found, return the associated index of the terminal or
+ /// non-terminal.
+ pub fn query(&self, tnt: &str, terminal_p: bool) -> Option<usize> {
+ if terminal_p {
+ self.ter_map.get(tnt).copied()
+ } else {
+ self.non_map.get(tnt).copied()
+ }
+ }
+
+ /// Add a terminal or a non-terminal.
+ pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) {
+ if terminal_p {
+ let ter_len = self.ter_map.len();
+
+ self.ter_map.entry(tnt.to_owned()).or_insert(ter_len);
+ } else {
+ let non_len = self.non_map.len();
+
+ self.non_map.entry(tnt.to_owned()).or_insert(non_len);
+ }
+ }
+}
+
+impl<T: GraphLabel + Display> Default for DefaultRegParser<T> {
+ fn default() -> Self {
+ Self {
+ ter_map: Default::default(),
+ non_map: Default::default(),
+ _phantom: PhantomData,
+ }
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegParser<T> {
type Label = RegexType<T>;
- type Regex = Self;
+ type Regex = DefaultRegex<T>;
type Error = ParseError;
- type Aux = ParseDirection;
+ type Inter = ParseDirection;
- type Scanner<'a> =
- Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>;
+ type Scanner<'a, 'b> = Box<
+ dyn FnMut(
+ &'b Self,
+ &'a str,
+ ) -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>,
+ > where RegexType<T>:'b;
- fn parse<'a>(
+ fn parse<'a, 'b>(
+ &'b self,
mut input: &'a str,
- mut scanner: Self::Scanner<'a>,
+ mut scanner: Self::Scanner<'a, 'b>,
post_p: bool,
- ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> {
+ ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error>
+ where
+ Self::Label: 'b,
+ {
use ParseDirection::*;
use RegexType::*;
@@ -380,7 +568,7 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
// Classifies the input into a sequence of tokens with
// directions.
while !input.is_empty() {
- if let Some((len, label, direction)) = scanner(input)? {
+ if let Some((len, label, direction)) = scanner(self, input)? {
if len == 0 {
break;
}
@@ -523,7 +711,12 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
let graph = list_of_children.into();
- let result = DefaultRegex { graph, types };
+ // Check there are no cycles
+ let result = DefaultRegex::new(graph, types).map_err(|e| match e {
+ Error::Graph(ge) => ParseError::Graph(ge),
+ Error::Cycle => ParseError::Cycle,
+ _ => unreachable!(),
+ })?;
Ok(Some((result, input)))
}
@@ -533,9 +726,17 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
mod test_des_rec {
use super::*;
- fn test_scanner(
- input: &str,
- ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+ use crate::desrec::DesRec;
+
+ #[allow(dead_code, unused)]
+ fn test_scanner<'a, 'b, T>(
+ parser: &'b DefaultRegParser<T>,
+ input: &'a str,
+ ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError>
+ where
+ T: GraphLabel + Display + Debug,
+ T: 'b,
+ {
use ParseDirection::*;
use RegexType::*;
@@ -557,19 +758,57 @@ mod test_des_rec {
#[test]
fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> {
- let input_string = "a*b?c+|(d*| +)?".to_owned();
+ let input_string = "(ade)*b?c+|(d*| +)?".to_owned();
+
+ let parser: DefaultRegParser<char> = Default::default();
if let Some((regex, remain)) =
- DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)?
+ DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
{
println!("regex = {regex}");
println!("remain = {remain}");
println!("regex length = {}", regex.len());
+ let parents = regex.parents_array()?;
+
+ println!("parents = {parents:?}");
+
Ok(())
} else {
unreachable!()
}
}
+
+ #[test]
+ fn test_display() -> Result<(), Box<dyn std::error::Error>> {
+ use graph::builder::Builder;
+ use RegexType::*;
+
+ let mut builder = graph::adlist::ALGBuilder::default();
+ let mut types: Vec<RegexType<usize>> = Vec::with_capacity(4);
+
+ types.push(Kleene);
+ builder.add_vertex();
+
+ types.push(Lit(0));
+ builder.add_vertex();
+ builder.add_edge(0, 1, ())?;
+
+ types.push(Lit(1));
+ builder.add_vertex();
+ builder.add_edge(0, 2, ())?;
+
+ types.push(Lit(2));
+ builder.add_vertex();
+ builder.add_edge(0, 3, ())?;
+
+ let graph = builder.build();
+
+ let regex = DefaultRegex::new(graph, types)?;
+
+ println!("regex = {regex}");
+
+ Ok(())
+ }
}
diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs
index c57d313..ac45c2d 100644
--- a/nfa/src/desrec.rs
+++ b/nfa/src/desrec.rs
@@ -20,8 +20,8 @@ pub trait DesRec {
/// The type of errors encountered when parsing.
type Error: std::error::Error;
- /// Auxiliary data when parsing
- type Aux;
+ /// Intermediate data when parsing
+ type Inter;
/// The type of a scanner that classifies inputs into tokens.
///
@@ -34,7 +34,14 @@ pub trait DesRec {
///
/// - `Ok(Some(_))`: the classification succeeds and the parsing
/// should continue.
- type Scanner<'a>: FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>;
+ type Scanner<'a, 'b>: FnMut(
+ &'b Self,
+ &'a str,
+ )
+ -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>
+ where
+ Self: 'b,
+ Self::Label: 'b;
/// Parse a string into a regular expression with the aid of this
/// type.
@@ -42,9 +49,12 @@ pub trait DesRec {
/// Accept a slice of string and return either a parsing error, or
/// a pair of correctly parsed regular expression and the
/// remaining slice.
- fn parse<'a>(
+ fn parse<'a, 'b>(
+ &'b self,
input: &'a str,
- scanner: Self::Scanner<'a>,
+ scanner: Self::Scanner<'a, 'b>,
post_p: bool,
- ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error>;
+ ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error>
+ where
+ Self::Label: 'b;
}
diff --git a/nfa/src/error.rs b/nfa/src/error.rs
index 0c6bb3c..ad75077 100644
--- a/nfa/src/error.rs
+++ b/nfa/src/error.rs
@@ -2,10 +2,11 @@
use graph::error::Error as GError;
-use std::fmt::{Display, Formatter};
+use core::fmt::{Display, Formatter};
-#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
/// The error type for NFA operations.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)]
+#[non_exhaustive]
pub enum Error {
/// An unknown node id is encountered.
UnknownNode(usize),
@@ -13,8 +14,12 @@ pub enum Error {
///
/// Everything is a trade-off, wink wink.
UnsupportedOperation,
+ /// There is no root in the underlying regular expression.
+ NoRoot,
/// This error comes from some underlying graph operation.
Graph(GError),
+ /// A cycle is found when constructing regular expressions.
+ Cycle,
}
impl Display for Error {
@@ -23,6 +28,8 @@ impl Display for Error {
Error::UnknownNode(id) => write!(f, "unknown node: {id}"),
Error::UnsupportedOperation => write!(f, "unsupported operation"),
Error::Graph(e) => write!(f, "graph error: {e}"),
+ Error::NoRoot => write!(f, "no root found"),
+ Error::Cycle => write!(f, "a cycle is found when constructing regular expressions"),
}
}
}
diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs
index 1e2a30c..b1d62b3 100644
--- a/nfa/src/lib.rs
+++ b/nfa/src/lib.rs
@@ -12,17 +12,23 @@ extern crate graph;
use core::fmt::Display;
+use std::ops::{Deref, DerefMut};
+
use graph::{Graph, GraphLabel, LabelGraph};
use error::Error;
+pub use desrec::DesRec;
+
+use default::regex::RegexType;
+
/// The expected behaviour of a regular language.
///
/// Nondeterministic finite automata are equivalent to regular
/// languages. Since regular languages are easier to understand for a
/// human being, nondeterministic finite automata include the data for
/// the equivalent regular languages.
-pub trait Regex<T: GraphLabel>: Graph + Display {
+pub trait Regex<T: GraphLabel>: Graph + Display + Clone {
/// Return the label of a vertex, or an error if the node is
/// invalid.
fn vertex_label(&self, node_id: usize) -> Result<T, Error>;
@@ -52,6 +58,65 @@ pub trait Regex<T: GraphLabel>: Graph + Display {
// cetera. These might be needed later.
}
+/// Since Option<T> does not inherit the Display from T, we wrap it to
+/// provide an automatic implementation of Display.
+#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)]
+pub struct DOption<T>(Option<T>);
+
+impl<T> Default for DOption<T> {
+ fn default() -> Self {
+ Self(None)
+ }
+}
+
+impl<T> Deref for DOption<T> {
+ type Target = Option<T>;
+
+ fn deref(&self) -> &Self::Target {
+ &self.0
+ }
+}
+
+impl<T> DerefMut for DOption<T> {
+ fn deref_mut(&mut self) -> &mut Self::Target {
+ &mut self.0
+ }
+}
+
+impl<T: Display> Display for DOption<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self.deref() {
+ Some(value) => Display::fmt(value, f),
+ None => write!(f, "ε"),
+ }
+ }
+}
+
+/// Substitute or Carry
+///
+/// This enumeration indicates whether a label from a regular
+/// expression should be substituted by another regular expression, or
+/// to be carried around in the resulting nondeterministic finite
+/// automaton, in the process of the [`to_nfa`][Nfa::to_nfa] function.
+///
+/// # Transform labels
+///
+/// The label that is returned to be carried can be different from the
+/// original label, as a way to transform the labels.
+///
+/// # Remark on the abbreviation
+///
+/// It happens "by chance" that this abbreviation coincides with the
+/// abbreviation of "system on chip". Since I know nothing about this
+/// topic, this is just a meaningless pun.
+#[derive(Debug, Copy, Clone)]
+pub enum SoC<T> {
+ /// To be substituted by another regular expression.
+ Sub(usize),
+ /// To carry around this label.
+ Carry(T),
+}
+
/// The expected behvaiour of a nondeterministic finite automaton.
///
/// Every NFA is a special labelled graph.
@@ -68,9 +133,23 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
Err(Error::UnsupportedOperation)
}
- /// Build a nondeterministic finite automaton out of a regular
- /// language.
- fn to_nfa(regex: impl Regex<T>) -> Self;
+ /// When we convert a regular expression to a nondeterministic
+ /// finite automaton, the label should be optional, so we use a
+ /// different type for the result type.
+ type FromRegex<S: GraphLabel + Display + Default>;
+
+ /// Build a nondeterministic finite automaton out of a set of
+ /// regular expressions.
+ ///
+ /// The SUB_PRED is a predicate function that returns an optional
+ /// index for a label. If it returns some index, then this means
+ /// we should substitute the index-th regular expression in the
+ /// set, whereever that label occurs in the set of regular
+ /// expressions.
+ fn to_nfa(
+ regexps: &[impl Regex<RegexType<T>>],
+ sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+ ) -> Result<Self::FromRegex<DOption<T>>, Error>;
/// Remove all dead states from the nondeterministic finite
/// automaton.
@@ -78,13 +157,14 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
/// A state is dead if there are no edges going to the state.
fn remove_dead(&mut self) -> Result<(), Error>;
- /// For each empty transition from A to B, and for every edge from
- /// B to C, say, add an edge from A to C.
+ /// For each edge from A to B whose edge is considered nullable by
+ /// a function `f`, and for every edge from B to C, add an edge
+ /// from A to C.
///
/// This is needed specifically by the algorithm to produce a set
/// of atomic languages that represent "null-closed" derived
/// languages.
- fn nulling(&mut self) -> Result<(), Error>;
+ fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>;
}
pub mod default;