summaryrefslogtreecommitdiff
path: root/nfa/src/default/regex.rs
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2022-12-23 00:36:31 +0800
committerJSDurand <mmemmew@gmail.com>2022-12-23 00:36:31 +0800
commit8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (patch)
tree343eea3c634efbbf72c64ed5dd778ecce60c3eea /nfa/src/default/regex.rs
parent9f1c88b863e247da3cd60d2792a7a13b18e25e53 (diff)
renaming core to chain and some other changes
Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations.
Diffstat (limited to 'nfa/src/default/regex.rs')
-rw-r--r--nfa/src/default/regex.rs313
1 files changed, 276 insertions, 37 deletions
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs
index a60f801..2670e32 100644
--- a/nfa/src/default/regex.rs
+++ b/nfa/src/default/regex.rs
@@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex};
use receme::{algebra::Algebra, catana::Cata};
-use std::fmt::{Debug, Display};
+use std::{
+ collections::HashMap,
+ fmt::{Debug, Display, Write},
+ marker::PhantomData,
+};
/// The type of a node in a regular expression.
///
@@ -25,7 +29,7 @@ use std::fmt::{Debug, Display};
/// This is possible because a regular expression has a root node.
/// For the sake of convenience, the root node has type "Or".
#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
-pub enum RegexType<T: GraphLabel + Display> {
+pub enum RegexType<T: GraphLabel> {
/// A star node is a node with an arbitrary number of repetitions.
Kleene,
/// A plus node is a node with at least one repetition: a+ equals
@@ -44,13 +48,31 @@ pub enum RegexType<T: GraphLabel + Display> {
Lit(T),
}
+impl<T: GraphLabel> Display for RegexType<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ RegexType::Kleene => write!(f, "*"),
+ RegexType::Plus => write!(f, "+"),
+ RegexType::Optional => write!(f, "?"),
+ RegexType::Or => write!(f, "|"),
+ RegexType::Paren => write!(f, "()"),
+ RegexType::Empty => write!(f, "empty"),
+ RegexType::Lit(value) => write!(f, "{value}"),
+ }
+ }
+}
+
/// A default implementation of regular expressions.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
pub struct DefaultRegex<T: GraphLabel + Display> {
/// The underlying graph is stored using adjacency lists.
graph: ALGraph,
/// The types of the underlying nodes.
types: Vec<RegexType<T>>,
+ /// The root of the graph.
+ ///
+ /// If it is None, it indicates the regular expression is empty.
+ root: Option<usize>,
}
impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
@@ -58,11 +80,39 @@ impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
Self {
graph: Default::default(),
types: Default::default(),
+ root: Default::default(),
}
}
}
-impl<T: GraphLabel + Display> DefaultRegex<T> {
+impl<T: GraphLabel> DefaultRegex<T> {
+ /// Set the root of the regular expression.
+ #[inline]
+ pub fn set_root(&mut self, root: Option<usize>) {
+ self.root = root;
+ }
+
+ /// Construct a regular expression from a raw adjacency list graph
+ /// and an array of labels.
+ ///
+ /// # Error
+ ///
+ /// If the graph contains cycles, this function will return an
+ /// error. This check is added to make sure that the regular
+ /// expression contains no cycles, otherwise operations with
+ /// regular expressions might enter infinite loops later.
+ pub fn new(graph: ALGraph, types: Vec<RegexType<T>>) -> Result<Self, Error> {
+ if graph.contains_cycles()? {
+ Err(Error::Cycle)
+ } else {
+ Ok(Self {
+ graph,
+ types,
+ root: Some(0),
+ })
+ }
+ }
+
/// Return the number of elements in this regular expression,
/// counting nodes.
pub fn len(&self) -> usize {
@@ -86,10 +136,65 @@ impl<T: GraphLabel + Display> DefaultRegex<T> {
Ok(())
}
+
+ /// Return the internal graph.
+ ///
+ /// Normally this should not be used.
+ pub fn internal_graph(&self) -> ALGraph {
+ self.graph.clone()
+ }
+
+ /// Return the internal types array.
+ ///
+ /// Normally this should not be used.
+ pub fn internal_types(&self) -> Vec<RegexType<T>> {
+ self.types.clone()
+ }
+
+ /// Return the array of parents.
+ ///
+ /// The element at index N of the returned array is the parent of
+ /// that N-th element. If an element has no parents, a `None` is
+ /// placed at its place.
+ ///
+ /// # Error
+ ///
+ /// If some edge points to an invalid node, an erro will be
+ /// returned.
+ ///
+ /// Also, if the graph contains cycles, an error is also returned.
+ pub fn parents_array(&self) -> Result<Vec<Option<(usize, usize)>>, Error> {
+ if self.graph.contains_cycles().map_err(|e| match e {
+ GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n),
+ _ => unreachable!(),
+ })? {
+ return Err(Error::Cycle);
+ }
+
+ let len = self.len();
+
+ let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect();
+
+ for source in self.graph.nodes() {
+ for (index, target) in self
+ .graph
+ .children_of(source)
+ .map_err(|_| Error::UnknownNode(source))?
+ .enumerate()
+ {
+ result
+ .get_mut(target)
+ .ok_or(Error::UnknownNode(target))?
+ .replace((source, index));
+ }
+ }
+
+ Ok(result)
+ }
}
// REVIEW: This may not be needed.
-impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
+impl<S: GraphLabel, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
where
A: Algebra<T, Vec<T>>,
{
@@ -118,8 +223,13 @@ where
}
}
-impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl<T: GraphLabel> DefaultRegex<T> {
+ /// Use a function to format the labels of the regular expressions
+ /// and format the entire regular expression with this aid.
+ pub fn to_string_with<F>(&self, mut f: F) -> Result<String, std::fmt::Error>
+ where
+ F: FnMut(T) -> String,
+ {
#[derive(Copy, Clone)]
enum StackElement {
Seen(usize),
@@ -134,16 +244,15 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
}
}
- fn is_seen(&self) -> bool {
- match self {
- Seen(_) => true,
- Unseen(_) => false,
- }
+ fn is_seen(self) -> bool {
+ matches!(self, Seen(_))
}
}
use StackElement::{Seen, Unseen};
+ let mut result = String::new();
+
let mut stack: Vec<StackElement> = Vec::new();
let mut types = self.types.clone();
types.push(RegexType::Paren);
@@ -159,6 +268,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
RegexType::Kleene => {
if !top.is_seen() {
stack.push(Seen(top.index()));
+
+ if self.degree(top.index()).unwrap() > 1 {
+ write!(result, "(")?;
+ stack.push(Unseen(types.len() - 1));
+ }
+
stack.extend(
self.graph
.children_of(top.index())
@@ -167,7 +282,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
} else {
- write!(f, "*")?;
+ write!(result, "*")?;
}
}
RegexType::Plus => {
@@ -181,7 +296,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
} else {
- write!(f, "+")?;
+ write!(result, "+")?;
}
}
RegexType::Optional => {
@@ -195,12 +310,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
} else {
- write!(f, "?")?;
+ write!(result, "?")?;
}
}
RegexType::Or => {
if !top.is_seen() {
- write!(f, "(")?;
+ write!(result, "(")?;
let len = self.len();
@@ -221,11 +336,11 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
}
}
} else {
- write!(f, "|")?;
+ write!(result, "|")?;
}
}
RegexType::Paren => {
- write!(f, ")")?;
+ write!(result, ")")?;
}
RegexType::Empty => {
stack.extend(
@@ -236,11 +351,17 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
.rev(),
);
}
- RegexType::Lit(label) => write!(f, "{label}")?,
+ RegexType::Lit(label) => write!(result, "{}", f(label))?,
}
}
- Ok(())
+ Ok(result)
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultRegex<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{}", self.to_string_with(|t| format!("{t}"))?)
}
}
@@ -278,9 +399,20 @@ impl<T: GraphLabel + Display> Graph for DefaultRegex<T> {
fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
self.graph.has_edge(source, target)
}
+
+ #[inline]
+ fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) {
+ unimplemented!()
+ }
}
-impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
+impl<T: GraphLabel + Display + Debug> Regex<RegexType<T>> for DefaultRegex<T> {
+ /// Return the root of the regular expression.
+ #[inline]
+ fn root(&self) -> Option<usize> {
+ self.root
+ }
+
#[inline]
fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> {
self.types
@@ -291,8 +423,10 @@ impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
}
/// An error type for holding parsing errors.
-#[derive(Debug)]
+#[derive(Debug, Copy, Clone)]
pub enum ParseError {
+ /// A cycle is encountered.
+ Cycle,
/// Encounter an invalid state.
Invalid,
/// An error from graph operations.
@@ -354,23 +488,77 @@ impl Display for ParseDirection {
}
}
-impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
+/// A default recursive descent parser for regular expressions of
+/// terminals or non-terminals.
+#[derive(Debug, Clone)]
+pub struct DefaultRegParser<T: GraphLabel + Display> {
+ ter_map: HashMap<String, usize>,
+ non_map: HashMap<String, usize>,
+ _phantom: PhantomData<T>,
+}
+
+impl<T: GraphLabel + Display> DefaultRegParser<T> {
+ /// Query if a terminal or a non-terminal is already found.
+ ///
+ /// If found, return the associated index of the terminal or
+ /// non-terminal.
+ pub fn query(&self, tnt: &str, terminal_p: bool) -> Option<usize> {
+ if terminal_p {
+ self.ter_map.get(tnt).copied()
+ } else {
+ self.non_map.get(tnt).copied()
+ }
+ }
+
+ /// Add a terminal or a non-terminal.
+ pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) {
+ if terminal_p {
+ let ter_len = self.ter_map.len();
+
+ self.ter_map.entry(tnt.to_owned()).or_insert(ter_len);
+ } else {
+ let non_len = self.non_map.len();
+
+ self.non_map.entry(tnt.to_owned()).or_insert(non_len);
+ }
+ }
+}
+
+impl<T: GraphLabel + Display> Default for DefaultRegParser<T> {
+ fn default() -> Self {
+ Self {
+ ter_map: Default::default(),
+ non_map: Default::default(),
+ _phantom: PhantomData,
+ }
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegParser<T> {
type Label = RegexType<T>;
- type Regex = Self;
+ type Regex = DefaultRegex<T>;
type Error = ParseError;
- type Aux = ParseDirection;
+ type Inter = ParseDirection;
- type Scanner<'a> =
- Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>;
+ type Scanner<'a, 'b> = Box<
+ dyn FnMut(
+ &'b Self,
+ &'a str,
+ ) -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>,
+ > where RegexType<T>:'b;
- fn parse<'a>(
+ fn parse<'a, 'b>(
+ &'b self,
mut input: &'a str,
- mut scanner: Self::Scanner<'a>,
+ mut scanner: Self::Scanner<'a, 'b>,
post_p: bool,
- ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> {
+ ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error>
+ where
+ Self::Label: 'b,
+ {
use ParseDirection::*;
use RegexType::*;
@@ -380,7 +568,7 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
// Classifies the input into a sequence of tokens with
// directions.
while !input.is_empty() {
- if let Some((len, label, direction)) = scanner(input)? {
+ if let Some((len, label, direction)) = scanner(self, input)? {
if len == 0 {
break;
}
@@ -523,7 +711,12 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
let graph = list_of_children.into();
- let result = DefaultRegex { graph, types };
+ // Check there are no cycles
+ let result = DefaultRegex::new(graph, types).map_err(|e| match e {
+ Error::Graph(ge) => ParseError::Graph(ge),
+ Error::Cycle => ParseError::Cycle,
+ _ => unreachable!(),
+ })?;
Ok(Some((result, input)))
}
@@ -533,9 +726,17 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
mod test_des_rec {
use super::*;
- fn test_scanner(
- input: &str,
- ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+ use crate::desrec::DesRec;
+
+ #[allow(dead_code, unused)]
+ fn test_scanner<'a, 'b, T>(
+ parser: &'b DefaultRegParser<T>,
+ input: &'a str,
+ ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError>
+ where
+ T: GraphLabel + Display + Debug,
+ T: 'b,
+ {
use ParseDirection::*;
use RegexType::*;
@@ -557,19 +758,57 @@ mod test_des_rec {
#[test]
fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> {
- let input_string = "a*b?c+|(d*| +)?".to_owned();
+ let input_string = "(ade)*b?c+|(d*| +)?".to_owned();
+
+ let parser: DefaultRegParser<char> = Default::default();
if let Some((regex, remain)) =
- DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)?
+ DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
{
println!("regex = {regex}");
println!("remain = {remain}");
println!("regex length = {}", regex.len());
+ let parents = regex.parents_array()?;
+
+ println!("parents = {parents:?}");
+
Ok(())
} else {
unreachable!()
}
}
+
+ #[test]
+ fn test_display() -> Result<(), Box<dyn std::error::Error>> {
+ use graph::builder::Builder;
+ use RegexType::*;
+
+ let mut builder = graph::adlist::ALGBuilder::default();
+ let mut types: Vec<RegexType<usize>> = Vec::with_capacity(4);
+
+ types.push(Kleene);
+ builder.add_vertex();
+
+ types.push(Lit(0));
+ builder.add_vertex();
+ builder.add_edge(0, 1, ())?;
+
+ types.push(Lit(1));
+ builder.add_vertex();
+ builder.add_edge(0, 2, ())?;
+
+ types.push(Lit(2));
+ builder.add_vertex();
+ builder.add_edge(0, 3, ())?;
+
+ let graph = builder.build();
+
+ let regex = DefaultRegex::new(graph, types)?;
+
+ println!("regex = {regex}");
+
+ Ok(())
+ }
}