summaryrefslogtreecommitdiff
path: root/nfa/src/default/regex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'nfa/src/default/regex.rs')
-rw-r--r--nfa/src/default/regex.rs575
1 files changed, 575 insertions, 0 deletions
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs
new file mode 100644
index 0000000..a60f801
--- /dev/null
+++ b/nfa/src/default/regex.rs
@@ -0,0 +1,575 @@
+//! This file provides a default implementation of Regex.
+
+use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel};
+
+use crate::{desrec::DesRec, error::Error, Regex};
+
+use receme::{algebra::Algebra, catana::Cata};
+
+use std::fmt::{Debug, Display};
+
+/// The type of a node in a regular expression.
+///
+/// # Example
+///
+/// If a node has type "Kleene", this means it represents a star
+/// construct in a regular expression, and its children are the
+/// contents of the star.
+///
+/// # Note
+///
+/// There is no "concatenation" node type. A concatenation of two
+/// nodes is represented as the two nodes being successive children in
+/// their common parent node.
+///
+/// This is possible because a regular expression has a root node.
+/// For the sake of convenience, the root node has type "Or".
+#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
+pub enum RegexType<T: GraphLabel + Display> {
+ /// A star node is a node with an arbitrary number of repetitions.
+ Kleene,
+ /// A plus node is a node with at least one repetition: a+ equals
+ /// aa*
+ Plus,
+ /// An optional node
+ Optional,
+ /// An or node means an alternation of its children.
+ Or,
+ /// A paren node represents a parenthesis.
+ Paren,
+ /// An empty node
+ #[default]
+ Empty,
+ /// A literal node
+ Lit(T),
+}
+
+/// A default implementation of regular expressions.
+#[derive(Debug)]
+pub struct DefaultRegex<T: GraphLabel + Display> {
+ /// The underlying graph is stored using adjacency lists.
+ graph: ALGraph,
+ /// The types of the underlying nodes.
+ types: Vec<RegexType<T>>,
+}
+
+impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
+ fn default() -> Self {
+ Self {
+ graph: Default::default(),
+ types: Default::default(),
+ }
+ }
+}
+
+impl<T: GraphLabel + Display> DefaultRegex<T> {
+ /// Return the number of elements in this regular expression,
+ /// counting nodes.
+ pub fn len(&self) -> usize {
+ self.types.len()
+ }
+
+ /// Return true if and only if this regular expression has no
+ /// nodes.
+ pub fn is_empty(&self) -> bool {
+ self.types.is_empty()
+ }
+
+ /// Add a node as the child of an existing node or as a root.
+ pub fn add_node(
+ &mut self,
+ label: RegexType<T>,
+ parent: Option<usize>,
+ ) -> Result<(), ParseError> {
+ self.graph.extend(parent.iter().copied())?;
+ self.types.push(label);
+
+ Ok(())
+ }
+}
+
+// REVIEW: This may not be needed.
+impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
+where
+ A: Algebra<T, Vec<T>>,
+{
+ fn cata(self, mut alg: A) -> T {
+ let mut results: Vec<Option<T>> = std::iter::repeat_with(Default::default)
+ .take(self.len())
+ .collect();
+
+ for index in 0..=self.len() {
+ let algebra_result = {
+ let results_of_children: Vec<T> = self
+ .graph
+ .children_of(index)
+ .unwrap()
+ .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap())
+ .collect();
+
+ alg(results_of_children)
+ };
+
+ // Artificially use this value to satisfy the compiler.
+ let _ = std::mem::replace(&mut results[index], Some(algebra_result));
+ }
+
+ std::mem::replace(&mut results[0], None).unwrap()
+ }
+}
+
+impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ #[derive(Copy, Clone)]
+ enum StackElement {
+ Seen(usize),
+ Unseen(usize),
+ }
+
+ impl StackElement {
+ fn index(&self) -> usize {
+ match self {
+ Seen(index) => *index,
+ Unseen(index) => *index,
+ }
+ }
+
+ fn is_seen(&self) -> bool {
+ match self {
+ Seen(_) => true,
+ Unseen(_) => false,
+ }
+ }
+ }
+
+ use StackElement::{Seen, Unseen};
+
+ let mut stack: Vec<StackElement> = Vec::new();
+ let mut types = self.types.clone();
+ types.push(RegexType::Paren);
+
+ stack.push(Unseen(0));
+
+ while let Some(top) = stack.pop() {
+ let node_type = types[top.index()];
+
+ // TODO: Do not use unwrap here.
+
+ match node_type {
+ RegexType::Kleene => {
+ if !top.is_seen() {
+ stack.push(Seen(top.index()));
+ stack.extend(
+ self.graph
+ .children_of(top.index())
+ .unwrap()
+ .map(Unseen)
+ .rev(),
+ );
+ } else {
+ write!(f, "*")?;
+ }
+ }
+ RegexType::Plus => {
+ if !top.is_seen() {
+ stack.push(Seen(top.index()));
+ stack.extend(
+ self.graph
+ .children_of(top.index())
+ .unwrap()
+ .map(Unseen)
+ .rev(),
+ );
+ } else {
+ write!(f, "+")?;
+ }
+ }
+ RegexType::Optional => {
+ if !top.is_seen() {
+ stack.push(Seen(top.index()));
+ stack.extend(
+ self.graph
+ .children_of(top.index())
+ .unwrap()
+ .map(Unseen)
+ .rev(),
+ );
+ } else {
+ write!(f, "?")?;
+ }
+ }
+ RegexType::Or => {
+ if !top.is_seen() {
+ write!(f, "(")?;
+
+ let len = self.len();
+
+ stack.push(Unseen(types.len() - 1));
+
+ for (child_index, child) in self
+ .graph
+ .children_of(top.index())
+ .unwrap()
+ .enumerate()
+ .rev()
+ {
+ if child_index != len - 1 && child_index != 0 {
+ stack.push(Unseen(child));
+ stack.push(Seen(top.index()));
+ } else {
+ stack.push(Unseen(child));
+ }
+ }
+ } else {
+ write!(f, "|")?;
+ }
+ }
+ RegexType::Paren => {
+ write!(f, ")")?;
+ }
+ RegexType::Empty => {
+ stack.extend(
+ self.graph
+ .children_of(top.index())
+ .unwrap()
+ .map(Unseen)
+ .rev(),
+ );
+ }
+ RegexType::Lit(label) => write!(f, "{label}")?,
+ }
+ }
+
+ Ok(())
+ }
+}
+
+impl<T: GraphLabel + Display> Graph for DefaultRegex<T> {
+ type Iter<'a> = <ALGraph as Graph>::Iter<'a>
+ where
+ Self: 'a;
+
+ #[inline]
+ fn is_empty(&self) -> bool {
+ self.graph.is_empty()
+ }
+
+ #[inline]
+ fn nodes_len(&self) -> usize {
+ self.graph.nodes_len()
+ }
+
+ #[inline]
+ fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GError> {
+ self.graph.children_of(node_id)
+ }
+
+ #[inline]
+ fn degree(&self, node_id: usize) -> Result<usize, GError> {
+ self.graph.degree(node_id)
+ }
+
+ #[inline]
+ fn is_empty_node(&self, node_id: usize) -> Result<bool, GError> {
+ self.graph.is_empty_node(node_id)
+ }
+
+ #[inline]
+ fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
+ self.graph.has_edge(source, target)
+ }
+}
+
+impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
+ #[inline]
+ fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> {
+ self.types
+ .get(node_id)
+ .copied()
+ .ok_or(Error::UnknownNode(node_id))
+ }
+}
+
+/// An error type for holding parsing errors.
+#[derive(Debug)]
+pub enum ParseError {
+ /// Encounter an invalid state.
+ Invalid,
+ /// An error from graph operations.
+ Graph(GError),
+ /// Encounter an empty stack.
+ EmptyStack,
+ /// Encounter a non-single stack at the end.
+ NonSingleStack,
+ /// Encounter a stack whose element is out of bounds.
+ ///
+ /// The first component is the stack element, while the second the
+ /// bound.
+ InvalidStack(usize, usize),
+ /// Encounter a repetition construct without a preceding element.
+ InvalidRepetition(usize),
+ /// Invalid character
+ InvalidCharacter(char),
+}
+
+impl Display for ParseError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "{self:?}")
+ }
+}
+
+impl std::error::Error for ParseError {}
+
+impl From<GError> for ParseError {
+ fn from(ge: GError) -> Self {
+ Self::Graph(ge)
+ }
+}
+
+/// The direction of parsing.
+///
+/// This means whether we want to stay at the same level of
+/// parent-child hierarchy, or to become the child, or to climb back
+/// to the last parent.
+#[derive(Debug, Copy, Clone, Default)]
+pub enum ParseDirection {
+ /// Climb back to the last parent.
+ Up,
+ /// Stay at the same level in the hierarchy.
+ #[default]
+ Right,
+ /// Become the child.
+ Down,
+}
+
+impl Display for ParseDirection {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ let direction = match self {
+ ParseDirection::Up => "↑",
+ ParseDirection::Right => "→",
+ ParseDirection::Down => "↓",
+ };
+
+ write!(f, "{direction}")
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
+ type Label = RegexType<T>;
+
+ type Regex = Self;
+
+ type Error = ParseError;
+
+ type Aux = ParseDirection;
+
+ type Scanner<'a> =
+ Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>;
+
+ fn parse<'a>(
+ mut input: &'a str,
+ mut scanner: Self::Scanner<'a>,
+ post_p: bool,
+ ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> {
+ use ParseDirection::*;
+ use RegexType::*;
+
+ let mut intermediate_stack: Vec<(RegexType<T>, ParseDirection)> =
+ Vec::with_capacity(input.len());
+
+ // Classifies the input into a sequence of tokens with
+ // directions.
+ while !input.is_empty() {
+ if let Some((len, label, direction)) = scanner(input)? {
+ if len == 0 {
+ break;
+ }
+
+ input = &input[len..];
+
+ intermediate_stack.push((label, direction));
+
+ // If we encounter an opening parenthesis, we add
+ // another auxiliary instruction.
+ if matches!((label, direction), (Or, Down)) {
+ intermediate_stack.push((Empty, Down));
+ }
+ } else {
+ break;
+ }
+ }
+
+ let inter_len = intermediate_stack.len();
+
+ let mut parents_stack: Vec<usize> = Vec::with_capacity(inter_len + 2);
+
+ parents_stack.push(0);
+ parents_stack.push(1);
+
+ let mut list_of_children: Vec<Vec<usize>> = std::iter::repeat_with(Vec::new)
+ .take(inter_len + 2)
+ .collect();
+
+ list_of_children[0].push(1);
+
+ let mut types: Vec<RegexType<T>> = vec![Or, Empty];
+
+ types.extend(intermediate_stack.iter().map(|(label, _direction)| *label));
+
+ // Converts the sequence of tokens and directions into a
+ // regular expression.
+ for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() {
+ let mut parent: usize;
+ let mut parent_children: &mut Vec<usize>;
+
+ if let Some(parent_stack_parent) = parents_stack.last().copied() {
+ parent = parent_stack_parent;
+
+ match list_of_children.get_mut(parent) {
+ Some(stack_parent_children) => {
+ parent_children = stack_parent_children;
+
+ match (label, direction) {
+ (Paren, Up) => {
+ // a closing parenthesis does not need
+ // to be counted as a child
+ parents_stack.pop();
+ // a closing parenthesis jumps out of
+ // two levels at once
+ parents_stack.pop();
+ }
+ (Empty, Up) => {
+ // an upwards pipe
+
+ // first add the current node to the parent of the parent
+ parents_stack.pop();
+
+ if let Some(parent_stack_parent) = parents_stack.last().copied() {
+ parent = parent_stack_parent;
+
+ if let Some(stack_parent_children) =
+ list_of_children.get_mut(parent)
+ {
+ parent_children = stack_parent_children;
+
+ parent_children.push(index + 2);
+ } else {
+ return Err(ParseError::InvalidStack(
+ parent,
+ inter_len + 2,
+ ));
+ }
+ } else {
+ return Err(ParseError::EmptyStack);
+ }
+
+ // then make the current node the new parent
+ parents_stack.push(index + 2);
+ }
+ (_, Up) => {
+ parents_stack.pop();
+ parent_children.push(index + 2);
+ }
+ (_, Right) => {
+ parent_children.push(index + 2);
+ }
+ (_, Down) => {
+ parents_stack.push(index + 2);
+ parent_children.push(index + 2);
+ }
+ }
+ }
+ None => return Err(ParseError::InvalidStack(parent, inter_len)),
+ }
+ } else {
+ // There are unbalanced closing parentheses.
+ return Err(ParseError::EmptyStack);
+ }
+
+ // A special handling of repetition constructs as postfix
+ // operators: it swaps with the preceding element.
+ if post_p {
+ match label {
+ Kleene | Plus | Optional => {
+ // remove the current node from the parent
+ parent_children.pop();
+
+ if let Some(preceding) = parent_children.last().copied() {
+ list_of_children.swap(preceding, index + 2);
+
+ types.swap(preceding, index + 2);
+
+ match list_of_children.get_mut(preceding) {
+ Some(preceding_children) => {
+ preceding_children.push(index + 2);
+ }
+ None => {
+ return Err(ParseError::InvalidStack(preceding, inter_len + 2))
+ }
+ }
+ } else {
+ return Err(ParseError::InvalidRepetition(index));
+ }
+ }
+ _ => {}
+ }
+ }
+ }
+
+ // There are unbalanced opening parentheses.
+ if parents_stack.len() != 2 {
+ return Err(ParseError::NonSingleStack);
+ }
+
+ let graph = list_of_children.into();
+
+ let result = DefaultRegex { graph, types };
+
+ Ok(Some((result, input)))
+ }
+}
+
+#[cfg(test)]
+mod test_des_rec {
+ use super::*;
+
+ fn test_scanner(
+ input: &str,
+ ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+ use ParseDirection::*;
+ use RegexType::*;
+
+ if let Some(first) = input.chars().next() {
+ match first {
+ '*' => Ok(Some((1, Kleene, Right))),
+ '+' => Ok(Some((1, Plus, Right))),
+ '?' => Ok(Some((1, Optional, Right))),
+ '|' => Ok(Some((1, Empty, Up))),
+ '(' => Ok(Some((1, Or, Down))),
+ ')' => Ok(Some((1, Paren, Up))),
+ ' '..='~' => Ok(Some((1, Lit(first), Right))),
+ _ => Err(ParseError::InvalidCharacter(first)),
+ }
+ } else {
+ Ok(None)
+ }
+ }
+
+ #[test]
+ fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> {
+ let input_string = "a*b?c+|(d*| +)?".to_owned();
+
+ if let Some((regex, remain)) =
+ DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)?
+ {
+ println!("regex = {regex}");
+ println!("remain = {remain}");
+
+ println!("regex length = {}", regex.len());
+
+ Ok(())
+ } else {
+ unreachable!()
+ }
+ }
+}