From 9f1c88b863e247da3cd60d2792a7a13b18e25e53 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 14 Dec 2022 23:48:22 +0800 Subject: a temporary check point just to save things in a commit --- THANKS | 1 + configure.ac | 4 +- find-version.sh | 8 +- graph/Makefile.am | 15 +- graph/src/adlist.rs | 8 + graph/src/labelled.rs | 2 +- nfa/Cargo.toml | 1 + nfa/Makefile.am | 19 ++ nfa/src/default/mod.rs | 121 +--------- nfa/src/default/nfa.rs | 120 ++++++++++ nfa/src/default/regex.rs | 575 +++++++++++++++++++++++++++++++++++++++++++++++ nfa/src/desrec.rs | 50 +++++ nfa/src/error.rs | 6 + nfa/src/lib.rs | 8 +- receme/src/functor.rs | 8 + repcore/src/grammar.rs | 59 +++++ repcore/src/lib.rs | 6 + repcore/src/plan.org | 59 +++++ 18 files changed, 940 insertions(+), 130 deletions(-) create mode 100644 nfa/Makefile.am create mode 100644 nfa/src/default/nfa.rs create mode 100644 nfa/src/default/regex.rs create mode 100644 nfa/src/desrec.rs create mode 100644 repcore/src/grammar.rs create mode 100644 repcore/src/plan.org diff --git a/THANKS b/THANKS index e69de29..9045a0b 100644 --- a/THANKS +++ b/THANKS @@ -0,0 +1 @@ +TODO: Thank many people here. \ No newline at end of file diff --git a/configure.ac b/configure.ac index fbf987e..fe765e6 100644 --- a/configure.ac +++ b/configure.ac @@ -2,8 +2,6 @@ AC_PREREQ([2.60]) AC_INIT([REP], m4_esyscmd([./find-version.sh]), [durand@jsdurand.xyz]) -dnl patsubst(m4_esyscmd([./find-version.sh]), ` "') - AC_COPYRIGHT([This package is covered by GPL v3.]) AC_CONFIG_AUX_DIR([build-aux]) @@ -21,7 +19,7 @@ AS_IF([test "$CARGO" = "notfound"], [AC_MSG_ERROR([cargo is required])]) AC_PATH_PROG([RUSTC], [rustc], [notfound]) AS_IF([test "$RUSTC" = "notfound"], [AC_MSG_ERROR([rustc is required])]) -AC_CONFIG_FILES([Makefile graph/Makefile]) +AC_CONFIG_FILES([Makefile graph/Makefile nfa/Makefile]) AC_OUTPUT diff --git a/find-version.sh b/find-version.sh index 4ac390e..99aa538 100755 --- a/find-version.sh +++ b/find-version.sh @@ -2,18 +2,24 @@ ":"; exec emacs --quick --script "$0" -- "$@" # -*- mode: emacs-lisp; lexical-binding: t; -*- (with-temp-buffer + ;; the version information is stored in the Cargo.toml file. 
(insert-file-contents "Cargo.toml") (goto-char (point-min)) (cond + ;; search for the version assignment statement ((search-forward "version =" nil t) - (re-search-forward " *" (line-end-position) t) + ;; skip spaces + (re-search-forward "[[:space:]]*" (line-end-position) t) + ;; check it starts with a double quote (cond ((= (char-after) 34)) ((error "Invalid syntax at %d" (point)))) (let ((end (line-end-position))) + ;; and check it ends with a double quote as well (cond ((= (char-before end) 34)) ((error "Invalid syntax at %d" (1- end)))) + ;; then print the bare version string out without delimiters (princ (buffer-substring-no-properties (1+ (point)) (1- end))))) diff --git a/graph/Makefile.am b/graph/Makefile.am index 776b911..623572a 100644 --- a/graph/Makefile.am +++ b/graph/Makefile.am @@ -1,12 +1,19 @@ -.PHONY: dev rel +.PHONY: dev rel clean check all: dev dev: - @CARGO@ build + @echo "cargo build" + @@CARGO@ build rel: - @CARGO@ build --release + @echo "cargo build --release" + @@CARGO@ build --release clean: - @CARGO@ clean + @echo "cargo clean" + @@CARGO@ clean + +check: + @echo "cargo clippy" + @@CARGO@ clippy diff --git a/graph/src/adlist.rs b/graph/src/adlist.rs index c16ceb2..18ad770 100644 --- a/graph/src/adlist.rs +++ b/graph/src/adlist.rs @@ -102,6 +102,14 @@ impl ExtGraph for ALGraph { } } +// TODO: Add a way to build a graph by its raw adjacency list representation. +impl From>> for ALGraph { + fn from(adlist: Vec>) -> Self { + let nodes: Vec = adlist.iter().cloned().map(ALNode::new).collect(); + Self { nodes } + } +} + #[cfg(test)] mod algraph_test { use super::*; diff --git a/graph/src/labelled.rs b/graph/src/labelled.rs index 1cb2461..d02e301 100644 --- a/graph/src/labelled.rs +++ b/graph/src/labelled.rs @@ -144,7 +144,7 @@ impl<'a> Iterator for LabelIndexIter<'a> { #[inline] fn next(&mut self) -> Option { - self.iter.as_mut().map(|iterator| iterator.next()).flatten() + self.iter.as_mut().and_then(|iterator| iterator.next()) } #[inline] diff --git a/nfa/Cargo.toml b/nfa/Cargo.toml index b1387b6..7f48760 100644 --- a/nfa/Cargo.toml +++ b/nfa/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [dependencies] graph = { path = "../graph", optional = true } +receme = { path = "../receme" } [features] default = ["default-graph"] diff --git a/nfa/Makefile.am b/nfa/Makefile.am new file mode 100644 index 0000000..623572a --- /dev/null +++ b/nfa/Makefile.am @@ -0,0 +1,19 @@ +.PHONY: dev rel clean check + +all: dev + +dev: + @echo "cargo build" + @@CARGO@ build + +rel: + @echo "cargo build --release" + @@CARGO@ build --release + +clean: + @echo "cargo clean" + @@CARGO@ clean + +check: + @echo "cargo clippy" + @@CARGO@ clippy diff --git a/nfa/src/default/mod.rs b/nfa/src/default/mod.rs index 805540b..b9ee398 100644 --- a/nfa/src/default/mod.rs +++ b/nfa/src/default/mod.rs @@ -1,123 +1,10 @@ //! This file provides a structure that implements the trait -//! [`NFA`][super::Nfa]. -//! -//! It is used as the default implementation. +//! [`NFA`][super::Nfa] and another that umplements the trait +//! [`Regex`][super::Regex]. -use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; +pub mod nfa; -use super::{error::Error, Nfa, Regex}; - -// TODO: Store the regular expression in the NFA as well. -// -// The current focus of the project is to understand the growth rate -// of the algorithm, to know whether I made a mistake in the previous -// iteration of the implementation, or the algorithm is not as fast as -// the author estimated, which is not quite likely, of course. 
-// -// Thus I shall establish a friendly interface that allows me to view -// and debug the atomic languages and the languages, transparently. - -#[non_exhaustive] -#[derive(Debug)] -/// Default NFA implementation. -pub struct DefaultNFA { - graph: DLGraph, -} - -impl Default for DefaultNFA { - fn default() -> Self { - let graph = Default::default(); - Self { graph } - } -} - -impl Graph for DefaultNFA { - type Iter<'a> = as Graph>::Iter<'a> where T: 'a; - - #[inline] - fn is_empty(&self) -> bool { - self.graph.is_empty() - } - - #[inline] - fn nodes_len(&self) -> usize { - self.graph.nodes_len() - } - - #[inline] - fn children_of(&self, node_id: usize) -> Result, GError> { - self.graph.children_of(node_id) - } - - #[inline] - fn degree(&self, node_id: usize) -> Result { - self.graph.degree(node_id) - } - - #[inline] - fn is_empty_node(&self, node_id: usize) -> Result { - self.graph.is_empty_node(node_id) - } - - #[inline] - fn has_edge(&self, source: usize, target: usize) -> Result { - self.graph.has_edge(source, target) - } -} - -impl LabelGraph for DefaultNFA { - type Iter<'a> = as LabelGraph>::Iter<'a> where T: 'a; - - type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; - - // TODO: Return the label from the contained regular language. - #[inline] - fn vertex_label(&self, node_id: usize) -> Result, GError> { - if self.has_node(node_id) { - todo!() - } else { - Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) - } - } - - #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result, GError> { - self.graph.edge_label(source, target) - } - - #[inline] - fn find_children_with_label( - &self, - node_id: usize, - label: &T, - ) -> Result<>::Iter<'_>, GError> { - self.graph.find_children_with_label(node_id, label) - } - - #[inline] - fn labels_of(&self, node_id: usize) -> Result, GError> { - self.graph.labels_of(node_id) - } -} - -impl Nfa for DefaultNFA { - #[allow(unused)] - fn to_nfa(regex: impl Regex) -> Self { - todo!() - } - - fn remove_epsilon(&mut self) -> Result<(), Error> { - todo!() - } - - fn remove_dead(&mut self) -> Result<(), Error> { - todo!() - } - - fn nulling(&mut self) -> Result<(), Error> { - todo!() - } -} +pub mod regex; #[cfg(test)] mod default_nfa_test {} diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs new file mode 100644 index 0000000..3c2bd83 --- /dev/null +++ b/nfa/src/default/nfa.rs @@ -0,0 +1,120 @@ +//! This file provides a default implementation of NFA. + +// TODO: Store the regular expression in the NFA as well. +// +// The current focus of the project is to understand the growth rate +// of the algorithm, to know whether I made a mistake in the previous +// iteration of the implementation, or the algorithm is not as fast as +// the author estimated, which is not quite likely, of course. +// +// Thus I shall establish a friendly interface that allows me to view +// and debug the atomic languages and the languages, transparently. + +use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph}; + +use crate::{error::Error, Nfa, Regex}; + +use std::fmt::Display; + +#[non_exhaustive] +#[derive(Debug)] +/// Default NFA implementation. 
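+///
+/// This wraps a [`DLGraph`] and mostly forwards the [`Graph`] and
+/// [`LabelGraph`] methods to it; `vertex_label` and the NFA-specific
+/// operations below are still left as `todo!()` at this check point.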
+pub struct DefaultNFA { + graph: DLGraph, +} + +impl Default for DefaultNFA { + fn default() -> Self { + Self { + graph: Default::default(), + } + } +} + +impl Graph for DefaultNFA { + type Iter<'a> = as Graph>::Iter<'a> where T: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result { + self.graph.has_edge(source, target) + } +} + +impl LabelGraph for DefaultNFA { + type Iter<'a> = as LabelGraph>::Iter<'a> where T: 'a; + + type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; + + // TODO: Return the label from the contained regular language. + #[inline] + fn vertex_label(&self, node_id: usize) -> Result, GError> { + if self.has_node(node_id) { + todo!() + } else { + Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) + } + } + + #[inline] + fn edge_label(&self, source: usize, target: usize) -> Result, GError> { + self.graph.edge_label(source, target) + } + + #[inline] + fn find_children_with_label( + &self, + node_id: usize, + label: &T, + ) -> Result<>::Iter<'_>, GError> { + self.graph.find_children_with_label(node_id, label) + } + + #[inline] + fn labels_of(&self, node_id: usize) -> Result, GError> { + self.graph.labels_of(node_id) + } +} + +impl Nfa for DefaultNFA { + #[allow(unused)] + fn to_nfa(regex: impl Regex) -> Self { + todo!() + } + + fn remove_epsilon(&mut self) -> Result<(), Error> { + todo!() + } + + fn remove_dead(&mut self) -> Result<(), Error> { + todo!() + } + + fn nulling(&mut self) -> Result<(), Error> { + todo!() + } +} diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs new file mode 100644 index 0000000..a60f801 --- /dev/null +++ b/nfa/src/default/regex.rs @@ -0,0 +1,575 @@ +//! This file provides a default implementation of Regex. + +use graph::{error::Error as GError, ALGraph, ExtGraph, Graph, GraphLabel}; + +use crate::{desrec::DesRec, error::Error, Regex}; + +use receme::{algebra::Algebra, catana::Cata}; + +use std::fmt::{Debug, Display}; + +/// The type of a node in a regular expression. +/// +/// # Example +/// +/// If a node has type "Kleene", this means it represents a star +/// construct in a regular expression, and its children are the +/// contents of the star. +/// +/// # Note +/// +/// There is no "concatenation" node type. A concatenation of two +/// nodes is represented as the two nodes being successive children in +/// their common parent node. +/// +/// This is possible because a regular expression has a root node. +/// For the sake of convenience, the root node has type "Or". +#[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] +pub enum RegexType { + /// A star node is a node with an arbitrary number of repetitions. + Kleene, + /// A plus node is a node with at least one repetition: a+ equals + /// aa* + Plus, + /// An optional node + Optional, + /// An or node means an alternation of its children. + Or, + /// A paren node represents a parenthesis. + Paren, + /// An empty node + #[default] + Empty, + /// A literal node + Lit(T), +} + +/// A default implementation of regular expressions. 
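+///
+/// The expression is stored as a tree flattened into two parallel
+/// structures: an adjacency-list graph ([`ALGraph`]) recording which
+/// node is a child of which, and a vector recording the
+/// [`RegexType`] of the node with the corresponding index.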
+#[derive(Debug)] +pub struct DefaultRegex { + /// The underlying graph is stored using adjacency lists. + graph: ALGraph, + /// The types of the underlying nodes. + types: Vec>, +} + +impl Default for DefaultRegex { + fn default() -> Self { + Self { + graph: Default::default(), + types: Default::default(), + } + } +} + +impl DefaultRegex { + /// Return the number of elements in this regular expression, + /// counting nodes. + pub fn len(&self) -> usize { + self.types.len() + } + + /// Return true if and only if this regular expression has no + /// nodes. + pub fn is_empty(&self) -> bool { + self.types.is_empty() + } + + /// Add a node as the child of an existing node or as a root. + pub fn add_node( + &mut self, + label: RegexType, + parent: Option, + ) -> Result<(), ParseError> { + self.graph.extend(parent.iter().copied())?; + self.types.push(label); + + Ok(()) + } +} + +// REVIEW: This may not be needed. +impl Cata, A> for &DefaultRegex +where + A: Algebra>, +{ + fn cata(self, mut alg: A) -> T { + let mut results: Vec> = std::iter::repeat_with(Default::default) + .take(self.len()) + .collect(); + + for index in 0..=self.len() { + let algebra_result = { + let results_of_children: Vec = self + .graph + .children_of(index) + .unwrap() + .map(|child_index| std::mem::replace(&mut results[child_index], None).unwrap()) + .collect(); + + alg(results_of_children) + }; + + // Artificially use this value to satisfy the compiler. + let _ = std::mem::replace(&mut results[index], Some(algebra_result)); + } + + std::mem::replace(&mut results[0], None).unwrap() + } +} + +impl Display for DefaultRegex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Copy, Clone)] + enum StackElement { + Seen(usize), + Unseen(usize), + } + + impl StackElement { + fn index(&self) -> usize { + match self { + Seen(index) => *index, + Unseen(index) => *index, + } + } + + fn is_seen(&self) -> bool { + match self { + Seen(_) => true, + Unseen(_) => false, + } + } + } + + use StackElement::{Seen, Unseen}; + + let mut stack: Vec = Vec::new(); + let mut types = self.types.clone(); + types.push(RegexType::Paren); + + stack.push(Unseen(0)); + + while let Some(top) = stack.pop() { + let node_type = types[top.index()]; + + // TODO: Do not use unwrap here. 
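+
+            // The `Seen`/`Unseen` tags drive a non-recursive
+            // traversal: an `Unseen` node has not been handled yet,
+            // so its children are pushed (reversed, so that they are
+            // popped in their original order), preceded where
+            // necessary by a `Seen` copy of the node itself; when a
+            // `Seen` node is popped, its children have already been
+            // written, and only its own operator (`*`, `+`, `?` or
+            // `|`) remains to be printed.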
+ + match node_type { + RegexType::Kleene => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "*")?; + } + } + RegexType::Plus => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "+")?; + } + } + RegexType::Optional => { + if !top.is_seen() { + stack.push(Seen(top.index())); + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } else { + write!(f, "?")?; + } + } + RegexType::Or => { + if !top.is_seen() { + write!(f, "(")?; + + let len = self.len(); + + stack.push(Unseen(types.len() - 1)); + + for (child_index, child) in self + .graph + .children_of(top.index()) + .unwrap() + .enumerate() + .rev() + { + if child_index != len - 1 && child_index != 0 { + stack.push(Unseen(child)); + stack.push(Seen(top.index())); + } else { + stack.push(Unseen(child)); + } + } + } else { + write!(f, "|")?; + } + } + RegexType::Paren => { + write!(f, ")")?; + } + RegexType::Empty => { + stack.extend( + self.graph + .children_of(top.index()) + .unwrap() + .map(Unseen) + .rev(), + ); + } + RegexType::Lit(label) => write!(f, "{label}")?, + } + } + + Ok(()) + } +} + +impl Graph for DefaultRegex { + type Iter<'a> = ::Iter<'a> + where + Self: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.graph.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.graph.nodes_len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result, GError> { + self.graph.children_of(node_id) + } + + #[inline] + fn degree(&self, node_id: usize) -> Result { + self.graph.degree(node_id) + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result { + self.graph.is_empty_node(node_id) + } + + #[inline] + fn has_edge(&self, source: usize, target: usize) -> Result { + self.graph.has_edge(source, target) + } +} + +impl Regex> for DefaultRegex { + #[inline] + fn vertex_label(&self, node_id: usize) -> Result, Error> { + self.types + .get(node_id) + .copied() + .ok_or(Error::UnknownNode(node_id)) + } +} + +/// An error type for holding parsing errors. +#[derive(Debug)] +pub enum ParseError { + /// Encounter an invalid state. + Invalid, + /// An error from graph operations. + Graph(GError), + /// Encounter an empty stack. + EmptyStack, + /// Encounter a non-single stack at the end. + NonSingleStack, + /// Encounter a stack whose element is out of bounds. + /// + /// The first component is the stack element, while the second the + /// bound. + InvalidStack(usize, usize), + /// Encounter a repetition construct without a preceding element. + InvalidRepetition(usize), + /// Invalid character + InvalidCharacter(char), +} + +impl Display for ParseError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + +impl std::error::Error for ParseError {} + +impl From for ParseError { + fn from(ge: GError) -> Self { + Self::Graph(ge) + } +} + +/// The direction of parsing. +/// +/// This means whether we want to stay at the same level of +/// parent-child hierarchy, or to become the child, or to climb back +/// to the last parent. +#[derive(Debug, Copy, Clone, Default)] +pub enum ParseDirection { + /// Climb back to the last parent. + Up, + /// Stay at the same level in the hierarchy. + #[default] + Right, + /// Become the child. 
+ Down, +} + +impl Display for ParseDirection { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let direction = match self { + ParseDirection::Up => "↑", + ParseDirection::Right => "→", + ParseDirection::Down => "↓", + }; + + write!(f, "{direction}") + } +} + +impl DesRec for DefaultRegex { + type Label = RegexType; + + type Regex = Self; + + type Error = ParseError; + + type Aux = ParseDirection; + + type Scanner<'a> = + Box Result, Self::Error>>; + + fn parse<'a>( + mut input: &'a str, + mut scanner: Self::Scanner<'a>, + post_p: bool, + ) -> Result, &'a str)>, Self::Error> { + use ParseDirection::*; + use RegexType::*; + + let mut intermediate_stack: Vec<(RegexType, ParseDirection)> = + Vec::with_capacity(input.len()); + + // Classifies the input into a sequence of tokens with + // directions. + while !input.is_empty() { + if let Some((len, label, direction)) = scanner(input)? { + if len == 0 { + break; + } + + input = &input[len..]; + + intermediate_stack.push((label, direction)); + + // If we encounter an opening parenthesis, we add + // another auxiliary instruction. + if matches!((label, direction), (Or, Down)) { + intermediate_stack.push((Empty, Down)); + } + } else { + break; + } + } + + let inter_len = intermediate_stack.len(); + + let mut parents_stack: Vec = Vec::with_capacity(inter_len + 2); + + parents_stack.push(0); + parents_stack.push(1); + + let mut list_of_children: Vec> = std::iter::repeat_with(Vec::new) + .take(inter_len + 2) + .collect(); + + list_of_children[0].push(1); + + let mut types: Vec> = vec![Or, Empty]; + + types.extend(intermediate_stack.iter().map(|(label, _direction)| *label)); + + // Converts the sequence of tokens and directions into a + // regular expression. + for (index, (label, direction)) in intermediate_stack.iter().copied().enumerate() { + let mut parent: usize; + let mut parent_children: &mut Vec; + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + match list_of_children.get_mut(parent) { + Some(stack_parent_children) => { + parent_children = stack_parent_children; + + match (label, direction) { + (Paren, Up) => { + // a closing parenthesis does not need + // to be counted as a child + parents_stack.pop(); + // a closing parenthesis jumps out of + // two levels at once + parents_stack.pop(); + } + (Empty, Up) => { + // an upwards pipe + + // first add the current node to the parent of the parent + parents_stack.pop(); + + if let Some(parent_stack_parent) = parents_stack.last().copied() { + parent = parent_stack_parent; + + if let Some(stack_parent_children) = + list_of_children.get_mut(parent) + { + parent_children = stack_parent_children; + + parent_children.push(index + 2); + } else { + return Err(ParseError::InvalidStack( + parent, + inter_len + 2, + )); + } + } else { + return Err(ParseError::EmptyStack); + } + + // then make the current node the new parent + parents_stack.push(index + 2); + } + (_, Up) => { + parents_stack.pop(); + parent_children.push(index + 2); + } + (_, Right) => { + parent_children.push(index + 2); + } + (_, Down) => { + parents_stack.push(index + 2); + parent_children.push(index + 2); + } + } + } + None => return Err(ParseError::InvalidStack(parent, inter_len)), + } + } else { + // There are unbalanced closing parentheses. + return Err(ParseError::EmptyStack); + } + + // A special handling of repetition constructs as postfix + // operators: it swaps with the preceding element. 
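+            //
+            // For example, in "a*" the token for `*` is scanned
+            // after the one for `a`, so at this point the repetition
+            // node has just been attached after its operand under
+            // the same parent.  Popping it off the parent again,
+            // swapping the two positions' types and child lists, and
+            // then pushing the operand's new index under the
+            // repetition node leaves the parent pointing at the
+            // repetition node, with the operand as its child.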
+ if post_p { + match label { + Kleene | Plus | Optional => { + // remove the current node from the parent + parent_children.pop(); + + if let Some(preceding) = parent_children.last().copied() { + list_of_children.swap(preceding, index + 2); + + types.swap(preceding, index + 2); + + match list_of_children.get_mut(preceding) { + Some(preceding_children) => { + preceding_children.push(index + 2); + } + None => { + return Err(ParseError::InvalidStack(preceding, inter_len + 2)) + } + } + } else { + return Err(ParseError::InvalidRepetition(index)); + } + } + _ => {} + } + } + } + + // There are unbalanced opening parentheses. + if parents_stack.len() != 2 { + return Err(ParseError::NonSingleStack); + } + + let graph = list_of_children.into(); + + let result = DefaultRegex { graph, types }; + + Ok(Some((result, input))) + } +} + +#[cfg(test)] +mod test_des_rec { + use super::*; + + fn test_scanner( + input: &str, + ) -> Result, ParseDirection)>, ParseError> { + use ParseDirection::*; + use RegexType::*; + + if let Some(first) = input.chars().next() { + match first { + '*' => Ok(Some((1, Kleene, Right))), + '+' => Ok(Some((1, Plus, Right))), + '?' => Ok(Some((1, Optional, Right))), + '|' => Ok(Some((1, Empty, Up))), + '(' => Ok(Some((1, Or, Down))), + ')' => Ok(Some((1, Paren, Up))), + ' '..='~' => Ok(Some((1, Lit(first), Right))), + _ => Err(ParseError::InvalidCharacter(first)), + } + } else { + Ok(None) + } + } + + #[test] + fn test_des_rec() -> Result<(), Box> { + let input_string = "a*b?c+|(d*| +)?".to_owned(); + + if let Some((regex, remain)) = + DefaultRegex::::parse(&input_string, Box::new(test_scanner), true)? + { + println!("regex = {regex}"); + println!("remain = {remain}"); + + println!("regex length = {}", regex.len()); + + Ok(()) + } else { + unreachable!() + } + } +} diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs new file mode 100644 index 0000000..c57d313 --- /dev/null +++ b/nfa/src/desrec.rs @@ -0,0 +1,50 @@ +//! This file defines the expected behaviours of a recursive descent +//! parser. + +use super::Regex; +use graph::GraphLabel; + +/// A thin wrapper of the parse output, to simplify the function +/// signature a little. +pub type ParseOutput<'a, T> = Option<(T, &'a str)>; + +/// Types implementing this trait provide a method to be recursively +/// parsed. +pub trait DesRec { + /// The type of labels of the resulting regular expression. + type Label: GraphLabel; + + /// The type of the resulting regular expression. + type Regex: Regex; + + /// The type of errors encountered when parsing. + type Error: std::error::Error; + + /// Auxiliary data when parsing + type Aux; + + /// The type of a scanner that classifies inputs into tokens. + /// + /// The return type indicates the result of classification: + /// + /// - `Err(_)`: the classification fails + /// + /// - `Ok(None)`: the classification succeeds, and the parsing + /// should stop here + /// + /// - `Ok(Some(_))`: the classification succeeds and the parsing + /// should continue. + type Scanner<'a>: FnMut(&'a str) -> Result, Self::Error>; + + /// Parse a string into a regular expression with the aid of this + /// type. + /// + /// Accept a slice of string and return either a parsing error, or + /// a pair of correctly parsed regular expression and the + /// remaining slice. 
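+    ///
+    /// The `post_p` flag is expected to tell the implementation
+    /// whether repetition constructs (Kleene star, plus, optional)
+    /// are written as postfix operators that apply to the element
+    /// scanned immediately before them; this is how the provided
+    /// default implementation interprets it.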
+    fn parse<'a>(
+        input: &'a str,
+        scanner: Self::Scanner<'a>,
+        post_p: bool,
+    ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error>;
+}
diff --git a/nfa/src/error.rs b/nfa/src/error.rs
index 6112878..0c6bb3c 100644
--- a/nfa/src/error.rs
+++ b/nfa/src/error.rs
@@ -5,9 +5,15 @@ use graph::error::Error as GError;
 use std::fmt::{Display, Formatter};
 
 #[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
+/// The error type for NFA operations.
 pub enum Error {
+    /// An unknown node id is encountered.
     UnknownNode(usize),
+    /// Some operations are not supported by the implementations.
+    ///
+    /// Everything is a trade-off, wink wink.
    UnsupportedOperation,
+    /// This error comes from some underlying graph operation.
     Graph(GError),
 }
 
diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs
index ef207cf..1e2a30c 100644
--- a/nfa/src/lib.rs
+++ b/nfa/src/lib.rs
@@ -6,7 +6,7 @@
 //! implements the Graph trait from the [`graph`] crate, and then use
 //! that external graph type as [`Graph`][graph::Graph] here.
 
-mod error;
+pub mod error;
 
 extern crate graph;
 
@@ -49,13 +49,12 @@ pub trait Regex<T: GraphLabel>: Graph + Display {
     // TODO: add functions that determine if certain "positions" in a
     // regular language satisfy some special properties, like at the
     // end of a Kleene star, or at the end of a regular language, et
-    // cetera.  These will be needed later.
+    // cetera.  These might be needed later.
 }
 
 /// The expected behaviour of a nondeterministic finite automaton.
 ///
-/// Every NFA is a special labelled graph, so this trait extends the
-/// [`LabelGraph`][graph::LabelGraph] trait.
+/// Every NFA is a special labelled graph.
 pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
     /// Remove all empty transitions from the nondeterministic finite
     /// automaton.
@@ -89,6 +88,7 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
 }
 
 pub mod default;
+pub mod desrec;
 
 #[cfg(test)]
 mod nfa_tests {}
diff --git a/receme/src/functor.rs b/receme/src/functor.rs
index 95e2555..cd4dac2 100644
--- a/receme/src/functor.rs
+++ b/receme/src/functor.rs
@@ -31,6 +31,14 @@ pub trait Functor<T> {
     fn fmap<S>(self, f: impl FnMut(T) -> S) -> Self::Target<S>;
 }
 
+impl<T> Functor<T> for Vec<T> {
+    type Target<S> = Vec<S>;
+
+    fn fmap<S>(self, f: impl FnMut(T) -> S) -> Self::Target<S> {
+        self.into_iter().map(f).collect()
+    }
+}
+
 /// A functor can map over its generic type parameter.
 ///
 /// It can map from Functor(T) to Functor(S).
diff --git a/repcore/src/grammar.rs b/repcore/src/grammar.rs
new file mode 100644
index 0000000..ee9f033
--- /dev/null
+++ b/repcore/src/grammar.rs
@@ -0,0 +1,59 @@
+//! This file implements the expected behaviours of grammars.
+
+// NOTE: We shall start with a parser that works at the level of
+// characters.  The purpose is to first experiment with the workings
+// and the performance of the algorithms, before optimising by using
+// regular expressions to classify inputs into tokens.  In other
+// words, the current focus is not on the optimisations, whereas
+// scanners are for optimisations only, so to speak.
+
+/// The type of a terminal.
+///
+/// For the time being this is a wrapper around a string, but in the
+/// future it may hold more information about scanners.
+pub struct Terminal {
+    // If we want to use scanners, perhaps add them as a new field
+    // here.
+    name: String,
+}
+
+impl Terminal {
+    /// Create a terminal with the given name.
+    #[inline]
+    pub fn new(name: String) -> Self {
+        Self { name }
+    }
+
+    /// Return the name of the terminal.
+    #[inline]
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+}
+
+/// The type of a non-terminal.
+///
+/// This is just a wrapper around a string.
+pub struct Nonterminal(String);
+
+impl Nonterminal {
+    /// Return the name of the nonterminal.
+    ///
+    /// Just to improve readability.
+    #[inline]
+    pub fn name(&self) -> &str {
+        &self.0
+    }
+}
+
+/// The type of a terminal or a non-terminal.
+///
+/// Only an index is stored here.  Actual data are stored in two other
+/// arrays.
+#[derive(Debug, Hash, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
+pub enum TNT {
+    /// Terminal variant
+    Ter(usize),
+    /// Nonterminal variant
+    Non(usize),
+}
diff --git a/repcore/src/lib.rs b/repcore/src/lib.rs
index 7d12d9a..589b61c 100644
--- a/repcore/src/lib.rs
+++ b/repcore/src/lib.rs
@@ -1,3 +1,9 @@
+//! This package implements the core algorithm of the entire
+//! workspace: parsing with derivatives by means of regular nulling
+//! languages.
+
+pub mod grammar;
+
 pub fn add(left: usize, right: usize) -> usize {
     left + right
 }
diff --git a/repcore/src/plan.org b/repcore/src/plan.org
new file mode 100644
index 0000000..13c2ee0
--- /dev/null
+++ b/repcore/src/plan.org
@@ -0,0 +1,59 @@
+#+TITLE: Plan of the package
+#+AUTHOR: Durand
+#+DATE: <2022-11-18 Ven 19:57>
+
+* Atomic Languages
+
+This describes the behaviours of atomic languages.  An atomic
+language consists of the null closure of a non-terminal symbol in
+the grammar, together with its derivatives by terminals and
+non-terminals.
+
+* Derivative Languages
+
+This is the main driving device of the algorithm.  Basically, the
+algorithm works by taking successive derivatives, according to the
+input symbol.  At each step, we calculate the derivative language.  In
+this process, we also compute some semiring value and store it in a
+carrier.  The end result of the algorithm is the final semiring
+value.
+
+If one simply wants to determine whether the input string belongs to
+the grammar, one chooses the semiring to be the field with two
+elements, the Boolean field.  If one wants to find how many ways there
+are to derive a given input string, then one uses the semiring of
+natural numbers instead.  If one wants, moreover, to find all the
+possible ways to derive a particular input string, then one can use
+the free semiring on the set of terminals and non-terminals of the
+grammar.  Here the free semiring is the left adjoint of the forgetful
+functor from the category of semirings to the category of sets.
+
+To be more specific, the free semiring on a set is given by sets of
+sequences of elements of the set.  The addition of the semiring is the
+set union operation, and the multiplication takes the respective
+concatenations.
+
+** Semirings
+
+So we need a module that defines the behaviours of semirings and
+provides some common semiring implementations.  Then in the main
+driving force we can freely substitute different semirings, according
+to the particular needs.
+
+** Languages
+
+Then the main part is to define the behaviour of languages.  This
+should be easy enough, since we already have the mechanisms of graphs,
+nondeterministic automata, and semirings.  All we need to do is to
+combine them.
+
+* Testing ground
+
+I am in strong need of testing things out.  The most important thing
+is to visualize each step of the derivation, in a human-friendly
+manner.  I need this to examine whether my atomic languages are
+wrongly implemented, or my atomic languages are wrongly derived, or my
+understanding of the main algorithm is plain wrong.
+
+This is the main reason I started this rewrite of the package.
--
cgit v1.2.3-18-g5258
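The "Semirings" section of plan.org above describes the desired
interface only in prose, so here is a minimal sketch of what such a
module could look like.  It assumes nothing beyond what the plan
states: the trait name Semiring, its method names, and the two sample
instances are placeholders chosen for illustration, matching the
Boolean and natural-number semirings mentioned in the plan.

#+begin_src rust
/// A semiring: a carrier type with an "addition" and a
/// "multiplication", together with their respective identities
/// `zero` and `one`.  (The associativity and distributivity laws
/// cannot be expressed in the type and are left to the implementor.)
pub trait Semiring {
    fn zero() -> Self;
    fn one() -> Self;
    fn add(&self, other: &Self) -> Self;
    fn mul(&self, other: &Self) -> Self;
}

/// The Boolean semiring: enough to decide whether the input string
/// belongs to the grammar.
impl Semiring for bool {
    fn zero() -> Self { false }
    fn one() -> Self { true }
    fn add(&self, other: &Self) -> Self { *self || *other }
    fn mul(&self, other: &Self) -> Self { *self && *other }
}

/// The semiring of natural numbers: counts how many distinct
/// derivations the input string has.
impl Semiring for usize {
    fn zero() -> Self { 0 }
    fn one() -> Self { 1 }
    fn add(&self, other: &Self) -> Self { self + other }
    fn mul(&self, other: &Self) -> Self { self * other }
}
#+end_src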