diff options
Diffstat (limited to 'nfa/src/default/regex.rs')
-rw-r--r-- | nfa/src/default/regex.rs | 313 |
1 files changed, 276 insertions, 37 deletions
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs index a60f801..2670e32 100644 --- a/nfa/src/default/regex.rs +++ b/nfa/src/default/regex.rs @@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex}; use receme::{algebra::Algebra, catana::Cata}; -use std::fmt::{Debug, Display}; +use std::{ + collections::HashMap, + fmt::{Debug, Display, Write}, + marker::PhantomData, +}; /// The type of a node in a regular expression. /// @@ -25,7 +29,7 @@ use std::fmt::{Debug, Display}; /// This is possible because a regular expression has a root node. /// For the sake of convenience, the root node has type "Or". #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)] -pub enum RegexType<T: GraphLabel + Display> { +pub enum RegexType<T: GraphLabel> { /// A star node is a node with an arbitrary number of repetitions. Kleene, /// A plus node is a node with at least one repetition: a+ equals @@ -44,13 +48,31 @@ pub enum RegexType<T: GraphLabel + Display> { Lit(T), } +impl<T: GraphLabel> Display for RegexType<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + RegexType::Kleene => write!(f, "*"), + RegexType::Plus => write!(f, "+"), + RegexType::Optional => write!(f, "?"), + RegexType::Or => write!(f, "|"), + RegexType::Paren => write!(f, "()"), + RegexType::Empty => write!(f, "empty"), + RegexType::Lit(value) => write!(f, "{value}"), + } + } +} + /// A default implementation of regular expressions. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct DefaultRegex<T: GraphLabel + Display> { /// The underlying graph is stored using adjacency lists. graph: ALGraph, /// The types of the underlying nodes. types: Vec<RegexType<T>>, + /// The root of the graph. + /// + /// If it is None, it indicates the regular expression is empty. + root: Option<usize>, } impl<T: GraphLabel + Display> Default for DefaultRegex<T> { @@ -58,11 +80,39 @@ impl<T: GraphLabel + Display> Default for DefaultRegex<T> { Self { graph: Default::default(), types: Default::default(), + root: Default::default(), } } } -impl<T: GraphLabel + Display> DefaultRegex<T> { +impl<T: GraphLabel> DefaultRegex<T> { + /// Set the root of the regular expression. + #[inline] + pub fn set_root(&mut self, root: Option<usize>) { + self.root = root; + } + + /// Construct a regular expression from a raw adjacency list graph + /// and an array of labels. + /// + /// # Error + /// + /// If the graph contains cycles, this function will return an + /// error. This check is added to make sure that the regular + /// expression contains no cycles, otherwise operations with + /// regular expressions might enter infinite loops later. + pub fn new(graph: ALGraph, types: Vec<RegexType<T>>) -> Result<Self, Error> { + if graph.contains_cycles()? { + Err(Error::Cycle) + } else { + Ok(Self { + graph, + types, + root: Some(0), + }) + } + } + /// Return the number of elements in this regular expression, /// counting nodes. pub fn len(&self) -> usize { @@ -86,10 +136,65 @@ impl<T: GraphLabel + Display> DefaultRegex<T> { Ok(()) } + + /// Return the internal graph. + /// + /// Normally this should not be used. + pub fn internal_graph(&self) -> ALGraph { + self.graph.clone() + } + + /// Return the internal types array. + /// + /// Normally this should not be used. + pub fn internal_types(&self) -> Vec<RegexType<T>> { + self.types.clone() + } + + /// Return the array of parents. + /// + /// The element at index N of the returned array is the parent of + /// that N-th element. If an element has no parents, a `None` is + /// placed at its place. + /// + /// # Error + /// + /// If some edge points to an invalid node, an erro will be + /// returned. + /// + /// Also, if the graph contains cycles, an error is also returned. + pub fn parents_array(&self) -> Result<Vec<Option<(usize, usize)>>, Error> { + if self.graph.contains_cycles().map_err(|e| match e { + GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n), + _ => unreachable!(), + })? { + return Err(Error::Cycle); + } + + let len = self.len(); + + let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect(); + + for source in self.graph.nodes() { + for (index, target) in self + .graph + .children_of(source) + .map_err(|_| Error::UnknownNode(source))? + .enumerate() + { + result + .get_mut(target) + .ok_or(Error::UnknownNode(target))? + .replace((source, index)); + } + } + + Ok(result) + } } // REVIEW: This may not be needed. -impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S> +impl<S: GraphLabel, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S> where A: Algebra<T, Vec<T>>, { @@ -118,8 +223,13 @@ where } } -impl<T: GraphLabel + Display> Display for DefaultRegex<T> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl<T: GraphLabel> DefaultRegex<T> { + /// Use a function to format the labels of the regular expressions + /// and format the entire regular expression with this aid. + pub fn to_string_with<F>(&self, mut f: F) -> Result<String, std::fmt::Error> + where + F: FnMut(T) -> String, + { #[derive(Copy, Clone)] enum StackElement { Seen(usize), @@ -134,16 +244,15 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { } } - fn is_seen(&self) -> bool { - match self { - Seen(_) => true, - Unseen(_) => false, - } + fn is_seen(self) -> bool { + matches!(self, Seen(_)) } } use StackElement::{Seen, Unseen}; + let mut result = String::new(); + let mut stack: Vec<StackElement> = Vec::new(); let mut types = self.types.clone(); types.push(RegexType::Paren); @@ -159,6 +268,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { RegexType::Kleene => { if !top.is_seen() { stack.push(Seen(top.index())); + + if self.degree(top.index()).unwrap() > 1 { + write!(result, "(")?; + stack.push(Unseen(types.len() - 1)); + } + stack.extend( self.graph .children_of(top.index()) @@ -167,7 +282,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } else { - write!(f, "*")?; + write!(result, "*")?; } } RegexType::Plus => { @@ -181,7 +296,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } else { - write!(f, "+")?; + write!(result, "+")?; } } RegexType::Optional => { @@ -195,12 +310,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } else { - write!(f, "?")?; + write!(result, "?")?; } } RegexType::Or => { if !top.is_seen() { - write!(f, "(")?; + write!(result, "(")?; let len = self.len(); @@ -221,11 +336,11 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { } } } else { - write!(f, "|")?; + write!(result, "|")?; } } RegexType::Paren => { - write!(f, ")")?; + write!(result, ")")?; } RegexType::Empty => { stack.extend( @@ -236,11 +351,17 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> { .rev(), ); } - RegexType::Lit(label) => write!(f, "{label}")?, + RegexType::Lit(label) => write!(result, "{}", f(label))?, } } - Ok(()) + Ok(result) + } +} + +impl<T: GraphLabel + Display + Debug> Display for DefaultRegex<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.to_string_with(|t| format!("{t}"))?) } } @@ -278,9 +399,20 @@ impl<T: GraphLabel + Display> Graph for DefaultRegex<T> { fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> { self.graph.has_edge(source, target) } + + #[inline] + fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) { + unimplemented!() + } } -impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> { +impl<T: GraphLabel + Display + Debug> Regex<RegexType<T>> for DefaultRegex<T> { + /// Return the root of the regular expression. + #[inline] + fn root(&self) -> Option<usize> { + self.root + } + #[inline] fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> { self.types @@ -291,8 +423,10 @@ impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> { } /// An error type for holding parsing errors. -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum ParseError { + /// A cycle is encountered. + Cycle, /// Encounter an invalid state. Invalid, /// An error from graph operations. @@ -354,23 +488,77 @@ impl Display for ParseDirection { } } -impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { +/// A default recursive descent parser for regular expressions of +/// terminals or non-terminals. +#[derive(Debug, Clone)] +pub struct DefaultRegParser<T: GraphLabel + Display> { + ter_map: HashMap<String, usize>, + non_map: HashMap<String, usize>, + _phantom: PhantomData<T>, +} + +impl<T: GraphLabel + Display> DefaultRegParser<T> { + /// Query if a terminal or a non-terminal is already found. + /// + /// If found, return the associated index of the terminal or + /// non-terminal. + pub fn query(&self, tnt: &str, terminal_p: bool) -> Option<usize> { + if terminal_p { + self.ter_map.get(tnt).copied() + } else { + self.non_map.get(tnt).copied() + } + } + + /// Add a terminal or a non-terminal. + pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) { + if terminal_p { + let ter_len = self.ter_map.len(); + + self.ter_map.entry(tnt.to_owned()).or_insert(ter_len); + } else { + let non_len = self.non_map.len(); + + self.non_map.entry(tnt.to_owned()).or_insert(non_len); + } + } +} + +impl<T: GraphLabel + Display> Default for DefaultRegParser<T> { + fn default() -> Self { + Self { + ter_map: Default::default(), + non_map: Default::default(), + _phantom: PhantomData, + } + } +} + +impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegParser<T> { type Label = RegexType<T>; - type Regex = Self; + type Regex = DefaultRegex<T>; type Error = ParseError; - type Aux = ParseDirection; + type Inter = ParseDirection; - type Scanner<'a> = - Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>; + type Scanner<'a, 'b> = Box< + dyn FnMut( + &'b Self, + &'a str, + ) -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>, + > where RegexType<T>:'b; - fn parse<'a>( + fn parse<'a, 'b>( + &'b self, mut input: &'a str, - mut scanner: Self::Scanner<'a>, + mut scanner: Self::Scanner<'a, 'b>, post_p: bool, - ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> { + ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> + where + Self::Label: 'b, + { use ParseDirection::*; use RegexType::*; @@ -380,7 +568,7 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { // Classifies the input into a sequence of tokens with // directions. while !input.is_empty() { - if let Some((len, label, direction)) = scanner(input)? { + if let Some((len, label, direction)) = scanner(self, input)? { if len == 0 { break; } @@ -523,7 +711,12 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { let graph = list_of_children.into(); - let result = DefaultRegex { graph, types }; + // Check there are no cycles + let result = DefaultRegex::new(graph, types).map_err(|e| match e { + Error::Graph(ge) => ParseError::Graph(ge), + Error::Cycle => ParseError::Cycle, + _ => unreachable!(), + })?; Ok(Some((result, input))) } @@ -533,9 +726,17 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> { mod test_des_rec { use super::*; - fn test_scanner( - input: &str, - ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> { + use crate::desrec::DesRec; + + #[allow(dead_code, unused)] + fn test_scanner<'a, 'b, T>( + parser: &'b DefaultRegParser<T>, + input: &'a str, + ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> + where + T: GraphLabel + Display + Debug, + T: 'b, + { use ParseDirection::*; use RegexType::*; @@ -557,19 +758,57 @@ mod test_des_rec { #[test] fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> { - let input_string = "a*b?c+|(d*| +)?".to_owned(); + let input_string = "(ade)*b?c+|(d*| +)?".to_owned(); + + let parser: DefaultRegParser<char> = Default::default(); if let Some((regex, remain)) = - DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)? + DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)? { println!("regex = {regex}"); println!("remain = {remain}"); println!("regex length = {}", regex.len()); + let parents = regex.parents_array()?; + + println!("parents = {parents:?}"); + Ok(()) } else { unreachable!() } } + + #[test] + fn test_display() -> Result<(), Box<dyn std::error::Error>> { + use graph::builder::Builder; + use RegexType::*; + + let mut builder = graph::adlist::ALGBuilder::default(); + let mut types: Vec<RegexType<usize>> = Vec::with_capacity(4); + + types.push(Kleene); + builder.add_vertex(); + + types.push(Lit(0)); + builder.add_vertex(); + builder.add_edge(0, 1, ())?; + + types.push(Lit(1)); + builder.add_vertex(); + builder.add_edge(0, 2, ())?; + + types.push(Lit(2)); + builder.add_vertex(); + builder.add_edge(0, 3, ())?; + + let graph = builder.build(); + + let regex = DefaultRegex::new(graph, types)?; + + println!("regex = {regex}"); + + Ok(()) + } } |