//! This file provides a default implementation of NFA. use graph::{ builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel, LabelExtGraph, LabelGraph, }; use crate::{ default::regex::RegexType, error::Error, DOption, LabelType, Nfa, NfaLabel, Regex, SoC, TwoEdges, }; use core::fmt::{Debug, Display}; /// Default NFA implementation. #[derive(Debug, Clone)] pub struct DefaultNFA { graph: DLGraph, } impl Default for DefaultNFA { fn default() -> Self { Self { graph: Default::default(), } } } // Some boiler-plate delegation implementations. // // I deliberately avoid using [`Deref`][std::ops::Deref] here, as I do // not want to dereference an NFA to a Graph. impl Graph for DefaultNFA { type Iter<'a> = as Graph>::Iter<'a> where T: 'a; #[inline] fn is_empty(&self) -> bool { self.graph.is_empty() } #[inline] fn nodes_len(&self) -> usize { self.graph.nodes_len() } #[inline] fn children_of(&self, node_id: usize) -> Result, GError> { self.graph.children_of(node_id) } #[inline] fn degree(&self, node_id: usize) -> Result { self.graph.degree(node_id) } #[inline] fn is_empty_node(&self, node_id: usize) -> Result { self.graph.is_empty_node(node_id) } #[inline] fn has_edge(&self, source: usize, target: usize) -> Result { self.graph.has_edge(source, target) } #[inline] fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> { self.graph.print_viz(filename) } #[inline] fn replace_by_builder(&mut self, _builder: impl Builder) { unimplemented!() } } impl LabelGraph for DefaultNFA { type Iter<'a> = as LabelGraph>::Iter<'a> where T: 'a; type LabelIter<'a> = as LabelGraph>::LabelIter<'a> where T: 'a; type EdgeLabelIter<'a> = as LabelGraph>::EdgeLabelIter<'a> where Self: 'a, T: 'a; #[inline] fn vertex_label(&self, node_id: usize) -> Result, GError> { if self.has_node(node_id) { unimplemented!() } else { Err(GError::IndexOutOfBounds(node_id, self.nodes_len())) } } #[inline] fn edge_label(&self, source: usize, target: usize) -> Result, GError> { self.graph.edge_label(source, target) } #[inline] fn find_children_with_label( &self, node_id: usize, label: &T, ) -> Result<>::Iter<'_>, GError> { self.graph.find_children_with_label(node_id, label) } #[inline] fn labels_of(&self, node_id: usize) -> Result, GError> { self.graph.labels_of(node_id) } #[inline] fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result { self.graph.has_edge_label(node_id, label, target) } } impl LabelExtGraph for DefaultNFA { #[inline] fn extend(&mut self, edges: impl IntoIterator) -> Result { self.graph.extend(edges) } } impl Nfa for DefaultNFA { type FromRegex = DefaultNFA; // Reminder: This is an adopted version of Thompson's // construction. fn to_nfa( regexps: &[impl Regex>], sub_pred: impl Fn(T) -> Result, Error>, default: Option, ) -> Result>, Error> { let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum(); if total_regexps_len == 0 { return Ok(Default::default()); } // We reserve two more rooms for the default edge. let nfa_len = total_regexps_len * 2 + 2; let mut builder: DLGBuilder> = Builder::with_capacity(nfa_len); builder.add_vertices(nfa_len); // for _ in 0..nfa_len { // builder.add_vertex(); // } let default = LabelType::new(DOption(default), total_regexps_len, false); // add a default edge builder.add_edge(nfa_len - 2, nfa_len - 1, default)?; let accumulators: Vec = { let mut result = Vec::with_capacity(regexps.len()); result.push(0); for regexp in regexps.iter().take(regexps.len() - 1) { result.push(result.last().unwrap() + regexp.nodes_len() * 2); } result }; assert_eq!(accumulators.len(), regexps.len()); /// Get offset from `accumulators` safely. macro_rules! get_offset_safe ( ($num:expr) => { *accumulators.get($num).ok_or(Error::UnknownNode($num))? } ); /// Get offset from `accumulators` without checking bounds. macro_rules! get_offset_unsafe ( ($num:expr) => { { unsafe { *accumulators.get_unchecked($num) } } } ); for (index, regex) in regexps.iter().enumerate() { let root = if let Some(root) = regex.root() { root } else { // A regular expression without roots is empty, so we // skip it. continue; }; let regex_len = regex.nodes_len(); // It is safe here to call `get_offset_unsafe`, as `index` // is guaranteed to be strictly less than the length of // `accumulators` by an assertion above. let offset = get_offset_unsafe!(index); let mut stack: Vec = Vec::with_capacity(regex_len); stack.push(root); while let Some(top_index) = stack.pop() { let false_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, false); let true_label = NfaLabel::new(DOption(None), (offset >> 1) + top_index, true); let top_label = regex.vertex_label(top_index)?; let nfa_start = offset + 2 * top_index; let nfa_end = nfa_start + 1; match top_label { RegexType::Kleene => { builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; if !regex.is_empty_node(top_index)? { for child in regex.children_of(top_index)? { stack.push(child); let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; builder.add_edge(source, child_start, false_label)?; source = child_end; } builder.add_edge(source, nfa_end, false_label)?; builder.add_edge(nfa_end, nfa_start, false_label)?; } } RegexType::Plus => { let mut source = nfa_start; if !regex.is_empty_node(top_index)? { for child in regex.children_of(top_index)? { stack.push(child); let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; builder.add_edge(source, child_start, false_label)?; source = child_end; } builder.add_edge(source, nfa_end, false_label)?; builder.add_edge(nfa_end, nfa_start, false_label)?; } else { builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Optional => { builder.add_edge(nfa_start, nfa_end, false_label)?; let mut source = nfa_start; if !regex.is_empty_node(top_index)? { for child in regex.children_of(top_index)? { stack.push(child); let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; builder.add_edge(source, child_start, false_label)?; source = child_end; } builder.add_edge(source, nfa_end, false_label)?; } } RegexType::Or => { if !regex.is_empty_node(top_index)? { for child in regex.children_of(top_index)? { stack.push(child); let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; builder.add_edge(nfa_start, child_start, false_label)?; builder.add_edge(child_end, nfa_end, false_label)?; } } else { builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Paren => { // Ignore Paren nodes since currently these // are used only for printing purposes. } RegexType::Empty => { let mut source = nfa_start; if !regex.is_empty_node(top_index)? { for child in regex.children_of(top_index)? { stack.push(child); let child_start = offset + 2 * child; let child_end = offset + 2 * child + 1; builder.add_edge(source, child_start, false_label)?; source = child_end; } builder.add_edge(source, nfa_end, false_label)?; } else { builder.add_edge(nfa_start, nfa_end, false_label)?; } } RegexType::Lit(value) => { // The children would be ignored even if for // some reason this literal node had some // children. match sub_pred(value)? { SoC::Sub(sub_non) => { // a non-terminal let sub_nfa_start = get_offset_safe!(sub_non); let sub_nfa_end = sub_nfa_start + 1; // We also need a label for the // original label and non-left-linear // expansion. builder.add_edge( nfa_start, nfa_end, NfaLabel::new( DOption(Some(value)), (offset >> 1) + top_index, false, ), )?; builder.add_edge(nfa_start, sub_nfa_start, true_label)?; builder.add_edge(sub_nfa_end, nfa_end, false_label)?; } SoC::Carry(new_value) => { // a terminal builder.add_edge( nfa_start, nfa_end, NfaLabel::new( DOption(Some(new_value)), (offset >> 1) + top_index, false, ), )?; } } } } } } let graph = builder.build(); Ok(DefaultNFA { graph }) } fn remove_dead(&mut self, mut reserve: impl FnMut(usize) -> bool) -> Result<(), Error> { let mut builder = self.graph.builder_ref(); let mut changed = true; // Use a counting vector to avoid calculating which nodes are // dead at each iteration. let mut target_counts: Vec = std::iter::repeat(0).take(self.graph.nodes_len()).collect(); for (_, target) in self.graph.edges() { *target_counts .get_mut(target) .ok_or(Error::UnknownNode(target))? += 1; } while changed { changed = false; for node in self.nodes() { let deadp = !matches!(target_counts.get(node).copied(), Some(n) if n > 0); if deadp && !reserve(node) { let to_remove: Vec = builder.children_of(node)?.collect(); if !to_remove.is_empty() { changed = true; for target in to_remove { builder.remove_edge(node, target, |_| true)?; *target_counts .get_mut(target) .ok_or(Error::UnknownNode(target))? -= 1; } } } } } self.graph = builder.build(); Ok(()) } fn closure( &mut self, mut predicate: impl FnMut(T) -> bool, remove_after_p: bool, mut transform: impl FnMut(TwoEdges) -> T, mut remove_predicate: impl FnMut(T) -> bool, ) -> Result<(), Error> { let mut builder = self.graph.builder_ref(); let mut updated = true; let nodes_len = self.nodes_len(); while updated { updated = false; // Just a rough estimate let mut to_add: Vec<(usize, usize, T)> = Vec::with_capacity(nodes_len); // REVIEW: I do not like nested if statements, but I do // not know how to do this otherwise. for source in builder.nodes() { for (first_label, target_iter) in builder.labels_of(source)? { if predicate(*first_label) { // a null label found for target in target_iter { for (second_label, children_iter) in builder.labels_of(target)? { for child in children_iter { let new_label = transform(TwoEdges( source, target, child, *first_label, *second_label, )); if !builder.has_edge_label(source, &new_label, child)? { updated = true; to_add.push((source, child, new_label)); } } } } } } } for (source, target, label) in to_add.into_iter() { builder.add_edge(source, target, label)?; } } // Then remove all nullable edges if demanded if remove_after_p { for (source, target) in builder.edges().collect::>().into_iter() { builder.remove_edge(source, target, &mut remove_predicate)?; } } self.graph = builder.build(); Ok(()) } } impl Display for DefaultNFA { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { Debug::fmt(self, f) } } #[cfg(test)] mod test_to_nfa { use super::*; use crate::SoC; use crate::{ default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType}, desrec::DesRec, }; fn new_regex() -> Result, ParseError> { let parser = DefaultRegParser::::default(); fn test_scanner( _parser: &DefaultRegParser, input: &str, ) -> Result, ParseDirection)>, ParseError> { use ParseDirection::*; use RegexType::*; if let Some(first) = input.chars().next() { match first { '*' => Ok(Some((1, Kleene, Right))), '+' => Ok(Some((1, Plus, Right))), '?' => Ok(Some((1, Optional, Right))), '|' => Ok(Some((1, Empty, Up))), '(' => Ok(Some((1, Or, Down))), ')' => Ok(Some((1, Paren, Up))), ' '..='~' => Ok(Some((1, Lit(first), Right))), _ => Err(ParseError::InvalidCharacter(first)), } } else { Ok(None) } } let input_string = "a*b?c+|(d*| +)?".to_owned(); Ok( DefaultRegParser::::parse(&parser, &input_string, Box::new(test_scanner), true)? .unwrap_or_default() .0, ) } #[test] fn test_to_nfa() -> Result<(), Box> { let regex = new_regex()?; println!("regex = {regex}"); let _nfa: DefaultNFA<_> = DefaultNFA::to_nfa( &[regex], |label| Ok(SoC::Carry(label)), Some(char::default()), )?; // nfa.print_viz("nfa.gv")?; Ok(()) } }