summaryrefslogtreecommitdiff
path: root/nfa/src/default/nfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'nfa/src/default/nfa.rs')
-rw-r--r--nfa/src/default/nfa.rs374
1 files changed, 355 insertions, 19 deletions
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index 3c2bd83..229ba4d 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -1,25 +1,26 @@
//! This file provides a default implementation of NFA.
-// TODO: Store the regular expression in the NFA as well.
-//
-// The current focus of the project is to understand the growth rate
-// of the algorithm, to know whether I made a mistake in the previous
-// iteration of the implementation, or the algorithm is not as fast as
-// the author estimated, which is not quite likely, of course.
+// TODO: The current focus of the project is to understand the growth
+// rate of the algorithm, to know whether I made a mistake in the
+// previous iteration of the implementation, or the algorithm is not
+// as fast as the author estimated, which is not quite likely, of
+// course.
//
// Thus I shall establish a friendly interface that allows me to view
// and debug the atomic languages and the languages, transparently.
-use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph};
+use graph::{
+ builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel,
+ LabelGraph,
+};
-use crate::{error::Error, Nfa, Regex};
+use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
-use std::fmt::Display;
+use core::fmt::{Debug, Display};
-#[non_exhaustive]
-#[derive(Debug)]
/// Default NFA implementation.
-pub struct DefaultNFA<T: GraphLabel + Display> {
+#[derive(Debug, Clone)]
+pub struct DefaultNFA<T: GraphLabel> {
graph: DLGraph<T>,
}
@@ -63,6 +64,16 @@ impl<T: GraphLabel + Display> Graph for DefaultNFA<T> {
fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
self.graph.has_edge(source, target)
}
+
+ #[inline]
+ fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> {
+ self.graph.print_viz(filename)
+ }
+
+ #[inline]
+ fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) {
+ unimplemented!()
+ }
}
impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
@@ -70,11 +81,10 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a;
- // TODO: Return the label from the contained regular language.
#[inline]
fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> {
if self.has_node(node_id) {
- todo!()
+ unimplemented!()
} else {
Err(GError::IndexOutOfBounds(node_id, self.nodes_len()))
}
@@ -98,12 +108,232 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> {
self.graph.labels_of(node_id)
}
+
+ #[inline]
+ fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result<bool, GError> {
+ self.graph.has_edge_label(node_id, label, target)
+ }
}
impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
- #[allow(unused)]
- fn to_nfa(regex: impl Regex<T>) -> Self {
- todo!()
+ type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
+
+ fn to_nfa(
+ regexps: &[impl Regex<RegexType<T>>],
+ sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+ ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+ let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
+
+ if total_regexps_len == 0 {
+ return Ok(Default::default());
+ }
+
+ // We reserve two more rooms for later uses.
+ let nfa_len = total_regexps_len * 2 + 2;
+
+ let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
+
+ // Since DOption<T> implements Copy when T does, we can use a
+ // variable to hold the empty label to avoid repetitions.
+ let empty_label: DOption<T> = Default::default();
+
+ for _ in 0..nfa_len {
+ builder.add_vertex();
+ }
+
+ let accumulators: Vec<usize> = {
+ let mut result = Vec::with_capacity(regexps.len() + 1);
+
+ result.push(0);
+
+ let mut accumulator = 0;
+
+ for regexp in regexps.iter() {
+ accumulator += regexp.nodes_len() * 2;
+ result.push(accumulator);
+ }
+
+ result.pop();
+
+ result
+ };
+
+ assert_eq!(accumulators.len(), regexps.len());
+
+ /// Get offset from `accumulators` safely.
+ macro_rules! get_offset_safe (
+ ($num:expr) => {
+ *accumulators.get($num).ok_or(Error::UnknownNode($num))?
+ }
+ );
+
+ /// Get offset from `accumulators` without checking bounds.
+ macro_rules! get_offset_unsafe (
+ ($num:expr) => {
+ { unsafe { *accumulators.get_unchecked($num) } }
+ }
+ );
+
+ for (index, regex) in regexps.iter().enumerate() {
+ let root = if let Some(root) = regex.root() {
+ root
+ } else {
+ // A regular expression without roots is empty, so we
+ // skip it.
+ continue;
+ };
+
+ let regex_len = regex.nodes_len();
+
+ // It is safe here to call `get_offset_unsafe`, as `index`
+ // is guaranteed to be strictly less than the length of
+ // `accumulators` by an assertion above.
+ let offset = get_offset_unsafe!(index);
+
+ let mut stack: Vec<usize> = Vec::with_capacity(regex_len);
+
+ stack.push(root);
+
+ while let Some(top_index) = stack.pop() {
+ let top_label = regex.vertex_label(top_index)?;
+
+ let nfa_start = offset + 2 * top_index;
+ let nfa_end = offset + 2 * top_index + 1;
+
+ match top_label {
+ RegexType::Kleene => {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+
+ builder.add_edge(nfa_end, nfa_start, empty_label)?;
+ }
+ }
+ RegexType::Plus => {
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+
+ builder.add_edge(nfa_end, nfa_start, empty_label)?;
+ } else {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Optional => {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Or => {
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(nfa_start, child_start, empty_label)?;
+ builder.add_edge(child_end, nfa_end, empty_label)?;
+ }
+ } else {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Paren => {
+ // Ignore Paren nodes since currently these
+ // are used only for printing purposes.
+ }
+ RegexType::Empty => {
+ let mut source = nfa_start;
+
+ if !regex.is_empty_node(top_index)? {
+ for child in regex.children_of(top_index)? {
+ stack.push(child);
+
+ let child_start = offset + 2 * child;
+ let child_end = offset + 2 * child + 1;
+
+ builder.add_edge(source, child_start, empty_label)?;
+
+ source = child_end;
+ }
+
+ builder.add_edge(source, nfa_end, empty_label)?;
+ } else {
+ builder.add_edge(nfa_start, nfa_end, empty_label)?;
+ }
+ }
+ RegexType::Lit(value) => {
+ // The children would be ignored even if for
+ // some reason this literal node had some
+ // children.
+
+ match sub_pred(value)? {
+ SoC::Sub(sub_non) => {
+ // a non-terminal
+
+ let sub_offset = get_offset_safe!(sub_non);
+ let sub_nfa_start = sub_offset + 2 * sub_non;
+ let sub_nfa_end = sub_offset + 2 * sub_non + 1;
+
+ builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
+ builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
+ }
+ SoC::Carry(new_value) => {
+ // a terminal
+
+ builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ let graph = builder.build();
+
+ Ok(DefaultNFA { graph })
}
fn remove_epsilon(&mut self) -> Result<(), Error> {
@@ -114,7 +344,113 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
todo!()
}
- fn nulling(&mut self) -> Result<(), Error> {
- todo!()
+ fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
+ let mut updated = true;
+ let mut builder = self.graph.builder_ref();
+
+ while updated {
+ updated = false;
+
+ let mut nullable = false;
+
+ let mut to_add = Vec::new();
+
+ for (source, target) in builder.edges() {
+ for label in builder.edge_label(source, target)? {
+ if f(label) {
+ nullable = true;
+
+ break;
+ }
+ }
+
+ if nullable {
+ for (label, child_iter) in builder.labels_of(target)? {
+ for child in child_iter {
+ if !builder.has_edge_label(source, label, child)? {
+ updated = true;
+
+ to_add.push((source, child, *label));
+ }
+ }
+ }
+ }
+ }
+
+ for (source, child, label) in to_add {
+ builder.add_edge(source, child, label)?;
+ }
+ }
+
+ self.graph.replace_by_builder(builder);
+
+ Ok(())
+ }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ Debug::fmt(self, f)
+ }
+}
+
+#[cfg(test)]
+mod test_to_nfa {
+ #![allow(unused_imports)]
+ use super::*;
+ use crate::SoC;
+
+ use crate::{
+ default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
+ desrec::DesRec,
+ };
+
+ fn new_regex() -> Result<DefaultRegex<char>, ParseError> {
+ let parser = DefaultRegParser::<char>::default();
+
+ fn test_scanner<T: GraphLabel + Display + Debug>(
+ _parser: &DefaultRegParser<T>,
+ input: &str,
+ ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+ use ParseDirection::*;
+ use RegexType::*;
+
+ if let Some(first) = input.chars().next() {
+ match first {
+ '*' => Ok(Some((1, Kleene, Right))),
+ '+' => Ok(Some((1, Plus, Right))),
+ '?' => Ok(Some((1, Optional, Right))),
+ '|' => Ok(Some((1, Empty, Up))),
+ '(' => Ok(Some((1, Or, Down))),
+ ')' => Ok(Some((1, Paren, Up))),
+ ' '..='~' => Ok(Some((1, Lit(first), Right))),
+ _ => Err(ParseError::InvalidCharacter(first)),
+ }
+ } else {
+ Ok(None)
+ }
+ }
+
+ let input_string = "a*b?c+|(d*| +)?".to_owned();
+
+ Ok(
+ DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
+ .unwrap_or(Default::default())
+ .0,
+ )
+ }
+
+ #[test]
+ fn test_to_nfa() -> Result<(), Box<dyn std::error::Error>> {
+ let regex = new_regex()?;
+
+ println!("regex = {regex}");
+
+ let nfa: DefaultNFA<DOption<char>> =
+ DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?;
+
+ nfa.print_viz("nfa.gv")?;
+
+ Ok(())
}
}