From 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 Mon Sep 17 00:00:00 2001
From: JSDurand <mmemmew@gmail.com>
Date: Fri, 23 Dec 2022 00:36:31 +0800
Subject: renaming core to chain and some other changes

Some changes:

- The core crate is renamed to "chain".

- The crate "viz" is added, which will provide layered graph drawing
  algorithms.

- A function is added to convert from a grammar to the regular
  language of its left-linear closures.

- A function is added to convert from a nondeterministic finite
  automaton to its "null" closure.  A null closure is the same
  automaton with edges added, as if some edges are "null".  Whether an
  edge is null is determined by a function.
  Combined with the previous change, we can convert a grammar to the
  regular language of the null closure of its left-linear closures.

---

Now it remains to test more grammars and add an Atom trait, before
finishing the part about compilations.
---
 nfa/src/default/nfa.rs | 374 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 355 insertions(+), 19 deletions(-)

(limited to 'nfa/src/default/nfa.rs')
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index 3c2bd83..229ba4d 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -1,25 +1,26 @@
 //! This file provides a default implementation of NFA.
 
-// TODO: Store the regular expression in the NFA as well.
-//
-// The current focus of the project is to understand the growth rate
-// of the algorithm, to know whether I made a mistake in the previous
-// iteration of the implementation, or the algorithm is not as fast as
-// the author estimated, which is not quite likely, of course.
+// TODO: The current focus of the project is to understand the growth
+// rate of the algorithm, to know whether I made a mistake in the
+// previous iteration of the implementation, or the algorithm is not
+// as fast as the author estimated, which is not quite likely, of
+// course.
 //
 // Thus I shall establish a friendly interface that allows me to view
 // and debug the atomic languages and the languages, transparently.
 
-use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph};
+use graph::{
+    builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel,
+    LabelGraph,
+};
 
-use crate::{error::Error, Nfa, Regex};
+use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
 
-use std::fmt::Display;
+use core::fmt::{Debug, Display};
 
-#[non_exhaustive]
-#[derive(Debug)]
 /// Default NFA implementation.
-pub struct DefaultNFA<T: GraphLabel + Display> {
+#[derive(Debug, Clone)]
+pub struct DefaultNFA<T: GraphLabel> {
     graph: DLGraph<T>,
 }
 
@@ -63,6 +64,16 @@ impl<T: GraphLabel + Display> Graph for DefaultNFA<T> {
     fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
         self.graph.has_edge(source, target)
     }
+
+    #[inline]
+    fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> {
+        self.graph.print_viz(filename)
+    }
+
+    #[inline]
+    fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) {
+        unimplemented!()
+    }
 }
 
 impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
@@ -70,11 +81,10 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
 
     type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a;
 
-    // TODO: Return the label from the contained regular language.
     #[inline]
     fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> {
         if self.has_node(node_id) {
-            todo!()
+            unimplemented!()
         } else {
             Err(GError::IndexOutOfBounds(node_id, self.nodes_len()))
         }
@@ -98,12 +108,232 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
     fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> {
         self.graph.labels_of(node_id)
     }
+
+    #[inline]
+    fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result<bool, GError> {
+        self.graph.has_edge_label(node_id, label, target)
+    }
 }
 
 impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
-    #[allow(unused)]
-    fn to_nfa(regex: impl Regex<T>) -> Self {
-        todo!()
+    type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
+
+    fn to_nfa(
+        regexps: &[impl Regex<RegexType<T>>],
+        sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+    ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+        let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
+
+        if total_regexps_len == 0 {
+            return Ok(Default::default());
+        }
+
+        // We reserve two more rooms for later uses.
+        let nfa_len = total_regexps_len * 2 + 2;
+
+        let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
+
+        // Since DOption<T> implements Copy when T does, we can use a
+        // variable to hold the empty label to avoid repetitions.
+        let empty_label: DOption<T> = Default::default();
+
+        for _ in 0..nfa_len {
+            builder.add_vertex();
+        }
+
+        let accumulators: Vec<usize> = {
+            let mut result = Vec::with_capacity(regexps.len() + 1);
+
+            result.push(0);
+
+            let mut accumulator = 0;
+
+            for regexp in regexps.iter() {
+                accumulator += regexp.nodes_len() * 2;
+                result.push(accumulator);
+            }
+
+            result.pop();
+
+            result
+        };
+
+        assert_eq!(accumulators.len(), regexps.len());
+
+        /// Get offset from `accumulators` safely.
+        macro_rules! get_offset_safe (
+            ($num:expr) => {
+                   *accumulators.get($num).ok_or(Error::UnknownNode($num))?
+            }
+        );
+
+        /// Get offset from `accumulators` without checking bounds.
+        macro_rules! get_offset_unsafe (
+            ($num:expr) => {
+                { unsafe { *accumulators.get_unchecked($num) } }
+            }
+        );
+
+        for (index, regex) in regexps.iter().enumerate() {
+            let root = if let Some(root) = regex.root() {
+                root
+            } else {
+                // A regular expression without roots is empty, so we
+                // skip it.
+                continue;
+            };
+
+            let regex_len = regex.nodes_len();
+
+            // It is safe here to call `get_offset_unsafe`, as `index`
+            // is guaranteed to be strictly less than the length of
+            // `accumulators` by an assertion above.
+            let offset = get_offset_unsafe!(index);
+
+            let mut stack: Vec<usize> = Vec::with_capacity(regex_len);
+
+            stack.push(root);
+
+            while let Some(top_index) = stack.pop() {
+                let top_label = regex.vertex_label(top_index)?;
+
+                let nfa_start = offset + 2 * top_index;
+                let nfa_end = offset + 2 * top_index + 1;
+
+                match top_label {
+                    RegexType::Kleene => {
+                        builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+
+                            builder.add_edge(nfa_end, nfa_start, empty_label)?;
+                        }
+                    }
+                    RegexType::Plus => {
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+
+                            builder.add_edge(nfa_end, nfa_start, empty_label)?;
+                        } else {
+                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Optional => {
+                        builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Or => {
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(nfa_start, child_start, empty_label)?;
+                                builder.add_edge(child_end, nfa_end, empty_label)?;
+                            }
+                        } else {
+                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Paren => {
+                        // Ignore Paren nodes since currently these
+                        // are used only for printing purposes.
+                    }
+                    RegexType::Empty => {
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+                        } else {
+                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Lit(value) => {
+                        // The children would be ignored even if for
+                        // some reason this literal node had some
+                        // children.
+
+                        match sub_pred(value)? {
+                            SoC::Sub(sub_non) => {
+                                // a non-terminal
+
+                                let sub_offset = get_offset_safe!(sub_non);
+                                let sub_nfa_start = sub_offset + 2 * sub_non;
+                                let sub_nfa_end = sub_offset + 2 * sub_non + 1;
+
+                                builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
+                                builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
+                            }
+                            SoC::Carry(new_value) => {
+                                // a terminal
+
+                                builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        let graph = builder.build();
+
+        Ok(DefaultNFA { graph })
     }
 
     fn remove_epsilon(&mut self) -> Result<(), Error> {
@@ -114,7 +344,113 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         todo!()
     }
 
-    fn nulling(&mut self) -> Result<(), Error> {
-        todo!()
+    fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
+        let mut updated = true;
+        let mut builder = self.graph.builder_ref();
+
+        while updated {
+            updated = false;
+
+            let mut nullable = false;
+
+            let mut to_add = Vec::new();
+
+            for (source, target) in builder.edges() {
+                for label in builder.edge_label(source, target)? {
+                    if f(label) {
+                        nullable = true;
+
+                        break;
+                    }
+                }
+
+                if nullable {
+                    for (label, child_iter) in builder.labels_of(target)? {
+                        for child in child_iter {
+                            if !builder.has_edge_label(source, label, child)? {
+                                updated = true;
+
+                                to_add.push((source, child, *label));
+                            }
+                        }
+                    }
+                }
+            }
+
+            for (source, child, label) in to_add {
+                builder.add_edge(source, child, label)?;
+            }
+        }
+
+        self.graph.replace_by_builder(builder);
+
+        Ok(())
+    }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        Debug::fmt(self, f)
+    }
+}
+
+#[cfg(test)]
+mod test_to_nfa {
+    #![allow(unused_imports)]
+    use super::*;
+    use crate::SoC;
+
+    use crate::{
+        default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
+        desrec::DesRec,
+    };
+
+    fn new_regex() -> Result<DefaultRegex<char>, ParseError> {
+        let parser = DefaultRegParser::<char>::default();
+
+        fn test_scanner<T: GraphLabel + Display + Debug>(
+            _parser: &DefaultRegParser<T>,
+            input: &str,
+        ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+            use ParseDirection::*;
+            use RegexType::*;
+
+            if let Some(first) = input.chars().next() {
+                match first {
+                    '*' => Ok(Some((1, Kleene, Right))),
+                    '+' => Ok(Some((1, Plus, Right))),
+                    '?' => Ok(Some((1, Optional, Right))),
+                    '|' => Ok(Some((1, Empty, Up))),
+                    '(' => Ok(Some((1, Or, Down))),
+                    ')' => Ok(Some((1, Paren, Up))),
+                    ' '..='~' => Ok(Some((1, Lit(first), Right))),
+                    _ => Err(ParseError::InvalidCharacter(first)),
+                }
+            } else {
+                Ok(None)
+            }
+        }
+
+        let input_string = "a*b?c+|(d*| +)?".to_owned();
+
+        Ok(
+            DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
+                .unwrap_or(Default::default())
+                .0,
+        )
+    }
+
+    #[test]
+    fn test_to_nfa() -> Result<(), Box<dyn std::error::Error>> {
+        let regex = new_regex()?;
+
+        println!("regex = {regex}");
+
+        let nfa: DefaultNFA<DOption<char>> =
+            DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?;
+
+        nfa.print_viz("nfa.gv")?;
+
+        Ok(())
     }
 }
-- 
cgit v1.2.3-18-g5258