renaming core to chain and some other changes

Some changes: - The core crate is renamed to "chain". - The crate "viz" is added, which will provide layered graph drawing algorithms. - A function is added to convert from a grammar to the regular language of its left-linear closures. - A function is added to convert from a nondeterministic finite automaton to its "null" closure. A null closure is the same automaton with edges added, as if some edges are "null". Whether an edge is null is determined by a function. Combined with the previous change, we can convert a grammar to the regular language of the null closure of its left-linear closures. --- Now it remains to test more grammars and add an Atom trait, before finishing the part about compilations.
author: JSDurand <mmemmew@gmail.com> 2022-12-23 00:36:31 +0800
committer: JSDurand <mmemmew@gmail.com> 2022-12-23 00:36:31 +0800
commit: 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (patch)
tree: 343eea3c634efbbf72c64ed5dd778ecce60c3eea /nfa/src
parent: 9f1c88b863e247da3cd60d2792a7a13b18e25e53 (diff)
5 files changed, 743 insertions, 71 deletions
diff --git a/nfa/src/default/nfa.rs b/nfa/src/default/nfa.rs
index 3c2bd83..229ba4d 100644
--- a/nfa/src/default/nfa.rs
+++ b/nfa/src/default/nfa.rs
@@ -1,25 +1,26 @@
 //! This file provides a default implementation of NFA.
 
-// TODO: Store the regular expression in the NFA as well.
-//
-// The current focus of the project is to understand the growth rate
-// of the algorithm, to know whether I made a mistake in the previous
-// iteration of the implementation, or the algorithm is not as fast as
-// the author estimated, which is not quite likely, of course.
+// TODO: The current focus of the project is to understand the growth
+// rate of the algorithm, to know whether I made a mistake in the
+// previous iteration of the implementation, or the algorithm is not
+// as fast as the author estimated, which is not quite likely, of
+// course.
 //
 // Thus I shall establish a friendly interface that allows me to view
 // and debug the atomic languages and the languages, transparently.
 
-use graph::{error::Error as GError, DLGraph, Graph, GraphLabel, LabelGraph};
+use graph::{
+    builder::Builder, error::Error as GError, labelled::DLGBuilder, DLGraph, Graph, GraphLabel,
+    LabelGraph,
+};
 
-use crate::{error::Error, Nfa, Regex};
+use crate::{default::regex::RegexType, error::Error, DOption, Nfa, Regex, SoC};
 
-use std::fmt::Display;
+use core::fmt::{Debug, Display};
 
-#[non_exhaustive]
-#[derive(Debug)]
 /// Default NFA implementation.
-pub struct DefaultNFA<T: GraphLabel + Display> {
+#[derive(Debug, Clone)]
+pub struct DefaultNFA<T: GraphLabel> {
     graph: DLGraph<T>,
 }
 
@@ -63,6 +64,16 @@ impl<T: GraphLabel + Display> Graph for DefaultNFA<T> {
     fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
         self.graph.has_edge(source, target)
     }
+
+    #[inline]
+    fn print_viz(&self, filename: &str) -> Result<(), std::io::Error> {
+        self.graph.print_viz(filename)
+    }
+
+    #[inline]
+    fn replace_by_builder(&mut self, _builder: impl Builder<Result = Self>) {
+        unimplemented!()
+    }
 }
 
 impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
@@ -70,11 +81,10 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
 
     type LabelIter<'a> = <DLGraph<T> as LabelGraph<T>>::LabelIter<'a> where T: 'a;
 
-    // TODO: Return the label from the contained regular language.
     #[inline]
     fn vertex_label(&self, node_id: usize) -> Result<Option<T>, GError> {
         if self.has_node(node_id) {
-            todo!()
+            unimplemented!()
         } else {
             Err(GError::IndexOutOfBounds(node_id, self.nodes_len()))
         }
@@ -98,12 +108,232 @@ impl<T: GraphLabel + Display> LabelGraph<T> for DefaultNFA<T> {
     fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GError> {
         self.graph.labels_of(node_id)
     }
+
+    #[inline]
+    fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result<bool, GError> {
+        self.graph.has_edge_label(node_id, label, target)
+    }
 }
 
 impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
-    #[allow(unused)]
-    fn to_nfa(regex: impl Regex<T>) -> Self {
-        todo!()
+    type FromRegex<S: GraphLabel + Display + Default> = DefaultNFA<S>;
+
+    fn to_nfa(
+        regexps: &[impl Regex<RegexType<T>>],
+        sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+    ) -> Result<Self::FromRegex<DOption<T>>, Error> {
+        let total_regexps_len: usize = regexps.iter().map(Graph::nodes_len).sum();
+
+        if total_regexps_len == 0 {
+            return Ok(Default::default());
+        }
+
+        // We reserve two more rooms for later uses.
+        let nfa_len = total_regexps_len * 2 + 2;
+
+        let mut builder: DLGBuilder<DOption<T>> = Builder::with_capacity(nfa_len);
+
+        // Since DOption<T> implements Copy when T does, we can use a
+        // variable to hold the empty label to avoid repetitions.
+        let empty_label: DOption<T> = Default::default();
+
+        for _ in 0..nfa_len {
+            builder.add_vertex();
+        }
+
+        let accumulators: Vec<usize> = {
+            let mut result = Vec::with_capacity(regexps.len() + 1);
+
+            result.push(0);
+
+            let mut accumulator = 0;
+
+            for regexp in regexps.iter() {
+                accumulator += regexp.nodes_len() * 2;
+                result.push(accumulator);
+            }
+
+            result.pop();
+
+            result
+        };
+
+        assert_eq!(accumulators.len(), regexps.len());
+
+        /// Get offset from `accumulators` safely.
+        macro_rules! get_offset_safe (
+            ($num:expr) => {
+                   *accumulators.get($num).ok_or(Error::UnknownNode($num))?
+            }
+        );
+
+        /// Get offset from `accumulators` without checking bounds.
+        macro_rules! get_offset_unsafe (
+            ($num:expr) => {
+                { unsafe { *accumulators.get_unchecked($num) } }
+            }
+        );
+
+        for (index, regex) in regexps.iter().enumerate() {
+            let root = if let Some(root) = regex.root() {
+                root
+            } else {
+                // A regular expression without roots is empty, so we
+                // skip it.
+                continue;
+            };
+
+            let regex_len = regex.nodes_len();
+
+            // It is safe here to call `get_offset_unsafe`, as `index`
+            // is guaranteed to be strictly less than the length of
+            // `accumulators` by an assertion above.
+            let offset = get_offset_unsafe!(index);
+
+            let mut stack: Vec<usize> = Vec::with_capacity(regex_len);
+
+            stack.push(root);
+
+            while let Some(top_index) = stack.pop() {
+                let top_label = regex.vertex_label(top_index)?;
+
+                let nfa_start = offset + 2 * top_index;
+                let nfa_end = offset + 2 * top_index + 1;
+
+                match top_label {
+                    RegexType::Kleene => {
+                        builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+
+                            builder.add_edge(nfa_end, nfa_start, empty_label)?;
+                        }
+                    }
+                    RegexType::Plus => {
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+
+                            builder.add_edge(nfa_end, nfa_start, empty_label)?;
+                        } else {
+                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Optional => {
+                        builder.add_edge(nfa_start, nfa_end, empty_label)?;
+
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Or => {
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(nfa_start, child_start, empty_label)?;
+                                builder.add_edge(child_end, nfa_end, empty_label)?;
+                            }
+                        } else {
+                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Paren => {
+                        // Ignore Paren nodes since currently these
+                        // are used only for printing purposes.
+                    }
+                    RegexType::Empty => {
+                        let mut source = nfa_start;
+
+                        if !regex.is_empty_node(top_index)? {
+                            for child in regex.children_of(top_index)? {
+                                stack.push(child);
+
+                                let child_start = offset + 2 * child;
+                                let child_end = offset + 2 * child + 1;
+
+                                builder.add_edge(source, child_start, empty_label)?;
+
+                                source = child_end;
+                            }
+
+                            builder.add_edge(source, nfa_end, empty_label)?;
+                        } else {
+                            builder.add_edge(nfa_start, nfa_end, empty_label)?;
+                        }
+                    }
+                    RegexType::Lit(value) => {
+                        // The children would be ignored even if for
+                        // some reason this literal node had some
+                        // children.
+
+                        match sub_pred(value)? {
+                            SoC::Sub(sub_non) => {
+                                // a non-terminal
+
+                                let sub_offset = get_offset_safe!(sub_non);
+                                let sub_nfa_start = sub_offset + 2 * sub_non;
+                                let sub_nfa_end = sub_offset + 2 * sub_non + 1;
+
+                                builder.add_edge(nfa_start, sub_nfa_start, empty_label)?;
+                                builder.add_edge(sub_nfa_end, nfa_end, empty_label)?;
+                            }
+                            SoC::Carry(new_value) => {
+                                // a terminal
+
+                                builder.add_edge(nfa_start, nfa_end, DOption(Some(new_value)))?;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        let graph = builder.build();
+
+        Ok(DefaultNFA { graph })
     }
 
     fn remove_epsilon(&mut self) -> Result<(), Error> {
@@ -114,7 +344,113 @@ impl<T: GraphLabel + Display> Nfa<T> for DefaultNFA<T> {
         todo!()
     }
 
-    fn nulling(&mut self) -> Result<(), Error> {
-        todo!()
+    fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error> {
+        let mut updated = true;
+        let mut builder = self.graph.builder_ref();
+
+        while updated {
+            updated = false;
+
+            let mut nullable = false;
+
+            let mut to_add = Vec::new();
+
+            for (source, target) in builder.edges() {
+                for label in builder.edge_label(source, target)? {
+                    if f(label) {
+                        nullable = true;
+
+                        break;
+                    }
+                }
+
+                if nullable {
+                    for (label, child_iter) in builder.labels_of(target)? {
+                        for child in child_iter {
+                            if !builder.has_edge_label(source, label, child)? {
+                                updated = true;
+
+                                to_add.push((source, child, *label));
+                            }
+                        }
+                    }
+                }
+            }
+
+            for (source, child, label) in to_add {
+                builder.add_edge(source, child, label)?;
+            }
+        }
+
+        self.graph.replace_by_builder(builder);
+
+        Ok(())
+    }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultNFA<T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        Debug::fmt(self, f)
+    }
+}
+
+#[cfg(test)]
+mod test_to_nfa {
+    #![allow(unused_imports)]
+    use super::*;
+    use crate::SoC;
+
+    use crate::{
+        default::regex::{DefaultRegParser, DefaultRegex, ParseDirection, ParseError, RegexType},
+        desrec::DesRec,
+    };
+
+    fn new_regex() -> Result<DefaultRegex<char>, ParseError> {
+        let parser = DefaultRegParser::<char>::default();
+
+        fn test_scanner<T: GraphLabel + Display + Debug>(
+            _parser: &DefaultRegParser<T>,
+            input: &str,
+        ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+            use ParseDirection::*;
+            use RegexType::*;
+
+            if let Some(first) = input.chars().next() {
+                match first {
+                    '*' => Ok(Some((1, Kleene, Right))),
+                    '+' => Ok(Some((1, Plus, Right))),
+                    '?' => Ok(Some((1, Optional, Right))),
+                    '|' => Ok(Some((1, Empty, Up))),
+                    '(' => Ok(Some((1, Or, Down))),
+                    ')' => Ok(Some((1, Paren, Up))),
+                    ' '..='~' => Ok(Some((1, Lit(first), Right))),
+                    _ => Err(ParseError::InvalidCharacter(first)),
+                }
+            } else {
+                Ok(None)
+            }
+        }
+
+        let input_string = "a*b?c+|(d*| +)?".to_owned();
+
+        Ok(
+            DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
+                .unwrap_or(Default::default())
+                .0,
+        )
+    }
+
+    #[test]
+    fn test_to_nfa() -> Result<(), Box<dyn std::error::Error>> {
+        let regex = new_regex()?;
+
+        println!("regex = {regex}");
+
+        let nfa: DefaultNFA<DOption<char>> =
+            DefaultNFA::to_nfa(&[regex], |label| Ok(SoC::Carry(label)))?;
+
+        nfa.print_viz("nfa.gv")?;
+
+        Ok(())
     }
 }
diff --git a/nfa/src/default/regex.rs b/nfa/src/default/regex.rs
index a60f801..2670e32 100644
--- a/nfa/src/default/regex.rs
+++ b/nfa/src/default/regex.rs
@@ -6,7 +6,11 @@ use crate::{desrec::DesRec, error::Error, Regex};
 
 use receme::{algebra::Algebra, catana::Cata};
 
-use std::fmt::{Debug, Display};
+use std::{
+    collections::HashMap,
+    fmt::{Debug, Display, Write},
+    marker::PhantomData,
+};
 
 /// The type of a node in a regular expression.
 ///
@@ -25,7 +29,7 @@ use std::fmt::{Debug, Display};
 /// This is possible because a regular expression has a root node.
 /// For the sake of convenience, the root node has type "Or".
 #[derive(Debug, Hash, Default, Eq, PartialEq, Clone, Copy, Ord, PartialOrd)]
-pub enum RegexType<T: GraphLabel + Display> {
+pub enum RegexType<T: GraphLabel> {
     /// A star node is a node with an arbitrary number of repetitions.
     Kleene,
     /// A plus node is a node with at least one repetition: a+ equals
@@ -44,13 +48,31 @@ pub enum RegexType<T: GraphLabel + Display> {
     Lit(T),
 }
 
+impl<T: GraphLabel> Display for RegexType<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RegexType::Kleene => write!(f, "*"),
+            RegexType::Plus => write!(f, "+"),
+            RegexType::Optional => write!(f, "?"),
+            RegexType::Or => write!(f, "|"),
+            RegexType::Paren => write!(f, "()"),
+            RegexType::Empty => write!(f, "empty"),
+            RegexType::Lit(value) => write!(f, "{value}"),
+        }
+    }
+}
+
 /// A default implementation of regular expressions.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct DefaultRegex<T: GraphLabel + Display> {
     /// The underlying graph is stored using adjacency lists.
     graph: ALGraph,
     /// The types of the underlying nodes.
     types: Vec<RegexType<T>>,
+    /// The root of the graph.
+    ///
+    /// If it is None, it indicates the regular expression is empty.
+    root: Option<usize>,
 }
 
 impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
@@ -58,11 +80,39 @@ impl<T: GraphLabel + Display> Default for DefaultRegex<T> {
         Self {
             graph: Default::default(),
             types: Default::default(),
+            root: Default::default(),
         }
     }
 }
 
-impl<T: GraphLabel + Display> DefaultRegex<T> {
+impl<T: GraphLabel> DefaultRegex<T> {
+    /// Set the root of the regular expression.
+    #[inline]
+    pub fn set_root(&mut self, root: Option<usize>) {
+        self.root = root;
+    }
+
+    /// Construct a regular expression from a raw adjacency list graph
+    /// and an array of labels.
+    ///
+    /// # Error
+    ///
+    /// If the graph contains cycles, this function will return an
+    /// error.  This check is added to make sure that the regular
+    /// expression contains no cycles, otherwise operations with
+    /// regular expressions might enter infinite loops later.
+    pub fn new(graph: ALGraph, types: Vec<RegexType<T>>) -> Result<Self, Error> {
+        if graph.contains_cycles()? {
+            Err(Error::Cycle)
+        } else {
+            Ok(Self {
+                graph,
+                types,
+                root: Some(0),
+            })
+        }
+    }
+
     /// Return the number of elements in this regular expression,
     /// counting nodes.
     pub fn len(&self) -> usize {
@@ -86,10 +136,65 @@ impl<T: GraphLabel + Display> DefaultRegex<T> {
 
         Ok(())
     }
+
+    /// Return the internal graph.
+    ///
+    /// Normally this should not be used.
+    pub fn internal_graph(&self) -> ALGraph {
+        self.graph.clone()
+    }
+
+    /// Return the internal types array.
+    ///
+    /// Normally this should not be used.
+    pub fn internal_types(&self) -> Vec<RegexType<T>> {
+        self.types.clone()
+    }
+
+    /// Return the array of parents.
+    ///
+    /// The element at index N of the returned array is the parent of
+    /// that N-th element.  If an element has no parents, a `None` is
+    /// placed at its place.
+    ///
+    /// # Error
+    ///
+    /// If some edge points to an invalid node, an erro will be
+    /// returned.
+    ///
+    /// Also, if the graph contains cycles, an error is also returned.
+    pub fn parents_array(&self) -> Result<Vec<Option<(usize, usize)>>, Error> {
+        if self.graph.contains_cycles().map_err(|e| match e {
+            GError::IndexOutOfBounds(n, _) => Error::UnknownNode(n),
+            _ => unreachable!(),
+        })? {
+            return Err(Error::Cycle);
+        }
+
+        let len = self.len();
+
+        let mut result: Vec<_> = std::iter::repeat_with(|| None).take(len).collect();
+
+        for source in self.graph.nodes() {
+            for (index, target) in self
+                .graph
+                .children_of(source)
+                .map_err(|_| Error::UnknownNode(source))?
+                .enumerate()
+            {
+                result
+                    .get_mut(target)
+                    .ok_or(Error::UnknownNode(target))?
+                    .replace((source, index));
+            }
+        }
+
+        Ok(result)
+    }
 }
 
 // REVIEW: This may not be needed.
-impl<S: GraphLabel + Display, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
+impl<S: GraphLabel, T, A> Cata<T, Vec<T>, A> for &DefaultRegex<S>
 where
     A: Algebra<T, Vec<T>>,
 {
@@ -118,8 +223,13 @@ where
     }
 }
 
-impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+impl<T: GraphLabel> DefaultRegex<T> {
+    /// Use a function to format the labels of the regular expressions
+    /// and format the entire regular expression with this aid.
+    pub fn to_string_with<F>(&self, mut f: F) -> Result<String, std::fmt::Error>
+    where
+        F: FnMut(T) -> String,
+    {
         #[derive(Copy, Clone)]
         enum StackElement {
             Seen(usize),
@@ -134,16 +244,15 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                 }
             }
 
-            fn is_seen(&self) -> bool {
-                match self {
-                    Seen(_) => true,
-                    Unseen(_) => false,
-                }
+            fn is_seen(self) -> bool {
+                matches!(self, Seen(_))
             }
         }
 
         use StackElement::{Seen, Unseen};
 
+        let mut result = String::new();
+
         let mut stack: Vec<StackElement> = Vec::new();
         let mut types = self.types.clone();
         types.push(RegexType::Paren);
@@ -159,6 +268,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                 RegexType::Kleene => {
                     if !top.is_seen() {
                         stack.push(Seen(top.index()));
+
+                        if self.degree(top.index()).unwrap() > 1 {
+                            write!(result, "(")?;
+                            stack.push(Unseen(types.len() - 1));
+                        }
+
                         stack.extend(
                             self.graph
                                 .children_of(top.index())
@@ -167,7 +282,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                                 .rev(),
                         );
                     } else {
-                        write!(f, "*")?;
+                        write!(result, "*")?;
                     }
                 }
                 RegexType::Plus => {
@@ -181,7 +296,7 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                                 .rev(),
                         );
                     } else {
-                        write!(f, "+")?;
+                        write!(result, "+")?;
                     }
                 }
                 RegexType::Optional => {
@@ -195,12 +310,12 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                                 .rev(),
                         );
                     } else {
-                        write!(f, "?")?;
+                        write!(result, "?")?;
                     }
                 }
                 RegexType::Or => {
                     if !top.is_seen() {
-                        write!(f, "(")?;
+                        write!(result, "(")?;
 
                         let len = self.len();
 
@@ -221,11 +336,11 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                             }
                         }
                     } else {
-                        write!(f, "|")?;
+                        write!(result, "|")?;
                     }
                 }
                 RegexType::Paren => {
-                    write!(f, ")")?;
+                    write!(result, ")")?;
                 }
                 RegexType::Empty => {
                     stack.extend(
@@ -236,11 +351,17 @@ impl<T: GraphLabel + Display> Display for DefaultRegex<T> {
                             .rev(),
                     );
                 }
-                RegexType::Lit(label) => write!(f, "{label}")?,
+                RegexType::Lit(label) => write!(result, "{}", f(label))?,
             }
         }
 
-        Ok(())
+        Ok(result)
+    }
+}
+
+impl<T: GraphLabel + Display + Debug> Display for DefaultRegex<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.to_string_with(|t| format!("{t}"))?)
     }
 }
 
@@ -278,9 +399,20 @@ impl<T: GraphLabel + Display> Graph for DefaultRegex<T> {
     fn has_edge(&self, source: usize, target: usize) -> Result<bool, GError> {
         self.graph.has_edge(source, target)
     }
+
+    #[inline]
+    fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) {
+        unimplemented!()
+    }
 }
 
-impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
+impl<T: GraphLabel + Display + Debug> Regex<RegexType<T>> for DefaultRegex<T> {
+    /// Return the root of the regular expression.
+    #[inline]
+    fn root(&self) -> Option<usize> {
+        self.root
+    }
+
     #[inline]
     fn vertex_label(&self, node_id: usize) -> Result<RegexType<T>, Error> {
         self.types
@@ -291,8 +423,10 @@ impl<T: GraphLabel + Display> Regex<RegexType<T>> for DefaultRegex<T> {
 }
 
 /// An error type for holding parsing errors.
-#[derive(Debug)]
+#[derive(Debug, Copy, Clone)]
 pub enum ParseError {
+    /// A cycle is encountered.
+    Cycle,
     /// Encounter an invalid state.
     Invalid,
     /// An error from graph operations.
@@ -354,23 +488,77 @@ impl Display for ParseDirection {
     }
 }
 
-impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
+/// A default recursive descent parser for regular expressions of
+/// terminals or non-terminals.
+#[derive(Debug, Clone)]
+pub struct DefaultRegParser<T: GraphLabel + Display> {
+    ter_map: HashMap<String, usize>,
+    non_map: HashMap<String, usize>,
+    _phantom: PhantomData<T>,
+}
+
+impl<T: GraphLabel + Display> DefaultRegParser<T> {
+    /// Query if a terminal or a non-terminal is already found.
+    ///
+    /// If found, return the associated index of the terminal or
+    /// non-terminal.
+    pub fn query(&self, tnt: &str, terminal_p: bool) -> Option<usize> {
+        if terminal_p {
+            self.ter_map.get(tnt).copied()
+        } else {
+            self.non_map.get(tnt).copied()
+        }
+    }
+
+    /// Add a terminal or a non-terminal.
+    pub fn add_tnt(&mut self, tnt: &str, terminal_p: bool) {
+        if terminal_p {
+            let ter_len = self.ter_map.len();
+
+            self.ter_map.entry(tnt.to_owned()).or_insert(ter_len);
+        } else {
+            let non_len = self.non_map.len();
+
+            self.non_map.entry(tnt.to_owned()).or_insert(non_len);
+        }
+    }
+}
+
+impl<T: GraphLabel + Display> Default for DefaultRegParser<T> {
+    fn default() -> Self {
+        Self {
+            ter_map: Default::default(),
+            non_map: Default::default(),
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegParser<T> {
     type Label = RegexType<T>;
 
-    type Regex = Self;
+    type Regex = DefaultRegex<T>;
 
     type Error = ParseError;
 
-    type Aux = ParseDirection;
+    type Inter = ParseDirection;
 
-    type Scanner<'a> =
-        Box<dyn FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>>;
+    type Scanner<'a, 'b> = Box<
+        dyn FnMut(
+            &'b Self,
+            &'a str,
+        ) -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>,
+    > where RegexType<T>:'b;
 
-    fn parse<'a>(
+    fn parse<'a, 'b>(
+        &'b self,
         mut input: &'a str,
-        mut scanner: Self::Scanner<'a>,
+        mut scanner: Self::Scanner<'a, 'b>,
         post_p: bool,
-    ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error> {
+    ) -> Result<Option<(DefaultRegex<T>, &'a str)>, Self::Error>
+    where
+        Self::Label: 'b,
+    {
         use ParseDirection::*;
         use RegexType::*;
 
@@ -380,7 +568,7 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
         // Classifies the input into a sequence of tokens with
         // directions.
         while !input.is_empty() {
-            if let Some((len, label, direction)) = scanner(input)? {
+            if let Some((len, label, direction)) = scanner(self, input)? {
                 if len == 0 {
                     break;
                 }
@@ -523,7 +711,12 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
 
         let graph = list_of_children.into();
 
-        let result = DefaultRegex { graph, types };
+        // Check there are no cycles
+        let result = DefaultRegex::new(graph, types).map_err(|e| match e {
+            Error::Graph(ge) => ParseError::Graph(ge),
+            Error::Cycle => ParseError::Cycle,
+            _ => unreachable!(),
+        })?;
 
         Ok(Some((result, input)))
     }
@@ -533,9 +726,17 @@ impl<T: GraphLabel + Display + Debug> DesRec for DefaultRegex<T> {
 mod test_des_rec {
     use super::*;
 
-    fn test_scanner(
-        input: &str,
-    ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError> {
+    use crate::desrec::DesRec;
+
+    #[allow(dead_code, unused)]
+    fn test_scanner<'a, 'b, T>(
+        parser: &'b DefaultRegParser<T>,
+        input: &'a str,
+    ) -> Result<Option<(usize, RegexType<char>, ParseDirection)>, ParseError>
+    where
+        T: GraphLabel + Display + Debug,
+        T: 'b,
+    {
         use ParseDirection::*;
         use RegexType::*;
 
@@ -557,19 +758,57 @@ mod test_des_rec {
 
     #[test]
     fn test_des_rec() -> Result<(), Box<dyn std::error::Error>> {
-        let input_string = "a*b?c+|(d*| +)?".to_owned();
+        let input_string = "(ade)*b?c+|(d*| +)?".to_owned();
+
+        let parser: DefaultRegParser<char> = Default::default();
 
         if let Some((regex, remain)) =
-            DefaultRegex::<char>::parse(&input_string, Box::new(test_scanner), true)?
+            DefaultRegParser::<char>::parse(&parser, &input_string, Box::new(test_scanner), true)?
         {
             println!("regex = {regex}");
             println!("remain = {remain}");
 
             println!("regex length = {}", regex.len());
 
+            let parents = regex.parents_array()?;
+
+            println!("parents = {parents:?}");
+
             Ok(())
         } else {
             unreachable!()
         }
     }
+
+    #[test]
+    fn test_display() -> Result<(), Box<dyn std::error::Error>> {
+        use graph::builder::Builder;
+        use RegexType::*;
+
+        let mut builder = graph::adlist::ALGBuilder::default();
+        let mut types: Vec<RegexType<usize>> = Vec::with_capacity(4);
+
+        types.push(Kleene);
+        builder.add_vertex();
+
+        types.push(Lit(0));
+        builder.add_vertex();
+        builder.add_edge(0, 1, ())?;
+
+        types.push(Lit(1));
+        builder.add_vertex();
+        builder.add_edge(0, 2, ())?;
+
+        types.push(Lit(2));
+        builder.add_vertex();
+        builder.add_edge(0, 3, ())?;
+
+        let graph = builder.build();
+
+        let regex = DefaultRegex::new(graph, types)?;
+
+        println!("regex = {regex}");
+
+        Ok(())
+    }
 }
diff --git a/nfa/src/desrec.rs b/nfa/src/desrec.rs
index c57d313..ac45c2d 100644
--- a/nfa/src/desrec.rs
+++ b/nfa/src/desrec.rs
@@ -20,8 +20,8 @@ pub trait DesRec {
     /// The type of errors encountered when parsing.
     type Error: std::error::Error;
 
-    /// Auxiliary data when parsing
-    type Aux;
+    /// Intermediate data when parsing
+    type Inter;
 
     /// The type of a scanner that classifies inputs into tokens.
     ///
@@ -34,7 +34,14 @@ pub trait DesRec {
     ///
     /// - `Ok(Some(_))`: the classification succeeds and the parsing
     /// should continue.
-    type Scanner<'a>: FnMut(&'a str) -> Result<Option<(usize, Self::Label, Self::Aux)>, Self::Error>;
+    type Scanner<'a, 'b>: FnMut(
+        &'b Self,
+        &'a str,
+    )
+        -> Result<Option<(usize, Self::Label, Self::Inter)>, Self::Error>
+    where
+        Self: 'b,
+        Self::Label: 'b;
 
     /// Parse a string into a regular expression with the aid of this
     /// type.
@@ -42,9 +49,12 @@ pub trait DesRec {
     /// Accept a slice of string and return either a parsing error, or
     /// a pair of correctly parsed regular expression and the
     /// remaining slice.
-    fn parse<'a>(
+    fn parse<'a, 'b>(
+        &'b self,
         input: &'a str,
-        scanner: Self::Scanner<'a>,
+        scanner: Self::Scanner<'a, 'b>,
         post_p: bool,
-    ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error>;
+    ) -> Result<ParseOutput<'a, Self::Regex>, Self::Error>
+    where
+        Self::Label: 'b;
 }
diff --git a/nfa/src/error.rs b/nfa/src/error.rs
index 0c6bb3c..ad75077 100644
--- a/nfa/src/error.rs
+++ b/nfa/src/error.rs
@@ -2,10 +2,11 @@
 
 use graph::error::Error as GError;
 
-use std::fmt::{Display, Formatter};
+use core::fmt::{Display, Formatter};
 
-#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 /// The error type for NFA operations.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)]
+#[non_exhaustive]
 pub enum Error {
     /// An unknown node id is encountered.
     UnknownNode(usize),
@@ -13,8 +14,12 @@ pub enum Error {
     ///
     /// Everything is a trade-off, wink wink.
     UnsupportedOperation,
+    /// There is no root in the underlying regular expression.
+    NoRoot,
     /// This error comes from some underlying graph operation.
     Graph(GError),
+    /// A cycle is found when constructing regular expressions.
+    Cycle,
 }
 
 impl Display for Error {
@@ -23,6 +28,8 @@ impl Display for Error {
             Error::UnknownNode(id) => write!(f, "unknown node: {id}"),
             Error::UnsupportedOperation => write!(f, "unsupported operation"),
             Error::Graph(e) => write!(f, "graph error: {e}"),
+            Error::NoRoot => write!(f, "no root found"),
+            Error::Cycle => write!(f, "a cycle is found when constructing regular expressions"),
         }
     }
 }
diff --git a/nfa/src/lib.rs b/nfa/src/lib.rs
index 1e2a30c..b1d62b3 100644
--- a/nfa/src/lib.rs
+++ b/nfa/src/lib.rs
@@ -12,17 +12,23 @@ extern crate graph;
 
 use core::fmt::Display;
 
+use std::ops::{Deref, DerefMut};
+
 use graph::{Graph, GraphLabel, LabelGraph};
 
 use error::Error;
 
+pub use desrec::DesRec;
+
+use default::regex::RegexType;
+
 /// The expected behaviour of a regular language.
 ///
 /// Nondeterministic finite automata are equivalent to regular
 /// languages.  Since regular languages are easier to understand for a
 /// human being, nondeterministic finite automata include the data for
 /// the equivalent regular languages.
-pub trait Regex<T: GraphLabel>: Graph + Display {
+pub trait Regex<T: GraphLabel>: Graph + Display + Clone {
     /// Return the label of a vertex, or an error if the node is
     /// invalid.
     fn vertex_label(&self, node_id: usize) -> Result<T, Error>;
@@ -52,6 +58,65 @@ pub trait Regex<T: GraphLabel>: Graph + Display {
     // cetera.  These might be needed later.
 }
 
+/// Since Option<T> does not inherit the Display from T, we wrap it to
+/// provide an automatic implementation of Display.
+#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Hash)]
+pub struct DOption<T>(Option<T>);
+
+impl<T> Default for DOption<T> {
+    fn default() -> Self {
+        Self(None)
+    }
+}
+
+impl<T> Deref for DOption<T> {
+    type Target = Option<T>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> DerefMut for DOption<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl<T: Display> Display for DOption<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self.deref() {
+            Some(value) => Display::fmt(value, f),
+            None => write!(f, "ε"),
+        }
+    }
+}
+
+/// Substitute or Carry
+///
+/// This enumeration indicates whether a label from a regular
+/// expression should be substituted by another regular expression, or
+/// to be carried around in the resulting nondeterministic finite
+/// automaton, in the process of the [`to_nfa`][Nfa::to_nfa] function.
+///
+/// # Transform labels
+///
+/// The label that is returned to be carried can be different from the
+/// original label, as a way to transform the labels.
+///
+/// # Remark on the abbreviation
+///
+/// It happens "by chance" that this abbreviation coincides with the
+/// abbreviation of "system on chip".  Since I know nothing about this
+/// topic, this is just a meaningless pun.
+#[derive(Debug, Copy, Clone)]
+pub enum SoC<T> {
+    /// To be substituted by another regular expression.
+    Sub(usize),
+    /// To carry around this label.
+    Carry(T),
+}
+
 /// The expected behvaiour of a nondeterministic finite automaton.
 ///
 /// Every NFA is a special labelled graph.
@@ -68,9 +133,23 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
         Err(Error::UnsupportedOperation)
     }
 
-    /// Build a nondeterministic finite automaton out of a regular
-    /// language.
-    fn to_nfa(regex: impl Regex<T>) -> Self;
+    /// When we convert a regular expression to a nondeterministic
+    /// finite automaton, the label should be optional, so we use a
+    /// different type for the result type.
+    type FromRegex<S: GraphLabel + Display + Default>;
+
+    /// Build a nondeterministic finite automaton out of a set of
+    /// regular expressions.
+    ///
+    /// The SUB_PRED is a predicate function that returns an optional
+    /// index for a label.  If it returns some index, then this means
+    /// we should substitute the index-th regular expression in the
+    /// set, whereever that label occurs in the set of regular
+    /// expressions.
+    fn to_nfa(
+        regexps: &[impl Regex<RegexType<T>>],
+        sub_pred: impl Fn(T) -> Result<SoC<T>, Error>,
+    ) -> Result<Self::FromRegex<DOption<T>>, Error>;
 
     /// Remove all dead states from the nondeterministic finite
     /// automaton.
@@ -78,13 +157,14 @@ pub trait Nfa<T: GraphLabel>: LabelGraph<T> {
     /// A state is dead if there are no edges going to the state.
     fn remove_dead(&mut self) -> Result<(), Error>;
 
-    /// For each empty transition from A to B, and for every edge from
-    /// B to C, say, add an edge from A to C.
+    /// For each edge from A to B whose edge is considered nullable by
+    /// a function `f`, and for every edge from B to C, add an edge
+    /// from A to C.
     ///
     /// This is needed specifically by the algorithm to produce a set
     /// of atomic languages that represent "null-closed" derived
     /// languages.
-    fn nulling(&mut self) -> Result<(), Error>;
+    fn nulling(&mut self, f: impl Fn(T) -> bool) -> Result<(), Error>;
 }
 
 pub mod default;
author	JSDurand <mmemmew@gmail.com>	2022-12-23 00:36:31 +0800
committer	JSDurand <mmemmew@gmail.com>	2022-12-23 00:36:31 +0800
commit	8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (patch)
tree	343eea3c634efbbf72c64ed5dd778ecce60c3eea /nfa/src
parent	9f1c88b863e247da3cd60d2792a7a13b18e25e53 (diff)