structural change: separate crates out

I put functionalities that are not strictly core to separate crates, so that the whole package becomes more modular, and makes it easier to try other parsing algorithms in the future. Also I have to figure the forests out before finishing the core chain-rule algorithm, as the part about forests affects the labels of the grammars directly. From my experiences in writing the previous version, it is asking for trouble to change the labels type dramatically at a later point: too many places need to be changed. Thus I decide to figure the rough part of forests out. Actually I only have to figure out how to attach forests fragments to edges of the underlying atomic languages, and the more complex parts of putting forests together can be left to the recorders, which is my vision of assembling semi-ring values during the chain-rule machine. It should be relatively easy to produce forests fragments from grammars since we are just trying to extract some information from the grammar, not to manipulate those information in some complicated way. We have to do some manipulations in the process, though, in order to make sure that the nulling and epsilon-removal processes do not invalidate these fragments.
author: JSDurand <mmemmew@gmail.com> 2023-01-03 23:44:02 +0800
committer: JSDurand <mmemmew@gmail.com> 2023-01-03 23:44:02 +0800
commit: bdbd4b4dc21af09711c97d3f903877443199af06 (patch)
tree: c6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /chain/src/atom
parent: 8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff)
2 files changed, 383 insertions, 0 deletions
diff --git a/chain/src/atom/default.rs b/chain/src/atom/default.rs
new file mode 100644
index 0000000..72989b3
--- /dev/null
+++ b/chain/src/atom/default.rs
@@ -0,0 +1,342 @@
+//! This file provides a default implementation of the
+//! [`Atom`][super::Atom] trait.
+
+use super::*;
+use grammar::Grammar;
+use graph::{error::Error as GraphError, Graph, LabelExtGraph, LabelGraph};
+use nfa::default::{nfa::DefaultNFA, regex::DefaultRegex};
+
+use core::fmt::Display;
+use std::collections::BTreeMap as Map;
+
+/// A virtual node represents the derivative of a non-terminal symbol
+/// `S` with respect to a terminal symbol `t`.
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Ord, PartialOrd)]
+struct VirtualNode {
+    s: usize,
+    t: usize,
+}
+
+impl Display for VirtualNode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "VN[{}]^({})", self.s, self.t)
+    }
+}
+
+impl VirtualNode {
+    fn new(s: usize, t: usize) -> Self {
+        Self { s, t }
+    }
+}
+
+type VirtualMap = Map<VirtualNode, usize>;
+
+/// The type of atomic languages.
+#[derive(Debug, Clone, Default)]
+pub struct DefaultAtom {
+    grammar: Grammar,
+    nfa: DefaultNFA<DOption<TNT>>,
+    // NOTE: This is mostly for printing and debugging
+    regexp: Vec<DefaultRegex<TNT>>,
+    virtual_nodes: VirtualMap,
+}
+
+impl Display for DefaultAtom {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let grammar = &self.grammar;
+
+        let error_to_string = |err| format!("{err}");
+        let tnt_to_string = |tnt, deco| {
+            grammar
+                .name_of_tnt(tnt)
+                .map(|name| format!("{deco}({name})"))
+                .unwrap_or_else(error_to_string)
+        };
+
+        let display_tnt = |tnt| match tnt {
+            TNT::Ter(t) => format!(
+                "({})",
+                grammar
+                    .unpack_tnt(t)
+                    .map(|tnt| tnt_to_string(tnt, ""))
+                    .unwrap_or_else(error_to_string)
+            ),
+            TNT::Non(_) => tnt_to_string(tnt, "H"),
+        };
+
+        writeln!(f, "regular expressions:")?;
+
+        let mut accumulators = Vec::with_capacity(self.regexp.len() + 1);
+        accumulators.push(0usize);
+
+        for regex in self.regexp.iter() {
+            writeln!(f, "accumulator: {}", accumulators.last().unwrap())?;
+
+            accumulators.push(regex.nodes_len() * 2 + accumulators.last().unwrap());
+
+            let string = regex.to_string_with(display_tnt)?;
+
+            writeln!(f, "{string}")?;
+        }
+
+        writeln!(f, "total = {}", accumulators.last().unwrap())?;
+
+        writeln!(f, "virtual nodes:")?;
+
+        for (virtual_node, index) in self.virtual_nodes.iter() {
+            writeln!(f, "{virtual_node}: {index}")?;
+        }
+
+        Ok(())
+    }
+}
+
+// Some boiler-plate delegation implementations for Graph and
+// LabelGraph, in order to implement Nfa.
+
+impl Graph for DefaultAtom {
+    type Iter<'b> = <DefaultNFA<DOption<TNT>> as Graph>::Iter<'b>
+    where
+        Self: 'b;
+
+    fn is_empty(&self) -> bool {
+        self.nfa.is_empty()
+    }
+
+    fn nodes_len(&self) -> usize {
+        self.nfa.nodes_len()
+    }
+
+    fn children_of(&self, node_id: usize) -> Result<Self::Iter<'_>, GraphError> {
+        self.nfa.children_of(node_id)
+    }
+
+    fn degree(&self, node_id: usize) -> Result<usize, GraphError> {
+        self.nfa.degree(node_id)
+    }
+
+    fn is_empty_node(&self, node_id: usize) -> Result<bool, GraphError> {
+        self.nfa.is_empty_node(node_id)
+    }
+
+    fn has_edge(&self, source: usize, target: usize) -> Result<bool, GraphError> {
+        self.nfa.has_edge(source, target)
+    }
+
+    fn replace_by_builder(&mut self, _builder: impl graph::Builder<Result = Self>) {
+        // NOTE: We cannot replace by a builder whose result is an
+        // atom, not the underlying graph type.
+        unimplemented!()
+    }
+}
+
+impl LabelGraph<DOption<TNT>> for DefaultAtom {
+    type Iter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::Iter<'b>
+    where
+        Self: 'b;
+
+    type LabelIter<'b> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::LabelIter<'b>
+    where
+        Self: 'b,
+        DOption<TNT>: 'b;
+
+    type EdgeLabelIter<'a> = <DefaultNFA<DOption<TNT>> as LabelGraph<DOption<TNT>>>::EdgeLabelIter<'a>
+    where
+        Self: 'a,
+        DOption<TNT>: 'a;
+
+    #[inline]
+    fn vertex_label(&self, node_id: usize) -> Result<Option<DOption<TNT>>, GraphError> {
+        self.nfa.vertex_label(node_id)
+    }
+
+    #[inline]
+    fn edge_label(
+        &self,
+        source: usize,
+        target: usize,
+    ) -> Result<Self::EdgeLabelIter<'_>, GraphError> {
+        self.nfa.edge_label(source, target)
+    }
+
+    #[inline]
+    fn find_children_with_label(
+        &self,
+        node_id: usize,
+        label: &DOption<TNT>,
+    ) -> Result<<Self as LabelGraph<DOption<TNT>>>::Iter<'_>, GraphError> {
+        self.nfa.find_children_with_label(node_id, label)
+    }
+
+    #[inline]
+    fn labels_of(&self, node_id: usize) -> Result<Self::LabelIter<'_>, GraphError> {
+        self.nfa.labels_of(node_id)
+    }
+
+    #[inline]
+    fn has_edge_label(
+        &self,
+        node_id: usize,
+        label: &DOption<TNT>,
+        target: usize,
+    ) -> Result<bool, GraphError> {
+        self.nfa.has_edge_label(node_id, label, target)
+    }
+}
+
+impl LabelExtGraph<DOption<TNT>> for DefaultAtom {
+    #[inline]
+    fn extend(
+        &mut self,
+        edges: impl IntoIterator<Item = (DOption<TNT>, usize)>,
+    ) -> Result<usize, GraphError> {
+        self.nfa.extend(edges)
+    }
+}
+
+impl Nfa<DOption<TNT>> for DefaultAtom {
+    #[inline]
+    fn remove_epsilon<F>(&mut self, f: F) -> Result<(), nfa::error::Error>
+    where
+        F: Fn(DOption<TNT>) -> bool,
+    {
+        self.nfa.remove_epsilon(f)
+    }
+
+    type FromRegex<S: graph::GraphLabel + std::fmt::Display + Default> = ();
+
+    #[inline]
+    fn to_nfa(
+        _regexps: &[impl nfa::Regex<nfa::default::regex::RegexType<DOption<TNT>>>],
+        _sub_pred: impl Fn(DOption<TNT>) -> Result<nfa::SoC<DOption<TNT>>, nfa::error::Error>,
+        _default: Option<DOption<TNT>>,
+    ) -> Result<Self::FromRegex<DOption<DOption<TNT>>>, nfa::error::Error> {
+        // NOTE: We cannot construct an atom from a set of regular
+        // languages alone.  So it is appropriate to panic here, if
+        // one tries to do so, for some reason.
+        unimplemented!()
+    }
+
+    #[inline]
+    fn remove_dead(&mut self, reserve: impl Fn(usize) -> bool) -> Result<(), nfa::error::Error> {
+        self.nfa.remove_dead(reserve)
+    }
+
+    #[inline]
+    fn nulling(&mut self, f: impl Fn(DOption<TNT>) -> bool) -> Result<(), nfa::error::Error> {
+        self.nfa.nulling(f)
+    }
+}
+
+impl DefaultAtom {
+    /// Construct an atomic language from a grammar.
+    pub fn from_grammar(mut grammar: Grammar) -> Result<Self, GrammarError> {
+        grammar.compute_firsts()?;
+
+        let regexp = grammar.left_closure()?;
+
+        let mut nfa = grammar.left_closure_to_nfa(&regexp)?;
+
+        use std::collections::HashSet;
+
+        let accumulators: Vec<usize> = {
+            let mut result = Vec::with_capacity(regexp.len() + 1);
+            result.push(0);
+
+            for regex in regexp.iter() {
+                result.push(regex.nodes_len() * 2 + result.last().unwrap());
+            }
+
+            result.into_iter().collect()
+        };
+
+        let accumulators_set: HashSet<usize> = accumulators.iter().copied().collect();
+
+        nfa.nulling(|label| {
+            if let Some(label) = *label {
+                match label {
+                    TNT::Ter(_) => false,
+                    // Panics if a non-terminal references an invalid node
+                    // here.
+                    TNT::Non(n) => grammar.is_nullable(n).unwrap(),
+                }
+            } else {
+                true
+            }
+        })?;
+        nfa.remove_epsilon(|label| label.is_none())?;
+        nfa.remove_dead(|node| accumulators_set.contains(&node))?;
+
+        // now add the virtual nodes
+        let mut virtual_nodes: VirtualMap = Default::default();
+
+        let nt_num = grammar.non_num();
+
+        assert!(nt_num <= accumulators.len());
+
+        // Convert an error telling us that an index is out of bounds.
+        //
+        // Panics if the error is not of the expected kind.
+        fn index_out_of_bounds_conversion(ge: GraphError) -> GrammarError {
+            match ge {
+                GraphError::IndexOutOfBounds(index, bound) => {
+                    GrammarError::IndexOutOfBounds(index, bound)
+                }
+                _ => unreachable!(),
+            }
+        }
+
+        for nt in 0..nt_num {
+            let children: std::collections::HashMap<DOption<TNT>, Vec<_>> = nfa
+                // this is safe because of the above assertion.
+                .labels_of(*accumulators.get(nt).unwrap())
+                .map_err(index_out_of_bounds_conversion)?
+                .map(|(label, target_iter)| (*label, target_iter.collect()))
+                .collect();
+
+            for (label, children_vec) in children.into_iter() {
+                if let Some(TNT::Ter(t)) = *label {
+                    let new_index = nfa
+                        .extend(children_vec.into_iter().map(|target| (label, target)))
+                        .map_err(index_out_of_bounds_conversion)?;
+
+                    let virtual_node = VirtualNode::new(nt, t);
+
+                    virtual_nodes.insert(virtual_node, new_index);
+                }
+            }
+        }
+
+        Ok(Self {
+            grammar,
+            nfa,
+            regexp,
+            virtual_nodes,
+        })
+    }
+}
+
+/// A convenient getter for the map of virtual nodes.
+fn query(map: &VirtualMap, nt: usize, t: usize) -> Option<usize> {
+    map.get(&VirtualNode::new(nt, t)).copied()
+}
+
+impl Atom for DefaultAtom {
+    fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError> {
+        if nt >= self.grammar.non_num() {
+            return Err(GrammarError::IndexOutOfBounds(nt, self.grammar.non_num()));
+        }
+
+        if t >= self.grammar.ter_num() {
+            return Err(GrammarError::IndexOutOfBounds(t, self.grammar.ter_num()));
+        }
+
+        Ok(query(&self.virtual_nodes, nt, t))
+    }
+
+    fn empty(&self) -> usize {
+        assert_eq!(self.nfa.nodes_len() - 2, self.grammar.non_num() * 2);
+
+        self.nfa.nodes_len() - 2
+    }
+}
diff --git a/chain/src/atom/mod.rs b/chain/src/atom/mod.rs
new file mode 100644
index 0000000..084acca
--- /dev/null
+++ b/chain/src/atom/mod.rs
@@ -0,0 +1,41 @@
+//! This file defines the behaviour of the Atomic languages, and
+//! provides a default implementation.
+//!
+//! Here I do not to substitute external packages' implementations in
+//! the future, so why define a trait for the atomic languages?
+//! Because this way I can easily substitute other implementations if
+//! I have better ideas in the future.
+
+use grammar::{Error as GrammarError, TNT};
+use nfa::{DOption, Nfa};
+
+/// The expected behaviours of an atomic language.
+pub trait Atom: Nfa<DOption<TNT>> {
+    /// Return the index of a node representing the derivative of the
+    /// left-linear null closure of `nt` with respect to `t`.
+    fn atom(&self, nt: usize, t: usize) -> Result<Option<usize>, GrammarError>;
+
+    /// Return the index of the empty state.
+    fn empty(&self) -> usize;
+}
+
+pub mod default;
+
+pub use default::DefaultAtom;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use grammar::test_grammar_helper::*;
+
+    #[test]
+    fn atom() -> Result<(), Box<dyn std::error::Error>> {
+        let grammar = new_notes_grammar()?;
+
+        let atom = DefaultAtom::from_grammar(grammar)?;
+
+        println!("atom = {atom}");
+
+        Ok(())
+    }
+}
author	JSDurand <mmemmew@gmail.com>	2023-01-03 23:44:02 +0800
committer	JSDurand <mmemmew@gmail.com>	2023-01-03 23:44:02 +0800
commit	bdbd4b4dc21af09711c97d3f903877443199af06 (patch)
tree	c6a9602f72ee1f6fd7fd3f64b8679a4de50a0159 /chain/src/atom
parent	8463dd24f815fe2b8f25fe9763e0a43023bfbb20 (diff)