From 1a3d346f413325ed37848a6b2526e8e729269833 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Wed, 11 Jan 2023 23:47:26 +0800 Subject: Record left-linear expansion and forest format Now the grammar will record the left-linear expansions when generating the nondeterministic finite automaton frmo its rules, and will record whether an edge in the nondeterministic finite automaton comes from a left-linear expansion. The latter is needed because while performing a chain-rule derivation, we do not need the left-linear expanded derivations in the "first layer". This might well have been the root cause of the bad performance of the previous version of this package. Also I have figured out how to properly generate and handle parse forests while manipulating the "chain-rule machine". --- forest/src/default.rs | 144 +++++--------------------------------------------- forest/src/design.org | 95 +++++++++++++++++++++++++++++---- forest/src/lib.rs | 116 +++++++++++++++++----------------------- 3 files changed, 146 insertions(+), 209 deletions(-) (limited to 'forest/src') diff --git a/forest/src/default.rs b/forest/src/default.rs index 5e438d4..d3970e9 100644 --- a/forest/src/default.rs +++ b/forest/src/default.rs @@ -1,141 +1,21 @@ //! This file provides a default implementation for the //! [`Forest`][crate::Forest] trait. +#[allow(unused_imports)] use super::*; +#[allow(unused_imports)] use graph::{error::Error as GraphError, ALGraph, Graph}; +#[allow(unused_imports)] use std::collections::{hash_set::Iter, HashMap, HashSet}; -#[derive(Debug, Clone)] -pub struct DefaultForest { - graph: ALGraph, - vertex_labels: HashMap, - edge_labels: HashMap<(usize, usize), EdgeLabel>, - plugins: HashSet, - plugouts: HashSet, -} +// TODO: Use PLGraph instead. -impl Default for DefaultForest { - fn default() -> Self { - Self { - graph: Default::default(), - vertex_labels: Default::default(), - edge_labels: Default::default(), - plugins: Default::default(), - plugouts: Default::default(), - } - } -} - -impl Graph for DefaultForest { - type Iter<'a> = ::Iter<'a> - where - Self: 'a; - - fn is_empty(&self) -> bool { - self.graph.is_empty() - } - - fn nodes_len(&self) -> usize { - self.graph.nodes_len() - } - - fn children_of(&self, node_id: usize) -> Result, GraphError> { - self.graph.children_of(node_id) - } - - fn degree(&self, node_id: usize) -> Result { - self.graph.degree(node_id) - } - - fn is_empty_node(&self, node_id: usize) -> Result { - self.graph.is_empty_node(node_id) - } - - fn has_edge(&self, source: usize, target: usize) -> Result { - self.graph.has_edge(source, target) - } - - fn replace_by_builder(&mut self, _builder: impl graph::Builder) { - unimplemented!() - } -} - -#[derive(Debug)] -pub struct LabelIter<'a> { - set_iter: Iter<'a, usize>, -} - -impl<'a> Iterator for LabelIter<'a> { - type Item = usize; - - fn next(&mut self) -> Option { - self.set_iter.next().copied() - } - - fn size_hint(&self) -> (usize, Option) { - self.set_iter.size_hint() - } -} - -impl<'a> ExactSizeIterator for LabelIter<'a> { - fn len(&self) -> usize { - self.set_iter.len() - } -} - -impl<'a> From> for LabelIter<'a> { - fn from(set_iter: Iter<'a, usize>) -> Self { - Self { set_iter } - } -} - -impl Forest - for DefaultForest -{ - type PluginIter<'a> = LabelIter<'a> - where - Self: 'a; - - type PlugoutIter<'a> = LabelIter<'a> - where - Self: 'a; - - fn plugins(&self) -> Self::PluginIter<'_> { - self.plugins.iter().into() - } - - fn plugouts(&self) -> Self::PlugoutIter<'_> { - self.plugouts.iter().into() - } - - fn plug(&mut self, other: &Self) -> Result<(), Error> { - // PLAN: Produce a BuilderMut, adjust the indices for edges, - // and then add edges between plugs. - // - // Since I cannot touch the underlying nodes directly, I have - // to add the nodes and edges individually. - - todo!() - } - - fn vertex_label(&self, node_id: usize) -> Result, Error> { - if node_id >= self.nodes_len() { - return Err(Error::IndexOutOfBounds(node_id, self.nodes_len())); - } - - Ok(self.vertex_labels.get(&node_id).copied()) - } - - fn edge_label(&self, source: usize, target: usize) -> Result, Error> { - if source >= self.nodes_len() { - return Err(Error::IndexOutOfBounds(source, self.nodes_len())); - } - - if target >= self.nodes_len() { - return Err(Error::IndexOutOfBounds(target, self.nodes_len())); - } - - Ok(self.edge_labels.get(&(source, target)).copied()) - } -} +// #[derive(Debug, Clone)] +// pub struct DefaultForest { +// graph: ALGraph, +// vertex_labels: HashMap, +// edge_labels: HashMap<(usize, usize), EdgeLabel>, +// plugins: HashSet, +// plugouts: HashSet, +// } diff --git a/forest/src/design.org b/forest/src/design.org index 771ca4b..09db113 100644 --- a/forest/src/design.org +++ b/forest/src/design.org @@ -28,7 +28,7 @@ topic sooner or later, why not deal with it now? ;P represent this alternation. Packed nodes are not labelled. They just serve the role of putting nodes together. -* Some thoughts [1/3] +* Some thoughts We do not need to attach forest fragments to nodes of nondeterministic finite automata: we just attach to them lists of grammar slots, which @@ -44,23 +44,98 @@ non-terminals. Some caveats: -- [X] Every node in the forest needs to know its parents. This is - needed for "reductions". That is, after we have parsed the entire +- Every node in the forest needs to know its parents. This is needed + for "reductions". That is, after we have parsed the entire expansion for a non-terminal, we need to go back to where that non-terminal was and continue from there. -- [ ] We need to record the expansions of those "virtual nodes". That - is, the virtual nodes represent atomic languages such as [s]^{(t)} - where s is a non-terminal and t is a terminal. To be more specific, - it represents the derivative of the left-linear closure of the +- We need to record the expansions of those "virtual nodes". That is, + the virtual nodes represent atomic languages such as [s]^{(t)} where s + is a non-terminal and t is a terminal. To be more specific, it + represents the derivative of the left-linear closure of the non-terminal s with respect to the terminal t. The left-linear closure of s is the set of all strings (consisting of terminals and non-terminals alike) that are derivable from s alone, without consuming any inputs, by expanding according to the grammar rules. This means that we need to know if which non-terminals were expanded in order to get to a state in [s]^{(t)}. -- [ ] Each edge in the chain-rule machine needs to be labelled also - with a position in the forest. This perfectly solves the problem of - those "plugs"! +- Each edge in the chain-rule machine needs to be labelled also with a + position in the forest. This perfectly solves the problem of those + "plugs"! +* Formal Definition +We need to figure out the definition of the forest. That it to say, +what will the resulting forest look like for a grammar. +Moreover, we need to know how to correspond each step in the +chain-rule machine with the definition of the forest. + +After these two steps, we can easily proceed with the construction of +the chain-rule machine. + +** Axioms + +There are two *axioms* from which the definition of the forests +follows. + +1. Each node has a unique label, given (principally) by three + components: + 1) input range start + 2) input range end + 3) grammar label +2. If different subtrees share the same node label as the root, a + "clone" of the root should be made, and there should be a + "representative" of all clones, which is treated as a normal node + with that label, and all clones are its children. + +The first axiom ensures that the size of the forest is bounded by the +cube of \(n\), \(n\) is the length of the input sequence. And the +second axiom specifies what to do when subtrees share a parent. + +Since the clones are introduced only when subtrees share a parent, +their number cannot exceed the number of nodes of a forest without +clones. This shows that adding clones does not affect the growth rate +of the size of forests. + +A remark: the /clones/ are called /packed nodes/ in the traditional +terminology, but the terminology of a clone makes more sense to me. + +** Correspondence with chain-rule machine + +Every edge in the chain-rule machine corresponds to an edge in the +forest; denote this correspondence by \(f\). + +The chain-rule operation can be described as follows: + +1. Start with an edge \(e\) with children \(d_1, \ldots, d_n\) in the + chain-rule machine. +2. Prepare a new forest fragment as follows. + 1) For every child \(g\) of \(e\) in the atomic language, if \(g\) + is the terminal that appears as the current input \(t\), let the + new fragment be defined as the node moved by \(g\), alone. + 2) If \(g\) is a non-terminal and its first-set contains \(t\), + then for every left-linearly closed child of \(g\) that is + labelled \(t\), denoted as \(h\), then let the fragment be + defined as the node moved by \(g\), with the pre-recorded + left-linear expansion information from \(g\) to \(h\) appended + as children. +3. For \(i=1,\ldots,n\), consider the edge \(f(d_i)\) in the forest. + There are three cases: + 1) If the next edge is empty, that means we have found something + totally new to add to the forest, so we just add that. + 2) If the next edge after \(f(e)\) contains the new fragment as a + sub-tree already, this means the current branch has already been + explored before, and we have nothing else to do. + 3) If the next edge is non-empty and does not match the new + fragment, this means we have a conflict with a previously found + branch. We solve it simply by following the second axiom above: + we make the node that is the source of \(f(e)\) a clone, and add + the new fragment under a new clone. Note that if the node is + already a clone, we just make a clone under the parent of that + clone. +4. Update the correspondence \(f\) by updating \(f(g)\) and \(f(h)\) + accordingly. + +In this process we always follow the first axiom by ensuring that the +labels of the forest are unique to each node, except for those clones, +which are only created by following the second axiom. diff --git a/forest/src/lib.rs b/forest/src/lib.rs index 3925bd5..c2f4988 100644 --- a/forest/src/lib.rs +++ b/forest/src/lib.rs @@ -1,3 +1,4 @@ +#![warn(missing_docs)] //! This file defines the expected behaviours of forests, and provides //! a default implementation for forests. //! @@ -10,12 +11,16 @@ //! out-coming and in-coming plugs. These plugs are used to join //! different fragments of forests together. -use graph::{Graph, GraphLabel}; +use graph::{GraphLabel, LabelGraph, ParentsGraph}; use core::fmt::Display; +// TODO: move this to default + +/// The type of errors for forest operations. #[derive(Debug)] pub enum Error { + /// An index is out of bounds. IndexOutOfBounds(usize, usize), } @@ -31,75 +36,52 @@ impl Display for Error { impl std::error::Error for Error {} -/// A builder of a forest. -pub trait ForestBuilder { - /// The type of the resulting forest. - type Output: Forest; - - /// Construct a new builder with only one node with the given - /// label. - fn new_leaf(label: NodeLabel) -> Self; - - /// Add a child to the builder the given labels for the new node - /// and the added edge. - /// - /// All plug-out nodes within the builder should have a new child - /// with the specified labels, and hence multiple children might - /// be added, and the plug-out nodes should be modified - /// accordingly. - fn add_children(&mut self, vertex_label: NodeLabel, edge_labe: EdgeLabel); - - /// Build the forest. - fn build(self) -> Self::Output; - - /// Build the forest from a reference. - fn build_ref(&self) -> Self::Output; -} +// /// A builder of a forest. +// pub trait ForestBuilder { +// /// The type of the resulting forest. +// type Output: Forest; + +// /// Construct a new builder with only one node with the given +// /// label. +// fn new_leaf(label: NodeLabel) -> Self; + +// /// Add a child to the builder the given labels for the new node +// /// and the added edge. +// /// +// /// All plug-out nodes within the builder should have a new child +// /// with the specified labels, and hence multiple children might +// /// be added, and the plug-out nodes should be modified +// /// accordingly. +// fn add_children(&mut self, vertex_label: NodeLabel, edge_labe: EdgeLabel); + +// /// Build the forest. +// fn build(self) -> Self::Output; + +// /// Build the forest from a reference. +// fn build_ref(&self) -> Self::Output; +// } + +// FIXME: The trait should be re-designed. /// The expected behaviours of a forest. /// -/// Note that it contains a "striped down" version of the labelled -/// graphs. -pub trait Forest: Graph { - /// Type of iterator of plug-in nodes. - type PluginIter<'a>: Iterator + 'a - where - Self: 'a; - - /// Type of iterator of plug-out nodes. - type PlugoutIter<'a>: Iterator + 'a - where - Self: 'a; - - /// Return the plug-in nodes - fn plugins(&self) -> Self::PluginIter<'_>; - - /// Return the plug-out nodes - fn plugouts(&self) -> Self::PlugoutIter<'_>; - - /// Plug another forest onto this forest. - /// - /// The plug-out nodes of this forest will be joined onto the - /// plug-in nodes of the joining forest. - /// - /// # Performance warning - /// - /// It is recommended to only call this function with a "small" - /// `other`, as this function might copy the whole graph - /// individually, node by node and edge by edge. - fn plug(&mut self, other: &Self) -> Result<(), Error>; - - /// Return the vertex label. - /// - /// A vertex may not have labels. - fn vertex_label(&self, node_id: usize) -> Result, Error>; - - /// Return the edge label. - /// - /// An edge may have no labels. If there is no edge from the - /// given source to the given target, then `Ok(None)` is returned - /// as well. - fn edge_label(&self, source: usize, target: usize) -> Result, Error>; +/// Note that it requires only a subset of the functionalities of +/// labelled graphs. +pub trait Forest: ParentsGraph + LabelGraph { + /// The type of errors for operations on the forest. + type Error: std::error::Error + From; + + /// Detect if the fragment is a prefix of the sub-forest rooted at + /// `node_id`. + fn is_prefix>(&self, node_id: usize, fragment: F) -> Result; + + /// Extend the forest by adjoining another forest at a given node. + fn plant>(&mut self, node_id: usize, fragment: F) -> Result<(), Self::Error>; + + /// Clone a node by making a new node and making all the nodes + /// that previously pointed to the old node now point to the new + /// node, and the new node points to the old node. + fn clone(&mut self, node_id: usize) -> Result<(), Self::Error>; } pub mod default; -- cgit v1.2.3-18-g5258