From f27d604d93ce583d4404e1874664e08382ea2f00 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 6 Jan 2023 23:42:28 +0800 Subject: Save before system restart. I am about to re-start my system, so I save before any crashes happen. --- chain/src/plan.org | 28 +++- forest/src/default.rs | 14 +- forest/src/design.org | 66 +++++++++ forest/src/lib.rs | 4 +- graph/src/builder.rs | 2 +- graph/src/error.rs | 3 + graph/src/labelled/binary.rs | 273 ++++++++++++++++++++++++++++++++++ graph/src/labelled/mod.rs | 4 +- graph/src/labelled/single.rs | 343 ------------------------------------------- graph/src/lib.rs | 13 ++ 10 files changed, 387 insertions(+), 363 deletions(-) create mode 100644 forest/src/design.org create mode 100644 graph/src/labelled/binary.rs delete mode 100644 graph/src/labelled/single.rs diff --git a/chain/src/plan.org b/chain/src/plan.org index 75acf30..bbd6683 100644 --- a/chain/src/plan.org +++ b/chain/src/plan.org @@ -38,11 +38,30 @@ finite automata. * [X] Test the removal of dead states, where a state is dead if and only if no other states have an edge into that state. -- [-] Refactor [2/5] +- [-] Refactor [2/8] + [X] Implement a data type for graphs with labels on vertices and edges, but do not need to index edges by labels, which can index vertices by labels instead. + * [X] Test it. + [X] Implement a builder that borrows a graph mutably. + * [X] Test it. + + [ ] Implement a data type for graphs in which every node knows its + parents, and every node has a unique label but edges have no + labels. + * [ ] Test it. + + [ ] We need to record the expansions of those "virtual nodes". + That is, the virtual nodes represent atomic languages such as + [s]^{(t)} where s is a non-terminal and t is a terminal. To be more + specific, it represents the derivative of the left-linear closure + of the non-terminal s with respect to the terminal t. The + left-linear closure of s is the set of all strings (consisting of + terminals and non-terminals alike) that are derivable from s + alone, without consuming any inputs, by expanding according to the + grammar rules. This means that we need to know which + non-terminals were expanded in order to get to a state in [s]^{(t)}. + + [ ] Each edge in the chain-rule machine needs to be labelled also + with a position in the forest. This perfectly solves the problem + of those "plugs"! + [ ] When we pull in some regular language because of the left-linear expansion, we need to mark that branch as coming from left-linear expansions. This is necessary because we cannot @@ -66,7 +85,12 @@ + [X] First define a trait with the expected behaviours. + [ ] Then implement them as doubly labelled graphs. + [ ] Thenimplement finding derivatives by use of the chain rule. -- [-] Implement forests [1/2] +- [-] Implement forests [2/3] + + [X] Design a format of forests. This should be the most crucial + thing to do, in order to have a better understanding of the whole + picture. I need to have a clear understanding of the details of + the binarised shared packed parsed forests, that reflects the + regular expressions in the grammar equations. + [X] Define a trait with the expected behaviours. + [-] Implement them using adjacency map: we only need one label per edge, and we do not wish to exclude duplicate edges, and we do not diff --git a/forest/src/default.rs b/forest/src/default.rs index 456413b..5e438d4 100644 --- a/forest/src/default.rs +++ b/forest/src/default.rs @@ -61,14 +61,6 @@ impl Graph for DefaultForest ExtGraph - for DefaultForest -{ - fn extend(&mut self, edges: impl IntoIterator) -> Result { - self.graph.extend(edges) - } -} - #[derive(Debug)] pub struct LabelIter<'a> { set_iter: Iter<'a, usize>, @@ -88,11 +80,7 @@ impl<'a> Iterator for LabelIter<'a> { impl<'a> ExactSizeIterator for LabelIter<'a> { fn len(&self) -> usize { - let (lower, upper) = self.size_hint(); - // Note: This assertion is overly defensive, but it checks the - // invariant guaranteed by the trait. - debug_assert!(upper == Some(lower)); - lower + self.set_iter.len() } } diff --git a/forest/src/design.org b/forest/src/design.org new file mode 100644 index 0000000..771ca4b --- /dev/null +++ b/forest/src/design.org @@ -0,0 +1,66 @@ +#+TITLE: Format of a binarised shared packed parse forests +#+AUTHOR: Durand +#+DATE: <2023-01-05 Jeu 23:55> +#+STARTUP: content + +* Introduction + +I want to design a format for shared packed parse forests. +They are needed for the chain-rule machine because the graphs need to +be labelled (indirectly) by fragments of forests so that we can +correctly and efficiently compute the semi-ring values along the +chain-rule machine operations. Moreover, what we are really +interested is the parse forests, so we will need to deal with this +topic sooner or later, why not deal with it now? ;P + +* Principles + +- The number of children of nodes is bounded by a constant, in theory. + So it is only bounded by the computer hardware (and the grammar): + this representation is closer to the grammar rules. +- Labels of nodes are grammar slots, that is positions within the + rules. +- Edges have no labels: they are simply not needed. +- We need to have two lists of "plugs" that we can plug other forests + in. For this we also need to know if a label is labelled on a node + that is a plug. This implies the use of hashmaps I guess. +- When there are alternations in the forest, we use a "packed node" to + represent this alternation. Packed nodes are not labelled. They + just serve the role of putting nodes together. + +* Some thoughts [1/3] + +We do not need to attach forest fragments to nodes of nondeterministic +finite automata: we just attach to them lists of grammar slots, which +represent a sequence of "nulling out" nullable non-terminals, until +the main non-terminal appears as the last element of the list (or even +a range of slots to be even more efficient). A normal node would have +a range with one element. + +That is it! We do not need to worry about the forests now as we would +know how to combine the forest segments when we are performing the +chain-rule operations: we know exactly when we are expanding +non-terminals. + +Some caveats: + +- [X] Every node in the forest needs to know its parents. This is + needed for "reductions". That is, after we have parsed the entire + expansion for a non-terminal, we need to go back to where that + non-terminal was and continue from there. +- [ ] We need to record the expansions of those "virtual nodes". That + is, the virtual nodes represent atomic languages such as [s]^{(t)} + where s is a non-terminal and t is a terminal. To be more specific, + it represents the derivative of the left-linear closure of the + non-terminal s with respect to the terminal t. The left-linear + closure of s is the set of all strings (consisting of terminals and + non-terminals alike) that are derivable from s alone, without + consuming any inputs, by expanding according to the grammar rules. + This means that we need to know if which non-terminals were expanded + in order to get to a state in [s]^{(t)}. +- [ ] Each edge in the chain-rule machine needs to be labelled also + with a position in the forest. This perfectly solves the problem of + those "plugs"! + + + diff --git a/forest/src/lib.rs b/forest/src/lib.rs index 527a78c..3925bd5 100644 --- a/forest/src/lib.rs +++ b/forest/src/lib.rs @@ -10,7 +10,7 @@ //! out-coming and in-coming plugs. These plugs are used to join //! different fragments of forests together. -use graph::{ExtGraph, GraphLabel}; +use graph::{Graph, GraphLabel}; use core::fmt::Display; @@ -60,7 +60,7 @@ pub trait ForestBuilder { /// /// Note that it contains a "striped down" version of the labelled /// graphs. -pub trait Forest: ExtGraph { +pub trait Forest: Graph { /// Type of iterator of plug-in nodes. type PluginIter<'a>: Iterator + 'a where diff --git a/graph/src/builder.rs b/graph/src/builder.rs index 4a480a5..5505e4f 100644 --- a/graph/src/builder.rs +++ b/graph/src/builder.rs @@ -77,7 +77,7 @@ pub trait BuilderMut { fn from_graph_mut(graph: &mut Self::Graph) -> Self::ResultBuilder<'_>; /// Add a new vertex. - fn add_vertex(&mut self, label: Self::Label) -> Result; + fn add_vertex(&mut self, label: Self::Label) -> usize; /// Add an edge from the source to the target. fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error>; diff --git a/graph/src/error.rs b/graph/src/error.rs index bf2714b..ce45acc 100644 --- a/graph/src/error.rs +++ b/graph/src/error.rs @@ -18,6 +18,8 @@ pub enum Error { /// The graph does not permit duplicate edges but encounters a /// repeated edge. DuplicatedEdge(usize, usize), + /// The source node has no room to add a new edge. + FullNode(usize), } impl Display for Error { @@ -35,6 +37,7 @@ impl Display for Error { "No duplicate edges permitted, but found one from {source} to {target}" ) } + Error::FullNode(index) => write!(f, "the node {index} has no room for new edges"), } } } diff --git a/graph/src/labelled/binary.rs b/graph/src/labelled/binary.rs new file mode 100644 index 0000000..439505d --- /dev/null +++ b/graph/src/labelled/binary.rs @@ -0,0 +1,273 @@ +//! This file implements a labelled graph that can index vertices by +//! labels and each node knows about its parents. + +use super::*; +use crate::ParentsGraph; + +use std::collections::{HashMap as Map, HashSet as Set}; + +use crate::builder::BuilderMut; + +/// A node has some children, some parents, and a label. +#[derive(Debug, Clone)] +struct PLNode { + children: Set, + // REVIEW: If performance for removing edges is a bottleneck, use + // a hash set here. + parents: Vec, + label: T, +} + +impl PLNode { + fn new(children: Set, parents: Vec, label: T) -> Self { + Self { + children, + parents, + label, + } + } +} + +impl Default for PLNode { + #[inline] + fn default() -> Self { + let children = Default::default(); + let parents = Default::default(); + let label = Default::default(); + Self { + children, + label, + parents, + } + } +} + +/// A Parents-knowing, vertex-Labelled Graph. +#[derive(Debug, Clone)] +pub struct PLGraph { + nodes: Vec>, + label_index_map: Map, +} + +impl Default for PLGraph { + #[inline] + fn default() -> Self { + let nodes = Default::default(); + let label_index_map = Default::default(); + Self { + nodes, + label_index_map, + } + } +} + +impl Graph for PLGraph { + type Iter<'a> = std::iter::Copied> + where + Self: 'a; + + #[inline] + fn is_empty(&self) -> bool { + self.nodes.is_empty() + } + + #[inline] + fn nodes_len(&self) -> usize { + self.nodes.len() + } + + #[inline] + fn children_of(&self, node_id: usize) -> Result, Error> { + if let Some(node) = self.nodes.get(node_id) { + Ok(node.children.iter().copied()) + } else { + Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) + } + } + + #[inline] + fn degree(&self, node_id: usize) -> Result { + if let Some(node) = self.nodes.get(node_id) { + Ok(node.children.len()) + } else { + Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) + } + } + + #[inline] + fn is_empty_node(&self, node_id: usize) -> Result { + if let Some(node) = self.nodes.get(node_id) { + Ok(node.children.is_empty()) + } else { + Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) + } + } + + fn has_edge(&self, source: usize, target: usize) -> Result { + if let Some(node) = self.nodes.get(source) { + if !self.has_node(target) { + return Err(Error::IndexOutOfBounds(target, self.nodes.len())); + } + + Ok(node.children.contains(&target)) + } else { + Err(Error::IndexOutOfBounds(source, self.nodes.len())) + } + } + + fn replace_by_builder(&mut self, _builder: impl Builder) { + unimplemented!("for later") + } + + fn print_viz(&self, _filename: &str) -> Result<(), std::io::Error> { + todo!() + } +} + +impl ParentsGraph for PLGraph { + type Iter<'a> = std::iter::Copied> + where Self: 'a; + + #[inline] + fn parents_of(&self, node_id: usize) -> Result<::Iter<'_>, Error> { + if let Some(node) = self.nodes.get(node_id) { + Ok(node.parents.iter().copied()) + } else { + Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) + } + } +} + +impl LabelGraph for PLGraph { + type Iter<'a> = std::iter::Empty + where + Self: 'a; + + type LabelIter<'a> = std::iter::Empty<(&'a T, >::Iter<'a>)> + where + Self: 'a, + T: 'a; + + type EdgeLabelIter<'a> = std::iter::Empty + where + Self: 'a, + T: 'a; + + #[inline] + fn query_label(&self, label: T) -> Option { + self.label_index_map.get(&label).copied() + } + + #[inline] + fn vertex_label(&self, node_id: usize) -> Result, Error> { + if let Some(node) = self.nodes.get(node_id) { + Ok(Some(node.label)) + } else { + Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) + } + } + + fn edge_label(&self, _source: usize, _target: usize) -> Result, Error> { + unimplemented!("Edges have no labels") + } + + fn find_children_with_label( + &self, + _node_id: usize, + _label: &T, + ) -> Result<>::Iter<'_>, Error> { + unimplemented!("Edges have no labels") + } + + fn labels_of(&self, _node_id: usize) -> Result, Error> { + unimplemented!("Edges have no labels") + } + + fn has_edge_label(&self, _node_id: usize, _label: &T, _target: usize) -> Result { + unimplemented!("Edges have no labels") + } +} + +/// A builder that modifies PLGraph in place. +#[derive(Debug)] +pub struct PLGBuilderMut<'a, T: GraphLabel> { + graph: &'a mut PLGraph, +} + +impl<'a, T: GraphLabel> BuilderMut for PLGBuilderMut<'a, T> { + type Label = T; + + type Graph = PLGraph; + + type ResultBuilder<'b> = PLGBuilderMut<'b, T> + where + Self::Label: 'b; + + fn from_graph_mut(graph: &mut Self::Graph) -> Self::ResultBuilder<'_> { + PLGBuilderMut { graph } + } + + fn add_vertex(&mut self, label: Self::Label) -> usize { + if let Some(old) = self.graph.label_index_map.get(&label) { + *old + } else { + let new_node = PLNode::new(Default::default(), Default::default(), label); + self.graph.nodes.push(new_node); + + let new_index = self.graph.nodes.len() - 1; + + self.graph.label_index_map.insert(label, new_index); + + new_index + } + } + + fn add_edge(&mut self, source: usize, target: usize, _label: Self::Label) -> Result<(), Error> { + if self.graph.has_edge(source, target)? { + return Err(Error::DuplicatedEdge(source, target)); + } + + self.graph + .nodes + .get_mut(source) + .unwrap() + .children + .insert(target); + + self.graph + .nodes + .get_mut(target) + .unwrap() + .parents + .push(source); + + Ok(()) + } + + fn remove_edge(&mut self, source: usize, target: usize, _predicate: F) -> Result<(), Error> + where + F: Fn(Self::Label) -> bool, + { + if !self.graph.has_edge(source, target)? { + return Ok(()); + } + + self.graph + .nodes + .get_mut(source) + .unwrap() + .children + .remove(&target); + + self.graph + .nodes + .get_mut(target) + .unwrap() + .parents + .retain(|parent| *parent != source); + + Ok(()) + } +} + +// TODO: tests diff --git a/graph/src/labelled/mod.rs b/graph/src/labelled/mod.rs index 61e3014..fa26bc4 100644 --- a/graph/src/labelled/mod.rs +++ b/graph/src/labelled/mod.rs @@ -15,6 +15,6 @@ pub mod double; pub use double::{DLGBuilder, DLGraph}; -pub mod single; +pub mod binary; -pub use single::SLGraph; +// pub use binary::BLGraph; diff --git a/graph/src/labelled/single.rs b/graph/src/labelled/single.rs deleted file mode 100644 index bed65c1..0000000 --- a/graph/src/labelled/single.rs +++ /dev/null @@ -1,343 +0,0 @@ -//! This file implements a labelled graph that can index vertices by -//! labels. -//! -//! Since we do not have to index edges by labels, the labels can be -//! stored simply by adjacency maps. This may save memory usage and -//! improve performance, potentially. - -use super::*; - -use std::collections::{hash_map::Keys, HashMap as Map}; - -use crate::builder::BuilderMut; - -/// The type of a node. -#[derive(Debug, Clone, Default)] -struct SLNode { - /// The edges are stored as an association from the target node - /// index to the label of the corresponding edge. - /// - /// An implication is that an edge can only have one label. - children: Map, - /// The label of this node. - label: T, -} - -impl SLNode { - fn new(children: Map, label: T) -> Self { - Self { children, label } - } - - /// This function just adds an edge, blindly. - /// - /// # Safety - /// - /// Only use this after making sure that `target` refers to a - /// valid node, and there was no edge from this node to the node - /// pointed to by `target` previously. - unsafe fn add_edge(&mut self, target: usize, label: T) { - self.children.insert(target, label); - } -} - -/// The type of labelled graphs implemented using adjacency maps. -#[derive(Debug, Clone)] -pub struct SLGraph { - nodes: Vec>, - label_index_map: Map, -} - -impl Default for SLGraph { - fn default() -> Self { - let nodes = Vec::new(); - let label_index_map = Default::default(); - Self { - nodes, - label_index_map, - } - } -} - -impl Graph for SLGraph { - type Iter<'a> = std::iter::Copied> - where - Self: 'a; - - #[inline] - fn is_empty(&self) -> bool { - self.nodes.is_empty() - } - - #[inline] - fn nodes_len(&self) -> usize { - self.nodes.len() - } - - #[inline] - fn children_of(&self, node_id: usize) -> Result, Error> { - if let Some(node) = self.nodes.get(node_id) { - Ok(node.children.keys().copied()) - } else { - Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) - } - } - - #[inline] - fn degree(&self, node_id: usize) -> Result { - if let Some(node) = self.nodes.get(node_id) { - Ok(node.children.len()) - } else { - Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) - } - } - - #[inline] - fn is_empty_node(&self, node_id: usize) -> Result { - if let Some(node) = self.nodes.get(node_id) { - Ok(node.children.is_empty()) - } else { - Err(Error::IndexOutOfBounds(node_id, self.nodes.len())) - } - } - - #[inline] - fn has_edge(&self, source: usize, target: usize) -> Result { - let nodes_len = self.nodes.len(); - - let node = if let Some(node) = self.nodes.get(source) { - node - } else { - return Err(Error::IndexOutOfBounds(source, nodes_len)); - }; - - if !self.has_node(target) { - return Err(Error::IndexOutOfBounds(target, nodes_len)); - } - - Ok(node.children.contains_key(&target)) - } - - fn replace_by_builder(&mut self, _builder: impl Builder) { - // In case this is not clear enough, I deliberately avoid - // implementing this. - unimplemented!() - } -} - -/// An iterator of edge labels. -/// -/// This is used to avoid a box allocation. -#[derive(Copy, Clone, Debug)] -pub struct EdgeLabelIter(Option); - -impl Iterator for EdgeLabelIter { - type Item = T; - - #[inline] - fn next(&mut self) -> Option { - self.0.take() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let len = self.0.is_some() as usize; - - (len, Some(len)) - } -} - -impl DoubleEndedIterator for EdgeLabelIter { - #[inline] - fn next_back(&mut self) -> Option { - self.0.take() - } -} - -impl ExactSizeIterator for EdgeLabelIter { - #[inline] - fn len(&self) -> usize { - // Thanks to Clippy for teaching me about coercing a boolean - // value to an integer. - self.0.is_some() as usize - } -} - -impl LabelGraph for SLGraph { - // The following two types are not used, as we do not need to - // index edges by labels. - type Iter<'a> = std::iter::Empty - where - Self: 'a; - - type LabelIter<'a> = std::iter::Empty<(&'a T, >::Iter<'a>)> - where - Self: 'a, - T: 'a; - - type EdgeLabelIter<'a> = EdgeLabelIter - where - Self: 'a, - T: 'a; - - #[inline] - fn query_label(&self, label: T) -> Option { - self.label_index_map.get(&label).copied() - } - - #[inline] - fn vertex_label(&self, node_id: usize) -> Result, Error> { - if let Some(node) = self.nodes.get(node_id) { - Ok(Some(node.label)) - } else { - Err(Error::IndexOutOfBounds(node_id, self.nodes_len())) - } - } - - #[inline] - fn edge_label(&self, source: usize, target: usize) -> Result, Error> { - let nodes_len = self.nodes.len(); - - let node = if let Some(node) = self.nodes.get(source) { - node - } else { - return Err(Error::IndexOutOfBounds(source, nodes_len)); - }; - - if !self.has_node(target) { - return Err(Error::IndexOutOfBounds(target, nodes_len)); - } - - Ok(EdgeLabelIter(node.children.get(&target).copied())) - } - - fn find_children_with_label( - &self, - _node_id: usize, - _label: &T, - ) -> Result<>::Iter<'_>, Error> { - unimplemented!() - } - - fn labels_of(&self, _node_id: usize) -> Result, Error> { - unimplemented!() - } - - #[inline] - fn has_edge_label(&self, node_id: usize, label: &T, target: usize) -> Result { - let nodes_len = self.nodes.len(); - - let node = if let Some(node) = self.nodes.get(node_id) { - node - } else { - return Err(Error::IndexOutOfBounds(node_id, nodes_len)); - }; - - if !self.has_node(target) { - return Err(Error::IndexOutOfBounds(target, nodes_len)); - } - - Ok(node.children.get(&target) == Some(label)) - } -} - -/// The type of `builder_mut` builders for this type of graphs. -#[derive(Debug)] -pub struct SLGBuilderMut<'a, T: GraphLabel> { - graph: &'a mut SLGraph, -} - -impl<'a, T: GraphLabel> BuilderMut for SLGBuilderMut<'a, T> { - type Label = T; - - type Graph = SLGraph; - - type ResultBuilder<'b> = SLGBuilderMut<'b, T> - where - T:'b; - - fn from_graph_mut(graph: &mut Self::Graph) -> Self::ResultBuilder<'_> { - SLGBuilderMut { graph } - } - - fn add_vertex(&mut self, label: T) -> Result { - if let Some(old_node) = self.graph.label_index_map.get(&label) { - dbg!(label); - return Err(Error::DuplicatedNode(*old_node)); - } - - self.graph - .nodes - .push(SLNode::new(Default::default(), label)); - - let new = self.graph.nodes_len() - 1; - - self.graph.label_index_map.insert(label, new); - - Ok(new) - } - - fn add_edge(&mut self, source: usize, target: usize, label: Self::Label) -> Result<(), Error> { - let graph = &mut self.graph; - - let nodes_len = graph.nodes.len(); - - // NOTE: We check the validity of `target` first because we - // need to borrow graph mutably later, which would prevent us - // from borrowing graph immutably to check the validity of - // `target` then. - if !graph.has_node(target) { - return Err(Error::IndexOutOfBounds(target, nodes_len)); - } - - let node = if let Some(node) = graph.nodes.get_mut(source) { - node - } else { - return Err(Error::IndexOutOfBounds(source, nodes_len)); - }; - - if node.children.get(&target).is_some() { - return Err(Error::DuplicatedEdge(source, target)); - } - - // We checked what we need to check, so this is safe. - unsafe { - node.add_edge(target, label); - } - - Ok(()) - } - - fn remove_edge(&mut self, source: usize, target: usize, predicate: F) -> Result<(), Error> - where - F: Fn(Self::Label) -> bool, - { - let graph = &mut self.graph; - - let nodes_len = graph.nodes.len(); - - // NOTE: We check the validity of `target` first because we - // need to borrow graph mutably later, which would prevent us - // from borrowing graph immutably to check the validity of - // `target` then. - if !graph.has_node(target) { - return Err(Error::IndexOutOfBounds(target, nodes_len)); - } - - let node = if let Some(node) = graph.nodes.get_mut(source) { - node - } else { - return Err(Error::IndexOutOfBounds(source, nodes_len)); - }; - - if let Some(child_label) = node.children.get(&target) { - // There is only one label, so we just check this label. - if predicate(*child_label) { - node.children.remove(&target); - } - - Ok(()) - } else { - Err(Error::DuplicatedEdge(source, target)) - } - } -} diff --git a/graph/src/lib.rs b/graph/src/lib.rs index 79f9646..26159c6 100644 --- a/graph/src/lib.rs +++ b/graph/src/lib.rs @@ -214,6 +214,19 @@ impl: Iterator + 'a + where + Self: 'a; + + /// Return an iterator of parents for a node. + fn parents_of(&self, node_id: usize) -> Result<::Iter<'_>, Error>; +} + /// A labelled graph is just a graph with labels associated to /// vertices and / or edges. /// -- cgit v1.2.3-18-g5258