diff options
author | JSDurand <mmemmew@gmail.com> | 2023-08-10 11:14:04 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-08-10 11:14:04 +0800 |
commit | b8a2d05a3c0d835556d5ddbd44e4a1e201302af5 (patch) | |
tree | 72e833f41b9a8ffd9d3e7216d2f158504343a420 /forest | |
parent | f14c8a2aeab18a9bfa380df28f94736580e08f48 (diff) |
Move the trait `Forest` to the crate "forest".
The purpose of this change is to share this trait with other crates,
such as the forth-coming "semiring" crate that will be responsible for
handling some simple semiring operations as well as the querying, in
my plans.
Diffstat (limited to 'forest')
-rw-r--r-- | forest/Cargo.toml | 1 | ||||
-rw-r--r-- | forest/src/archive_default.txt (renamed from forest/src/default.rs) | 2 | ||||
-rw-r--r-- | forest/src/bytes.rs | 176 | ||||
-rw-r--r-- | forest/src/lib.rs | 102 |
4 files changed, 249 insertions, 32 deletions
diff --git a/forest/Cargo.toml b/forest/Cargo.toml index 8cc445b..5702265 100644 --- a/forest/Cargo.toml +++ b/forest/Cargo.toml @@ -13,5 +13,6 @@ test-print-viz = [] [dependencies] graph = { path = "../graph" } +grammar = { path = "../grammar" } # This should be integrated in the package, but it is not yet done.
\ No newline at end of file diff --git a/forest/src/default.rs b/forest/src/archive_default.txt index b96439f..5572ecc 100644 --- a/forest/src/default.rs +++ b/forest/src/archive_default.txt @@ -1,3 +1,5 @@ +// DEPRECATED: This file is deprecated. + //! This file provides a default implementation for the //! [`Forest`][crate::Forest] trait. diff --git a/forest/src/bytes.rs b/forest/src/bytes.rs new file mode 100644 index 0000000..951abad --- /dev/null +++ b/forest/src/bytes.rs @@ -0,0 +1,176 @@ +//! This module defines functions to turn a forest of forest labels +//! into a sequence of bytes. +//! +//! To be more specific, a forest consists of two parts: a vector of +//! node labels and a graph specifying the relations between nodes. +//! +//! # Number format (endianness) +//! +//! Every number mentionned in this format is an unsigned integer of +//! 64 bits, or 8 bytes. Every such number is specified in the big +//! endian format. +//! +//! # Parts of the sequence of bytes +//! +//! The sequence of bytes has three parts: +//! +//! ## Header +//! +//! The header specifies metadata of this forest. +//! +//! ### Special mark +//! +//! The first three bytes form the string "rep", as a special mark. +//! +//! ### Number of nodes +//! +//! The next 8 bytes specifies the number of nodes of this forest. +//! +//! ## Graph +//! +//! Next comes the underlying graph for this forest. This part +//! consists of a vector of vectors of numbers. Each vector of +//! numbers represents the list of children of a node. So the number +//! of vectors of numbers is equal to the number of nodes. +//! +//! ### Vector of vectors of numbers +//! +//! The vectors are not simply concatenated one after another, as that +//! way one cannot read a random node in a constant amount of time. +//! +//! Instead, we first specify the number of children of each node +//! first, along with the offset for that node of the vector of +//! children. +//! +//! As an example, if a graph has three nodes, represented as the +//! adjacency list: `[[1, 2], [2], []]`, then its representation as a +//! sequence of bytes is as follows: +//! ```text +//! 2, x, 1, y, 0, z, 1, 2, 2, +//! ^ ^ ^ +//! x y z +//! ``` +//! +//! This has the advantage that we can read the children of the `n`-th +//! node in constant time. +//! +//! ## Vector of labels +//! +//! Each label occupies a fixed number of bytes, so we simply put the +//! labels one after another. The only thing to note here is the +//! format of the labels. +//! +//! ### Labels +//! +//! Each label has 3 parts: +//! +//! 1. Status: either Packed, Cloned, or Plain. If the node is +//! cloned, it has a clone index. So in total this part occupies 1 +//! byte for the status and 8 bytes for the clone index. +//! 2. Start and end: the range in the input sentence. We just store +//! two numbers here. Hence this part occupies 16 bytes. +//! 3. Grammar label: either a terminal, a non-terminal, or a rule. +//! Each variant needs a number to speify its index. So in total this +//! part occupies 1 byte for the variant discriminant and 8 bytes for +//! the number. +//! +//! To sum up, each label occupies 34 bytes. + +use super::Forest; +use grammar::{GrammarLabel, GrammarLabelType, TNT}; + +/// Convert any type that implements the `Forest` trait into a +/// sequence of bytes. +pub fn forest_to_bytes<F: Forest<GrammarLabel>>(forest: &F) -> Vec<u8> { + // First calculate the total number of bytes. + let nodes_len = forest.nodes_len(); + + let degrees: Vec<_> = forest + .nodes() + .map(|node| forest.degree(node).unwrap_or(0)) + .collect(); + + let total_degree: usize = degrees.iter().copied().sum(); + + let len: usize = 8 // total number of bytes at the start + + 3 // special mark + + 8 // number of nodes + + 8 // offset of labels + + 16 * nodes_len // degree & offset for each node + + 8 * total_degree // children of each node + + 34 * nodes_len // labels + ; + + // Then fill in the bytes. + + let mut bytes: Vec<u8> = Vec::with_capacity(len); + + // First the headers + + bytes.extend(len.to_be_bytes()); + bytes.extend([114, 101, 112]); // rep + bytes.extend(nodes_len.to_be_bytes()); + bytes.extend((len - 34 * nodes_len).to_be_bytes()); + + let mut accumulated: usize = 0; + + // Then degrees and offsets + + for degree in degrees.iter().copied() { + bytes.extend(degree.to_be_bytes()); + bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes()); + + accumulated += degree; + } + + // Then the children + + bytes.extend(forest.nodes().flat_map(|node| { + forest + .children_of(node) + .unwrap() + .flat_map(|child| child.to_be_bytes()) + })); + + // Finally the labels + + 'nodes_loop: for node in forest.nodes() { + let label = match forest.vertex_label(node) { + Ok(Some(label)) => label, + _ => continue 'nodes_loop, + }; + + if label.is_plain() { + bytes.extend(std::iter::repeat(0).take(9)); + } else if label.is_packed() { + bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8))); + } else if let Some(index) = label.clone_index() { + bytes.extend(std::iter::once(2).chain(index.to_be_bytes())); + } + + let label = label.label(); + + bytes.extend(label.start().to_be_bytes()); + bytes.extend(label.end().unwrap_or(0).to_be_bytes()); + + let label = label.label(); + + match label { + GrammarLabelType::TNT(TNT::Ter(t)) => { + bytes.extend(std::iter::once(0).chain(t.to_be_bytes())); + } + GrammarLabelType::TNT(TNT::Non(n)) => { + bytes.extend(std::iter::once(1).chain(n.to_be_bytes())); + } + GrammarLabelType::Rule(r) => { + bytes.extend(std::iter::once(2).chain(r.to_be_bytes())); + } + } + } + + if bytes.len() != len { + dbg!(); + } + + bytes +} diff --git a/forest/src/lib.rs b/forest/src/lib.rs index 0b1092f..a888fae 100644 --- a/forest/src/lib.rs +++ b/forest/src/lib.rs @@ -1,26 +1,27 @@ #![warn(missing_docs)] -//! This file defines the expected behaviours of forests, and provides -//! a default implementation for forests. +//! This file defines the expected behaviours of forests. //! -//! The default forest actually implements the so-called shared packed -//! parse forest, or SPPF. It packs sub-trees with the same parents -//! under the same branch, and lets nodes from different branches -//! share the same children, and hence the name. +//! The forests are the basis of the `semiring` crate. use graph::{error::Error as GError, GraphLabel, LabelGraph, ParentsGraph}; +use std::borrow::Borrow; + /// An internal type that indicates the status of a node as either a /// packed, cloned, or plain node. #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default)] -enum ForestLabelType { +pub enum ForestLabelType { + /// A packed node Packed, + /// A plain node #[default] Plain, + /// A cloned node Cloned(usize), } -impl core::fmt::Display for ForestLabelType { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl std::fmt::Display for ForestLabelType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Packed => write!(f, "packed"), Self::Plain => write!(f, "plain"), @@ -36,14 +37,17 @@ pub struct ForestLabel<T: GraphLabel> { status: ForestLabelType, } -impl<T: GraphLabel> core::fmt::Display for ForestLabel<T> { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - writeln!(f, "label: {}, status: {}", self.label, self.status) +impl<T: GraphLabel> std::fmt::Display for ForestLabel<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if !matches!(self.status, ForestLabelType::Plain) { + write!(f, "{}, {}", self.label, self.status) + } else { + write!(f, "{}", self.label) + } } } /// The type of erros for converting forest labels. -#[non_exhaustive] #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] pub enum ForestLabelError { /// Try to pack a cloned node. @@ -52,8 +56,8 @@ pub enum ForestLabelError { ClonePack, } -impl core::fmt::Display for ForestLabelError { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { +impl std::fmt::Display for ForestLabelError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::PackClone => write!(f, "cannot pack a cloned node"), Self::ClonePack => write!(f, "cannot clone a packed node"), @@ -64,33 +68,50 @@ impl core::fmt::Display for ForestLabelError { impl std::error::Error for ForestLabelError {} impl<T: GraphLabel> ForestLabel<T> { + /// Construct a new label with the inner `label` and `status`. + pub fn new(label: T, status: ForestLabelType) -> Self { + Self { label, status } + } + /// Retrieve the label. pub fn label(&self) -> T { self.label } + /// Retrieve the status. + pub fn status(&self) -> ForestLabelType { + self.status + } + /// Return true if and only if this node is packed. pub fn is_packed(&self) -> bool { - matches!(self.status, ForestLabelType::Packed) + self.status == ForestLabelType::Packed } /// Retrieve the optional clone index. - pub fn is_cloned(&self) -> Option<usize> { - match self.status { - ForestLabelType::Cloned(index) => Some(index), - _ => None, + pub fn clone_index(&self) -> Option<usize> { + if let ForestLabelType::Cloned(index) = self.status { + Some(index) + } else { + None } } + /// Return true if and only if this node is a plain node. + pub fn is_plain(&self) -> bool { + self.status == ForestLabelType::Plain + } + /// Try to clone a node. pub fn clone<F>(self, forest: &F) -> Result<Self, ForestLabelError> where - F: Forest<T>, + F: LabelGraph<ForestLabel<T>>, { if self.is_packed() { + dbg!(); Err(ForestLabelError::ClonePack) } else { - let clone_index = if let Some(old_index) = self.is_cloned() { + let clone_index = if let Some(old_index) = self.clone_index() { let mut new_index: usize = old_index + 1; let mut new_label = Self { @@ -120,7 +141,7 @@ impl<T: GraphLabel> ForestLabel<T> { /// Try to pack a node. pub fn pack(self) -> Result<Self, ForestLabelError> { - if self.is_cloned().is_some() { + if self.clone_index().is_some() { Err(ForestLabelError::PackClone) } else { let new_label = Self { @@ -140,7 +161,7 @@ impl<T: GraphLabel> From<T> for ForestLabel<T> { } } -/// The expected behaviours of a forest. +/// The expected behaviours of an item derivation forest. /// /// Note that it requires only a subset of the functionalities of /// labelled graphs. @@ -157,28 +178,45 @@ pub trait Forest<T: GraphLabel>: ParentsGraph + LabelGraph<ForestLabel<T>> { /// label. fn new_leaf(label: T) -> Self; + /// Transform the label at the given node. + fn transform_label( + &mut self, + node_id: usize, + transform: impl FnOnce(T) -> T, + ) -> Result<(), Self::Error>; + /// Detect if the fragment is a prefix of the sub-forest rooted at /// `node_id`. fn is_prefix<F>(&self, node_id: usize, fragment: F) -> Result<bool, Self::Error> where - F: AsRef<Self>; + F: Borrow<Self>; /// Extend the forest by adjoining another forest at a given node. fn plant<F>(&mut self, node_id: usize, fragment: F, planted: bool) -> Result<(), Self::Error> where - F: AsRef<Self>; + F: Borrow<Self>; /// Clone a node by making a new node and making all the nodes /// that previously pointed to the old node now point to the new /// node, and the new node points to the old node. Return the - /// index of the new node. In addition, if the old node had - /// already been cloned, just return the index of its - /// representitave. + /// index of the new node. However, if, and only if, + /// `no_new_clone` is `true`, do not make a new clone; instead + /// return the clone index that would be used if a new clone was + /// made. + /// + /// Also, `preserved_edges_num` many edges out of the old node + /// will be copied to be the children of the new node. /// /// The labels of the representing node and of the cloned node /// will be handled automatically. - fn clone_node(&mut self, node_id: usize) -> Result<usize, Self::Error>; + fn clone_node( + &mut self, + node_id: usize, + preserved_edges_num: usize, + no_new_clone: bool, + ) -> Result<usize, Self::Error>; } -pub mod default; -pub use default::Error; +pub mod bytes; + +pub use bytes::forest_to_bytes; |