summaryrefslogtreecommitdiff
path: root/forest
diff options
context:
space:
mode:
authorJSDurand <mmemmew@gmail.com>2023-08-10 11:14:04 +0800
committerJSDurand <mmemmew@gmail.com>2023-08-10 11:14:04 +0800
commitb8a2d05a3c0d835556d5ddbd44e4a1e201302af5 (patch)
tree72e833f41b9a8ffd9d3e7216d2f158504343a420 /forest
parentf14c8a2aeab18a9bfa380df28f94736580e08f48 (diff)
Move the trait `Forest` to the crate "forest".
The purpose of this change is to share this trait with other crates, such as the forth-coming "semiring" crate that will be responsible for handling some simple semiring operations as well as the querying, in my plans.
Diffstat (limited to 'forest')
-rw-r--r--forest/Cargo.toml1
-rw-r--r--forest/src/archive_default.txt (renamed from forest/src/default.rs)2
-rw-r--r--forest/src/bytes.rs176
-rw-r--r--forest/src/lib.rs102
4 files changed, 249 insertions, 32 deletions
diff --git a/forest/Cargo.toml b/forest/Cargo.toml
index 8cc445b..5702265 100644
--- a/forest/Cargo.toml
+++ b/forest/Cargo.toml
@@ -13,5 +13,6 @@ test-print-viz = []
[dependencies]
graph = { path = "../graph" }
+grammar = { path = "../grammar" }
# This should be integrated in the package, but it is not yet done. \ No newline at end of file
diff --git a/forest/src/default.rs b/forest/src/archive_default.txt
index b96439f..5572ecc 100644
--- a/forest/src/default.rs
+++ b/forest/src/archive_default.txt
@@ -1,3 +1,5 @@
+// DEPRECATED: This file is deprecated.
+
//! This file provides a default implementation for the
//! [`Forest`][crate::Forest] trait.
diff --git a/forest/src/bytes.rs b/forest/src/bytes.rs
new file mode 100644
index 0000000..951abad
--- /dev/null
+++ b/forest/src/bytes.rs
@@ -0,0 +1,176 @@
+//! This module defines functions to turn a forest of forest labels
+//! into a sequence of bytes.
+//!
+//! To be more specific, a forest consists of two parts: a vector of
+//! node labels and a graph specifying the relations between nodes.
+//!
+//! # Number format (endianness)
+//!
+//! Every number mentionned in this format is an unsigned integer of
+//! 64 bits, or 8 bytes. Every such number is specified in the big
+//! endian format.
+//!
+//! # Parts of the sequence of bytes
+//!
+//! The sequence of bytes has three parts:
+//!
+//! ## Header
+//!
+//! The header specifies metadata of this forest.
+//!
+//! ### Special mark
+//!
+//! The first three bytes form the string "rep", as a special mark.
+//!
+//! ### Number of nodes
+//!
+//! The next 8 bytes specifies the number of nodes of this forest.
+//!
+//! ## Graph
+//!
+//! Next comes the underlying graph for this forest. This part
+//! consists of a vector of vectors of numbers. Each vector of
+//! numbers represents the list of children of a node. So the number
+//! of vectors of numbers is equal to the number of nodes.
+//!
+//! ### Vector of vectors of numbers
+//!
+//! The vectors are not simply concatenated one after another, as that
+//! way one cannot read a random node in a constant amount of time.
+//!
+//! Instead, we first specify the number of children of each node
+//! first, along with the offset for that node of the vector of
+//! children.
+//!
+//! As an example, if a graph has three nodes, represented as the
+//! adjacency list: `[[1, 2], [2], []]`, then its representation as a
+//! sequence of bytes is as follows:
+//! ```text
+//! 2, x, 1, y, 0, z, 1, 2, 2,
+//! ^ ^ ^
+//! x y z
+//! ```
+//!
+//! This has the advantage that we can read the children of the `n`-th
+//! node in constant time.
+//!
+//! ## Vector of labels
+//!
+//! Each label occupies a fixed number of bytes, so we simply put the
+//! labels one after another. The only thing to note here is the
+//! format of the labels.
+//!
+//! ### Labels
+//!
+//! Each label has 3 parts:
+//!
+//! 1. Status: either Packed, Cloned, or Plain. If the node is
+//! cloned, it has a clone index. So in total this part occupies 1
+//! byte for the status and 8 bytes for the clone index.
+//! 2. Start and end: the range in the input sentence. We just store
+//! two numbers here. Hence this part occupies 16 bytes.
+//! 3. Grammar label: either a terminal, a non-terminal, or a rule.
+//! Each variant needs a number to speify its index. So in total this
+//! part occupies 1 byte for the variant discriminant and 8 bytes for
+//! the number.
+//!
+//! To sum up, each label occupies 34 bytes.
+
+use super::Forest;
+use grammar::{GrammarLabel, GrammarLabelType, TNT};
+
+/// Convert any type that implements the `Forest` trait into a
+/// sequence of bytes.
+pub fn forest_to_bytes<F: Forest<GrammarLabel>>(forest: &F) -> Vec<u8> {
+ // First calculate the total number of bytes.
+ let nodes_len = forest.nodes_len();
+
+ let degrees: Vec<_> = forest
+ .nodes()
+ .map(|node| forest.degree(node).unwrap_or(0))
+ .collect();
+
+ let total_degree: usize = degrees.iter().copied().sum();
+
+ let len: usize = 8 // total number of bytes at the start
+ + 3 // special mark
+ + 8 // number of nodes
+ + 8 // offset of labels
+ + 16 * nodes_len // degree & offset for each node
+ + 8 * total_degree // children of each node
+ + 34 * nodes_len // labels
+ ;
+
+ // Then fill in the bytes.
+
+ let mut bytes: Vec<u8> = Vec::with_capacity(len);
+
+ // First the headers
+
+ bytes.extend(len.to_be_bytes());
+ bytes.extend([114, 101, 112]); // rep
+ bytes.extend(nodes_len.to_be_bytes());
+ bytes.extend((len - 34 * nodes_len).to_be_bytes());
+
+ let mut accumulated: usize = 0;
+
+ // Then degrees and offsets
+
+ for degree in degrees.iter().copied() {
+ bytes.extend(degree.to_be_bytes());
+ bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes());
+
+ accumulated += degree;
+ }
+
+ // Then the children
+
+ bytes.extend(forest.nodes().flat_map(|node| {
+ forest
+ .children_of(node)
+ .unwrap()
+ .flat_map(|child| child.to_be_bytes())
+ }));
+
+ // Finally the labels
+
+ 'nodes_loop: for node in forest.nodes() {
+ let label = match forest.vertex_label(node) {
+ Ok(Some(label)) => label,
+ _ => continue 'nodes_loop,
+ };
+
+ if label.is_plain() {
+ bytes.extend(std::iter::repeat(0).take(9));
+ } else if label.is_packed() {
+ bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8)));
+ } else if let Some(index) = label.clone_index() {
+ bytes.extend(std::iter::once(2).chain(index.to_be_bytes()));
+ }
+
+ let label = label.label();
+
+ bytes.extend(label.start().to_be_bytes());
+ bytes.extend(label.end().unwrap_or(0).to_be_bytes());
+
+ let label = label.label();
+
+ match label {
+ GrammarLabelType::TNT(TNT::Ter(t)) => {
+ bytes.extend(std::iter::once(0).chain(t.to_be_bytes()));
+ }
+ GrammarLabelType::TNT(TNT::Non(n)) => {
+ bytes.extend(std::iter::once(1).chain(n.to_be_bytes()));
+ }
+ GrammarLabelType::Rule(r) => {
+ bytes.extend(std::iter::once(2).chain(r.to_be_bytes()));
+ }
+ }
+ }
+
+ if bytes.len() != len {
+ dbg!();
+ }
+
+ bytes
+}
diff --git a/forest/src/lib.rs b/forest/src/lib.rs
index 0b1092f..a888fae 100644
--- a/forest/src/lib.rs
+++ b/forest/src/lib.rs
@@ -1,26 +1,27 @@
#![warn(missing_docs)]
-//! This file defines the expected behaviours of forests, and provides
-//! a default implementation for forests.
+//! This file defines the expected behaviours of forests.
//!
-//! The default forest actually implements the so-called shared packed
-//! parse forest, or SPPF. It packs sub-trees with the same parents
-//! under the same branch, and lets nodes from different branches
-//! share the same children, and hence the name.
+//! The forests are the basis of the `semiring` crate.
use graph::{error::Error as GError, GraphLabel, LabelGraph, ParentsGraph};
+use std::borrow::Borrow;
+
/// An internal type that indicates the status of a node as either a
/// packed, cloned, or plain node.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default)]
-enum ForestLabelType {
+pub enum ForestLabelType {
+ /// A packed node
Packed,
+ /// A plain node
#[default]
Plain,
+ /// A cloned node
Cloned(usize),
}
-impl core::fmt::Display for ForestLabelType {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl std::fmt::Display for ForestLabelType {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Packed => write!(f, "packed"),
Self::Plain => write!(f, "plain"),
@@ -36,14 +37,17 @@ pub struct ForestLabel<T: GraphLabel> {
status: ForestLabelType,
}
-impl<T: GraphLabel> core::fmt::Display for ForestLabel<T> {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
- writeln!(f, "label: {}, status: {}", self.label, self.status)
+impl<T: GraphLabel> std::fmt::Display for ForestLabel<T> {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ if !matches!(self.status, ForestLabelType::Plain) {
+ write!(f, "{}, {}", self.label, self.status)
+ } else {
+ write!(f, "{}", self.label)
+ }
}
}
/// The type of erros for converting forest labels.
-#[non_exhaustive]
#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
pub enum ForestLabelError {
/// Try to pack a cloned node.
@@ -52,8 +56,8 @@ pub enum ForestLabelError {
ClonePack,
}
-impl core::fmt::Display for ForestLabelError {
- fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+impl std::fmt::Display for ForestLabelError {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::PackClone => write!(f, "cannot pack a cloned node"),
Self::ClonePack => write!(f, "cannot clone a packed node"),
@@ -64,33 +68,50 @@ impl core::fmt::Display for ForestLabelError {
impl std::error::Error for ForestLabelError {}
impl<T: GraphLabel> ForestLabel<T> {
+ /// Construct a new label with the inner `label` and `status`.
+ pub fn new(label: T, status: ForestLabelType) -> Self {
+ Self { label, status }
+ }
+
/// Retrieve the label.
pub fn label(&self) -> T {
self.label
}
+ /// Retrieve the status.
+ pub fn status(&self) -> ForestLabelType {
+ self.status
+ }
+
/// Return true if and only if this node is packed.
pub fn is_packed(&self) -> bool {
- matches!(self.status, ForestLabelType::Packed)
+ self.status == ForestLabelType::Packed
}
/// Retrieve the optional clone index.
- pub fn is_cloned(&self) -> Option<usize> {
- match self.status {
- ForestLabelType::Cloned(index) => Some(index),
- _ => None,
+ pub fn clone_index(&self) -> Option<usize> {
+ if let ForestLabelType::Cloned(index) = self.status {
+ Some(index)
+ } else {
+ None
}
}
+ /// Return true if and only if this node is a plain node.
+ pub fn is_plain(&self) -> bool {
+ self.status == ForestLabelType::Plain
+ }
+
/// Try to clone a node.
pub fn clone<F>(self, forest: &F) -> Result<Self, ForestLabelError>
where
- F: Forest<T>,
+ F: LabelGraph<ForestLabel<T>>,
{
if self.is_packed() {
+ dbg!();
Err(ForestLabelError::ClonePack)
} else {
- let clone_index = if let Some(old_index) = self.is_cloned() {
+ let clone_index = if let Some(old_index) = self.clone_index() {
let mut new_index: usize = old_index + 1;
let mut new_label = Self {
@@ -120,7 +141,7 @@ impl<T: GraphLabel> ForestLabel<T> {
/// Try to pack a node.
pub fn pack(self) -> Result<Self, ForestLabelError> {
- if self.is_cloned().is_some() {
+ if self.clone_index().is_some() {
Err(ForestLabelError::PackClone)
} else {
let new_label = Self {
@@ -140,7 +161,7 @@ impl<T: GraphLabel> From<T> for ForestLabel<T> {
}
}
-/// The expected behaviours of a forest.
+/// The expected behaviours of an item derivation forest.
///
/// Note that it requires only a subset of the functionalities of
/// labelled graphs.
@@ -157,28 +178,45 @@ pub trait Forest<T: GraphLabel>: ParentsGraph + LabelGraph<ForestLabel<T>> {
/// label.
fn new_leaf(label: T) -> Self;
+ /// Transform the label at the given node.
+ fn transform_label(
+ &mut self,
+ node_id: usize,
+ transform: impl FnOnce(T) -> T,
+ ) -> Result<(), Self::Error>;
+
/// Detect if the fragment is a prefix of the sub-forest rooted at
/// `node_id`.
fn is_prefix<F>(&self, node_id: usize, fragment: F) -> Result<bool, Self::Error>
where
- F: AsRef<Self>;
+ F: Borrow<Self>;
/// Extend the forest by adjoining another forest at a given node.
fn plant<F>(&mut self, node_id: usize, fragment: F, planted: bool) -> Result<(), Self::Error>
where
- F: AsRef<Self>;
+ F: Borrow<Self>;
/// Clone a node by making a new node and making all the nodes
/// that previously pointed to the old node now point to the new
/// node, and the new node points to the old node. Return the
- /// index of the new node. In addition, if the old node had
- /// already been cloned, just return the index of its
- /// representitave.
+ /// index of the new node. However, if, and only if,
+ /// `no_new_clone` is `true`, do not make a new clone; instead
+ /// return the clone index that would be used if a new clone was
+ /// made.
+ ///
+ /// Also, `preserved_edges_num` many edges out of the old node
+ /// will be copied to be the children of the new node.
///
/// The labels of the representing node and of the cloned node
/// will be handled automatically.
- fn clone_node(&mut self, node_id: usize) -> Result<usize, Self::Error>;
+ fn clone_node(
+ &mut self,
+ node_id: usize,
+ preserved_edges_num: usize,
+ no_new_clone: bool,
+ ) -> Result<usize, Self::Error>;
}
-pub mod default;
-pub use default::Error;
+pub mod bytes;
+
+pub use bytes::forest_to_bytes;