summaryrefslogtreecommitdiff
path: root/forest/src/bytes.rs
diff options
context:
space:
mode:
Diffstat (limited to 'forest/src/bytes.rs')
-rw-r--r--forest/src/bytes.rs176
1 files changed, 176 insertions, 0 deletions
diff --git a/forest/src/bytes.rs b/forest/src/bytes.rs
new file mode 100644
index 0000000..951abad
--- /dev/null
+++ b/forest/src/bytes.rs
@@ -0,0 +1,176 @@
+//! This module defines functions to turn a forest of forest labels
+//! into a sequence of bytes.
+//!
+//! To be more specific, a forest consists of two parts: a vector of
+//! node labels and a graph specifying the relations between nodes.
+//!
+//! # Number format (endianness)
+//!
+//! Every number mentionned in this format is an unsigned integer of
+//! 64 bits, or 8 bytes. Every such number is specified in the big
+//! endian format.
+//!
+//! # Parts of the sequence of bytes
+//!
+//! The sequence of bytes has three parts:
+//!
+//! ## Header
+//!
+//! The header specifies metadata of this forest.
+//!
+//! ### Special mark
+//!
+//! The first three bytes form the string "rep", as a special mark.
+//!
+//! ### Number of nodes
+//!
+//! The next 8 bytes specifies the number of nodes of this forest.
+//!
+//! ## Graph
+//!
+//! Next comes the underlying graph for this forest. This part
+//! consists of a vector of vectors of numbers. Each vector of
+//! numbers represents the list of children of a node. So the number
+//! of vectors of numbers is equal to the number of nodes.
+//!
+//! ### Vector of vectors of numbers
+//!
+//! The vectors are not simply concatenated one after another, as that
+//! way one cannot read a random node in a constant amount of time.
+//!
+//! Instead, we first specify the number of children of each node
+//! first, along with the offset for that node of the vector of
+//! children.
+//!
+//! As an example, if a graph has three nodes, represented as the
+//! adjacency list: `[[1, 2], [2], []]`, then its representation as a
+//! sequence of bytes is as follows:
+//! ```text
+//! 2, x, 1, y, 0, z, 1, 2, 2,
+//! ^ ^ ^
+//! x y z
+//! ```
+//!
+//! This has the advantage that we can read the children of the `n`-th
+//! node in constant time.
+//!
+//! ## Vector of labels
+//!
+//! Each label occupies a fixed number of bytes, so we simply put the
+//! labels one after another. The only thing to note here is the
+//! format of the labels.
+//!
+//! ### Labels
+//!
+//! Each label has 3 parts:
+//!
+//! 1. Status: either Packed, Cloned, or Plain. If the node is
+//! cloned, it has a clone index. So in total this part occupies 1
+//! byte for the status and 8 bytes for the clone index.
+//! 2. Start and end: the range in the input sentence. We just store
+//! two numbers here. Hence this part occupies 16 bytes.
+//! 3. Grammar label: either a terminal, a non-terminal, or a rule.
+//! Each variant needs a number to speify its index. So in total this
+//! part occupies 1 byte for the variant discriminant and 8 bytes for
+//! the number.
+//!
+//! To sum up, each label occupies 34 bytes.
+
+use super::Forest;
+use grammar::{GrammarLabel, GrammarLabelType, TNT};
+
+/// Convert any type that implements the `Forest` trait into a
+/// sequence of bytes.
+pub fn forest_to_bytes<F: Forest<GrammarLabel>>(forest: &F) -> Vec<u8> {
+ // First calculate the total number of bytes.
+ let nodes_len = forest.nodes_len();
+
+ let degrees: Vec<_> = forest
+ .nodes()
+ .map(|node| forest.degree(node).unwrap_or(0))
+ .collect();
+
+ let total_degree: usize = degrees.iter().copied().sum();
+
+ let len: usize = 8 // total number of bytes at the start
+ + 3 // special mark
+ + 8 // number of nodes
+ + 8 // offset of labels
+ + 16 * nodes_len // degree & offset for each node
+ + 8 * total_degree // children of each node
+ + 34 * nodes_len // labels
+ ;
+
+ // Then fill in the bytes.
+
+ let mut bytes: Vec<u8> = Vec::with_capacity(len);
+
+ // First the headers
+
+ bytes.extend(len.to_be_bytes());
+ bytes.extend([114, 101, 112]); // rep
+ bytes.extend(nodes_len.to_be_bytes());
+ bytes.extend((len - 34 * nodes_len).to_be_bytes());
+
+ let mut accumulated: usize = 0;
+
+ // Then degrees and offsets
+
+ for degree in degrees.iter().copied() {
+ bytes.extend(degree.to_be_bytes());
+ bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes());
+
+ accumulated += degree;
+ }
+
+ // Then the children
+
+ bytes.extend(forest.nodes().flat_map(|node| {
+ forest
+ .children_of(node)
+ .unwrap()
+ .flat_map(|child| child.to_be_bytes())
+ }));
+
+ // Finally the labels
+
+ 'nodes_loop: for node in forest.nodes() {
+ let label = match forest.vertex_label(node) {
+ Ok(Some(label)) => label,
+ _ => continue 'nodes_loop,
+ };
+
+ if label.is_plain() {
+ bytes.extend(std::iter::repeat(0).take(9));
+ } else if label.is_packed() {
+ bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8)));
+ } else if let Some(index) = label.clone_index() {
+ bytes.extend(std::iter::once(2).chain(index.to_be_bytes()));
+ }
+
+ let label = label.label();
+
+ bytes.extend(label.start().to_be_bytes());
+ bytes.extend(label.end().unwrap_or(0).to_be_bytes());
+
+ let label = label.label();
+
+ match label {
+ GrammarLabelType::TNT(TNT::Ter(t)) => {
+ bytes.extend(std::iter::once(0).chain(t.to_be_bytes()));
+ }
+ GrammarLabelType::TNT(TNT::Non(n)) => {
+ bytes.extend(std::iter::once(1).chain(n.to_be_bytes()));
+ }
+ GrammarLabelType::Rule(r) => {
+ bytes.extend(std::iter::once(2).chain(r.to_be_bytes()));
+ }
+ }
+ }
+
+ if bytes.len() != len {
+ dbg!();
+ }
+
+ bytes
+}