diff options
Diffstat (limited to 'src/bytes.rs')
-rw-r--r-- | src/bytes.rs | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..f764077 --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,175 @@ +//! This module defines functions to turn a forest of forest labels +//! into a sequence of bytes. +//! +//! To be more specific, a forest consists of two parts: a vector of +//! node labels and a graph specifying the relations between nodes. +//! +//! # Number format (endianness) +//! +//! Every number mentionned in this format is an unsigned integer of +//! 64 bits, or 8 bytes. Every such number is specified in the big +//! endian format. +//! +//! # Parts of the sequence of bytes +//! +//! The sequence of bytes has three parts: +//! +//! ## Header +//! +//! The header specifies metadata of this forest. +//! +//! ### Special mark +//! +//! The first three bytes form the string "rep", as a special mark. +//! +//! ### Number of nodes +//! +//! The next 8 bytes specifies the number of nodes of this forest. +//! +//! ## Graph +//! +//! Next comes the underlying graph for this forest. This part +//! consists of a vector of vectors of numbers. Each vector of +//! numbers represents the list of children of a node. So the number +//! of vectors of numbers is equal to the number of nodes. +//! +//! ### Vector of vectors of numbers +//! +//! The vectors are not simply concatenated one after another, as that +//! way one cannot read a random node in a constant amount of time. +//! +//! Instead, we first specify the number of children of each node +//! first, along with the offset for that node of the vector of +//! children. +//! +//! As an example, if a graph has three nodes, represented as the +//! adjacency list: `[[1, 2], [2], []]`, then its representation as a +//! sequence of bytes is as follows: +//! ```text +//! 2, x, 1, y, 0, z, 1, 2, 2, +//! ^ ^ ^ +//! x y z +//! ``` +//! +//! This has the advantage that we can read the children of the `n`-th +//! node in constant time. +//! +//! ## Vector of labels +//! +//! Each label occupies a fixed number of bytes, so we simply put the +//! labels one after another. The only thing to note here is the +//! format of the labels. +//! +//! ### Labels +//! +//! Each label has 3 parts: +//! +//! 1. Status: either Packed, Cloned, or Plain. If the node is +//! cloned, it has a clone index. So in total this part occupies 1 +//! byte for the status and 8 bytes for the clone index. +//! 2. Start and end: the range in the input sentence. We just store +//! two numbers here. Hence this part occupies 16 bytes. +//! 3. Grammar label: either a terminal, a non-terminal, or a rule. +//! Each variant needs a number to speify its index. So in total this +//! part occupies 1 byte for the variant discriminant and 8 bytes for +//! the number. +//! +//! To sum up, each label occupies 34 bytes. + +use chain::item::{default::DefaultForest, ForestLabel}; +use grammar::{GrammarLabel, GrammarLabelType, TNT}; +use graph::{Graph, LabelGraph}; + +pub(super) fn forest_to_bytes(forest: &DefaultForest<ForestLabel<GrammarLabel>>) -> Vec<u8> { + // First calculate the total number of bytes. + let nodes_len = forest.nodes_len(); + + let degrees: Vec<_> = forest + .nodes() + .map(|node| forest.degree(node).unwrap_or(0)) + .collect(); + + let total_degree: usize = degrees.iter().copied().sum(); + + let len: usize = 8 // total number of bytes at the start + + 3 // special mark + + 8 // number of nodes + + 8 // offset of labels + + 16 * nodes_len // degree & offset for each node + + 8 * total_degree // children of each node + + 34 * nodes_len // labels + ; + + // Then fill in the bytes. + + let mut bytes: Vec<u8> = Vec::with_capacity(len); + + // First the headers + + bytes.extend(len.to_be_bytes()); + bytes.extend([114, 101, 112]); // rep + bytes.extend(nodes_len.to_be_bytes()); + bytes.extend((len - 34 * nodes_len).to_be_bytes()); + + let mut accumulated: usize = 0; + + // Then degrees and offsets + + for degree in degrees.iter().copied() { + bytes.extend(degree.to_be_bytes()); + bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes()); + + accumulated += degree; + } + + // Then the children + + bytes.extend(forest.nodes().flat_map(|node| { + forest + .children_of(node) + .unwrap_or_default() + .flat_map(|child| child.to_be_bytes()) + })); + + // Finally the labels + + 'nodes_loop: for node in forest.nodes() { + let label = match forest.vertex_label(node) { + Ok(Some(label)) => label, + _ => continue 'nodes_loop, + }; + + if label.is_plain() { + bytes.extend(std::iter::repeat(0).take(9)); + } else if label.is_packed() { + bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8))); + } else if let Some(index) = label.clone_index() { + bytes.extend(std::iter::once(2).chain(index.to_be_bytes())); + } + + let label = label.label(); + + bytes.extend(label.start().to_be_bytes()); + bytes.extend(label.end().unwrap_or(0).to_be_bytes()); + + let label = label.label(); + + match label { + GrammarLabelType::TNT(TNT::Ter(t)) => { + bytes.extend(std::iter::once(0).chain(t.to_be_bytes())); + } + GrammarLabelType::TNT(TNT::Non(n)) => { + bytes.extend(std::iter::once(1).chain(n.to_be_bytes())); + } + GrammarLabelType::Rule(r) => { + bytes.extend(std::iter::once(2).chain(r.to_be_bytes())); + } + } + } + + if bytes.len() != len { + dbg!(); + } + + bytes +} |