//! This module defines functions to turn a forest of forest labels //! into a sequence of bytes. //! //! To be more specific, a forest consists of two parts: a vector of //! node labels and a graph specifying the relations between nodes. //! //! # Number format (endianness) //! //! Every number mentionned in this format is an unsigned integer of //! 64 bits, or 8 bytes. Every such number is specified in the big //! endian format. //! //! # Parts of the sequence of bytes //! //! The sequence of bytes has three parts: //! //! ## Header //! //! The header specifies metadata of this forest. //! //! ### Special mark //! //! The first three bytes form the string "rep", as a special mark. //! //! ### Number of nodes //! //! The next 8 bytes specifies the number of nodes of this forest. //! //! ## Graph //! //! Next comes the underlying graph for this forest. This part //! consists of a vector of vectors of numbers. Each vector of //! numbers represents the list of children of a node. So the number //! of vectors of numbers is equal to the number of nodes. //! //! ### Vector of vectors of numbers //! //! The vectors are not simply concatenated one after another, as that //! way one cannot read a random node in a constant amount of time. //! //! Instead, we first specify the number of children of each node //! first, along with the offset for that node of the vector of //! children. //! //! As an example, if a graph has three nodes, represented as the //! adjacency list: `[[1, 2], [2], []]`, then its representation as a //! sequence of bytes is as follows: //! ```text //! 2, x, 1, y, 0, z, 1, 2, 2, //! ^ ^ ^ //! x y z //! ``` //! //! This has the advantage that we can read the children of the `n`-th //! node in constant time. //! //! ## Vector of labels //! //! Each label occupies a fixed number of bytes, so we simply put the //! labels one after another. The only thing to note here is the //! format of the labels. //! //! ### Labels //! //! Each label has 3 parts: //! //! 1. Status: either Packed, Cloned, or Plain. If the node is //! cloned, it has a clone index. So in total this part occupies 1 //! byte for the status and 8 bytes for the clone index. //! 2. Start and end: the range in the input sentence. We just store //! two numbers here. Hence this part occupies 16 bytes. //! 3. Grammar label: either a terminal, a non-terminal, or a rule. //! Each variant needs a number to speify its index. So in total this //! part occupies 1 byte for the variant discriminant and 8 bytes for //! the number. //! //! To sum up, each label occupies 34 bytes. use chain::item::{default::DefaultForest, ForestLabel}; use grammar::{GrammarLabel, GrammarLabelType, TNT}; use graph::{Graph, LabelGraph}; pub(super) fn forest_to_bytes(forest: &DefaultForest>) -> Vec { // First calculate the total number of bytes. let nodes_len = forest.nodes_len(); let degrees: Vec<_> = forest .nodes() .map(|node| forest.degree(node).unwrap_or(0)) .collect(); let total_degree: usize = degrees.iter().copied().sum(); let len: usize = 8 // total number of bytes at the start + 3 // special mark + 8 // number of nodes + 8 // offset of labels + 16 * nodes_len // degree & offset for each node + 8 * total_degree // children of each node + 34 * nodes_len // labels ; // Then fill in the bytes. let mut bytes: Vec = Vec::with_capacity(len); // First the headers bytes.extend(len.to_be_bytes()); bytes.extend([114, 101, 112]); // rep bytes.extend(nodes_len.to_be_bytes()); bytes.extend((len - 34 * nodes_len).to_be_bytes()); let mut accumulated: usize = 0; // Then degrees and offsets for degree in degrees.iter().copied() { bytes.extend(degree.to_be_bytes()); bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes()); accumulated += degree; } // Then the children bytes.extend(forest.nodes().flat_map(|node| { forest .children_of(node) .unwrap_or_default() .flat_map(|child| child.to_be_bytes()) })); // Finally the labels 'nodes_loop: for node in forest.nodes() { let label = match forest.vertex_label(node) { Ok(Some(label)) => label, _ => continue 'nodes_loop, }; if label.is_plain() { bytes.extend(std::iter::repeat(0).take(9)); } else if label.is_packed() { bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8))); } else if let Some(index) = label.clone_index() { bytes.extend(std::iter::once(2).chain(index.to_be_bytes())); } let label = label.label(); bytes.extend(label.start().to_be_bytes()); bytes.extend(label.end().unwrap_or(0).to_be_bytes()); let label = label.label(); match label { GrammarLabelType::TNT(TNT::Ter(t)) => { bytes.extend(std::iter::once(0).chain(t.to_be_bytes())); } GrammarLabelType::TNT(TNT::Non(n)) => { bytes.extend(std::iter::once(1).chain(n.to_be_bytes())); } GrammarLabelType::Rule(r) => { bytes.extend(std::iter::once(2).chain(r.to_be_bytes())); } } } if bytes.len() != len { dbg!(); } bytes }