diff options
author | JSDurand <mmemmew@gmail.com> | 2023-08-10 11:14:04 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-08-10 11:14:04 +0800 |
commit | b8a2d05a3c0d835556d5ddbd44e4a1e201302af5 (patch) | |
tree | 72e833f41b9a8ffd9d3e7216d2f158504343a420 /src | |
parent | f14c8a2aeab18a9bfa380df28f94736580e08f48 (diff) |
Move the trait `Forest` to the crate "forest".
The purpose of this change is to share this trait with other crates,
such as the forth-coming "semiring" crate that will be responsible for
handling some simple semiring operations as well as the querying, in
my plans.
Diffstat (limited to 'src')
-rw-r--r-- | src/bytes.rs | 175 | ||||
-rw-r--r-- | src/lib.rs | 8 |
2 files changed, 4 insertions, 179 deletions
diff --git a/src/bytes.rs b/src/bytes.rs deleted file mode 100644 index f764077..0000000 --- a/src/bytes.rs +++ /dev/null @@ -1,175 +0,0 @@ -//! This module defines functions to turn a forest of forest labels -//! into a sequence of bytes. -//! -//! To be more specific, a forest consists of two parts: a vector of -//! node labels and a graph specifying the relations between nodes. -//! -//! # Number format (endianness) -//! -//! Every number mentionned in this format is an unsigned integer of -//! 64 bits, or 8 bytes. Every such number is specified in the big -//! endian format. -//! -//! # Parts of the sequence of bytes -//! -//! The sequence of bytes has three parts: -//! -//! ## Header -//! -//! The header specifies metadata of this forest. -//! -//! ### Special mark -//! -//! The first three bytes form the string "rep", as a special mark. -//! -//! ### Number of nodes -//! -//! The next 8 bytes specifies the number of nodes of this forest. -//! -//! ## Graph -//! -//! Next comes the underlying graph for this forest. This part -//! consists of a vector of vectors of numbers. Each vector of -//! numbers represents the list of children of a node. So the number -//! of vectors of numbers is equal to the number of nodes. -//! -//! ### Vector of vectors of numbers -//! -//! The vectors are not simply concatenated one after another, as that -//! way one cannot read a random node in a constant amount of time. -//! -//! Instead, we first specify the number of children of each node -//! first, along with the offset for that node of the vector of -//! children. -//! -//! As an example, if a graph has three nodes, represented as the -//! adjacency list: `[[1, 2], [2], []]`, then its representation as a -//! sequence of bytes is as follows: -//! ```text -//! 2, x, 1, y, 0, z, 1, 2, 2, -//! ^ ^ ^ -//! x y z -//! ``` -//! -//! This has the advantage that we can read the children of the `n`-th -//! node in constant time. -//! -//! ## Vector of labels -//! -//! Each label occupies a fixed number of bytes, so we simply put the -//! labels one after another. The only thing to note here is the -//! format of the labels. -//! -//! ### Labels -//! -//! Each label has 3 parts: -//! -//! 1. Status: either Packed, Cloned, or Plain. If the node is -//! cloned, it has a clone index. So in total this part occupies 1 -//! byte for the status and 8 bytes for the clone index. -//! 2. Start and end: the range in the input sentence. We just store -//! two numbers here. Hence this part occupies 16 bytes. -//! 3. Grammar label: either a terminal, a non-terminal, or a rule. -//! Each variant needs a number to speify its index. So in total this -//! part occupies 1 byte for the variant discriminant and 8 bytes for -//! the number. -//! -//! To sum up, each label occupies 34 bytes. - -use chain::item::{default::DefaultForest, ForestLabel}; -use grammar::{GrammarLabel, GrammarLabelType, TNT}; -use graph::{Graph, LabelGraph}; - -pub(super) fn forest_to_bytes(forest: &DefaultForest<ForestLabel<GrammarLabel>>) -> Vec<u8> { - // First calculate the total number of bytes. - let nodes_len = forest.nodes_len(); - - let degrees: Vec<_> = forest - .nodes() - .map(|node| forest.degree(node).unwrap_or(0)) - .collect(); - - let total_degree: usize = degrees.iter().copied().sum(); - - let len: usize = 8 // total number of bytes at the start - + 3 // special mark - + 8 // number of nodes - + 8 // offset of labels - + 16 * nodes_len // degree & offset for each node - + 8 * total_degree // children of each node - + 34 * nodes_len // labels - ; - - // Then fill in the bytes. - - let mut bytes: Vec<u8> = Vec::with_capacity(len); - - // First the headers - - bytes.extend(len.to_be_bytes()); - bytes.extend([114, 101, 112]); // rep - bytes.extend(nodes_len.to_be_bytes()); - bytes.extend((len - 34 * nodes_len).to_be_bytes()); - - let mut accumulated: usize = 0; - - // Then degrees and offsets - - for degree in degrees.iter().copied() { - bytes.extend(degree.to_be_bytes()); - bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes()); - - accumulated += degree; - } - - // Then the children - - bytes.extend(forest.nodes().flat_map(|node| { - forest - .children_of(node) - .unwrap_or_default() - .flat_map(|child| child.to_be_bytes()) - })); - - // Finally the labels - - 'nodes_loop: for node in forest.nodes() { - let label = match forest.vertex_label(node) { - Ok(Some(label)) => label, - _ => continue 'nodes_loop, - }; - - if label.is_plain() { - bytes.extend(std::iter::repeat(0).take(9)); - } else if label.is_packed() { - bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8))); - } else if let Some(index) = label.clone_index() { - bytes.extend(std::iter::once(2).chain(index.to_be_bytes())); - } - - let label = label.label(); - - bytes.extend(label.start().to_be_bytes()); - bytes.extend(label.end().unwrap_or(0).to_be_bytes()); - - let label = label.label(); - - match label { - GrammarLabelType::TNT(TNT::Ter(t)) => { - bytes.extend(std::iter::once(0).chain(t.to_be_bytes())); - } - GrammarLabelType::TNT(TNT::Non(n)) => { - bytes.extend(std::iter::once(1).chain(n.to_be_bytes())); - } - GrammarLabelType::Rule(r) => { - bytes.extend(std::iter::once(2).chain(r.to_be_bytes())); - } - } - } - - if bytes.len() != len { - dbg!(); - } - - bytes -} @@ -9,10 +9,12 @@ use chain::{atom::DefaultAtom, default::DefaultChain, Chain}; use grammar::Grammar; // For printing forests -use chain::item::{default::DefaultForest, ForestLabel, ForestLabelType}; +use chain::item::default::DefaultForest; use grammar::{GrammarLabel, TNT}; use graph::LabelGraph; +use forest::{ForestLabel, ForestLabelType}; + /// This struct is the representation of a parser. /// /// When the user constructs a parser, an instance of this struct will @@ -485,7 +487,7 @@ extern "C" fn parser_parse( Ok(forest) => { Box::leak(parser_box); - let mut bytes = bytes::forest_to_bytes(&forest); + let mut bytes = forest::forest_to_bytes(&forest); let bytes_len = bytes.len().to_be_bytes().to_vec(); @@ -835,5 +837,3 @@ extern "C" fn print_forest_node(forest: *mut CForest, node: *mut std::os::raw::c println!("node {node} has label {label}"); } - -pub mod bytes; |