diff options
author | JSDurand <mmemmew@gmail.com> | 2023-07-08 12:30:21 +0800 |
---|---|---|
committer | JSDurand <mmemmew@gmail.com> | 2023-07-08 12:31:13 +0800 |
commit | 9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 (patch) | |
tree | 7bb6004196b38446a5ab0cb3a0ab642d35f113e9 /src | |
parent | 691f969eb104fa3d4c2a1667693fd0382eb9d6b5 (diff) |
Finished the Emacs binding.
Now the binding part is finished.
What remains is a bug encountered when planting a fragment to the
forest which intersects a packed node, which would lead to invalid
forests. This will also cause problem when planting a packed
fragment, but until now my testing grammars do not produce packed
fragments, so this problem is not encountered yet.
I am still figuring out efficient ways to solve this problem.
Diffstat (limited to 'src')
-rw-r--r-- | src/big_endian.c | 28 | ||||
-rw-r--r-- | src/big_endian.h | 9 | ||||
-rw-r--r-- | src/big_endian.o | bin | 0 -> 1112 bytes | |||
-rw-r--r-- | src/binding.c | 286 | ||||
-rw-r--r-- | src/bytes.rs | 175 | ||||
-rw-r--r-- | src/helper.c | 73 | ||||
-rw-r--r-- | src/helper.h | 75 | ||||
-rw-r--r-- | src/helper.o | bin | 0 -> 1768 bytes | |||
-rw-r--r-- | src/lib.rs | 552 | ||||
-rw-r--r-- | src/old binding.txt | 218 | ||||
-rwxr-xr-x | src/rep.so | bin | 0 -> 50480 bytes | |||
-rwxr-xr-x | src/test | bin | 0 -> 33904 bytes | |||
-rw-r--r-- | src/test.c | 224 | ||||
-rw-r--r-- | src/test.el | 28 |
14 files changed, 1658 insertions, 10 deletions
diff --git a/src/big_endian.c b/src/big_endian.c new file mode 100644 index 0000000..2db8624 --- /dev/null +++ b/src/big_endian.c @@ -0,0 +1,28 @@ +#include "big_endian.h" + +void to_big_endian(uint64_t num, unsigned char *result) +{ + *(result+0) = (num >> 56) & 0xff; + *(result+1) = (num >> 48) & 0xff; + *(result+2) = (num >> 40) & 0xff; + *(result+3) = (num >> 32) & 0xff; + *(result+4) = (num >> 24) & 0xff; + *(result+5) = (num >> 16) & 0xff; + *(result+6) = (num >> 8) & 0xff; + *(result+7) = (num >> 0) & 0xff; +} + +uint64_t from_big_endian(unsigned char *num) +{ + uint64_t result = ((uint64_t) (*num) << 56); + + result += ((uint64_t) (*(num+1)) << 48); + result += ((uint64_t) (*(num+2)) << 40); + result += ((uint64_t) (*(num+3)) << 32); + result += ((uint64_t) (*(num+4)) << 24); + result += ((uint64_t) (*(num+5)) << 16); + result += ((uint64_t) (*(num+6)) << 8); + result += ((uint64_t) (*(num+7)) << 0); + + return result; +} diff --git a/src/big_endian.h b/src/big_endian.h new file mode 100644 index 0000000..e6c587e --- /dev/null +++ b/src/big_endian.h @@ -0,0 +1,9 @@ +#ifndef BIG_ENDIAN_H +#define BIG_ENDIAN_H +#include <stdint.h> + +void to_big_endian(uint64_t num, unsigned char *result); + +uint64_t from_big_endian(unsigned char *num); + +#endif diff --git a/src/big_endian.o b/src/big_endian.o Binary files differnew file mode 100644 index 0000000..6d77a13 --- /dev/null +++ b/src/big_endian.o diff --git a/src/binding.c b/src/binding.c new file mode 100644 index 0000000..43f73ba --- /dev/null +++ b/src/binding.c @@ -0,0 +1,286 @@ +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <emacs-module.h> +#include "helper.h" + +int plugin_is_GPL_compatible = 3; + +emacs_value +rep_new_parser(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data) +{ + char *buffer = NULL; + ptrdiff_t len = 0; + emacs_value error_data[1]; + + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + env->copy_string_contents(env, *args, NULL, &len); + len++; + buffer = malloc(sizeof(char) * len); + + if (buffer == NULL) return env->intern(env, "nil"); + + env->copy_string_contents(env, *args, buffer, &len); + + struct parser *new_parser_result = new_parser(buffer, &error_vec); + + free(buffer); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length); + + clean_signed(&error_vec, 4); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + return env->make_user_ptr(env, clean_parser, new_parser_result); +} + +emacs_value +rep_parser_recognize(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data) +{ + struct parser *parser = env->get_user_ptr(env, *args); + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) + return env->intern(env, "nil"); + + unsigned char *buffer = NULL; + ptrdiff_t len = 0; + + emacs_value error_data[1]; + + if (!env->eq(env, + env->type_of(env, *(args+1)), + env->intern(env, "vector"))) { + error_data[0] = env->make_string + (env, "INPUT should be a vector of integers", 36); + + env->non_local_exit_signal(env, env->intern(env, "wrong-type-argument"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + len = env->vec_size(env, *(args+1)); + + buffer = malloc(sizeof(unsigned char) * len * 8); + + if (buffer == NULL) { + error_data[0] = env->make_string(env, "out of memory", 13); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + for (ptrdiff_t i = 0; i < len; i++) { + emacs_value element = env->vec_get(env, *(args+1), i); + + to_big_endian((uint64_t) env->extract_integer(env, element), + buffer + 8 * i); + } + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) { + free(buffer); + return env->intern(env, "nil"); + } + + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + unsigned char input_len[8] = { 0 }; + + to_big_endian((uint64_t) len * 8, input_len); + + struct UnsignedVec input_vec = (struct UnsignedVec) + { input_len, NULL, buffer }; + + int result = parser_recognize(parser, &input_vec, &error_vec, (unsigned char) 1); + + free(buffer); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length); + + clean_signed(&error_vec, 4); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + return (result) ? env->intern(env, "t") : env->intern(env, "nil"); +} + +void +clean_forest(void *ptr) +{ + clean_unsigned((struct UnsignedVec *) ptr, 15); +} + +emacs_value +rep_parser_parse(emacs_env *env, ptrdiff_t narg, emacs_value *args, void *data) +{ + struct parser *parser = env->get_user_ptr(env, *args); + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) + return env->intern(env, "nil"); + + unsigned char *buffer = NULL; + ptrdiff_t len = 0; + + emacs_value error_data[1]; + + if (!env->eq(env, + env->type_of(env, *(args+1)), + env->intern(env, "vector"))) { + error_data[0] = env->make_string + (env, "INPUT should be a vector of integers", 36); + + env->non_local_exit_signal(env, env->intern(env, "wrong-type-argument"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + len = env->vec_size(env, *(args+1)); + + buffer = malloc(sizeof(char) * len * 8); + + if (buffer == NULL) { + error_data[0] = env->make_string(env, "out of memory", 13); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + for (ptrdiff_t i = 0; i < len; i++) { + emacs_value element = env->vec_get(env, *(args+1), i); + + to_big_endian((uint64_t) env->extract_integer(env, element), + buffer + 8 * i); + } + + if (env->non_local_exit_check(env) != emacs_funcall_exit_return) { + free(buffer); + return env->intern(env, "nil"); + } + + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + unsigned char input_len[8] = { 0 }; + + to_big_endian((uint64_t) len * 8, input_len); + +struct UnsignedVec input_vec = (struct UnsignedVec) + { input_len, NULL, buffer }; + + struct UnsignedVec *result = parser_parse(parser, &input_vec, &error_vec, (unsigned char) 1); + + free(buffer); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + error_data[0] = env->make_string(env, error_vec.data, (ptrdiff_t) error_length); + + clean_signed(&error_vec, 4); + + env->non_local_exit_signal(env, env->intern(env, "error"), + env->funcall(env, env->intern(env, "list"), + 1, error_data)); + + return env->intern(env, "nil"); + } + + if (result) { + return env->make_user_ptr(env, clean_forest, result); + } + + return env->intern(env, "nil"); +} + +int +emacs_module_init(struct emacs_runtime *ert) +{ + emacs_env *env = ert->get_environment(ert); + + unsigned char emacs_version = 0; + + if ((unsigned long) env->size >= sizeof(struct emacs_env_27)) + emacs_version = 27; + else if ((unsigned long) env->size >= sizeof(struct emacs_env_26)) + emacs_version = 26; + else if ((unsigned long) env->size >= sizeof(struct emacs_env_25)) + emacs_version = 25; + else return 2; + + emacs_value newfunc = env->make_function + (env, 1, 1, rep_new_parser, + "Create a new parser from the grammar string GRAMMAR_STRING." + "\n\n\(fn grammar_string)", + NULL); + + env->funcall(env, env->intern(env, "defalias"), + 2, (emacs_value[]){ env->intern(env, "rep-new-parser"), newfunc }); + + emacs_value recognizefunc = env->make_function + (env, 2, 2, rep_parser_recognize, + "Recognize an INPUT using PARSER." + "\n\n\(fn parser INPUT)", + NULL); + + env->funcall(env, env->intern(env, "defalias"), + 2, (emacs_value[]){ env->intern(env, "rep-recognize"), recognizefunc }); + + emacs_value parsefunc = env->make_function + (env, 2, 2, rep_parser_parse, + "Parse an INPUT using PARSER." + "\n\n\(fn parser input)", + NULL); + + env->funcall(env, env->intern(env, "defalias"), + 2, (emacs_value[]){ env->intern(env, "rep-parse"), parsefunc }); + + env->funcall(env, env->intern(env, "provide"), + 1, (emacs_value[]){ env->intern(env, "rep") }); + + return 0; +} diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..f764077 --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,175 @@ +//! This module defines functions to turn a forest of forest labels +//! into a sequence of bytes. +//! +//! To be more specific, a forest consists of two parts: a vector of +//! node labels and a graph specifying the relations between nodes. +//! +//! # Number format (endianness) +//! +//! Every number mentionned in this format is an unsigned integer of +//! 64 bits, or 8 bytes. Every such number is specified in the big +//! endian format. +//! +//! # Parts of the sequence of bytes +//! +//! The sequence of bytes has three parts: +//! +//! ## Header +//! +//! The header specifies metadata of this forest. +//! +//! ### Special mark +//! +//! The first three bytes form the string "rep", as a special mark. +//! +//! ### Number of nodes +//! +//! The next 8 bytes specifies the number of nodes of this forest. +//! +//! ## Graph +//! +//! Next comes the underlying graph for this forest. This part +//! consists of a vector of vectors of numbers. Each vector of +//! numbers represents the list of children of a node. So the number +//! of vectors of numbers is equal to the number of nodes. +//! +//! ### Vector of vectors of numbers +//! +//! The vectors are not simply concatenated one after another, as that +//! way one cannot read a random node in a constant amount of time. +//! +//! Instead, we first specify the number of children of each node +//! first, along with the offset for that node of the vector of +//! children. +//! +//! As an example, if a graph has three nodes, represented as the +//! adjacency list: `[[1, 2], [2], []]`, then its representation as a +//! sequence of bytes is as follows: +//! ```text +//! 2, x, 1, y, 0, z, 1, 2, 2, +//! ^ ^ ^ +//! x y z +//! ``` +//! +//! This has the advantage that we can read the children of the `n`-th +//! node in constant time. +//! +//! ## Vector of labels +//! +//! Each label occupies a fixed number of bytes, so we simply put the +//! labels one after another. The only thing to note here is the +//! format of the labels. +//! +//! ### Labels +//! +//! Each label has 3 parts: +//! +//! 1. Status: either Packed, Cloned, or Plain. If the node is +//! cloned, it has a clone index. So in total this part occupies 1 +//! byte for the status and 8 bytes for the clone index. +//! 2. Start and end: the range in the input sentence. We just store +//! two numbers here. Hence this part occupies 16 bytes. +//! 3. Grammar label: either a terminal, a non-terminal, or a rule. +//! Each variant needs a number to speify its index. So in total this +//! part occupies 1 byte for the variant discriminant and 8 bytes for +//! the number. +//! +//! To sum up, each label occupies 34 bytes. + +use chain::item::{default::DefaultForest, ForestLabel}; +use grammar::{GrammarLabel, GrammarLabelType, TNT}; +use graph::{Graph, LabelGraph}; + +pub(super) fn forest_to_bytes(forest: &DefaultForest<ForestLabel<GrammarLabel>>) -> Vec<u8> { + // First calculate the total number of bytes. + let nodes_len = forest.nodes_len(); + + let degrees: Vec<_> = forest + .nodes() + .map(|node| forest.degree(node).unwrap_or(0)) + .collect(); + + let total_degree: usize = degrees.iter().copied().sum(); + + let len: usize = 8 // total number of bytes at the start + + 3 // special mark + + 8 // number of nodes + + 8 // offset of labels + + 16 * nodes_len // degree & offset for each node + + 8 * total_degree // children of each node + + 34 * nodes_len // labels + ; + + // Then fill in the bytes. + + let mut bytes: Vec<u8> = Vec::with_capacity(len); + + // First the headers + + bytes.extend(len.to_be_bytes()); + bytes.extend([114, 101, 112]); // rep + bytes.extend(nodes_len.to_be_bytes()); + bytes.extend((len - 34 * nodes_len).to_be_bytes()); + + let mut accumulated: usize = 0; + + // Then degrees and offsets + + for degree in degrees.iter().copied() { + bytes.extend(degree.to_be_bytes()); + bytes.extend((8 + 3 + 8 + 8 + 16 * nodes_len + 8 * accumulated).to_be_bytes()); + + accumulated += degree; + } + + // Then the children + + bytes.extend(forest.nodes().flat_map(|node| { + forest + .children_of(node) + .unwrap_or_default() + .flat_map(|child| child.to_be_bytes()) + })); + + // Finally the labels + + 'nodes_loop: for node in forest.nodes() { + let label = match forest.vertex_label(node) { + Ok(Some(label)) => label, + _ => continue 'nodes_loop, + }; + + if label.is_plain() { + bytes.extend(std::iter::repeat(0).take(9)); + } else if label.is_packed() { + bytes.extend(std::iter::once(1).chain(std::iter::repeat(0).take(8))); + } else if let Some(index) = label.clone_index() { + bytes.extend(std::iter::once(2).chain(index.to_be_bytes())); + } + + let label = label.label(); + + bytes.extend(label.start().to_be_bytes()); + bytes.extend(label.end().unwrap_or(0).to_be_bytes()); + + let label = label.label(); + + match label { + GrammarLabelType::TNT(TNT::Ter(t)) => { + bytes.extend(std::iter::once(0).chain(t.to_be_bytes())); + } + GrammarLabelType::TNT(TNT::Non(n)) => { + bytes.extend(std::iter::once(1).chain(n.to_be_bytes())); + } + GrammarLabelType::Rule(r) => { + bytes.extend(std::iter::once(2).chain(r.to_be_bytes())); + } + } + } + + if bytes.len() != len { + dbg!(); + } + + bytes +} diff --git a/src/helper.c b/src/helper.c new file mode 100644 index 0000000..5d7e9f8 --- /dev/null +++ b/src/helper.c @@ -0,0 +1,73 @@ +#include "helper.h" + +struct Label +read_label(unsigned char *ptr) +{ + struct Label label = { 0 }; + + switch (*ptr) { + case Plain: + label.status = Plain; + break; + case Packed: + label.status = Packed; + break; + default: + label.status = Clone; + label.clone_index = from_big_endian(ptr+1); + break; + } + + label.start = from_big_endian(ptr+9); + label.end = from_big_endian(ptr+17); + + switch (*(ptr+25)) { + case Terminal: + label.variant = Terminal; + break; + case Nonterminal: + label.variant = Nonterminal; + break; + default: + label.variant = Rule; + break; + } + + label.content = from_big_endian(ptr+26); + + return label; +} + +void +print_label(struct Label label) +{ + switch (label.status) { + case Plain: + printf("a plain node "); + break; + case Packed: + printf("a packed node "); + break; + default: + printf("a cloned node with index %llu ", label.clone_index); + break; + } + + printf("spanning (%llu, %llu) ", label.start, label.end); + + printf("labelled as a "); + + switch (label.variant) { + case Terminal: + printf("terminal "); + break; + case Nonterminal: + printf("non-terminal "); + break; + default: + printf("rule "); + break; + } + + printf("%llu\n", label.content); +} diff --git a/src/helper.h b/src/helper.h new file mode 100644 index 0000000..8be7497 --- /dev/null +++ b/src/helper.h @@ -0,0 +1,75 @@ +#ifndef HELPER_H +#define HELPER_H + +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include "big_endian.h" + +struct SignedVec { + unsigned char *len; + unsigned char *capacity; + char *data; +}; + +struct UnsignedVec { + unsigned char *len; + unsigned char *capacity; + unsigned char *data; +}; + +struct parser; + +struct parser *new_parser(char *grammar_string, + struct SignedVec *error_vec); + +void clean_parser(void *parser); + +int parser_recognize(struct parser *parser, + struct UnsignedVec *input_vec, + struct SignedVec *error_vec, + unsigned char reset_p); + +struct UnsignedVec * +parser_parse +(struct parser *parser, + struct UnsignedVec *input_vec, + struct SignedVec *error_vec, + unsigned char reset_p ); + +struct UnsignedVec * +parser_parse(struct parser *parser, + struct UnsignedVec *input_vec, + struct SignedVec *error_vec, + unsigned char reset_p); + +void clean_signed(struct SignedVec *vec, unsigned char flag); +void clean_unsigned(struct UnsignedVec *vec, unsigned char flag); + +typedef enum Status { + Plain = 0, + Packed = 1, + Clone = 2, +} Status; + +typedef enum Variant { + Terminal = 0, + Nonterminal = 1, + Rule = 2, +} Variant; + +struct Label { + Status status; + uint64_t clone_index; + uint64_t start; + uint64_t end; + Variant variant; + uint64_t content; +}; + +struct Label read_label(unsigned char *ptr); + +void print_label(struct Label label); + +#endif diff --git a/src/helper.o b/src/helper.o Binary files differnew file mode 100644 index 0000000..e8801dd --- /dev/null +++ b/src/helper.o @@ -1,16 +1,548 @@ -// TODO: Add Emacs bindings +#![warn(missing_docs)] +//! This top level package provides necessary functions for Emacs to +//! call. -pub fn add(left: usize, right: usize) -> usize { - left + right +extern crate chain; +extern crate grammar; + +use chain::{atom::DefaultAtom, default::DefaultChain, Chain}; +use grammar::Grammar; + +/// This struct is the representation of a parser. +/// +/// When the user constructs a parser, an instance of this struct will +/// be constructed and the user will receive an opaque pointer to this +/// struct. +#[derive(Debug, Clone)] +#[repr(C)] +pub struct Parser { + chain: DefaultChain, +} + +impl Parser { + /// Construct a parser from the grammar string. + /// + /// The grammar is supposed to conform to the Augmented + /// Backus-Naur Format. See RFC 5234 for exact details of this + /// format. + pub fn new(s: &str) -> Result<Self, String> { + let grammar: Grammar = s.parse().map_err(|err| format!("{err}"))?; + let atom: DefaultAtom = + DefaultAtom::from_grammar(grammar).map_err(|err| format!("{err}"))?; + + DefaultChain::unit(atom) + .map_err(|err| format!("{err}")) + .map(|chain| Self { chain }) + } +} + +/// Actual function that is called through C ABI. +/// +/// The parameter `ERROR_LEN` is supposed to point to an integer, +/// which will be set to the length of the error message, if and only +/// if an error occurs, in which case the `ERROR_STR` will be set to +/// point to the actual error message. +/// +/// It is expected that `*ERROR_STR` should hold the value `NULL` . +#[no_mangle] +extern "C" fn new_parser( + grammar_string: *mut std::os::raw::c_char, + error_vec: *mut LenVec<std::os::raw::c_char>, +) -> *mut Parser { + let parsed_str; + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + match std::ffi::CStr::from_ptr(grammar_string).to_str() { + Ok(ccstr) => { + parsed_str = ccstr.to_string(); + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + } + + match Parser::new(&parsed_str) { + Ok(result) => unsafe { + for i in 0..8 { + *(error_len.add(i)) = 0; + } + + Box::into_raw(Box::new(result)) + }, + Err(e) => unsafe { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + + std::mem::forget(e_string); + + std::ptr::null_mut() + }, + } +} + +#[no_mangle] +extern "C" fn clean_parser(parser: *const std::ffi::c_void) { + unsafe { + drop(Box::from_raw(parser as *mut Parser)); + } +} + +/// To make it easier to pass arrays to and from C. +#[repr(C)] +pub struct LenVec<T> { + /// This must be an array of unsigned char of length 8. + /// + /// A less length leads to access to invalid memory location, + /// while a longer length will be ignored, possibly leading to + /// wrong length. + /// + /// The length will be interpreted as a 64-bits integer stored in + /// big endian format. + pub len: *mut std::os::raw::c_uchar, + /// This must be an array of unsigned chars of length 8. + /// + /// This is only used so that Rust knows how to reconstruct the + /// data from C, in order for Rust to deallocate the objects + /// exposed by this library. + pub capacity: *mut std::os::raw::c_uchar, + /// The actual pointer to the data. + /// + /// In case this should be set by the function on the Rust end, + /// this field should be `NULL`. + pub data: *mut T, +} + +#[no_mangle] +extern "C" fn clean_signed(vec: *mut LenVec<std::os::raw::c_char>, flag: std::os::raw::c_uchar) { + let len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).len, 8) } + .try_into() + .unwrap(), + ); + let capacity = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } + .try_into() + .unwrap(), + ); + + if (flag & 1) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); + } + + if (flag & 2) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); + } + + if (flag & 4) != 0 { + drop(unsafe { String::from_raw_parts((*vec).data as *mut u8, len, capacity) }); + } + + if (flag & 8) != 0 { + drop(unsafe { Box::from_raw(vec) }); + } +} + +#[no_mangle] +extern "C" fn clean_unsigned(vec: *mut LenVec<std::os::raw::c_uchar>, flag: std::os::raw::c_uchar) { + let len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).len, 8) } + .try_into() + .unwrap(), + ); + let capacity = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } + .try_into() + .unwrap(), + ); + + if (flag & 1) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); + } + + if (flag & 2) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); + } + + if (flag & 4) != 0 { + drop(unsafe { Vec::<u8>::from_raw_parts((*vec).data, len, capacity) }); + } + + if (flag & 8) != 0 { + drop(unsafe { Box::from_raw(vec) }); + } +} + +#[no_mangle] +extern "C" fn parser_recognize( + parser: *mut Parser, + input_vec: *mut LenVec<std::os::raw::c_uchar>, + error_vec: *mut LenVec<std::os::raw::c_char>, + reset_p: std::os::raw::c_uchar, +) -> std::os::raw::c_int { + let input_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let mut parser_box; + let input_array_len = input_len; + let input_array; + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + parser_box = Box::from_raw(parser); + + input_array = std::slice::from_raw_parts((*input_vec).data, input_array_len); + } + + // If the parser has already been used before, reset it to the + // initial state. + + if reset_p != 0 && !parser_box.chain.history().is_empty() { + match DefaultChain::unit(parser_box.chain.atom().clone()) { + Ok(chain) => { + parser_box.chain = chain; + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + Box::leak(parser_box); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + } + } + + if input_array_len.rem_euclid(8) != 0 { + let mut e_string = + format!("error: input length should be divisible by 8, but got {input_array_len}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + + #[cfg(target_pointer_width = "64")] + let input_iter = input_array + .chunks_exact(8) + .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); + + #[cfg(not(target_pointer_width = "64"))] + compile_error!("this program assumes to be run on 64-bits machines"); + + for (index, token) in input_iter.enumerate() { + let chain_result = parser_box.chain.chain(token, index, true); + + if let Err(e) = chain_result { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + } + + match parser_box.chain.epsilon() { + Ok(result) => { + Box::leak(parser_box); + + result as std::os::raw::c_int + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + 0 + } + } } -#[cfg(test)] -mod tests { - use super::*; +#[no_mangle] +extern "C" fn parser_parse( + parser: *mut Parser, + input_vec: *mut LenVec<std::os::raw::c_uchar>, + error_vec: *mut LenVec<std::os::raw::c_char>, + reset_p: std::os::raw::c_uchar, +) -> *mut LenVec<std::os::raw::c_uchar> { + let input_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let mut parser_box; + let input_array; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + parser_box = Box::from_raw(parser); + + input_array = std::slice::from_raw_parts((*input_vec).data, input_len); + } + + // If the parser has already been used before, reset it to the + // initial state. + + if reset_p != 0 && !parser_box.chain.history().is_empty() { + match DefaultChain::unit(parser_box.chain.atom().clone()) { + Ok(chain) => { + parser_box.chain = chain; + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + Box::leak(parser_box); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + } + + if input_len.rem_euclid(8) != 0 { + let mut e_string = + format!("error: input length should be divisible by 8, but got {input_len}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } else if input_len == 0 { + Box::leak(parser_box); + + return std::ptr::null_mut(); + } + + #[cfg(target_pointer_width = "64")] + let input_iter = input_array + .chunks_exact(8) + .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); + + #[cfg(not(target_pointer_width = "64"))] + compile_error!("this program assumes to be run on 64-bits machines"); + + let mut last_pos: usize = 0; + let mut last_token: usize = 0; + + for (index, token) in input_iter.enumerate() { + last_pos = index; + last_token = token; + + let chain_result = parser_box.chain.chain(token, index, false); + + if let Err(e) = chain_result { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + + match parser_box.chain.epsilon() { + Ok(result) => { + if result { + let forest = parser_box.chain.end_of_input(last_pos + 1, last_token); + + match forest { + Ok(forest) => { + Box::leak(parser_box); + + let mut bytes = bytes::forest_to_bytes(&forest); + + let bytes_len = bytes.len().to_be_bytes().to_vec(); + + let bytes_capacity = bytes.capacity().to_be_bytes().to_vec(); + + let bytes_vec: LenVec<std::os::raw::c_uchar> = LenVec { + len: Box::leak(bytes_len.into_boxed_slice()).as_mut_ptr(), + capacity: Box::leak(bytes_capacity.into_boxed_slice()).as_mut_ptr(), + data: bytes.as_mut_ptr(), + }; + + std::mem::forget(bytes); + + Box::into_raw(Box::new(bytes_vec)) + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + std::ptr::null_mut() + } + } + } else { + Box::leak(parser_box); + + std::ptr::null_mut() + } + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + std::ptr::null_mut() + } } } + +pub mod bytes; diff --git a/src/old binding.txt b/src/old binding.txt new file mode 100644 index 0000000..0311f49 --- /dev/null +++ b/src/old binding.txt @@ -0,0 +1,218 @@ +//! This file provides the Emacs binding for the core. + +#![allow(unused_imports)] + +extern crate repcore; + +use repcore::{ + derive::stdag::{PFRecorder, STDAGParser}, + grammar::{Grammar, GrammarInfo}, + semiring::{IdentityRecorder, Nat, PFS}, + tokenizer::to_tokenizer, + valuation::{Count, ParseForest}, +}; + +use repcore::parser::Parser as PParser; +use repcore::parser::ParserReportConfig as PParserConfig; + +// TODO: Write Emacs binding + +// Let's design the API for Emacs to use. +// +// Emacs offers the grammar file or grammar string, and then we +// generate a parser. +// +// Thereafter when we want to parse some input, Emacs gives the input +// file name or the input file string to this package, and then the +// package gives back the parse result. +// +// Besides, we need a data type to deal with errors. +// +// Handling a string is not a problem for us, but to return a parser +// to Emacs, and to return the parse result, are kind of troublesome. +// +// To achieve this, we need data types for a parser and for the parse +// result. Since a parser is just an implementation detail and should +// not be of concern to the user, we mught just represent a parser as +// just a custom user-ptr in Emacs. +// +// As to the parse result, we return a set of tuples: +// +// (label, i, k, j), +// +// where LABEL is represented as a tuple +// +// (INDEX, STRING), +// +// where index is the index of the rule and the string is the name of +// the non-terminal or the terminal that is spanned by this packed +// node. +// +// This is not intended for the user to inspect and play with. The +// package should provide another function to interpret this data, for +// example to search something or to display it. The reason we don't +// directly return something to display is that the plain forest might +// contain cyclic data, and is inconvenient to represent in Emacs. So +// we only try to display this information to the user if the need +// arises. + +#[derive(Debug, Clone)] +#[repr(C)] +pub struct Parser { + g: Grammar, + gi: GrammarInfo, + stparser: STDAGParser<PFS, ParseForest, PFRecorder>, +} + +impl Parser { + fn new(gs: &str) -> Result<Self, Box<dyn std::error::Error>> { + let g: Grammar = gs.parse()?; + + let mut stparser = STDAGParser::new(); + + let mut gi = g.generate_info()?; + + stparser.init(&g, &mut gi)?; + + Ok(Self { g, gi, stparser }) + } +} + +#[no_mangle] +extern "C" fn new_parser( + gs: *mut std::os::raw::c_char, + error: *mut std::os::raw::c_int, +) -> *mut Parser { + let parsed_str; + let parser_result; + + unsafe { + let cstr = std::ffi::CStr::from_ptr(gs).to_str(); + + if cstr.is_err() { + *error = 1; + return std::ptr::null_mut(); + } + + parsed_str = cstr.unwrap().to_owned(); + + parser_result = Parser::new(&parsed_str); + + if parser_result.is_err() { + *error = 2; + eprintln!("failed: {parser_result:?}"); + return std::ptr::null_mut(); + } + + *error = 0; + } + + Box::into_raw(Box::new(parser_result.unwrap())) +} + +#[no_mangle] +extern "C" fn clean_parser(parser: *const std::ffi::c_void) { + unsafe { + Box::from_raw(parser as *mut Parser); + } +} + +// #[no_mangle] +// extern "C" fn reset_parser(parser: *mut Parser) { +// let mut parser_box; +// unsafe { +// parser_box = Box::from_raw(parser); +// } + +// parser_box +// .stparser +// .init(&parser_box.g, &parser_box.gi) +// .unwrap(); + +// std::mem::forget(parser_box); +// } + +// FIXME: Use a pointer to int to indicate the type of errors, and use +// a pointer to a custom error struct to convey the error messages. +#[no_mangle] +extern "C" fn parser_parse( + parser: *mut Parser, + file: *const std::os::raw::c_char, +) -> std::os::raw::c_int { + let mut parser_box; + let cstr; + + unsafe { + parser_box = Box::from_raw(parser); + cstr = std::ffi::CStr::from_ptr(file).to_str(); + + if cstr.is_err() { + eprintln!("error reading string: {:?}", cstr); + return 0; + } + } + + parser_box.stparser.reinit(); + + let file_string = std::fs::read_to_string(cstr.unwrap()); + + if file_string.is_err() { + eprintln!("error reading file: {:?}", file_string); + return 0; + } + + let file_string = file_string.unwrap(); + + let parse_result = parser_box + .stparser + .parse(&file_string.chars().collect::<Vec<_>>()); + + if parse_result.is_err() { + eprintln!("failed to parse: {:?}", parse_result); + return 0; + } + + let (accepting, _) = parse_result.unwrap(); + + std::mem::forget(parser_box); + + accepting as std::os::raw::c_int +} + +#[no_mangle] +extern "C" fn parser_parse_string( + parser: *mut Parser, + input: *const std::os::raw::c_char, +) -> std::os::raw::c_int { + let mut parser_box; + let cstr; + + unsafe { + parser_box = Box::from_raw(parser); + cstr = std::ffi::CStr::from_ptr(input).to_str(); + + if cstr.is_err() { + eprintln!("error reading string: {:?}", cstr); + return 0; + } + } + + parser_box.stparser.reinit(); + + let file_string = cstr.unwrap().to_owned(); + + let parse_result = parser_box + .stparser + .parse(&file_string.chars().collect::<Vec<_>>()); + + if parse_result.is_err() { + eprintln!("failed to parse: {:?}", parse_result); + return 0; + } + + let (accepting, _) = parse_result.unwrap(); + + std::mem::forget(parser_box); + + accepting as std::os::raw::c_int +} diff --git a/src/rep.so b/src/rep.so Binary files differnew file mode 100755 index 0000000..2bc412c --- /dev/null +++ b/src/rep.so diff --git a/src/test b/src/test Binary files differnew file mode 100755 index 0000000..5806c2d --- /dev/null +++ b/src/test diff --git a/src/test.c b/src/test.c new file mode 100644 index 0000000..d309fb7 --- /dev/null +++ b/src/test.c @@ -0,0 +1,224 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <limits.h> +#include <string.h> +#include <emacs-module.h> + +#include "big_endian.h" +#include "helper.h" + +int +main(int argc, char **argv) +{ + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + struct parser *parser = new_parser("start = \"a\"\"b\"\n", &error_vec); + + uint64_t error_length = from_big_endian(error_vec.len); + + if (error_length) { + printf("error: "); + + char *error_str = error_vec.data; + + for (int i = 0; i < error_length; i++) { + printf("%c", *(error_str+i)); + } + + printf("\n"); + + clean_signed(&error_vec, 4); + + return 1; + } + + unsigned char *input = malloc (sizeof(unsigned char) * 16); + + for (int i = 0; i < 16; i++) *(input+i) = 0; + + *(input+15) = 1; + + unsigned char input_len[8] = { 0 }; + input_len[7] = 16; + + struct UnsignedVec input_vec = (struct UnsignedVec) { input_len, NULL, input }; + + int result = parser_recognize + (parser, + &input_vec, + &error_vec, + (unsigned char) 0); + + error_length = from_big_endian(error_vec.len); + + if (error_length) { + clean_parser((void *) parser); + free(input); + + printf("error: "); + + char *error_str = error_vec.data; + + for (uint64_t i = 0; i < error_length; i++) { + printf("%c", *(error_str + (unsigned long) i)); + } + + printf("\n"); + + clean_signed(&error_vec, 4); + + return 1; + } + + if (result) + printf("result = true\n"); + else + printf("result = false\n"); + + for (int i = 0; i < 8; i++) { + error_vec.len[i] = 0; + error_vec.capacity[i] = 0; + } + + struct UnsignedVec *forest_ptr = parser_parse + (parser, + &input_vec, + &error_vec, + (unsigned char) 1); + + error_length = from_big_endian(error_vec.len); + + if (error_length) { + clean_parser((void *) parser); + free(input); + + printf("error: "); + + char *error_str = error_vec.data; + + for (uint64_t i = 0; i < error_length; i++) { + printf("%c", *(error_str + (unsigned long) i)); + } + + printf("\n"); + + clean_signed(&error_vec, 4); + + return 1; + } + + free(input); + clean_parser((void *) parser); + + if (forest_ptr) { + uint64_t forest_len = from_big_endian(forest_ptr->len); + + /* forest_len++; */ + + printf("a non-empty forest of length %llu\n", forest_len); + + unsigned char *forest = forest_ptr->data; + + uint64_t forest_real_len = from_big_endian(forest); + + if (forest_real_len != forest_len) { + fprintf(stderr, "wrong length: %llu\n", forest_real_len); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + if (forest_len < 27) { + fprintf(stderr, "length too small: %llu\n", forest_len); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + printf("the lengths match\n"); + + if (*(forest+8) != 114 || *(forest+9) != 101 || *(forest+10) != 112) { + fprintf(stderr, "the forest does not begin with the special mark\n"); + + fprintf(stderr, "the first bytes are: "); + fprintf(stderr, "%c", *(forest+8)); + fprintf(stderr, "%c", *(forest+9)); + fprintf(stderr, "%c\n", *(forest+10)); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + printf("the special mark is correct.\n"); + + uint64_t nodes_len = from_big_endian(forest+11); + + printf("forest has %llu nodes\n", nodes_len); + + uint64_t labels_offset = from_big_endian(forest+19); + + printf("the offset of labels is %llu\n", labels_offset); + + if (forest_len < labels_offset || + forest_len < (27 + 16 * nodes_len)) { + fprintf(stderr, "length too small: %llu\n", forest_len); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + uint64_t total_degree = 0; + + for (uint64_t i = 0; i < nodes_len; i++) { + uint64_t offset = 27 + 16 * i; + + uint64_t degree = from_big_endian(forest+offset); + uint64_t node_offset = from_big_endian(forest+offset+8); + + total_degree += degree; + + printf("the %llu-th node has degree %llu and offset %llu\n", + i, degree, node_offset); + + printf("label: "); + print_label(read_label(forest + labels_offset + 34 * i)); + + for (uint64_t j = 0; j < degree; j++) { + uint64_t child = from_big_endian(forest+node_offset+8*j); + + printf("the %llu-th child is %llu\n", j, child); + } + + printf("\n"); + } + + uint64_t correct = 27 + 50 * nodes_len + 8 * total_degree; + + if (forest_len != correct) { + fprintf(stderr, "length does not conform to the format: %llu\n", correct); + + clean_unsigned(forest_ptr, 15); + + return 1; + } + + printf("the forest has the correct length according to the format\n"); + + clean_unsigned(forest_ptr, 15); + } else { + printf("no forest\n"); + } + + return 0; +} diff --git a/src/test.el b/src/test.el new file mode 100644 index 0000000..d106a03 --- /dev/null +++ b/src/test.el @@ -0,0 +1,28 @@ +(load-file "rep.so") +(setq rep-dir (vc-call-backend 'Git 'root default-directory)) + +(setq test-parser + (rep-new-parser + (with-temp-buffer + (insert-file-contents + (expand-file-name + "test.abnf" + (expand-file-name + "abnf grammars" + (expand-file-name "grammar" rep-dir)))) + (buffer-string)))) + +(defvar input nil "A vector that represents a testing input.") + +(setq input (vector 3 0 2 2 2 1 1 1 0 1)) + +(rep-recognize test-parser input) +(rep-parse test-parser input) + +;; (rep-parse-string +;; test-parser +;; "* title\nprice: 512\nnote: this is a test\n") + +;; (rep-parse-string +;; test-parser +;; "183t3ru") |