From 9a317e56f8a6126583f7d0c431bf878d9b1fe7b1 Mon Sep 17 00:00:00 2001 From: JSDurand Date: Sat, 8 Jul 2023 12:30:21 +0800 Subject: Finished the Emacs binding. Now the binding part is finished. What remains is a bug encountered when planting a fragment to the forest which intersects a packed node, which would lead to invalid forests. This will also cause problem when planting a packed fragment, but until now my testing grammars do not produce packed fragments, so this problem is not encountered yet. I am still figuring out efficient ways to solve this problem. --- src/lib.rs | 552 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 542 insertions(+), 10 deletions(-) (limited to 'src/lib.rs') diff --git a/src/lib.rs b/src/lib.rs index f5457c3..685c66e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,16 +1,548 @@ -// TODO: Add Emacs bindings +#![warn(missing_docs)] +//! This top level package provides necessary functions for Emacs to +//! call. -pub fn add(left: usize, right: usize) -> usize { - left + right +extern crate chain; +extern crate grammar; + +use chain::{atom::DefaultAtom, default::DefaultChain, Chain}; +use grammar::Grammar; + +/// This struct is the representation of a parser. +/// +/// When the user constructs a parser, an instance of this struct will +/// be constructed and the user will receive an opaque pointer to this +/// struct. +#[derive(Debug, Clone)] +#[repr(C)] +pub struct Parser { + chain: DefaultChain, +} + +impl Parser { + /// Construct a parser from the grammar string. + /// + /// The grammar is supposed to conform to the Augmented + /// Backus-Naur Format. See RFC 5234 for exact details of this + /// format. + pub fn new(s: &str) -> Result { + let grammar: Grammar = s.parse().map_err(|err| format!("{err}"))?; + let atom: DefaultAtom = + DefaultAtom::from_grammar(grammar).map_err(|err| format!("{err}"))?; + + DefaultChain::unit(atom) + .map_err(|err| format!("{err}")) + .map(|chain| Self { chain }) + } +} + +/// Actual function that is called through C ABI. +/// +/// The parameter `ERROR_LEN` is supposed to point to an integer, +/// which will be set to the length of the error message, if and only +/// if an error occurs, in which case the `ERROR_STR` will be set to +/// point to the actual error message. +/// +/// It is expected that `*ERROR_STR` should hold the value `NULL` . +#[no_mangle] +extern "C" fn new_parser( + grammar_string: *mut std::os::raw::c_char, + error_vec: *mut LenVec, +) -> *mut Parser { + let parsed_str; + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + match std::ffi::CStr::from_ptr(grammar_string).to_str() { + Ok(ccstr) => { + parsed_str = ccstr.to_string(); + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + } + + match Parser::new(&parsed_str) { + Ok(result) => unsafe { + for i in 0..8 { + *(error_len.add(i)) = 0; + } + + Box::into_raw(Box::new(result)) + }, + Err(e) => unsafe { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + + std::mem::forget(e_string); + + std::ptr::null_mut() + }, + } +} + +#[no_mangle] +extern "C" fn clean_parser(parser: *const std::ffi::c_void) { + unsafe { + drop(Box::from_raw(parser as *mut Parser)); + } +} + +/// To make it easier to pass arrays to and from C. +#[repr(C)] +pub struct LenVec { + /// This must be an array of unsigned char of length 8. + /// + /// A less length leads to access to invalid memory location, + /// while a longer length will be ignored, possibly leading to + /// wrong length. + /// + /// The length will be interpreted as a 64-bits integer stored in + /// big endian format. + pub len: *mut std::os::raw::c_uchar, + /// This must be an array of unsigned chars of length 8. + /// + /// This is only used so that Rust knows how to reconstruct the + /// data from C, in order for Rust to deallocate the objects + /// exposed by this library. + pub capacity: *mut std::os::raw::c_uchar, + /// The actual pointer to the data. + /// + /// In case this should be set by the function on the Rust end, + /// this field should be `NULL`. + pub data: *mut T, +} + +#[no_mangle] +extern "C" fn clean_signed(vec: *mut LenVec, flag: std::os::raw::c_uchar) { + let len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).len, 8) } + .try_into() + .unwrap(), + ); + let capacity = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } + .try_into() + .unwrap(), + ); + + if (flag & 1) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); + } + + if (flag & 2) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); + } + + if (flag & 4) != 0 { + drop(unsafe { String::from_raw_parts((*vec).data as *mut u8, len, capacity) }); + } + + if (flag & 8) != 0 { + drop(unsafe { Box::from_raw(vec) }); + } +} + +#[no_mangle] +extern "C" fn clean_unsigned(vec: *mut LenVec, flag: std::os::raw::c_uchar) { + let len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).len, 8) } + .try_into() + .unwrap(), + ); + let capacity = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } + .try_into() + .unwrap(), + ); + + if (flag & 1) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); + } + + if (flag & 2) != 0 { + drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); + } + + if (flag & 4) != 0 { + drop(unsafe { Vec::::from_raw_parts((*vec).data, len, capacity) }); + } + + if (flag & 8) != 0 { + drop(unsafe { Box::from_raw(vec) }); + } +} + +#[no_mangle] +extern "C" fn parser_recognize( + parser: *mut Parser, + input_vec: *mut LenVec, + error_vec: *mut LenVec, + reset_p: std::os::raw::c_uchar, +) -> std::os::raw::c_int { + let input_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let mut parser_box; + let input_array_len = input_len; + let input_array; + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + parser_box = Box::from_raw(parser); + + input_array = std::slice::from_raw_parts((*input_vec).data, input_array_len); + } + + // If the parser has already been used before, reset it to the + // initial state. + + if reset_p != 0 && !parser_box.chain.history().is_empty() { + match DefaultChain::unit(parser_box.chain.atom().clone()) { + Ok(chain) => { + parser_box.chain = chain; + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + Box::leak(parser_box); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + } + } + + if input_array_len.rem_euclid(8) != 0 { + let mut e_string = + format!("error: input length should be divisible by 8, but got {input_array_len}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + + #[cfg(target_pointer_width = "64")] + let input_iter = input_array + .chunks_exact(8) + .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); + + #[cfg(not(target_pointer_width = "64"))] + compile_error!("this program assumes to be run on 64-bits machines"); + + for (index, token) in input_iter.enumerate() { + let chain_result = parser_box.chain.chain(token, index, true); + + if let Err(e) = chain_result { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return 0; + } + } + + match parser_box.chain.epsilon() { + Ok(result) => { + Box::leak(parser_box); + + result as std::os::raw::c_int + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + 0 + } + } } -#[cfg(test)] -mod tests { - use super::*; +#[no_mangle] +extern "C" fn parser_parse( + parser: *mut Parser, + input_vec: *mut LenVec, + error_vec: *mut LenVec, + reset_p: std::os::raw::c_uchar, +) -> *mut LenVec { + let input_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let mut parser_box; + let input_array; - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + unsafe { + parser_box = Box::from_raw(parser); + + input_array = std::slice::from_raw_parts((*input_vec).data, input_len); + } + + // If the parser has already been used before, reset it to the + // initial state. + + if reset_p != 0 && !parser_box.chain.history().is_empty() { + match DefaultChain::unit(parser_box.chain.atom().clone()) { + Ok(chain) => { + parser_box.chain = chain; + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + Box::leak(parser_box); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + } + + if input_len.rem_euclid(8) != 0 { + let mut e_string = + format!("error: input length should be divisible by 8, but got {input_len}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } else if input_len == 0 { + Box::leak(parser_box); + + return std::ptr::null_mut(); + } + + #[cfg(target_pointer_width = "64")] + let input_iter = input_array + .chunks_exact(8) + .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); + + #[cfg(not(target_pointer_width = "64"))] + compile_error!("this program assumes to be run on 64-bits machines"); + + let mut last_pos: usize = 0; + let mut last_token: usize = 0; + + for (index, token) in input_iter.enumerate() { + last_pos = index; + last_token = token; + + let chain_result = parser_box.chain.chain(token, index, false); + + if let Err(e) = chain_result { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return std::ptr::null_mut(); + } + } + + match parser_box.chain.epsilon() { + Ok(result) => { + if result { + let forest = parser_box.chain.end_of_input(last_pos + 1, last_token); + + match forest { + Ok(forest) => { + Box::leak(parser_box); + + let mut bytes = bytes::forest_to_bytes(&forest); + + let bytes_len = bytes.len().to_be_bytes().to_vec(); + + let bytes_capacity = bytes.capacity().to_be_bytes().to_vec(); + + let bytes_vec: LenVec = LenVec { + len: Box::leak(bytes_len.into_boxed_slice()).as_mut_ptr(), + capacity: Box::leak(bytes_capacity.into_boxed_slice()).as_mut_ptr(), + data: bytes.as_mut_ptr(), + }; + + std::mem::forget(bytes); + + Box::into_raw(Box::new(bytes_vec)) + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + std::ptr::null_mut() + } + } + } else { + Box::leak(parser_box); + + std::ptr::null_mut() + } + } + Err(e) => { + let mut e_string = format!("error: {e}"); + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + Box::leak(parser_box); + + unsafe { + for i in 0..8 { + *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + std::ptr::null_mut() + } } } + +pub mod bytes; -- cgit v1.2.3-18-g5258