#![warn(missing_docs)] //! This top level package provides necessary functions for Emacs to //! call. extern crate chain; extern crate grammar; use chain::{atom::DefaultAtom, default::DefaultChain, Chain}; use grammar::Grammar; // For printing forests use chain::item::{default::DefaultForest, ForestLabel, ForestLabelType}; use grammar::{GrammarLabel, TNT}; use graph::LabelGraph; /// This struct is the representation of a parser. /// /// When the user constructs a parser, an instance of this struct will /// be constructed and the user will receive an opaque pointer to this /// struct. #[derive(Debug, Clone)] #[repr(C)] pub struct Parser { chain: DefaultChain, } impl Parser { /// Construct a parser from the grammar string. /// /// The grammar is supposed to conform to the Augmented /// Backus-Naur Format. See RFC 5234 for exact details of this /// format. pub fn new(s: &str) -> Result { let grammar: Grammar = s.parse().map_err(|err| format!("{err}"))?; println!("grammar: {grammar}"); let atom: DefaultAtom = DefaultAtom::from_grammar(grammar).map_err(|err| format!("{err}"))?; DefaultChain::unit(atom) .map_err(|err| format!("{err}")) .map(|chain| Self { chain }) } } /// Actual function that is called through C ABI. /// /// The parameter `ERROR_LEN` is supposed to point to an integer, /// which will be set to the length of the error message, if and only /// if an error occurs, in which case the `ERROR_STR` will be set to /// point to the actual error message. /// /// It is expected that `*ERROR_STR` should hold the value `NULL` . #[no_mangle] extern "C" fn new_parser( grammar_string: *mut std::os::raw::c_char, error_vec: *mut LenVec, ) -> *mut Parser { let parsed_str; let error_len = unsafe { (*error_vec).len }; let error_cap = unsafe { (*error_vec).capacity }; unsafe { match std::ffi::CStr::from_ptr(grammar_string).to_str() { Ok(ccstr) => { parsed_str = ccstr.to_string(); } Err(e) => { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; std::mem::forget(e_string); return std::ptr::null_mut(); } } } match Parser::new(&parsed_str) { Ok(result) => unsafe { for i in 0..8 { *(error_len.add(i)) = 0; } Box::into_raw(Box::new(result)) }, Err(e) => unsafe { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; std::mem::forget(e_string); std::ptr::null_mut() }, } } #[no_mangle] extern "C" fn clean_parser(parser: *const std::ffi::c_void) { unsafe { drop(Box::from_raw(parser as *mut Parser)); } } /// To make it easier to pass arrays to and from C. #[repr(C)] pub struct LenVec { /// This must be an array of unsigned char of length 8. /// /// A less length leads to access to invalid memory location, /// while a longer length will be ignored, possibly leading to /// wrong length. /// /// The length will be interpreted as a 64-bits integer stored in /// big endian format. pub len: *mut std::os::raw::c_uchar, /// This must be an array of unsigned chars of length 8. /// /// This is only used so that Rust knows how to reconstruct the /// data from C, in order for Rust to deallocate the objects /// exposed by this library. pub capacity: *mut std::os::raw::c_uchar, /// The actual pointer to the data. /// /// In case this should be set by the function on the Rust end, /// this field should be `NULL`. pub data: *mut T, } #[no_mangle] extern "C" fn clean_signed(vec: *mut LenVec, flag: std::os::raw::c_uchar) { let len = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*vec).len, 8) } .try_into() .unwrap(), ); let capacity = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } .try_into() .unwrap(), ); if (flag & 1) != 0 { drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); } if (flag & 2) != 0 { drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); } if (flag & 4) != 0 { drop(unsafe { String::from_raw_parts((*vec).data as *mut u8, len, capacity) }); } if (flag & 8) != 0 { drop(unsafe { Box::from_raw(vec) }); } } #[no_mangle] extern "C" fn clean_unsigned(vec: *mut LenVec, flag: std::os::raw::c_uchar) { let len = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*vec).len, 8) } .try_into() .unwrap(), ); let capacity = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*vec).capacity, 8) } .try_into() .unwrap(), ); if (flag & 1) != 0 { drop(unsafe { Vec::from_raw_parts((*vec).len, 8, 8) }); } if (flag & 2) != 0 { drop(unsafe { Vec::from_raw_parts((*vec).capacity, 8, 8) }); } if (flag & 4) != 0 { drop(unsafe { Vec::::from_raw_parts((*vec).data, len, capacity) }); } if (flag & 8) != 0 { drop(unsafe { Box::from_raw(vec) }); } } #[no_mangle] extern "C" fn parser_recognize( parser: *mut Parser, input_vec: *mut LenVec, error_vec: *mut LenVec, reset_p: std::os::raw::c_uchar, ) -> std::os::raw::c_int { let input_len = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } .try_into() .unwrap(), ); let mut parser_box; let input_array_len = input_len; let input_array; let error_len = unsafe { (*error_vec).len }; let error_cap = unsafe { (*error_vec).capacity }; unsafe { parser_box = Box::from_raw(parser); input_array = std::slice::from_raw_parts((*input_vec).data, input_array_len); } // If the parser has already been used before, reset it to the // initial state. if reset_p != 0 && !parser_box.chain.history().is_empty() { match DefaultChain::unit(parser_box.chain.atom().clone()) { Ok(chain) => { parser_box.chain = chain; } Err(e) => { let mut e_string = format!("error: {e}"); Box::leak(parser_box); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return 0; } } } if input_array_len.rem_euclid(8) != 0 { let mut e_string = format!("error: input length should be divisible by 8, but got {input_array_len}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return 0; } #[cfg(target_pointer_width = "64")] let input_iter = input_array .chunks_exact(8) .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); #[cfg(not(target_pointer_width = "64"))] compile_error!("this program assumes to be run on 64-bits machines"); for (index, token) in input_iter.enumerate() { let chain_result = parser_box.chain.chain(token, index, true); if let Err(e) = chain_result { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return 0; } } match parser_box.chain.epsilon() { Ok(result) => { Box::leak(parser_box); result as std::os::raw::c_int } Err(e) => { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); 0 } } } #[no_mangle] extern "C" fn parser_parse( parser: *mut Parser, input_vec: *mut LenVec, error_vec: *mut LenVec, reset_p: std::os::raw::c_uchar, ) -> *mut LenVec { let input_len = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*input_vec).len, 8) } .try_into() .unwrap(), ); let mut parser_box; let input_array; let error_len = unsafe { (*error_vec).len }; let error_cap = unsafe { (*error_vec).capacity }; unsafe { parser_box = Box::from_raw(parser); input_array = std::slice::from_raw_parts((*input_vec).data, input_len); } // If the parser has already been used before, reset it to the // initial state. if reset_p != 0 && !parser_box.chain.history().is_empty() { match DefaultChain::unit(parser_box.chain.atom().clone()) { Ok(chain) => { parser_box.chain = chain; } Err(e) => { let mut e_string = format!("error: {e}"); Box::leak(parser_box); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return std::ptr::null_mut(); } } } if input_len.rem_euclid(8) != 0 { let mut e_string = format!("error: input length should be divisible by 8, but got {input_len}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return std::ptr::null_mut(); } else if input_len == 0 { Box::leak(parser_box); return std::ptr::null_mut(); } #[cfg(target_pointer_width = "64")] let input_iter = input_array .chunks_exact(8) .map(|chunk| usize::from_be_bytes(<[u8; 8]>::try_from(chunk).unwrap())); #[cfg(not(target_pointer_width = "64"))] compile_error!("this program assumes to be run on 64-bits machines"); let mut last_pos: usize = 0; let mut last_token: usize = 0; for (index, token) in input_iter.enumerate() { last_pos = index; last_token = token; let chain_result = parser_box.chain.chain(token, index, false); if let Err(e) = chain_result { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return std::ptr::null_mut(); } } match parser_box.chain.epsilon() { Ok(result) => { if result { let forest = parser_box.chain.end_of_input(last_pos + 1, last_token); match forest { Ok(forest) => { Box::leak(parser_box); let mut bytes = bytes::forest_to_bytes(&forest); let bytes_len = bytes.len().to_be_bytes().to_vec(); let bytes_capacity = bytes.capacity().to_be_bytes().to_vec(); let bytes_vec: LenVec = LenVec { len: Box::leak(bytes_len.into_boxed_slice()).as_mut_ptr(), capacity: Box::leak(bytes_capacity.into_boxed_slice()).as_mut_ptr(), data: bytes.as_mut_ptr(), }; std::mem::forget(bytes); Box::into_raw(Box::new(bytes_vec)) } Err(e) => { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); std::ptr::null_mut() } } } else { Box::leak(parser_box); std::ptr::null_mut() } } Err(e) => { let mut e_string = format!("error: {e}"); let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); Box::leak(parser_box); unsafe { for i in 0..8 { *(error_len.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *(error_cap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*error_vec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); std::ptr::null_mut() } } } fn read_label(label: *mut std::os::raw::c_uchar) -> ForestLabel { let status: u8 = unsafe { *label }; let label_status: ForestLabelType; match status { 0 => { label_status = ForestLabelType::Plain; } 1 => { label_status = ForestLabelType::Packed; } _ => { label_status = ForestLabelType::Cloned(usize::from_be_bytes( unsafe { std::slice::from_raw_parts(label.add(1), 8) } .try_into() .unwrap(), )); } } let start = usize::from_be_bytes( unsafe { std::slice::from_raw_parts(label.add(9), 8) } .try_into() .unwrap(), ); let end = usize::from_be_bytes( unsafe { std::slice::from_raw_parts(label.add(17), 8) } .try_into() .unwrap(), ); let discriminant: u8 = unsafe { *label.add(25) }; let content = usize::from_be_bytes( unsafe { std::slice::from_raw_parts(label.add(26), 8) } .try_into() .unwrap(), ); let inner_label: GrammarLabel; match discriminant { 0 => { inner_label = GrammarLabel::new_closed(TNT::Ter(content), start, end); } 1 => { inner_label = GrammarLabel::new_closed(TNT::Non(content), start, end); } _ => { inner_label = GrammarLabel::new_closed(content, start, end); } } ForestLabel::new(inner_label, label_status) } macro_rules! return_error { ($err:expr, $elen:ident, $ecap:ident, $evec:ident) => { let mut e_string = $err; let e_string_len_slice = e_string.len().to_be_bytes(); let e_string_cap_slice = e_string.capacity().to_be_bytes(); unsafe { for i in 0..8 { *($elen.add(i)) = e_string_len_slice.get(i).copied().unwrap(); *($ecap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); } (*$evec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; } std::mem::forget(e_string); return; }; } #[no_mangle] extern "C" fn print_forest( forest_vec: *mut LenVec, error_vec: *mut LenVec, filename: *mut std::os::raw::c_char, ) { let forest_len = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).len, 8) } .try_into() .unwrap(), ); let error_len = unsafe { (*error_vec).len }; let error_cap = unsafe { (*error_vec).capacity }; if forest_len < 27 { return_error!( format!("forest bytes length {forest_len} < 27"), error_len, error_cap, error_vec ); } let nodes_len = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).data.add(11), 8) } .try_into() .unwrap(), ); println!("the forest has {nodes_len} nodes"); let special_marks = unsafe { std::slice::from_raw_parts((*forest_vec).data.add(8), 3) }; if special_marks != &[114, 101, 112] { return_error!( format!( "the forest does not begin with the special mark\nThe first bytes are: \ {:?}\n", special_marks ), error_len, error_cap, error_vec ); } let labels_offset = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).data.add(19), 8) } .try_into() .unwrap(), ); println!("labels_offset = {labels_offset}"); if forest_len < labels_offset + 34 * nodes_len || forest_len < (27 + 16 * nodes_len) { return_error!( format!( "the forest length is too small: {forest_len}\n\ labels offset + 34 * nodes_len = {}, all {nodes_len} \ nodes take {}\n", labels_offset + 34 * nodes_len, 27 + 16 * nodes_len ), error_len, error_cap, error_vec ); } let mut total_degree = 0usize; let preamble = "digraph forest { fontname=\"Helvetica,Arial,sans-serif\" node [fontname=\"Helvetica,Arial,sans-serif\", ordering=out] edge [fontname=\"Helvetica,Arial,sans-serif\"] rankdir=LR;\n"; let mut post = String::new(); for node in 0..nodes_len { let degree = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * node), 8) } .try_into() .unwrap(), ); total_degree += degree; post.push_str(&format!( " {node} [label = \"{node}:{}\"]\n", read_label(unsafe { (*forest_vec).data.add(labels_offset + 34 * node) }) )); } println!("total degree = {total_degree}"); let correct_len: usize = 27 + 50 * nodes_len + 8 * total_degree; println!("correct length = {correct_len}"); if forest_len != correct_len { return_error!( format!("the forest length {forest_len} should be equal to: {correct_len}\n"), error_len, error_cap, error_vec ); } for source in 0..nodes_len { let degree = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source), 8) } .try_into() .unwrap(), ); let node_offset = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source + 8), 8) } .try_into() .unwrap(), ); if forest_len <= node_offset + 8 * degree { return_error!( format!( "the forest length {forest_len} is <= {node_offset} + 8 * {degree} = {}\n", node_offset + 8 * degree ), error_len, error_cap, error_vec ); } for i in 0..degree { let target = usize::from_be_bytes( unsafe { std::slice::from_raw_parts((*forest_vec).data.add(node_offset + 8 * i), 8) } .try_into() .unwrap(), ); post.push_str(&format!(" {source} -> {target}\n")); } } post.push_str("}\n"); let result = format!("{preamble}{post}"); let parsed_filename; match unsafe { std::ffi::CStr::from_ptr(filename).to_str() } { Ok(ccstr) => { parsed_filename = ccstr; } Err(e) => { return_error!(format!("error: {e}"), error_len, error_cap, error_vec); } } if std::fs::metadata(parsed_filename).is_ok() { let _ = std::fs::remove_file(parsed_filename); } let file = std::fs::File::options() .write(true) .create(true) .open(parsed_filename); use std::io::Write; match file { Ok(mut file) => { if let Err(e) = file.write_all(result.as_bytes()) { return_error!(format!("error: {e}"), error_len, error_cap, error_vec); } } Err(e) => { return_error!(format!("error: {e}"), error_len, error_cap, error_vec); } } } // TODO: Write a function to print the node label of a forest and // expose it to C ABI. // // This can be used in LLDB. /// This struct is a wrapper around the forest. /// /// This is used so that we can call a C function receiving a pointer /// to a forest struct. #[derive(Debug, Clone)] #[repr(C)] pub struct CForest { forest: DefaultForest>, } /// Print the label of the node with id `node` in the forest `forest`. /// /// The parameter `node` should point to 8 bytes of unsigned /// characters, which forms a number in 64 bits, in the *big endian* /// format. #[no_mangle] extern "C" fn print_forest_node(forest: *mut CForest, node: *mut std::os::raw::c_uchar) { let node = usize::from_be_bytes( unsafe { std::slice::from_raw_parts(node, 8) } .try_into() .unwrap(), ); let forest = unsafe { (*forest).forest.clone() }; let Ok(Some(label)) = forest.vertex_label(node) else { return; }; println!("node {node} has label {label}"); } pub mod bytes;