From 7033187abaf42772097377c0a1ffc2cd4cefdada Mon Sep 17 00:00:00 2001 From: JSDurand Date: Fri, 4 Aug 2023 10:12:04 +0800 Subject: minor adjustments Not bug deals but adjustments of details. --- src/Makefile.am | 9 +- src/helper.c | 73 ++++++++++++++ src/helper.h | 6 ++ src/lib.rs | 273 ++++++++++++++++++++++++++++++++++++++++++++++++++++- src/print_forest.c | 21 +++++ src/test.c | 40 ++++++-- src/test.el | 109 +++++++++++++++++++++ 7 files changed, 516 insertions(+), 15 deletions(-) create mode 100644 src/print_forest.c (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index 94399cb..a56c453 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -18,8 +18,11 @@ big_endian.o: big_endian.c big_endian.h helper.o: helper.c helper.h big_endian.o $(CC) $(CFLAGS) -c helper.c -o $@ -test: test.c ../target/debug/librep.dylib helper.o big_endian.o - $(CC) $(CFLAGS) $+ -o $@ +test: test.c rs ../target/debug/librep.dylib helper.o big_endian.o + $(CC) $(CFLAGS) test.c ../target/debug/librep.dylib helper.o big_endian.o -o $@ + +print_forest: print_forest.c rs helper.o big_endian.o ../target/debug/librep.dylib + $(CC) $(CFLAGS) print_forest.c ../target/debug/librep.dylib helper.o big_endian.o -o $@ .PHONY: clean rs windows @@ -27,6 +30,8 @@ clean: -rm -f *.o -rm -f rep.so -rm -f test + -rm -f print_forest + -rm -rf *.dSYM cargo clean windows: diff --git a/src/helper.c b/src/helper.c index d52fa5f..3b02c81 100644 --- a/src/helper.c +++ b/src/helper.c @@ -1,3 +1,4 @@ +#include #include "helper.h" #include "big_endian.h" @@ -82,3 +83,75 @@ print_node(struct CForest *forest, uint64_t node) print_forest_node(forest, node_ptr); } + +void print_forest_file(char *filename, char *output_filename) +{ + unsigned char error_vec_len[8] = { 0 }; + unsigned char error_vec_cap[8] = { 0 }; + + struct SignedVec error_vec = { 0 }; + + error_vec.len = error_vec_len; + error_vec.capacity = error_vec_cap; + + /* Now read the file into bytes and pack inside a struct UnsignedVec + and pass to the function. */ + + FILE *file = fopen(filename, "r"); + + if (file == NULL) { + fprintf(stderr, "Cannot open file %s\n", filename); + + return; + } + + fseek(file, 0, SEEK_END); + + uint64_t file_size = ftell(file); + + fseek(file, 0, SEEK_SET); + + unsigned char *file_buffer = malloc(sizeof(unsigned char) * file_size); + + if (file_buffer == NULL) { + fprintf(stderr, "%s:%d: Cannot allocate %llu memory\n", + __FILE__, __LINE__, + file_size); + + return; + } + + fread(file_buffer, 1, file_size, file); + + fclose(file); + + printf("file size = %llu\n", file_size); + + unsigned char forest_len[8] = { 0 }; + + struct UnsignedVec forest = { 0 }; + + forest.len = forest_len; + + to_big_endian(file_size, forest.len); + + forest.data = file_buffer; + + print_forest(&forest, &error_vec, output_filename); + + uint64_t error_len = from_big_endian(error_vec.len); + + if (error_len) { + fprintf(stderr, "error: "); + + for (uint64_t i = 0; i < error_len; i++) { + fprintf(stderr, "%c", *(error_vec.data+i)); + } + + fprintf(stderr, "\n"); + + clean_signed(&error_vec, 4); + } + + free(file_buffer); +} diff --git a/src/helper.h b/src/helper.h index 37cd7fd..2f4933a 100644 --- a/src/helper.h +++ b/src/helper.h @@ -84,4 +84,10 @@ void print_forest_node(struct CForest *forest, unsigned char *node); void print_node(struct CForest *forest, uint64_t node); +void print_forest(struct UnsignedVec *forest_vec, + struct SignedVec *error_vec, + char *filename); + +void print_forest_file(char *filename, char *output_filename); + #endif diff --git a/src/lib.rs b/src/lib.rs index 7cc5223..aed8536 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,8 +9,8 @@ use chain::{atom::DefaultAtom, default::DefaultChain, Chain}; use grammar::Grammar; // For printing forests -use chain::item::{default::DefaultForest, ForestLabel}; -use grammar::GrammarLabel; +use chain::item::{default::DefaultForest, ForestLabel, ForestLabelType}; +use grammar::{GrammarLabel, TNT}; use graph::LabelGraph; /// This struct is the representation of a parser. @@ -483,9 +483,6 @@ extern "C" fn parser_parse( match forest { Ok(forest) => { - use graph::Graph; - forest.print_viz("test forest.gv").unwrap(); - Box::leak(parser_box); let mut bytes = bytes::forest_to_bytes(&forest); @@ -556,6 +553,272 @@ extern "C" fn parser_parse( } } +fn read_label(label: *mut std::os::raw::c_uchar) -> ForestLabel { + let status: u8 = unsafe { *label }; + + let label_status: ForestLabelType; + + match status { + 0 => { + label_status = ForestLabelType::Plain; + } + 1 => { + label_status = ForestLabelType::Packed; + } + _ => { + label_status = ForestLabelType::Cloned(usize::from_be_bytes( + unsafe { std::slice::from_raw_parts(label.add(1), 8) } + .try_into() + .unwrap(), + )); + } + } + + let start = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts(label.add(9), 8) } + .try_into() + .unwrap(), + ); + + let end = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts(label.add(17), 8) } + .try_into() + .unwrap(), + ); + + let discriminant: u8 = unsafe { *label.add(25) }; + + let content = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts(label.add(26), 8) } + .try_into() + .unwrap(), + ); + + let inner_label: GrammarLabel; + + match discriminant { + 0 => { + inner_label = GrammarLabel::new_closed(TNT::Ter(content), start, end); + } + 1 => { + inner_label = GrammarLabel::new_closed(TNT::Non(content), start, end); + } + _ => { + inner_label = GrammarLabel::new_closed(content, start, end); + } + } + + ForestLabel::new(inner_label, label_status) +} + +macro_rules! return_error { + ($err:expr, $elen:ident, $ecap:ident, $evec:ident) => { + let mut e_string = $err; + + let e_string_len_slice = e_string.len().to_be_bytes(); + let e_string_cap_slice = e_string.capacity().to_be_bytes(); + + unsafe { + for i in 0..8 { + *($elen.add(i)) = e_string_len_slice.get(i).copied().unwrap(); + *($ecap.add(i)) = e_string_cap_slice.get(i).copied().unwrap(); + } + + (*$evec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char; + } + + std::mem::forget(e_string); + + return; + }; +} + +#[no_mangle] +extern "C" fn print_forest( + forest_vec: *mut LenVec, + error_vec: *mut LenVec, + filename: *mut std::os::raw::c_char, +) { + let forest_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*forest_vec).len, 8) } + .try_into() + .unwrap(), + ); + + let error_len = unsafe { (*error_vec).len }; + let error_cap = unsafe { (*error_vec).capacity }; + + if forest_len < 27 { + return_error!( + format!("forest bytes length {forest_len} < 27"), + error_len, + error_cap, + error_vec + ); + } + + let nodes_len = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*forest_vec).data.add(11), 8) } + .try_into() + .unwrap(), + ); + + println!("the forest has {nodes_len} nodes"); + + let special_marks = unsafe { std::slice::from_raw_parts((*forest_vec).data.add(8), 3) }; + + if special_marks != &[114, 101, 112] { + return_error!( + format!( + "the forest does not begin with the special mark\nThe first bytes are: \ + {:?}\n", + special_marks + ), + error_len, + error_cap, + error_vec + ); + } + + let labels_offset = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*forest_vec).data.add(19), 8) } + .try_into() + .unwrap(), + ); + + println!("labels_offset = {labels_offset}"); + + if forest_len < labels_offset + 34 * nodes_len || forest_len < (27 + 16 * nodes_len) { + return_error!( + format!( + "the forest length is too small: {forest_len}\n\ + labels offset + 34 * nodes_len = {}, all {nodes_len} \ + nodes take {}\n", + labels_offset + 34 * nodes_len, + 27 + 16 * nodes_len + ), + error_len, + error_cap, + error_vec + ); + } + + let mut total_degree = 0usize; + + let preamble = "digraph forest { + fontname=\"Helvetica,Arial,sans-serif\" + node [fontname=\"Helvetica,Arial,sans-serif\", ordering=out] + edge [fontname=\"Helvetica,Arial,sans-serif\"] + rankdir=LR;\n"; + + let mut post = String::new(); + + for node in 0..nodes_len { + let degree = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * node), 8) } + .try_into() + .unwrap(), + ); + + total_degree += degree; + + post.push_str(&format!( + " {node} [label = \"{node}:{}\"]\n", + read_label(unsafe { (*forest_vec).data.add(labels_offset + 34 * node) }) + )); + } + + println!("total degree = {total_degree}"); + + let correct_len: usize = 27 + 50 * nodes_len + 8 * total_degree; + + println!("correct length = {correct_len}"); + + if forest_len != correct_len { + return_error!( + format!("the forest length {forest_len} should be equal to: {correct_len}\n"), + error_len, + error_cap, + error_vec + ); + } + + for source in 0..nodes_len { + let degree = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source), 8) } + .try_into() + .unwrap(), + ); + + let node_offset = usize::from_be_bytes( + unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source + 8), 8) } + .try_into() + .unwrap(), + ); + + if forest_len <= node_offset + 8 * degree { + return_error!( + format!( + "the forest length {forest_len} is <= {node_offset} + 8 * {degree} = {}\n", + node_offset + 8 * degree + ), + error_len, + error_cap, + error_vec + ); + } + + for i in 0..degree { + let target = usize::from_be_bytes( + unsafe { + std::slice::from_raw_parts((*forest_vec).data.add(node_offset + 8 * i), 8) + } + .try_into() + .unwrap(), + ); + + post.push_str(&format!(" {source} -> {target}\n")); + } + } + + post.push_str("}\n"); + + let result = format!("{preamble}{post}"); + + let parsed_filename; + + match unsafe { std::ffi::CStr::from_ptr(filename).to_str() } { + Ok(ccstr) => { + parsed_filename = ccstr; + } + Err(e) => { + return_error!(format!("error: {e}"), error_len, error_cap, error_vec); + } + } + + if std::fs::metadata(parsed_filename).is_ok() { + let _ = std::fs::remove_file(parsed_filename); + } + + let file = std::fs::File::options() + .write(true) + .create(true) + .open(parsed_filename); + + use std::io::Write; + + match file { + Ok(mut file) => { + if let Err(e) = file.write_all(result.as_bytes()) { + return_error!(format!("error: {e}"), error_len, error_cap, error_vec); + } + } + Err(e) => { + return_error!(format!("error: {e}"), error_len, error_cap, error_vec); + } + } +} + // TODO: Write a function to print the node label of a forest and // expose it to C ABI. // diff --git a/src/print_forest.c b/src/print_forest.c new file mode 100644 index 0000000..58832c9 --- /dev/null +++ b/src/print_forest.c @@ -0,0 +1,21 @@ +#include +#include "helper.h" +#include "big_endian.h" + +int +main(int argc, char **argv) +{ + if (argc < 2) { + fprintf(stderr, "One argument is required, which is the file name of the forest.\n"); + + return 1; + } + + if (argc < 3) { + fprintf(stderr, "It is required to supply the ouput file name.\n"); + + return 1; + } + + print_forest_file(*(argv+1), *(argv+2)); +} diff --git a/src/test.c b/src/test.c index 249eb87..f9ab43b 100644 --- a/src/test.c +++ b/src/test.c @@ -25,13 +25,13 @@ main(int argc, char **argv) "\n" "item =/ header [ note ] *1( price )\n" "\n" -"header = *1star \"SP\" title %xA *( \"SP\" / %xA )\n" +"header = star \"SP\" title %xA *( \"SP\" / %xA )\n" "\n" "title = 1*\"TEXT\"\n" "\n" "star = %x2A\n" "\n" -"note = \"note:\" note-content %xA *( \"SP\" / %xA )\n" +"note = \"note:\" \"SP\" note-content %xA *( \"SP\" / %xA )\n" "\n" "note-content = 1*\"TEXT\"\n" "\n" @@ -58,7 +58,31 @@ main(int argc, char **argv) return 1; } - unsigned char *input = malloc (sizeof(unsigned char) * 10 * 8); + /* int input_unenc[] = { 3, 0, 2, 1, 5, 0, 6, 1 }; */ + + /* int input_unenc[] = { 3, 0, 2, 2, 2, 2, 1, 5, 0, + * 6, 6, 6, 1, 3, 0, 2, 2, 2, + * 2, 1, 5, 0, 6, 6, 6, 1, 3, + * 0, 2, 2, 2, 2, 1, 5, 0, 6, + * 6, 6, 1, 4, 2, 2, 2, 2, 2, + * 2, 2, 2, 2, 2, 2, 2, 2, 2, + * 2, 2, 1 }; */ + + int input_unenc[] = { 3, 0, 2, 1, + 5, 0, 6, 1, + 3, 0, 2, 1, + 5, 0, 6, 1, + 3, 0, 2, 1, + 5, 0, 6, 1, + 4, 0, 2, 1 }; + + int input_unenc_len = (int) (sizeof (input_unenc) / + sizeof(input_unenc[0])); + + printf("input length = %d\n", input_unenc_len); + + unsigned char *input = malloc (sizeof(unsigned char) * + input_unenc_len * 8); if (input == NULL) { clean_parser(parser); @@ -67,14 +91,14 @@ main(int argc, char **argv) return EXIT_FAILURE; } - - int input_unenc[10] = { 3, 0, 2, 2, 2, 1, 5, 0, 6, 1 }; - - for (int i = 0; i < 10; i++) + + for (int i = 0; i < input_unenc_len; i++) to_big_endian(input_unenc[i], input + (8 * i)); unsigned char input_len[8] = { 0 }; - input_len[7] = 8*10; + /* input_len[7] = 8*10; */ + + to_big_endian((uint64_t) (8 * input_unenc_len), input_len); struct UnsignedVec input_vec = (struct UnsignedVec) { input_len, NULL, input }; diff --git a/src/test.el b/src/test.el index d106a03..a6f6e7a 100644 --- a/src/test.el +++ b/src/test.el @@ -26,3 +26,112 @@ ;; (rep-parse-string ;; test-parser ;; "183t3ru") + +;; Below is an experimental tokenizer implemented in Emacs Lisp. + +;; The tokens for the test grammar are as follows: +;; +;; 0. SP +;; 1. %xA +;; 2. TEXT +;; 3. %x2A +;; 4. note: +;; 5. price: +;; 6. DIGIT + +(defun example-tokenize (str) + "Tokenize the string STR. +The function returns a pair of vectors, one of positive integers +and the other of corresponding spans in the original input. The +span is represented as a cons-cell, whose `car' is the starting +position in the input, and whose `cdr' is the length of the span. + +The tokens are as follows: + +0. SP +1. %xA +2. TEXT +3. %x2A +4. note: +5. price: +6. DIGIT" + (let ((i 0) (len (length str)) result result-spans) + (while (< i len) + (let ((element (aref str i))) + (cond + ((= element 32) + (setq result (cons 0 result)) + (setq result-spans (cons (cons i 1) result-spans)) + (setq i (1+ i))) + ((= element #xa) + (setq result (cons 1 result)) + (setq result-spans (cons (cons i 1) result-spans)) + (setq i (1+ i))) + ((and (= element #x2a) + (or (= i 0) (= (aref str (1- i)) #xa))) + (setq result (cons 3 result)) + (setq result-spans (cons (cons i 1) result-spans)) + (setq i (1+ i))) + ((and (<= ?0 element) (<= element ?9)) + (let ((j i) temp stop) + (while (and (not stop) (< j len)) + (setq temp (aref str j)) + (cond + ((and (<= ?0 temp) (<= temp ?9)) + (setq j (min (1+ j) len))) + ((setq stop t)))) + (setq result (cons 6 result)) + (setq result-spans + (cons (cons i (- j i)) result-spans)) + (setq i j))) + ((and (= element ?n) + (< (+ i 4) len) + (= (aref str (+ i 1)) ?o) + (= (aref str (+ i 2)) ?t) + (= (aref str (+ i 3)) ?e) + (= (aref str (+ i 4)) ?:)) + (setq result (cons 4 result)) + (setq result-spans (cons (cons i 5) result-spans)) + (setq i (+ i 5))) + ((and (= element ?p) + (< (+ i 5) len) + (= (aref str (+ i 1)) ?r) + (= (aref str (+ i 2)) ?i) + (= (aref str (+ i 3)) ?c) + (= (aref str (+ i 4)) ?e) + (= (aref str (+ i 5)) ?:)) + (setq result (cons 5 result)) + (setq result-spans (cons (cons i 6) result-spans)) + (setq i (+ i 6))) + ((setq result-spans (cons (cons i 1) result-spans)) + (let ((j i) stop temp) + (while (and (not stop) (< j len)) + (setq temp (aref str j)) + (cond + ((= temp #xa) (setq stop t)) + ((setq j (min (1+ j) len))))) + (setq result (cons 2 result)) + (setq result-spans (cons (cons i (- j i)) result-spans)) + (setq i j)))))) + (cons (apply #'vector (nreverse result)) + (apply #'vector (nreverse result-spans))))) + +(defvar test-document (expand-file-name + "test.document" + (expand-file-name + "test-data" + rep-dir)) + "A document for testing purposes.") + +(defvar input-spans nil + "A vector that represents spans of tokens in the input.") + +(let ((result (example-tokenize (with-temp-buffer + (insert-file-contents test-document) + (buffer-substring-no-properties + (point-min) (point-max)))))) + (setq input (car result)) + (setq input-spans (cdr result))) + +(rep-recognize test-parser input) +(rep-parse test-parser input) -- cgit v1.2.3-18-g5258