Diffstat (limited to 'src')
-rw-r--r--  src/Makefile.am     |   9
-rw-r--r--  src/helper.c        |  73
-rw-r--r--  src/helper.h        |   6
-rw-r--r--  src/lib.rs          | 273
-rw-r--r--  src/print_forest.c  |  21
-rw-r--r--  src/test.c          |  40
-rw-r--r--  src/test.el         | 109
7 files changed, 516 insertions(+), 15 deletions(-)
diff --git a/src/Makefile.am b/src/Makefile.am
index 94399cb..a56c453 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -18,8 +18,11 @@ big_endian.o: big_endian.c big_endian.h
helper.o: helper.c helper.h big_endian.o
$(CC) $(CFLAGS) -c helper.c -o $@
-test: test.c ../target/debug/librep.dylib helper.o big_endian.o
- $(CC) $(CFLAGS) $+ -o $@
+test: test.c rs ../target/debug/librep.dylib helper.o big_endian.o
+ $(CC) $(CFLAGS) test.c ../target/debug/librep.dylib helper.o big_endian.o -o $@
+
+print_forest: print_forest.c rs helper.o big_endian.o ../target/debug/librep.dylib
+ $(CC) $(CFLAGS) print_forest.c ../target/debug/librep.dylib helper.o big_endian.o -o $@
.PHONY: clean rs windows
@@ -27,6 +30,8 @@ clean:
-rm -f *.o
-rm -f rep.so
-rm -f test
+ -rm -f print_forest
+ -rm -rf *.dSYM
cargo clean
windows:
diff --git a/src/helper.c b/src/helper.c
index d52fa5f..3b02c81 100644
--- a/src/helper.c
+++ b/src/helper.c
@@ -1,3 +1,4 @@
+#include <stdio.h>
#include "helper.h"
#include "big_endian.h"
@@ -82,3 +83,75 @@ print_node(struct CForest *forest, uint64_t node)
print_forest_node(forest, node_ptr);
}
+
+void print_forest_file(char *filename, char *output_filename)
+{
+ unsigned char error_vec_len[8] = { 0 };
+ unsigned char error_vec_cap[8] = { 0 };
+
+ struct SignedVec error_vec = { 0 };
+
+ error_vec.len = error_vec_len;
+ error_vec.capacity = error_vec_cap;
+
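+ /* The len and capacity fields point to 8-byte buffers; on failure the
+ Rust side writes the error string's length and capacity there in big
+ endian and sets data to the message itself. */
+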
+ /* Read the file into a byte buffer, pack it inside a struct
+ UnsignedVec, and pass that to print_forest. */
+
+ FILE *file = fopen(filename, "r");
+
+ if (file == NULL) {
+ fprintf(stderr, "Cannot open file %s\n", filename);
+
+ return;
+ }
+
+ fseek(file, 0, SEEK_END);
+
+ uint64_t file_size = ftell(file);
+
+ fseek(file, 0, SEEK_SET);
+
+ unsigned char *file_buffer = malloc(sizeof(unsigned char) * file_size);
+
+ if (file_buffer == NULL) {
+ fprintf(stderr, "%s:%d: Cannot allocate %llu memory\n",
+ __FILE__, __LINE__,
+ file_size);
+
+ return;
+ }
+
+ fread(file_buffer, 1, file_size, file);
+
+ fclose(file);
+
+ printf("file size = %llu\n", file_size);
+
+ unsigned char forest_len[8] = { 0 };
+
+ struct UnsignedVec forest = { 0 };
+
+ forest.len = forest_len;
+
+ to_big_endian(file_size, forest.len);
+
+ forest.data = file_buffer;
+
+ print_forest(&forest, &error_vec, output_filename);
+
+ uint64_t error_len = from_big_endian(error_vec.len);
+
+ if (error_len) {
+ fprintf(stderr, "error: ");
+
+ for (uint64_t i = 0; i < error_len; i++) {
+ fprintf(stderr, "%c", *(error_vec.data+i));
+ }
+
+ fprintf(stderr, "\n");
+
+ clean_signed(&error_vec, 4);
+ }
+
+ free(file_buffer);
+}
diff --git a/src/helper.h b/src/helper.h
index 37cd7fd..2f4933a 100644
--- a/src/helper.h
+++ b/src/helper.h
@@ -84,4 +84,10 @@ void print_forest_node(struct CForest *forest, unsigned char *node);
void print_node(struct CForest *forest, uint64_t node);
+void print_forest(struct UnsignedVec *forest_vec,
+ struct SignedVec *error_vec,
+ char *filename);
+
+void print_forest_file(char *filename, char *output_filename);
+
#endif
diff --git a/src/lib.rs b/src/lib.rs
index 7cc5223..aed8536 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,8 +9,8 @@ use chain::{atom::DefaultAtom, default::DefaultChain, Chain};
use grammar::Grammar;
// For printing forests
-use chain::item::{default::DefaultForest, ForestLabel};
-use grammar::GrammarLabel;
+use chain::item::{default::DefaultForest, ForestLabel, ForestLabelType};
+use grammar::{GrammarLabel, TNT};
use graph::LabelGraph;
/// This struct is the representation of a parser.
@@ -483,9 +483,6 @@ extern "C" fn parser_parse(
match forest {
Ok(forest) => {
- use graph::Graph;
- forest.print_viz("test forest.gv").unwrap();
-
Box::leak(parser_box);
let mut bytes = bytes::forest_to_bytes(&forest);
@@ -556,6 +553,272 @@ extern "C" fn parser_parse(
}
}
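+
+// A serialized label occupies 34 bytes, read below as: one status byte
+// (0 = plain, 1 = packed, anything else = cloned), an 8-byte big-endian
+// clone index, 8-byte start and end positions, one discriminant byte
+// selecting how the content is interpreted (terminal, non-terminal, or a
+// plain index), and an 8-byte content value.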
+fn read_label(label: *mut std::os::raw::c_uchar) -> ForestLabel<GrammarLabel> {
+ let status: u8 = unsafe { *label };
+
+ let label_status: ForestLabelType;
+
+ match status {
+ 0 => {
+ label_status = ForestLabelType::Plain;
+ }
+ 1 => {
+ label_status = ForestLabelType::Packed;
+ }
+ _ => {
+ label_status = ForestLabelType::Cloned(usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts(label.add(1), 8) }
+ .try_into()
+ .unwrap(),
+ ));
+ }
+ }
+
+ let start = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts(label.add(9), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ let end = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts(label.add(17), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ let discriminant: u8 = unsafe { *label.add(25) };
+
+ let content = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts(label.add(26), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ let inner_label: GrammarLabel;
+
+ match discriminant {
+ 0 => {
+ inner_label = GrammarLabel::new_closed(TNT::Ter(content), start, end);
+ }
+ 1 => {
+ inner_label = GrammarLabel::new_closed(TNT::Non(content), start, end);
+ }
+ _ => {
+ inner_label = GrammarLabel::new_closed(content, start, end);
+ }
+ }
+
+ ForestLabel::new(inner_label, label_status)
+}
+
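+// Writes the error string's length and capacity (big-endian) into the
+// caller-supplied 8-byte buffers, hands its heap buffer to the C side
+// through the data pointer, and returns from the enclosing function.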
+macro_rules! return_error {
+ ($err:expr, $elen:ident, $ecap:ident, $evec:ident) => {
+ let mut e_string = $err;
+
+ let e_string_len_slice = e_string.len().to_be_bytes();
+ let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+ unsafe {
+ for i in 0..8 {
+ *($elen.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+ *($ecap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+ }
+
+ (*$evec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+ }
+
+ std::mem::forget(e_string);
+
+ return;
+ };
+}
+
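+// Serialized forest layout, as decoded below: bytes 8..11 hold the ASCII
+// marker "rep", bytes 11..19 the node count, and bytes 19..27 the offset
+// of the label section.  From byte 27 each node has a 16-byte record (its
+// degree followed by the offset of its children list), every label takes
+// 34 bytes, and every edge an 8-byte target, so the total length must be
+// 27 + 50 * nodes + 8 * edges.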
+#[no_mangle]
+extern "C" fn print_forest(
+ forest_vec: *mut LenVec<std::os::raw::c_uchar>,
+ error_vec: *mut LenVec<std::os::raw::c_char>,
+ filename: *mut std::os::raw::c_char,
+) {
+ let forest_len = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts((*forest_vec).len, 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ let error_len = unsafe { (*error_vec).len };
+ let error_cap = unsafe { (*error_vec).capacity };
+
+ if forest_len < 27 {
+ return_error!(
+ format!("forest bytes length {forest_len} < 27"),
+ error_len,
+ error_cap,
+ error_vec
+ );
+ }
+
+ let nodes_len = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts((*forest_vec).data.add(11), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ println!("the forest has {nodes_len} nodes");
+
+ let special_marks = unsafe { std::slice::from_raw_parts((*forest_vec).data.add(8), 3) };
+
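+ // 114, 101, 112 are the ASCII codes of 'r', 'e', 'p'.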
+ if special_marks != &[114, 101, 112] {
+ return_error!(
+ format!(
+ "the forest does not begin with the special mark\nThe first bytes are: \
+ {:?}\n",
+ special_marks
+ ),
+ error_len,
+ error_cap,
+ error_vec
+ );
+ }
+
+ let labels_offset = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts((*forest_vec).data.add(19), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ println!("labels_offset = {labels_offset}");
+
+ if forest_len < labels_offset + 34 * nodes_len || forest_len < (27 + 16 * nodes_len) {
+ return_error!(
+ format!(
+ "the forest length is too small: {forest_len}\n\
+ labels offset + 34 * nodes_len = {}, all {nodes_len} \
+ nodes take {}\n",
+ labels_offset + 34 * nodes_len,
+ 27 + 16 * nodes_len
+ ),
+ error_len,
+ error_cap,
+ error_vec
+ );
+ }
+
+ let mut total_degree = 0usize;
+
+ let preamble = "digraph forest {
+ fontname=\"Helvetica,Arial,sans-serif\"
+ node [fontname=\"Helvetica,Arial,sans-serif\", ordering=out]
+ edge [fontname=\"Helvetica,Arial,sans-serif\"]
+ rankdir=LR;\n";
+
+ let mut post = String::new();
+
+ for node in 0..nodes_len {
+ let degree = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * node), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ total_degree += degree;
+
+ post.push_str(&format!(
+ " {node} [label = \"{node}:{}\"]\n",
+ read_label(unsafe { (*forest_vec).data.add(labels_offset + 34 * node) })
+ ));
+ }
+
+ println!("total degree = {total_degree}");
+
+ let correct_len: usize = 27 + 50 * nodes_len + 8 * total_degree;
+
+ println!("correct length = {correct_len}");
+
+ if forest_len != correct_len {
+ return_error!(
+ format!("the forest length {forest_len} should be equal to: {correct_len}\n"),
+ error_len,
+ error_cap,
+ error_vec
+ );
+ }
+
+ for source in 0..nodes_len {
+ let degree = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ let node_offset = usize::from_be_bytes(
+ unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source + 8), 8) }
+ .try_into()
+ .unwrap(),
+ );
+
+ if forest_len <= node_offset + 8 * degree {
+ return_error!(
+ format!(
+ "the forest length {forest_len} is <= {node_offset} + 8 * {degree} = {}\n",
+ node_offset + 8 * degree
+ ),
+ error_len,
+ error_cap,
+ error_vec
+ );
+ }
+
+ for i in 0..degree {
+ let target = usize::from_be_bytes(
+ unsafe {
+ std::slice::from_raw_parts((*forest_vec).data.add(node_offset + 8 * i), 8)
+ }
+ .try_into()
+ .unwrap(),
+ );
+
+ post.push_str(&format!(" {source} -> {target}\n"));
+ }
+ }
+
+ post.push_str("}\n");
+
+ let result = format!("{preamble}{post}");
+
+ let parsed_filename;
+
+ match unsafe { std::ffi::CStr::from_ptr(filename).to_str() } {
+ Ok(ccstr) => {
+ parsed_filename = ccstr;
+ }
+ Err(e) => {
+ return_error!(format!("error: {e}"), error_len, error_cap, error_vec);
+ }
+ }
+
+ if std::fs::metadata(parsed_filename).is_ok() {
+ let _ = std::fs::remove_file(parsed_filename);
+ }
+
+ let file = std::fs::File::options()
+ .write(true)
+ .create(true)
+ .open(parsed_filename);
+
+ use std::io::Write;
+
+ match file {
+ Ok(mut file) => {
+ if let Err(e) = file.write_all(result.as_bytes()) {
+ return_error!(format!("error: {e}"), error_len, error_cap, error_vec);
+ }
+ }
+ Err(e) => {
+ return_error!(format!("error: {e}"), error_len, error_cap, error_vec);
+ }
+ }
+}
+
// TODO: Write a function to print the node label of a forest and
// expose it to C ABI.
//
diff --git a/src/print_forest.c b/src/print_forest.c
new file mode 100644
index 0000000..58832c9
--- /dev/null
+++ b/src/print_forest.c
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include "helper.h"
+#include "big_endian.h"
+
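+/* Usage: print_forest FOREST_FILE OUTPUT_FILE
+
+ Reads the forest bytes from FOREST_FILE and writes a Graphviz
+ description of the forest to OUTPUT_FILE. */
+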
+int
+main(int argc, char **argv)
+{
+ if (argc < 2) {
+ fprintf(stderr, "One argument is required, which is the file name of the forest.\n");
+
+ return 1;
+ }
+
+ if (argc < 3) {
+ fprintf(stderr, "It is required to supply the ouput file name.\n");
+
+ return 1;
+ }
+
+ print_forest_file(*(argv+1), *(argv+2));
+}
diff --git a/src/test.c b/src/test.c
index 249eb87..f9ab43b 100644
--- a/src/test.c
+++ b/src/test.c
@@ -25,13 +25,13 @@ main(int argc, char **argv)
"\n"
"item =/ header [ note ] *1( price )\n"
"\n"
-"header = *1star \"SP\" title %xA *( \"SP\" / %xA )\n"
+"header = star \"SP\" title %xA *( \"SP\" / %xA )\n"
"\n"
"title = 1*\"TEXT\"\n"
"\n"
"star = %x2A\n"
"\n"
-"note = \"note:\" note-content %xA *( \"SP\" / %xA )\n"
+"note = \"note:\" \"SP\" note-content %xA *( \"SP\" / %xA )\n"
"\n"
"note-content = 1*\"TEXT\"\n"
"\n"
@@ -58,7 +58,31 @@ main(int argc, char **argv)
return 1;
}
- unsigned char *input = malloc (sizeof(unsigned char) * 10 * 8);
+ /* int input_unenc[] = { 3, 0, 2, 1, 5, 0, 6, 1 }; */
+
+ /* int input_unenc[] = { 3, 0, 2, 2, 2, 2, 1, 5, 0,
+ * 6, 6, 6, 1, 3, 0, 2, 2, 2,
+ * 2, 1, 5, 0, 6, 6, 6, 1, 3,
+ * 0, 2, 2, 2, 2, 1, 5, 0, 6,
+ * 6, 6, 1, 4, 2, 2, 2, 2, 2,
+ * 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ * 2, 2, 1 }; */
+
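+ /* Token codes follow the tokenizer in test.el:
+ 0 = SP, 1 = %xA, 2 = TEXT, 3 = %x2A,
+ 4 = "note:", 5 = "price:", 6 = DIGIT. */
+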
+ int input_unenc[] = { 3, 0, 2, 1,
+ 5, 0, 6, 1,
+ 3, 0, 2, 1,
+ 5, 0, 6, 1,
+ 3, 0, 2, 1,
+ 5, 0, 6, 1,
+ 4, 0, 2, 1 };
+
+ int input_unenc_len = (int) (sizeof (input_unenc) /
+ sizeof(input_unenc[0]));
+
+ printf("input length = %d\n", input_unenc_len);
+
+ unsigned char *input = malloc (sizeof(unsigned char) *
+ input_unenc_len * 8);
if (input == NULL) {
clean_parser(parser);
@@ -67,14 +91,14 @@ main(int argc, char **argv)
return EXIT_FAILURE;
}
-
- int input_unenc[10] = { 3, 0, 2, 2, 2, 1, 5, 0, 6, 1 };
-
- for (int i = 0; i < 10; i++)
+
+ for (int i = 0; i < input_unenc_len; i++)
to_big_endian(input_unenc[i], input + (8 * i));
unsigned char input_len[8] = { 0 };
- input_len[7] = 8*10;
+ /* input_len[7] = 8*10; */
+
+ to_big_endian((uint64_t) (8 * input_unenc_len), input_len);
struct UnsignedVec input_vec =
(struct UnsignedVec) { input_len, NULL, input };
diff --git a/src/test.el b/src/test.el
index d106a03..a6f6e7a 100644
--- a/src/test.el
+++ b/src/test.el
@@ -26,3 +26,112 @@
;; (rep-parse-string
;; test-parser
;; "183t3ru")
+
+;; Below is an experimental tokenizer implemented in Emacs Lisp.
+
+;; The tokens for the test grammar are as follows:
+;;
+;; 0. SP
+;; 1. %xA
+;; 2. TEXT
+;; 3. %x2A
+;; 4. note:
+;; 5. price:
+;; 6. DIGIT
+
+(defun example-tokenize (str)
+ "Tokenize the string STR.
+The function returns a pair of vectors, one of nonnegative integers
+and the other of corresponding spans in the original input. The
+span is represented as a cons-cell, whose `car' is the starting
+position in the input, and whose `cdr' is the length of the span.
+
+The tokens are as follows:
+
+0. SP
+1. %xA
+2. TEXT
+3. %x2A
+4. note:
+5. price:
+6. DIGIT"
+ (let ((i 0) (len (length str)) result result-spans)
+ (while (< i len)
+ (let ((element (aref str i)))
+ (cond
+ ((= element 32)
+ (setq result (cons 0 result))
+ (setq result-spans (cons (cons i 1) result-spans))
+ (setq i (1+ i)))
+ ((= element #xa)
+ (setq result (cons 1 result))
+ (setq result-spans (cons (cons i 1) result-spans))
+ (setq i (1+ i)))
+ ((and (= element #x2a)
+ (or (= i 0) (= (aref str (1- i)) #xa)))
+ (setq result (cons 3 result))
+ (setq result-spans (cons (cons i 1) result-spans))
+ (setq i (1+ i)))
+ ((and (<= ?0 element) (<= element ?9))
+ (let ((j i) temp stop)
+ (while (and (not stop) (< j len))
+ (setq temp (aref str j))
+ (cond
+ ((and (<= ?0 temp) (<= temp ?9))
+ (setq j (min (1+ j) len)))
+ ((setq stop t))))
+ (setq result (cons 6 result))
+ (setq result-spans
+ (cons (cons i (- j i)) result-spans))
+ (setq i j)))
+ ((and (= element ?n)
+ (< (+ i 4) len)
+ (= (aref str (+ i 1)) ?o)
+ (= (aref str (+ i 2)) ?t)
+ (= (aref str (+ i 3)) ?e)
+ (= (aref str (+ i 4)) ?:))
+ (setq result (cons 4 result))
+ (setq result-spans (cons (cons i 5) result-spans))
+ (setq i (+ i 5)))
+ ((and (= element ?p)
+ (< (+ i 5) len)
+ (= (aref str (+ i 1)) ?r)
+ (= (aref str (+ i 2)) ?i)
+ (= (aref str (+ i 3)) ?c)
+ (= (aref str (+ i 4)) ?e)
+ (= (aref str (+ i 5)) ?:))
+ (setq result (cons 5 result))
+ (setq result-spans (cons (cons i 6) result-spans))
+ (setq i (+ i 6)))
+ (t
+ (let ((j i) stop temp)
+ (while (and (not stop) (< j len))
+ (setq temp (aref str j))
+ (cond
+ ((= temp #xa) (setq stop t))
+ ((setq j (min (1+ j) len)))))
+ (setq result (cons 2 result))
+ (setq result-spans (cons (cons i (- j i)) result-spans))
+ (setq i j))))))
+ (cons (apply #'vector (nreverse result))
+ (apply #'vector (nreverse result-spans)))))
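+
+;; As a small sanity check (not run automatically), with the definition
+;; above (example-tokenize "* hello\n") should return the token vector
+;; [3 0 2 1] paired with the spans [(0 . 1) (1 . 1) (2 . 5) (7 . 1)].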
+
+(defvar test-document (expand-file-name
+ "test.document"
+ (expand-file-name
+ "test-data"
+ rep-dir))
+ "A document for testing purposes.")
+
+(defvar input-spans nil
+ "A vector that represents spans of tokens in the input.")
+
+(let ((result (example-tokenize (with-temp-buffer
+ (insert-file-contents test-document)
+ (buffer-substring-no-properties
+ (point-min) (point-max))))))
+ (setq input (car result))
+ (setq input-spans (cdr result)))
+
+(rep-recognize test-parser input)
+(rep-parse test-parser input)