From 7033187abaf42772097377c0a1ffc2cd4cefdada Mon Sep 17 00:00:00 2001
From: JSDurand <mmemmew@gmail.com>
Date: Fri, 4 Aug 2023 10:12:04 +0800
Subject: minor adjustments

Not bug deals but adjustments of details.
---
 src/Makefile.am    |   9 +-
 src/helper.c       |  73 ++++++++++++++
 src/helper.h       |   6 ++
 src/lib.rs         | 273 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 src/print_forest.c |  21 +++++
 src/test.c         |  40 ++++++--
 src/test.el        | 109 +++++++++++++++++++++
 7 files changed, 516 insertions(+), 15 deletions(-)
 create mode 100644 src/print_forest.c

(limited to 'src')

diff --git a/src/Makefile.am b/src/Makefile.am
index 94399cb..a56c453 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -18,8 +18,11 @@ big_endian.o: big_endian.c big_endian.h
 helper.o: helper.c helper.h big_endian.o
 	$(CC) $(CFLAGS) -c helper.c -o $@
 
-test: test.c ../target/debug/librep.dylib helper.o big_endian.o
-	$(CC) $(CFLAGS) $+ -o $@
+test: test.c rs ../target/debug/librep.dylib helper.o big_endian.o
+	$(CC) $(CFLAGS) test.c ../target/debug/librep.dylib helper.o big_endian.o -o $@
+
+print_forest: print_forest.c rs helper.o big_endian.o ../target/debug/librep.dylib
+	$(CC) $(CFLAGS) print_forest.c ../target/debug/librep.dylib helper.o big_endian.o -o $@
 
 .PHONY: clean rs windows
 
@@ -27,6 +30,8 @@ clean:
 	-rm -f *.o
 	-rm -f rep.so
 	-rm -f test
+	-rm -f print_forest
+	-rm -rf *.dSYM
 	cargo clean
 
 windows:
diff --git a/src/helper.c b/src/helper.c
index d52fa5f..3b02c81 100644
--- a/src/helper.c
+++ b/src/helper.c
@@ -1,3 +1,4 @@
+#include <stdio.h>
 #include "helper.h"
 #include "big_endian.h"
 
@@ -82,3 +83,75 @@ print_node(struct CForest *forest, uint64_t node)
 
   print_forest_node(forest, node_ptr);
 }
+
+void print_forest_file(char *filename, char *output_filename)
+{
+  unsigned char error_vec_len[8] = { 0 };
+  unsigned char error_vec_cap[8] = { 0 };
+
+  struct SignedVec error_vec = { 0 };
+
+  error_vec.len = error_vec_len;
+  error_vec.capacity = error_vec_cap;
+
+  /* Now read the file into bytes and pack inside a struct UnsignedVec
+     and pass to the function. */
+
+  FILE *file = fopen(filename, "r");
+
+  if (file == NULL) {
+    fprintf(stderr, "Cannot open file %s\n", filename);
+
+    return;
+  }
+
+  fseek(file, 0, SEEK_END);
+
+  uint64_t file_size = ftell(file);
+
+  fseek(file, 0, SEEK_SET);
+
+  unsigned char *file_buffer = malloc(sizeof(unsigned char) * file_size);
+
+  if (file_buffer == NULL) {
+    fprintf(stderr, "%s:%d: Cannot allocate %llu memory\n",
+            __FILE__, __LINE__,
+            file_size);
+
+    return;
+  }
+
+  fread(file_buffer, 1, file_size, file);
+
+  fclose(file);
+
+  printf("file size = %llu\n", file_size);
+
+  unsigned char forest_len[8] = { 0 };
+
+  struct UnsignedVec forest = { 0 };
+
+  forest.len = forest_len;
+
+  to_big_endian(file_size, forest.len);
+
+  forest.data = file_buffer;
+
+  print_forest(&forest, &error_vec, output_filename);
+
+  uint64_t error_len = from_big_endian(error_vec.len);
+
+  if (error_len) {
+    fprintf(stderr, "error: ");
+
+    for (uint64_t i = 0; i < error_len; i++) {
+      fprintf(stderr, "%c", *(error_vec.data+i));
+    }
+
+    fprintf(stderr, "\n");
+
+    clean_signed(&error_vec, 4);
+  }
+
+  free(file_buffer);
+}
diff --git a/src/helper.h b/src/helper.h
index 37cd7fd..2f4933a 100644
--- a/src/helper.h
+++ b/src/helper.h
@@ -84,4 +84,10 @@ void print_forest_node(struct CForest *forest, unsigned char *node);
 
 void print_node(struct CForest *forest, uint64_t node);
 
+void print_forest(struct UnsignedVec *forest_vec,
+                  struct SignedVec *error_vec,
+                  char *filename);
+
+void print_forest_file(char *filename, char *output_filename);
+
 #endif
diff --git a/src/lib.rs b/src/lib.rs
index 7cc5223..aed8536 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,8 +9,8 @@ use chain::{atom::DefaultAtom, default::DefaultChain, Chain};
 use grammar::Grammar;
 
 // For printing forests
-use chain::item::{default::DefaultForest, ForestLabel};
-use grammar::GrammarLabel;
+use chain::item::{default::DefaultForest, ForestLabel, ForestLabelType};
+use grammar::{GrammarLabel, TNT};
 use graph::LabelGraph;
 
 /// This struct is the representation of a parser.
@@ -483,9 +483,6 @@ extern "C" fn parser_parse(
 
                 match forest {
                     Ok(forest) => {
-                        use graph::Graph;
-                        forest.print_viz("test forest.gv").unwrap();
-
                         Box::leak(parser_box);
 
                         let mut bytes = bytes::forest_to_bytes(&forest);
@@ -556,6 +553,272 @@ extern "C" fn parser_parse(
     }
 }
 
+fn read_label(label: *mut std::os::raw::c_uchar) -> ForestLabel<GrammarLabel> {
+    let status: u8 = unsafe { *label };
+
+    let label_status: ForestLabelType;
+
+    match status {
+        0 => {
+            label_status = ForestLabelType::Plain;
+        }
+        1 => {
+            label_status = ForestLabelType::Packed;
+        }
+        _ => {
+            label_status = ForestLabelType::Cloned(usize::from_be_bytes(
+                unsafe { std::slice::from_raw_parts(label.add(1), 8) }
+                    .try_into()
+                    .unwrap(),
+            ));
+        }
+    }
+
+    let start = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts(label.add(9), 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    let end = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts(label.add(17), 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    let discriminant: u8 = unsafe { *label.add(25) };
+
+    let content = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts(label.add(26), 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    let inner_label: GrammarLabel;
+
+    match discriminant {
+        0 => {
+            inner_label = GrammarLabel::new_closed(TNT::Ter(content), start, end);
+        }
+        1 => {
+            inner_label = GrammarLabel::new_closed(TNT::Non(content), start, end);
+        }
+        _ => {
+            inner_label = GrammarLabel::new_closed(content, start, end);
+        }
+    }
+
+    ForestLabel::new(inner_label, label_status)
+}
+
+macro_rules! return_error {
+    ($err:expr, $elen:ident, $ecap:ident, $evec:ident) => {
+        let mut e_string = $err;
+
+        let e_string_len_slice = e_string.len().to_be_bytes();
+        let e_string_cap_slice = e_string.capacity().to_be_bytes();
+
+        unsafe {
+            for i in 0..8 {
+                *($elen.add(i)) = e_string_len_slice.get(i).copied().unwrap();
+                *($ecap.add(i)) = e_string_cap_slice.get(i).copied().unwrap();
+            }
+
+            (*$evec).data = e_string.as_mut_ptr() as *mut std::os::raw::c_char;
+        }
+
+        std::mem::forget(e_string);
+
+        return;
+    };
+}
+
+#[no_mangle]
+extern "C" fn print_forest(
+    forest_vec: *mut LenVec<std::os::raw::c_uchar>,
+    error_vec: *mut LenVec<std::os::raw::c_char>,
+    filename: *mut std::os::raw::c_char,
+) {
+    let forest_len = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*forest_vec).len, 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    let error_len = unsafe { (*error_vec).len };
+    let error_cap = unsafe { (*error_vec).capacity };
+
+    if forest_len < 27 {
+        return_error!(
+            format!("forest bytes length {forest_len} < 27"),
+            error_len,
+            error_cap,
+            error_vec
+        );
+    }
+
+    let nodes_len = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*forest_vec).data.add(11), 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    println!("the forest has {nodes_len} nodes");
+
+    let special_marks = unsafe { std::slice::from_raw_parts((*forest_vec).data.add(8), 3) };
+
+    if special_marks != &[114, 101, 112] {
+        return_error!(
+            format!(
+                "the forest does not begin with the special mark\nThe first bytes are: \
+                     {:?}\n",
+                special_marks
+            ),
+            error_len,
+            error_cap,
+            error_vec
+        );
+    }
+
+    let labels_offset = usize::from_be_bytes(
+        unsafe { std::slice::from_raw_parts((*forest_vec).data.add(19), 8) }
+            .try_into()
+            .unwrap(),
+    );
+
+    println!("labels_offset = {labels_offset}");
+
+    if forest_len < labels_offset + 34 * nodes_len || forest_len < (27 + 16 * nodes_len) {
+        return_error!(
+            format!(
+                "the forest length is too small: {forest_len}\n\
+                               labels offset + 34 * nodes_len = {}, all {nodes_len} \
+                               nodes take {}\n",
+                labels_offset + 34 * nodes_len,
+                27 + 16 * nodes_len
+            ),
+            error_len,
+            error_cap,
+            error_vec
+        );
+    }
+
+    let mut total_degree = 0usize;
+
+    let preamble = "digraph forest {
+    fontname=\"Helvetica,Arial,sans-serif\"
+    node [fontname=\"Helvetica,Arial,sans-serif\", ordering=out]
+    edge [fontname=\"Helvetica,Arial,sans-serif\"]
+    rankdir=LR;\n";
+
+    let mut post = String::new();
+
+    for node in 0..nodes_len {
+        let degree = usize::from_be_bytes(
+            unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * node), 8) }
+                .try_into()
+                .unwrap(),
+        );
+
+        total_degree += degree;
+
+        post.push_str(&format!(
+            "    {node} [label = \"{node}:{}\"]\n",
+            read_label(unsafe { (*forest_vec).data.add(labels_offset + 34 * node) })
+        ));
+    }
+
+    println!("total degree = {total_degree}");
+
+    let correct_len: usize = 27 + 50 * nodes_len + 8 * total_degree;
+
+    println!("correct length = {correct_len}");
+
+    if forest_len != correct_len {
+        return_error!(
+            format!("the forest length {forest_len} should be equal to: {correct_len}\n"),
+            error_len,
+            error_cap,
+            error_vec
+        );
+    }
+
+    for source in 0..nodes_len {
+        let degree = usize::from_be_bytes(
+            unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source), 8) }
+                .try_into()
+                .unwrap(),
+        );
+
+        let node_offset = usize::from_be_bytes(
+            unsafe { std::slice::from_raw_parts((*forest_vec).data.add(27 + 16 * source + 8), 8) }
+                .try_into()
+                .unwrap(),
+        );
+
+        if forest_len <= node_offset + 8 * degree {
+            return_error!(
+                format!(
+                    "the forest length {forest_len} is <= {node_offset} + 8 * {degree} = {}\n",
+                    node_offset + 8 * degree
+                ),
+                error_len,
+                error_cap,
+                error_vec
+            );
+        }
+
+        for i in 0..degree {
+            let target = usize::from_be_bytes(
+                unsafe {
+                    std::slice::from_raw_parts((*forest_vec).data.add(node_offset + 8 * i), 8)
+                }
+                .try_into()
+                .unwrap(),
+            );
+
+            post.push_str(&format!("    {source} -> {target}\n"));
+        }
+    }
+
+    post.push_str("}\n");
+
+    let result = format!("{preamble}{post}");
+
+    let parsed_filename;
+
+    match unsafe { std::ffi::CStr::from_ptr(filename).to_str() } {
+        Ok(ccstr) => {
+            parsed_filename = ccstr;
+        }
+        Err(e) => {
+            return_error!(format!("error: {e}"), error_len, error_cap, error_vec);
+        }
+    }
+
+    if std::fs::metadata(parsed_filename).is_ok() {
+        let _ = std::fs::remove_file(parsed_filename);
+    }
+
+    let file = std::fs::File::options()
+        .write(true)
+        .create(true)
+        .open(parsed_filename);
+
+    use std::io::Write;
+
+    match file {
+        Ok(mut file) => {
+            if let Err(e) = file.write_all(result.as_bytes()) {
+                return_error!(format!("error: {e}"), error_len, error_cap, error_vec);
+            }
+        }
+        Err(e) => {
+            return_error!(format!("error: {e}"), error_len, error_cap, error_vec);
+        }
+    }
+}
+
 // TODO: Write a function to print the node label of a forest and
 // expose it to C ABI.
 //
diff --git a/src/print_forest.c b/src/print_forest.c
new file mode 100644
index 0000000..58832c9
--- /dev/null
+++ b/src/print_forest.c
@@ -0,0 +1,21 @@
+#include <stdio.h>
+#include "helper.h"
+#include "big_endian.h"
+
+int
+main(int argc, char **argv)
+{
+  if (argc < 2) {
+    fprintf(stderr, "One argument is required, which is the file name of the forest.\n");
+      
+    return 1;
+  }
+  
+  if (argc < 3) {
+    fprintf(stderr, "It is required to supply the ouput file name.\n");
+      
+    return 1;
+  }
+  
+  print_forest_file(*(argv+1), *(argv+2));
+}
diff --git a/src/test.c b/src/test.c
index 249eb87..f9ab43b 100644
--- a/src/test.c
+++ b/src/test.c
@@ -25,13 +25,13 @@ main(int argc, char **argv)
 "\n"
 "item         =/ header [ note ] *1( price )\n"
 "\n"
-"header       =  *1star \"SP\" title %xA *( \"SP\" / %xA )\n"
+"header       =  star \"SP\" title %xA *( \"SP\" / %xA )\n"
 "\n"
 "title        =  1*\"TEXT\"\n"
 "\n"
 "star         =  %x2A\n"
 "\n"
-"note         =  \"note:\" note-content %xA *( \"SP\" / %xA )\n"
+"note         =  \"note:\" \"SP\" note-content %xA *( \"SP\" / %xA )\n"
 "\n"
 "note-content =  1*\"TEXT\"\n"
 "\n"
@@ -58,7 +58,31 @@ main(int argc, char **argv)
     return 1;
   }
 
-  unsigned char *input = malloc (sizeof(unsigned char) * 10 * 8);
+  /* int input_unenc[] = { 3, 0, 2, 1, 5, 0, 6, 1 }; */
+  
+  /* int input_unenc[] = { 3, 0, 2, 2, 2, 2, 1, 5, 0,
+   *                       6, 6, 6, 1, 3, 0, 2, 2, 2,
+   *                       2, 1, 5, 0, 6, 6, 6, 1, 3,
+   *                       0, 2, 2, 2, 2, 1, 5, 0, 6,
+   *                       6, 6, 1, 4, 2, 2, 2, 2, 2,
+   *                       2, 2, 2, 2, 2, 2, 2, 2, 2,
+   *                       2, 2, 1 }; */
+  
+  int input_unenc[] = { 3, 0, 2, 1,
+                        5, 0, 6, 1,
+                        3, 0, 2, 1,
+                        5, 0, 6, 1,
+                        3, 0, 2, 1,
+                        5, 0, 6, 1,
+                        4, 0, 2, 1 };
+  
+  int input_unenc_len = (int) (sizeof (input_unenc) /
+                               sizeof(input_unenc[0]));
+
+  printf("input length = %d\n", input_unenc_len);
+
+  unsigned char *input = malloc (sizeof(unsigned char) *
+                                 input_unenc_len * 8);
 
   if (input == NULL) {
     clean_parser(parser);
@@ -67,14 +91,14 @@ main(int argc, char **argv)
 
     return EXIT_FAILURE;
   }
-
-  int input_unenc[10] = { 3, 0, 2, 2, 2, 1, 5, 0, 6, 1 };
-
-  for (int i = 0; i < 10; i++)
+  
+  for (int i = 0; i < input_unenc_len; i++)
     to_big_endian(input_unenc[i], input + (8 * i));
 
   unsigned char input_len[8] = { 0 };
-  input_len[7] = 8*10;
+  /* input_len[7] = 8*10; */
+
+  to_big_endian((uint64_t) (8 * input_unenc_len), input_len);
 
   struct UnsignedVec input_vec =
     (struct UnsignedVec) { input_len, NULL, input };
diff --git a/src/test.el b/src/test.el
index d106a03..a6f6e7a 100644
--- a/src/test.el
+++ b/src/test.el
@@ -26,3 +26,112 @@
 ;; (rep-parse-string
 ;;  test-parser
 ;;  "183t3ru")
+
+;; Below is an experimental tokenizer implemented in Emacs Lisp.
+
+;; The tokens for the test grammar are as follows:
+;;
+;; 0. SP
+;; 1. %xA
+;; 2. TEXT
+;; 3. %x2A
+;; 4. note:
+;; 5. price:
+;; 6. DIGIT
+
+(defun example-tokenize (str)
+  "Tokenize the string STR.
+The function returns a pair of vectors, one of positive integers
+and the other of corresponding spans in the original input.  The
+span is represented as a cons-cell, whose `car' is the starting
+position in the input, and whose `cdr' is the length of the span.
+
+The tokens are as follows:
+
+0. SP
+1. %xA
+2. TEXT
+3. %x2A
+4. note:
+5. price:
+6. DIGIT"
+  (let ((i 0) (len (length str)) result result-spans)
+    (while (< i len)
+      (let ((element (aref str i)))
+        (cond
+         ((= element 32)
+          (setq result (cons 0 result))
+          (setq result-spans (cons (cons i 1) result-spans))
+          (setq i (1+ i)))
+         ((= element #xa)
+          (setq result (cons 1 result))
+          (setq result-spans (cons (cons i 1) result-spans))
+          (setq i (1+ i)))
+         ((and (= element #x2a)
+               (or (= i 0) (= (aref str (1- i)) #xa)))
+          (setq result (cons 3 result))
+          (setq result-spans (cons (cons i 1) result-spans))
+          (setq i (1+ i)))
+         ((and (<= ?0 element) (<= element ?9))
+          (let ((j i) temp stop)
+            (while (and (not stop) (< j len))
+              (setq temp (aref str j))
+              (cond
+               ((and (<= ?0 temp) (<= temp ?9))
+                (setq j (min (1+ j) len)))
+               ((setq stop t))))
+            (setq result (cons 6 result))
+            (setq result-spans
+                  (cons (cons i (- j i)) result-spans))
+            (setq i j)))
+         ((and (= element ?n)
+               (< (+ i 4) len)
+               (= (aref str (+ i 1)) ?o)
+               (= (aref str (+ i 2)) ?t)
+               (= (aref str (+ i 3)) ?e)
+               (= (aref str (+ i 4)) ?:))
+          (setq result (cons 4 result))
+          (setq result-spans (cons (cons i 5) result-spans))
+          (setq i (+ i 5)))
+         ((and (= element ?p)
+               (< (+ i 5) len)
+               (= (aref str (+ i 1)) ?r)
+               (= (aref str (+ i 2)) ?i)
+               (= (aref str (+ i 3)) ?c)
+               (= (aref str (+ i 4)) ?e)
+               (= (aref str (+ i 5)) ?:))
+          (setq result (cons 5 result))
+          (setq result-spans (cons (cons i 6) result-spans))
+          (setq i (+ i 6)))
+         ((setq result-spans (cons (cons i 1) result-spans))
+          (let ((j i) stop temp)
+            (while (and (not stop) (< j len))
+              (setq temp (aref str j))
+              (cond
+               ((= temp #xa) (setq stop t))
+               ((setq j (min (1+ j) len)))))
+            (setq result (cons 2 result))
+            (setq result-spans (cons (cons i (- j i)) result-spans))
+            (setq i j))))))
+    (cons (apply #'vector (nreverse result))
+          (apply #'vector (nreverse result-spans)))))
+
+(defvar test-document (expand-file-name
+                       "test.document"
+                       (expand-file-name
+                        "test-data"
+                        rep-dir))
+  "A document for testing purposes.")
+
+(defvar input-spans nil
+  "A vector that represents spans of tokens in the input.")
+
+(let ((result (example-tokenize (with-temp-buffer
+                                  (insert-file-contents test-document)
+                                  (buffer-substring-no-properties
+                                   (point-min) (point-max))))))
+  (setq input (car result))
+  (setq input-spans (cdr result)))
+
+(rep-recognize test-parser input)
+(rep-parse test-parser input)
-- 
cgit v1.2.3-18-g5258