summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/Makefile.am8
-rw-r--r--src/dfa.c692
-rw-r--r--src/dfa.h64
-rw-r--r--src/grammar.c17
-rw-r--r--src/grammar.h53
-rw-r--r--src/list.c16
-rw-r--r--src/list.h14
-rw-r--r--src/reader.c66
-rw-r--r--src/reader.h2
-rw-r--r--src/str.c10
-rw-r--r--src/str.h8
-rw-r--r--src/str_partial.h2
-rw-r--r--src/test.txt34
-rw-r--r--src/test/check_dfa.c108
-rw-r--r--src/utf8.c4
-rw-r--r--src/utf8.h2
-rw-r--r--src/util.c2
-rw-r--r--src/util.h4
18 files changed, 1019 insertions, 87 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 4690876..f43fbee 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2,9 +2,9 @@ AM_CFLAGS = -Wall -Wextra
noinst_LIBRARIES = libeps.a
libeps_a_SOURCES = grammar.c list.c util.c reader.c \
-str.c utf8.c lr_table.c \
+str.c utf8.c dfa.c \
grammar.h list.h util.h reader.h \
-str_partial.h str.h utf8.h lr_table.h
+str_partial.h str.h utf8.h dfa.h
libeps_a_CFLAGS = $(AM_CFLAGS) --pedantic
@@ -21,7 +21,7 @@ CLEANFILES = TAGS
# tests
-check_PROGRAMS = check_list check_grammar check_reader
+check_PROGRAMS = check_list check_grammar check_reader check_dfa
check_list_SOURCES = test/check_list.c list.c
@@ -31,6 +31,8 @@ utf8.c
check_reader_SOURCES = test/check_reader.c list.c grammar.c reader.c \
str.c utf8.c util.c
+check_dfa_SOURCES = test/check_dfa.c dfa.c list.c str.c utf8.c
+
TESTS = $(check_PROGRAMS)
AM_COLOR_TESTS = always
diff --git a/src/dfa.c b/src/dfa.c
new file mode 100644
index 0000000..e3bcdf0
--- /dev/null
+++ b/src/dfa.c
@@ -0,0 +1,692 @@
+#include "utf8.h"
+#include "str.h"
+#include <stdio.h>
+#include "list.h"
+#include "dfa.h"
+
+#define INIT_TRIE(X, ID) do { \
+ X->id = ID; \
+ X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \
+ for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \
+ *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \
+ } while (0)
+
+/* The complete, uncompressed table is represented as an array of
+ arrays, each array of which has size MAX_CHAR_BYTES_NUM. The
+ compressed table is represented as run-lengths. The ROW_N is the
+ size of the total table. And the TABLE stores the data. It is an
+ array of arrays. The format of the array of a row is as follows:
+
+ VALUE1 LEN1 VALUE2 LEN2
+
+ This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2
+ times, and so on. Each array of a row ends when the lengths sum to
+ MAX_CHAR_BYTES_NUM. */
+struct compressed_table_s {
+ int row_n;
+ int **table;
+};
+
+enum DFA_TYPE {
+ DFA_TYPE_POSITIVE,
+ DFA_TYPE_NEGATIVE,
+ DFA_TYPE_BOTH,
+ DFA_TYPE_SPECIAL,
+};
+
+struct dfa_s {
+ enum DFA_TYPE type;
+ union {
+ compressed_table table;
+ special_dfa fn;
+ } data;
+};
+
+dfa *
+new_dfa()
+{
+ dfa *dfap = MYALLOC(dfa, 1);
+
+ dfap->type = 0;
+
+ dfap->data.table.row_n = 0;
+ dfap->data.table.table = NULL;
+
+ return dfap;
+}
+
+void
+destroy_dfa(dfa *table)
+{
+ /* function pointers need no destructors */
+ if (table->type == DFA_TYPE_SPECIAL) return;
+
+ for (int i = 0; i < table->data.table.row_n;)
+ free(*(table->data.table.table+i++));
+
+ free(table->data.table.table);
+ free(table);
+}
+
+/* a trie structure */
+
+typedef struct trie_s trie;
+
+/* REVIEW: Maybe we don't need ID? */
+struct trie_s {
+ /* It is guaranteed that the number of children is
+ MAX_CHAR_BYTES_NUM. */
+ int *children;
+ int id;
+};
+
+static void
+destroy_trie(void *t)
+{
+ trie *tp = (trie *)t;
+ free(tp->children);
+ free(tp);
+}
+
+/* Print a list of Trie's. */
+UNUSED
+static void
+print_tries(const List * const restrict ls)
+{
+ NUM len = list_length(ls);
+ for (NUM i = 0; i < len; i++) {
+ trie *t = (trie *) list_nth(ls, i);
+ printf("Trie ID: %d:\n", t->id);
+
+ BOOL has_children_p = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ if (*(t->children+j) != -1) {
+ has_children_p = 1;
+ printf(" "
+ "Char %d => State %d\n", j, *(t->children+j));
+ }
+ }
+
+ if (!has_children_p) printf(" " "No children!\n");
+
+ printf("\n");
+ }
+}
+
+void
+print_dfa(const dfa * const restrict table)
+{
+ switch (table->type) {
+ case DFA_TYPE_POSITIVE:
+ printf("Printing a positive table:\n");
+ break;
+ case DFA_TYPE_NEGATIVE:
+ printf("Printing a negative table:\n");
+ break;
+ case DFA_TYPE_BOTH:
+ printf("Printing a mixed table:\n");
+ break;
+ default:
+ printf("A special table cannot be printed\n");
+ return;
+ break;
+ }
+
+ printf("Printing a table:\n");
+
+ for (int i = 0; i < table->data.table.row_n; i++) {
+ printf("Row %d: ", i);
+ int size = 0;
+ for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) {
+ printf("%d %d", *(*(table->data.table.table+i)+(j<<1)),
+ *(*(table->data.table.table+i)+(j<<1)+1));
+ size += *(*(table->data.table.table+i)+(j<<1)+1);
+
+ if (size < MAX_CHAR_BYTES_NUM) printf(", ");
+ else printf("\n");
+ }
+ }
+ printf("\n");
+}
+
+/* REVIEW: Maybe we don't want to generate the complete trie in the
+ process of generating the compressed table? */
+dfa *
+dfa_from_bytes(int sequence_size,
+ const NUM * const restrict data)
+{
+ List *states = new_list();
+
+ trie *state_pointer = MYALLOC(trie, 1);
+
+ INIT_TRIE(state_pointer, 0);
+
+ if (add_to_list(states, state_pointer)) {
+ fleprintf0("Cannot add to list\n");
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ char *s = MYALLOC(char, 5);
+
+ str *strp = new_str(s, 5);
+
+ for (int i = 0; i < sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(data+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the accepting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_ACCEPT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ destroy_str(strp, 1);
+
+ /* For debugging */
+
+ /* print_tries(states); */
+
+ /* construct DFA from the trie */
+
+ NUM states_len = list_length(states);
+
+ dfa *dfap = new_dfa();
+
+ dfap->data.table.table = MYALLOC(int *, states_len);
+ dfap->data.table.row_n = states_len;
+
+ for (NUM i = 0; i < states_len; i++) {
+ int size = 1, current = -1, last = -1, current_length = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) size++;
+ last = current;
+ }
+
+ *(dfap->data.table.table+i) = MYALLOC(int, size<<1);
+
+ current = (last = -1);
+
+ size = 1;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length;
+
+ size++;
+ last = current;
+ current_length = 1;
+
+ if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = 1;
+ }
+ } else if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = current;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1;
+ } else {
+ current_length++;
+ }
+ }
+ }
+
+ /* cleanup the trie */
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+
+ return dfap;
+}
+
+dfa *
+dfa_from_bytes_neg(int sequence_size,
+ const NUM * const restrict data)
+{
+ List *states = new_list();
+
+ trie *state_pointer = MYALLOC(trie, 1);
+
+ INIT_TRIE(state_pointer, 0);
+
+ if (add_to_list(states, state_pointer)) {
+ fleprintf0("Cannot add to list\n");
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ char *s = MYALLOC(char, 5);
+
+ str *strp = new_str(s, 5);
+
+ for (int i = 0; i < sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(data+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the rejecting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_REJECT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ destroy_str(strp, 1);
+
+ /* For debugging */
+
+ /* print_tries(states); */
+
+ /* construct DFA from the trie */
+
+ NUM states_len = list_length(states);
+
+ dfa *dfap = new_dfa();
+
+ dfap->data.table.table = MYALLOC(int *, states_len);
+ dfap->data.table.row_n = states_len;
+ dfap->type = 1;
+
+ for (NUM i = 0; i < states_len; i++) {
+ int size = 1, current = -1, last = -1, current_length = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) size++;
+ last = current;
+ }
+
+ *(dfap->data.table.table+i) = MYALLOC(int, size<<1);
+
+ current = (last = -1);
+
+ size = 1;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length;
+
+ size++;
+ last = current;
+ current_length = 1;
+
+ if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = 1;
+ }
+ } else if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = current;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1;
+ } else {
+ current_length++;
+ }
+ }
+ }
+
+ /* cleanup the trie */
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+
+ return dfap;
+}
+
+dfa *dfa_from_bytes_both(int sequence_size,
+ const NUM * const restrict data,
+ int neg_sequence_size,
+ const NUM * const restrict negdata)
+{
+ List *states = new_list();
+
+ trie *state_pointer = MYALLOC(trie, 1);
+
+ INIT_TRIE(state_pointer, 0);
+
+ if (add_to_list(states, state_pointer)) {
+ fleprintf0("Cannot add to list\n");
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ char *s = MYALLOC(char, 5);
+
+ str *strp = new_str(s, 5);
+
+ for (int i = 0; i < sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(data+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the accepting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_ACCEPT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ for (int i = 0; i < neg_sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(negdata+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ fleprintf("Fail to add to list when i = %d, j = %d, "
+ "and current value = %ld\n",
+ i, j, current_value);
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the rejecting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_REJECT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ destroy_str(strp, 1);
+
+ /* For debugging */
+
+ /* print_tries(states); */
+
+ /* construct DFA from the trie */
+
+ NUM states_len = list_length(states);
+
+ dfa *dfap = new_dfa();
+
+ dfap->data.table.table = MYALLOC(int *, states_len);
+ dfap->data.table.row_n = states_len;
+
+ for (NUM i = 0; i < states_len; i++) {
+ int size = 1, current = -1, last = -1, current_length = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) size++;
+ last = current;
+ }
+
+ *(dfap->data.table.table+i) = MYALLOC(int, size<<1);
+
+ current = (last = -1);
+
+ size = 1;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length;
+
+ size++;
+ last = current;
+ current_length = 1;
+
+ if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = 1;
+ }
+ } else if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = current;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1;
+ } else {
+ current_length++;
+ }
+ }
+ }
+
+ /* cleanup the trie */
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+
+ return dfap;
+}
+
+BOOL
+run_dfa(const dfa * const restrict table, const NUM code)
+{
+ switch (table->type) {
+ case DFA_TYPE_POSITIVE:
+ case DFA_TYPE_NEGATIVE:
+ case DFA_TYPE_BOTH:
+ break;
+ default:
+ return table->data.fn(code);
+ break;
+ }
+
+ char *s = MYALLOC(char, 5);
+ str *strp = new_str(s, 5);
+
+ if (encode(code, strp)) {
+ destroy_str(strp, 1);
+ fleprintf0("Fail to encode\n");
+ return 0;
+ }
+
+ int current_row = 0;
+
+ for (NUM i = 0; i < str_length(strp); i++) {
+ unsigned char c = (unsigned char) *(s+i);
+ int lookup_value = 0;
+
+ for (int j = 0, len = 0; len <= c;) {
+ len += *(*(table->data.table.table+current_row)+(j<<1)+1);
+ lookup_value = *(*(table->data.table.table+current_row)
+ +(j++<<1));
+ }
+
+ /* fleprintf("lookup_value = %d\n", lookup_value); */
+
+ if (lookup_value > 0) {
+ current_row = lookup_value;
+ } else {
+ switch (lookup_value) {
+ case DFA_STATE_REJECT:
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ case DFA_STATE_UNKNOWN:
+ switch (table->type) {
+ case DFA_TYPE_BOTH:
+ case DFA_TYPE_POSITIVE:
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ case DFA_TYPE_NEGATIVE:
+ destroy_str(strp, 1);
+ return 1;
+ break;
+ default:
+ /* This should not be possible, as we won't reach here if
+ the table is of the special type. But I am apparently
+ paranoia. */
+ fleprintf0("Special table in the wrong place\n");
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ }
+ break;
+ case DFA_STATE_ACCEPT:
+ destroy_str(strp, 1);
+ return 1;
+ break;
+ default:
+ fleprintf("Unknown state number for char %d: %d\n",
+ c, lookup_value);
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ }
+ }
+ }
+
+ destroy_str(strp, 1);
+
+ return 0;
+}
diff --git a/src/dfa.h b/src/dfa.h
new file mode 100644
index 0000000..a22409f
--- /dev/null
+++ b/src/dfa.h
@@ -0,0 +1,64 @@
+#include "util.h"
+#ifndef DFA_H
+#define DFA_H
+
+/* See the comments at the beginning of grammar.h for some backgrounds
+ about this file. */
+
+/* See the following Wikipedia link for details on Run-Length
+ Encoding. <https://en.wikipedia.org/wiki/Run-length_encoding> */
+
+/* Maximal character bytes value */
+
+enum { MAX_CHAR_BYTES_NUM = 256 };
+
+/* Hard-coded state numbers */
+
+enum {
+ DFA_STATE_UNKNOWN = -1,
+ DFA_STATE_ACCEPT = -2,
+ DFA_STATE_REJECT = -3,
+};
+
+/* dfa type */
+
+typedef BOOL (* special_dfa) (const NUM code);
+
+typedef struct dfa_s dfa;
+
+typedef struct compressed_table_s compressed_table;
+
+dfa *new_dfa();
+
+void destroy_dfa(dfa *table);
+
+void print_dfa(const dfa * const restrict table);
+
+dfa *dfa_from_bytes(int sequence_size,
+ const NUM * const restrict data);
+
+dfa *dfa_from_bytes_neg(int sequence_size,
+ const NUM * const restrict data);
+
+dfa *dfa_from_bytes_both(int sequence_size,
+ const NUM * const restrict data,
+ int neg_sequence_size,
+ const NUM * const restrict negdata);
+
+/* TODO: Reject character bytes from a given DFA. */
+
+/* NOTE: Add all unicode valid points to a DFA, so that we can
+ represent the ANY class.
+
+ After having done so, this costs around 16K memory. This is not so
+ satisfactory, as all these memory are just to serve as a ANY
+ character class, which is too excessive. So I extend the DFA by a
+ special type. */
+
+/* TODO: Construct some basic frequently used character classes. */
+
+inline BOOL dfa_any_fun(const NUM UNUSED code) { return 1; }
+
+BOOL run_dfa(const dfa * const restrict table, const NUM code);
+
+#endif
diff --git a/src/grammar.c b/src/grammar.c
index 4052510..508c8d8 100644
--- a/src/grammar.c
+++ b/src/grammar.c
@@ -104,14 +104,14 @@ new_tnt(int type, ...)
}
Rule *
-new_rule(NT left, List *right)
+new_rule(NT left, const List * const right)
{
if (!right) return NULL;
Rule *rule = MYALLOC(Rule, 1);
rule->left = left;
- rule->right = right;
+ rule->right = (List *) right;
return rule;
}
@@ -124,10 +124,10 @@ new_grammar()
}
/* We classify the rules into different rule groups. */
-unsigned char
-build_grammar(Grammar *g, List *rules, List *names)
+BOOL
+build_grammar(Grammar *g, const List *rules, const List * const names)
{
- g->names = names;
+ g->names = (List *) names;
NUM len = list_length(names);
NUM rule_len = list_length(rules);
@@ -246,11 +246,12 @@ print_rule(void *r)
}
NUM
-find_in_cpa_list(NUM *string, NUM size, List *list)
+find_in_cpa_list(const NUM * const restrict string, NUM size,
+ const List * const restrict list)
{
NUM len = list_length(list), index = 0;
- unsigned char foundp = 0;
+ BOOL foundp = 0;
for (; !foundp && index < len;) {
cpa *cpap = list_nth(list, index);
@@ -304,7 +305,7 @@ print_name(void *element)
/* REVIEW: Print the names of non-terminals out, instead of printing
the numbers? */
void
-print_grammar(Grammar *g)
+print_grammar(const Grammar * const g)
{
printf("Printing a grammar:\n");
map_list_between(g->names, print_name, print_sep);
diff --git a/src/grammar.h b/src/grammar.h
index 05bc1fc..28f8fd8 100644
--- a/src/grammar.h
+++ b/src/grammar.h
@@ -8,14 +8,45 @@
/* The class file for grammars */
+/* TODO: Terminals should be extended to incorporate the "predicate
+ terminals", such as character classes. */
+
/* A grammar is a list of rules. A rule replaces a non-terminal with
a finite string of terminals and non-terminals.
- For us, a terminal is a positive integer, and a non-terminal is a
- negative integer. Since the signs of these two types is fixed, we
- represent them as unsigned integers. Since we want to deal with
- unicode points, the terminal is stored as unsigned long, so that we
- can include all unicode poitns. */
+ For us, a terminal is a positive integer, and a non-terminal is
+ also an integer. We represent them as unsigned integers. Since we
+ want to deal with unicode points, the terminal is stored as
+ unsigned long, so that we can include all unicode poitns.
+
+ Moreover, a terminal might be a "predicate terminal", that is, it
+ does not refer to one character only. Instead, it refers to a set
+ of characters. The most efficient way of recognizing a set of
+ characters, of which I know, is to use a deterministic finite
+ automaton. So we represent a predicate terminal as a DFA.
+
+ Whereas the DFA's are equivalent to regular expressions in the
+ expressive power, I do not need full support for regular
+ expressions for the predicate terminals. In particular, I only
+ want to match a character for one predicate terminal, so there are
+ no such things as repititive and optional constructs in the DFA.
+ Rather, each predicate terminal is a simple table, or a matrix,
+ which must have exactly 256 columns and an indefinite number of
+ rows. The columns represent the input character's bytes to match,
+ and the rows are the states of the DFA. The table must have at
+ least one row: the starting state is always the zeroth row and the
+ accepting state is any state that is out of bounds: either greater
+ than or equal to the number of rows or negative.
+
+ Of course, it is inefficient to keep the whoe table within the
+ memory, especially since the most part of the table would be
+ unused. So we store a DFA in a compressed format. I have
+ considered whether to apply some more advanced compression
+ techniques, such as using BW transoform followed by the MTF
+ algorithm, but in the end I think that the DFA's that will be used
+ are oft times just small tables, at least I hope so, thus I decided
+ to simply use the Run-Length-Encoding. See the file dfa.h for
+ implementation details. */
typedef unsigned long T;
typedef unsigned NT;
@@ -41,7 +72,9 @@ typedef struct Grammar_s Grammar;
/* G is expected to be allocated before this function.
NAMES is a list of CPA pointers. */
-unsigned char build_grammar(Grammar *g, List *rules, List *names);
+BOOL
+build_grammar(Grammar *g,
+ const List *rules, const List * const names);
/* This accepts one and only one optional argument, which is of type
either T or NT, depending on TYPE being 0 or not. */
@@ -56,7 +89,9 @@ struct cpa_s {
typedef struct cpa_s cpa;
-NUM find_in_cpa_list(NUM *string, NUM size, List *list);
+NUM
+find_in_cpa_list(const NUM * const restrict string, NUM size,
+ const List * const restrict list);
/* For debugging purposes, we need to print out rules to examine their
values. */
@@ -69,7 +104,7 @@ void print_tnt(void *element);
void print_rule(void *r);
/* This will generate errors if G is not a list of rules. */
-void print_grammar(Grammar *g);
+void print_grammar(const Grammar * const g);
/* constructors */
@@ -85,7 +120,7 @@ TNT *new_tnt_pointer(size_t size);
/* RIGHT should be created by new_tnt_string or something alike.
If RIGHT is NULL, NULL is returned. */
-Rule *new_rule(NT left, List *right);
+Rule *new_rule(NT left, const List * const right);
Grammar *new_grammar();
diff --git a/src/list.c b/src/list.c
index 143c693..3c430f6 100644
--- a/src/list.c
+++ b/src/list.c
@@ -32,7 +32,7 @@ new_list()
}
H_ATTR
-unsigned char
+BOOL
add_to_list(List *ls, void *element)
{
if (!(ls->capacity)) {
@@ -141,10 +141,10 @@ copy_num(void *p)
return pointer;
}
-unsigned char
+BOOL
copy_list(List *dest, List *source, copyer copyf)
{
- unsigned char sign = 0;
+ BOOL sign = 0;
if ((sign = list_assure_size(dest, source->len)))
return sign;
@@ -162,19 +162,19 @@ copy_list(List *dest, List *source, copyer copyf)
H_ATTR
void *
-list_nth(List *ls, NUM n)
+list_nth(const List * const ls, NUM n)
{
return *(ls->array+n);
}
H_ATTR
NUM
-list_length(List *ls)
+list_length(const List * const restrict ls)
{
return ls->len;
}
-unsigned char
+BOOL
list_assure_size(List *ls, NUM size)
{
if (ls->capacity >= size) return 0; /* we are good */
@@ -203,7 +203,7 @@ list_assure_size(List *ls, NUM size)
return 0;
}
-unsigned char
+BOOL
list_set_length(List *ls, NUM len)
{
list_assure_size(ls, len);
@@ -254,7 +254,7 @@ array_to_list(void **array, NUM size)
}
void
-destroy_list(List *ls, unsigned char all_free_p)
+destroy_list(List *ls, BOOL all_free_p)
{
if (all_free_p == 1)
for (NUM i = 0; i < ls->len; i++)
diff --git a/src/list.h b/src/list.h
index ff401f4..6ace213 100644
--- a/src/list.h
+++ b/src/list.h
@@ -17,7 +17,7 @@ List *new_list();
/* add an element to the end of the list */
/* upon failure return non-zero */
-unsigned char add_to_list(List *ls, void *element);
+BOOL add_to_list(List *ls, void *element);
/* pop an element from the end of the list, and return that element */
/* upon failure return NULL */
@@ -43,23 +43,23 @@ typedef void *(*copyer)(void *);
void *copy_num(void *);
/* upon failure return 1 */
-unsigned char copy_list(List *dest, List *source, copyer copyf);
+BOOL copy_list(List *dest, List *source, copyer copyf);
-void *list_nth(List *ls, NUM n);
+void *list_nth(const List * const ls, NUM n);
-NUM list_length(List *ls);
+NUM list_length(const List * const restrict ls);
/* Make sure the list has at least SIZE slots to use. This should
only be used to create fixed capacity arrays, otherwise we risk
frequently re-allocating and hence losing performance. */
/* Upon failure return non-zero. */
-unsigned char list_assure_size(List *ls, NUM size);
+BOOL list_assure_size(List *ls, NUM size);
/* This is mainly used to set the length of a sparse list, since only
when dealing with sparse lists do we not need to care about the
elements. */
/* Upon failure return non-zero. */
-unsigned char list_set_length(List *ls, NUM len);
+BOOL list_set_length(List *ls, NUM len);
/* Convert a list to an array.
@@ -79,7 +79,7 @@ List *array_to_list(void ** array, NUM size);
/* destroy the list: If ALL_FREE_P is 1, this frees every void
pointers contained in the list; if it is 2, this frees the first
pointer. In any case, the list is de-allocated. */
-void destroy_list(List *ls, unsigned char all_free_p);
+void destroy_list(List *ls, BOOL all_free_p);
void destroy_list_free_all(void *ls);
void destroy_list_free_first(void *ls);
diff --git a/src/reader.c b/src/reader.c
index 18435a9..d715726 100644
--- a/src/reader.c
+++ b/src/reader.c
@@ -21,14 +21,15 @@
/* Read a string of TNT's into TNTS. TNTS is expected to be already
allocated. */
-static unsigned char
-read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list)
+static BOOL
+read_tnt_string(const str * const restrict s, NUM start, NUM end,
+ TNT *tnts, List *cpa_list)
{
str_info info;
/* fleprintf("S = %lu, E = %lu\n", start, end); */
- unsigned char readingp = 0, stop = 0;
+ BOOL readingp = 0, stop = 0, double_quote_p = 0;
NUM *string = NULL, string_size = 0, nt_index = 0, tnt_count = 0;
@@ -73,14 +74,15 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list)
}
break;
case 34:
+ double_quote_p = 1;
case 39:
/* read a string */
strindex = index + info.step;
strinfo = get_info(s, strindex);
for (;strindex < end &&
- strinfo.value != 34 &&
- strinfo.value != 39 &&
+ ((double_quote_p && strinfo.value != 34) ||
+ (!double_quote_p && strinfo.value != 39)) &&
strinfo.value != 10 &&
strinfo.value != 13;) {
strinfo = get_info(s, strindex);
@@ -88,7 +90,7 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list)
if (strinfo.value == 92) {
strindex += info.step;
- unsigned char local_exit = 0;
+ BOOL local_exit = 0;
/* The following errors should not happen. I am just
paranoid. */
@@ -118,6 +120,8 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list)
strindex -= strinfo.step;
tnt_count--;
+ double_quote_p = 0;
+
index = strindex;
info = get_info(s, index);
break;
@@ -166,6 +170,7 @@ read_tnt_string(str *s, NUM start, NUM end, TNT *tnts, List *cpa_list)
return 0;
}
+UNUSED
static void
print_int(void *p)
{
@@ -174,7 +179,7 @@ print_int(void *p)
/* Read a grammar from a string in the format of BNF notation. */
Grammar *
-read_grammar_from_bnf(str *s)
+read_grammar_from_bnf(const str * const restrict s)
{
NUM len = str_length(s);
@@ -186,7 +191,7 @@ read_grammar_from_bnf(str *s)
List *line_indices = new_list();
- unsigned char definitionp = 1, readingp = 0;
+ BOOL definitionp = 1, readingp = 0;
/* last index is used to collect strings */
for (NUM index = 0, last_index = 0; index < len;) {
@@ -205,7 +210,7 @@ read_grammar_from_bnf(str *s)
case 9:
readingp = 0;
- if (definitionp) {
+ if (definitionp && readingp) {
fleprintf("%lu, A non-terminal cannot have spaces in "
"the name\n", index);
/* cleanup */
@@ -257,9 +262,9 @@ read_grammar_from_bnf(str *s)
index += info.step;
}
- printf("%s:%d, print indices\n", __FILE__, __LINE__);
- map_list(line_indices, print_int);
- printf("\n");
+ /* printf("%s:%d, print indices\n", __FILE__, __LINE__);
+ * map_list(line_indices, print_int);
+ * printf("\n"); */
/* second round collects all the rules */
@@ -267,7 +272,7 @@ read_grammar_from_bnf(str *s)
NUM line_count = 0;
- unsigned char right_start_p = 0;
+ BOOL right_start_p = 0, double_quote_p = 0;
definitionp = 1, readingp = 0;
@@ -286,6 +291,7 @@ read_grammar_from_bnf(str *s)
switch (info.value) {
/* the quote characters */
case 34: /* double quote */
+ double_quote_p = 1;
case 39: /* single quote */
if (definitionp) {
fleprintf("%lu, A non-terminal cannot have quotes in "
@@ -330,8 +336,8 @@ read_grammar_from_bnf(str *s)
for (;strindex < len &&
strinfo.value != 13 &&
strinfo.value != 10 &&
- strinfo.value != 34 &&
- strinfo.value != 39;) {
+ ((double_quote_p && strinfo.value != 34) ||
+ (!double_quote_p && strinfo.value != 39));) {
strinfo = get_info(s, strindex);
/* escape character */
@@ -339,7 +345,7 @@ read_grammar_from_bnf(str *s)
strindex += strinfo.step;
if (strindex < len) strinfo = get_info(s, strindex);
- unsigned char local_error_p = 0;
+ BOOL local_error_p = 0;
if (strindex >= len) {
local_error_p = 1;
@@ -395,8 +401,8 @@ read_grammar_from_bnf(str *s)
return NULL;
}
} else {
- if (strinfo.value != 34 &&
- strinfo.value != 39) {
+ if ((double_quote_p && strinfo.value != 34) ||
+ (!double_quote_p && strinfo.value != 39)) {
fleprintf0("input ended before string is left\n");
/* cleanup */
map_list(names, destroy_cpa_and_free_all);
@@ -408,15 +414,16 @@ read_grammar_from_bnf(str *s)
destroy_list(rules, 0);
return NULL;
}
-
}
}
+ double_quote_p = 0;
+
break;
/* spaces and tabs are ignored */
case ' ':
case 9:
- if (definitionp) {
+ if (definitionp && readingp) {
fleprintf("%lu, A non-terminal cannot have spaces or "
"tabs in the name\n", index);
/* cleanup */
@@ -435,6 +442,11 @@ read_grammar_from_bnf(str *s)
break;
case '\n':
case 13:
+ if (definitionp && !readingp) {
+ /* empty line */
+ break;
+ }
+
definitionp = 1;
readingp = 0;
line_count++;
@@ -461,13 +473,13 @@ read_grammar_from_bnf(str *s)
return NULL;
}
- printf("%s:%d:%lu, Printing a TNT string: ",
- __FILE__, __LINE__, index);
- for (int i = 0; i < (int) tnt_count;) {
- print_tnt(current_tnt_string+i++);
- if (i<tnt_count) printf(", ");
- }
- printf("\n");
+ /* printf("%s:%d:%lu, Printing a TNT string: ",
+ * __FILE__, __LINE__, index); */
+ /* for (int i = 0; i < (int) tnt_count;) {
+ * print_tnt(current_tnt_string+i++);
+ * if (i<tnt_count) printf(", ");
+ * }
+ * printf("\n"); */
temp_pointers = MYALLOC(void *, tnt_count);
diff --git a/src/reader.h b/src/reader.h
index 65ee7ef..4351055 100644
--- a/src/reader.h
+++ b/src/reader.h
@@ -5,6 +5,6 @@
#include "utf8.h"
#include "grammar.h"
-Grammar *read_grammar_from_bnf(str *s);
+Grammar *read_grammar_from_bnf(const str * const restrict s);
#endif
diff --git a/src/str.c b/src/str.c
index 46dc4a8..fdff99a 100644
--- a/src/str.c
+++ b/src/str.c
@@ -4,7 +4,7 @@
H_ATTR
str_info
-_info_getter(str *s, UNUM n)
+_info_getter(const str * const restrict s, UNUM n)
{
return (str_info) { (NUM) *(s->data+n), 1 };
}
@@ -41,14 +41,14 @@ destroy_str(str *s, int flag)
H_ATTR
str_info
-get_info(str *s, UNUM n)
+get_info(const str * const restrict s, UNUM n)
{
return s->getter(s, n);
}
H_ATTR
NUM
-str_length(str *s)
+str_length(const str * const restrict s)
{
return s->size;
}
@@ -60,7 +60,7 @@ change_str_content(str *s, char *str, NUM size)
s->size = size;
}
-unsigned char
+BOOL
copy_str(str *dest, str *source)
{
if (str_length(dest) < str_length(source))
@@ -74,7 +74,7 @@ copy_str(str *dest, str *source)
}
char *
-get_data(str *s)
+get_data(const str * const restrict s)
{
return s->data;
}
diff --git a/src/str.h b/src/str.h
index 39bddf6..bfe5867 100644
--- a/src/str.h
+++ b/src/str.h
@@ -20,16 +20,16 @@ typedef struct str_info_s str_info;
#define EMPTY_STR_INFO ((str_info) { 0, 1 })
-str_info get_info(str *, UNUM);
+str_info get_info(const str * const restrict s, UNUM n);
-char *get_data(str *s);
+char *get_data(const str * const restrict s);
-NUM str_length(str *);
+NUM str_length(const str * const restrict s);
void str_set_length(str *s, UNUM size);
void change_str_content(str *, char *, NUM);
-unsigned char copy_str(str *dest, str *source);
+BOOL copy_str(str *dest, str *source);
#endif
diff --git a/src/str_partial.h b/src/str_partial.h
index 9d4fbd6..2373555 100644
--- a/src/str_partial.h
+++ b/src/str_partial.h
@@ -3,7 +3,7 @@
/* This is meant to be extended, and only has minimal necessary
fields. */
-typedef str_info (*info_getter) (str *, UNUM);
+typedef str_info (*info_getter) (const str * const restrict , UNUM);
struct str_s {
UNUM size; /* the size in bytes, not in chars */
diff --git a/src/test.txt b/src/test.txt
index be9b3ba..27af3ad 100644
--- a/src/test.txt
+++ b/src/test.txt
@@ -1,9 +1,25 @@
-S: A A LONG C 奎佑
-A: B B B
-B: A
-A:
-LONG: B A
-C: B C A
-B: B B
-奎佑: "handsome"
-A: A B A
+S: S "+" T
+S: T
+T: T "*" F
+T: F
+F: "(" S ")"
+F: "0"
+F: "1"
+F: "2"
+F: "3"
+F: "4"
+F: "5"
+F: "6"
+F: "7"
+F: "8"
+F: "9"
+F: "0" F
+F: "1" F
+F: "2" F
+F: "3" F
+F: "4" F
+F: "5" F
+F: "6" F
+F: "7" F
+F: "8" F
+F: "9" F
diff --git a/src/test/check_dfa.c b/src/test/check_dfa.c
new file mode 100644
index 0000000..a728fb8
--- /dev/null
+++ b/src/test/check_dfa.c
@@ -0,0 +1,108 @@
+#include "../utf8.h"
+#include "../str.h"
+#include <stdio.h>
+#include "../util.h"
+#include "../dfa.h"
+
+#define ADD_RANGE(X, Y) do { \
+ for (int i = 0; i + (X) <= (Y); i++) \
+ *(v+vlen++) = (NUM) i + (X); \
+ } while (0)
+
+int
+main(int UNUSED argc, char ** UNUSED argv)
+{
+ UNUM vlen = 0;
+ NUM *v = MYALLOC(NUM, 1<<21);
+
+ ADD_RANGE(0x0020, 0x007f);
+ ADD_RANGE(0x2580, 0x259f);
+ ADD_RANGE(0x00a0, 0x00ff);
+ ADD_RANGE(0x25a0, 0x25ff);
+ ADD_RANGE(0x0100, 0x017f);
+ ADD_RANGE(0x2600, 0x26ff);
+ ADD_RANGE(0x0180, 0x024f);
+ ADD_RANGE(0x2700, 0x27bf);
+ ADD_RANGE(0x0250, 0x02af);
+ ADD_RANGE(0x27c0, 0x27ef);
+ ADD_RANGE(0x02b0, 0x02ff);
+ ADD_RANGE(0x27f0, 0x27ff);
+ ADD_RANGE(0x0300, 0x036f);
+ ADD_RANGE(0x2800, 0x28ff);
+ ADD_RANGE(0x0370, 0x03ff);
+ ADD_RANGE(0x2900, 0x297f);
+ ADD_RANGE(0x0400, 0x04ff);
+ ADD_RANGE(0x2980, 0x29ff);
+ ADD_RANGE(0x0500, 0x052f);
+ ADD_RANGE(0x2a00, 0x2aff);
+ ADD_RANGE(0x0530, 0x058f);
+ ADD_RANGE(0x2b00, 0x2bff);
+ ADD_RANGE(0x0590, 0x05ff);
+ ADD_RANGE(0x2e80, 0x2eff);
+ ADD_RANGE(0x0600, 0x06ff);
+ ADD_RANGE(0x2f00, 0x2fdf);
+ ADD_RANGE(0x0700, 0x074f);
+ ADD_RANGE(0x2ff0, 0x2fff);
+ ADD_RANGE(0x0780, 0x07bf);
+ ADD_RANGE(0x3000, 0x303f);
+ ADD_RANGE(0x0900, 0x097f);
+ ADD_RANGE(0x3040, 0x309f);
+ ADD_RANGE(0x0980, 0x09ff);
+ ADD_RANGE(0x30a0, 0x30ff);
+ ADD_RANGE(0x0a00, 0x0a7f);
+ ADD_RANGE(0x3100, 0x312f);
+ ADD_RANGE(0x0a80, 0x0aff);
+ ADD_RANGE(0x3130, 0x318f);
+ ADD_RANGE(0x0b00, 0x0b7f);
+ ADD_RANGE(0x3190, 0x319f);
+ ADD_RANGE(0x0b80, 0x0bff);
+ ADD_RANGE(0x31a0, 0x31bf);
+ ADD_RANGE(0x0c00, 0x0c7f);
+ ADD_RANGE(0x31f0, 0x31ff);
+ ADD_RANGE(0x0c80, 0x0cff);
+ ADD_RANGE(0x3200, 0x32ff);
+ ADD_RANGE(0x0d00, 0x0d7f);
+ ADD_RANGE(0x3300, 0x33ff);
+ ADD_RANGE(0x0d80, 0x0dff);
+ ADD_RANGE(0x3400, 0x4dbf);
+ ADD_RANGE(0x0e00, 0x0e7f);
+ ADD_RANGE(0x4dc0, 0x4dff);
+ ADD_RANGE(0x0e80, 0x0eff);
+ ADD_RANGE(0x4e00, 0x9fff);
+ ADD_RANGE(0x0f00, 0x0fff);
+
+ /* for (int i = 0; i < 190; i++)
+ * printf("v [%d] = %d\n", i, *(v+i)); */
+
+ dfa *dfap = dfa_from_bytes(vlen, v);
+
+ free(v);
+
+ if (dfap == NULL)
+ printf("returned null\n");
+ else
+ print_dfa(dfap);
+
+ /* char *s = MYALLOC(char, 5);
+ * str *strp = new_str(s, 5);
+ *
+ * for (int i = 0; i < 3; i++) {
+ * printf("\n\n");
+ *
+ * encode(v[i], strp);
+ *
+ * for (int j = 0; j < str_length(strp)-1;)
+ * printf("%d, ", *(s+j++)+(1<<8));
+ *
+ * printf("%d\n", *(s+str_length(strp)-1)+(1<<8));
+ * }
+ *
+ * destroy_str(strp, 1); */
+
+ /* BOOL result = run_dfa(dfap, 32239);
+ *
+ * printf("the result = %d\n", result); */
+
+ if (dfap) destroy_dfa(dfap);
+ return 0;
+}
diff --git a/src/utf8.c b/src/utf8.c
index f840600..ce47faf 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -41,7 +41,7 @@ decode(UTF8_State *state, uint32_t *codep, uint8_t byte)
}
str_info
-utf8_get_info(str *s, UNUM n)
+utf8_get_info(const str * const restrict s, UNUM n)
{
UTF8_State state = UTF8_STATE_ACCEPT;
uint32_t code = 0;
@@ -85,7 +85,7 @@ new_utf8(char *string, UNUM size)
}
/* result should be long enough to hold the data */
-unsigned char
+BOOL
encode(NUM code_point, str *result)
{
/* calculate the number of bits of CODE_POINT */
diff --git a/src/utf8.h b/src/utf8.h
index 5b3c9f2..f5e7be1 100644
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -20,6 +20,6 @@ typedef enum UTF8_State_e UTF8_State;
utf8 *new_utf8(char *string, UNUM size);
-unsigned char encode(NUM code_point, str *result);
+BOOL encode(NUM code_point, str *result);
#endif
diff --git a/src/util.c b/src/util.c
index a04ef77..1af8e5e 100644
--- a/src/util.c
+++ b/src/util.c
@@ -12,7 +12,7 @@
Note that the length of the string str is actually 1+length, and
the last character is the null character. */
-unsigned char
+BOOL
read_entire_file(const char *file_name, char **str, NUM *len)
{
long file_size = 0;
diff --git a/src/util.h b/src/util.h
index 940b7e4..6d6072a 100644
--- a/src/util.h
+++ b/src/util.h
@@ -10,6 +10,8 @@ typedef long DATA;
typedef long NUM;
typedef unsigned long long UNUM; /* definitely bigger than size_t */
+typedef unsigned char BOOL;
+
#define HC_ATTR __attribute__((__hot__, __const__))
#define H_ATTR __attribute__((__hot__))
@@ -31,6 +33,6 @@ typedef unsigned long long UNUM; /* definitely bigger than size_t */
#define fleprintf0(M) eprintf("%s:%d, " M, __FILE__, __LINE__)
#define fleprintf(M, ...) eprintf("%s:%d, " M, __FILE__, __LINE__, __VA_ARGS__)
-unsigned char read_entire_file(const char *file_name, char **str, NUM *len);
+BOOL read_entire_file(const char *file_name, char **str, NUM *len);
#endif