summaryrefslogtreecommitdiff
path: root/src/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/dfa.c')
-rw-r--r--src/dfa.c692
1 files changed, 692 insertions, 0 deletions
diff --git a/src/dfa.c b/src/dfa.c
new file mode 100644
index 0000000..e3bcdf0
--- /dev/null
+++ b/src/dfa.c
@@ -0,0 +1,692 @@
+#include "utf8.h"
+#include "str.h"
+#include <stdio.h>
+#include "list.h"
+#include "dfa.h"
+
+#define INIT_TRIE(X, ID) do { \
+ X->id = ID; \
+ X->children = MYALLOC(int, MAX_CHAR_BYTES_NUM); \
+ for (int macro_index = 0; macro_index < MAX_CHAR_BYTES_NUM;) \
+ *(X->children+macro_index++) = DFA_STATE_UNKNOWN; \
+ } while (0)
+
+/* The complete, uncompressed table is represented as an array of
+ arrays, each array of which has size MAX_CHAR_BYTES_NUM. The
+ compressed table is represented as run-lengths. The ROW_N is the
+ size of the total table. And the TABLE stores the data. It is an
+ array of arrays. The format of the array of a row is as follows:
+
+ VALUE1 LEN1 VALUE2 LEN2
+
+ This means VALUE1 appears LEN1 times, and then VALUE2 appears LEN2
+ times, and so on. Each array of a row ends when the lengths sum to
+ MAX_CHAR_BYTES_NUM. */
+struct compressed_table_s {
+ int row_n;
+ int **table;
+};
+
+enum DFA_TYPE {
+ DFA_TYPE_POSITIVE,
+ DFA_TYPE_NEGATIVE,
+ DFA_TYPE_BOTH,
+ DFA_TYPE_SPECIAL,
+};
+
+struct dfa_s {
+ enum DFA_TYPE type;
+ union {
+ compressed_table table;
+ special_dfa fn;
+ } data;
+};
+
+dfa *
+new_dfa()
+{
+ dfa *dfap = MYALLOC(dfa, 1);
+
+ dfap->type = 0;
+
+ dfap->data.table.row_n = 0;
+ dfap->data.table.table = NULL;
+
+ return dfap;
+}
+
+void
+destroy_dfa(dfa *table)
+{
+ /* function pointers need no destructors */
+ if (table->type == DFA_TYPE_SPECIAL) return;
+
+ for (int i = 0; i < table->data.table.row_n;)
+ free(*(table->data.table.table+i++));
+
+ free(table->data.table.table);
+ free(table);
+}
+
+/* a trie structure */
+
+typedef struct trie_s trie;
+
+/* REVIEW: Maybe we don't need ID? */
+struct trie_s {
+ /* It is guaranteed that the number of children is
+ MAX_CHAR_BYTES_NUM. */
+ int *children;
+ int id;
+};
+
+static void
+destroy_trie(void *t)
+{
+ trie *tp = (trie *)t;
+ free(tp->children);
+ free(tp);
+}
+
+/* Print a list of Trie's. */
+UNUSED
+static void
+print_tries(const List * const restrict ls)
+{
+ NUM len = list_length(ls);
+ for (NUM i = 0; i < len; i++) {
+ trie *t = (trie *) list_nth(ls, i);
+ printf("Trie ID: %d:\n", t->id);
+
+ BOOL has_children_p = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ if (*(t->children+j) != -1) {
+ has_children_p = 1;
+ printf(" "
+ "Char %d => State %d\n", j, *(t->children+j));
+ }
+ }
+
+ if (!has_children_p) printf(" " "No children!\n");
+
+ printf("\n");
+ }
+}
+
+void
+print_dfa(const dfa * const restrict table)
+{
+ switch (table->type) {
+ case DFA_TYPE_POSITIVE:
+ printf("Printing a positive table:\n");
+ break;
+ case DFA_TYPE_NEGATIVE:
+ printf("Printing a negative table:\n");
+ break;
+ case DFA_TYPE_BOTH:
+ printf("Printing a mixed table:\n");
+ break;
+ default:
+ printf("A special table cannot be printed\n");
+ return;
+ break;
+ }
+
+ printf("Printing a table:\n");
+
+ for (int i = 0; i < table->data.table.row_n; i++) {
+ printf("Row %d: ", i);
+ int size = 0;
+ for (int j = 0; size < MAX_CHAR_BYTES_NUM; j++) {
+ printf("%d %d", *(*(table->data.table.table+i)+(j<<1)),
+ *(*(table->data.table.table+i)+(j<<1)+1));
+ size += *(*(table->data.table.table+i)+(j<<1)+1);
+
+ if (size < MAX_CHAR_BYTES_NUM) printf(", ");
+ else printf("\n");
+ }
+ }
+ printf("\n");
+}
+
+/* REVIEW: Maybe we don't want to generate the complete trie in the
+ process of generating the compressed table? */
+dfa *
+dfa_from_bytes(int sequence_size,
+ const NUM * const restrict data)
+{
+ List *states = new_list();
+
+ trie *state_pointer = MYALLOC(trie, 1);
+
+ INIT_TRIE(state_pointer, 0);
+
+ if (add_to_list(states, state_pointer)) {
+ fleprintf0("Cannot add to list\n");
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ char *s = MYALLOC(char, 5);
+
+ str *strp = new_str(s, 5);
+
+ for (int i = 0; i < sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(data+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the accepting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_ACCEPT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ destroy_str(strp, 1);
+
+ /* For debugging */
+
+ /* print_tries(states); */
+
+ /* construct DFA from the trie */
+
+ NUM states_len = list_length(states);
+
+ dfa *dfap = new_dfa();
+
+ dfap->data.table.table = MYALLOC(int *, states_len);
+ dfap->data.table.row_n = states_len;
+
+ for (NUM i = 0; i < states_len; i++) {
+ int size = 1, current = -1, last = -1, current_length = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) size++;
+ last = current;
+ }
+
+ *(dfap->data.table.table+i) = MYALLOC(int, size<<1);
+
+ current = (last = -1);
+
+ size = 1;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length;
+
+ size++;
+ last = current;
+ current_length = 1;
+
+ if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = 1;
+ }
+ } else if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = current;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1;
+ } else {
+ current_length++;
+ }
+ }
+ }
+
+ /* cleanup the trie */
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+
+ return dfap;
+}
+
+dfa *
+dfa_from_bytes_neg(int sequence_size,
+ const NUM * const restrict data)
+{
+ List *states = new_list();
+
+ trie *state_pointer = MYALLOC(trie, 1);
+
+ INIT_TRIE(state_pointer, 0);
+
+ if (add_to_list(states, state_pointer)) {
+ fleprintf0("Cannot add to list\n");
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ char *s = MYALLOC(char, 5);
+
+ str *strp = new_str(s, 5);
+
+ for (int i = 0; i < sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(data+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the rejecting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_REJECT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ destroy_str(strp, 1);
+
+ /* For debugging */
+
+ /* print_tries(states); */
+
+ /* construct DFA from the trie */
+
+ NUM states_len = list_length(states);
+
+ dfa *dfap = new_dfa();
+
+ dfap->data.table.table = MYALLOC(int *, states_len);
+ dfap->data.table.row_n = states_len;
+ dfap->type = 1;
+
+ for (NUM i = 0; i < states_len; i++) {
+ int size = 1, current = -1, last = -1, current_length = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) size++;
+ last = current;
+ }
+
+ *(dfap->data.table.table+i) = MYALLOC(int, size<<1);
+
+ current = (last = -1);
+
+ size = 1;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length;
+
+ size++;
+ last = current;
+ current_length = 1;
+
+ if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = 1;
+ }
+ } else if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = current;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1;
+ } else {
+ current_length++;
+ }
+ }
+ }
+
+ /* cleanup the trie */
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+
+ return dfap;
+}
+
+dfa *dfa_from_bytes_both(int sequence_size,
+ const NUM * const restrict data,
+ int neg_sequence_size,
+ const NUM * const restrict negdata)
+{
+ List *states = new_list();
+
+ trie *state_pointer = MYALLOC(trie, 1);
+
+ INIT_TRIE(state_pointer, 0);
+
+ if (add_to_list(states, state_pointer)) {
+ fleprintf0("Cannot add to list\n");
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ char *s = MYALLOC(char, 5);
+
+ str *strp = new_str(s, 5);
+
+ for (int i = 0; i < sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(data+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the accepting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_ACCEPT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ for (int i = 0; i < neg_sequence_size; i++) {
+ trie current_state = *((trie *) list_nth(states, 0));
+ NUM current_value = *(negdata+i);
+
+ if (encode(current_value, strp)) {
+ fleprintf("Fail to encode %ld at i = %d\n", current_value, i);
+ /* cleanup */
+ destroy_str(strp, 1);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+
+ int str_len = (int) str_length(strp);
+ s = get_data(strp);
+ unsigned char c = 0;
+ for (int j = 0; j < (str_len-1); j++) {
+ c = (unsigned char) *(s+j);
+ if (*(current_state.children+c) == DFA_STATE_UNKNOWN) {
+ /* a new state should be created */
+ state_pointer = MYALLOC(trie, 1);
+ INIT_TRIE(state_pointer, list_length(states));
+ *(current_state.children+c) = list_length(states);
+ if (add_to_list(states, state_pointer)) {
+ /* cleanup */
+ fleprintf("Fail to add to list when i = %d, j = %d, "
+ "and current value = %ld\n",
+ i, j, current_value);
+ destroy_str(strp, 1);
+ destroy_trie(state_pointer);
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+ return NULL;
+ }
+ }
+
+ current_state =
+ *((trie *) list_nth(states, *(current_state.children+c)));
+ }
+
+ if (str_len) {
+ /* the last byte leads to the rejecting state */
+ c = (unsigned char) *(s+str_len-1);
+
+ /* In the trie's that we will be forming, there will be no leaf
+ that is also the parent of another node, by the design of
+ UTF-8. */
+
+ *(current_state.children+c) = DFA_STATE_REJECT;
+ }
+
+ str_set_length(strp, 5);
+ }
+
+ destroy_str(strp, 1);
+
+ /* For debugging */
+
+ /* print_tries(states); */
+
+ /* construct DFA from the trie */
+
+ NUM states_len = list_length(states);
+
+ dfa *dfap = new_dfa();
+
+ dfap->data.table.table = MYALLOC(int *, states_len);
+ dfap->data.table.row_n = states_len;
+
+ for (NUM i = 0; i < states_len; i++) {
+ int size = 1, current = -1, last = -1, current_length = 0;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) size++;
+ last = current;
+ }
+
+ *(dfap->data.table.table+i) = MYALLOC(int, size<<1);
+
+ current = (last = -1);
+
+ size = 1;
+
+ for (int j = 0; j < MAX_CHAR_BYTES_NUM; j++) {
+ current = *(((trie *) list_nth(states, i))->children+j);
+ if (current != last) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length;
+
+ size++;
+ last = current;
+ current_length = 1;
+
+ if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = last;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = 1;
+ }
+ } else if (j == MAX_CHAR_BYTES_NUM-1) {
+ *(*(dfap->data.table.table+i)+(size<<1)-2) = current;
+ *(*(dfap->data.table.table+i)+(size<<1)-1) = current_length+1;
+ } else {
+ current_length++;
+ }
+ }
+ }
+
+ /* cleanup the trie */
+ map_list(states, destroy_trie);
+ destroy_list(states, 0);
+
+ return dfap;
+}
+
+BOOL
+run_dfa(const dfa * const restrict table, const NUM code)
+{
+ switch (table->type) {
+ case DFA_TYPE_POSITIVE:
+ case DFA_TYPE_NEGATIVE:
+ case DFA_TYPE_BOTH:
+ break;
+ default:
+ return table->data.fn(code);
+ break;
+ }
+
+ char *s = MYALLOC(char, 5);
+ str *strp = new_str(s, 5);
+
+ if (encode(code, strp)) {
+ destroy_str(strp, 1);
+ fleprintf0("Fail to encode\n");
+ return 0;
+ }
+
+ int current_row = 0;
+
+ for (NUM i = 0; i < str_length(strp); i++) {
+ unsigned char c = (unsigned char) *(s+i);
+ int lookup_value = 0;
+
+ for (int j = 0, len = 0; len <= c;) {
+ len += *(*(table->data.table.table+current_row)+(j<<1)+1);
+ lookup_value = *(*(table->data.table.table+current_row)
+ +(j++<<1));
+ }
+
+ /* fleprintf("lookup_value = %d\n", lookup_value); */
+
+ if (lookup_value > 0) {
+ current_row = lookup_value;
+ } else {
+ switch (lookup_value) {
+ case DFA_STATE_REJECT:
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ case DFA_STATE_UNKNOWN:
+ switch (table->type) {
+ case DFA_TYPE_BOTH:
+ case DFA_TYPE_POSITIVE:
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ case DFA_TYPE_NEGATIVE:
+ destroy_str(strp, 1);
+ return 1;
+ break;
+ default:
+ /* This should not be possible, as we won't reach here if
+ the table is of the special type. But I am apparently
+ paranoia. */
+ fleprintf0("Special table in the wrong place\n");
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ }
+ break;
+ case DFA_STATE_ACCEPT:
+ destroy_str(strp, 1);
+ return 1;
+ break;
+ default:
+ fleprintf("Unknown state number for char %d: %d\n",
+ c, lookup_value);
+ destroy_str(strp, 1);
+ return 0;
+ break;
+ }
+ }
+ }
+
+ destroy_str(strp, 1);
+
+ return 0;
+}